File size: 4,986 Bytes
ff8e6c1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
#!/usr/bin/env python
# coding: utf-8
# %%
import os
from pathlib import Path

import numpy as np
import pandas as pd
from tqdm import tqdm

from utils.ecg_utils import (
    remove_baseline_wander,
    wavelet_denoise_signal,
    plot_12_lead_ecg,
)


# %%
def calculate_means_stds(npy_directory, n):
    """Estimate per-lead mean and standard deviation from a random file sample.

    Up to ``n`` files are drawn at random from ``npy_directory``. Each file
    whose array has shape ``(5000, 12)`` contributes to the statistics; files
    with any other shape are skipped. The statistics are pooled over all
    samples of all accepted files, one value per column (lead).

    The statistics are accumulated file by file, so memory use does not grow
    with ``n`` (no ``(n, 5000, 12)`` buffer is allocated).

    Args:
        npy_directory: Directory containing the ECG recordings as ``.npy``
            files.
        n: Maximum number of files to sample.

    Returns:
        A tuple ``(ecg_means, ecg_stds)`` of two arrays of shape ``(12,)``.
        ``ecg_stds`` is the population standard deviation (``ddof=0``), the
        same convention as ``np.std``.

    Raises:
        ValueError: If none of the sampled files has shape ``(5000, 12)``,
            since no meaningful statistics can be computed.
    """
    samples_per_file, n_leads = 5000, 12

    npy_directory = Path(npy_directory)
    filelist = os.listdir(npy_directory)
    np.random.shuffle(filelist)

    count = 0                          # number of accepted recordings
    running_mean = np.zeros(n_leads)   # per-lead mean of everything seen so far
    running_m2 = np.zeros(n_leads)     # per-lead sum of squared deviations

    for npy_filename in tqdm(filelist[:n]):
        ekg_numpy_array = np.load(npy_directory / npy_filename)

        # Check the full shape: a (5000, 1) array would otherwise broadcast
        # silently across all leads and corrupt the statistics.
        if ekg_numpy_array.shape != (samples_per_file, n_leads):
            continue

        ekg = ekg_numpy_array.astype(np.float64, copy=False)
        batch_mean = ekg.mean(axis=0)
        batch_m2 = ((ekg - batch_mean) ** 2).sum(axis=0)

        # Merge this file's statistics into the running ones (pairwise
        # mean/variance combination), which stays numerically stable even
        # when the mean is large compared to the spread.
        seen = count * samples_per_file
        total = seen + samples_per_file
        delta = batch_mean - running_mean
        running_mean += delta * (samples_per_file / total)
        running_m2 += batch_m2 + delta ** 2 * (seen * samples_per_file / total)
        count += 1

    if count == 0:
        raise ValueError(
            f"No ECG file of shape ({samples_per_file}, {n_leads}) found among "
            f"the sampled files in {npy_directory}"
        )

    ecg_means = running_mean
    ecg_stds = np.sqrt(running_m2 / (count * samples_per_file))

    print('Shape of mean and std for ECG normalization are correct!')

    return ecg_means, ecg_stds


# %%
# Denoise and normalize the ECG files found in a directory.
def ecg_denoising(
    raw_directory,
    output_directory,
    ecg_means,
    ecg_stds,
    sampling_frequency=500,
    n=None,
):
    """Denoise and normalize the ECG files of a directory.

    For each file in ``raw_directory`` the recording is loaded, passed through
    ``remove_baseline_wander``, denoised lead by lead with
    ``wavelet_denoise_signal``, normalized with the given per-lead means and
    standard deviations, and saved under the same file name in
    ``output_directory``.

    Args:
        raw_directory: Directory containing the raw ECG ``.npy`` files.
        output_directory: Directory the processed files are written to. It is
            created if it does not exist.
        ecg_means: Per-lead means subtracted from each recording.
        ecg_stds: Per-lead standard deviations each recording is divided by.
        sampling_frequency: Sampling frequency forwarded to
            ``remove_baseline_wander``. The default of 500 matches the
            500 samples per second used by ``segmentation`` in this file;
            NOTE(review): confirm it matches the recordings.
        n: Optional maximum number of files to process. ``None`` (default)
            processes every file in ``raw_directory``.

    Returns:
        ``True`` once all selected files have been processed.
    """
    # Accept plain strings as well as Path objects for both directories.
    raw_directory = Path(raw_directory)
    output_directory = Path(output_directory)
    output_directory.mkdir(parents=True, exist_ok=True)

    filelist = os.listdir(raw_directory)

    # filelist[:None] is the whole list, so n=None means "no limit".
    for filename in tqdm(filelist[:n]):
        ecg_numpy_array = np.load(raw_directory / filename)

        # 1. Wandering baseline removal
        ecg_numpy_array = remove_baseline_wander(
            ecg_numpy_array, sampling_frequency=sampling_frequency
        )

        # 2. Discrete wavelet transform denoising, one lead (column) at a time
        for lead in range(12):
            ecg_numpy_array[:, lead] = wavelet_denoise_signal(ecg_numpy_array[:, lead])

        # 3. Lead-wise normalization with precomputed means and standard deviations
        ecg_numpy_array = (ecg_numpy_array - ecg_means) / ecg_stds

        np.save(output_directory / filename, ecg_numpy_array)

    # Return only after the loop so that every file is processed, not just the first.
    return True


# %%
def segmentation(data_path, manifest_path, output_path, length, steps,
                 sampling_frequency=500):
    """Cut the first lead of each ECG into fixed-length segments.

    Every recording listed in the manifest is loaded from ``data_path`` and
    its first column (assumed to be lead I) is cut into windows of ``length``
    seconds, with consecutive windows starting ``steps`` seconds apart. Only
    windows that fit entirely inside the recording are kept. Each segment is
    saved to ``output_path`` as ``<original stem>_<k>.npy``, where ``k`` is a
    1-based counter running over all segments of all recordings. A manifest
    CSV named ``{length}seconds_length_{steps}seconds_step_ecg_manifest.csv``
    with the columns ``mrn``, ``original_filename``, ``label`` and
    ``filename`` is written to ``output_path``.

    Segments are written as soon as they are produced, so loaded recordings
    are not all kept in memory until the end.

    Args:
        data_path: Directory containing the ECG ``.npy`` files.
        manifest_path: CSV file with the columns ``MRN`` (medical record
            number), ``filename`` (ECG file name) and ``TEST_RSLT``
            (potassium level, stored as the segment label).
        output_path: Directory for the segments and the manifest. It is
            created if it does not exist.
        length: Segment length in seconds.
        steps: Distance between consecutive segment starts in seconds.
        sampling_frequency: Samples per second of the recordings
            (default 500).

    Raises:
        ValueError: If ``length`` or ``steps`` is not positive.
    """
    if length <= 0 or steps <= 0:
        raise ValueError("length and steps must be positive")

    manifest = pd.read_csv(manifest_path)
    os.makedirs(output_path, exist_ok=True)

    window = sampling_frequency * length   # samples per segment
    stride = sampling_frequency * steps    # samples between segment starts

    records = []
    print('Staring segmenting ECG......')
    rows = zip(manifest['MRN'], manifest['filename'], manifest['TEST_RSLT'])
    for mrn, filename, k in tqdm(rows, total=len(manifest)):
        ecg_array = np.load(os.path.join(data_path, filename))
        ecg_array = ecg_array[:, 0]  # assume lead I is the first lead in npy file
        stem = filename.rsplit('.', 1)[0]

        # The range stops at the last start whose window still fits entirely
        # inside the recording, so every slice has exactly `window` samples.
        for start in range(0, len(ecg_array) - window + 1, stride):
            # Global 1-based segment counter across all recordings.
            new_file_name = f"{stem}_{len(records) + 1}.npy"
            np.save(os.path.join(output_path, new_file_name),
                    ecg_array[start:start + window])
            records.append({
                'mrn': mrn,
                'original_filename': filename,
                'label': k,
                'filename': new_file_name,
            })

    print('Saving segmented ECG......')
    # Explicit columns keep the CSV header stable even when no segment was produced.
    df = pd.DataFrame(
        records, columns=['mrn', 'original_filename', 'label', 'filename']
    )
    df.to_csv(os.path.join(
        output_path, f'{length}seconds_length_{steps}seconds_step_ecg_manifest.csv'
    ))


# %%
if __name__ == "__main__":
    # Step 1: estimate per-lead normalization statistics from the full database.
    npy_directory = "ecg folder for entire database"
    n = 100000  # the number of ecg for calculating mean and std

    print('Calculating ECG means and stds........')
    ecg_means, ecg_stds = calculate_means_stds(npy_directory, n)

    # Step 2: denoise and normalize the ECGs of the target task.
    raw_directory = "path/to/raw_data_directory"  # raw ecg folder for target task
    output_directory = "path/to/output_directory"  # output ecg folder for target task

    print('Denoising and Normalizing ECGs........')
    ecg_denoising(raw_directory, output_directory, ecg_means, ecg_stds)

    # Step 3: cut the processed ECGs into fixed-length segments.
    data_path = output_directory  # Output directory from the above step
    manifest_path = "/path/to/manifest.csv"  # Manifest file path
    output_path = "path/to/output_path"  # Output path for segmented ECGs
    length = 5  # Length of each segment in seconds
    steps = 1  # Number of seconds step

    # The segmentation routine defined in this file is `segmentation`.
    segmentation(data_path, manifest_path, output_path, length, steps)