File size: 4,986 Bytes
ff8e6c1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 |
#!/usr/bin/env python
# coding: utf-8
# %%
import os
from pathlib import Path

import numpy as np
import pandas as pd
from tqdm import tqdm

from utils.ecg_utils import (
    remove_baseline_wander,
    wavelet_denoise_signal,
    plot_12_lead_ecg,
)
# %%
def calculate_means_stds(npy_directory, n):
    """Estimate per-lead mean and standard deviation from a random ECG sample.

    Up to ``n`` ``.npy`` files are drawn at random from ``npy_directory``.
    Only arrays of shape ``(5000, 12)`` contribute; any other shape is
    skipped. Statistics are merged one file at a time, so memory use stays
    constant instead of growing with ``n``.

    Args:
        npy_directory: Directory holding ECG recordings stored as ``.npy``.
        n: Maximum number of files to sample.

    Returns:
        Tuple ``(ecg_means, ecg_stds)`` of two float64 arrays of shape
        ``(12,)``: the mean and the population standard deviation
        (``ddof=0``, same as ``np.std``) of each lead, computed over every
        sample of every accepted recording.

    Raises:
        ValueError: If no file with the expected shape was found, because
            the statistics would otherwise be NaN.
    """
    expected_shape = (5000, 12)
    npy_directory = Path(npy_directory)

    # Ignore stray directory entries (e.g. hidden files) that np.load cannot
    # read.
    filelist = [
        name for name in os.listdir(npy_directory) if name.endswith(".npy")
    ]
    np.random.shuffle(filelist)

    total = 0  # number of time samples merged so far (per lead)
    running_mean = np.zeros(expected_shape[1], dtype=np.float64)
    running_m2 = np.zeros(expected_shape[1], dtype=np.float64)

    for npy_filename in tqdm(filelist[:n]):
        ekg_numpy_array = np.load(npy_directory / npy_filename)
        if ekg_numpy_array.shape != expected_shape:
            continue

        ekg_numpy_array = ekg_numpy_array.astype(np.float64, copy=False)
        batch_n = ekg_numpy_array.shape[0]
        batch_mean = ekg_numpy_array.mean(axis=0)
        batch_m2 = np.square(ekg_numpy_array - batch_mean).sum(axis=0)

        # Pairwise merge of (count, mean, sum of squared deviations); this
        # avoids the cancellation error of a naive sum-of-squares approach.
        delta = batch_mean - running_mean
        new_total = total + batch_n
        running_mean += delta * (batch_n / new_total)
        running_m2 += batch_m2 + np.square(delta) * (total * batch_n / new_total)
        total = new_total

    if total == 0:
        raise ValueError(
            f"No ECG array of shape {expected_shape} found in {npy_directory}"
        )

    ecg_means = running_mean
    ecg_stds = np.sqrt(running_m2 / total)
    print('Shape of mean and std for ECG normalization are correct!')
    return ecg_means, ecg_stds
# %%
# Denoise and normalize the ECG files found in a directory.
def ecg_denoising(
    raw_directory,
    output_directory,
    ecg_means,
    ecg_stds,
    sampling_frequency=500,
    n=None,
):
    """Denoise and normalize every ``.npy`` ECG in ``raw_directory``.

    Each file is loaded, passed through ``remove_baseline_wander``, then each
    lead column is passed through ``wavelet_denoise_signal``, and finally the
    array is normalized lead-wise as ``(x - ecg_means) / ecg_stds``. The
    result is saved under the same filename in ``output_directory``.

    The four leading parameters are required: binding their defaults to
    module globals of the same name fails at definition time whenever those
    globals are not yet defined.

    Args:
        raw_directory: Directory containing the raw ECG ``.npy`` files.
        output_directory: Directory receiving the processed files; created
            if it does not exist.
        ecg_means: Per-lead means, broadcastable against a loaded array.
        ecg_stds: Per-lead standard deviations, broadcastable likewise.
        sampling_frequency: Value forwarded to ``remove_baseline_wander``.
            Defaults to 500, matching the 500-samples-per-second assumption
            used by ``segmentation`` in this file.
        n: Optional cap on the number of files processed; ``None`` means all.

    Returns:
        ``True`` once all selected files have been processed.
    """
    # Convert once, up front: plain strings do not support the `/` operator.
    raw_directory = Path(raw_directory)
    output_directory = Path(output_directory)
    output_directory.mkdir(parents=True, exist_ok=True)

    # Sorted so that a cap of ``n`` selects a reproducible subset.
    filelist = sorted(
        name for name in os.listdir(raw_directory) if name.endswith(".npy")
    )
    if n is not None:
        filelist = filelist[:n]

    for filename in tqdm(filelist):
        ecg_numpy_array = np.load(raw_directory / filename)

        # 1. Wandering baseline removal
        ecg_numpy_array = remove_baseline_wander(
            ecg_numpy_array, sampling_frequency=sampling_frequency
        )

        # 2. Discrete wavelet transform denoising, one lead (column) at a time
        for lead in range(ecg_numpy_array.shape[1]):
            ecg_numpy_array[:, lead] = wavelet_denoise_signal(
                ecg_numpy_array[:, lead]
            )

        # 3. Lead-wise normalization with precomputed means and stds
        ecg_numpy_array = (ecg_numpy_array - ecg_means) / ecg_stds

        np.save(output_directory / filename, ecg_numpy_array)

    return True
# %%
def segmentation(data_path, manifest_path, output_path, length, steps,
                 sampling_frequency=500):
    """Cut lead-I ECG signals into fixed-length windows and save each one.

    For every row of the manifest CSV, the referenced ``.npy`` file is loaded
    from ``data_path``, its first column (assumed to be lead I) is sliced
    into windows of ``length`` seconds starting every ``steps`` seconds, and
    each complete window is written to ``output_path`` as
    ``<original stem>_<running index>.npy``. The running index is 1-based
    and shared across all recordings. Windows that would run past the end of
    the signal are not produced.

    Segments are written as soon as they are generated, so memory use does
    not grow with the number of recordings.

    A new manifest named
    ``{length}seconds_length_{steps}seconds_step_ecg_manifest.csv`` is
    written to ``output_path`` with the columns ``mrn``,
    ``original_filename``, ``label`` and ``filename``.

    Args:
        data_path: Directory containing the ECG ``.npy`` files.
        manifest_path: CSV with the columns ``MRN`` (medical record number),
            ``filename`` (ECG file name) and ``TEST_RSLT`` (potassium level,
            stored as the segment label).
        output_path: Directory for the segments and the new manifest;
            created if it does not exist.
        length: Window length in seconds.
        steps: Offset between consecutive window starts, in seconds.
        sampling_frequency: Samples per second of the stored signals.

    Raises:
        ValueError: If ``length`` or ``steps`` yields a window or stride of
            less than one sample.
    """
    window = int(round(sampling_frequency * length))
    stride = int(round(sampling_frequency * steps))
    if window <= 0 or stride <= 0:
        raise ValueError(
            "length and steps must both cover at least one sample, got "
            f"length={length!r}, steps={steps!r}"
        )

    manifest = pd.read_csv(manifest_path)
    os.makedirs(output_path, exist_ok=True)

    rows = []
    print('Staring segmenting ECG......')
    records = zip(manifest['MRN'], manifest['filename'], manifest['TEST_RSLT'])
    for mrn, filename, k in tqdm(records, total=len(manifest)):
        # Assume lead I is the first lead in the npy file.
        ecg_array = np.load(os.path.join(data_path, filename))[:, 0]
        stem = filename.rsplit('.', 1)[0]

        # The range stops at the last start whose full window still fits.
        for start in range(0, len(ecg_array) - window + 1, stride):
            new_file_name = f"{stem}_{len(rows) + 1}.npy"
            np.save(
                os.path.join(output_path, new_file_name),
                ecg_array[start:start + window],
            )
            rows.append({
                'mrn': mrn,
                'original_filename': filename,
                'label': k,
                'filename': new_file_name,
            })

    # Explicit columns keep the header intact even when no segment was made.
    df = pd.DataFrame(
        rows, columns=['mrn', 'original_filename', 'label', 'filename']
    )
    df.to_csv(os.path.join(
        output_path,
        f'{length}seconds_length_{steps}seconds_step_ecg_manifest.csv',
    ))
# %%
if __name__ == "__main__":
    # Step 1: estimate the per-lead normalization statistics from a random
    # sample of the whole database.
    npy_directory = "ecg folder for entire database"
    n = 100000  # the number of ecg for calculating mean and std
    print('Calculating ECG means and stds........')
    ecg_means, ecg_stds = calculate_means_stds(npy_directory, n)

    # Step 2: denoise and normalize the ECGs of the target task.
    raw_directory = "path/to/raw_data_directory"  # raw ecg folder for target task
    output_directory = "path/to/output_directory"  # output ecg folder for target task
    print('Denoising and Normalizing ECGs........')
    ecg_denoising(raw_directory, output_directory, ecg_means, ecg_stds)

    # Step 3: cut the processed ECGs into fixed-length lead-I segments.
    data_path = output_directory  # Output directory from the above step
    manifest_path = "/path/to/manifest.csv"  # Manifest file path
    output_path = "path/to/output_path"  # Output path for segmented ECGs
    length = 5  # Length of each segment in seconds
    steps = 1  # Number of seconds step
    # The segmenting function defined in this file is `segmentation`; the
    # name `process_ecg` does not exist.
    segmentation(data_path, manifest_path, output_path, length, steps)
|