|
|
|
|
|
|
|
import os
from pathlib import Path

import numpy as np
import pandas as pd
from tqdm import tqdm

from utils.ecg_utils import (
    plot_12_lead_ecg,
    remove_baseline_wander,
    wavelet_denoise_signal,
)
|
|
|
|
|
|
|
def calculate_means_stds(npy_directory, n):
    """Estimate per-lead mean and standard deviation from a random sample of ECG files.

    The directory listing is shuffled and at most ``n`` entries are loaded.
    Only arrays whose shape is exactly ``(5000, 12)`` contribute; any other
    array is skipped. Statistics are merged file by file, so memory use is
    independent of ``n`` (no ``(n, 5000, 12)`` buffer is allocated).

    Args:
        npy_directory: Directory holding the ``.npy`` ECG arrays.
        n: Maximum number of directory entries to sample.

    Returns:
        Tuple ``(ecg_means, ecg_stds)``: two float64 arrays of shape ``(12,)``
        with the mean and the population standard deviation (ddof=0) of each
        column, taken over every row of every accepted file.

    Raises:
        ValueError: If none of the sampled files has shape ``(5000, 12)``,
            because the statistics would otherwise be NaN.
    """
    expected_shape = (5000, 12)

    npy_directory = Path(npy_directory)
    filelist = os.listdir(npy_directory)
    np.random.shuffle(filelist)

    # Running aggregates: number of rows seen, per-column mean, and per-column
    # sum of squared deviations from that mean.
    sample_count = 0
    running_mean = np.zeros(expected_shape[1])
    running_m2 = np.zeros(expected_shape[1])

    for npy_filename in tqdm(filelist[:n]):
        ekg_numpy_array = np.load(npy_directory / npy_filename)

        if ekg_numpy_array.shape != expected_shape:
            continue

        # Accumulate in float64 regardless of the on-disk dtype.
        ekg = ekg_numpy_array.astype(np.float64, copy=False)
        batch_count = ekg.shape[0]
        batch_mean = ekg.mean(axis=0)
        batch_m2 = ((ekg - batch_mean) ** 2).sum(axis=0)

        # Pairwise merge of (count, mean, M2); avoids the cancellation that a
        # plain sum / sum-of-squares formulation is prone to.
        delta = batch_mean - running_mean
        merged_count = sample_count + batch_count
        running_mean = running_mean + delta * (batch_count / merged_count)
        running_m2 = (
            running_m2
            + batch_m2
            + delta ** 2 * (sample_count * batch_count / merged_count)
        )
        sample_count = merged_count

    if sample_count == 0:
        raise ValueError(
            f'No ECG array of shape {expected_shape} found in {npy_directory}'
        )

    ecg_means = running_mean
    ecg_stds = np.sqrt(running_m2 / sample_count)

    if ecg_means.shape[0] == ecg_stds.shape[0] == 12:
        print('Shape of mean and std for ECG normalization are correct!')

    return ecg_means, ecg_stds
|
|
|
|
|
|
|
|
|
def ecg_denoising(
    raw_directory,
    output_directory,
    ecg_means,
    ecg_stds,
    n=None,
    sampling_frequency=500,
):
    """Denoise and normalize every ECG array in a directory.

    For each file in ``raw_directory`` the array is loaded, passed through
    ``remove_baseline_wander``, each of its first 12 columns is passed through
    ``wavelet_denoise_signal``, and the result is normalized as
    ``(array - ecg_means) / ecg_stds`` before being saved under the same file
    name in ``output_directory``.

    Args:
        raw_directory: Directory containing the raw ``.npy`` ECG arrays.
        output_directory: Directory the processed arrays are written to; it is
            created if it does not exist.
        ecg_means: Values subtracted from the array (broadcast against it).
        ecg_stds: Values the centered array is divided by.
        n: Optional cap on the number of directory entries processed;
            ``None`` (the default) processes all of them.
        sampling_frequency: Forwarded to ``remove_baseline_wander``. The
            default of 500 matches the 500-samples-per-second factor used by
            ``segmentation`` in this file -- confirm for other datasets.

    Returns:
        ``True`` once all files have been processed.
    """
    # Convert once, outside the loop; plain strings do not support the "/"
    # join used below.
    raw_directory = Path(raw_directory)
    output_directory = Path(output_directory)
    output_directory.mkdir(parents=True, exist_ok=True)

    filelist = os.listdir(raw_directory)

    for filename in tqdm(filelist[:n]):
        ecg_numpy_array = np.load(raw_directory / filename)

        ecg_numpy_array = remove_baseline_wander(
            ecg_numpy_array, sampling_frequency=sampling_frequency
        )

        # Denoise column by column.
        for lead in range(12):
            ecg_numpy_array[:, lead] = wavelet_denoise_signal(
                ecg_numpy_array[:, lead]
            )

        ecg_numpy_array = (ecg_numpy_array - ecg_means) / ecg_stds

        np.save(output_directory / filename, ecg_numpy_array)

    return True
|
|
|
|
|
|
|
def segmentation(data_path, manifest_path, output_path, length, steps):
    """Cut ECG recordings into fixed-length windows and write a manifest for them.

    The manifest CSV at ``manifest_path`` must provide the columns ``MRN``,
    ``filename`` and ``TEST_RSLT``. For every row, the array
    ``data_path/filename`` is loaded and column 0 is sliced into windows of
    ``500 * length`` samples starting every ``500 * steps`` samples; windows
    that would run past the end of the recording are not produced. The output
    file name ``{length}seconds_length_{steps}seconds_step_...`` suggests the
    factor 500 is samples per second -- confirm against the data.

    Each window is saved to ``output_path`` as ``<stem>_<k>.npy``, where
    ``<stem>`` is the source file name without its last extension and ``k`` is
    a 1-based counter running across all source files. Windows are written as
    soon as they are cut, so they are not all held in memory at once.

    A manifest with the columns ``mrn``, ``original_filename``, ``label`` and
    ``filename`` is written to
    ``{output_path}/{length}seconds_length_{steps}seconds_step_ecg_manifest.csv``.
    It is written (with only the header) even when no window was produced.

    Args:
        data_path: Directory containing the source ``.npy`` arrays.
        manifest_path: Path of the input manifest CSV.
        output_path: Directory for the windows and the new manifest; created
            if missing.
        length: Window length, multiplied by 500 to get samples.
        steps: Distance between window starts, multiplied by 500 to get samples.
    """
    window = 500 * length
    stride = 500 * steps

    manifest = pd.read_csv(manifest_path)
    os.makedirs(output_path, exist_ok=True)

    records = []
    print('Staring segmenting ECG......')
    rows = zip(manifest['MRN'], manifest['filename'], manifest['TEST_RSLT'])
    for mrn, filename, label in tqdm(rows, total=manifest.shape[0]):
        ecg_array = np.load(os.path.join(data_path, filename))
        ecg_array = ecg_array[:, 0]
        stem = filename.rsplit('.', 1)[0]

        for start in range(0, len(ecg_array), stride):
            end = start + window
            if end > len(ecg_array):
                # Starts only increase, so every later window overruns too.
                break

            # The counter is global across files, matching the row position
            # of the window in the output manifest.
            new_file_name = f"{stem}_{len(records) + 1}.npy"
            np.save(os.path.join(output_path, new_file_name), ecg_array[start:end])
            records.append({
                'mrn': mrn,
                'original_filename': filename,
                'label': label,
                'filename': new_file_name,
            })

    # Message kept so stdout matches earlier runs; the windows themselves are
    # already on disk at this point, only the manifest remains to be written.
    print('Saving segmented ECG......')

    # Explicit columns keep the manifest well-formed when there are no records.
    df = pd.DataFrame(
        records, columns=['mrn', 'original_filename', 'label', 'filename']
    )
    df.to_csv(f'{output_path}/{length}seconds_length_{steps}seconds_step_ecg_manifest.csv')
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Pipeline: (1) estimate normalization statistics, (2) denoise and
    # normalize the raw ECGs, (3) cut the processed ECGs into windows.
    # All paths below are placeholders to be replaced before running.

    # (1) Per-lead mean/std estimated from up to n randomly chosen files.
    npy_directory = "ecg folder for entire database"
    n = 100000

    print('Calculating ECG means and stds........')
    ecg_means, ecg_stds = calculate_means_stds(npy_directory, n)

    # (2) Denoise every raw ECG and normalize it with the statistics above.
    raw_directory = "path/to/raw_data_directory"
    output_directory = "path/to/output_directory"

    print('Denoising and Normalizing ECGs........')
    ecg_denoising(raw_directory, output_directory, ecg_means, ecg_stds)

    # (3) Segment the processed ECGs listed in the manifest.
    data_path = output_directory
    manifest_path = "/path/to/manifest.csv"
    output_path = "path/to/output_path"
    length = 5
    steps = 1

    # The segmentation routine defined in this file is `segmentation`;
    # no `process_ecg` exists.
    segmentation(data_path, manifest_path, output_path, length, steps)
|
|
|
|