Spaces:
Sleeping
Sleeping
Marcos12886
committed on
Upload model.py
Browse files
model.py
ADDED
@@ -0,0 +1,117 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
import numpy as np
|
3 |
+
import os
|
4 |
+
from huggingface_hub import login, upload_folder
|
5 |
+
from datasets import load_dataset, Audio
|
6 |
+
from transformers.integrations import TensorBoardCallback
|
7 |
+
from transformers import (
|
8 |
+
Wav2Vec2FeatureExtractor, AutoModelForAudioClassification,
|
9 |
+
Trainer, TrainingArguments,
|
10 |
+
EarlyStoppingCallback
|
11 |
+
)
|
12 |
+
import json
|
13 |
+
# NOTE: the original model works in float32.

# Base checkpoint; to try a different model it is enough to change this id.
MODEL = "ntu-spml/distilhubert"
FEATURE_EXTRACTOR = Wav2Vec2FeatureExtractor.from_pretrained(MODEL)
# Read from the extractor (16000 per the original author's note; an earlier
# revision of this line mentioned float16).
SAMPLING_RATE = FEATURE_EXTRACTOR.sampling_rate
# Multiplied by SAMPLING_RATE in preprocess_audio to obtain the truncation length.
MAX_DURATION = 1.00

# Seed shared by seed_everything() and the train/test split.
seed = 123

# Hugging Face token taken from the environment (None when the variable is
# unset). It could be prompted for on every run instead.
access_token = os.getenv('HF_ACCESS_TOKEN')

# JSON file read by load_config(), and the two entry names selectable in the
# __main__ section.
config_file = "models_config.json"
clasificador = "class"
monitor = "mon"
|
23 |
+
|
24 |
+
def seed_everything():
    """Seed NumPy and PyTorch with the module-level ``seed`` and ask for
    deterministic cuDNN / cuBLAS behaviour.

    Takes no arguments and returns nothing; it only mutates global library
    state and one environment variable.
    """
    # Same seed for every random number generator used during training.
    for apply_seed in (np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        apply_seed(seed)

    # Trade cuDNN auto-tuning for repeatable kernel selection.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

    # Workspace setting cuBLAS reads from the environment.
    os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':16384:8'
|
31 |
+
|
32 |
+
def preprocess_audio(audio_arrays, batch=True):
    """Convert raw waveforms into feature-extractor outputs ready for the model.

    Args:
        audio_arrays: When ``batch`` is true, a mapping whose ``"audio"`` entry
            is a sequence of items each exposing the waveform under
            ``"array"`` (this is how ``load_and_prepare_dataset`` calls it
            through ``dataset.map(..., batched=True)``). When ``batch`` is
            false, a single waveform (used from realtime.py according to the
            original author).
        batch: Selects which of the two input layouts above is expected.

    Returns:
        Whatever the feature extractor returns for the given waveforms, with
        PyTorch tensors, truncated to ``SAMPLING_RATE * MAX_DURATION`` samples
        and padded to a common length.
    """
    if batch:
        audios = [sample["array"] for sample in audio_arrays["audio"]]
    else:
        audios = [audio_arrays]

    # Reuse the extractor already loaded at import time. The previous code
    # called from_pretrained(MODEL) again on every invocation, i.e. once per
    # mapped batch and once per realtime prediction.
    inputs = FEATURE_EXTRACTOR(
        raw_speech=audios,
        sampling_rate=SAMPLING_RATE,
        return_tensors="pt",  # return PyTorch tensors
        max_length=int(SAMPLING_RATE * MAX_DURATION),  # required by truncation
        truncation=True,  # much faster, per the original author
        padding=True,  # equal-length vectors
        # The original author observed no effect on the first epoch from the
        # three arguments below and was unsure they are needed.
        # NOTE(review): they look like they are absorbed by the extractor's
        # **kwargs and ignored - confirm before relying on or removing them.
        do_normalize=True,
        # return_attention_mask=True,  # made the first epoch worse (author's note)
        padding_value=0.0,
        float=32,
    )
    return inputs
|
50 |
+
|
51 |
+
def load_and_prepare_dataset(dataset_path):
    """Load a dataset, run the audio preprocessing and split it 80/20.

    Args:
        dataset_path: Identifier or path handed to ``load_dataset``.

    Returns:
        A tuple ``(splits, label2id, id2label)`` where ``splits`` is the result
        of ``train_test_split`` (the caller reads its ``"train"`` and
        ``"test"`` entries), ``label2id`` maps each label name to its index as
        a string, and ``id2label`` is the inverse mapping.
    """
    # Asking for the "train" split directly avoids an extra "train" level
    # in the returned object.
    raw = load_dataset(dataset_path, split="train")

    # NOTE: casting the "audio" column to Audio(sampling_rate=SAMPLING_RATE)
    # was tried; the author saw better accuracy but suspected it changes the
    # preprocessing, so it is left disabled.

    # num_proc is deliberately not used: per the original author it breaks
    # the realtime path.
    encoded = raw.map(preprocess_audio, batched=True, remove_columns=["audio"])

    label2id = {}
    id2label = {}
    for position, name in enumerate(encoded.features["label"].names):
        key = str(position)
        label2id[name] = key
        id2label[key] = name

    splits = encoded.train_test_split(
        test_size=0.2,
        seed=seed,
        stratify_by_column="label",
    )
    return splits, label2id, id2label
|
60 |
+
|
61 |
+
def load_model(num_labels, label2id, id2label):
    """Load the ``MODEL`` checkpoint as an audio-classification model.

    Args:
        num_labels: Number of classes.
        label2id: Mapping from label name to id.
        id2label: Mapping from id to label name.

    Returns:
        The object returned by ``AutoModelForAudioClassification.from_pretrained``.
    """
    label_settings = {
        "num_labels": num_labels,
        "label2id": label2id,
        "id2label": id2label,
    }
    return AutoModelForAudioClassification.from_pretrained(MODEL, **label_settings)
|
69 |
+
|
70 |
+
def model_params(dataset_path):
    """Log in to the Hugging Face Hub, seed the RNGs and build model + data.

    Args:
        dataset_path: Identifier or path forwarded to
            ``load_and_prepare_dataset``.

    Returns:
        A tuple ``(model, encoded_dataset, id2label)``.
    """
    # The module-level token is named ``access_token``; the previous code
    # referenced an undefined ``token`` and raised NameError here.
    login(access_token, add_to_git_credential=True)
    seed_everything()
    encoded_dataset, label2id, id2label = load_and_prepare_dataset(dataset_path)
    model = load_model(len(id2label), label2id, id2label)
    return model, encoded_dataset, id2label
|
76 |
+
|
77 |
+
def compute_metrics(eval_pred):
    """Compute classification accuracy for an evaluation pass.

    Args:
        eval_pred: Object exposing ``predictions`` (per-class scores, one row
            per example) and ``label_ids`` (the reference class indices).

    Returns:
        A dict with a single ``"accuracy"`` entry: the fraction of rows whose
        highest-scoring class equals the reference label.
    """
    predicted_classes = np.argmax(eval_pred.predictions, axis=1)
    hits = predicted_classes == eval_pred.label_ids
    accuracy = np.mean(hits)
    return {"accuracy": accuracy}
|
83 |
+
|
84 |
+
def model_training(training_args, output_dir, dataset_path):
    """Train the classifier and publish the result to the Hugging Face Hub.

    Args:
        training_args: ``TrainingArguments`` instance passed to the ``Trainer``.
        output_dir: Local folder where the model is saved; also used as the
            repository name under the ``A-POR-LOS-8000`` organization.
        dataset_path: Identifier or path of the dataset to train on.

    Returns:
        None. The effects are the trained model saved in ``output_dir`` and
        the uploads to the Hub.
    """
    model, encoded_dataset, _ = model_params(dataset_path)
    tensorboard_callback = TensorBoardCallback()
    early_stopping_callback = EarlyStoppingCallback(early_stopping_patience=3)
    trainer = Trainer(
        model=model,
        args=training_args,
        compute_metrics=compute_metrics,
        train_dataset=encoded_dataset["train"],
        eval_dataset=encoded_dataset["test"],
        callbacks=[tensorboard_callback, early_stopping_callback],
    )
    torch.cuda.empty_cache()  # free cached GPU memory before training
    trainer.train()  # arguments can be supplied here to resume a previous run

    # ``access_token`` is the module-level token; the previous code used an
    # undefined ``token`` in both upload calls, which raised NameError only
    # after the whole training run had finished.
    # Upload to the personal account. The original author notes this was
    # needed for prediction to work, without knowing why.
    trainer.push_to_hub(token=access_token)

    # Make sure the target folder exists *before* saving into it.
    os.makedirs(output_dir, exist_ok=True)
    trainer.save_model(output_dir)

    # Upload the saved folder to the organization repository.
    upload_folder(
        repo_id=f"A-POR-LOS-8000/{output_dir}",
        folder_path=output_dir,
        token=access_token,
    )
|
102 |
+
|
103 |
+
def load_config(model_name):
    """Read one model's settings from ``config_file``.

    Args:
        model_name: Top-level key of the wanted entry in the JSON file.

    Returns:
        The entry as a dict, with its ``"training_args"`` value replaced by a
        ``TrainingArguments`` object built from the stored keyword arguments.

    Raises:
        KeyError: If ``model_name`` or ``"training_args"`` is missing.
    """
    with open(config_file, 'r') as handle:
        entry = json.load(handle)[model_name]
    entry["training_args"] = TrainingArguments(**entry["training_args"])
    return entry
|
110 |
+
|
111 |
+
if __name__ == "__main__":
    # Choose which configured model to train: pass `clasificador` or
    # `monitor` to load_config.
    config = load_config(clasificador)
    training_args, output_dir, dataset_path = (
        config[key] for key in ("training_args", "output_dir", "dataset_path")
    )
    model_training(training_args, output_dir, dataset_path)
|