"""Fine-tune an audio-classification checkpoint and publish it to the Hugging Face Hub.

The script reads one entry of ``models_config.json`` (selected by model name),
builds ``TrainingArguments`` from it, trains on the configured dataset and
uploads the result both to the user's account and to an organisation repo.
"""
import json
import os

import numpy as np
import torch
from datasets import Audio, load_dataset
from huggingface_hub import login, upload_folder
from transformers import (
    AutoModelForAudioClassification,
    EarlyStoppingCallback,
    Trainer,
    TrainingArguments,
    Wav2Vec2FeatureExtractor,
)
from transformers.integrations import TensorBoardCallback

# The original model uses float32.
MODEL = "ntu-spml/distilhubert"  # base checkpoint; change this to use another model
FEATURE_EXTRACTOR = Wav2Vec2FeatureExtractor.from_pretrained(MODEL)
seed = 123
MAX_DURATION = 1.00  # seconds of audio kept per example (see max_length below)
SAMPLING_RATE = FEATURE_EXTRACTOR.sampling_rate  # 16000 per the original author's note
# (previously float16)
# Hub token read from the environment; could instead be prompted for on each run.
access_token = os.getenv('HF_ACCESS_TOKEN')
config_file = "models_config.json"
clasificador = "class"  # config key of the classifier model
monitor = "mon"  # config key of the monitor model


def seed_everything() -> None:
    """Seed NumPy and PyTorch with ``seed`` and request deterministic cuDNN/cuBLAS."""
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':16384:8'


def preprocess_audio(audio_arrays, batch=True):
    """Run the feature extractor over raw audio.

    Args:
        audio_arrays: When ``batch`` is true, a batch mapping whose ``"audio"``
            entry is a sequence of dicts each holding an ``"array"`` (the shape
            used by ``Dataset.map`` in this file). Otherwise a single raw
            waveform (the original comment says this path is used from
            ``realtime.py``).
        batch: Selects which of the two input layouts above is expected.

    Returns:
        The feature extractor's output, as PyTorch tensors.
    """
    if batch:
        audios = [x["array"] for x in audio_arrays["audio"]]  # dataset.map path
    else:
        audios = [audio_arrays]  # single-waveform path

    # Reuse the module-level extractor instead of reloading it from the
    # checkpoint on every call; it is the same MODEL, so output is unchanged.
    inputs = FEATURE_EXTRACTOR(
        raw_speech=audios,
        sampling_rate=SAMPLING_RATE,
        return_tensors="pt",  # return PyTorch tensors
        max_length=int(SAMPLING_RATE * MAX_DURATION),  # required for truncation
        truncation=True,  # much faster
        padding=True,  # equal-length vectors
        # NOTE(review): the author observed that the three options below do not
        # change first-epoch results; they look like constructor-level settings
        # rather than call-time ones -- confirm they are honoured here.
        do_normalize=True,
        # return_attention_mask=True was tried and made the first epoch worse.
        padding_value=0.0,
        float=32,
    )
    return inputs


def load_and_prepare_dataset(dataset_path):
    """Load the dataset, extract features and make a stratified 80/20 split.

    Args:
        dataset_path: Path or Hub identifier passed to ``load_dataset``.

    Returns:
        A tuple ``(encoded_dataset, label2id, id2label)`` where
        ``encoded_dataset`` has ``"train"`` and ``"test"`` splits and the two
        mappings translate between label names and stringified indices.
    """
    # Request the "train" split explicitly so a single Dataset is returned.
    dataset = load_dataset(dataset_path, split="train")
    # NOTE(review): resampling with
    #   dataset.cast_column("audio", Audio(sampling_rate=SAMPLING_RATE))
    # was disabled by the author: it gave better accuracy but was believed to
    # change the preprocessing. Without it, the audio is assumed to already be
    # at SAMPLING_RATE -- confirm.
    # num_proc is deliberately not used: the author reports it breaks realtime use.
    encoded_dataset = dataset.map(
        preprocess_audio, remove_columns=["audio"], batched=True
    )

    labels = encoded_dataset.features["label"].names
    label2id = {label: str(i) for i, label in enumerate(labels)}
    id2label = {str(i): label for i, label in enumerate(labels)}

    encoded_dataset = encoded_dataset.train_test_split(
        test_size=0.2, seed=seed, stratify_by_column="label"
    )
    return encoded_dataset, label2id, id2label


def load_model(num_labels, label2id, id2label):
    """Load ``MODEL`` with a classification head configured for the given labels."""
    model = AutoModelForAudioClassification.from_pretrained(
        MODEL,
        num_labels=num_labels,
        label2id=label2id,
        id2label=id2label,
    )
    return model


def model_params(dataset_path):
    """Log in to the Hub, seed the RNGs, and build the dataset and model.

    Args:
        dataset_path: Path or Hub identifier of the dataset to train on.

    Returns:
        A tuple ``(model, encoded_dataset, id2label)``.
    """
    # Fix: the original passed an undefined name `token` (NameError); the
    # token read from the environment is stored in `access_token`.
    login(access_token, add_to_git_credential=True)
    seed_everything()
    encoded_dataset, label2id, id2label = load_and_prepare_dataset(dataset_path)
    model = load_model(len(id2label), label2id, id2label)
    return model, encoded_dataset, id2label


def compute_metrics(eval_pred):
    """Return accuracy as the fraction of argmax predictions equal to the labels."""
    predictions = np.argmax(eval_pred.predictions, axis=1)
    references = eval_pred.label_ids
    return {
        "accuracy": np.mean(predictions == references),
    }


def model_training(training_args, output_dir, dataset_path):
    """Train the model, save it locally and upload it to the Hub.

    Args:
        training_args: ``TrainingArguments`` instance for the ``Trainer``.
        output_dir: Local directory for the saved model; also used as the
            repository name under the ``A-POR-LOS-8000`` organisation.
        dataset_path: Path or Hub identifier of the dataset to train on.
    """
    model, encoded_dataset, _ = model_params(dataset_path)

    tensorboard_callback = TensorBoardCallback()
    early_stopping_callback = EarlyStoppingCallback(early_stopping_patience=3)

    trainer = Trainer(
        model=model,
        args=training_args,
        compute_metrics=compute_metrics,
        train_dataset=encoded_dataset["train"],
        eval_dataset=encoded_dataset["test"],
        callbacks=[tensorboard_callback, early_stopping_callback],
    )

    torch.cuda.empty_cache()  # free cached GPU memory before training
    trainer.train()  # arguments can be adjusted here to resume training

    # Fix: `token` was undefined in the original; use `access_token`.
    # Push to the personal account. The author found this necessary for
    # prediction to work, for reasons not understood.
    trainer.push_to_hub(token=access_token)

    # Make sure the target folder exists before writing into it (the original
    # created it only after saving, where it had no effect).
    os.makedirs(output_dir, exist_ok=True)
    trainer.save_model(output_dir)

    # Upload the saved folder to the organisation repository.
    upload_folder(
        repo_id=f"A-POR-LOS-8000/{output_dir}",
        folder_path=output_dir,
        token=access_token,
    )


def load_config(model_name):
    """Read the entry for ``model_name`` from ``config_file``.

    The entry's ``"training_args"`` dict is replaced by a ``TrainingArguments``
    instance built from it.

    Args:
        model_name: Top-level key in the JSON configuration file.

    Returns:
        The configuration dict for ``model_name``.

    Raises:
        KeyError: If ``model_name`` or ``"training_args"`` is missing.
    """
    with open(config_file, 'r', encoding="utf-8") as f:
        config = json.load(f)
    model_config = config[model_name]
    training_args = TrainingArguments(**model_config["training_args"])
    model_config["training_args"] = training_args
    return model_config


if __name__ == "__main__":
    # Switch models by choosing which config key is loaded here.
    config = load_config(clasificador)
    # config = load_config(monitor)
    training_args = config["training_args"]
    output_dir = config["output_dir"]
    dataset_path = config["dataset_path"]
    model_training(training_args, output_dir, dataset_path)