Antoine-caubriere committed
Commit b128f21
Parent(s): 39d1b9d
add notebook finetuning on FLEURS
Files changed:
- ASR_FLEURS-swahili_hf.yaml (+190, -0)
- SB_ASR_finetuning_notebook.ipynb (+689, -0)
ASR_FLEURS-swahili_hf.yaml
ADDED
@@ -0,0 +1,190 @@
# Seed needs to be set at top of yaml, before objects with parameters are made
seed: 1986
__set_seed: !apply:torch.manual_seed [!ref <seed>]

lang_csv: Swahili

output_folder: !ref results/finetune_hubert_ASR_char/<seed>/<lang_csv>
output_wer_folder: !ref <output_folder>/
save_folder: !ref <output_folder>/save
train_log: !ref <output_folder>/train_log.txt

# huggingface format
hubert_hub: Orange/SSA-HuBERT-base-60k

hubert_folder: !ref <save_folder>/hubert_checkpoint

# Data files
data_folder: !ref PATH_TO_YOUR_FOLDER/data_speechbrain/<lang_csv>

ckpt_interval_minutes: 10 # save checkpoint every N min
train_csv: !ref <data_folder>/train.csv
valid_csv: !ref <data_folder>/validation.csv
test_csv:
    - !ref <data_folder>/test.csv

####################### Training Parameters ####################################

number_of_epochs: 10
lr: 0.1
lr_hubert: 0.000005
sorting: ascending
precision: fp32 # bf16, fp16 or fp32
sample_rate: 16000

# skip audio files longer than this many seconds
avoid_if_longer_than: 60

batch_size: 2
test_batch_size: 2

# Dataloader options
train_dataloader_opts:
    batch_size: !ref <batch_size>

valid_dataloader_opts:
    batch_size: !ref <batch_size>

test_dataloader_opts:
    batch_size: !ref <test_batch_size>

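# Editor's note: because sorting is ascending (by duration), the notebook below
# sets shuffle: False on the train dataloader; shuffling would undo the sorting.
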
####################### Model Parameters #######################################
activation: !name:torch.nn.LeakyReLU
dnn_layers: 2
dnn_neurons: 1024
freeze_hubert: False

# Outputs
output_neurons: 66 # size of the character inventory, incl. special tokens; index(blank) = 0
blank_index: 0

#
# Functions and classes
#

label_encoder: !new:speechbrain.dataio.encoder.CTCTextEncoder

epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter
    limit: !ref <number_of_epochs>

hubert: !new:speechbrain.lobes.models.huggingface_transformers.hubert.HuBERT
    source: !ref <hubert_hub>
    output_norm: True
    freeze: !ref <freeze_hubert>
    save_path: !ref <hubert_folder>

top_lin: !new:speechbrain.lobes.models.VanillaNN.VanillaNN
    input_shape: [null, null, 768] # 768 == output of hubert base model
    activation: !ref <activation>
    dnn_blocks: !ref <dnn_layers>
    dnn_neurons: !ref <dnn_neurons>

ctc_lin: !new:speechbrain.nnet.linear.Linear
    input_size: !ref <dnn_neurons>
    n_neurons: !ref <output_neurons>

log_softmax: !new:speechbrain.nnet.activations.Softmax
    apply_log: True

ctc_cost: !name:speechbrain.nnet.losses.ctc_loss
    blank_index: !ref <blank_index>

modules:
    hubert: !ref <hubert>
    top_lin: !ref <top_lin>
    ctc_lin: !ref <ctc_lin>

model: !new:torch.nn.ModuleList
    - [!ref <top_lin>, !ref <ctc_lin>]

model_opt_class: !name:torch.optim.Adadelta
    lr: !ref <lr>
    rho: 0.95
    eps: 1.e-8

hubert_opt_class: !name:torch.optim.Adam
    lr: !ref <lr_hubert>

lr_annealing_model: !new:speechbrain.nnet.schedulers.NewBobScheduler
    initial_value: !ref <lr>
    improvement_threshold: 0.0025
    annealing_factor: 0.8
    patient: 0

lr_annealing_hubert: !new:speechbrain.nnet.schedulers.NewBobScheduler
    initial_value: !ref <lr_hubert>
    improvement_threshold: 0.0025
    annealing_factor: 0.9
    patient: 0

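# Editor's note: NewBobScheduler multiplies the current learning rate by
# annealing_factor whenever the relative improvement of the validation loss
# falls below improvement_threshold, so the HuBERT encoder and the top layers
# anneal independently with the factors above.
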
############################## Augmentations ###################################

# Speed perturbation
speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb
    orig_freq: !ref <sample_rate>
    speeds: [95, 100, 105]

# Frequency drop: randomly drops a number of frequency bands to zero.
drop_freq: !new:speechbrain.augment.time_domain.DropFreq
    drop_freq_low: 0
    drop_freq_high: 1
    drop_freq_count_low: 1
    drop_freq_count_high: 3
    drop_freq_width: 0.05

# Time drop: randomly drops a number of temporal chunks.
drop_chunk: !new:speechbrain.augment.time_domain.DropChunk
    drop_length_low: 1000
    drop_length_high: 2000
    drop_count_low: 1
    drop_count_high: 5

# Augmenter: Combines previously defined augmentations to perform data augmentation
wav_augment: !new:speechbrain.augment.augmenter.Augmenter
    concat_original: True
    min_augmentations: 4
    max_augmentations: 4
    augment_prob: 1.0
    augmentations: [
        !ref <speed_perturb>,
        !ref <drop_freq>,
        !ref <drop_chunk>]

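# Editor's note: with concat_original: True the augmented views are concatenated
# to the clean batch along the batch dimension; the notebook's compute_objectives
# therefore replicates the labels with replicate_multiple_labels before the CTC loss.
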
############################## Decoding ########################################

# Decoding parameters
test_beam_search:
    beam_size: 143
    topk: 1
    blank_index: !ref <blank_index>
    space_token: ' ' # make sure this is the same as the one used in the tokenizer
    beam_prune_logp: -12.0
    token_prune_min_logp: -1.20
    prune_history: True
    alpha: 0.8
    beta: 1.2
    # An LM can be downloaded from https://www.openslr.org/11/ or trained with KenLM.
    # It can be either a .bin or a .arpa file; note that .arpa is much slower to load.
    # If you don't want to use an LM, comment it out or set it to null.
    kenlm_model_path: null

############################## Logging and Pretrainer ##########################

checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
    checkpoints_dir: !ref <save_folder>
    recoverables:
        hubert: !ref <hubert>
        model: !ref <model>
        scheduler_model: !ref <lr_annealing_model>
        scheduler_hubert: !ref <lr_annealing_hubert>
        counter: !ref <epoch_counter>
        tokenizer: !ref <label_encoder>

train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
    save_file: !ref <train_log>

error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats

cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats
    split_tokens: True

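Editor's note: if you change corpora, output_neurons above must match the character inventory of your data. A minimal sketch (the path is illustrative; the +2 accounts for the blank and <unk> symbols that the notebook's CTCTextEncoder adds):

import csv

chars = set()
with open("PATH_TO_YOUR_FOLDER/data_speechbrain/Swahili/train.csv", newline="") as f:
    for row in csv.DictReader(f):
        chars.update(row["wrd"])

print("suggested output_neurons:", len(chars) + 2)
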
SB_ASR_finetuning_notebook.ipynb
ADDED
@@ -0,0 +1,689 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "49b85514-0fb6-49c6-be76-259bfeb638c6",
   "metadata": {},
   "source": [
    "# Introduction\n",
    "Feel free to contact us if you have any questions: antoine.caubriere@orange.com & elodie.gauthier@orange.com\n",
    "\n",
    "Remember to update all of the PATH values in the ASR_FLEURS-swahili_hf.yaml configuration file and in the Python code below (PATH_TO_YOUR_FOLDER).\n",
    "\n",
    "If you switch corpora (another FLEURS subset / your own datasets), remember to adjust the size of the model's output layer: ASR_FLEURS-swahili_hf.yaml/output_neurons\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e62faa86-911a-48ce-82bc-8a34e13ffbc4",
   "metadata": {},
   "source": [
    "# FLEURS data preparation"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c6ccf4a5-cad1-4632-8954-f4e454ff3540",
   "metadata": {},
   "source": [
    "### 1. Installing dependencies"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7bb8b44e-826f-4f13-b128-eebbd18dedc5",
   "metadata": {
    "jupyter": {
     "source_hidden": true
    }
   },
   "outputs": [],
   "source": [
    "%pip install datasets librosa soundfile"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "016d7646-bcca-4422-8b28-9d12d4b86c8f",
   "metadata": {},
   "source": [
    "### 2. Downloading and formatting the dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "da273973-05ee-4de5-830e-34d7f2220353",
   "metadata": {},
   "outputs": [],
   "source": [
    "from datasets import load_dataset\n",
    "from pathlib import Path\n",
    "from collections import OrderedDict\n",
    "from tqdm import tqdm\n",
    "import shutil\n",
    "import os\n",
    "\n",
    "dataset_write_base = \"PATH_TO_YOUR_FOLDER/data_speechbrain\"\n",
    "cache_dir = \"PATH_TO_YOUR_FOLDER/data_huggingface\"\n",
    "\n",
    "if os.path.isdir(cache_dir):\n",
    "    print(\"rm -rf \" + cache_dir)\n",
    "    os.system(\"rm -rf \" + cache_dir)\n",
    "\n",
    "if os.path.isdir(dataset_write_base):\n",
    "    print(\"rm -rf \" + dataset_write_base)\n",
    "    os.system(\"rm -rf \" + dataset_write_base)\n",
    "\n",
    "# **************************************\n",
    "# choice of languages to extract from FLEURS\n",
    "# **************************************\n",
    "lang_dict = OrderedDict([\n",
    "    #(\"Afrikaans\", \"af_za\"),\n",
    "    #(\"Amharic\", \"am_et\"),\n",
    "    #(\"Fula\", \"ff_sn\"),\n",
    "    #(\"Ganda\", \"lg_ug\"),\n",
    "    #(\"Hausa\", \"ha_ng\"),\n",
    "    #(\"Igbo\", \"ig_ng\"),\n",
    "    #(\"Kamba\", \"kam_ke\"),\n",
    "    #(\"Lingala\", \"ln_cd\"),\n",
    "    #(\"Luo\", \"luo_ke\"),\n",
    "    #(\"Northern-Sotho\", \"nso_za\"),\n",
    "    #(\"Nyanja\", \"ny_mw\"),\n",
    "    #(\"Oromo\", \"om_et\"),\n",
    "    #(\"Shona\", \"sn_zw\"),\n",
    "    #(\"Somali\", \"so_so\"),\n",
    "    (\"Swahili\", \"sw_ke\"),\n",
    "    #(\"Umbundu\", \"umb_ao\"),\n",
    "    #(\"Wolof\", \"wo_sn\"),\n",
    "    #(\"Xhosa\", \"xh_za\"),\n",
    "    #(\"Yoruba\", \"yo_ng\"),\n",
    "    #(\"Zulu\", \"zu_za\")\n",
    "])\n",
    "\n",
    "# ********************************\n",
    "# choice of splits to process\n",
    "# ********************************\n",
    "datasets = [\"train\", \"test\", \"validation\"]\n",
    "\n",
    "for lang in lang_dict:\n",
    "    print(\"Prepare --->\", lang)\n",
    "\n",
    "    # ********************************\n",
    "    # Download FLEURS from huggingface\n",
    "    # ********************************\n",
    "    fleurs_asr = load_dataset(\"google/fleurs\", lang_dict[lang], cache_dir=cache_dir, trust_remote_code=True)\n",
    "\n",
    "    for subparts in datasets:\n",
    "\n",
    "        used_ID = []\n",
    "        Path(dataset_write_base + \"/\" + lang + \"/wavs/\" + subparts).mkdir(parents=True, exist_ok=True)\n",
    "\n",
    "        # csv header\n",
    "        f = open(dataset_write_base + \"/\" + lang + \"/\" + subparts + \".csv\", \"w\")\n",
    "        f.write(\"ID,duration,wav,spk_id,wrd\\n\")\n",
    "\n",
    "        for uid in tqdm(range(len(fleurs_asr[subparts]))):\n",
    "\n",
    "            # ***************\n",
    "            # format CSV line\n",
    "            # ***************\n",
    "            text_id = lang + \"_\" + str(fleurs_asr[subparts][uid][\"id\"])\n",
    "\n",
    "            # some IDs are duplicated (same speaker, same transcription BUT a different recording)\n",
    "            while text_id in used_ID:\n",
    "                text_id += \"_bis\"\n",
    "            used_ID.append(text_id)\n",
    "\n",
    "            duration = \"{:.3f}\".format(round(float(fleurs_asr[subparts][uid][\"num_samples\"]) / float(fleurs_asr[subparts][uid][\"audio\"][\"sampling_rate\"]), 3))\n",
    "            wav_path = \"/\".join([dataset_write_base, lang, \"wavs\", subparts, fleurs_asr[subparts][uid][\"audio\"][\"path\"].split('/')[-1]])\n",
    "            spk_id = \"spk_\" + text_id\n",
    "            # AC: \"pseudo-normalization\" of edge cases -- TODO: handle this more thoroughly\n",
    "            wrd = fleurs_asr[subparts][uid][\"transcription\"].replace(',', '').replace('$', ' $ ').replace('\"', '').replace('”', '').replace('  ', ' ')\n",
    "\n",
    "            # **************\n",
    "            # write CSV line\n",
    "            # **************\n",
    "            f.write(text_id + \",\" + duration + \",\" + wav_path + \",\" + spk_id + \",\" + wrd + \"\\n\")\n",
    "\n",
    "            # *******************\n",
    "            # Move wav from cache\n",
    "            # *******************\n",
    "            previous_path = \"/\".join(fleurs_asr[subparts][uid][\"path\"].split('/')[:-1]) + \"/\" + fleurs_asr[subparts][uid][\"audio\"][\"path\"]\n",
    "            new_path = \"/\".join([dataset_write_base, lang, \"wavs\", subparts, fleurs_asr[subparts][uid][\"audio\"][\"path\"].split('/')[-1]])\n",
    "            shutil.move(previous_path, new_path)\n",
    "\n",
    "        f.close()\n",
    "    print(\"--->\", lang, \"done\")"
   ]
  },
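  {
   "cell_type": "markdown",
   "id": "editor-note-0001",
   "metadata": {},
   "source": [
    "*Editor's addition:* a minimal, optional sanity check of the generated CSV files. This is a sketch only; `pandas` and the path below are assumptions, not part of the original recipe.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "editor-code-0001",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Editor's sketch (assumption: pandas is installed and the Swahili CSV exists)\n",
    "import pandas as pd\n",
    "\n",
    "df = pd.read_csv(\"PATH_TO_YOUR_FOLDER/data_speechbrain/Swahili/train.csv\")\n",
    "print(df.head(3))\n",
    "print(\"utterances:\", len(df), \"| total hours:\", round(df[\"duration\"].sum() / 3600, 2))\n"
   ]
  },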
  {
   "cell_type": "markdown",
   "id": "4c32e369-f0f9-4695-8c9a-aa3a9de7bf7b",
   "metadata": {},
   "source": [
    "# ASR recipe"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "77fb2c55-3f8c-4f34-81f0-ad48a632e010",
   "metadata": {
    "jp-MarkdownHeadingCollapsed": true
   },
   "source": [
    "## 1. Installing dependencies"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fbe25635-e765-480c-8416-c48a31ee6140",
   "metadata": {},
   "outputs": [],
   "source": [
    "%pip install torch==2.2.2 torchaudio==2.2.2 torchvision==0.17.2 speechbrain transformers jdc"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6acf1f8c-2cf3-4c9c-8a45-e2580ecbee27",
   "metadata": {},
   "source": [
    "## 2. Setting up the SpeechBrain recipe -- the Brain class"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d5e8884d-3542-40ff-a454-597078fcf97c",
   "metadata": {},
   "source": [
    "### 2.1 Imports & logger"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6c677f9f-6abe-423f-b4dd-fdf5ded357cd",
   "metadata": {},
   "outputs": [],
   "source": [
    "import logging\n",
    "import os\n",
    "import sys\n",
    "from pathlib import Path\n",
    "\n",
    "import torch\n",
    "from hyperpyyaml import load_hyperpyyaml\n",
    "\n",
    "import speechbrain as sb\n",
    "from speechbrain.utils.distributed import if_main_process, run_on_main\n",
    "\n",
    "import jdc\n",
    "\n",
    "logger = logging.getLogger(__name__)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9698bb92-16ad-4b61-8938-c74b62ee93b2",
   "metadata": {},
   "source": [
    "### 2.2 Creating our class, which inherits from the Brain class"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7c7cd624-6249-449b-8ee9-d4a73b7b3301",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Define training procedure\n",
    "class MY_SSA_ASR(sb.Brain):\n",
    "    pass  # the methods are attached below, cell by cell, via jdc's %%add_to"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ecf31c9c-15dd-4428-aa10-b3cc5e127f0d",
   "metadata": {},
   "source": [
    "### 2.3 Defining the forward function"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4368b488-b9d8-49ff-8ce3-78a12d46be83",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%add_to MY_SSA_ASR\n",
    "def compute_forward(self, batch, stage):\n",
    "    \"\"\"Forward computations from the waveform batches to the output probabilities.\"\"\"\n",
    "    batch = batch.to(self.device)\n",
    "    wavs, wav_lens = batch.sig\n",
    "    wavs, wav_lens = wavs.to(self.device), wav_lens.to(self.device)\n",
    "\n",
    "    # Downsample the inputs if specified\n",
    "    if hasattr(self.modules, \"downsampler\"):\n",
    "        wavs = self.modules.downsampler(wavs)\n",
    "\n",
    "    # Add waveform augmentation if specified.\n",
    "    if stage == sb.Stage.TRAIN and hasattr(self.hparams, \"wav_augment\"):\n",
    "        wavs, wav_lens = self.hparams.wav_augment(wavs, wav_lens)\n",
    "\n",
    "    # Forward pass\n",
    "    feats = self.modules.hubert(wavs, wav_lens)\n",
    "    x = self.modules.top_lin(feats)\n",
    "\n",
    "    # Compute outputs\n",
    "    logits = self.modules.ctc_lin(x)\n",
    "    p_ctc = self.hparams.log_softmax(logits)\n",
    "\n",
    "    p_tokens = None\n",
    "    if stage == sb.Stage.VALID:\n",
    "        p_tokens = sb.decoders.ctc_greedy_decode(p_ctc, wav_lens, blank_id=self.hparams.blank_index)\n",
    "\n",
    "    elif stage == sb.Stage.TEST:\n",
    "        p_tokens = test_searcher(p_ctc, wav_lens)\n",
    "\n",
    "        candidates = []\n",
    "        scores = []\n",
    "\n",
    "        # \"topk_hyps\" renamed from \"batch\" to avoid shadowing the input batch\n",
    "        for topk_hyps in p_tokens:\n",
    "            candidates.append([hyp.text for hyp in topk_hyps])\n",
    "            scores.append([hyp.score for hyp in topk_hyps])\n",
    "\n",
    "        if hasattr(self.hparams, \"rescorer\"):\n",
    "            p_tokens, _ = self.hparams.rescorer.rescore(candidates, scores)\n",
    "\n",
    "    return p_ctc, wav_lens, p_tokens\n"
   ]
  },
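  {
   "cell_type": "markdown",
   "id": "editor-note-0002",
   "metadata": {},
   "source": [
    "*Editor's addition:* a self-contained sketch of what `ctc_greedy_decode` returns, run on dummy log-probabilities (shapes follow the recipe: `[batch, time, output_neurons]`). The tensors are illustrative, not real model outputs.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "editor-code-0002",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Editor's sketch: greedy CTC decoding on random log-probabilities\n",
    "import torch\n",
    "import speechbrain as sb\n",
    "\n",
    "dummy_p_ctc = torch.randn(2, 50, 66).log_softmax(dim=-1)  # [batch, time, vocab]\n",
    "rel_lens = torch.tensor([1.0, 0.8])  # relative lengths, as in batch.sig\n",
    "hyps = sb.decoders.ctc_greedy_decode(dummy_p_ctc, rel_lens, blank_id=0)\n",
    "print(hyps[0][:10])  # token indices, with blanks and repetitions removed\n"
   ]
  },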
  {
   "cell_type": "markdown",
   "id": "f0052b79-5a27-4c4c-8601-7ab064e8c951",
   "metadata": {},
   "source": [
    "### 2.4 Defining the objectives function"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3608aee8-c9c3-4e34-98bc-667513fa7f7b",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%add_to MY_SSA_ASR\n",
    "def compute_objectives(self, predictions, batch, stage):\n",
    "    \"\"\"Computes the CTC loss given predictions and targets.\"\"\"\n",
    "\n",
    "    p_ctc, wav_lens, predicted_tokens = predictions\n",
    "\n",
    "    ids = batch.id\n",
    "    tokens, tokens_lens = batch.tokens\n",
    "\n",
    "    # Labels must be extended if parallel augmentation or concatenated\n",
    "    # augmentation was performed on the input (increasing the time dimension)\n",
    "    if stage == sb.Stage.TRAIN and hasattr(self.hparams, \"wav_augment\"):\n",
    "        (tokens, tokens_lens) = self.hparams.wav_augment.replicate_multiple_labels(tokens, tokens_lens)\n",
    "\n",
    "    # Compute loss\n",
    "    loss = self.hparams.ctc_cost(p_ctc, tokens, wav_lens, tokens_lens)\n",
    "\n",
    "    if stage == sb.Stage.VALID:\n",
    "        # Decode token terms to words\n",
    "        predicted_words = [\"\".join(self.tokenizer.decode_ndim(utt_seq)).split(\" \") for utt_seq in predicted_tokens]\n",
    "\n",
    "    elif stage == sb.Stage.TEST:\n",
    "        predicted_words = [hyp[0].text.split(\" \") for hyp in predicted_tokens]\n",
    "\n",
    "    if stage != sb.Stage.TRAIN:\n",
    "        target_words = [wrd.split(\" \") for wrd in batch.wrd]\n",
    "        self.wer_metric.append(ids, predicted_words, target_words)\n",
    "        self.cer_metric.append(ids, predicted_words, target_words)\n",
    "\n",
    "    return loss\n"
   ]
  },
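  {
   "cell_type": "markdown",
   "id": "editor-note-0003",
   "metadata": {},
   "source": [
    "*Editor's addition:* a tiny illustration of the `ErrorRateStats` metric behind `wer_metric`/`cer_metric` above; the words are made-up examples, not recipe data.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "editor-code-0003",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Editor's sketch: how the WER metric accumulates and summarizes\n",
    "from speechbrain.utils.metric_stats import ErrorRateStats\n",
    "\n",
    "wer = ErrorRateStats()\n",
    "wer.append(ids=[\"utt1\"], predict=[[\"habari\", \"ya\", \"dunia\"]], target=[[\"habari\", \"za\", \"dunia\"]])\n",
    "print(\"WER:\", wer.summarize(\"error_rate\"))  # 1 substitution over 3 words\n"
   ]
  },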
  {
   "cell_type": "markdown",
   "id": "9a514c50-89ad-41cb-882a-23daf829a538",
   "metadata": {},
   "source": [
    "### 2.5 Defining the behavior at the start of a \"stage\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "609814ce-3ef0-4818-a70f-cadc293c9dd2",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%add_to MY_SSA_ASR\n",
    "# stage handling\n",
    "def on_stage_start(self, stage, epoch):\n",
    "    \"\"\"Gets called at the beginning of each epoch\"\"\"\n",
    "    if stage != sb.Stage.TRAIN:\n",
    "        self.cer_metric = self.hparams.cer_computer()\n",
    "        self.wer_metric = self.hparams.error_rate_computer()\n",
    "\n",
    "    if stage == sb.Stage.TEST:\n",
    "        if hasattr(self.hparams, \"rescorer\"):\n",
    "            self.hparams.rescorer.move_rescorers_to_device()\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "55929209-c94a-4f8b-8f2e-9dd5d9de8be9",
   "metadata": {},
   "source": [
    "### 2.6 Defining the behavior at the end of a \"stage\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8f297542-10d5-47bf-9938-c141f5a99ab8",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%add_to MY_SSA_ASR\n",
    "def on_stage_end(self, stage, stage_loss, epoch):\n",
    "    \"\"\"Gets called at the end of an epoch.\"\"\"\n",
    "    # Compute/store important stats\n",
    "    stage_stats = {\"loss\": stage_loss}\n",
    "    if stage == sb.Stage.TRAIN:\n",
    "        self.train_stats = stage_stats\n",
    "    else:\n",
    "        stage_stats[\"CER\"] = self.cer_metric.summarize(\"error_rate\")\n",
    "        stage_stats[\"WER\"] = self.wer_metric.summarize(\"error_rate\")\n",
    "\n",
    "    # Perform end-of-iteration things, like annealing, logging, etc.\n",
    "    if stage == sb.Stage.VALID:\n",
    "        # *******************************\n",
    "        # Anneal and update Learning Rate\n",
    "        # *******************************\n",
    "        old_lr_model, new_lr_model = self.hparams.lr_annealing_model(stage_stats[\"loss\"])\n",
    "        old_lr_hubert, new_lr_hubert = self.hparams.lr_annealing_hubert(stage_stats[\"loss\"])\n",
    "        sb.nnet.schedulers.update_learning_rate(self.model_optimizer, new_lr_model)\n",
    "        sb.nnet.schedulers.update_learning_rate(self.hubert_optimizer, new_lr_hubert)\n",
    "\n",
    "        # ***************\n",
    "        # Log information\n",
    "        # ***************\n",
    "        self.hparams.train_logger.log_stats(stats_meta={\"epoch\": epoch, \"lr_model\": old_lr_model, \"lr_hubert\": old_lr_hubert}, train_stats=self.train_stats, valid_stats=stage_stats)\n",
    "\n",
    "        # ***************\n",
    "        # Save checkpoint\n",
    "        # ***************\n",
    "        self.checkpointer.save_and_keep_only(meta={\"WER\": stage_stats[\"WER\"]}, min_keys=[\"WER\"])\n",
    "\n",
    "    elif stage == sb.Stage.TEST:\n",
    "        self.hparams.train_logger.log_stats(stats_meta={\"Epoch loaded\": self.hparams.epoch_counter.current}, test_stats=stage_stats)\n",
    "        if if_main_process():\n",
    "            with open(self.hparams.test_wer_file, \"w\") as w:\n",
    "                self.wer_metric.write_stats(w)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0c656457-6b61-4316-8199-70021f92babf",
   "metadata": {},
   "source": [
    "### 2.7 Defining the optimizer initialization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "da8d9cb5-c5ad-4e78-83d3-e129e138a741",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%add_to MY_SSA_ASR\n",
    "def init_optimizers(self):\n",
    "    \"\"\"Initializes the hubert optimizer and model optimizer\"\"\"\n",
    "    self.hubert_optimizer = self.hparams.hubert_opt_class(self.modules.hubert.parameters())\n",
    "    self.model_optimizer = self.hparams.model_opt_class(self.hparams.model.parameters())\n",
    "\n",
    "    # save the optimizers in a dictionary\n",
    "    # the key will be used in `freeze_optimizers()`\n",
    "    self.optimizers_dict = {\"model_optimizer\": self.model_optimizer}\n",
    "    if not self.hparams.freeze_hubert:\n",
    "        self.optimizers_dict[\"hubert_optimizer\"] = self.hubert_optimizer\n",
    "\n",
    "    if self.checkpointer is not None:\n",
    "        self.checkpointer.add_recoverable(\"hubert_opt\", self.hubert_optimizer)\n",
    "        self.checkpointer.add_recoverable(\"model_opt\", self.model_optimizer)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "cf2e730c-2faa-41f2-b98d-e5fbb2305cc2",
   "metadata": {},
   "source": [
    "## 3. Defining dataset loading"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c5e667f7-6269-4b49-88bb-5e431762c8fe",
   "metadata": {},
   "outputs": [],
   "source": [
    "def dataio_prepare(hparams):\n",
    "    \"\"\"This function prepares the datasets to be used in the brain class.\n",
    "    It also defines the data processing pipeline through user-defined functions.\n",
    "    \"\"\"\n",
    "\n",
    "    # **************\n",
    "    # Load CSV files\n",
    "    # **************\n",
    "    data_folder = hparams[\"data_folder\"]\n",
    "\n",
    "    train_data = sb.dataio.dataset.DynamicItemDataset.from_csv(csv_path=hparams[\"train_csv\"], replacements={\"data_root\": data_folder})\n",
    "    # we sort training data to speed up training and get better results.\n",
    "    train_data = train_data.filtered_sorted(sort_key=\"duration\")\n",
    "    hparams[\"train_dataloader_opts\"][\"shuffle\"] = False  # when sorting, do not shuffle in the dataloader; otherwise sorting is pointless\n",
    "\n",
    "    valid_data = sb.dataio.dataset.DynamicItemDataset.from_csv(csv_path=hparams[\"valid_csv\"], replacements={\"data_root\": data_folder})\n",
    "    valid_data = valid_data.filtered_sorted(sort_key=\"duration\")\n",
    "\n",
    "    # test is separate\n",
    "    test_datasets = {}\n",
    "    for csv_file in hparams[\"test_csv\"]:\n",
    "        name = Path(csv_file).stem\n",
    "        test_datasets[name] = sb.dataio.dataset.DynamicItemDataset.from_csv(csv_path=csv_file, replacements={\"data_root\": data_folder})\n",
    "        test_datasets[name] = test_datasets[name].filtered_sorted(sort_key=\"duration\")\n",
    "\n",
    "    datasets = [train_data, valid_data] + [i for k, i in test_datasets.items()]\n",
    "\n",
    "    # *************************\n",
    "    # 2. Define audio pipeline:\n",
    "    # *************************\n",
    "    @sb.utils.data_pipeline.takes(\"wav\")\n",
    "    @sb.utils.data_pipeline.provides(\"sig\")\n",
    "    def audio_pipeline(wav):\n",
    "        sig = sb.dataio.dataio.read_audio(wav)\n",
    "        return sig\n",
    "\n",
    "    sb.dataio.dataset.add_dynamic_item(datasets, audio_pipeline)\n",
    "\n",
    "    # ************************\n",
    "    # 3. Define text pipeline:\n",
    "    # ************************\n",
    "    label_encoder = sb.dataio.encoder.CTCTextEncoder()\n",
    "\n",
    "    @sb.utils.data_pipeline.takes(\"wrd\")\n",
    "    @sb.utils.data_pipeline.provides(\"wrd\", \"char_list\", \"tokens_list\", \"tokens\")\n",
    "    def text_pipeline(wrd):\n",
    "        yield wrd\n",
    "        char_list = list(wrd)\n",
    "        yield char_list\n",
    "        tokens_list = label_encoder.encode_sequence(char_list)\n",
    "        yield tokens_list\n",
    "        tokens = torch.LongTensor(tokens_list)\n",
    "        yield tokens\n",
    "\n",
    "    sb.dataio.dataset.add_dynamic_item(datasets, text_pipeline)\n",
    "\n",
    "    # *******************************\n",
    "    # 4. Create or load label encoder\n",
    "    # *******************************\n",
    "    lab_enc_file = os.path.join(hparams[\"save_folder\"], \"label_encoder.txt\")\n",
    "    special_labels = {\"blank_label\": hparams[\"blank_index\"]}\n",
    "    label_encoder.add_unk()\n",
    "    label_encoder.load_or_create(path=lab_enc_file, from_didatasets=[train_data], output_key=\"char_list\", special_labels=special_labels, sequence_input=True)\n",
    "\n",
    "    # **************\n",
    "    # 5. Set output:\n",
    "    # **************\n",
    "    sb.dataio.dataset.set_output_keys(datasets, [\"id\", \"sig\", \"wrd\", \"char_list\", \"tokens\"])\n",
    "\n",
    "    return train_data, valid_data, test_datasets, label_encoder\n"
   ]
  },
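  {
   "cell_type": "markdown",
   "id": "editor-note-0004",
   "metadata": {},
   "source": [
    "*Editor's addition:* a standalone sketch of the character-level `CTCTextEncoder` round trip used by the text pipeline above. The vocabulary here comes from one toy sentence, not from train.csv.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "editor-code-0004",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Editor's sketch: encode/decode characters with a CTCTextEncoder\n",
    "from speechbrain.dataio.encoder import CTCTextEncoder\n",
    "\n",
    "enc = CTCTextEncoder()\n",
    "enc.update_from_iterable([\"habari za asubuhi\"], sequence_input=True)  # each string is a char sequence\n",
    "enc.add_unk()\n",
    "ids = enc.encode_sequence(list(\"habari\"))\n",
    "print(ids)\n",
    "print(\"\".join(enc.decode_ndim(ids)))\n"
   ]
  },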
  {
   "cell_type": "markdown",
   "id": "e97c4f20-6951-4d12-8e17-9eb818a52bb1",
   "metadata": {},
   "source": [
    "## 4. Using the recipe we created"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "76b72148-6bd0-48bd-ad40-cb6f8bfd34c0",
   "metadata": {},
   "source": [
    "### 4.1 Preparing the run"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d47ec39a-5562-4a63-8243-656c9235b7a2",
   "metadata": {},
   "outputs": [],
   "source": [
    "hparams_file, run_opts, overrides = sb.parse_arguments([\"PATH_TO_YOUR_FOLDER/ASR_FLEURS-swahili_hf.yaml\"])\n",
    "# create ddp_group with the right communication protocol\n",
    "sb.utils.distributed.ddp_init_group(run_opts)\n",
    "\n",
    "# ****************************\n",
    "# Load the hyperparameter file\n",
    "# ****************************\n",
    "with open(hparams_file) as fin:\n",
    "    hparams = load_hyperpyyaml(fin, overrides)\n",
    "\n",
    "# ***************************\n",
    "# Create experiment directory\n",
    "# ***************************\n",
    "sb.create_experiment_directory(experiment_directory=hparams[\"output_folder\"], hyperparams_to_save=hparams_file, overrides=overrides)\n",
    "\n",
    "# ***************************\n",
    "# Create the datasets objects\n",
    "# ***************************\n",
    "train_data, valid_data, test_datasets, label_encoder = dataio_prepare(hparams)\n",
    "\n",
    "# **********************\n",
    "# Trainer initialization\n",
    "# **********************\n",
    "asr_brain = MY_SSA_ASR(modules=hparams[\"modules\"], hparams=hparams, run_opts=run_opts, checkpointer=hparams[\"checkpointer\"])\n",
    "asr_brain.tokenizer = label_encoder"
   ]
  },
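  {
   "cell_type": "markdown",
   "id": "editor-note-0005",
   "metadata": {},
   "source": [
    "*Editor's addition:* `sb.parse_arguments` also accepts command-line style overrides for any key in the YAML, which is handy for quick experiments. The values below are illustrative; the call is left commented out so it does not disturb the state set above.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "editor-code-0005",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Editor's sketch: overriding hyperparameters at parse time (values illustrative)\n",
    "# hparams_file, run_opts, overrides = sb.parse_arguments(\n",
    "#     [\"PATH_TO_YOUR_FOLDER/ASR_FLEURS-swahili_hf.yaml\", \"--number_of_epochs=1\", \"--batch_size=4\"]\n",
    "# )\n"
   ]
  },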
  {
   "cell_type": "markdown",
   "id": "62ae72eb-416c-4ef0-9348-d02bbc268fbd",
   "metadata": {},
   "source": [
    "### 4.2 Training the model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d3dd30ee-89c0-40ea-a9d2-0e2b9d8c8686",
   "metadata": {},
   "outputs": [],
   "source": [
    "# ********\n",
    "# Training\n",
    "# ********\n",
    "asr_brain.fit(asr_brain.hparams.epoch_counter,\n",
    "              train_data, valid_data,\n",
    "              train_loader_kwargs=hparams[\"train_dataloader_opts\"],\n",
    "              valid_loader_kwargs=hparams[\"valid_dataloader_opts\"],\n",
    "              )\n"
   ]
  },
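  {
   "cell_type": "markdown",
   "id": "editor-note-0006",
   "metadata": {},
   "source": [
    "*Editor's addition:* `fit()` resumes from the most recent checkpoint automatically through the checkpointer configured in the YAML. The sketch below just lists what has been saved (it assumes at least one checkpoint exists).\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "editor-code-0006",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Editor's sketch: inspect saved checkpoints and their WER metadata\n",
    "for ckpt in asr_brain.checkpointer.list_checkpoints():\n",
    "    print(ckpt.path, ckpt.meta.get(\"WER\"))\n"
   ]
  },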
  {
   "cell_type": "markdown",
   "id": "1b55af4c-c544-45ff-8435-58226218328f",
   "metadata": {},
   "source": [
    "### 4.3 Testing the model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9cef9011-1a3e-43a4-ab16-8cfb2b57dbd9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# *******\n",
    "# Testing\n",
    "# *******\n",
    "if not os.path.exists(hparams[\"output_wer_folder\"]):\n",
    "    os.makedirs(hparams[\"output_wer_folder\"])\n",
    "\n",
    "from speechbrain.decoders.ctc import CTCBeamSearcher\n",
    "\n",
    "ind2lab = label_encoder.ind2lab\n",
    "vocab_list = [ind2lab[x] for x in range(len(ind2lab))]\n",
    "test_searcher = CTCBeamSearcher(**hparams[\"test_beam_search\"], vocab_list=vocab_list)\n",
    "\n",
    "for k in test_datasets.keys():  # Allow multiple evaluations through a list of test sets\n",
    "    asr_brain.hparams.test_wer_file = os.path.join(hparams[\"output_wer_folder\"], f\"wer_{k}.txt\")\n",
    "    asr_brain.evaluate(test_datasets[k], test_loader_kwargs=hparams[\"test_dataloader_opts\"], min_key=\"WER\")\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.14"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}