crystal-technologies
committed on
Commit 2d8da09
1 parent: c1bb68d
Upload 1287 files
This view is limited to 50 files because the commit contains too many changes.
- SoundScribe/SpeakerID/Dockerfile +140 -0
- SoundScribe/SpeakerID/Jenkinsfile +0 -0
- SoundScribe/SpeakerID/LICENSE +201 -0
- SoundScribe/SpeakerID/ci.groovy +119 -0
- SoundScribe/SpeakerID/external/get_collections.py +90 -0
- SoundScribe/SpeakerID/external/get_modules.py +159 -0
- SoundScribe/SpeakerID/nemo/README.md +9 -0
- SoundScribe/SpeakerID/nemo/__init__.py +28 -0
- SoundScribe/SpeakerID/nemo/__pycache__/__init__.cpython-310.pyc +0 -0
- SoundScribe/SpeakerID/nemo/__pycache__/__init__.cpython-39.pyc +0 -0
- SoundScribe/SpeakerID/nemo/__pycache__/constants.cpython-310.pyc +0 -0
- SoundScribe/SpeakerID/nemo/__pycache__/package_info.cpython-310.pyc +0 -0
- SoundScribe/SpeakerID/nemo/collections/__init__.py +13 -0
- SoundScribe/SpeakerID/nemo/collections/__pycache__/__init__.cpython-310.pyc +0 -0
- SoundScribe/SpeakerID/nemo/collections/__pycache__/__init__.cpython-39.pyc +0 -0
- SoundScribe/SpeakerID/nemo/collections/asr/__init__.py +25 -0
- SoundScribe/SpeakerID/nemo/collections/asr/__pycache__/__init__.cpython-310.pyc +0 -0
- SoundScribe/SpeakerID/nemo/collections/asr/__pycache__/__init__.cpython-39.pyc +0 -0
- SoundScribe/SpeakerID/nemo/collections/asr/data/__init__.py +13 -0
- SoundScribe/SpeakerID/nemo/collections/asr/data/__pycache__/__init__.cpython-310.pyc +0 -0
- SoundScribe/SpeakerID/nemo/collections/asr/data/__pycache__/audio_to_audio.cpython-310.pyc +0 -0
- SoundScribe/SpeakerID/nemo/collections/asr/data/__pycache__/audio_to_audio_dataset.cpython-310.pyc +0 -0
- SoundScribe/SpeakerID/nemo/collections/asr/data/__pycache__/audio_to_diar_label.cpython-310.pyc +0 -0
- SoundScribe/SpeakerID/nemo/collections/asr/data/__pycache__/audio_to_label.cpython-310.pyc +0 -0
- SoundScribe/SpeakerID/nemo/collections/asr/data/__pycache__/audio_to_label_dataset.cpython-310.pyc +0 -0
- SoundScribe/SpeakerID/nemo/collections/asr/data/__pycache__/audio_to_text.cpython-310.pyc +0 -0
- SoundScribe/SpeakerID/nemo/collections/asr/data/__pycache__/audio_to_text_dali.cpython-310.pyc +0 -0
- SoundScribe/SpeakerID/nemo/collections/asr/data/__pycache__/audio_to_text_dataset.cpython-310.pyc +0 -0
- SoundScribe/SpeakerID/nemo/collections/asr/data/__pycache__/feature_to_label.cpython-310.pyc +0 -0
- SoundScribe/SpeakerID/nemo/collections/asr/data/__pycache__/feature_to_label_dataset.cpython-310.pyc +0 -0
- SoundScribe/SpeakerID/nemo/collections/asr/data/audio_to_audio.py +1136 -0
- SoundScribe/SpeakerID/nemo/collections/asr/data/audio_to_audio_dataset.py +95 -0
- SoundScribe/SpeakerID/nemo/collections/asr/data/audio_to_ctm_dataset.py +95 -0
- SoundScribe/SpeakerID/nemo/collections/asr/data/audio_to_diar_label.py +853 -0
- SoundScribe/SpeakerID/nemo/collections/asr/data/audio_to_label.py +1294 -0
- SoundScribe/SpeakerID/nemo/collections/asr/data/audio_to_label_dataset.py +304 -0
- SoundScribe/SpeakerID/nemo/collections/asr/data/audio_to_text.py +1366 -0
- SoundScribe/SpeakerID/nemo/collections/asr/data/audio_to_text_dali.py +772 -0
- SoundScribe/SpeakerID/nemo/collections/asr/data/audio_to_text_dataset.py +950 -0
- SoundScribe/SpeakerID/nemo/collections/asr/data/data_simulation.py +0 -0
- SoundScribe/SpeakerID/nemo/collections/asr/data/feature_to_label.py +497 -0
- SoundScribe/SpeakerID/nemo/collections/asr/data/feature_to_label_dataset.py +68 -0
- SoundScribe/SpeakerID/nemo/collections/asr/data/feature_to_text.py +488 -0
- SoundScribe/SpeakerID/nemo/collections/asr/data/feature_to_text_dataset.py +94 -0
- SoundScribe/SpeakerID/nemo/collections/asr/data/text_to_text.py +482 -0
- SoundScribe/SpeakerID/nemo/collections/asr/losses/__init__.py +22 -0
- SoundScribe/SpeakerID/nemo/collections/asr/losses/__pycache__/__init__.cpython-310.pyc +0 -0
- SoundScribe/SpeakerID/nemo/collections/asr/losses/__pycache__/angularloss.cpython-310.pyc +0 -0
- SoundScribe/SpeakerID/nemo/collections/asr/losses/__pycache__/audio_losses.cpython-310.pyc +0 -0
- SoundScribe/SpeakerID/nemo/collections/asr/losses/__pycache__/ctc.cpython-310.pyc +0 -0
SoundScribe/SpeakerID/Dockerfile
ADDED
@@ -0,0 +1,140 @@
# syntax=docker/dockerfile:experimental

# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:23.08-py3

# build an image that includes only the nemo dependencies, ensures that dependencies
# are included first for optimal caching, and useful for building a development
# image (by specifying build target as `nemo-deps`)
FROM ${BASE_IMAGE} as nemo-deps

# dependency flags; should be declared after FROM
# torchaudio: not required by default
ARG REQUIRE_TORCHAUDIO=false
# k2: not required by default
ARG REQUIRE_K2=false
# ais cli: not required by default, install only if required
ARG REQUIRE_AIS_CLI=false

# Ensure apt-get won't prompt for selecting options
ENV DEBIAN_FRONTEND=noninteractive
# libavdevice-dev required for latest torchaudio
RUN apt-get update && \
    apt-get upgrade -y && \
    apt-get install -y \
    libsndfile1 sox \
    libfreetype6 \
    swig \
    ffmpeg \
    libavdevice-dev && \
    rm -rf /var/lib/apt/lists/*

WORKDIR /workspace/
# install megatron core, this can be removed once 0.3 pip package is released
RUN git clone https://github.com/NVIDIA/Megatron-LM.git && \
    cd Megatron-LM && \
    git checkout ab0336a5c8eab77aa74ae604ba1e73decbf6d560 && \
    pip install -e .

WORKDIR /tmp/

# Distributed Adam support for multiple dtypes
RUN git clone https://github.com/NVIDIA/apex.git && \
    cd apex && \
    git checkout 52e18c894223800cb611682dce27d88050edf1de && \
    pip3 install -v --no-build-isolation --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" --global-option="--fast_layer_norm" --global-option="--distributed_adam" --global-option="--deprecated_fused_adam" ./

# uninstall stuff from base container
RUN pip3 uninstall -y sacrebleu torchtext

# build torchaudio
WORKDIR /tmp/torchaudio_build
COPY scripts/installers /tmp/torchaudio_build/scripts/installers/
RUN INSTALL_MSG=$(/bin/bash /tmp/torchaudio_build/scripts/installers/install_torchaudio_latest.sh); INSTALL_CODE=$?; \
    echo ${INSTALL_MSG}; \
    if [ ${INSTALL_CODE} -ne 0 ]; then \
        echo "torchaudio installation failed"; \
        if [ "${REQUIRE_TORCHAUDIO}" = true ]; then \
            exit ${INSTALL_CODE}; \
        else echo "Skipping failed torchaudio installation"; fi \
    else echo "torchaudio installed successfully"; fi

# install nemo dependencies
WORKDIR /tmp/nemo
COPY requirements .
RUN for f in $(ls requirements*.txt); do pip3 install --disable-pip-version-check --no-cache-dir -r $f; done

# install flash attention dependencies
RUN pip install flash-attn
# pinned triton version for flash-attention https://github.com/HazyResearch/flash-attention/blob/main/flash_attn/flash_attn_triton.py#L3
RUN pip install triton==2.0.0.dev20221202
# install numba for latest containers
RUN pip install "numba>=0.57.1"

# install k2, skip if installation fails
COPY scripts /tmp/nemo/scripts/
RUN INSTALL_MSG=$(/bin/bash /tmp/nemo/scripts/speech_recognition/k2/setup.sh); INSTALL_CODE=$?; \
    echo ${INSTALL_MSG}; \
    if [ ${INSTALL_CODE} -ne 0 ]; then \
        echo "k2 installation failed"; \
        if [ "${REQUIRE_K2}" = true ]; then \
            exit ${INSTALL_CODE}; \
        else echo "Skipping failed k2 installation"; fi \
    else echo "k2 installed successfully"; fi

# copy nemo source into a scratch image
FROM scratch as nemo-src
COPY . .

# start building the final container
FROM nemo-deps as nemo
ARG NEMO_VERSION=1.21.0

# Check that NEMO_VERSION is set. Build will fail without this. Expose NEMO and base container
# version information as runtime environment variable for introspection purposes
RUN /usr/bin/test -n "$NEMO_VERSION" && \
    /bin/echo "export NEMO_VERSION=${NEMO_VERSION}" >> /root/.bashrc && \
    /bin/echo "export BASE_IMAGE=${BASE_IMAGE}" >> /root/.bashrc

# Install NeMo
RUN --mount=from=nemo-src,target=/tmp/nemo,rw cd /tmp/nemo && pip install ".[all]"

# Check install
RUN python -c "import nemo.collections.nlp as nemo_nlp" && \
    python -c "import nemo.collections.tts as nemo_tts" && \
    python -c "import nemo_text_processing.text_normalization as text_normalization"


# copy scripts/examples/tests into container for end user
WORKDIR /workspace/nemo
COPY scripts /workspace/nemo/scripts
COPY examples /workspace/nemo/examples
COPY tests /workspace/nemo/tests
COPY tutorials /workspace/nemo/tutorials
# COPY README.rst LICENSE /workspace/nemo/

RUN printf "#!/bin/bash\njupyter lab --no-browser --allow-root --ip=0.0.0.0" >> start-jupyter.sh && \
    chmod +x start-jupyter.sh

# If required, install AIS CLI
RUN if [ "${REQUIRE_AIS_CLI}" = true ]; then \
    INSTALL_MSG=$(/bin/bash scripts/installers/install_ais_cli_latest.sh); INSTALL_CODE=$?; \
    echo ${INSTALL_MSG}; \
    if [ ${INSTALL_CODE} -ne 0 ]; then \
        echo "AIS CLI installation failed"; \
        exit ${INSTALL_CODE}; \
    else echo "AIS CLI installed successfully"; fi \
    else echo "Skipping AIS CLI installation"; fi
SoundScribe/SpeakerID/Jenkinsfile
ADDED
The diff for this file is too large to render.
SoundScribe/SpeakerID/LICENSE
ADDED
@@ -0,0 +1,201 @@
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!) The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright [yyyy] [name of copyright owner]

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
SoundScribe/SpeakerID/ci.groovy
ADDED
@@ -0,0 +1,119 @@
@Library('blossom-github-lib@master')
import ipp.blossom.*

podTemplate(cloud:'sc-ipp-blossom-prod', yaml : """
apiVersion: v1
kind: Pod
metadata:
  labels:
    some-label: some-label-value
spec:
  volumes:
  - name: scratch
    nfs:
      server: ipp1-cdot01-col01
      path: /vol/scratch1/scratch.okuchaiev_blossom
  containers:
  - name: latestdlfw
    image: nvcr.io/nvidia/pytorch:23.02-py3
    command:
    - cat
    volumeMounts:
    - name: scratch
      mountPath: /testdata
    resources:
      limits:
        nvidia.com/gpu: 2
  restartPolicy: Never
  backoffLimit: 4
  tty: true
  shm-size: 32g
  nodeSelector:
    kubernetes.io/os: linux
    nvidia.com/gpu_type: "Tesla_T4x4"
    nvidia.com/node_type: gpu_tester
    nvidia.com/driver_version: "510.20"
"""
) {
  node(POD_LABEL) {
    def githubHelper
    stage('Get Token') {
      withCredentials([usernamePassword(credentialsId: 'GHAtoken', passwordVariable: 'GIT_PASSWORD', usernameVariable: 'GIT_USERNAME')]) {
        // create new instance of helper object
        githubHelper = GithubHelper.getInstance("${GIT_PASSWORD}", githubData)
      }

    }
    def stageName = ''
    try {
      currentBuild.description = githubHelper.getBuildDescription()
      container('latestdlfw') {
        stage('Code checkout') {
          // update status on github
          githubHelper.updateCommitStatus("$BUILD_URL", "$stageName Running", GitHubCommitState.PENDING)
          checkout changelog: true, poll: true, scm: [$class: 'GitSCM', branches: [[name: "pr/"+githubHelper.getPRNumber()]],
            doGenerateSubmoduleConfigurations: false,
            submoduleCfg: [],
            userRemoteConfigs: [[credentialsId: 'github-token', url: githubHelper.getCloneUrl(), refspec: '+refs/pull/*/head:refs/remotes/origin/pr/*']]]
        }

        stage('Code Style') {
          sh "apt-get update && \
            apt-get install -y bc && \
            nvidia-smi && \
            pip install -r requirements/requirements_test.txt && \
            python setup.py style && ls -l /testdata/TestData && ln -s /testdata/TestData /home/TestData && \
            ls -l /home && ls -l /home/TestData"
        }

        stage('Installation') {
          sh "git config --global --add safe.directory '*' && nvidia-smi && ./reinstall.sh release"
        }

        stage('L0: GPU unit tests') {
          sh "NEMO_NUMBA_MINVER=0.53 pytest -m 'not pleasefixme'"
        }

        parallel( //USE CUDA_VISIBLE_DEVICES to execute 2 single GPU tests in parallel here
          [
            "L1: NMT Training Pre-LN": { sh 'CUDA_VISIBLE_DEVICES=0 python examples/nlp/machine_translation/enc_dec_nmt.py \
              --config-path=conf \
              --config-name=aayn_base \
              do_testing=true \
              model.train_ds.src_file_name=/testdata/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
              model.train_ds.tgt_file_name=/testdata/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \
              model.validation_ds.src_file_name=/testdata/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
              model.validation_ds.tgt_file_name=/testdata/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
              model.test_ds.src_file_name=/testdata/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
              model.test_ds.tgt_file_name=/testdata/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
              model.encoder_tokenizer.tokenizer_model=/testdata/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \
              model.decoder_tokenizer.tokenizer_model=/testdata/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \
              model.encoder.pre_ln=true \
              model.decoder.pre_ln=true \
              trainer.devices=[0] \
              trainer.accelerator="gpu" \
              +trainer.fast_dev_run=true \
              +trainer.limit_test_batches=2 \
              exp_manager=null \
              '},
            "L1: Speech to text": { sh 'CUDA_VISIBLE_DEVICES=1 python examples/asr/asr_ctc/speech_to_text_ctc.py \
              model.train_ds.manifest_filepath=/testdata/TestData/an4_dataset/an4_train.json \
              model.validation_ds.manifest_filepath=/testdata/TestData/an4_dataset/an4_val.json \
              trainer.devices=[0] \
              trainer.accelerator="gpu" \
              +trainer.fast_dev_run=True \
              exp_manager=null \
              '}
          ]
        )//end of parallel
      }
      githubHelper.updateCommitStatus("$BUILD_URL", "Complete", GitHubCommitState.SUCCESS)
    }
    catch (Exception ex){
      currentBuild.result = 'FAILURE'
      println ex
      githubHelper.updateCommitStatus("$BUILD_URL", "$stageName Failed", GitHubCommitState.FAILURE)
    }

  }
}
SoundScribe/SpeakerID/external/get_collections.py
ADDED
@@ -0,0 +1,90 @@
# -*- coding: utf-8 -*-
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

""" Script responsible for generating a JSON file with the list of NeMo collections. """

import argparse
import importlib
import json
import os

import nemo
from nemo.utils import logging


def process_collection(id, col):
    """ Helper function processing the collection.

    Args:
        id: (short) name of the collection.
        col: a collection (python module).
    """
    return {
        "id": id,
        "name": col.__name__,
        "description": col.__description__,
        "version": col.__version__,
        "author": col.__author__,
    }


def main():
    """ Main function generating a JSON file with the list of NeMo collections. """
    # Parse filename.
    parser = argparse.ArgumentParser()
    parser.add_argument('--filename', help='Name of the output JSON file', type=str, default="collections.json")
    args = parser.parse_args()

    # Get collections directory.
    collections_dir = os.path.dirname(nemo.collections.__file__)
    logging.info('Analysing collections in `{}`'.format(collections_dir))

    # Generate list of NeMo collections - from the list of collection subfolders.
    collections = {}
    for sub_dir in os.listdir(collections_dir):
        # Skip cache.
        if sub_dir == "__pycache__":
            continue
        # Check if it is a directory.
        if os.path.isdir(os.path.join(collections_dir, sub_dir)):
            collections[sub_dir] = "nemo.collections." + sub_dir

    output_list = []
    # Iterate over all collections.
    for key, val in collections.items():
        # Try to get module specification.
        module_spec = importlib.util.find_spec(val)
        if module_spec is None:
            logging.warning(" * Failed to process `{}`".format(val))
        else:
            try:
                # Import the module from the module specification.
                module = importlib.util.module_from_spec(module_spec)
                module_spec.loader.exec_module(module)
                # Add to list.
                output_list.append(process_collection(key, module))
                logging.info(" * Processed `{}`".format(val))
            except AttributeError:
                logging.warning(" * Failed to process `{}`".format(val))

    # Export to JSON.
    with open(args.filename, 'w', encoding='utf-8') as outfile:
        json.dump(output_list, outfile)

    logging.info('Finished the analysis, results exported to `{}`.'.format(args.filename))


if __name__ == '__main__':
    main()
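The script's core mechanism is the standard `importlib` spec machinery: resolve a dotted module name to a spec, build a module object from it, and execute it before reading its metadata attributes. A minimal, self-contained sketch of that stdlib pattern follows; the module name below is a stand-in chosen purely for illustration, not a NeMo collection.

```python
import importlib.util

# Illustrative module name; any importable dotted name works the same way.
module_name = "json"

# Resolve the module spec without importing the module yet.
spec = importlib.util.find_spec(module_name)
if spec is None:
    print(f"Module `{module_name}` not found")
else:
    # Build the module from its spec and execute it, as get_collections.py does.
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    print(module.__name__)  # -> json
```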
SoundScribe/SpeakerID/external/get_modules.py
ADDED
@@ -0,0 +1,159 @@
# -*- coding: utf-8 -*-
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

""" Script responsible for generating a JSON file containing the list of modules of a given collection. """

import argparse
import importlib
import inspect
import json
import os

import nemo
from nemo.utils import logging


def process_member(name, obj, module_list):
    """ Helper function processing the passed object and, if it qualifies, adding a record to the module list.

    Args:
        name: name of the member
        obj: member (class/function etc.)
        module_list: list of module records that will (possibly) be extended.
    """
    # It is not a class - skip it.
    if not inspect.isclass(obj):
        return

    # Check inheritance - all our datasets/modules/losses inherit from Serialization,
    # which is also required by this script.
    if not issubclass(obj, nemo.core.Serialization):
        return

    logging.info(" * Processing `{}`".format(str(obj)))

    module_list.append(
        {
            "name": name,
            "cls": str(obj),
            # Temporary solution: mockup arguments.
            "arguments": [
                "jasper",
                "activation",
                "feat_in",
                "normalization_mode",
                "residual_mode",
                "norm_groups",
                "conv_mask",
                "frame_splicing",
                "init_mode",
            ],
            # Temporary solution: mockup input types.
            "input_types": {
                "audio_signal": "axes: (batch, dimension, time); elements_type: MelSpectrogramType",
                "length": "axes: (batch,); elements_type: LengthType",
            },
            # Temporary solution: mockup output types.
            "output_types": {
                "encoder_output": "axes: (batch, dimension, time); elements_type: AcousticEncodedRepresentation"
            },
        }
    )


def main():
    """ Main function analysing the indicated NeMo collection and generating a JSON file with module descriptions. """
    # Parse filename.
    parser = argparse.ArgumentParser()
    parser.add_argument('--collection', help='ID of the collection', type=str)
    parser.add_argument('--filename', help='Name of the output JSON file', type=str, default="modules.json")
    args = parser.parse_args()

    # Get collections directory.
    collections_dir = os.path.dirname(nemo.collections.__file__)
    logging.info('Analysing collections in `{}`'.format(collections_dir))

    # Generate list of NeMo collections - from the list of collection subfolders.
    collections = {}
    for sub_dir in os.listdir(collections_dir):
        # Skip cache.
        if sub_dir == "__pycache__":
            continue
        # Check if it is a directory.
        if os.path.isdir(os.path.join(collections_dir, sub_dir)):
            collections[sub_dir] = "nemo.collections." + sub_dir

    # Check the collection.
    if args.collection not in collections.keys():
        logging.error("Couldn't process the indicated `{}` collection".format(args.collection))
        logging.info(
            "Please select one of the existing collections using `--collection [{}]`".format("|".join(collections))
        )
        exit(-1)

    # Load the collection specification.
    collection_spec = importlib.util.find_spec(collections[args.collection])
    if collection_spec is None:
        logging.error("Failed to load the `{}` collection".format(args.collection))

    # Import the module from the module specification.
    collection = importlib.util.module_from_spec(collection_spec)
    collection_spec.loader.exec_module(collection)

    module_list = []
    # Iterate over the packages in the indicated collection.
    logging.info("Analysing the `{}` collection".format(args.collection))

    try:  # Datasets in dataset folder
        logging.info("Analysing the 'data' package")
        for name, obj in inspect.getmembers(collection.data):
            process_member(name, obj, module_list)
    except AttributeError as e:
        logging.info(" * No datasets found")

    try:  # Datasets in dataset folder
        logging.info("Analysing the 'datasets' package")
        for name, obj in inspect.getmembers(collection.datasets):
            process_member(name, obj, module_list)
    except AttributeError as e:
        logging.info(" * No datasets found")

    try:  # Modules
        logging.info("Analysing the 'modules' package")
        for name, obj in inspect.getmembers(collection.modules):
            process_member(name, obj, module_list)
    except AttributeError as e:
        logging.info(" * No modules found")

    try:  # Losses
        logging.info("Analysing the 'losses' package")
        for name, obj in inspect.getmembers(collection.losses):
            process_member(name, obj, module_list)
    except AttributeError as e:
        logging.info(" * No losses found")

    # Add prefix - only for default name.
    filename = args.filename if args.filename != "modules.json" else args.collection + "_" + args.filename
    # Export to JSON.
    with open(filename, 'w', encoding='utf-8') as outfile:
        json.dump(module_list, outfile)

    logging.info(
        'Finished analysis of the `{}` collection, results exported to `{}`.'.format(args.collection, filename)
    )


if __name__ == '__main__':
    main()
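The filtering in `process_member` is ordinary `inspect` reflection: keep only members that are classes and that subclass a known base. A short sketch of the same idea against a toy base class; all names here are illustrative, not NeMo APIs.

```python
import inspect
import sys


class Base:
    """Toy stand-in for the shared base class (e.g. nemo.core.Serialization)."""


class ToyDataset(Base):
    """A member that should be kept."""


def helper_function():
    """A member that should be skipped (not a class)."""


def collect_subclasses(namespace, base):
    # Mirror of the process_member check: classes only, inheriting from `base`.
    return [
        name
        for name, obj in inspect.getmembers(namespace)
        if inspect.isclass(obj) and issubclass(obj, base)
    ]


print(collect_subclasses(sys.modules[__name__], Base))  # ['Base', 'ToyDataset']
```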
SoundScribe/SpeakerID/nemo/README.md
ADDED
@@ -0,0 +1,9 @@
NeMo (**Ne**ural **Mo**dules) is a toolkit for creating AI applications built around **neural modules**, conceptual blocks of neural networks that take *typed* inputs and produce *typed* outputs.

**NeMo Core** provides common APIs all modules and models have to implement.

**NeMo Collections**

* ASR - collection of modules and models for building speech recognition networks
* TTS - collection of modules and models for building speech synthesis networks
* NLP - collection of modules and models for building NLP networks
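As a concrete illustration of using a collection, the speaker-ID side of this repository would typically load a pretrained speaker-embedding model from the ASR collection. The snippet below is only a hedged sketch: it assumes NeMo's usual `from_pretrained`/`get_embedding` API surface and the publicly listed `titanet_large` checkpoint name, none of which are specified by this README, and the audio path is a placeholder.

```python
# Sketch only: assumes `nemo_toolkit` is installed and that the
# `titanet_large` checkpoint can be downloaded (assumption, not verified here).
import nemo.collections.asr as nemo_asr

speaker_model = nemo_asr.models.EncDecSpeakerLabelModel.from_pretrained("titanet_large")

# `get_embedding` is assumed from the NeMo speaker-recognition documentation.
embedding = speaker_model.get_embedding("/path/to/audio.wav")
print(embedding.shape)
```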
SoundScribe/SpeakerID/nemo/__init__.py
ADDED
@@ -0,0 +1,28 @@
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


from nemo.package_info import (
    __contact_emails__,
    __contact_names__,
    __description__,
    __download_url__,
    __homepage__,
    __keywords__,
    __license__,
    __package_name__,
    __repository_url__,
    __shortversion__,
    __version__,
)
SoundScribe/SpeakerID/nemo/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (454 Bytes).
SoundScribe/SpeakerID/nemo/__pycache__/__init__.cpython-39.pyc
ADDED
Binary file (452 Bytes).
SoundScribe/SpeakerID/nemo/__pycache__/constants.cpython-310.pyc
ADDED
Binary file (549 Bytes).
SoundScribe/SpeakerID/nemo/__pycache__/package_info.cpython-310.pyc
ADDED
Binary file (909 Bytes).
SoundScribe/SpeakerID/nemo/collections/__init__.py
ADDED
@@ -0,0 +1,13 @@
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
SoundScribe/SpeakerID/nemo/collections/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (152 Bytes).
SoundScribe/SpeakerID/nemo/collections/__pycache__/__init__.cpython-39.pyc
ADDED
Binary file (150 Bytes).
SoundScribe/SpeakerID/nemo/collections/asr/__init__.py
ADDED
@@ -0,0 +1,25 @@
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from nemo.collections.asr import data, losses, models, modules
from nemo.package_info import __version__

# Set collection version equal to NeMo version.
__version = __version__

# Authorship.
__author__ = "NVIDIA Corporation"

# Set collection name.
__description__ = "Automatic Speech Recognition collection"
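Because the collection's `__init__.py` above assigns `__author__` and `__description__` at import time, the metadata can be read straight off the imported package. A minimal check, sketched under the assumption that NeMo is installed:

```python
import nemo.collections.asr as nemo_asr

# Both attributes are set in the __init__.py shown above.
print(nemo_asr.__description__)  # Automatic Speech Recognition collection
print(nemo_asr.__author__)       # NVIDIA Corporation
```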
SoundScribe/SpeakerID/nemo/collections/asr/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (429 Bytes).
SoundScribe/SpeakerID/nemo/collections/asr/__pycache__/__init__.cpython-39.pyc
ADDED
Binary file (427 Bytes).
SoundScribe/SpeakerID/nemo/collections/asr/data/__init__.py
ADDED
@@ -0,0 +1,13 @@
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
SoundScribe/SpeakerID/nemo/collections/asr/data/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (161 Bytes).
SoundScribe/SpeakerID/nemo/collections/asr/data/__pycache__/audio_to_audio.cpython-310.pyc
ADDED
Binary file (37.9 kB).
SoundScribe/SpeakerID/nemo/collections/asr/data/__pycache__/audio_to_audio_dataset.cpython-310.pyc
ADDED
Binary file (2.42 kB).
SoundScribe/SpeakerID/nemo/collections/asr/data/__pycache__/audio_to_diar_label.cpython-310.pyc
ADDED
Binary file (34.5 kB).
SoundScribe/SpeakerID/nemo/collections/asr/data/__pycache__/audio_to_label.cpython-310.pyc
ADDED
Binary file (50.4 kB).
SoundScribe/SpeakerID/nemo/collections/asr/data/__pycache__/audio_to_label_dataset.cpython-310.pyc
ADDED
Binary file (7.75 kB).
SoundScribe/SpeakerID/nemo/collections/asr/data/__pycache__/audio_to_text.cpython-310.pyc
ADDED
Binary file (50.8 kB).
SoundScribe/SpeakerID/nemo/collections/asr/data/__pycache__/audio_to_text_dali.cpython-310.pyc
ADDED
Binary file (24.9 kB).
SoundScribe/SpeakerID/nemo/collections/asr/data/__pycache__/audio_to_text_dataset.cpython-310.pyc
ADDED
Binary file (23.8 kB).
SoundScribe/SpeakerID/nemo/collections/asr/data/__pycache__/feature_to_label.cpython-310.pyc
ADDED
Binary file (16.1 kB).
SoundScribe/SpeakerID/nemo/collections/asr/data/__pycache__/feature_to_label_dataset.cpython-310.pyc
ADDED
Binary file (1.78 kB).
SoundScribe/SpeakerID/nemo/collections/asr/data/audio_to_audio.py
ADDED
@@ -0,0 +1,1136 @@
1 |
+
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
|
15 |
+
import abc
|
16 |
+
import math
|
17 |
+
import random
|
18 |
+
from collections import OrderedDict, namedtuple
|
19 |
+
from dataclasses import dataclass
|
20 |
+
from typing import Callable, Dict, List, Optional, Tuple, Type, Union
|
21 |
+
|
22 |
+
import librosa
|
23 |
+
import numpy as np
|
24 |
+
import torch
|
25 |
+
|
26 |
+
from nemo.collections.asr.parts.preprocessing.segment import AudioSegment
|
27 |
+
from nemo.collections.asr.parts.utils.audio_utils import ChannelSelectorType
|
28 |
+
from nemo.collections.common.parts.preprocessing import collections
|
29 |
+
from nemo.collections.common.parts.utils import flatten
|
30 |
+
from nemo.core.classes import Dataset
|
31 |
+
from nemo.core.neural_types import AudioSignal, EncodedRepresentation, LengthsType, NeuralType
|
32 |
+
from nemo.utils import logging
|
33 |
+
|
34 |
+
__all__ = [
|
35 |
+
'AudioToTargetDataset',
|
36 |
+
'AudioToTargetWithReferenceDataset',
|
37 |
+
'AudioToTargetWithEmbeddingDataset',
|
38 |
+
]
|
39 |
+
|
40 |
+
|
41 |
+
def _audio_collate_fn(batch: List[dict]) -> Tuple[torch.Tensor]:
    """Collate a batch of items returned by __getitem__.
    Examples for each signal are zero padded to the same length
    (batch_length), which is determined by the longest example.
    Lengths of the original signals are returned in the output.

    Args:
        batch: List of dictionaries. Each element of the list
               has the following format
               ```
               {
                   'signal_0': 1D or 2D tensor,
                   'signal_1': 1D or 2D tensor,
                   ...
                   'signal_N': 1D or 2D tensor,
               }
               ```
               1D tensors have shape (num_samples,) and 2D tensors
               have shape (num_channels, num_samples)

    Returns:
        A tuple containing signal tensor and signal length tensor (in samples)
        for each signal.
        The output has the following format:
        ```
        (signal_0, signal_0_length, signal_1, signal_1_length, ..., signal_N, signal_N_length)
        ```
        Note that the output format is obtained by interleaving signals and their length.
    """
    signals = batch[0].keys()

    batched = tuple()

    for signal in signals:
        signal_length = [b[signal].shape[-1] for b in batch]
        # Batch length is determined by the longest signal in the batch
        batch_length = max(signal_length)
        b_signal = []
        for s_len, b in zip(signal_length, batch):
            # check if padding is necessary
            if s_len < batch_length:
                if b[signal].ndim == 1:
                    # single-channel signal
                    pad = (0, batch_length - s_len)
                elif b[signal].ndim == 2:
                    # multi-channel signal
                    pad = (0, batch_length - s_len, 0, 0)
                else:
                    raise RuntimeError(
                        f'Signal {signal} has unsupported dimensions {b[signal].shape}. Currently, only 1D and 2D arrays are supported.'
                    )
                b[signal] = torch.nn.functional.pad(b[signal], pad)
            # append the current padded signal
            b_signal.append(b[signal])
        # (signal_batched, signal_length)
        batched += (torch.stack(b_signal), torch.tensor(signal_length, dtype=torch.int32))

    # Currently, outputs are expected to be in a tuple, where each element must correspond
    # to the output type in the OrderedDict returned by output_types.
    #
    # Therefore, we return batched signals by interleaving signals and their length:
    # (signal_0, signal_0_length, signal_1, signal_1_length, ...)
    return batched
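

# Illustrative sketch (not part of the original file): how `_audio_collate_fn` pads and
# interleaves a small batch. The signal names and lengths below are hypothetical.
def _example_audio_collate():
    batch = [
        {'input_signal': torch.rand(16000), 'target_signal': torch.rand(16000)},
        {'input_signal': torch.rand(12000), 'target_signal': torch.rand(12000)},
    ]
    input_signal, input_length, target_signal, target_length = _audio_collate_fn(batch)
    # The shorter example is zero padded to the longest one in the batch,
    # while the original lengths are preserved in the length tensors.
    assert input_signal.shape == (2, 16000)
    assert input_length.tolist() == [16000, 12000]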


@dataclass
class SignalSetup:
    signals: List[str]  # signal names
    duration: Optional[Union[float, list]] = None  # duration for each signal
    channel_selectors: Optional[List[ChannelSelectorType]] = None  # channel selector for loading each signal
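

# Illustrative sketch (not part of the original file): a SignalSetup describing two signals
# loaded synchronously, 4 seconds per example, with all channels kept (selectors are None).
_example_sync_setup = SignalSetup(
    signals=['input_signal', 'target_signal'], duration=4.0, channel_selectors=[None, None],
)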


class ASRAudioProcessor:
    """Class that processes an example from Audio collection and returns
    a dictionary with prepared signals.

    For example, the output dictionary may be the following
    ```
    {
        'input_signal': input_signal_tensor,
        'target_signal': target_signal_tensor,
        'reference_signal': reference_signal_tensor,
        'embedding_vector': embedding_vector
    }
    ```
    Keys in the output dictionary are ordered with synchronous signals given first,
    followed by asynchronous signals and embedding.

    Args:
        sample_rate: sample rate used for all audio signals
        random_offset: If `True`, offset will be randomized when loading a subsegment
                       from a file.
    """

    def __init__(
        self, sample_rate: float, random_offset: bool,
    ):
        self.sample_rate = sample_rate
        self.random_offset = random_offset

        self.sync_setup = None
        self.async_setup = None
        self.embedding_setup = None

    @property
    def sample_rate(self) -> float:
        return self._sample_rate

    @sample_rate.setter
    def sample_rate(self, value: float):
        if value <= 0:
            raise ValueError(f'Sample rate must be positive, received {value}')

        self._sample_rate = value

    @property
    def random_offset(self) -> bool:
        return self._random_offset

    @random_offset.setter
    def random_offset(self, value: bool):
        self._random_offset = value

    @property
    def sync_setup(self) -> SignalSetup:
        """Return the current setup for synchronous signals.

        Returns:
            A dataclass containing the list of signals, their
            duration and channel selectors.
        """
        return self._sync_setup

    @sync_setup.setter
    def sync_setup(self, value: Optional[SignalSetup]):
        """Setup signals to be loaded synchronously.

        Args:
            value: An instance of SignalSetup with the following fields
                - signals: list of signals (keys of example.audio_signals) which will be loaded
                           synchronously with the same start time and duration.
                - duration: Duration for each signal to be loaded.
                            If duration is set to None, the whole file will be loaded.
                - channel_selectors: A list of channel selector for each signal. If channel selector
                                     is None, all channels in the audio file will be loaded.
        """
        if value is None or isinstance(value, SignalSetup):
            self._sync_setup = value
        else:
            raise ValueError(f'Unexpected type {type(value)} for value {value}.')

    @property
    def async_setup(self) -> SignalSetup:
        """Return the current setup for asynchronous signals.

        Returns:
            A dataclass containing the list of signals, their
            duration and channel selectors.
        """
        return self._async_setup

    @async_setup.setter
    def async_setup(self, value: Optional[SignalSetup]):
        """Setup signals to be loaded asynchronously.

        Args:
            value: An instance of SignalSetup with the following fields
                - signals: list of signals (keys of example.audio_signals) which will be loaded
                           asynchronously with signals possibly having different start and duration
                - duration: Duration for each signal to be loaded.
                            If duration is set to None, the whole file will be loaded.
                - channel_selectors: A list of channel selector for each signal. If channel selector
                                     is None, all channels in the audio file will be loaded.
        """
        if value is None or isinstance(value, SignalSetup):
            self._async_setup = value
        else:
            raise ValueError(f'Unexpected type {type(value)} for value {value}.')

    @property
    def embedding_setup(self) -> SignalSetup:
        """Setup signals corresponding to an embedding vector.
        """
        return self._embedding_setup

    @embedding_setup.setter
    def embedding_setup(self, value: SignalSetup):
        """Setup signals corresponding to an embedding vector.

        Args:
            value: An instance of SignalSetup with the following fields
                - signals: list of signals (keys of example.audio_signals) which will be loaded
                           as embedding vectors.
        """
        if value is None or isinstance(value, SignalSetup):
            self._embedding_setup = value
        else:
            raise ValueError(f'Unexpected type {type(value)} for value {value}.')

    def process(self, example: collections.Audio.OUTPUT_TYPE) -> Dict[str, torch.Tensor]:
        """Process an example from a collection of audio examples.

        Args:
            example: an example from Audio collection.

        Returns:
            An ordered dictionary of signals and their tensors.
            For example, the output dictionary may be the following
            ```
            {
                'input_signal': input_signal_tensor,
                'target_signal': target_signal_tensor,
                'reference_signal': reference_signal_tensor,
                'embedding_vector': embedding_vector
            }
            ```
            Keys in the output dictionary are ordered with synchronous signals given first,
            followed by asynchronous signals and embedding.
        """
        audio = self.load_audio(example=example)
        audio = self.process_audio(audio=audio)
        return audio

    def load_audio(self, example: collections.Audio.OUTPUT_TYPE) -> Dict[str, torch.Tensor]:
        """Given an example, load audio from `example.audio_files` and prepare
        the output dictionary.

        Args:
            example: An example from an audio collection

        Returns:
            An ordered dictionary of signals and their tensors.
            For example, the output dictionary may be the following
            ```
            {
                'input_signal': input_signal_tensor,
                'target_signal': target_signal_tensor,
                'reference_signal': reference_signal_tensor,
                'embedding_vector': embedding_vector
            }
            ```
            Keys in the output dictionary are ordered with synchronous signals given first,
            followed by asynchronous signals and embedding.
        """
        output = OrderedDict()

        if self.sync_setup is not None:
            # Load all signals with the same start and duration
            sync_signals = self.load_sync_signals(example)
            output.update(sync_signals)

        if self.async_setup is not None:
            # Load each signal independently
            async_signals = self.load_async_signals(example)
            output.update(async_signals)

        # Load embedding vector
        if self.embedding_setup is not None:
            embedding = self.load_embedding(example)
            output.update(embedding)

        if not output:
            raise RuntimeError('Output dictionary is empty. Please use `_setup` methods to setup signals to be loaded')

        return output

    def process_audio(self, audio: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
        """Process audio signals available in the input dictionary.

        Args:
            audio: A dictionary containing loaded signals `signal: tensor`

        Returns:
            An ordered dictionary of signals and their tensors.
        """
        # Currently, not doing any processing of the loaded signals.
        return audio

    def load_sync_signals(self, example: collections.Audio.OUTPUT_TYPE) -> Dict[str, torch.Tensor]:
        """Load signals with the same start and duration.

        Args:
            example: an example from audio collection

        Returns:
            An ordered dictionary of signals and their tensors.
        """
        output = OrderedDict()
        sync_audio_files = [example.audio_files[s] for s in self.sync_setup.signals]

        sync_samples = self.get_samples_synchronized(
            audio_files=sync_audio_files,
            channel_selectors=self.sync_setup.channel_selectors,
            sample_rate=self.sample_rate,
            duration=self.sync_setup.duration,
            fixed_offset=example.offset,
            random_offset=self.random_offset,
        )

        for signal, samples in zip(self.sync_setup.signals, sync_samples):
            output[signal] = torch.tensor(samples)

        return output

    def load_async_signals(self, example: collections.Audio.OUTPUT_TYPE) -> Dict[str, torch.Tensor]:
        """Load each async signal independently, no constraints on starting
        from the same time.

        Args:
            example: an example from audio collection

        Returns:
            An ordered dictionary of signals and their tensors.
        """
        output = OrderedDict()
        for idx, signal in enumerate(self.async_setup.signals):
            samples = self.get_samples(
                audio_file=example.audio_files[signal],
                sample_rate=self.sample_rate,
                duration=self.async_setup.duration[idx],
                channel_selector=self.async_setup.channel_selectors[idx],
                fixed_offset=example.offset,
                random_offset=self.random_offset,
            )
            output[signal] = torch.tensor(samples)
        return output

    @classmethod
    def get_samples(
        cls,
        audio_file: str,
        sample_rate: int,
        duration: Optional[float] = None,
        channel_selector: ChannelSelectorType = None,
        fixed_offset: float = 0,
        random_offset: bool = False,
    ) -> np.ndarray:
        """Get samples from an audio file.
        For a single-channel signal, the output is shape (num_samples,).
        For a multi-channel signal, the output is shape (num_channels, num_samples).

        Args:
            audio_file: path to an audio file
            sample_rate: desired sample rate for output samples
            duration: Optional desired duration of output samples.
                      If `None`, the complete file will be loaded.
                      If set, a segment of `duration` seconds will be loaded.
            channel_selector: Optional channel selector, for selecting a subset of channels.
            fixed_offset: Optional fixed offset when loading samples.
            random_offset: If `True`, offset will be randomized when loading a short segment
                           from a file. The value is randomized between fixed_offset and
                           max_offset (set depending on the duration and fixed_offset).

        Returns:
            Numpy array with samples from audio file.
            The array has shape (num_samples,) for a single-channel signal
            or (num_channels, num_samples) for a multi-channel signal.
        """
        output = cls.get_samples_synchronized(
            audio_files=[audio_file],
            sample_rate=sample_rate,
            duration=duration,
            channel_selectors=[channel_selector],
            fixed_offset=fixed_offset,
            random_offset=random_offset,
        )

        return output[0]

    @classmethod
    def get_samples_synchronized(
        cls,
        audio_files: List[str],
        sample_rate: int,
        duration: Optional[float] = None,
        channel_selectors: Optional[List[ChannelSelectorType]] = None,
        fixed_offset: float = 0,
        random_offset: bool = False,
    ) -> List[np.ndarray]:
        """Get samples from multiple files with the same start and end point.

        Args:
            audio_files: list of paths to audio files
            sample_rate: desired sample rate for output samples
            duration: Optional desired duration of output samples.
                      If `None`, the complete files will be loaded.
                      If set, a segment of `duration` seconds will be loaded from
                      all files. Segment is synchronized across files, so that
                      start and end points are the same.
            channel_selectors: Optional channel selector for each signal, for selecting
                               a subset of channels.
            fixed_offset: Optional fixed offset when loading samples.
            random_offset: If `True`, offset will be randomized when loading a short segment
                           from a file. The value is randomized between fixed_offset and
                           max_offset (set depending on the duration and fixed_offset).

        Returns:
            List with the same size as `audio_files` but containing numpy arrays
            with samples from each audio file.
            Each array has shape (num_samples,) or (num_channels, num_samples), for single-
            or multi-channel signal, respectively.
            For example, if `audio_files = [path/to/file_1.wav, path/to/file_2.wav]`,
            the output will be a list `output = [samples_file_1, samples_file_2]`.
        """
        if channel_selectors is None:
            channel_selectors = [None] * len(audio_files)

        if duration is None:
            # Load complete files starting from a fixed offset
            offset = fixed_offset  # fixed offset
            num_samples = None  # no constraint on the number of samples

        else:
            # Fixed duration of the output
            audio_durations = cls.get_duration(audio_files)
            min_audio_duration = min(audio_durations)
            available_duration = min_audio_duration - fixed_offset

            if available_duration <= 0:
                raise ValueError(f'Fixed offset {fixed_offset}s is larger than shortest file {min_audio_duration}s.')

            if duration + fixed_offset > min_audio_duration:
                # The shortest file is shorter than the requested duration
                logging.debug(
                    f'Shortest file ({min_audio_duration}s) is less than the desired duration {duration}s + fixed offset {fixed_offset}s. Returned signals will be shortened to {available_duration} seconds.'
                )
                offset = fixed_offset
                duration = available_duration
            elif random_offset:
                # Randomize offset based on the shortest file
                max_offset = min_audio_duration - duration
                offset = random.uniform(fixed_offset, max_offset)
            else:
                # Fixed offset
                offset = fixed_offset

            # Fixed number of samples
            num_samples = math.floor(duration * sample_rate)

        output = []

        # Prepare segments
        for idx, audio_file in enumerate(audio_files):
            segment_samples = cls.get_samples_from_file(
                audio_file=audio_file,
                sample_rate=sample_rate,
                offset=offset,
                num_samples=num_samples,
                channel_selector=channel_selectors[idx],
            )
            output.append(segment_samples)

        return output

    @classmethod
    def get_samples_from_file(
        cls,
        audio_file: Union[str, List[str]],
        sample_rate: int,
        offset: float,
        num_samples: Optional[int] = None,
        channel_selector: Optional[ChannelSelectorType] = None,
    ) -> np.ndarray:
        """Get samples from a single or multiple files.
        If loading samples from multiple files, they will
        be concatenated along the channel dimension.

        Args:
            audio_file: path or a list of paths.
            sample_rate: sample rate of the loaded samples
            offset: fixed offset in seconds
            num_samples: Optional, number of samples to load.
                         If `None`, all available samples will be loaded.
            channel_selector: Select a subset of available channels.

        Returns:
            An array with shape (samples,) or (channels, samples)
        """
        if isinstance(audio_file, str):
            # Load samples from a single file
            segment_samples = cls.get_segment_from_file(
                audio_file=audio_file,
                sample_rate=sample_rate,
                offset=offset,
                num_samples=num_samples,
                channel_selector=channel_selector,
            )
        elif isinstance(audio_file, list):
            # Load samples from multiple files and form a multi-channel signal
            segment_samples = []
            for a_file in audio_file:
                a_file_samples = cls.get_segment_from_file(
                    audio_file=a_file,
                    sample_rate=sample_rate,
                    offset=offset,
                    num_samples=num_samples,
                    channel_selector=channel_selector,
                )
                segment_samples.append(a_file_samples)
            segment_samples = cls.list_to_multichannel(segment_samples)
        elif audio_file is None:
            # Support for inference, when the target signal is `None`
            segment_samples = []
        else:
            raise RuntimeError(f'Unexpected audio_file type {type(audio_file)}')
        return segment_samples

    @staticmethod
    def get_segment_from_file(
        audio_file: str,
        sample_rate: int,
        offset: float,
        num_samples: Optional[int] = None,
        channel_selector: Optional[ChannelSelectorType] = None,
    ) -> np.ndarray:
        """Get a segment of samples from a single audio file.

        Args:
            audio_file: path to an audio file
            sample_rate: sample rate of the loaded samples
            offset: fixed offset in seconds
            num_samples: Optional, number of samples to load.
                         If `None`, all available samples will be loaded.
            channel_selector: Select a subset of available channels.

        Returns:
            An array with shape (samples,) or (channels, samples)
        """
        if num_samples is None:
            segment = AudioSegment.from_file(
                audio_file=audio_file, target_sr=sample_rate, offset=offset, channel_selector=channel_selector,
            )

        else:
            segment = AudioSegment.segment_from_file(
                audio_file=audio_file,
                target_sr=sample_rate,
                n_segments=num_samples,
                offset=offset,
                channel_selector=channel_selector,
            )

        if segment.samples.ndim == 1:
            # Single-channel signal
            return segment.samples
        elif segment.samples.ndim == 2:
            # Use multi-channel format as (channels, samples)
            return segment.samples.T
        else:
            raise RuntimeError(f'Unexpected samples shape: {segment.samples.shape}')

    @staticmethod
    def list_to_multichannel(signal: Union[np.ndarray, List[np.ndarray]]) -> np.ndarray:
        """Convert a list of signals into a multi-channel signal by concatenating
        the elements of the list along the channel dimension.

        If input is not a list, it is returned unmodified.

        Args:
            signal: list of arrays

        Returns:
            Numpy array obtained by concatenating the elements of the list
            along the channel dimension (axis=0).
        """
        if not isinstance(signal, list):
            # Nothing to do there
            return signal
        elif len(signal) == 0:
            # Nothing to do, return as is
            return signal
        elif len(signal) == 1:
            # Nothing to concatenate, return the original format
            return signal[0]

        # If multiple signals are provided in a list, we concatenate them along the channel dimension
        if signal[0].ndim == 1:
            # Single-channel individual files
            mc_signal = np.stack(signal, axis=0)
        elif signal[0].ndim == 2:
            # Multi-channel individual files
            mc_signal = np.concatenate(signal, axis=0)
        else:
            raise RuntimeError(f'Unexpected target with {signal[0].ndim} dimensions.')

        return mc_signal

    @staticmethod
    def get_duration(audio_files: List[str]) -> List[float]:
        """Get duration for each audio file in `audio_files`.

        Args:
            audio_files: list of paths to audio files

        Returns:
            List of durations in seconds.
        """
        duration = [librosa.get_duration(path=f) for f in flatten(audio_files)]
        return duration

    def load_embedding(self, example: collections.Audio.OUTPUT_TYPE) -> Dict[str, torch.Tensor]:
        """Given an example, load embedding from `example.audio_files[embedding]`
        and return it in a dictionary.

        Args:
            example: An example from audio collection

        Returns:
            A dictionary of embedding keys and their tensors.
        """
        output = OrderedDict()
        for idx, signal in enumerate(self.embedding_setup.signals):
            embedding_file = example.audio_files[signal]
            embedding = self.load_embedding_vector(embedding_file)
            output[signal] = torch.tensor(embedding)
        return output

    @staticmethod
    def load_embedding_vector(filepath: str) -> np.ndarray:
        """Load an embedding vector from a file.

        Args:
            filepath: path to a file storing a vector.
                      Currently, it is assumed the file is a npy file.

        Returns:
            Array loaded from filepath.
        """
        if filepath.endswith('.npy'):
            with open(filepath, 'rb') as f:
                embedding = np.load(f)
        else:
            raise RuntimeError(f'Unknown embedding file format in file: {filepath}')

        return embedding
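

# Illustrative sketch (not part of the original file): wiring an ASRAudioProcessor so that
# input and target are cut synchronously while a reference signal is loaded independently.
# The signal names and the 4 s / 2 s durations are hypothetical.
def _example_audio_processor() -> ASRAudioProcessor:
    processor = ASRAudioProcessor(sample_rate=16000, random_offset=True)
    processor.sync_setup = SignalSetup(
        signals=['input_signal', 'target_signal'], duration=4.0, channel_selectors=[None, None],
    )
    processor.async_setup = SignalSetup(
        signals=['reference_signal'], duration=[2.0], channel_selectors=[None],
    )
    # processor.process(example) then returns an OrderedDict with the three signals, where
    # `example` is an item of a collections.AudioCollection built from a manifest.
    return processor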


class BaseAudioDataset(Dataset):
    """Base class of audio datasets, providing common functionality
    for other audio datasets.

    Args:
        collection: Collection of audio examples prepared from manifest files.
        audio_processor: Used to process every example from the collection.
                         A callable with `process` method. For reference,
                         please check ASRAudioProcessor.
    """

    @property
    @abc.abstractmethod
    def output_types(self) -> Optional[Dict[str, NeuralType]]:
        """Returns definitions of module output ports.
        """

    def __init__(self, collection: collections.Audio, audio_processor: Callable, output_type: Type[namedtuple]):
        """Instantiates an audio dataset.
        """
        super().__init__()

        self.collection = collection
        self.audio_processor = audio_processor
        self.output_type = output_type

    def num_channels(self, signal_key) -> int:
        """Returns the number of channels for a particular signal in
        items prepared by this dataset.

        More specifically, this will get the tensor from the first
        item in the dataset, check if it's a one- or two-dimensional
        tensor, and return the number of channels based on the size
        of the first axis (shape[0]).

        NOTE:
        This assumes that all examples have the same number of channels.

        Args:
            signal_key: string, used to select a signal from the dictionary
                        output by __getitem__

        Returns:
            Number of channels for the selected signal.
        """
        # Assumption: whole dataset has the same number of channels
        item = self.__getitem__(0)

        if item[signal_key].ndim == 1:
            return 1
        elif item[signal_key].ndim == 2:
            return item[signal_key].shape[0]
        else:
            raise RuntimeError(
                f'Unexpected number of dimensions for signal {signal_key} with shape {item[signal_key].shape}'
            )

    def __getitem__(self, index: int) -> Dict[str, torch.Tensor]:
        """Return a single example from the dataset.

        Args:
            index: integer index of an example in the collection

        Returns:
            Dictionary providing mapping from signal to its tensor.
            For example:
            ```
            {
                'input_signal': input_signal_tensor,
                'target_signal': target_signal_tensor,
            }
            ```
        """
        example = self.collection[index]
        output = self.audio_processor.process(example=example)

        return output

    def __len__(self) -> int:
        """Return the number of examples in the dataset.
        """
        return len(self.collection)

    def _collate_fn(self, batch) -> Tuple[torch.Tensor]:
        """Collate items in a batch.
        """
        return self.output_type(*_audio_collate_fn(batch))
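

# Illustrative sketch (not part of the original file): the minimal pieces a subclass has to
# provide are a collection, an audio processor and a namedtuple output type. The names below
# are hypothetical and mirror what the concrete datasets later in this file do.
_ExampleOutput = namedtuple('_ExampleOutput', 'input_signal input_length')


class _ExampleAudioDataset(BaseAudioDataset):
    def __init__(self, collection: collections.Audio, sample_rate: int):
        processor = ASRAudioProcessor(sample_rate=sample_rate, random_offset=False)
        processor.sync_setup = SignalSetup(signals=['input_signal'], duration=None, channel_selectors=[None])
        super().__init__(collection=collection, audio_processor=processor, output_type=_ExampleOutput)

    @property
    def output_types(self):
        return OrderedDict(
            input_signal=NeuralType(('B', 'T'), AudioSignal()),
            input_length=NeuralType(('B',), LengthsType()),
        )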


AudioToTargetExample = namedtuple(
    typename='AudioToTargetExample', field_names='input_signal input_length target_signal target_length'
)


class AudioToTargetDataset(BaseAudioDataset):
    """A dataset for audio-to-audio tasks where the goal is to use
    an input signal to recover the corresponding target signal.

    Each line of the manifest file is expected to have the following format
    ```
    {
        'input_key': 'path/to/input.wav',
        'target_key': 'path/to/path_to_target.wav',
        'duration': duration_of_input,
    }
    ```

    Additionally, multiple audio files may be provided for each key in the manifest, for example,
    ```
    {
        'input_key': 'path/to/input.wav',
        'target_key': ['path/to/path_to_target_ch0.wav', 'path/to/path_to_target_ch1.wav'],
        'duration': duration_of_input,
    }
    ```

    Keys for input and target signals can be configured in the constructor (`input_key` and `target_key`).

    Args:
        manifest_filepath: Path to manifest file in a format described above.
        sample_rate: Sample rate for loaded audio signals.
        input_key: Key pointing to input audio files in the manifest
        target_key: Key pointing to target audio files in manifest
        audio_duration: Optional duration of each item returned by __getitem__.
                        If `None`, complete audio will be loaded.
                        If set, a random subsegment will be loaded synchronously from
                        target and audio, i.e., with the same start and end point.
        random_offset: If `True`, offset will be randomized when loading a subsegment
                       from a file.
        max_duration: If audio exceeds this length, do not include in dataset.
        min_duration: If audio is less than this length, do not include in dataset.
        max_utts: Limit number of utterances.
        input_channel_selector: Optional, select subset of channels from each input audio file.
                                If `None`, all channels will be loaded.
        target_channel_selector: Optional, select subset of channels from each input audio file.
                                 If `None`, all channels will be loaded.
    """

    def __init__(
        self,
        manifest_filepath: str,
        sample_rate: int,
        input_key: str,
        target_key: str,
        audio_duration: Optional[float] = None,
        random_offset: bool = False,
        max_duration: Optional[float] = None,
        min_duration: Optional[float] = None,
        max_utts: Optional[int] = None,
        input_channel_selector: Optional[int] = None,
        target_channel_selector: Optional[int] = None,
    ):
        audio_to_manifest_key = {
            'input_signal': input_key,
            'target_signal': target_key,
        }

        collection = collections.AudioCollection(
            manifest_files=manifest_filepath,
            audio_to_manifest_key=audio_to_manifest_key,
            min_duration=min_duration,
            max_duration=max_duration,
            max_number=max_utts,
        )

        audio_processor = ASRAudioProcessor(sample_rate=sample_rate, random_offset=random_offset,)
        audio_processor.sync_setup = SignalSetup(
            signals=['input_signal', 'target_signal'],
            duration=audio_duration,
            channel_selectors=[input_channel_selector, target_channel_selector],
        )

        super().__init__(collection=collection, audio_processor=audio_processor, output_type=AudioToTargetExample)

    @property
    def output_types(self) -> Optional[Dict[str, NeuralType]]:
        """Returns definitions of module output ports.

        Returns:
            Ordered dictionary in the following form:
            ```
            {
                'input_signal': batched single- or multi-channel format,
                'input_length': batched original length of each input signal
                'target_signal': batched single- or multi-channel format,
                'target_length': batched original length of each target signal
            }
            ```
        """
        sc_audio_type = NeuralType(('B', 'T'), AudioSignal())
        mc_audio_type = NeuralType(('B', 'C', 'T'), AudioSignal())

        return OrderedDict(
            input_signal=sc_audio_type if self.num_channels('input_signal') == 1 else mc_audio_type,
            input_length=NeuralType(('B',), LengthsType()),
            target_signal=sc_audio_type if self.num_channels('target_signal') == 1 else mc_audio_type,
            target_length=NeuralType(('B',), LengthsType()),
        )
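

# Illustrative sketch (not part of the original file): a hypothetical manifest line and a
# DataLoader built on top of AudioToTargetDataset. Paths, keys and durations are placeholders.
#
#   {"input_filepath": "noisy/utt1.wav", "target_filepath": "clean/utt1.wav", "duration": 5.2}
def _example_audio_to_target_loader(manifest_filepath: str):
    from torch.utils.data import DataLoader

    dataset = AudioToTargetDataset(
        manifest_filepath=manifest_filepath,
        sample_rate=16000,
        input_key='input_filepath',
        target_key='target_filepath',
        audio_duration=4.0,
        random_offset=True,
    )
    # The dataset's own collate function zero pads signals and interleaves them with lengths.
    return DataLoader(dataset, batch_size=8, collate_fn=dataset._collate_fn)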


AudioToTargetWithReferenceExample = namedtuple(
    typename='AudioToTargetWithReferenceExample',
    field_names='input_signal input_length target_signal target_length reference_signal reference_length',
)


class AudioToTargetWithReferenceDataset(BaseAudioDataset):
    """A dataset for audio-to-audio tasks where the goal is to use
    an input signal to recover the corresponding target signal and an
    additional reference signal is available.

    This can be used, for example, when a reference signal is
    available from
    - enrollment utterance for the target signal
    - echo reference from playback
    - reference from another sensor that correlates with the target signal

    Each line of the manifest file is expected to have the following format
    ```
    {
        'input_key': 'path/to/input.wav',
        'target_key': 'path/to/path_to_target.wav',
        'reference_key': 'path/to/path_to_reference.wav',
        'duration': duration_of_input,
    }
    ```

    Keys for input, target and reference signals can be configured in the constructor.

    Args:
        manifest_filepath: Path to manifest file in a format described above.
        sample_rate: Sample rate for loaded audio signals.
        input_key: Key pointing to input audio files in the manifest
        target_key: Key pointing to target audio files in manifest
        reference_key: Key pointing to reference audio files in manifest
        audio_duration: Optional duration of each item returned by __getitem__.
                        If `None`, complete audio will be loaded.
                        If set, a random subsegment will be loaded synchronously from
                        target and audio, i.e., with the same start and end point.
        random_offset: If `True`, offset will be randomized when loading a subsegment
                       from a file.
        max_duration: If audio exceeds this length, do not include in dataset.
        min_duration: If audio is less than this length, do not include in dataset.
        max_utts: Limit number of utterances.
        input_channel_selector: Optional, select subset of channels from each input audio file.
                                If `None`, all channels will be loaded.
        target_channel_selector: Optional, select subset of channels from each input audio file.
                                 If `None`, all channels will be loaded.
        reference_channel_selector: Optional, select subset of channels from each input audio file.
                                    If `None`, all channels will be loaded.
        reference_is_synchronized: If True, it is assumed that the reference signal is synchronized
                                   with the input signal, so the same subsegment will be loaded as for
                                   input and target. If False, reference signal will be loaded independently
                                   from input and target.
        reference_duration: Optional, can be used to set a fixed duration of the reference utterance. If `None`,
                            complete audio file will be loaded.
    """

    def __init__(
        self,
        manifest_filepath: str,
        sample_rate: int,
        input_key: str,
        target_key: str,
        reference_key: str,
        audio_duration: Optional[float] = None,
        random_offset: bool = False,
        max_duration: Optional[float] = None,
        min_duration: Optional[float] = None,
        max_utts: Optional[int] = None,
        input_channel_selector: Optional[int] = None,
        target_channel_selector: Optional[int] = None,
        reference_channel_selector: Optional[int] = None,
        reference_is_synchronized: bool = True,
        reference_duration: Optional[float] = None,
    ):
        audio_to_manifest_key = {
            'input_signal': input_key,
            'target_signal': target_key,
            'reference_signal': reference_key,
        }

        collection = collections.AudioCollection(
            manifest_files=manifest_filepath,
            audio_to_manifest_key=audio_to_manifest_key,
            min_duration=min_duration,
            max_duration=max_duration,
            max_number=max_utts,
        )

        audio_processor = ASRAudioProcessor(sample_rate=sample_rate, random_offset=random_offset,)

        if reference_is_synchronized:
            audio_processor.sync_setup = SignalSetup(
                signals=['input_signal', 'target_signal', 'reference_signal'],
                duration=audio_duration,
                channel_selectors=[input_channel_selector, target_channel_selector, reference_channel_selector],
            )
        else:
            audio_processor.sync_setup = SignalSetup(
                signals=['input_signal', 'target_signal'],
                duration=audio_duration,
                channel_selectors=[input_channel_selector, target_channel_selector],
            )
            audio_processor.async_setup = SignalSetup(
                signals=['reference_signal'],
                duration=[reference_duration],
                channel_selectors=[reference_channel_selector],
            )

        super().__init__(
            collection=collection, audio_processor=audio_processor, output_type=AudioToTargetWithReferenceExample
        )

    @property
    def output_types(self) -> Optional[Dict[str, NeuralType]]:
        """Returns definitions of module output ports.

        Returns:
            Ordered dictionary in the following form:
            ```
            {
                'input_signal': batched single- or multi-channel format,
                'input_length': batched original length of each input signal
                'target_signal': batched single- or multi-channel format,
                'target_length': batched original length of each target signal
                'reference_signal': single- or multi-channel format,
                'reference_length': original length of each reference signal
            }
            ```
        """
        sc_audio_type = NeuralType(('B', 'T'), AudioSignal())
        mc_audio_type = NeuralType(('B', 'C', 'T'), AudioSignal())

        return OrderedDict(
            input_signal=sc_audio_type if self.num_channels('input_signal') == 1 else mc_audio_type,
            input_length=NeuralType(('B',), LengthsType()),
            target_signal=sc_audio_type if self.num_channels('target_signal') == 1 else mc_audio_type,
            target_length=NeuralType(('B',), LengthsType()),
            reference_signal=sc_audio_type if self.num_channels('reference_signal') == 1 else mc_audio_type,
            reference_length=NeuralType(('B',), LengthsType()),
        )
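

# Illustrative sketch (not part of the original file): a hypothetical manifest line for the
# dataset above, with an enrollment utterance as the reference. Since an enrollment is not
# time aligned with the input, reference_is_synchronized=False loads it independently.
#
#   {"input_filepath": "mix/utt1.wav", "target_filepath": "clean/utt1.wav",
#    "reference_filepath": "enroll/spk1.wav", "duration": 6.0}
def _example_audio_to_target_with_reference(manifest_filepath: str) -> AudioToTargetWithReferenceDataset:
    return AudioToTargetWithReferenceDataset(
        manifest_filepath=manifest_filepath,
        sample_rate=16000,
        input_key='input_filepath',
        target_key='target_filepath',
        reference_key='reference_filepath',
        audio_duration=4.0,
        reference_is_synchronized=False,
        reference_duration=2.0,
    )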


AudioToTargetWithEmbeddingExample = namedtuple(
    typename='AudioToTargetWithEmbeddingExample',
    field_names='input_signal input_length target_signal target_length embedding_vector embedding_length',
)


class AudioToTargetWithEmbeddingDataset(BaseAudioDataset):
    """A dataset for audio-to-audio tasks where the goal is to use
    an input signal to recover the corresponding target signal and an
    additional embedding signal. It is assumed that the embedding
    is in a form of a vector.

    Each line of the manifest file is expected to have the following format
    ```
    {
        input_key: 'path/to/input.wav',
        target_key: 'path/to/path_to_target.wav',
        embedding_key: 'path/to/path_to_reference.npy',
        'duration': duration_of_input,
    }
    ```

    Keys for input, target and embedding signals can be configured in the constructor.

    Args:
        manifest_filepath: Path to manifest file in a format described above.
        sample_rate: Sample rate for loaded audio signals.
        input_key: Key pointing to input audio files in the manifest
        target_key: Key pointing to target audio files in manifest
        embedding_key: Key pointing to embedding files in manifest
        audio_duration: Optional duration of each item returned by __getitem__.
                        If `None`, complete audio will be loaded.
                        If set, a random subsegment will be loaded synchronously from
                        target and audio, i.e., with the same start and end point.
        random_offset: If `True`, offset will be randomized when loading a subsegment
                       from a file.
        max_duration: If audio exceeds this length, do not include in dataset.
        min_duration: If audio is less than this length, do not include in dataset.
        max_utts: Limit number of utterances.
        input_channel_selector: Optional, select subset of channels from each input audio file.
                                If `None`, all channels will be loaded.
        target_channel_selector: Optional, select subset of channels from each input audio file.
                                 If `None`, all channels will be loaded.
    """

    def __init__(
        self,
        manifest_filepath: str,
        sample_rate: int,
        input_key: str,
        target_key: str,
        embedding_key: str,
        audio_duration: Optional[float] = None,
        random_offset: bool = False,
        max_duration: Optional[float] = None,
        min_duration: Optional[float] = None,
        max_utts: Optional[int] = None,
        input_channel_selector: Optional[int] = None,
        target_channel_selector: Optional[int] = None,
    ):
        audio_to_manifest_key = {
            'input_signal': input_key,
            'target_signal': target_key,
            'embedding_vector': embedding_key,
        }

        collection = collections.AudioCollection(
            manifest_files=manifest_filepath,
            audio_to_manifest_key=audio_to_manifest_key,
            min_duration=min_duration,
            max_duration=max_duration,
            max_number=max_utts,
        )

        audio_processor = ASRAudioProcessor(sample_rate=sample_rate, random_offset=random_offset,)
        audio_processor.sync_setup = SignalSetup(
            signals=['input_signal', 'target_signal'],
            duration=audio_duration,
            channel_selectors=[input_channel_selector, target_channel_selector],
        )
        audio_processor.embedding_setup = SignalSetup(signals=['embedding_vector'])

        super().__init__(
            collection=collection, audio_processor=audio_processor, output_type=AudioToTargetWithEmbeddingExample
        )

    @property
    def output_types(self) -> Optional[Dict[str, NeuralType]]:
        """Returns definitions of module output ports.

        Returns:
            Ordered dictionary in the following form:
            ```
            {
                'input_signal': batched single- or multi-channel format,
                'input_length': batched original length of each input signal
                'target_signal': batched single- or multi-channel format,
                'target_length': batched original length of each target signal
                'embedding_vector': batched embedded vector format,
                'embedding_length': batched original length of each embedding vector
            }
            ```
        """
        sc_audio_type = NeuralType(('B', 'T'), AudioSignal())
        mc_audio_type = NeuralType(('B', 'C', 'T'), AudioSignal())

        return OrderedDict(
            input_signal=sc_audio_type if self.num_channels('input_signal') == 1 else mc_audio_type,
            input_length=NeuralType(('B',), LengthsType()),
            target_signal=sc_audio_type if self.num_channels('target_signal') == 1 else mc_audio_type,
            target_length=NeuralType(('B',), LengthsType()),
            embedding_vector=NeuralType(('B', 'D'), EncodedRepresentation()),
            embedding_length=NeuralType(('B',), LengthsType()),
        )
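

# Illustrative sketch (not part of the original file): a hypothetical manifest line for
# AudioToTargetWithEmbeddingDataset, where the embedding is a speaker vector stored as .npy.
#
#   {"input_filepath": "mix/utt1.wav", "target_filepath": "clean/utt1.wav",
#    "embedding_filepath": "emb/spk1.npy", "duration": 6.0}
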
SoundScribe/SpeakerID/nemo/collections/asr/data/audio_to_audio_dataset.py
ADDED
@@ -0,0 +1,95 @@
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from nemo.collections.asr.data import audio_to_audio


def get_audio_to_target_dataset(config: dict) -> audio_to_audio.AudioToTargetDataset:
    """Instantiates an audio-to-audio dataset.

    Args:
        config: Config of AudioToTargetDataset.

    Returns:
        An instance of AudioToTargetDataset
    """
    dataset = audio_to_audio.AudioToTargetDataset(
        manifest_filepath=config['manifest_filepath'],
        sample_rate=config['sample_rate'],
        input_key=config['input_key'],
        target_key=config['target_key'],
        audio_duration=config.get('audio_duration', None),
        random_offset=config.get('random_offset', False),
        max_duration=config.get('max_duration', None),
        min_duration=config.get('min_duration', None),
        max_utts=config.get('max_utts', 0),
        input_channel_selector=config.get('input_channel_selector', None),
        target_channel_selector=config.get('target_channel_selector', None),
    )
    return dataset
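

# Illustrative sketch (not part of the original file): a hypothetical config dict for the
# factory above. Only the four required keys are mandatory; the rest fall back to defaults.
_example_audio_to_target_config = {
    'manifest_filepath': 'path/to/manifest.json',
    'sample_rate': 16000,
    'input_key': 'input_filepath',
    'target_key': 'target_filepath',
    'audio_duration': 4.0,
    'random_offset': True,
}
# dataset = get_audio_to_target_dataset(_example_audio_to_target_config)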


def get_audio_to_target_with_reference_dataset(config: dict) -> audio_to_audio.AudioToTargetWithReferenceDataset:
    """Instantiates an audio-to-audio dataset.

    Args:
        config: Config of AudioToTargetWithReferenceDataset.

    Returns:
        An instance of AudioToTargetWithReferenceDataset
    """
    dataset = audio_to_audio.AudioToTargetWithReferenceDataset(
        manifest_filepath=config['manifest_filepath'],
        sample_rate=config['sample_rate'],
        input_key=config['input_key'],
        target_key=config['target_key'],
        reference_key=config['reference_key'],
        audio_duration=config.get('audio_duration', None),
        random_offset=config.get('random_offset', False),
        max_duration=config.get('max_duration', None),
        min_duration=config.get('min_duration', None),
        max_utts=config.get('max_utts', 0),
        input_channel_selector=config.get('input_channel_selector', None),
        target_channel_selector=config.get('target_channel_selector', None),
        reference_channel_selector=config.get('reference_channel_selector', None),
        reference_is_synchronized=config.get('reference_is_synchronized', True),
        reference_duration=config.get('reference_duration', None),
    )
    return dataset


def get_audio_to_target_with_embedding_dataset(config: dict) -> audio_to_audio.AudioToTargetWithEmbeddingDataset:
    """Instantiates an audio-to-audio dataset.

    Args:
        config: Config of AudioToTargetWithEmbeddingDataset.

    Returns:
        An instance of AudioToTargetWithEmbeddingDataset
    """
    dataset = audio_to_audio.AudioToTargetWithEmbeddingDataset(
        manifest_filepath=config['manifest_filepath'],
        sample_rate=config['sample_rate'],
        input_key=config['input_key'],
        target_key=config['target_key'],
        embedding_key=config['embedding_key'],
        audio_duration=config.get('audio_duration', None),
        random_offset=config.get('random_offset', False),
        max_duration=config.get('max_duration', None),
        min_duration=config.get('min_duration', None),
        max_utts=config.get('max_utts', 0),
        input_channel_selector=config.get('input_channel_selector', None),
        target_channel_selector=config.get('target_channel_selector', None),
    )
    return dataset
SoundScribe/SpeakerID/nemo/collections/asr/data/audio_to_ctm_dataset.py
ADDED
@@ -0,0 +1,95 @@
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import os
from dataclasses import dataclass
from pathlib import Path
from typing import Any, List, Tuple

from nemo.collections.asr.data.audio_to_text_dataset import ASRPredictionWriter
from nemo.utils import logging


@dataclass
class FrameCtmUnit:
    """A container class for one CTM unit with start and length countable in frames.
    """

    label: str
    start_frame: int
    length: int
    probability: float

    def __repr__(self) -> str:
        return f"{self.label}\t({self.probability:1.3f}): [{self.start_frame:6d}, {self.length:6d}]"

    @property
    def end_frame(self):
        return self.start_frame + self.length

    def to_ctm_str(self, time_per_frame: int) -> str:
        """Represents the data as part of the CTM line.

        The CTM line format is
            <utterance_name> <channel> <start_time> <duration> <label_str> <probability>
        This method prepares the last four entities."""
        return f"{self.start_frame * time_per_frame :.3f} {self.length * time_per_frame :.3f} {self.label} {self.probability :1.3f}"
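

# Illustrative sketch (not part of the original file): a FrameCtmUnit covering 50 frames
# starting at frame 100; with a 10 ms frame shift, `to_ctm_str` renders the tail of a CTM line.
_example_ctm_unit = FrameCtmUnit(label='hello', start_frame=100, length=50, probability=0.9)
_example_ctm_tail = _example_ctm_unit.to_ctm_str(0.01)  # '1.000 0.500 hello 0.900'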


class ASRCTMPredictionWriter(ASRPredictionWriter):
    def __init__(self, dataset, output_file: str, output_ctm_dir: str, time_per_frame: float):
        super().__init__(dataset, output_file)
        self.output_ctm_dir = output_ctm_dir
        self.time_per_frame = time_per_frame
        os.makedirs(self.output_ctm_dir, exist_ok=True)

    def write_ctm(self, name, filepath, frameCtmUnits):
        with open(filepath, "tw", encoding="utf-8") as f:
            for unit in frameCtmUnits:
                f.write(f"{name} 1 {unit.to_ctm_str(self.time_per_frame)}\n")

    def write_on_batch_end(
        self,
        trainer,
        pl_module: 'LightningModule',
        prediction: Tuple[int, List[FrameCtmUnit]],
        batch_indices: List[int],
        batch: Any,
        batch_idx: int,
        dataloader_idx: int,
    ):
        for sample_id, units in prediction:
            sample = self.dataset.get_manifest_sample(sample_id)
            with_ctm = True
            if len(units) == 0:
                logging.warning(
                    f"""Not producing CTM output for item `{sample.audio_file}`.
                    Check if text is empty or if duration is too short: `{sample.text_raw}`, {sample.duration}"""
                )
                with_ctm = False
            item = {}
            item["audio_filepath"] = sample.audio_file
            item["duration"] = sample.duration
            item["text"] = sample.text_raw
            if with_ctm:
                utt_name = Path(sample.audio_file).stem
                ctm_filepath = os.path.join(self.output_ctm_dir, utt_name) + ".ctm"
                self.write_ctm(utt_name, ctm_filepath, units)
                item["ctm_filepath"] = ctm_filepath
            else:
                item["ctm_filepath"] = ""
            self.outf.write(json.dumps(item) + "\n")
            self.samples_num += 1
        return
SoundScribe/SpeakerID/nemo/collections/asr/data/audio_to_diar_label.py
ADDED
@@ -0,0 +1,853 @@
1 |
+
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
|
15 |
+
import os
|
16 |
+
from collections import OrderedDict
|
17 |
+
from statistics import mode
|
18 |
+
from typing import Dict, Optional
|
19 |
+
|
20 |
+
import torch
|
21 |
+
|
22 |
+
from nemo.collections.asr.parts.utils.offline_clustering import get_argmin_mat
|
23 |
+
from nemo.collections.asr.parts.utils.speaker_utils import convert_rttm_line, prepare_split_data
|
24 |
+
from nemo.collections.common.parts.preprocessing.collections import DiarizationSpeechLabel
|
25 |
+
from nemo.core.classes import Dataset
|
26 |
+
from nemo.core.neural_types import AudioSignal, EncodedRepresentation, LengthsType, NeuralType, ProbsType, SpectrogramType
|
27 |
+
|
28 |
+
|
29 |
+
def get_scale_mapping_list(uniq_timestamps):
|
30 |
+
"""
|
31 |
+
Call get_argmin_mat function to find the index of the non-base-scale segment that is closest to the
|
32 |
+
given base-scale segment. For each scale and each segment, a base-scale segment is assigned.
|
33 |
+
|
34 |
+
Args:
|
35 |
+
uniq_timestamps: (dict)
|
36 |
+
The dictionary containing embeddings, timestamps and multiscale weights.
|
37 |
+
If uniq_timestamps contains only one scale, single scale diarization is performed.
|
38 |
+
|
39 |
+
Returns:
|
40 |
+
scale_mapping_argmat (torch.tensor):
|
41 |
+
|
42 |
+
The element at the m-th row and the n-th column of the scale mapping matrix indicates the (m+1)-th scale
|
43 |
+
segment index that has the closest center distance to the (n+1)-th segment in the base scale.
|
44 |
+
|
45 |
+
- Example:
|
46 |
+
`scale_mapping_argmat[2][101] = 85`
|
47 |
+
|
48 |
+
In the above example, the mapping means that the 86-th segment in the 3rd scale (Python index 2) is
|
49 |
+
mapped to the 102-nd segment in the base scale. Thus, longer segments are bound to have more repeating
|
50 |
+
numbers, since multiple base-scale segments (the base scale has the shortest segment length) fall into the
|
51 |
+
range of the longer segments. At the same time, each row contains N indices, where N is the number
|
52 |
+
of segments in the base-scale (i.e., the finest scale).
|
53 |
+
"""
|
54 |
+
timestamps_in_scales = []
|
55 |
+
for key, val in uniq_timestamps['scale_dict'].items():
|
56 |
+
timestamps_in_scales.append(torch.tensor(val['time_stamps']))
|
57 |
+
session_scale_mapping_list = get_argmin_mat(timestamps_in_scales)
|
58 |
+
scale_mapping_argmat = [[] for _ in range(len(uniq_timestamps['scale_dict'].keys()))]
|
59 |
+
for scale_idx in range(len(session_scale_mapping_list)):
|
60 |
+
scale_mapping_argmat[scale_idx] = session_scale_mapping_list[scale_idx]
|
61 |
+
scale_mapping_argmat = torch.stack(scale_mapping_argmat)
|
62 |
+
return scale_mapping_argmat
|
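# A minimal usage sketch for get_scale_mapping_list(); the two-scale timestamp values
# below are hypothetical and only mirror the layout the function reads from
# uniq_timestamps['scale_dict'][scale]['time_stamps'] (numeric segment-timestamp pairs).
def _example_get_scale_mapping_list():
    toy_timestamps = {
        'scale_dict': {
            0: {'time_stamps': [[0.0, 1.5], [0.75, 2.25], [1.5, 3.0]]},  # coarse scale
            1: {'time_stamps': [[0.0, 0.5], [0.5, 1.0], [1.0, 1.5], [1.5, 2.0], [2.0, 2.5], [2.5, 3.0]]},  # base scale
        }
    }
    scale_map = get_scale_mapping_list(toy_timestamps)
    # scale_map[0][n] is the coarse-scale segment index closest to base-scale segment n.
    return scale_map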
63 |
+
|
64 |
+
|
65 |
+
def extract_seg_info_from_rttm(uniq_id, rttm_lines, mapping_dict=None, target_spks=None):
|
66 |
+
"""
|
67 |
+
Get RTTM lines containing speaker labels, start time and end time. target_spks contains two targeted
|
68 |
+
speaker indices for creating groundtruth label files. Only speakers in target_spks variable will be
|
69 |
+
included in the output lists.
|
70 |
+
|
71 |
+
Args:
|
72 |
+
uniq_id (str):
|
73 |
+
Unique file ID that refers to an input audio file and corresponding RTTM (Annotation) file.
|
74 |
+
rttm_lines (list):
|
75 |
+
List containing RTTM lines in str format.
|
76 |
+
mapping_dict (dict):
|
77 |
+
Mapping between the estimated speakers and the speakers in the ground-truth annotation.
|
78 |
+
`mapping_dict` variable is only provided when the inference mode is running in sequence-eval mode.
|
79 |
+
Sequence eval mode uses the mapping between the estimated speakers and the speakers in ground-truth annotation.
|
80 |
+
Returns:
|
81 |
+
rttm_tup (tuple):
|
82 |
+
Tuple containing lists of start time, end time and speaker labels.
|
83 |
+
|
84 |
+
"""
|
85 |
+
stt_list, end_list, speaker_list, pairwise_infer_spks = [], [], [], []
|
86 |
+
if target_spks:
|
87 |
+
inv_map = {v: k for k, v in mapping_dict.items()}
|
88 |
+
for spk_idx in target_spks:
|
89 |
+
spk_str = f'speaker_{spk_idx}'
|
90 |
+
if spk_str in inv_map:
|
91 |
+
pairwise_infer_spks.append(inv_map[spk_str])
|
92 |
+
for rttm_line in rttm_lines:
|
93 |
+
start, end, speaker = convert_rttm_line(rttm_line)
|
94 |
+
if target_spks is None or speaker in pairwise_infer_spks:
|
95 |
+
end_list.append(end)
|
96 |
+
stt_list.append(start)
|
97 |
+
speaker_list.append(speaker)
|
98 |
+
rttm_tup = (stt_list, end_list, speaker_list)
|
99 |
+
return rttm_tup
|
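# A minimal sketch of extract_seg_info_from_rttm() on two standard-format RTTM lines;
# the session name, timings and speaker labels are hypothetical.
def _example_extract_seg_info_from_rttm():
    rttm_lines = [
        "SPEAKER session0 1 0.00 1.50 <NA> <NA> speaker_0 <NA> <NA>",
        "SPEAKER session0 1 1.20 2.30 <NA> <NA> speaker_1 <NA> <NA>",
    ]
    # Without target_spks and mapping_dict, all speakers in the RTTM are kept.
    stt_list, end_list, speaker_list = extract_seg_info_from_rttm("session0", rttm_lines)
    return stt_list, end_list, speaker_list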
100 |
+
|
101 |
+
|
102 |
+
def assign_frame_level_spk_vector(rttm_timestamps, round_digits, frame_per_sec, target_spks, min_spks=2):
|
103 |
+
"""
|
104 |
+
Create a multi-dimensional vector sequence containing speaker timestamp information in RTTM.
|
105 |
+
The unit-length is the frame shift length of the acoustic feature. The feature-level annotations
|
106 |
+
`fr_level_target` will later be converted to base-segment-level diarization labels.
|
107 |
+
|
108 |
+
Args:
|
109 |
+
rttm_timestamps (list):
|
110 |
+
List containing start and end time for each speaker segment label.
|
111 |
+
stt_list, end_list and speaker_list are contained.
|
112 |
+
frame_per_sec (int):
|
113 |
+
Number of feature frames per second. This quantity is determined by window_stride variable in preprocessing module.
|
114 |
+
target_spks (tuple):
|
115 |
+
Speaker indices that are generated from combinations. If there are only one or two speakers,
|
116 |
+
only a single target_spks tuple is generated.
|
117 |
+
|
118 |
+
Returns:
|
119 |
+
fr_level_target (torch.tensor):
|
120 |
+
Tensor containing label for each feature level frame.
|
121 |
+
"""
|
122 |
+
stt_list, end_list, speaker_list = rttm_timestamps
|
123 |
+
if len(speaker_list) == 0:
|
124 |
+
return None
|
125 |
+
else:
|
126 |
+
sorted_speakers = sorted(list(set(speaker_list)))
|
127 |
+
total_fr_len = int(max(end_list) * (10 ** round_digits))
|
128 |
+
spk_num = max(len(sorted_speakers), min_spks)
|
129 |
+
speaker_mapping_dict = {rttm_key: x_int for x_int, rttm_key in enumerate(sorted_speakers)}
|
130 |
+
fr_level_target = torch.zeros(total_fr_len, spk_num)
|
131 |
+
|
132 |
+
# If RTTM is not provided, then there is no speaker mapping dict in target_spks.
|
133 |
+
# Thus, return a zero-filled tensor as a placeholder.
|
134 |
+
for count, (stt, end, spk_rttm_key) in enumerate(zip(stt_list, end_list, speaker_list)):
|
135 |
+
stt, end = round(stt, round_digits), round(end, round_digits)
|
136 |
+
spk = speaker_mapping_dict[spk_rttm_key]
|
137 |
+
stt_fr, end_fr = int(round(stt, 2) * frame_per_sec), int(round(end, round_digits) * frame_per_sec)
|
138 |
+
fr_level_target[stt_fr:end_fr, spk] = 1
|
139 |
+
return fr_level_target
|
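# A minimal sketch of assign_frame_level_spk_vector() on hand-made timestamp lists;
# frame_per_sec=100 corresponds to a 10 ms window stride, and all values are hypothetical.
def _example_assign_frame_level_spk_vector():
    rttm_timestamps = ([0.0, 1.2], [1.5, 2.3], ['speaker_0', 'speaker_1'])  # (stt_list, end_list, speaker_list)
    fr_level_target = assign_frame_level_spk_vector(
        rttm_timestamps, round_digits=2, frame_per_sec=100, target_spks=(0, 1)
    )
    # fr_level_target has shape (int(max(end_list) * 100), 2): one binary row per 10 ms frame.
    return fr_level_target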
140 |
+
|
141 |
+
|
142 |
+
class _AudioMSDDTrainDataset(Dataset):
|
143 |
+
"""
|
144 |
+
Dataset class that loads a json file containing paths to audio files,
|
145 |
+
RTTM files and number of speakers. This Dataset class is designed for
|
146 |
+
training or fine-tuning speaker embedding extractor and diarization decoder
|
147 |
+
at the same time.
|
148 |
+
|
149 |
+
Example:
|
150 |
+
{"audio_filepath": "/path/to/audio_0.wav", "num_speakers": 2,
|
151 |
+
"rttm_filepath": "/path/to/diar_label_0.rttm}
|
152 |
+
...
|
153 |
+
{"audio_filepath": "/path/to/audio_n.wav", "num_speakers": 2,
|
154 |
+
"rttm_filepath": "/path/to/diar_label_n.rttm}
|
155 |
+
|
156 |
+
Args:
|
157 |
+
manifest_filepath (str):
|
158 |
+
Path to input manifest json files.
|
159 |
+
multiscale_args_dict (dict):
|
160 |
+
Dictionary containing the parameters for multiscale segmentation and clustering.
|
161 |
+
emb_dir (str):
|
162 |
+
Path to a temporary folder where segmentation information for embedding extraction is saved.
|
163 |
+
soft_label_thres (float):
|
164 |
+
Threshold that determines the label of each segment based on RTTM file information.
|
165 |
+
featurizer:
|
166 |
+
Featurizer instance for generating features from the raw waveform.
|
167 |
+
window_stride (float):
|
168 |
+
Window stride for acoustic feature. This value is used for calculating the number of feature-level frames.
|
169 |
+
emb_batch_size (int):
|
170 |
+
Number of embedding vectors that are trained with attached computational graphs.
|
171 |
+
pairwise_infer (bool):
|
172 |
+
This variable should be True if dataloader is created for an inference task.
|
173 |
+
random_flip (bool):
|
174 |
+
If True, the two speaker labels and the corresponding input signals are randomly flipped every epoch during training.
|
175 |
+
"""
|
176 |
+
|
177 |
+
@property
|
178 |
+
def output_types(self) -> Optional[Dict[str, NeuralType]]:
|
179 |
+
"""Returns definitions of module output ports."""
|
180 |
+
output_types = {
|
181 |
+
"features": NeuralType(('B', 'T'), AudioSignal()),
|
182 |
+
"feature_length": NeuralType(('B'), LengthsType()),
|
183 |
+
"ms_seg_timestamps": NeuralType(('B', 'C', 'T', 'D'), LengthsType()),
|
184 |
+
"ms_seg_counts": NeuralType(('B', 'C'), LengthsType()),
|
185 |
+
"clus_label_index": NeuralType(('B', 'T'), LengthsType()),
|
186 |
+
"scale_mapping": NeuralType(('B', 'C', 'T'), LengthsType()),
|
187 |
+
"targets": NeuralType(('B', 'T', 'C'), ProbsType()),
|
188 |
+
}
|
189 |
+
|
190 |
+
return output_types
|
191 |
+
|
192 |
+
def __init__(
|
193 |
+
self,
|
194 |
+
*,
|
195 |
+
manifest_filepath: str,
|
196 |
+
multiscale_args_dict: str,
|
197 |
+
emb_dir: str,
|
198 |
+
soft_label_thres: float,
|
199 |
+
featurizer,
|
200 |
+
window_stride,
|
201 |
+
emb_batch_size,
|
202 |
+
pairwise_infer: bool,
|
203 |
+
random_flip: bool = True,
|
204 |
+
global_rank: int = 0,
|
205 |
+
):
|
206 |
+
super().__init__()
|
207 |
+
self.collection = DiarizationSpeechLabel(
|
208 |
+
manifests_files=manifest_filepath.split(','),
|
209 |
+
emb_dict=None,
|
210 |
+
clus_label_dict=None,
|
211 |
+
pairwise_infer=pairwise_infer,
|
212 |
+
)
|
213 |
+
self.featurizer = featurizer
|
214 |
+
self.multiscale_args_dict = multiscale_args_dict
|
215 |
+
self.emb_dir = emb_dir
|
216 |
+
self.round_digits = 2
|
217 |
+
self.decim = 10 ** self.round_digits
|
218 |
+
self.soft_label_thres = soft_label_thres
|
219 |
+
self.pairwise_infer = pairwise_infer
|
220 |
+
self.max_spks = 2
|
221 |
+
self.frame_per_sec = int(1 / window_stride)
|
222 |
+
self.emb_batch_size = emb_batch_size
|
223 |
+
self.random_flip = random_flip
|
224 |
+
self.global_rank = global_rank
|
225 |
+
self.manifest_filepath = manifest_filepath
|
226 |
+
self.multiscale_timestamp_dict = prepare_split_data(
|
227 |
+
self.manifest_filepath, self.emb_dir, self.multiscale_args_dict, self.global_rank,
|
228 |
+
)
|
229 |
+
|
230 |
+
def __len__(self):
|
231 |
+
return len(self.collection)
|
232 |
+
|
233 |
+
def assign_labels_to_longer_segs(self, uniq_id, base_scale_clus_label):
|
234 |
+
"""
|
235 |
+
Assign the generated speaker labels from the base scale (the finest scale) to the longer scales.
|
236 |
+
This process is needed to get the cluster labels for each scale. The cluster labels are needed to
|
237 |
+
calculate the cluster-average speaker embedding for each scale.
|
238 |
+
|
239 |
+
Args:
|
240 |
+
uniq_id (str):
|
241 |
+
Unique sample ID for training.
|
242 |
+
base_scale_clus_label (torch.tensor):
|
243 |
+
Tensor variable containing the speaker labels for the base-scale segments.
|
244 |
+
|
245 |
+
Returns:
|
246 |
+
per_scale_clus_label (torch.tensor):
|
247 |
+
Tensor variable containing the speaker labels for each segment in each scale.
|
248 |
+
Note that the total length of the speaker label sequence differs over scale since
|
249 |
+
each scale has a different number of segments for the same session.
|
250 |
+
|
251 |
+
scale_mapping (torch.tensor):
|
252 |
+
Matrix containing the segment indices of each scale. scale_mapping is necessary for reshaping the
|
253 |
+
multiscale embeddings to form an input matrix for the MSDD model.
|
254 |
+
"""
|
255 |
+
per_scale_clus_label = []
|
256 |
+
self.scale_n = len(self.multiscale_timestamp_dict[uniq_id]['scale_dict'])
|
257 |
+
uniq_scale_mapping = get_scale_mapping_list(self.multiscale_timestamp_dict[uniq_id])
|
258 |
+
for scale_index in range(self.scale_n):
|
259 |
+
new_clus_label = []
|
260 |
+
scale_seq_len = len(self.multiscale_timestamp_dict[uniq_id]["scale_dict"][scale_index]["time_stamps"])
|
261 |
+
for seg_idx in range(scale_seq_len):
|
262 |
+
if seg_idx in uniq_scale_mapping[scale_index]:
|
263 |
+
seg_clus_label = mode(base_scale_clus_label[uniq_scale_mapping[scale_index] == seg_idx])
|
264 |
+
else:
|
265 |
+
seg_clus_label = 0 if len(new_clus_label) == 0 else new_clus_label[-1]
|
266 |
+
new_clus_label.append(seg_clus_label)
|
267 |
+
per_scale_clus_label.extend(new_clus_label)
|
268 |
+
per_scale_clus_label = torch.tensor(per_scale_clus_label)
|
269 |
+
return per_scale_clus_label, uniq_scale_mapping
|
270 |
+
|
271 |
+
def get_diar_target_labels(self, uniq_id, sample, fr_level_target):
|
272 |
+
"""
|
273 |
+
Convert frame-level diarization target variable into segment-level target variable. Since the granularity is reduced
|
274 |
+
from frame level (10ms) to segment level (100ms~500ms), we need a threshold value, `soft_label_thres`, which determines
|
275 |
+
the label of each segment based on the overlap between a segment range (start and end time) and the frame-level target variable.
|
276 |
+
|
277 |
+
Args:
|
278 |
+
uniq_id (str):
|
279 |
+
Unique file ID that refers to an input audio file and corresponding RTTM (Annotation) file.
|
280 |
+
sample:
|
281 |
+
`DiarizationSpeechLabel` instance containing sample information such as audio filepath and RTTM filepath.
|
282 |
+
fr_level_target (torch.tensor):
|
283 |
+
Tensor containing label for each feature-level frame.
|
284 |
+
|
285 |
+
Returns:
|
286 |
+
seg_target (torch.tensor):
|
287 |
+
Tensor containing binary speaker labels for base-scale segments.
|
288 |
+
base_clus_label (torch.tensor):
|
289 |
+
Representative speaker label for each segment. This variable only has one speaker label for each base-scale segment.
|
290 |
+
-1 means that there is no corresponding speaker in the target_spks tuple.
|
291 |
+
"""
|
292 |
+
seg_target_list, base_clus_label = [], []
|
293 |
+
self.scale_n = len(self.multiscale_timestamp_dict[uniq_id]['scale_dict'])
|
294 |
+
subseg_time_stamp_list = self.multiscale_timestamp_dict[uniq_id]["scale_dict"][self.scale_n - 1]["time_stamps"]
|
295 |
+
for (seg_stt, seg_end) in subseg_time_stamp_list:
|
296 |
+
seg_stt_fr, seg_end_fr = int(seg_stt * self.frame_per_sec), int(seg_end * self.frame_per_sec)
|
297 |
+
soft_label_vec_sess = torch.sum(fr_level_target[seg_stt_fr:seg_end_fr, :], axis=0) / (
|
298 |
+
seg_end_fr - seg_stt_fr
|
299 |
+
)
|
300 |
+
label_int_sess = torch.argmax(soft_label_vec_sess)
|
301 |
+
soft_label_vec = soft_label_vec_sess.unsqueeze(0)[:, sample.target_spks].squeeze()
|
302 |
+
if label_int_sess in sample.target_spks and torch.sum(soft_label_vec_sess) > 0:
|
303 |
+
label_int = sample.target_spks.index(label_int_sess)
|
304 |
+
else:
|
305 |
+
label_int = -1
|
306 |
+
label_vec = (soft_label_vec > self.soft_label_thres).float()
|
307 |
+
seg_target_list.append(label_vec.detach())
|
308 |
+
base_clus_label.append(label_int)
|
309 |
+
seg_target = torch.stack(seg_target_list)
|
310 |
+
base_clus_label = torch.tensor(base_clus_label)
|
311 |
+
return seg_target, base_clus_label
|
312 |
+
|
313 |
+
def parse_rttm_for_ms_targets(self, sample):
|
314 |
+
"""
|
315 |
+
Generate target tensor variable by extracting groundtruth diarization labels from an RTTM file.
|
316 |
+
This function converts (start, end, speaker_id) format into base-scale (the finest scale) segment level
|
317 |
+
diarization label in a matrix form.
|
318 |
+
|
319 |
+
Example of seg_target:
|
320 |
+
[[0., 1.], [0., 1.], [1., 1.], [1., 0.], [1., 0.], ..., [0., 1.]]
|
321 |
+
|
322 |
+
Args:
|
323 |
+
sample:
|
324 |
+
`DiarizationSpeechLabel` instance containing sample information such as audio filepath and RTTM filepath.
|
325 |
+
target_spks (tuple):
|
326 |
+
Speaker indices that are generated from combinations. If there are only one or two speakers,
|
327 |
+
only a single target_spks tuple is generated.
|
328 |
+
|
329 |
+
Returns:
|
330 |
+
clus_label_index (torch.tensor):
|
331 |
+
Groundtruth clustering label (cluster index for each segment) from RTTM files for training purpose.
|
332 |
+
seg_target (torch.tensor):
|
333 |
+
Tensor variable containing hard-labels of speaker activity in each base-scale segment.
|
334 |
+
scale_mapping (torch.tensor):
|
335 |
+
Matrix containing the segment indices of each scale. scale_mapping is necessary for reshaping the
|
336 |
+
multiscale embeddings to form an input matrix for the MSDD model.
|
337 |
+
|
338 |
+
"""
|
339 |
+
rttm_lines = open(sample.rttm_file).readlines()
|
340 |
+
uniq_id = self.get_uniq_id_with_range(sample)
|
341 |
+
rttm_timestamps = extract_seg_info_from_rttm(uniq_id, rttm_lines)
|
342 |
+
fr_level_target = assign_frame_level_spk_vector(
|
343 |
+
rttm_timestamps, self.round_digits, self.frame_per_sec, target_spks=sample.target_spks
|
344 |
+
)
|
345 |
+
seg_target, base_clus_label = self.get_diar_target_labels(uniq_id, sample, fr_level_target)
|
346 |
+
clus_label_index, scale_mapping = self.assign_labels_to_longer_segs(uniq_id, base_clus_label)
|
347 |
+
return clus_label_index, seg_target, scale_mapping
|
348 |
+
|
349 |
+
def get_uniq_id_with_range(self, sample, deci=3):
|
350 |
+
"""
|
351 |
+
Generate a unique training sample ID from the unique file ID, offset and duration. The ID with appended
|
352 |
+
start and end times is required for identifying the sample, since multiple short audio samples are generated from a single
|
353 |
+
audio file. The start and end times of the audio stream use millisecond units if `deci=3`.
|
354 |
+
|
355 |
+
Args:
|
356 |
+
sample:
|
357 |
+
`DiarizationSpeechLabel` instance from collections.
|
358 |
+
|
359 |
+
Returns:
|
360 |
+
uniq_id (str):
|
361 |
+
Unique sample ID which includes start and end time of the audio stream.
|
362 |
+
Example: abc1001_3122_6458
|
363 |
+
|
364 |
+
"""
|
365 |
+
bare_uniq_id = os.path.splitext(os.path.basename(sample.rttm_file))[0]
|
366 |
+
offset = str(int(round(sample.offset, deci) * pow(10, deci)))
|
367 |
+
endtime = str(int(round(sample.offset + sample.duration, deci) * pow(10, deci)))
|
368 |
+
uniq_id = f"{bare_uniq_id}_{offset}_{endtime}"
|
369 |
+
return uniq_id
|
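# A minimal sketch of the ID format built above, using float-safe hypothetical values:
# offset 3.25 s and duration 3.25 s on a file "abc1001" with deci=3 (millisecond units).
def _example_uniq_id_with_range(deci=3):
    bare_uniq_id = "abc1001"        # hypothetical base file ID
    offset, duration = 3.25, 3.25   # hypothetical offset and duration in seconds
    start = str(int(round(offset, deci) * pow(10, deci)))
    end = str(int(round(offset + duration, deci) * pow(10, deci)))
    return f"{bare_uniq_id}_{start}_{end}"  # -> "abc1001_3250_6500"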
370 |
+
|
371 |
+
def get_ms_seg_timestamps(self, sample):
|
372 |
+
"""
|
373 |
+
Get start and end time of segments in each scale.
|
374 |
+
|
375 |
+
Args:
|
376 |
+
sample:
|
377 |
+
`DiarizationSpeechLabel` instance from preprocessing.collections
|
378 |
+
Returns:
|
379 |
+
ms_seg_timestamps (torch.tensor):
|
380 |
+
Tensor containing Multiscale segment timestamps.
|
381 |
+
ms_seg_counts (torch.tensor):
|
382 |
+
Number of segments for each scale. This information is used for reshaping embedding batch
|
383 |
+
during forward propagation.
|
384 |
+
"""
|
385 |
+
uniq_id = self.get_uniq_id_with_range(sample)
|
386 |
+
ms_seg_timestamps_list = []
|
387 |
+
max_seq_len = len(self.multiscale_timestamp_dict[uniq_id]["scale_dict"][self.scale_n - 1]["time_stamps"])
|
388 |
+
ms_seg_counts = [0 for _ in range(self.scale_n)]
|
389 |
+
for scale_idx in range(self.scale_n):
|
390 |
+
scale_ts_list = []
|
391 |
+
for k, (seg_stt, seg_end) in enumerate(
|
392 |
+
self.multiscale_timestamp_dict[uniq_id]["scale_dict"][scale_idx]["time_stamps"]
|
393 |
+
):
|
394 |
+
stt, end = (
|
395 |
+
int((seg_stt - sample.offset) * self.frame_per_sec),
|
396 |
+
int((seg_end - sample.offset) * self.frame_per_sec),
|
397 |
+
)
|
398 |
+
scale_ts_list.append(torch.tensor([stt, end]).detach())
|
399 |
+
ms_seg_counts[scale_idx] = len(
|
400 |
+
self.multiscale_timestamp_dict[uniq_id]["scale_dict"][scale_idx]["time_stamps"]
|
401 |
+
)
|
402 |
+
scale_ts = torch.stack(scale_ts_list)
|
403 |
+
scale_ts_padded = torch.cat([scale_ts, torch.zeros(max_seq_len - len(scale_ts_list), 2)], dim=0)
|
404 |
+
ms_seg_timestamps_list.append(scale_ts_padded.detach())
|
405 |
+
ms_seg_timestamps = torch.stack(ms_seg_timestamps_list)
|
406 |
+
ms_seg_counts = torch.tensor(ms_seg_counts)
|
407 |
+
return ms_seg_timestamps, ms_seg_counts
|
408 |
+
|
409 |
+
def __getitem__(self, index):
|
410 |
+
sample = self.collection[index]
|
411 |
+
if sample.offset is None:
|
412 |
+
sample.offset = 0
|
413 |
+
clus_label_index, targets, scale_mapping = self.parse_rttm_for_ms_targets(sample)
|
414 |
+
features = self.featurizer.process(sample.audio_file, offset=sample.offset, duration=sample.duration)
|
415 |
+
feature_length = torch.tensor(features.shape[0]).long()
|
416 |
+
ms_seg_timestamps, ms_seg_counts = self.get_ms_seg_timestamps(sample)
|
417 |
+
if self.random_flip:
|
418 |
+
torch.manual_seed(index)
|
419 |
+
flip = torch.cat([torch.randperm(self.max_spks), torch.tensor(-1).unsqueeze(0)])
|
420 |
+
clus_label_index, targets = flip[clus_label_index], targets[:, flip[: self.max_spks]]
|
421 |
+
return features, feature_length, ms_seg_timestamps, ms_seg_counts, clus_label_index, scale_mapping, targets
|
422 |
+
|
423 |
+
|
424 |
+
class _AudioMSDDInferDataset(Dataset):
|
425 |
+
"""
|
426 |
+
Dataset class that loads a json file containing paths to audio files,
|
427 |
+
RTTM files and number of speakers. This Dataset class is built for diarization inference and
|
428 |
+
evaluation. Speaker embedding sequences, segment timestamps, and cluster-average speaker embeddings
|
429 |
+
are loaded from memory and fed into the dataloader.
|
430 |
+
|
431 |
+
Example:
|
432 |
+
{"audio_filepath": "/path/to/audio_0.wav", "num_speakers": 2,
|
433 |
+
"rttm_filepath": "/path/to/diar_label_0.rttm}
|
434 |
+
...
|
435 |
+
{"audio_filepath": "/path/to/audio_n.wav", "num_speakers": 2,
|
436 |
+
"rttm_filepath": "/path/to/diar_label_n.rttm}
|
437 |
+
|
438 |
+
Args:
|
439 |
+
manifest_filepath (str):
|
440 |
+
Path to input manifest json files.
|
441 |
+
emb_dict (dict):
|
442 |
+
Dictionary containing cluster-average embeddings and speaker mapping information.
|
443 |
+
emb_seq (dict):
|
444 |
+
Dictionary containing multiscale speaker embedding sequence, scale mapping and corresponding segment timestamps.
|
445 |
+
clus_label_dict (dict):
|
446 |
+
Subsegment-level (from base-scale) speaker labels from clustering results.
|
447 |
+
soft_label_thres (float):
|
448 |
+
A threshold that determines the label of each segment based on RTTM file information.
|
449 |
+
featurizer:
|
450 |
+
Featurizer instance for generating features from raw waveform.
|
451 |
+
seq_eval_mode (bool):
|
452 |
+
If True, F1 score will be calculated for each speaker pair during inference mode.
|
453 |
+
window_stride (float):
|
454 |
+
Window stride for acoustic feature. This value is used for calculating the number of feature-level frames.
|
455 |
+
use_single_scale_clus (bool):
|
456 |
+
Use only one scale for clustering instead of using multiple scales of embeddings for clustering.
|
457 |
+
pairwise_infer (bool):
|
458 |
+
This variable should be True if dataloader is created for an inference task.
|
459 |
+
"""
|
460 |
+
|
461 |
+
@property
|
462 |
+
def output_types(self) -> Optional[Dict[str, NeuralType]]:
|
463 |
+
"""Returns definitions of module output ports."""
|
464 |
+
output_types = OrderedDict(
|
465 |
+
{
|
466 |
+
"ms_emb_seq": NeuralType(('B', 'T', 'C', 'D'), SpectrogramType()),
|
467 |
+
"length": NeuralType(tuple('B'), LengthsType()),
|
468 |
+
"ms_avg_embs": NeuralType(('B', 'C', 'D', 'C'), EncodedRepresentation()),
|
469 |
+
"targets": NeuralType(('B', 'T', 'C'), ProbsType()),
|
470 |
+
}
|
471 |
+
)
|
472 |
+
return output_types
|
473 |
+
|
474 |
+
def __init__(
|
475 |
+
self,
|
476 |
+
*,
|
477 |
+
manifest_filepath: str,
|
478 |
+
emb_dict: Dict,
|
479 |
+
emb_seq: Dict,
|
480 |
+
clus_label_dict: Dict,
|
481 |
+
soft_label_thres: float,
|
482 |
+
seq_eval_mode: bool,
|
483 |
+
window_stride: float,
|
484 |
+
use_single_scale_clus: bool,
|
485 |
+
pairwise_infer: bool,
|
486 |
+
):
|
487 |
+
super().__init__()
|
488 |
+
self.collection = DiarizationSpeechLabel(
|
489 |
+
manifests_files=manifest_filepath.split(','),
|
490 |
+
emb_dict=emb_dict,
|
491 |
+
clus_label_dict=clus_label_dict,
|
492 |
+
seq_eval_mode=seq_eval_mode,
|
493 |
+
pairwise_infer=pairwise_infer,
|
494 |
+
)
|
495 |
+
self.emb_dict = emb_dict
|
496 |
+
self.emb_seq = emb_seq
|
497 |
+
self.clus_label_dict = clus_label_dict
|
498 |
+
self.round_digits = 2
|
499 |
+
self.decim = 10 ** self.round_digits
|
500 |
+
self.frame_per_sec = int(1 / window_stride)
|
501 |
+
self.soft_label_thres = soft_label_thres
|
502 |
+
self.pairwise_infer = pairwise_infer
|
503 |
+
self.max_spks = 2
|
504 |
+
self.use_single_scale_clus = use_single_scale_clus
|
505 |
+
self.seq_eval_mode = seq_eval_mode
|
506 |
+
|
507 |
+
def __len__(self):
|
508 |
+
return len(self.collection)
|
509 |
+
|
510 |
+
def parse_rttm_multiscale(self, sample):
|
511 |
+
"""
|
512 |
+
Generate target tensor variable by extracting groundtruth diarization labels from an RTTM file.
|
513 |
+
This function is only used when ``self.seq_eval_mode=True`` and RTTM files are provided. This function converts
|
514 |
+
(start, end, speaker_id) format into base-scale (the finest scale) segment level diarization label in a matrix
|
515 |
+
form to create target matrix.
|
516 |
+
|
517 |
+
Args:
|
518 |
+
sample:
|
519 |
+
DiarizationSpeechLabel instance containing sample information such as audio filepath and RTTM filepath.
|
520 |
+
target_spks (tuple):
|
521 |
+
Two indices of targeted speakers for evaluation.
|
522 |
+
Example of target_spks: (2, 3)
|
523 |
+
Returns:
|
524 |
+
seg_target (torch.tensor):
|
525 |
+
Tensor variable containing hard-labels of speaker activity in each base-scale segment.
|
526 |
+
"""
|
527 |
+
if sample.rttm_file is None:
|
528 |
+
raise ValueError(f"RTTM file is not provided for this sample {sample}")
|
529 |
+
rttm_lines = open(sample.rttm_file).readlines()
|
530 |
+
uniq_id = os.path.splitext(os.path.basename(sample.rttm_file))[0]
|
531 |
+
mapping_dict = self.emb_dict[max(self.emb_dict.keys())][uniq_id]['mapping']
|
532 |
+
rttm_timestamps = extract_seg_info_from_rttm(uniq_id, rttm_lines, mapping_dict, sample.target_spks)
|
533 |
+
fr_level_target = assign_frame_level_spk_vector(
|
534 |
+
rttm_timestamps, self.round_digits, self.frame_per_sec, sample.target_spks
|
535 |
+
)
|
536 |
+
seg_target = self.get_diar_target_labels_from_fr_target(uniq_id, fr_level_target)
|
537 |
+
return seg_target
|
538 |
+
|
539 |
+
def get_diar_target_labels_from_fr_target(self, uniq_id, fr_level_target):
|
540 |
+
"""
|
541 |
+
Generate base-scale level binary diarization label from frame-level target matrix. For the given frame-level
|
542 |
+
speaker target matrix fr_level_target, we count the number of frames that belong to each speaker and calculate
|
543 |
+
ratios for each speaker into the `soft_label_vec` variable. Finally, `soft_label_vec` variable is compared with `soft_label_thres`
|
544 |
+
to determine whether a label vector should contain 0 or 1 for each speaker bin. Note that seg_target variable has
|
545 |
+
a dimension of (number of base-scale segments x 2).
|
546 |
+
|
547 |
+
Example of seg_target:
|
548 |
+
[[0., 1.], [0., 1.], [1., 1.], [1., 0.], [1., 0.], ..., [0., 1.]]
|
549 |
+
|
550 |
+
Args:
|
551 |
+
uniq_id (str):
|
552 |
+
Unique file ID that refers to an input audio file and corresponding RTTM (Annotation) file.
|
553 |
+
fr_level_target (torch.tensor):
|
554 |
+
frame-level binary speaker annotation (1: exists, 0: does not exist) generated from the RTTM file.
|
555 |
+
|
556 |
+
Returns:
|
557 |
+
seg_target (torch.tensor):
|
558 |
+
Tensor variable containing binary hard-labels of speaker activity in each base-scale segment.
|
559 |
+
|
560 |
+
"""
|
561 |
+
if fr_level_target is None:
|
562 |
+
return None
|
563 |
+
else:
|
564 |
+
seg_target_list = []
|
565 |
+
for (seg_stt, seg_end, label_int) in self.clus_label_dict[uniq_id]:
|
566 |
+
seg_stt_fr, seg_end_fr = int(seg_stt * self.frame_per_sec), int(seg_end * self.frame_per_sec)
|
567 |
+
soft_label_vec = torch.sum(fr_level_target[seg_stt_fr:seg_end_fr, :], axis=0) / (
|
568 |
+
seg_end_fr - seg_stt_fr
|
569 |
+
)
|
570 |
+
label_vec = (soft_label_vec > self.soft_label_thres).int()
|
571 |
+
seg_target_list.append(label_vec)
|
572 |
+
seg_target = torch.stack(seg_target_list)
|
573 |
+
return seg_target
|
574 |
+
|
575 |
+
def __getitem__(self, index):
|
576 |
+
sample = self.collection[index]
|
577 |
+
if sample.offset is None:
|
578 |
+
sample.offset = 0
|
579 |
+
|
580 |
+
uniq_id = os.path.splitext(os.path.basename(sample.audio_file))[0]
|
581 |
+
scale_n = len(self.emb_dict.keys())
|
582 |
+
_avg_embs = torch.stack([self.emb_dict[scale_index][uniq_id]['avg_embs'] for scale_index in range(scale_n)])
|
583 |
+
|
584 |
+
if self.pairwise_infer:
|
585 |
+
avg_embs = _avg_embs[:, :, self.collection[index].target_spks]
|
586 |
+
else:
|
587 |
+
avg_embs = _avg_embs
|
588 |
+
|
589 |
+
if avg_embs.shape[2] > self.max_spks:
|
590 |
+
raise ValueError(
|
591 |
+
f" avg_embs.shape[2] {avg_embs.shape[2]} should be less than or equal to self.max_num_speakers {self.max_spks}"
|
592 |
+
)
|
593 |
+
|
594 |
+
feats = []
|
595 |
+
for scale_index in range(scale_n):
|
596 |
+
repeat_mat = self.emb_seq["session_scale_mapping"][uniq_id][scale_index]
|
597 |
+
feats.append(self.emb_seq[scale_index][uniq_id][repeat_mat, :])
|
598 |
+
feats_out = torch.stack(feats).permute(1, 0, 2)
|
599 |
+
feats_len = feats_out.shape[0]
|
600 |
+
|
601 |
+
if self.seq_eval_mode:
|
602 |
+
targets = self.parse_rttm_multiscale(sample)
|
603 |
+
else:
|
604 |
+
targets = torch.zeros(feats_len, 2).float()
|
605 |
+
|
606 |
+
return feats_out, feats_len, targets, avg_embs
|
607 |
+
|
608 |
+
|
609 |
+
def _msdd_train_collate_fn(self, batch):
|
610 |
+
"""
|
611 |
+
Collate batch of variables that are needed for raw waveform to diarization label training.
|
612 |
+
The following variables are included in training/validation batch:
|
613 |
+
|
614 |
+
Args:
|
615 |
+
batch (tuple):
|
616 |
+
Batch tuple containing the variables for the diarization training.
|
617 |
+
Returns:
|
618 |
+
features (torch.tensor):
|
619 |
+
Raw waveform samples (time series) loaded from the audio_filepath in the input manifest file.
|
620 |
+
feature_length (torch.tensor):
|
621 |
+
A list of lengths of the raw waveform samples.
|
622 |
+
ms_seg_timestamps (torch.tensor):
|
623 |
+
Matrix containing the start time and end time (timestamps) for each segment and each scale.
|
624 |
+
ms_seg_timestamps is needed for extracting acoustic features from raw waveforms.
|
625 |
+
ms_seg_counts (torch.tensor):
|
626 |
+
Matrix containing the number of segments for each scale. ms_seg_counts is necessary for reshaping
|
627 |
+
the input matrix for the MSDD model.
|
628 |
+
clus_label_index (torch.tensor):
|
629 |
+
Groundtruth Clustering label (cluster index for each segment) from RTTM files for training purpose.
|
630 |
+
clus_label_index is necessary for calculating cluster-average embedding.
|
631 |
+
scale_mapping (torch.tensor):
|
632 |
+
Matrix containing the segment indices of each scale. scale_mapping is necessary for reshaping the
|
633 |
+
multiscale embeddings to form an input matrix for the MSDD model.
|
634 |
+
targets (torch.tensor):
|
635 |
+
Groundtruth Speaker label for the given input embedding sequence.
|
636 |
+
"""
|
637 |
+
packed_batch = list(zip(*batch))
|
638 |
+
features, feature_length, ms_seg_timestamps, ms_seg_counts, clus_label_index, scale_mapping, targets = packed_batch
|
639 |
+
features_list, feature_length_list = [], []
|
640 |
+
ms_seg_timestamps_list, ms_seg_counts_list, scale_clus_label_list, scale_mapping_list, targets_list = (
|
641 |
+
[],
|
642 |
+
[],
|
643 |
+
[],
|
644 |
+
[],
|
645 |
+
[],
|
646 |
+
)
|
647 |
+
|
648 |
+
max_raw_feat_len = max([x.shape[0] for x in features])
|
649 |
+
max_target_len = max([x.shape[0] for x in targets])
|
650 |
+
max_total_seg_len = max([x.shape[0] for x in clus_label_index])
|
651 |
+
|
652 |
+
for feat, feat_len, ms_seg_ts, ms_seg_ct, scale_clus, scl_map, tgt in batch:
|
653 |
+
seq_len = tgt.shape[0]
|
654 |
+
pad_feat = (0, max_raw_feat_len - feat_len)
|
655 |
+
pad_tgt = (0, 0, 0, max_target_len - seq_len)
|
656 |
+
pad_sm = (0, max_target_len - seq_len)
|
657 |
+
pad_ts = (0, 0, 0, max_target_len - seq_len)
|
658 |
+
pad_sc = (0, max_total_seg_len - scale_clus.shape[0])
|
659 |
+
padded_feat = torch.nn.functional.pad(feat, pad_feat)
|
660 |
+
padded_tgt = torch.nn.functional.pad(tgt, pad_tgt)
|
661 |
+
padded_sm = torch.nn.functional.pad(scl_map, pad_sm)
|
662 |
+
padded_ms_seg_ts = torch.nn.functional.pad(ms_seg_ts, pad_ts)
|
663 |
+
padded_scale_clus = torch.nn.functional.pad(scale_clus, pad_sc)
|
664 |
+
|
665 |
+
features_list.append(padded_feat)
|
666 |
+
feature_length_list.append(feat_len.clone().detach())
|
667 |
+
ms_seg_timestamps_list.append(padded_ms_seg_ts)
|
668 |
+
ms_seg_counts_list.append(ms_seg_ct.clone().detach())
|
669 |
+
scale_clus_label_list.append(padded_scale_clus)
|
670 |
+
scale_mapping_list.append(padded_sm)
|
671 |
+
targets_list.append(padded_tgt)
|
672 |
+
|
673 |
+
features = torch.stack(features_list)
|
674 |
+
feature_length = torch.stack(feature_length_list)
|
675 |
+
ms_seg_timestamps = torch.stack(ms_seg_timestamps_list)
|
676 |
+
clus_label_index = torch.stack(scale_clus_label_list)
|
677 |
+
ms_seg_counts = torch.stack(ms_seg_counts_list)
|
678 |
+
scale_mapping = torch.stack(scale_mapping_list)
|
679 |
+
targets = torch.stack(targets_list)
|
680 |
+
return features, feature_length, ms_seg_timestamps, ms_seg_counts, clus_label_index, scale_mapping, targets
|
681 |
+
|
682 |
+
|
683 |
+
def _msdd_infer_collate_fn(self, batch):
|
684 |
+
"""
|
685 |
+
Collate batch of feats (speaker embeddings), feature lengths, target label sequences and cluster-average embeddings.
|
686 |
+
|
687 |
+
Args:
|
688 |
+
batch (tuple):
|
689 |
+
Batch tuple containing feats, feats_len, targets and ms_avg_embs.
|
690 |
+
Returns:
|
691 |
+
feats (torch.tensor):
|
692 |
+
Collated speaker embedding with unified length.
|
693 |
+
feats_len (torch.tensor):
|
694 |
+
The actual length of each embedding sequence without zero padding.
|
695 |
+
targets (torch.tensor):
|
696 |
+
Groundtruth Speaker label for the given input embedding sequence.
|
697 |
+
ms_avg_embs (torch.tensor):
|
698 |
+
Cluster-average speaker embedding vectors.
|
699 |
+
"""
|
700 |
+
|
701 |
+
packed_batch = list(zip(*batch))
|
702 |
+
feats, feats_len, targets, ms_avg_embs = packed_batch
|
703 |
+
feats_list, flen_list, targets_list, ms_avg_embs_list = [], [], [], []
|
704 |
+
max_audio_len = max(feats_len)
|
705 |
+
max_target_len = max([x.shape[0] for x in targets])
|
706 |
+
|
707 |
+
for feature, feat_len, target, ivector in batch:
|
708 |
+
flen_list.append(feat_len)
|
709 |
+
ms_avg_embs_list.append(ivector)
|
710 |
+
if feat_len < max_audio_len:
|
711 |
+
pad_a = (0, 0, 0, 0, 0, max_audio_len - feat_len)
|
712 |
+
pad_t = (0, 0, 0, max_target_len - target.shape[0])
|
713 |
+
padded_feature = torch.nn.functional.pad(feature, pad_a)
|
714 |
+
padded_target = torch.nn.functional.pad(target, pad_t)
|
715 |
+
feats_list.append(padded_feature)
|
716 |
+
targets_list.append(padded_target)
|
717 |
+
else:
|
718 |
+
targets_list.append(target.clone().detach())
|
719 |
+
feats_list.append(feature.clone().detach())
|
720 |
+
|
721 |
+
feats = torch.stack(feats_list)
|
722 |
+
feats_len = torch.tensor(flen_list)
|
723 |
+
targets = torch.stack(targets_list)
|
724 |
+
ms_avg_embs = torch.stack(ms_avg_embs_list)
|
725 |
+
return feats, feats_len, targets, ms_avg_embs
|
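# A minimal sketch of the inference collate function on a hypothetical batch of two
# sessions (scale_n=3, embedding dim 192, 2 target speakers); `self` is unused by the
# function body, so None can stand in for it here.
def _example_msdd_infer_collate():
    batch = [
        (torch.randn(6, 3, 192), 6, torch.zeros(6, 2), torch.randn(3, 192, 2)),
        (torch.randn(4, 3, 192), 4, torch.zeros(4, 2), torch.randn(3, 192, 2)),
    ]
    feats, feats_len, targets, ms_avg_embs = _msdd_infer_collate_fn(None, batch)
    # The shorter session is zero-padded along the segment axis: feats -> (2, 6, 3, 192).
    return feats.shape, feats_len, targets.shape, ms_avg_embs.shape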
726 |
+
|
727 |
+
|
728 |
+
class AudioToSpeechMSDDTrainDataset(_AudioMSDDTrainDataset):
|
729 |
+
"""
|
730 |
+
Dataset class that loads a json file containing paths to audio files,
|
731 |
+
rttm files and number of speakers. This Dataset class is designed for
|
732 |
+
training or fine-tuning speaker embedding extractor and diarization decoder
|
733 |
+
at the same time.
|
734 |
+
|
735 |
+
Example:
|
736 |
+
{"audio_filepath": "/path/to/audio_0.wav", "num_speakers": 2,
|
737 |
+
"rttm_filepath": "/path/to/diar_label_0.rttm}
|
738 |
+
...
|
739 |
+
{"audio_filepath": "/path/to/audio_n.wav", "num_speakers": 2,
|
740 |
+
"rttm_filepath": "/path/to/diar_label_n.rttm}
|
741 |
+
|
742 |
+
Args:
|
743 |
+
manifest_filepath (str):
|
744 |
+
Path to input manifest json files.
|
745 |
+
multiscale_args_dict (dict):
|
746 |
+
Dictionary containing the parameters for multiscale segmentation and clustering.
|
747 |
+
emb_dir (str):
|
748 |
+
Path to a temporary folder where segmentation information for embedding extraction is saved.
|
749 |
+
soft_label_thres (float):
|
750 |
+
A threshold that determines the label of each segment based on RTTM file information.
|
751 |
+
featurizer:
|
752 |
+
Featurizer instance for generating features from the raw waveform.
|
753 |
+
window_stride (float):
|
754 |
+
Window stride for acoustic feature. This value is used for calculating the number of feature-level frames.
|
755 |
+
emb_batch_size (int):
|
756 |
+
Number of embedding vectors that are trained with attached computational graphs.
|
757 |
+
pairwise_infer (bool):
|
758 |
+
This variable should be True if dataloader is created for an inference task.
|
759 |
+
"""
|
760 |
+
|
761 |
+
def __init__(
|
762 |
+
self,
|
763 |
+
*,
|
764 |
+
manifest_filepath: str,
|
765 |
+
multiscale_args_dict: Dict,
|
766 |
+
emb_dir: str,
|
767 |
+
soft_label_thres: float,
|
768 |
+
featurizer,
|
769 |
+
window_stride,
|
770 |
+
emb_batch_size,
|
771 |
+
pairwise_infer: bool,
|
772 |
+
global_rank: int,
|
773 |
+
):
|
774 |
+
super().__init__(
|
775 |
+
manifest_filepath=manifest_filepath,
|
776 |
+
multiscale_args_dict=multiscale_args_dict,
|
777 |
+
emb_dir=emb_dir,
|
778 |
+
soft_label_thres=soft_label_thres,
|
779 |
+
featurizer=featurizer,
|
780 |
+
window_stride=window_stride,
|
781 |
+
emb_batch_size=emb_batch_size,
|
782 |
+
pairwise_infer=pairwise_infer,
|
783 |
+
global_rank=global_rank,
|
784 |
+
)
|
785 |
+
|
786 |
+
def msdd_train_collate_fn(self, batch):
|
787 |
+
return _msdd_train_collate_fn(self, batch)
|
788 |
+
|
789 |
+
|
790 |
+
class AudioToSpeechMSDDInferDataset(_AudioMSDDInferDataset):
|
791 |
+
"""
|
792 |
+
Dataset class that loads a json file containing paths to audio files,
|
793 |
+
rttm files and number of speakers. The created labels are used for diarization inference.
|
794 |
+
|
795 |
+
Example:
|
796 |
+
{"audio_filepath": "/path/to/audio_0.wav", "num_speakers": 2,
|
797 |
+
"rttm_filepath": "/path/to/diar_label_0.rttm}
|
798 |
+
...
|
799 |
+
{"audio_filepath": "/path/to/audio_n.wav", "num_speakers": 2,
|
800 |
+
"rttm_filepath": "/path/to/diar_label_n.rttm}
|
801 |
+
|
802 |
+
Args:
|
803 |
+
manifest_filepath (str):
|
804 |
+
Path to input manifest json files.
|
805 |
+
emb_dict (dict):
|
806 |
+
Dictionary containing cluster-average embeddings and speaker mapping information.
|
807 |
+
emb_seq (dict):
|
808 |
+
Dictionary containing multiscale speaker embedding sequence, scale mapping and corresponding segment timestamps.
|
809 |
+
clus_label_dict (dict):
|
810 |
+
Subsegment-level (from base-scale) speaker labels from clustering results.
|
811 |
+
soft_label_thres (float):
|
812 |
+
Threshold that determines speaker labels of segments depending on the overlap with groundtruth speaker timestamps.
|
813 |
+
featurizer:
|
814 |
+
Featurizer instance for generating features from raw waveform.
|
815 |
+
use_single_scale_clus (bool):
|
816 |
+
Use only one scale for clustering instead of using multiple scales of embeddings for clustering.
|
817 |
+
seq_eval_mode (bool):
|
818 |
+
If True, F1 score will be calculated for each speaker pair during inference mode.
|
819 |
+
window_stride (float):
|
820 |
+
Window stride for acoustic feature. This value is used for calculating the number of feature-level frames.
|
821 |
+
pairwise_infer (bool):
|
822 |
+
If True, this Dataset class operates in inference mode. In inference mode, a set of speakers in the input audio
|
823 |
+
is split into multiple pairs of speakers and speaker tuples (e.g. 3 speakers: [(0,1), (1,2), (0,2)]) and then
|
824 |
+
fed into the MSDD to merge the individual results.
|
825 |
+
"""
|
826 |
+
|
827 |
+
def __init__(
|
828 |
+
self,
|
829 |
+
*,
|
830 |
+
manifest_filepath: str,
|
831 |
+
emb_dict: Dict,
|
832 |
+
emb_seq: Dict,
|
833 |
+
clus_label_dict: Dict,
|
834 |
+
soft_label_thres: float,
|
835 |
+
use_single_scale_clus: bool,
|
836 |
+
seq_eval_mode: bool,
|
837 |
+
window_stride: float,
|
838 |
+
pairwise_infer: bool,
|
839 |
+
):
|
840 |
+
super().__init__(
|
841 |
+
manifest_filepath=manifest_filepath,
|
842 |
+
emb_dict=emb_dict,
|
843 |
+
emb_seq=emb_seq,
|
844 |
+
clus_label_dict=clus_label_dict,
|
845 |
+
soft_label_thres=soft_label_thres,
|
846 |
+
use_single_scale_clus=use_single_scale_clus,
|
847 |
+
window_stride=window_stride,
|
848 |
+
seq_eval_mode=seq_eval_mode,
|
849 |
+
pairwise_infer=pairwise_infer,
|
850 |
+
)
|
851 |
+
|
852 |
+
def msdd_infer_collate_fn(self, batch):
|
853 |
+
return _msdd_infer_collate_fn(self, batch)
|
SoundScribe/SpeakerID/nemo/collections/asr/data/audio_to_label.py
ADDED
@@ -0,0 +1,1294 @@
1 |
+
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
import io
|
15 |
+
import os
|
16 |
+
from typing import Dict, List, Optional, Union
|
17 |
+
|
18 |
+
import torch
|
19 |
+
import webdataset as wd
|
20 |
+
|
21 |
+
from nemo.collections.asr.data.audio_to_text import cache_datastore_manifests, expand_sharded_filepaths
|
22 |
+
from nemo.collections.asr.parts.preprocessing.features import WaveformFeaturizer
|
23 |
+
from nemo.collections.asr.parts.preprocessing.segment import available_formats as valid_sf_formats
|
24 |
+
from nemo.collections.common.parts.preprocessing import collections
|
25 |
+
from nemo.core.classes import Dataset, IterableDataset
|
26 |
+
from nemo.core.neural_types import AudioSignal, LabelsType, LengthsType, NeuralType, RegressionValuesType
|
27 |
+
from nemo.utils import logging
|
28 |
+
|
29 |
+
# List of valid file formats (prioritized by order of importance)
|
30 |
+
VALID_FILE_FORMATS = ';'.join(['wav', 'mp3', 'flac'] + [fmt.lower() for fmt in valid_sf_formats.keys()])
|
31 |
+
|
32 |
+
|
33 |
+
def repeat_signal(signal: torch.Tensor, sig_len: int, required_length: int) -> torch.Tensor:
|
34 |
+
repeat the signal so that a short signal reaches required_length
|
35 |
+
Args:
|
36 |
+
signal (Tensor): input signal
|
37 |
+
sig_len (int): length of input signal
|
38 |
+
required_length (int): length of generated signal
|
39 |
+
Returns:
|
40 |
+
signal (Tensor): signal of length required_length, generated by repeating the input signal.
|
41 |
+
"""
|
42 |
+
sub: torch.Tensor = torch.tensor([])
|
43 |
+
repeat = int(required_length // sig_len)
|
44 |
+
rem = int(required_length % sig_len)
|
45 |
+
sub: torch.Tensor = torch.tensor([])
|
46 |
+
rep_sig: torch.Tensor = torch.cat(repeat * [signal])
|
47 |
+
if rem > 0:
|
48 |
+
sub = signal[-rem:]
|
49 |
+
signal = torch.cat((rep_sig, sub))
|
50 |
+
else:
|
51 |
+
signal = rep_sig
|
52 |
+
return signal
|
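# A minimal sketch of repeat_signal(): a 3-sample signal tiled and completed to 8 samples.
def _example_repeat_signal():
    sig = torch.tensor([1.0, 2.0, 3.0])
    out = repeat_signal(sig, sig_len=3, required_length=8)
    # out -> tensor([1., 2., 3., 1., 2., 3., 2., 3.]); the remainder is taken from the tail.
    return out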
53 |
+
|
54 |
+
|
55 |
+
def normalize(signal):
|
56 |
+
"""normalize signal
|
57 |
+
Args:
|
58 |
+
signal(FloatTensor): signal to be normalized.
|
59 |
+
"""
|
60 |
+
signal_minusmean = signal - signal.mean()
|
61 |
+
return signal_minusmean / signal_minusmean.abs().max()
|
62 |
+
|
63 |
+
|
64 |
+
def count_occurence(manifest_file_id):
|
65 |
+
Count the number of wav files in the manifest_file_id dict. Used for _TarredAudioToLabelDataset.
|
66 |
+
Args:
|
67 |
+
manifest_file_id (Dict): Dict of files and their corresponding id. {'A-sub0' : 1, ..., 'S-sub10':100}
|
68 |
+
Returns:
|
69 |
+
count (Dict): Dict of wav files {'A' : 2, ..., 'S':10}
|
70 |
+
"""
|
71 |
+
count = dict()
|
72 |
+
for i in manifest_file_id:
|
73 |
+
audio_filename = i.split("-sub")[0]
|
74 |
+
count[audio_filename] = count.get(audio_filename, 0) + 1
|
75 |
+
return count
|
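# A minimal sketch of count_occurence() on a hypothetical tarred-manifest ID dict.
def _example_count_occurence():
    manifest_file_id = {'A-sub0': 1, 'A-sub1': 2, 'S-sub0': 3}
    return count_occurence(manifest_file_id)  # -> {'A': 2, 'S': 1}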
76 |
+
|
77 |
+
|
78 |
+
def _speech_collate_fn(batch, pad_id):
|
79 |
+
"""collate batch of audio sig, audio len, tokens, tokens len
|
80 |
+
Args:
|
81 |
+
batch (Optional[FloatTensor], Optional[LongTensor], LongTensor,
|
82 |
+
LongTensor): A tuple of tuples of signal, signal lengths,
|
83 |
+
encoded tokens, and encoded tokens length. This collate func
|
84 |
+
assumes the signals are 1d torch tensors (i.e. mono audio).
|
85 |
+
"""
|
86 |
+
_, audio_lengths, _, tokens_lengths = zip(*batch)
|
87 |
+
max_audio_len = 0
|
88 |
+
has_audio = audio_lengths[0] is not None
|
89 |
+
if has_audio:
|
90 |
+
max_audio_len = max(audio_lengths).item()
|
91 |
+
max_tokens_len = max(tokens_lengths).item()
|
92 |
+
|
93 |
+
audio_signal, tokens = [], []
|
94 |
+
for sig, sig_len, tokens_i, tokens_i_len in batch:
|
95 |
+
if has_audio:
|
96 |
+
sig_len = sig_len.item()
|
97 |
+
if sig_len < max_audio_len:
|
98 |
+
pad = (0, max_audio_len - sig_len)
|
99 |
+
sig = torch.nn.functional.pad(sig, pad)
|
100 |
+
audio_signal.append(sig)
|
101 |
+
tokens_i_len = tokens_i_len.item()
|
102 |
+
if tokens_i_len < max_tokens_len:
|
103 |
+
pad = (0, max_tokens_len - tokens_i_len)
|
104 |
+
tokens_i = torch.nn.functional.pad(tokens_i, pad, value=pad_id)
|
105 |
+
tokens.append(tokens_i)
|
106 |
+
|
107 |
+
if has_audio:
|
108 |
+
audio_signal = torch.stack(audio_signal)
|
109 |
+
audio_lengths = torch.stack(audio_lengths)
|
110 |
+
else:
|
111 |
+
audio_signal, audio_lengths = None, None
|
112 |
+
tokens = torch.stack(tokens)
|
113 |
+
tokens_lengths = torch.stack(tokens_lengths)
|
114 |
+
|
115 |
+
return audio_signal, audio_lengths, tokens, tokens_lengths
|
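# A minimal sketch of _speech_collate_fn() on two mono waveforms of different length;
# the sample rates, lengths and token values are hypothetical.
def _example_speech_collate():
    batch = [
        (torch.randn(16000), torch.tensor(16000), torch.tensor([1]), torch.tensor(1)),
        (torch.randn(8000), torch.tensor(8000), torch.tensor([2]), torch.tensor(1)),
    ]
    audio, audio_len, tokens, tokens_len = _speech_collate_fn(batch, pad_id=0)
    # The shorter waveform is zero-padded: audio -> (2, 16000).
    return audio.shape, audio_len, tokens.shape, tokens_len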
116 |
+
|
117 |
+
|
118 |
+
def _fixed_seq_collate_fn(self, batch):
|
119 |
+
"""collate batch of audio sig, audio len, tokens, tokens len
|
120 |
+
Args:
|
121 |
+
batch (Optional[FloatTensor], Optional[LongTensor], LongTensor,
|
122 |
+
LongTensor): A tuple of tuples of signal, signal lengths,
|
123 |
+
encoded tokens, and encoded tokens length. This collate func
|
124 |
+
assumes the signals are 1d torch tensors (i.e. mono audio).
|
125 |
+
"""
|
126 |
+
_, audio_lengths, _, tokens_lengths = zip(*batch)
|
127 |
+
|
128 |
+
has_audio = audio_lengths[0] is not None
|
129 |
+
fixed_length = int(max(audio_lengths))
|
130 |
+
|
131 |
+
audio_signal, tokens, new_audio_lengths = [], [], []
|
132 |
+
for sig, sig_len, tokens_i, _ in batch:
|
133 |
+
if has_audio:
|
134 |
+
sig_len = sig_len.item()
|
135 |
+
chunk_len = sig_len - fixed_length
|
136 |
+
|
137 |
+
if chunk_len < 0:
|
138 |
+
repeat = fixed_length // sig_len
|
139 |
+
rem = fixed_length % sig_len
|
140 |
+
sub = sig[-rem:] if rem > 0 else torch.tensor([])
|
141 |
+
rep_sig = torch.cat(repeat * [sig])
|
142 |
+
sig = torch.cat((rep_sig, sub))
|
143 |
+
new_audio_lengths.append(torch.tensor(fixed_length))
|
144 |
+
|
145 |
+
audio_signal.append(sig)
|
146 |
+
|
147 |
+
tokens.append(tokens_i)
|
148 |
+
|
149 |
+
if has_audio:
|
150 |
+
audio_signal = torch.stack(audio_signal)
|
151 |
+
audio_lengths = torch.stack(new_audio_lengths)
|
152 |
+
else:
|
153 |
+
audio_signal, audio_lengths = None, None
|
154 |
+
tokens = torch.stack(tokens)
|
155 |
+
tokens_lengths = torch.stack(tokens_lengths)
|
156 |
+
|
157 |
+
return audio_signal, audio_lengths, tokens, tokens_lengths
|
158 |
+
|
159 |
+
|
160 |
+
def _vad_frame_seq_collate_fn(self, batch):
|
161 |
+
"""collate batch of audio sig, audio len, tokens, tokens len
|
162 |
+
Args:
|
163 |
+
batch (Optional[FloatTensor], Optional[LongTensor], LongTensor,
|
164 |
+
LongTensor): A tuple of tuples of signal, signal lengths,
|
165 |
+
encoded tokens, and encoded tokens length. This collate func
|
166 |
+
assumes the signals are 1d torch tensors (i.e. mono audio).
|
167 |
+
batch size equals 1.
|
168 |
+
"""
|
169 |
+
slice_length = int(self.featurizer.sample_rate * self.window_length_in_sec)
|
170 |
+
_, audio_lengths, _, tokens_lengths = zip(*batch)
|
171 |
+
slice_length = int(min(slice_length, max(audio_lengths)))
|
172 |
+
shift = int(self.featurizer.sample_rate * self.shift_length_in_sec)
|
173 |
+
has_audio = audio_lengths[0] is not None
|
174 |
+
|
175 |
+
audio_signal, num_slices, tokens, audio_lengths = [], [], [], []
|
176 |
+
|
177 |
+
append_len_start = slice_length // 2
|
178 |
+
append_len_end = slice_length - slice_length // 2
|
179 |
+
for sig, sig_len, tokens_i, _ in batch:
|
180 |
+
if self.normalize_audio:
|
181 |
+
sig = normalize(sig)
|
182 |
+
start = torch.zeros(append_len_start)
|
183 |
+
end = torch.zeros(append_len_end)
|
184 |
+
sig = torch.cat((start, sig, end))
|
185 |
+
sig_len += slice_length
|
186 |
+
|
187 |
+
if has_audio:
|
188 |
+
slices = torch.div(sig_len - slice_length, shift, rounding_mode='trunc')
|
189 |
+
for slice_id in range(slices):
|
190 |
+
start_idx = slice_id * shift
|
191 |
+
end_idx = start_idx + slice_length
|
192 |
+
signal = sig[start_idx:end_idx]
|
193 |
+
audio_signal.append(signal)
|
194 |
+
|
195 |
+
num_slices.append(slices)
|
196 |
+
tokens.extend([tokens_i] * slices)
|
197 |
+
audio_lengths.extend([slice_length] * slices)
|
198 |
+
|
199 |
+
if has_audio:
|
200 |
+
audio_signal = torch.stack(audio_signal)
|
201 |
+
audio_lengths = torch.tensor(audio_lengths)
|
202 |
+
else:
|
203 |
+
audio_signal, audio_lengths = None, None
|
204 |
+
|
205 |
+
tokens = torch.stack(tokens)
|
206 |
+
tokens_lengths = torch.tensor(num_slices)
|
207 |
+
return audio_signal, audio_lengths, tokens, tokens_lengths
|
208 |
+
|
209 |
+
|
210 |
+
class _AudioLabelDataset(Dataset):
|
211 |
+
"""
|
212 |
+
Dataset that loads tensors via a json file containing paths to audio files,
|
213 |
+
labels, durations, and offsets (in seconds). Each new line is a
|
214 |
+
different sample, mapping an audio file to its target label.
|
215 |
+
JSON files should be of the following format::
|
216 |
+
{"audio_filepath": "/path/to/audio_wav_0.wav", "duration": time_in_sec_0, "label": \
|
217 |
+
target_label_0, "offset": offset_in_sec_0}
|
218 |
+
...
|
219 |
+
{"audio_filepath": "/path/to/audio_wav_n.wav", "duration": time_in_sec_n, "label": \
|
220 |
+
target_label_n, "offset": offset_in_sec_n}
|
221 |
+
Args:
|
222 |
+
manifest_filepath (Union[str, List[str]]): Dataset parameter. Path to JSON containing data.
|
223 |
+
labels (list): Dataset parameter. List of target classes that can be output by the speaker recognition model.
|
224 |
+
featurizer
|
225 |
+
min_duration (float): Dataset parameter. All training files which have a duration less than min_duration
|
226 |
+
are dropped. Note: Duration is read from the manifest JSON.
|
227 |
+
Defaults to 0.1.
|
228 |
+
max_duration (float): Dataset parameter.
|
229 |
+
All training files which have a duration more than max_duration
|
230 |
+
are dropped. Note: Duration is read from the manifest JSON.
|
231 |
+
Defaults to None.
|
232 |
+
trim (bool): Whether to use trim silence from beginning and end of audio signal using librosa.effects.trim().
|
233 |
+
Defaults to False.
|
234 |
+
"""

    @property
    def output_types(self) -> Optional[Dict[str, NeuralType]]:
        """Returns definitions of module output ports."""

        output_types = {
            'audio_signal': NeuralType(
                ('B', 'T'),
                AudioSignal(freq=self._sample_rate)
                if self is not None and hasattr(self, '_sample_rate')
                else AudioSignal(),
            ),
            'a_sig_length': NeuralType(tuple('B'), LengthsType()),
        }

        if self.is_regression_task:
            output_types.update(
                {
                    'targets': NeuralType(tuple('B'), RegressionValuesType()),
                    'targets_length': NeuralType(tuple('B'), LengthsType()),
                }
            )
        else:
            output_types.update(
                {'label': NeuralType(tuple('B'), LabelsType()), 'label_length': NeuralType(tuple('B'), LengthsType()),}
            )

        return output_types
|
265 |
+
|
266 |
+
def __init__(
|
267 |
+
self,
|
268 |
+
*,
|
269 |
+
manifest_filepath: Union[str, List[str]],
|
270 |
+
labels: List[str],
|
271 |
+
featurizer,
|
272 |
+
min_duration: Optional[float] = 0.1,
|
273 |
+
max_duration: Optional[float] = None,
|
274 |
+
trim: bool = False,
|
275 |
+
is_regression_task: bool = False,
|
276 |
+
cal_labels_occurrence: Optional[bool] = False,
|
277 |
+
):
|
278 |
+
super().__init__()
|
279 |
+
if isinstance(manifest_filepath, str):
|
280 |
+
manifest_filepath = manifest_filepath.split(',')
|
281 |
+
cache_datastore_manifests(manifest_filepaths=manifest_filepath, cache_audio=True)
|
282 |
+
self.collection = collections.ASRSpeechLabel(
|
283 |
+
manifests_files=manifest_filepath,
|
284 |
+
min_duration=min_duration,
|
285 |
+
max_duration=max_duration,
|
286 |
+
is_regression_task=is_regression_task,
|
287 |
+
cal_labels_occurrence=cal_labels_occurrence,
|
288 |
+
)
|
289 |
+
|
290 |
+
self.featurizer = featurizer
|
291 |
+
self.trim = trim
|
292 |
+
self.is_regression_task = is_regression_task
|
293 |
+
|
294 |
+
if not is_regression_task:
|
295 |
+
self.labels = labels if labels else self.collection.uniq_labels
|
296 |
+
self.num_classes = len(self.labels) if self.labels is not None else 1
|
297 |
+
self.label2id, self.id2label = {}, {}
|
298 |
+
self.id2occurrence, self.labels_occurrence = {}, []
|
299 |
+
|
300 |
+
for label_id, label in enumerate(self.labels):
|
301 |
+
self.label2id[label] = label_id
|
302 |
+
self.id2label[label_id] = label
|
303 |
+
if cal_labels_occurrence:
|
304 |
+
self.id2occurrence[label_id] = self.collection.labels_occurrence[label]
|
305 |
+
|
306 |
+
if cal_labels_occurrence:
|
307 |
+
self.labels_occurrence = [self.id2occurrence[k] for k in sorted(self.id2occurrence)]
|
308 |
+
|
309 |
+
for idx in range(len(self.labels[:5])):
|
310 |
+
logging.debug(" label id {} and its mapped label {}".format(idx, self.id2label[idx]))
|
311 |
+
|
312 |
+
else:
|
313 |
+
self.labels = []
|
314 |
+
self.num_classes = 1
|
315 |
+
|
316 |
+
def __len__(self):
|
317 |
+
return len(self.collection)
|
318 |
+
|
319 |
+
def __getitem__(self, index):
|
320 |
+
sample = self.collection[index]
|
321 |
+
|
322 |
+
offset = sample.offset
|
323 |
+
|
324 |
+
if offset is None:
|
325 |
+
offset = 0
|
326 |
+
|
327 |
+
features = self.featurizer.process(sample.audio_file, offset=offset, duration=sample.duration, trim=self.trim)
|
328 |
+
f, fl = features, torch.tensor(features.shape[0]).long()
|
329 |
+
|
330 |
+
if not self.is_regression_task:
|
331 |
+
t = torch.tensor(self.label2id[sample.label]).long()
|
332 |
+
else:
|
333 |
+
t = torch.tensor(sample.label).float()
|
334 |
+
|
335 |
+
tl = torch.tensor(1).long() # For compatibility with collate_fn used later
|
336 |
+
|
337 |
+
return f, fl, t, tl
|
338 |
+
|
339 |
+
|
340 |
+
# Ported from https://github.com/NVIDIA/OpenSeq2Seq/blob/master/open_seq2seq/data/speech2text/speech_commands.py
|
341 |
+
class AudioToClassificationLabelDataset(_AudioLabelDataset):
|
342 |
+
"""
|
343 |
+
Dataset that loads tensors via a json file containing paths to audio
|
344 |
+
files, command class, and durations (in seconds). Each new line is a
|
345 |
+
different sample. Example below:
|
346 |
+
{"audio_filepath": "/path/to/audio_wav_0.wav", "duration": time_in_sec_0, "label": \
|
347 |
+
target_label_0, "offset": offset_in_sec_0}
|
348 |
+
...
|
349 |
+
{"audio_filepath": "/path/to/audio_wav_n.wav", "duration": time_in_sec_n, "label": \
|
350 |
+
target_label_n, "offset": offset_in_sec_n}
|
351 |
+
Args:
|
352 |
+
manifest_filepath (Union[str, List[str]]): Path to manifest json as described above. Can
|
353 |
+
be comma-separated paths.
|
354 |
+
labels (Optional[list]): String containing all the possible labels to map to
|
355 |
+
if None then automatically picks from ASRSpeechLabel collection.
|
356 |
+
featurizer: Initialized featurizer class that converts paths of
|
357 |
+
audio to feature tensors
|
358 |
+
max_duration: If audio exceeds this length, do not include in dataset
|
359 |
+
min_duration: If audio is less than this length, do not include
|
360 |
+
in dataset
|
361 |
+
trim: Boolean flag whether to trim the audio
|
362 |
+
"""
|
363 |
+
|
364 |
+
def _collate_fn(self, batch):
|
365 |
+
return _speech_collate_fn(batch, pad_id=0)
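# Illustrative sketch: writing a one-line manifest of the form documented in the docstrings above.
# The audio path, duration and label are hypothetical placeholders, not shipped data.
def _example_write_classification_manifest(manifest_path: str = "/tmp/train_manifest.json") -> None:
    import json

    entry = {"audio_filepath": "/data/audio_wav_0.wav", "duration": 1.2, "label": "yes", "offset": 0.0}
    with open(manifest_path, "w", encoding="utf-8") as fout:
        fout.write(json.dumps(entry) + "\n")  # one JSON object per line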
|
366 |
+
|
367 |
+
|
368 |
+
class AudioToSpeechLabelDataset(_AudioLabelDataset):
|
369 |
+
"""
|
370 |
+
Dataset that loads tensors via a json file containing paths to audio
|
371 |
+
files, command class, and durations (in seconds). Each new line is a
|
372 |
+
different sample. Example below:
|
373 |
+
{"audio_filepath": "/path/to/audio_wav_0.wav", "duration": time_in_sec_0, "label": \
|
374 |
+
target_label_0, "offset": offset_in_sec_0}
|
375 |
+
...
|
376 |
+
{"audio_filepath": "/path/to/audio_wav_n.wav", "duration": time_in_sec_n, "label": \
|
377 |
+
target_label_n, "offset": offset_in_sec_n}
|
378 |
+
Args:
|
379 |
+
manifest_filepath (Union[str, List[str]]): Path to manifest json as described above. Can
|
380 |
+
be comma-separated paths.
|
381 |
+
labels (Optional[list]): String containing all the possible labels to map to
|
382 |
+
if None then automatically picks from ASRSpeechLabel collection.
|
383 |
+
min_duration (float): Dataset parameter.
|
384 |
+
All training files which have a duration less than min_duration
|
385 |
+
are dropped. Note: Duration is read from the manifest JSON.
|
386 |
+
Defaults to 0.1.
|
387 |
+
max_duration (float): Dataset parameter.
|
388 |
+
All training files which have a duration more than max_duration
|
389 |
+
are dropped. Note: Duration is read from the manifest JSON.
|
390 |
+
Defaults to None.
|
391 |
+
trim (bool): Whether to trim silence from the beginning and end
|
392 |
+
of audio signal using librosa.effects.trim().
|
393 |
+
Defaults to False.
|
394 |
+
window_length_in_sec (float): length of window/slice (in seconds)
|
395 |
+
Use this for speaker recognition and VAD tasks.
|
396 |
+
shift_length_in_sec (float): amount of shift of window for generating the frame for VAD task in a batch
|
397 |
+
Use this for VAD task during inference.
|
398 |
+
normalize_audio (bool): Whether to normalize audio signal.
|
399 |
+
Defaults to False.
|
400 |
+
is_regression_task (bool): Whether the dataset is for a regression task instead of classification.
|
401 |
+
Defaults to False.
|
402 |
+
cal_labels_occurrence (bool): Whether to calculate occurrence of labels
|
403 |
+
Defaults to False.
|
404 |
+
"""
|
405 |
+
|
406 |
+
def __init__(
|
407 |
+
self,
|
408 |
+
*,
|
409 |
+
manifest_filepath: Union[str, List[str]],
|
410 |
+
labels: List[str],
|
411 |
+
featurizer,
|
412 |
+
min_duration: Optional[float] = 0.1,
|
413 |
+
max_duration: Optional[float] = None,
|
414 |
+
trim: bool = False,
|
415 |
+
window_length_in_sec: Optional[float] = 8,
|
416 |
+
shift_length_in_sec: Optional[float] = 1,
|
417 |
+
normalize_audio: bool = False,
|
418 |
+
is_regression_task: bool = False,
|
419 |
+
cal_labels_occurrence: Optional[bool] = False,
|
420 |
+
):
|
421 |
+
self.window_length_in_sec = window_length_in_sec
|
422 |
+
self.shift_length_in_sec = shift_length_in_sec
|
423 |
+
self.normalize_audio = normalize_audio
|
424 |
+
|
425 |
+
logging.debug("Window/slice length considered for collate func is {}".format(self.window_length_in_sec))
|
426 |
+
logging.debug("Shift length considered for collate func is {}".format(self.shift_length_in_sec))
|
427 |
+
|
428 |
+
super().__init__(
|
429 |
+
manifest_filepath=manifest_filepath,
|
430 |
+
labels=labels,
|
431 |
+
featurizer=featurizer,
|
432 |
+
min_duration=min_duration,
|
433 |
+
max_duration=max_duration,
|
434 |
+
trim=trim,
|
435 |
+
is_regression_task=is_regression_task,
|
436 |
+
cal_labels_occurrence=cal_labels_occurrence,
|
437 |
+
)
|
438 |
+
|
439 |
+
def fixed_seq_collate_fn(self, batch):
|
440 |
+
return _fixed_seq_collate_fn(self, batch)
|
441 |
+
|
442 |
+
def vad_frame_seq_collate_fn(self, batch):
|
443 |
+
return _vad_frame_seq_collate_fn(self, batch)
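# Illustrative sketch: wiring AudioToSpeechLabelDataset to a DataLoader. The manifest path, label
# list and sample rate are assumptions made for the example; WaveformFeaturizer and torch come from
# this module's imports.
def _example_speech_label_loader():
    from torch.utils.data import DataLoader

    featurizer = WaveformFeaturizer(sample_rate=16000)
    dataset = AudioToSpeechLabelDataset(
        manifest_filepath="/data/speaker_manifest.json",  # hypothetical manifest
        labels=["spk_0", "spk_1"],
        featurizer=featurizer,
        window_length_in_sec=3.0,
        shift_length_in_sec=0.75,
    )
    # fixed_seq_collate_fn batches the signals at the fixed window length configured above
    return DataLoader(dataset, batch_size=8, collate_fn=dataset.fixed_seq_collate_fn)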
|
444 |
+
|
445 |
+
|
446 |
+
class _TarredAudioLabelDataset(IterableDataset):
|
447 |
+
"""
|
448 |
+
A similar Dataset to the AudioLabelDataSet, but which loads tarred audio files.
|
449 |
+
|
450 |
+
Accepts a single comma-separated JSON manifest file (in the same style as for the AudioToSpeechLabelDataset),
|
451 |
+
as well as the path(s) to the tarball(s) containing the wav files. Each line of the manifest should
|
452 |
+
contain the information for one audio file, including at least the label and name of the audio
|
453 |
+
file within the tarball.
|
454 |
+
|
455 |
+
Valid formats for the audio_tar_filepaths argument include:
|
456 |
+
(1) a single string that can be brace-expanded, e.g. 'path/to/audio.tar' or 'path/to/audio_{1..100}.tar.gz', or
|
457 |
+
(2) a list of file paths that will not be brace-expanded, e.g. ['audio_1.tar', 'audio_2.tar', ...].
|
458 |
+
|
459 |
+
Note: For brace expansion in (1), there may be cases where `{x..y}` syntax cannot be used due to shell interference.
|
460 |
+
This occurs most commonly inside SLURM scripts. Therefore we provide a few equivalent replacements.
|
461 |
+
Supported opening braces - { <=> (, [, < and the special tag _OP_.
|
462 |
+
Supported closing braces - } <=> ), ], > and the special tag _CL_.
|
463 |
+
For SLURM based tasks, we suggest the use of the special tags for ease of use.
|
464 |
+
|
465 |
+
See the documentation for more information about accepted data and input formats.
|
466 |
+
|
467 |
+
If using multiple processes the number of shards should be divisible by the number of workers to ensure an
|
468 |
+
even split among workers. If it is not divisible, logging will give a warning but training will proceed.
|
469 |
+
In addition, if using multiprocessing, each shard MUST HAVE THE SAME NUMBER OF ENTRIES after filtering
|
470 |
+
is applied. We currently do not check for this, but your program may hang if the shards are uneven!
|
471 |
+
|
472 |
+
Notice that a few arguments are different from the AudioLabelDataSet; for example, shuffle (bool) has been
|
473 |
+
replaced by shuffle_n (int).
|
474 |
+
|
475 |
+
Additionally, please note that the len() of this DataLayer is assumed to be the length of the manifest
|
476 |
+
after filtering. An incorrect manifest length may lead to some DataLoader issues down the line.
|
477 |
+
|
478 |
+
Args:
|
479 |
+
audio_tar_filepaths: Either a list of audio tarball filepaths, or a
|
480 |
+
string (can be brace-expandable).
|
481 |
+
manifest_filepath (str): Path to the manifest.
|
482 |
+
labels (list): Dataset parameter.
|
483 |
+
List of target classes that can be output by the speaker recognition model.
|
484 |
+
featurizer
|
485 |
+
shuffle_n (int): How many samples to look ahead and load to be shuffled.
|
486 |
+
See WebDataset documentation for more details.
|
487 |
+
Defaults to 0.
|
488 |
+
min_duration (float): Dataset parameter.
|
489 |
+
All training files which have a duration less than min_duration
|
490 |
+
are dropped. Note: Duration is read from the manifest JSON.
|
491 |
+
Defaults to 0.1.
|
492 |
+
max_duration (float): Dataset parameter.
|
493 |
+
All training files which have a duration more than max_duration
|
494 |
+
are dropped. Note: Duration is read from the manifest JSON.
|
495 |
+
Defaults to None.
|
496 |
+
trim (bool): Whether to trim silence from the beginning and end
|
497 |
+
of audio signal using librosa.effects.trim().
|
498 |
+
Defaults to False.
|
499 |
+
window_length_in_sec (float): length of slice/window (in seconds) # Pass this only for speaker recognition and VAD task
|
500 |
+
shift_length_in_sec (float): amount of shift of window for generating the frame for VAD task in a batch. # Pass this only for VAD task during inference.
|
501 |
+
normalize_audio (bool): Whether to normalize audio signal. Defaults to False.
|
502 |
+
shard_strategy (str): Tarred dataset shard distribution strategy chosen as a str value during ddp.
|
503 |
+
- `scatter`: The default shard strategy applied by WebDataset, where each node gets
|
504 |
+
a unique set of shards, which are permanently pre-allocated and never changed at runtime.
|
505 |
+
- `replicate`: Optional shard strategy, where each node gets all of the set of shards
|
506 |
+
available in the tarred dataset, which are permanently pre-allocated and never changed at runtime.
|
507 |
+
The benefit of replication is that it allows each node to sample data points from the entire
|
508 |
+
dataset independently of other nodes, and reduces dependence on the value of `shuffle_n`.
|
509 |
+
|
510 |
+
.. warning::
|
511 |
+
Replicated strategy allows every node to sample the entire set of available tarfiles,
|
512 |
+
and therefore more than one node may sample the same tarfile, and even sample the same
|
513 |
+
data points! As such, there is no assured guarantee that all samples in the dataset will be
|
514 |
+
sampled at least once during 1 epoch. Scattered strategy, on the other hand, on specific
|
515 |
+
occasions (when the number of shards is not divisible with ``world_size``), will not sample
|
516 |
+
the entire dataset. For these reasons it is not advisable to use tarred datasets as validation
|
517 |
+
or test datasets.
|
518 |
+
global_rank (int): Worker rank, used for partitioning shards. Defaults to 0.
|
519 |
+
world_size (int): Total number of processes, used for partitioning shards. Defaults to 0.
|
520 |
+
is_regression_task (bool): Whether it is a regression task. Defaults to False.
|
521 |
+
"""
|
522 |
+
|
523 |
+
def __init__(
|
524 |
+
self,
|
525 |
+
*,
|
526 |
+
audio_tar_filepaths: Union[str, List[str]],
|
527 |
+
manifest_filepath: Union[str, List[str]],
|
528 |
+
labels: List[str],
|
529 |
+
featurizer,
|
530 |
+
shuffle_n: int = 0,
|
531 |
+
min_duration: Optional[float] = 0.1,
|
532 |
+
max_duration: Optional[float] = None,
|
533 |
+
trim: bool = False,
|
534 |
+
shard_strategy: str = "scatter",
|
535 |
+
global_rank: int = 0,
|
536 |
+
world_size: int = 0,
|
537 |
+
is_regression_task: bool = False,
|
538 |
+
):
|
539 |
+
cache_datastore_manifests(manifest_filepaths=manifest_filepath)
|
540 |
+
self.collection = collections.ASRSpeechLabel(
|
541 |
+
manifests_files=manifest_filepath,
|
542 |
+
min_duration=min_duration,
|
543 |
+
max_duration=max_duration,
|
544 |
+
index_by_file_id=True, # Must set this so the manifest lines can be indexed by file ID
|
545 |
+
)
|
546 |
+
|
547 |
+
self.file_occurence = count_occurence(self.collection.mapping)
|
548 |
+
|
549 |
+
self.featurizer = featurizer
|
550 |
+
self.trim = trim
|
551 |
+
|
552 |
+
self.labels = labels if labels else self.collection.uniq_labels
|
553 |
+
self.num_classes = len(self.labels)
|
554 |
+
|
555 |
+
self.label2id, self.id2label = {}, {}
|
556 |
+
for label_id, label in enumerate(self.labels):
|
557 |
+
self.label2id[label] = label_id
|
558 |
+
self.id2label[label_id] = label
|
559 |
+
|
560 |
+
for idx in range(len(self.labels[:5])):
|
561 |
+
logging.debug(" label id {} and its mapped label {}".format(idx, self.id2label[idx]))
|
562 |
+
|
563 |
+
audio_tar_filepaths = expand_sharded_filepaths(
|
564 |
+
sharded_filepaths=audio_tar_filepaths,
|
565 |
+
shard_strategy=shard_strategy,
|
566 |
+
world_size=world_size,
|
567 |
+
global_rank=global_rank,
|
568 |
+
)
|
569 |
+
# Put together WebDataset
|
570 |
+
self._dataset = wd.WebDataset(urls=audio_tar_filepaths, nodesplitter=None)
|
571 |
+
|
572 |
+
if shuffle_n > 0:
|
573 |
+
self._dataset = self._dataset.shuffle(shuffle_n)
|
574 |
+
else:
|
575 |
+
logging.info("WebDataset will not shuffle files within the tar files.")
|
576 |
+
|
577 |
+
self._dataset = (
|
578 |
+
self._dataset.rename(audio=VALID_FILE_FORMATS, key='__key__')
|
579 |
+
.to_tuple('audio', 'key')
|
580 |
+
.pipe(self._filter)
|
581 |
+
.map(f=self._build_sample)
|
582 |
+
)
|
583 |
+
|
584 |
+
def _filter(self, iterator):
|
585 |
+
"""This function is used to remove samples that have been filtered out by ASRSpeechLabel already.
|
586 |
+
Otherwise, we would get a KeyError as _build_sample attempts to find the manifest entry for a sample
|
587 |
+
that was filtered out (e.g. for duration).
|
588 |
+
Note that if using multi-GPU training, filtering may lead to an imbalance in samples in each shard,
|
589 |
+
which may make your code hang as one process will finish before the other.
|
590 |
+
"""
|
591 |
+
|
592 |
+
class TarredAudioFilter:
|
593 |
+
def __init__(self, collection, file_occurence):
|
594 |
+
self.iterator = iterator
|
595 |
+
self.collection = collection
|
596 |
+
self.file_occurence = file_occurence
|
597 |
+
self._iterable = self._internal_generator()
|
598 |
+
|
599 |
+
def __iter__(self):
|
600 |
+
self._iterable = self._internal_generator()
|
601 |
+
return self
|
602 |
+
|
603 |
+
def __next__(self):
|
604 |
+
try:
|
605 |
+
values = next(self._iterable)
|
606 |
+
except StopIteration:
|
607 |
+
# reset generator
|
608 |
+
self._iterable = self._internal_generator()
|
609 |
+
values = next(self._iterable)
|
610 |
+
|
611 |
+
return values
|
612 |
+
|
613 |
+
def _internal_generator(self):
|
614 |
+
"""
|
615 |
+
WebDataset requires an Iterator, but we require an iterable that yields 1-or-more
|
616 |
+
values per value inside self.iterator.
|
617 |
+
|
618 |
+
Therefore wrap the iterator with a generator function that will yield 1-or-more
|
619 |
+
values per sample in the iterator.
|
620 |
+
"""
|
621 |
+
for _, tup in enumerate(self.iterator):
|
622 |
+
audio_bytes, audio_filename = tup
|
623 |
+
|
624 |
+
file_id, _ = os.path.splitext(os.path.basename(audio_filename))
|
625 |
+
if audio_filename in self.file_occurence:
|
626 |
+
for j in range(0, self.file_occurence[file_id]):
|
627 |
+
if j == 0:
|
628 |
+
audio_filename = file_id
|
629 |
+
else:
|
630 |
+
audio_filename = file_id + "-sub" + str(j)
|
631 |
+
yield audio_bytes, audio_filename
|
632 |
+
|
633 |
+
return TarredAudioFilter(self.collection, self.file_occurence)
|
634 |
+
|
635 |
+
def _build_sample(self, tup):
|
636 |
+
"""Builds the training sample by combining the data from the WebDataset with the manifest info.
|
637 |
+
"""
|
638 |
+
audio_bytes, audio_filename = tup
|
639 |
+
# Grab manifest entry from self.collection
|
640 |
+
file_id, _ = os.path.splitext(os.path.basename(audio_filename))
|
641 |
+
|
642 |
+
manifest_idx = self.collection.mapping[file_id]
|
643 |
+
manifest_entry = self.collection[manifest_idx]
|
644 |
+
|
645 |
+
offset = manifest_entry.offset
|
646 |
+
if offset is None:
|
647 |
+
offset = 0
|
648 |
+
|
649 |
+
# Convert audio bytes to IO stream for processing (for SoundFile to read)
|
650 |
+
audio_filestream = io.BytesIO(audio_bytes)
|
651 |
+
features = self.featurizer.process(
|
652 |
+
audio_filestream, offset=offset, duration=manifest_entry.duration, trim=self.trim,
|
653 |
+
)
|
654 |
+
|
655 |
+
audio_filestream.close()
|
656 |
+
|
657 |
+
# Audio features
|
658 |
+
f, fl = features, torch.tensor(features.shape[0]).long()
|
659 |
+
|
660 |
+
t = self.label2id[manifest_entry.label]
|
661 |
+
tl = 1 # For compatibility with collate_fn used later
|
662 |
+
|
663 |
+
return f, fl, torch.tensor(t).long(), torch.tensor(tl).long()
|
664 |
+
|
665 |
+
def __iter__(self):
|
666 |
+
return self._dataset.__iter__()
|
667 |
+
|
668 |
+
def __len__(self):
|
669 |
+
return len(self.collection)
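# Illustrative sketch: the accepted shapes for `audio_tar_filepaths` described in the docstring
# above. The shard names are hypothetical; `_OP_`/`_CL_` are the SLURM-safe brace replacements.
_EXAMPLE_TAR_SPECS = [
    "/data/shards/audio_{0..63}.tar",                          # brace-expandable string
    "/data/shards/audio__OP_0..63_CL_.tar",                    # same range written with the special tags
    ["/data/shards/audio_0.tar", "/data/shards/audio_1.tar"],  # explicit list, used verbatim
]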
|
670 |
+
|
671 |
+
|
672 |
+
class TarredAudioToClassificationLabelDataset(_TarredAudioLabelDataset):
|
673 |
+
"""
|
674 |
+
A similar Dataset to the AudioToClassificationLabelDataset, but which loads tarred audio files.
|
675 |
+
|
676 |
+
Accepts a single comma-separated JSON manifest file (in the same style as for the AudioToClassificationLabelDataset),
|
677 |
+
as well as the path(s) to the tarball(s) containing the wav files. Each line of the manifest should
|
678 |
+
contain the information for one audio file, including at least the transcript and name of the audio
|
679 |
+
file within the tarball.
|
680 |
+
|
681 |
+
Valid formats for the audio_tar_filepaths argument include:
|
682 |
+
(1) a single string that can be brace-expanded, e.g. 'path/to/audio.tar' or 'path/to/audio_{1..100}.tar.gz', or
|
683 |
+
(2) a list of file paths that will not be brace-expanded, e.g. ['audio_1.tar', 'audio_2.tar', ...].
|
684 |
+
|
685 |
+
See the WebDataset documentation for more information about accepted data and input formats.
|
686 |
+
|
687 |
+
If using multiple processes the number of shards should be divisible by the number of workers to ensure an
|
688 |
+
even split among workers. If it is not divisible, logging will give a warning but training will proceed.
|
689 |
+
In addition, if using multiprocessing, each shard MUST HAVE THE SAME NUMBER OF ENTRIES after filtering
|
690 |
+
is applied. We currently do not check for this, but your program may hang if the shards are uneven!
|
691 |
+
|
692 |
+
Notice that a few arguments are different from the AudioToBPEDataset; for example, shuffle (bool) has been
|
693 |
+
replaced by shuffle_n (int).
|
694 |
+
|
695 |
+
Additionally, please note that the len() of this DataLayer is assumed to be the length of the manifest
|
696 |
+
after filtering. An incorrect manifest length may lead to some DataLoader issues down the line.
|
697 |
+
|
698 |
+
Args:
|
699 |
+
audio_tar_filepaths: Either a list of audio tarball filepaths, or a
|
700 |
+
string (can be brace-expandable).
|
701 |
+
manifest_filepath (str): Path to the manifest.
|
702 |
+
labels (list): Dataset parameter.
|
703 |
+
List of target classes that can be output by the speaker recognition model.
|
704 |
+
featurizer
|
705 |
+
shuffle_n (int): How many samples to look ahead and load to be shuffled.
|
706 |
+
See WebDataset documentation for more details.
|
707 |
+
Defaults to 0.
|
708 |
+
min_duration (float): Dataset parameter.
|
709 |
+
All training files which have a duration less than min_duration
|
710 |
+
are dropped. Note: Duration is read from the manifest JSON.
|
711 |
+
Defaults to 0.1.
|
712 |
+
max_duration (float): Dataset parameter.
|
713 |
+
All training files which have a duration more than max_duration
|
714 |
+
are dropped. Note: Duration is read from the manifest JSON.
|
715 |
+
Defaults to None.
|
716 |
+
trim (bool): Whether to trim silence from the beginning and end
|
717 |
+
of audio signal using librosa.effects.trim().
|
718 |
+
Defaults to False.
|
719 |
+
shard_strategy (str): Tarred dataset shard distribution strategy chosen as a str value during ddp.
|
720 |
+
- `scatter`: The default shard strategy applied by WebDataset, where each node gets
|
721 |
+
a unique set of shards, which are permanently pre-allocated and never changed at runtime.
|
722 |
+
- `replicate`: Optional shard strategy, where each node gets all of the set of shards
|
723 |
+
available in the tarred dataset, which are permanently pre-allocated and never changed at runtime.
|
724 |
+
The benefit of replication is that it allows each node to sample data points from the entire
|
725 |
+
dataset independently of other nodes, and reduces dependence on value of `shuffle_n`.
|
726 |
+
|
727 |
+
.. warning::
|
728 |
+
Replicated strategy allows every node to sample the entire set of available tarfiles,
|
729 |
+
and therefore more than one node may sample the same tarfile, and even sample the same
|
730 |
+
data points! As such, there is no assured guarantee that all samples in the dataset will be
|
731 |
+
sampled at least once during 1 epoch. Scattered strategy, on the other hand, on specific
|
732 |
+
occasions (when the number of shards is not divisible with ``world_size``), will not sample
|
733 |
+
the entire dataset. For these reasons it is not advisable to use tarred datasets as validation
|
734 |
+
or test datasets.
|
735 |
+
global_rank (int): Worker rank, used for partitioning shards. Defaults to 0.
|
736 |
+
world_size (int): Total number of processes, used for partitioning shards. Defaults to 0.
|
737 |
+
is_regression_task (bool): Whether it is a regression task. Defaults to False.
|
738 |
+
"""
|
739 |
+
|
740 |
+
def _collate_fn(self, batch):
|
741 |
+
return _speech_collate_fn(batch, pad_id=0)
|
742 |
+
|
743 |
+
|
744 |
+
class TarredAudioToSpeechLabelDataset(_TarredAudioLabelDataset):
|
745 |
+
"""
|
746 |
+
A similar Dataset to the AudioToSpeechLabelDataset, but which loads tarred audio files.
|
747 |
+
|
748 |
+
Accepts a single comma-separated JSON manifest file (in the same style as for the AudioToSpeechLabelDataset),
|
749 |
+
as well as the path(s) to the tarball(s) containing the wav files. Each line of the manifest should
|
750 |
+
contain the information for one audio file, including at least the transcript and name of the audio
|
751 |
+
file within the tarball.
|
752 |
+
|
753 |
+
Valid formats for the audio_tar_filepaths argument include:
|
754 |
+
(1) a single string that can be brace-expanded, e.g. 'path/to/audio.tar' or 'path/to/audio_{1..100}.tar.gz', or
|
755 |
+
(2) a list of file paths that will not be brace-expanded, e.g. ['audio_1.tar', 'audio_2.tar', ...].
|
756 |
+
|
757 |
+
See the WebDataset documentation for more information about accepted data and input formats.
|
758 |
+
|
759 |
+
If using multiple processes the number of shards should be divisible by the number of workers to ensure an
|
760 |
+
even split among workers. If it is not divisible, logging will give a warning but training will proceed.
|
761 |
+
In addition, if using multiprocessing, each shard MUST HAVE THE SAME NUMBER OF ENTRIES after filtering
|
762 |
+
is applied. We currently do not check for this, but your program may hang if the shards are uneven!
|
763 |
+
|
764 |
+
Notice that a few arguments are different from the AudioToBPEDataset; for example, shuffle (bool) has been
|
765 |
+
replaced by shuffle_n (int).
|
766 |
+
|
767 |
+
Additionally, please note that the len() of this DataLayer is assumed to be the length of the manifest
|
768 |
+
after filtering. An incorrect manifest length may lead to some DataLoader issues down the line.
|
769 |
+
|
770 |
+
Args:
|
771 |
+
audio_tar_filepaths: Either a list of audio tarball filepaths, or a
|
772 |
+
string (can be brace-expandable).
|
773 |
+
manifest_filepath (str): Path to the manifest.
|
774 |
+
labels (list): Dataset parameter.
|
775 |
+
List of target classes that can be output by the speaker recognition model.
|
776 |
+
featurizer
|
777 |
+
shuffle_n (int): How many samples to look ahead and load to be shuffled.
|
778 |
+
See WebDataset documentation for more details.
|
779 |
+
Defaults to 0.
|
780 |
+
min_duration (float): Dataset parameter.
|
781 |
+
All training files which have a duration less than min_duration
|
782 |
+
are dropped. Note: Duration is read from the manifest JSON.
|
783 |
+
Defaults to 0.1.
|
784 |
+
max_duration (float): Dataset parameter.
|
785 |
+
All training files which have a duration more than max_duration
|
786 |
+
are dropped. Note: Duration is read from the manifest JSON.
|
787 |
+
Defaults to None.
|
788 |
+
trim (bool): Whether to trim silence from the beginning and end
|
789 |
+
of audio signal using librosa.effects.trim().
|
790 |
+
Defaults to False.
|
791 |
+
window_length_in_sec (float): time length of window/slice (in seconds) # Pass this only for speaker recognition and VAD task
|
792 |
+
shift_length_in_sec (float): amount of shift of window for generating the frame for VAD task in a batch. # Pass this only for VAD task during inference.
|
793 |
+
normalize_audio (bool): Whether to normalize audio signal. Defaults to False.
|
794 |
+
shard_strategy (str): Tarred dataset shard distribution strategy chosen as a str value during ddp.
|
795 |
+
- `scatter`: The default shard strategy applied by WebDataset, where each node gets
|
796 |
+
a unique set of shards, which are permanently pre-allocated and never changed at runtime.
|
797 |
+
- `replicate`: Optional shard strategy, where each node gets all of the set of shards
|
798 |
+
available in the tarred dataset, which are permanently pre-allocated and never changed at runtime.
|
799 |
+
The benefit of replication is that it allows each node to sample data points from the entire
|
800 |
+
dataset independently of other nodes, and reduces dependence on value of `shuffle_n`.
|
801 |
+
|
802 |
+
.. warning::
|
803 |
+
Replicated strategy allows every node to sample the entire set of available tarfiles,
|
804 |
+
and therefore more than one node may sample the same tarfile, and even sample the same
|
805 |
+
data points! As such, there is no assured guarantee that all samples in the dataset will be
|
806 |
+
sampled at least once during 1 epoch. Scattered strategy, on the other hand, on specific
|
807 |
+
occasions (when the number of shards is not divisible with ``world_size``), will not sample
|
808 |
+
the entire dataset. For these reasons it is not advisable to use tarred datasets as validation
|
809 |
+
or test datasets.
|
810 |
+
global_rank (int): Worker rank, used for partitioning shards. Defaults to 0.
|
811 |
+
world_size (int): Total number of processes, used for partitioning shards. Defaults to 0.
|
812 |
+
"""
|
813 |
+
|
814 |
+
def __init__(
|
815 |
+
self,
|
816 |
+
*,
|
817 |
+
audio_tar_filepaths: Union[str, List[str]],
|
818 |
+
manifest_filepath: Union[str, List[str]],
|
819 |
+
labels: List[str],
|
820 |
+
featurizer,
|
821 |
+
shuffle_n: int = 0,
|
822 |
+
min_duration: Optional[float] = 0.1,
|
823 |
+
max_duration: Optional[float] = None,
|
824 |
+
trim: bool = False,
|
825 |
+
window_length_in_sec: Optional[float] = 8,
|
826 |
+
shift_length_in_sec: Optional[float] = 1,
|
827 |
+
normalize_audio: bool = False,
|
828 |
+
shard_strategy: str = "scatter",
|
829 |
+
global_rank: int = 0,
|
830 |
+
world_size: int = 0,
|
831 |
+
):
|
832 |
+
logging.info("Window/slice length considered for collate func is {}".format(window_length_in_sec))
|
833 |
+
logging.info("Shift length considered for collate func is {}".format(shift_length_in_sec))
|
834 |
+
self.window_length_in_sec = window_length_in_sec
|
835 |
+
self.shift_length_in_sec = shift_length_in_sec
|
836 |
+
self.normalize_audio = normalize_audio
|
837 |
+
|
838 |
+
super().__init__(
|
839 |
+
audio_tar_filepaths=audio_tar_filepaths,
|
840 |
+
manifest_filepath=manifest_filepath,
|
841 |
+
labels=labels,
|
842 |
+
featurizer=featurizer,
|
843 |
+
shuffle_n=shuffle_n,
|
844 |
+
min_duration=min_duration,
|
845 |
+
max_duration=max_duration,
|
846 |
+
trim=trim,
|
847 |
+
shard_strategy=shard_strategy,
|
848 |
+
global_rank=global_rank,
|
849 |
+
world_size=world_size,
|
850 |
+
)
|
851 |
+
|
852 |
+
def fixed_seq_collate_fn(self, batch):
|
853 |
+
return _fixed_seq_collate_fn(self, batch)
|
854 |
+
|
855 |
+
def sliced_seq_collate_fn(self, batch):
|
856 |
+
raise NotImplementedError
|
857 |
+
|
858 |
+
def vad_frame_seq_collate_fn(self, batch):
|
859 |
+
return _vad_frame_seq_collate_fn(self, batch)
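# Illustrative sketch: constructing the tarred speech-label dataset under DDP. The rank and world
# size would normally come from the trainer; the paths, labels and shuffle_n below are placeholders.
def _example_tarred_speech_label_dataset(global_rank: int = 0, world_size: int = 2):
    featurizer = WaveformFeaturizer(sample_rate=16000)
    return TarredAudioToSpeechLabelDataset(
        audio_tar_filepaths="/data/shards/audio_{0..63}.tar",
        manifest_filepath="/data/shards/tarred_manifest.json",
        labels=["spk_0", "spk_1"],
        featurizer=featurizer,
        shuffle_n=2048,
        shard_strategy="scatter",  # each rank receives a disjoint subset of the 64 shards
        global_rank=global_rank,
        world_size=world_size,
    )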
|
860 |
+
|
861 |
+
|
862 |
+
class AudioToMultiLabelDataset(Dataset):
|
863 |
+
"""
|
864 |
+
Dataset that loads a json file containing paths to audio files, durations (in seconds), and a sequence of labels.
|
865 |
+
Each new line is a different sample. Example below:
|
866 |
+
{"audio_filepath": "/path/to/audio_wav_0.wav", "duration": time_in_sec_0, "label": \
|
867 |
+
"0 1 1 0 1", "offset": offset_in_sec_0}
|
868 |
+
...
|
869 |
+
{"audio_filepath": "/path/to/audio_wav_n.wav", "duration": time_in_sec_n, "label": \
|
870 |
+
"0 1 0 0 1", "offset": offset_in_sec_n}
|
871 |
+
Args:
|
872 |
+
manifest_filepath (Union[str, List[str]]): Path to manifest json as described above. Can
|
873 |
+
be comma-separated paths.
|
874 |
+
labels (Optional[list]): String containing all the possible labels to map to
|
875 |
+
if None then automatically picks from ASRSpeechLabel collection.
|
876 |
+
min_duration (float): Dataset parameter.
|
877 |
+
All training files which have a duration less than min_duration
|
878 |
+
are dropped. Note: Duration is read from the manifest JSON.
|
879 |
+
Defaults to 0.1.
|
880 |
+
max_duration (float): Dataset parameter.
|
881 |
+
All training files which have a duration more than max_duration
|
882 |
+
are dropped. Note: Duration is read from the manifest JSON.
|
883 |
+
Defaults to None.
|
884 |
+
trim (bool): Whether to trim silence from the beginning and end
|
885 |
+
of audio signal using librosa.effects.trim().
|
886 |
+
Defaults to False.
|
887 |
+
window_length_in_sec (float): length of window/slice (in seconds)
|
888 |
+
Use this for speaker recognition and VAD tasks.
|
889 |
+
shift_length_in_sec (float): amount of shift of window for generating the frame for VAD task in a batch
|
890 |
+
Use this for VAD task during inference.
|
891 |
+
normalize_audio (bool): Whether to normalize audio signal.
|
892 |
+
Defaults to False.
|
893 |
+
is_regression_task (bool): Whether the dataset is for a regression task instead of classification.
|
894 |
+
Defaults to False.
|
895 |
+
cal_labels_occurrence (bool): Whether to calculate occurrence of labels
|
896 |
+
Defaults to False.
|
897 |
+
delimiter (Optional[str]): Delimiter to use when splitting the label string, default to None.
|
898 |
+
normalize_audio_db (Optional[float]): normalize audio signal to a target db, default to None.
|
899 |
+
"""
|
900 |
+
|
901 |
+
@property
|
902 |
+
def output_types(self) -> Optional[Dict[str, NeuralType]]:
|
903 |
+
"""Returns definitions of module output ports.
|
904 |
+
"""
|
905 |
+
|
906 |
+
output_types = {
|
907 |
+
'audio_signal': NeuralType(
|
908 |
+
('B', 'T'),
|
909 |
+
AudioSignal(freq=self._sample_rate)
|
910 |
+
if self is not None and hasattr(self, '_sample_rate')
|
911 |
+
else AudioSignal(),
|
912 |
+
),
|
913 |
+
'a_sig_length': NeuralType(tuple('B'), LengthsType()),
|
914 |
+
}
|
915 |
+
|
916 |
+
if self.is_regression_task:
|
917 |
+
output_types.update(
|
918 |
+
{
|
919 |
+
'targets': NeuralType(('B', 'T'), RegressionValuesType()),
|
920 |
+
'targets_length': NeuralType(tuple('B'), LengthsType()),
|
921 |
+
}
|
922 |
+
)
|
923 |
+
else:
|
924 |
+
output_types.update(
|
925 |
+
{'label': NeuralType(('B', 'T'), LabelsType()), 'label_length': NeuralType(tuple('B'), LengthsType()),}
|
926 |
+
)
|
927 |
+
|
928 |
+
return output_types
|
929 |
+
|
930 |
+
def __init__(
|
931 |
+
self,
|
932 |
+
*,
|
933 |
+
manifest_filepath: Union[str, List[str]],
|
934 |
+
sample_rate: int,
|
935 |
+
labels: Optional[List[str]] = None,
|
936 |
+
int_values: bool = False,
|
937 |
+
augmentor: 'nemo.collections.asr.parts.perturb.AudioAugmentor' = None,
|
938 |
+
min_duration: Optional[float] = 0.1,
|
939 |
+
max_duration: Optional[float] = None,
|
940 |
+
trim_silence: bool = False,
|
941 |
+
is_regression_task: bool = False,
|
942 |
+
cal_labels_occurrence: Optional[bool] = False,
|
943 |
+
delimiter: Optional[str] = None,
|
944 |
+
normalize_audio_db: Optional[float] = None,
|
945 |
+
):
|
946 |
+
super().__init__()
|
947 |
+
if isinstance(manifest_filepath, str):
|
948 |
+
manifest_filepath = manifest_filepath.split(',')
|
949 |
+
|
950 |
+
self.delimiter = delimiter
|
951 |
+
self.normalize_audio_db = normalize_audio_db
|
952 |
+
|
953 |
+
self.collection = collections.ASRSpeechLabel(
|
954 |
+
manifests_files=manifest_filepath,
|
955 |
+
min_duration=min_duration,
|
956 |
+
max_duration=max_duration,
|
957 |
+
is_regression_task=is_regression_task,
|
958 |
+
cal_labels_occurrence=cal_labels_occurrence,
|
959 |
+
delimiter=delimiter,
|
960 |
+
)
|
961 |
+
|
962 |
+
self.featurizer = WaveformFeaturizer(sample_rate=sample_rate, int_values=int_values, augmentor=augmentor)
|
963 |
+
self.trim = trim_silence
|
964 |
+
self.is_regression_task = is_regression_task
|
965 |
+
self.id2occurrence = {}
|
966 |
+
self.labels_occurrence = None
|
967 |
+
|
968 |
+
if not is_regression_task:
|
969 |
+
self.labels = labels if labels else self._get_label_set()
|
970 |
+
self.num_classes = len(self.labels) if self.labels is not None else 1
|
971 |
+
self.label2id, self.id2label = {}, {}
|
972 |
+
for label_id, label in enumerate(self.labels):
|
973 |
+
self.label2id[label] = label_id
|
974 |
+
self.id2label[label_id] = label
|
975 |
+
if cal_labels_occurrence:
|
976 |
+
self.id2occurrence[label_id] = self.collection.labels_occurrence[label]
|
977 |
+
self.labels_occurrence.append(self.id2occurrence[label_id])
|
978 |
+
|
979 |
+
for idx in range(len(self.labels[:5])):
|
980 |
+
logging.debug(" label id {} and its mapped label {}".format(idx, self.id2label[idx]))
|
981 |
+
else:
|
982 |
+
self.labels = []
|
983 |
+
self.num_classes = 1
|
984 |
+
|
985 |
+
def _get_label_set(self):
|
986 |
+
labels = []
|
987 |
+
for sample in self.collection:
|
988 |
+
label_str = sample.label
|
989 |
+
if label_str:
|
990 |
+
label_str_list = label_str.split(self.delimiter) if self.delimiter else label_str.split()
|
991 |
+
labels.extend(label_str_list)
|
992 |
+
return sorted(set(labels))
|
993 |
+
|
994 |
+
def _label_str_to_tensor(self, label_str: str):
|
995 |
+
labels = label_str.split(self.delimiter) if self.delimiter else label_str.split()
|
996 |
+
|
997 |
+
if self.is_regression_task:
|
998 |
+
labels = [float(s) for s in labels]
|
999 |
+
labels = torch.tensor(labels).float()
|
1000 |
+
else:
|
1001 |
+
labels = [self.label2id[s] for s in labels]
|
1002 |
+
labels = torch.tensor(labels).long()
|
1003 |
+
return labels
|
1004 |
+
|
1005 |
+
def __len__(self):
|
1006 |
+
return len(self.collection)
|
1007 |
+
|
1008 |
+
def __getitem__(self, index):
|
1009 |
+
sample = self.collection[index]
|
1010 |
+
|
1011 |
+
offset = sample.offset
|
1012 |
+
|
1013 |
+
if offset is None:
|
1014 |
+
offset = 0
|
1015 |
+
|
1016 |
+
features = self.featurizer.process(
|
1017 |
+
sample.audio_file,
|
1018 |
+
offset=offset,
|
1019 |
+
duration=sample.duration,
|
1020 |
+
trim=self.trim,
|
1021 |
+
normalize_db=self.normalize_audio_db,
|
1022 |
+
)
|
1023 |
+
|
1024 |
+
f, fl = features, torch.tensor(features.size(0)).long()
|
1025 |
+
|
1026 |
+
t = self._label_str_to_tensor(sample.label)
|
1027 |
+
|
1028 |
+
tl = torch.tensor(t.size(0)).long()
|
1029 |
+
|
1030 |
+
return f, fl, t, tl
|
1031 |
+
|
1032 |
+
def _collate_fn(self, batch):
|
1033 |
+
return _speech_collate_fn(batch, pad_id=0)
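# Illustrative sketch of the multi-label manifest convention documented above: the "label" field
# holds a delimited sequence of labels (space-separated when `delimiter` is None), which
# _label_str_to_tensor maps to a LongTensor of class ids, or a FloatTensor for regression.
# The values below are placeholders for the example.
def _example_multi_label_entry() -> dict:
    return {
        "audio_filepath": "/data/audio_wav_0.wav",
        "duration": 2.0,
        "offset": 0.0,
        "label": "0 1 1 0 1",
    }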
|
1034 |
+
|
1035 |
+
|
1036 |
+
class TarredAudioToMultiLabelDataset(IterableDataset):
|
1037 |
+
"""
|
1038 |
+
A similar Dataset to the AudioToMultiLabelDataset, but which loads tarred audio files.
|
1039 |
+
|
1040 |
+
Accepts a single comma-separated JSON manifest file (in the same style as for the AudioToSpeechLabelDataset),
|
1041 |
+
as well as the path(s) to the tarball(s) containing the wav files. Each line of the manifest should
|
1042 |
+
contain the information for one audio file, including at least the transcript and name of the audio
|
1043 |
+
file within the tarball.
|
1044 |
+
|
1045 |
+
Valid formats for the audio_tar_filepaths argument include:
|
1046 |
+
(1) a single string that can be brace-expanded, e.g. 'path/to/audio.tar' or 'path/to/audio_{1..100}.tar.gz', or
|
1047 |
+
(2) a list of file paths that will not be brace-expanded, e.g. ['audio_1.tar', 'audio_2.tar', ...].
|
1048 |
+
|
1049 |
+
See the WebDataset documentation for more information about accepted data and input formats.
|
1050 |
+
|
1051 |
+
If using multiple processes the number of shards should be divisible by the number of workers to ensure an
|
1052 |
+
even split among workers. If it is not divisible, logging will give a warning but training will proceed.
|
1053 |
+
In addition, if using multiprocessing, each shard MUST HAVE THE SAME NUMBER OF ENTRIES after filtering
|
1054 |
+
is applied. We currently do not check for this, but your program may hang if the shards are uneven!
|
1055 |
+
|
1056 |
+
Notice that a few arguments are different from the AudioToBPEDataset; for example, shuffle (bool) has been
|
1057 |
+
replaced by shuffle_n (int).
|
1058 |
+
|
1059 |
+
Additionally, please note that the len() of this DataLayer is assumed to be the length of the manifest
|
1060 |
+
after filtering. An incorrect manifest length may lead to some DataLoader issues down the line.
|
1061 |
+
|
1062 |
+
Args:
|
1063 |
+
audio_tar_filepaths: Either a list of audio tarball filepaths, or a
|
1064 |
+
string (can be brace-expandable).
|
1065 |
+
manifest_filepath (str): Path to the manifest.
|
1066 |
+
labels (list): Dataset parameter.
|
1067 |
+
List of target classes that can be output by the speaker recognition model.
|
1068 |
+
shuffle_n (int): How many samples to look ahead and load to be shuffled.
|
1069 |
+
See WebDataset documentation for more details.
|
1070 |
+
Defaults to 0.
|
1071 |
+
min_duration (float): Dataset parameter.
|
1072 |
+
All training files which have a duration less than min_duration
|
1073 |
+
are dropped. Note: Duration is read from the manifest JSON.
|
1074 |
+
Defaults to 0.1.
|
1075 |
+
max_duration (float): Dataset parameter.
|
1076 |
+
All training files which have a duration more than max_duration
|
1077 |
+
are dropped. Note: Duration is read from the manifest JSON.
|
1078 |
+
Defaults to None.
|
1079 |
+
trim (bool): Whether to trim silence from the beginning and end
|
1080 |
+
of audio signal using librosa.effects.trim().
|
1081 |
+
Defaults to False.
|
1082 |
+
window_length_in_sec (float): time length of window/slice (in seconds) # Pass this only for speaker recognition and VAD task
|
1083 |
+
shift_length_in_sec (float): amount of shift of window for generating the frame for VAD task in a batch. # Pass this only for VAD task during inference.
|
1084 |
+
normalize_audio (bool): Whether to normalize audio signal. Defaults to False.
|
1085 |
+
shard_strategy (str): Tarred dataset shard distribution strategy chosen as a str value during ddp.
|
1086 |
+
- `scatter`: The default shard strategy applied by WebDataset, where each node gets
|
1087 |
+
a unique set of shards, which are permanently pre-allocated and never changed at runtime.
|
1088 |
+
- `replicate`: Optional shard strategy, where each node gets all of the set of shards
|
1089 |
+
available in the tarred dataset, which are permanently pre-allocated and never changed at runtime.
|
1090 |
+
The benefit of replication is that it allows each node to sample data points from the entire
|
1091 |
+
dataset independently of other nodes, and reduces dependence on value of `shuffle_n`.
|
1092 |
+
|
1093 |
+
.. warning::
|
1094 |
+
Replicated strategy allows every node to sample the entire set of available tarfiles,
|
1095 |
+
and therefore more than one node may sample the same tarfile, and even sample the same
|
1096 |
+
data points! As such, there is no assured guarantee that all samples in the dataset will be
|
1097 |
+
sampled at least once during 1 epoch. Scattered strategy, on the other hand, on specific
|
1098 |
+
occasions (when the number of shards is not divisible with ``world_size``), will not sample
|
1099 |
+
the entire dataset. For these reasons it is not advisable to use tarred datasets as validation
|
1100 |
+
or test datasets.
|
1101 |
+
global_rank (int): Worker rank, used for partitioning shards. Defaults to 0.
|
1102 |
+
world_size (int): Total number of processes, used for partitioning shards. Defaults to 0.
|
1103 |
+
delimiter (Optional[str]): Delimiter to use when splitting the label string, default to None.
|
1104 |
+
normalize_audio_db (Optional[float]): normalize audio signal to a target db, default to None.
|
1105 |
+
"""
|
1106 |
+
|
1107 |
+
def __init__(
|
1108 |
+
self,
|
1109 |
+
*,
|
1110 |
+
audio_tar_filepaths: Union[str, List[str]],
|
1111 |
+
manifest_filepath: Union[str, List[str]],
|
1112 |
+
sample_rate: int,
|
1113 |
+
labels: Optional[List[str]] = None,
|
1114 |
+
shuffle_n: int = 0,
|
1115 |
+
int_values: bool = False,
|
1116 |
+
augmentor: 'nemo.collections.asr.parts.perturb.AudioAugmentor' = None,
|
1117 |
+
min_duration: Optional[float] = 0.1,
|
1118 |
+
max_duration: Optional[float] = None,
|
1119 |
+
trim_silence: bool = False,
|
1120 |
+
is_regression_task: bool = False,
|
1121 |
+
shard_strategy: str = "scatter",
|
1122 |
+
global_rank: int = 0,
|
1123 |
+
world_size: int = 0,
|
1124 |
+
delimiter: Optional[str] = None,
|
1125 |
+
normalize_audio_db: Optional[float] = None,
|
1126 |
+
):
|
1127 |
+
super().__init__()
|
1128 |
+
if isinstance(manifest_filepath, str):
|
1129 |
+
manifest_filepath = manifest_filepath.split(',')
|
1130 |
+
|
1131 |
+
self.trim = trim_silence
|
1132 |
+
self.is_regression_task = is_regression_task
|
1133 |
+
self.delimiter = delimiter
|
1134 |
+
self.normalize_audio_db = normalize_audio_db
|
1135 |
+
|
1136 |
+
self.collection = collections.ASRSpeechLabel(
|
1137 |
+
manifests_files=manifest_filepath,
|
1138 |
+
min_duration=min_duration,
|
1139 |
+
max_duration=max_duration,
|
1140 |
+
is_regression_task=is_regression_task,
|
1141 |
+
index_by_file_id=True,
|
1142 |
+
)
|
1143 |
+
self.file_occurence = count_occurence(self.collection.mapping)
|
1144 |
+
|
1145 |
+
self.featurizer = WaveformFeaturizer(sample_rate=sample_rate, int_values=int_values, augmentor=augmentor)
|
1146 |
+
|
1147 |
+
if not is_regression_task:
|
1148 |
+
self.labels = labels if labels else self._get_label_set()
|
1149 |
+
self.num_classes = len(self.labels) if self.labels is not None else 1
|
1150 |
+
self.label2id, self.id2label = {}, {}
|
1151 |
+
for label_id, label in enumerate(self.labels):
|
1152 |
+
self.label2id[label] = label_id
|
1153 |
+
self.id2label[label_id] = label
|
1154 |
+
for idx in range(len(self.labels[:5])):
|
1155 |
+
logging.debug(" label id {} and its mapped label {}".format(idx, self.id2label[idx]))
|
1156 |
+
else:
|
1157 |
+
self.labels = []
|
1158 |
+
self.num_classes = 1
|
1159 |
+
|
1160 |
+
audio_tar_filepaths = expand_sharded_filepaths(
|
1161 |
+
sharded_filepaths=audio_tar_filepaths,
|
1162 |
+
shard_strategy=shard_strategy,
|
1163 |
+
world_size=world_size,
|
1164 |
+
global_rank=global_rank,
|
1165 |
+
)
|
1166 |
+
# Put together WebDataset
|
1167 |
+
self._dataset = wd.WebDataset(urls=audio_tar_filepaths, nodesplitter=None)
|
1168 |
+
|
1169 |
+
if shuffle_n > 0:
|
1170 |
+
self._dataset = self._dataset.shuffle(shuffle_n)
|
1171 |
+
else:
|
1172 |
+
logging.info("WebDataset will not shuffle files within the tar files.")
|
1173 |
+
|
1174 |
+
self._dataset = (
|
1175 |
+
self._dataset.rename(audio=VALID_FILE_FORMATS, key='__key__')
|
1176 |
+
.to_tuple('audio', 'key')
|
1177 |
+
.pipe(self._filter)
|
1178 |
+
.map(f=self._build_sample)
|
1179 |
+
)
|
1180 |
+
|
1181 |
+
def _get_label_set(self):
|
1182 |
+
labels = []
|
1183 |
+
for sample in self.collection:
|
1184 |
+
label_str = sample.label
|
1185 |
+
if label_str:
|
1186 |
+
label_str_list = label_str.split(self.delimiter) if self.delimiter else label_str.split()
|
1187 |
+
labels.extend(label_str_list)
|
1188 |
+
return sorted(set(labels))
|
1189 |
+
|
1190 |
+
def _label_str_to_tensor(self, label_str: str):
|
1191 |
+
labels = label_str.split(self.delimiter) if self.delimiter else label_str.split()
|
1192 |
+
|
1193 |
+
if self.is_regression_task:
|
1194 |
+
labels = [float(s) for s in labels]
|
1195 |
+
labels = torch.tensor(labels).float()
|
1196 |
+
else:
|
1197 |
+
labels = [self.label2id[s] for s in labels]
|
1198 |
+
labels = torch.tensor(labels).long()
|
1199 |
+
return labels
|
1200 |
+
|
1201 |
+
def _filter(self, iterator):
|
1202 |
+
"""This function is used to remove samples that have been filtered out by ASRSpeechLabel already.
|
1203 |
+
Otherwise, we would get a KeyError as _build_sample attempts to find the manifest entry for a sample
|
1204 |
+
that was filtered out (e.g. for duration).
|
1205 |
+
Note that if using multi-GPU training, filtering may lead to an imbalance in samples in each shard,
|
1206 |
+
which may make your code hang as one process will finish before the other.
|
1207 |
+
"""
|
1208 |
+
|
1209 |
+
class TarredAudioFilter:
|
1210 |
+
def __init__(self, collection, file_occurence):
|
1211 |
+
self.iterator = iterator
|
1212 |
+
self.collection = collection
|
1213 |
+
self.file_occurence = file_occurence
|
1214 |
+
self._iterable = self._internal_generator()
|
1215 |
+
|
1216 |
+
def __iter__(self):
|
1217 |
+
self._iterable = self._internal_generator()
|
1218 |
+
return self
|
1219 |
+
|
1220 |
+
def __next__(self):
|
1221 |
+
try:
|
1222 |
+
values = next(self._iterable)
|
1223 |
+
except StopIteration:
|
1224 |
+
# reset generator
|
1225 |
+
self._iterable = self._internal_generator()
|
1226 |
+
values = next(self._iterable)
|
1227 |
+
|
1228 |
+
return values
|
1229 |
+
|
1230 |
+
def _internal_generator(self):
|
1231 |
+
"""
|
1232 |
+
WebDataset requires an Iterator, but we require an iterable that yields 1-or-more
|
1233 |
+
values per value inside self.iterator.
|
1234 |
+
|
1235 |
+
Therefore wrap the iterator with a generator function that will yield 1-or-more
|
1236 |
+
values per sample in the iterator.
|
1237 |
+
"""
|
1238 |
+
for _, tup in enumerate(self.iterator):
|
1239 |
+
audio_bytes, audio_filename = tup
|
1240 |
+
|
1241 |
+
file_id, _ = os.path.splitext(os.path.basename(audio_filename))
|
1242 |
+
if audio_filename in self.file_occurence:
|
1243 |
+
for j in range(0, self.file_occurence[file_id]):
|
1244 |
+
if j == 0:
|
1245 |
+
audio_filename = file_id
|
1246 |
+
else:
|
1247 |
+
audio_filename = file_id + "-sub" + str(j)
|
1248 |
+
yield audio_bytes, audio_filename
|
1249 |
+
|
1250 |
+
return TarredAudioFilter(self.collection, self.file_occurence)
|
1251 |
+
|
1252 |
+
def _build_sample(self, tup):
|
1253 |
+
"""Builds the training sample by combining the data from the WebDataset with the manifest info.
|
1254 |
+
"""
|
1255 |
+
audio_bytes, audio_filename = tup
|
1256 |
+
# Grab manifest entry from self.collection
|
1257 |
+
file_id, _ = os.path.splitext(os.path.basename(audio_filename))
|
1258 |
+
|
1259 |
+
manifest_idx = self.collection.mapping[file_id]
|
1260 |
+
manifest_entry = self.collection[manifest_idx]
|
1261 |
+
|
1262 |
+
offset = manifest_entry.offset
|
1263 |
+
if offset is None:
|
1264 |
+
offset = 0
|
1265 |
+
|
1266 |
+
# Convert audio bytes to IO stream for processing (for SoundFile to read)
|
1267 |
+
audio_filestream = io.BytesIO(audio_bytes)
|
1268 |
+
features = self.featurizer.process(
|
1269 |
+
audio_filestream,
|
1270 |
+
offset=offset,
|
1271 |
+
duration=manifest_entry.duration,
|
1272 |
+
trim=self.trim,
|
1273 |
+
normalize_db=self.normalize_audio_db,
|
1274 |
+
)
|
1275 |
+
|
1276 |
+
audio_filestream.close()
|
1277 |
+
|
1278 |
+
# Audio features
|
1279 |
+
f, fl = features, torch.tensor(features.shape[0]).long()
|
1280 |
+
|
1281 |
+
t = self._label_str_to_tensor(manifest_entry.label)
|
1282 |
+
|
1283 |
+
tl = torch.tensor(t.size(0)).long()
|
1284 |
+
|
1285 |
+
return f, fl, t, tl
|
1286 |
+
|
1287 |
+
def __iter__(self):
|
1288 |
+
return self._dataset.__iter__()
|
1289 |
+
|
1290 |
+
def __len__(self):
|
1291 |
+
return len(self.collection)
|
1292 |
+
|
1293 |
+
def _collate_fn(self, batch):
|
1294 |
+
return _speech_collate_fn(batch, pad_id=0)
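# Illustrative sketch: the tarred classes above are IterableDatasets, so they are consumed by
# iteration rather than indexing. `dataset` is assumed to be an instance of one of the tarred
# classification/multi-label classes above, which expose `_collate_fn`.
def _example_iterate_tarred(dataset):
    from torch.utils.data import DataLoader

    loader = DataLoader(dataset, batch_size=16, collate_fn=dataset._collate_fn, num_workers=4)
    for audio_signal, audio_lengths, labels, label_lengths in loader:
        # audio_signal: [B, T] padded waveforms; audio_lengths: [B]; labels/label_lengths per dataset type
        break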
|
SoundScribe/SpeakerID/nemo/collections/asr/data/audio_to_label_dataset.py
ADDED
@@ -0,0 +1,304 @@
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import copy

from omegaconf import DictConfig

from nemo.collections.asr.data import audio_to_label
from nemo.collections.asr.data.audio_to_text_dataset import convert_to_config_list, get_chain_dataset
from nemo.collections.asr.parts.preprocessing.perturb import process_augmentations
from nemo.collections.common.data.dataset import ConcatDataset

def get_classification_label_dataset(featurizer, config: dict) -> audio_to_label.AudioToClassificationLabelDataset:
    """
    Instantiates a Classification AudioLabelDataset.

    Args:
        config: Config of the AudioToClassificationLabelDataset.

    Returns:
        An instance of AudioToClassificationLabelDataset.
    """
    dataset = audio_to_label.AudioToClassificationLabelDataset(
        manifest_filepath=config['manifest_filepath'],
        labels=config['labels'],
        featurizer=featurizer,
        max_duration=config.get('max_duration', None),
        min_duration=config.get('min_duration', None),
        trim=config.get('trim_silence', False),
        is_regression_task=config.get('is_regression_task', False),
        cal_labels_occurrence=config.get('cal_labels_occurrence', False),
    )
    return dataset
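# Illustrative sketch: a minimal config dict accepted by the factory above. The keys mirror the
# config.get(...) calls in the function body; the manifest path and label set are placeholders.
_EXAMPLE_CLASSIFICATION_DS_CONFIG = {
    'manifest_filepath': '/data/train_manifest.json',
    'labels': ['yes', 'no', 'up', 'down'],
    'max_duration': 16.7,
    'min_duration': 0.1,
    'trim_silence': False,
    'is_regression_task': False,
    'cal_labels_occurrence': False,
}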


def get_speech_label_dataset(featurizer, config: dict) -> audio_to_label.AudioToSpeechLabelDataset:
    """
    Instantiates a Speech Label (e.g. VAD, speaker recognition) AudioLabelDataset.

    Args:
        config: Config of the AudioToSpeechLabelDataSet.

    Returns:
        An instance of AudioToSpeechLabelDataset.
    """
    dataset = audio_to_label.AudioToSpeechLabelDataset(
        manifest_filepath=config['manifest_filepath'],
        labels=config['labels'],
        featurizer=featurizer,
        max_duration=config.get('max_duration', None),
        min_duration=config.get('min_duration', None),
        trim=config.get('trim_silence', False),
        window_length_in_sec=config.get('window_length_in_sec', 0.31),
        shift_length_in_sec=config.get('shift_length_in_sec', 0.01),
        normalize_audio=config.get('normalize_audio', False),
        cal_labels_occurrence=config.get('cal_labels_occurrence', False),
    )
    return dataset
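# Illustrative sketch: the additional windowing keys read by get_speech_label_dataset for VAD-style
# training. The values are example settings, not defaults beyond those shown in the function above.
_EXAMPLE_VAD_DS_CONFIG = {
    'manifest_filepath': '/data/vad_manifest.json',
    'labels': ['background', 'speech'],
    'window_length_in_sec': 0.63,
    'shift_length_in_sec': 0.08,
    'normalize_audio': False,
}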
|
70 |
+
|
71 |
+
|
72 |
+
def get_tarred_classification_label_dataset(
|
73 |
+
featurizer, config: dict, shuffle_n: int, global_rank: int, world_size: int
|
74 |
+
) -> audio_to_label.TarredAudioToClassificationLabelDataset:
|
75 |
+
"""
|
76 |
+
Instantiates a Classification TarredAudioLabelDataset.
|
77 |
+
|
78 |
+
Args:
|
79 |
+
config: Config of the TarredAudioToClassificationLabelDataset.
|
80 |
+
shuffle_n: How many samples to look ahead and load to be shuffled.
|
81 |
+
See WebDataset documentation for more details.
|
82 |
+
global_rank: Global rank of this device.
|
83 |
+
world_size: Global world size in the training method.
|
84 |
+
|
85 |
+
Returns:
|
86 |
+
An instance of TarredAudioToClassificationLabelDataset.
|
87 |
+
"""
|
88 |
+
tarred_audio_filepaths = config['tarred_audio_filepaths']
|
89 |
+
manifest_filepaths = config['manifest_filepath']
|
90 |
+
datasets = []
|
91 |
+
tarred_audio_filepaths = convert_to_config_list(tarred_audio_filepaths)
|
92 |
+
manifest_filepaths = convert_to_config_list(manifest_filepaths)
|
93 |
+
|
94 |
+
bucketing_weights = config.get('bucketing_weights', None) # For upsampling buckets
|
95 |
+
if bucketing_weights:
|
96 |
+
for idx, weight in enumerate(bucketing_weights):
|
97 |
+
if not isinstance(weight, int) or weight <= 0:
|
98 |
+
raise ValueError(f"bucket weights must be positive integers")
|
99 |
+
|
100 |
+
if len(manifest_filepaths) != len(tarred_audio_filepaths):
|
101 |
+
raise ValueError(
|
102 |
+
f"manifest_filepaths (length={len(manifest_filepaths)}) and tarred_audio_filepaths (length={len(tarred_audio_filepaths)}) need to have the same number of buckets."
|
103 |
+
)
|
104 |
+
|
105 |
+
for dataset_idx, (tarred_audio_filepath, manifest_filepath) in enumerate(
|
106 |
+
zip(tarred_audio_filepaths, manifest_filepaths)
|
107 |
+
):
|
108 |
+
if len(tarred_audio_filepath) == 1:
|
109 |
+
tarred_audio_filepath = tarred_audio_filepath[0]
|
110 |
+
dataset = audio_to_label.TarredAudioToClassificationLabelDataset(
|
111 |
+
audio_tar_filepaths=tarred_audio_filepath,
|
112 |
+
manifest_filepath=manifest_filepath,
|
113 |
+
labels=config['labels'],
|
114 |
+
featurizer=featurizer,
|
115 |
+
shuffle_n=shuffle_n,
|
116 |
+
max_duration=config.get('max_duration', None),
|
117 |
+
min_duration=config.get('min_duration', None),
|
118 |
+
trim=config.get('trim_silence', False),
|
119 |
+
shard_strategy=config.get('tarred_shard_strategy', 'scatter'),
|
120 |
+
global_rank=global_rank,
|
121 |
+
world_size=world_size,
|
122 |
+
is_regression_task=config.get('is_regression_task', False),
|
123 |
+
)
|
124 |
+
|
125 |
+
if bucketing_weights:
|
126 |
+
datasets.extend([dataset] * bucketing_weights[dataset_idx])
|
127 |
+
else:
|
128 |
+
datasets.append(dataset)
|
129 |
+
|
130 |
+
return get_chain_dataset(datasets=datasets, ds_config=config, rank=global_rank)
|
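As a sketch of the bucket upsampling handled above (paths and weights below are illustrative assumptions), a config fragment with two buckets and `bucketing_weights=[2, 1]` causes the first bucket's dataset to be appended twice, so the chained dataset draws from it roughly twice as often:

# Illustrative config fragment for two buckets; all paths are hypothetical.
config = {
    'manifest_filepath': [['bucket1/tarred_manifest.json'], ['bucket2/tarred_manifest.json']],
    'tarred_audio_filepaths': [['bucket1/audio_{0..7}.tar'], ['bucket2/audio_{0..7}.tar']],
    'labels': ['speech', 'non_speech'],
    'bucketing_weights': [2, 1],  # bucket 1 is sampled ~2x as often as bucket 2
}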
131 |
+
|
132 |
+
|
133 |
+
def get_concat_tarred_speech_label_dataset(
|
134 |
+
featurizer, config: dict, shuffle_n: int, global_rank: int, world_size: int,
|
135 |
+
):
|
136 |
+
tarred_audio_filepaths = config['tarred_audio_filepaths']
|
137 |
+
manifest_filepaths = config['manifest_filepath']
|
138 |
+
datasets = []
|
139 |
+
for dataset_idx, (tarred_audio_filepath, manifest_filepath) in enumerate(
|
140 |
+
zip(tarred_audio_filepaths, manifest_filepaths)
|
141 |
+
):
|
142 |
+
conf = copy.deepcopy(config)
|
143 |
+
conf['manifest_filepath'] = manifest_filepath
|
144 |
+
conf['tarred_audio_filepaths'] = tarred_audio_filepath
|
145 |
+
dataset = get_tarred_speech_label_dataset(
|
146 |
+
config=conf, featurizer=featurizer, shuffle_n=shuffle_n, global_rank=global_rank, world_size=world_size,
|
147 |
+
)
|
148 |
+
datasets.append(dataset)
|
149 |
+
|
150 |
+
dataset = ConcatDataset(
|
151 |
+
datasets,
|
152 |
+
sampling_technique=config.get('concat_sampling_technique', 'temperature'),
|
153 |
+
sampling_temperature=config.get('concat_sampling_temperature', 5),
|
154 |
+
sampling_probabilities=config.get('concat_sampling_probabilities', None),
|
155 |
+
global_rank=global_rank,
|
156 |
+
world_size=world_size,
|
157 |
+
shuffle=config['shuffle'],
|
158 |
+
)
|
159 |
+
return dataset
|
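For reference, a hedged sketch of the concatenation-related keys this helper reads via config.get(...) above (values are illustrative; the supported sampling techniques are defined by ConcatDataset):

# Illustrative fragment; only keys read by this function are shown.
config_fragment = {
    'concat_sampling_technique': 'temperature',   # default used above
    'concat_sampling_temperature': 5,
    'concat_sampling_probabilities': None,        # only relevant for probability-based sampling
    'shuffle': True,
}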
160 |
+
|
161 |
+
|
162 |
+
def get_tarred_speech_label_dataset(
|
163 |
+
featurizer, config: dict, shuffle_n: int, global_rank: int, world_size: int,
|
164 |
+
) -> audio_to_label.TarredAudioToSpeechLabelDataset:
|
165 |
+
"""
|
166 |
+
Instantiates a Speech Label (e.g. VAD, speaker recognition) TarredAudioLabelDataset.
|
167 |
+
|
168 |
+
Args:
|
169 |
+
config: Config of the TarredAudioToSpeechLabelDataset.
|
170 |
+
shuffle_n: How many samples to look ahead and load to be shuffled.
|
171 |
+
See WebDataset documentation for more details.
|
172 |
+
global_rank: Global rank of this device.
|
173 |
+
world_size: Global world size in the training method.
|
174 |
+
|
175 |
+
Returns:
|
176 |
+
An instance of TarredAudioToSpeechLabelDataset.
|
177 |
+
"""
|
178 |
+
tarred_audio_filepaths = config['tarred_audio_filepaths']
|
179 |
+
manifest_filepaths = config['manifest_filepath']
|
180 |
+
datasets = []
|
181 |
+
tarred_audio_filepaths = convert_to_config_list(tarred_audio_filepaths)
|
182 |
+
manifest_filepaths = convert_to_config_list(manifest_filepaths)
|
183 |
+
|
184 |
+
bucketing_weights = config.get('bucketing_weights', None) # For upsampling buckets
|
185 |
+
if bucketing_weights:
|
186 |
+
for idx, weight in enumerate(bucketing_weights):
|
187 |
+
if not isinstance(weight, int) or weight <= 0:
|
188 |
+
raise ValueError(f"bucket weights must be positive integers")
|
189 |
+
|
190 |
+
if len(manifest_filepaths) != len(tarred_audio_filepaths):
|
191 |
+
raise ValueError(
|
192 |
+
f"manifest_filepaths (length={len(manifest_filepaths)}) and tarred_audio_filepaths (length={len(tarred_audio_filepaths)}) need to have the same number of buckets."
|
193 |
+
)
|
194 |
+
|
195 |
+
for dataset_idx, (tarred_audio_filepath, manifest_filepath) in enumerate(
|
196 |
+
zip(tarred_audio_filepaths, manifest_filepaths)
|
197 |
+
):
|
198 |
+
if len(tarred_audio_filepath) == 1:
|
199 |
+
tarred_audio_filepath = tarred_audio_filepath[0]
|
200 |
+
dataset = audio_to_label.TarredAudioToSpeechLabelDataset(
|
201 |
+
audio_tar_filepaths=tarred_audio_filepath,
|
202 |
+
manifest_filepath=manifest_filepath,
|
203 |
+
labels=config['labels'],
|
204 |
+
featurizer=featurizer,
|
205 |
+
shuffle_n=shuffle_n,
|
206 |
+
max_duration=config.get('max_duration', None),
|
207 |
+
min_duration=config.get('min_duration', None),
|
208 |
+
trim=config.get('trim_silence', False),
|
209 |
+
window_length_in_sec=config.get('window_length_in_sec', 8),
|
210 |
+
shift_length_in_sec=config.get('shift_length_in_sec', 0.075),
|
211 |
+
normalize_audio=config.get('normalize_audio', False),
|
212 |
+
shard_strategy=config.get('tarred_shard_strategy', 'scatter'),
|
213 |
+
global_rank=global_rank,
|
214 |
+
world_size=world_size,
|
215 |
+
)
|
216 |
+
|
217 |
+
if bucketing_weights:
|
218 |
+
datasets.extend([dataset] * bucketing_weights[dataset_idx])
|
219 |
+
else:
|
220 |
+
datasets.append(dataset)
|
221 |
+
|
222 |
+
return get_chain_dataset(datasets=datasets, ds_config=config, rank=global_rank)
|
223 |
+
|
224 |
+
|
225 |
+
def get_audio_multi_label_dataset(cfg: DictConfig) -> audio_to_label.AudioToMultiLabelDataset:
|
226 |
+
if "augmentor" in cfg:
|
227 |
+
augmentor = process_augmentations(cfg.augmentor)
|
228 |
+
else:
|
229 |
+
augmentor = None
|
230 |
+
|
231 |
+
dataset = audio_to_label.AudioToMultiLabelDataset(
|
232 |
+
manifest_filepath=cfg.get("manifest_filepath"),
|
233 |
+
sample_rate=cfg.get("sample_rate"),
|
234 |
+
labels=cfg.get("labels", None),
|
235 |
+
int_values=cfg.get("int_values", False),
|
236 |
+
augmentor=augmentor,
|
237 |
+
min_duration=cfg.get("min_duration", None),
|
238 |
+
max_duration=cfg.get("max_duration", None),
|
239 |
+
trim_silence=cfg.get("trim_silence", False),
|
240 |
+
is_regression_task=cfg.get("is_regression_task", False),
|
241 |
+
cal_labels_occurrence=cfg.get("cal_labels_occurrence", False),
|
242 |
+
delimiter=cfg.get("delimiter", None),
|
243 |
+
normalize_audio_db=cfg.get("normalize_audio_db", None),
|
244 |
+
)
|
245 |
+
return dataset
|
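A hedged sketch of calling this helper with an OmegaConf config (the manifest path, labels, and augmentor sub-config are assumptions for illustration; the accepted augmentor keys are defined by process_augmentations):

from omegaconf import OmegaConf
from nemo.collections.asr.data.audio_to_label_dataset import get_audio_multi_label_dataset

# Hypothetical values; 'delimiter' splits the label string of each manifest entry.
cfg = OmegaConf.create({
    'manifest_filepath': '/data/multilabel_manifest.json',
    'sample_rate': 16000,
    'labels': ['laughter', 'music', 'speech'],
    'delimiter': ' ',
    'augmentor': {'white_noise': {'prob': 0.3, 'min_level': -90, 'max_level': -46}},  # assumed example
})
dataset = get_audio_multi_label_dataset(cfg)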
246 |
+
|
247 |
+
|
248 |
+
def get_tarred_audio_multi_label_dataset(
|
249 |
+
cfg: DictConfig, shuffle_n: int, global_rank: int, world_size: int
|
250 |
+
) -> audio_to_label.TarredAudioToMultiLabelDataset:
|
251 |
+
|
252 |
+
if "augmentor" in cfg:
|
253 |
+
augmentor = process_augmentations(cfg.augmentor)
|
254 |
+
else:
|
255 |
+
augmentor = None
|
256 |
+
|
257 |
+
tarred_audio_filepaths = cfg['tarred_audio_filepaths']
|
258 |
+
manifest_filepaths = cfg['manifest_filepath']
|
259 |
+
datasets = []
|
260 |
+
tarred_audio_filepaths = convert_to_config_list(tarred_audio_filepaths)
|
261 |
+
manifest_filepaths = convert_to_config_list(manifest_filepaths)
|
262 |
+
|
263 |
+
bucketing_weights = cfg.get('bucketing_weights', None) # For upsampling buckets
|
264 |
+
if bucketing_weights:
|
265 |
+
for idx, weight in enumerate(bucketing_weights):
|
266 |
+
if not isinstance(weight, int) or weight <= 0:
|
267 |
+
raise ValueError(f"bucket weights must be positive integers")
|
268 |
+
|
269 |
+
if len(manifest_filepaths) != len(tarred_audio_filepaths):
|
270 |
+
raise ValueError(
|
271 |
+
f"manifest_filepaths (length={len(manifest_filepaths)}) and tarred_audio_filepaths (length={len(tarred_audio_filepaths)}) need to have the same number of buckets."
|
272 |
+
)
|
273 |
+
|
274 |
+
for dataset_idx, (tarred_audio_filepath, manifest_filepath) in enumerate(
|
275 |
+
zip(tarred_audio_filepaths, manifest_filepaths)
|
276 |
+
):
|
277 |
+
if len(tarred_audio_filepath) == 1:
|
278 |
+
tarred_audio_filepath = tarred_audio_filepath[0]
|
279 |
+
|
280 |
+
dataset = audio_to_label.TarredAudioToMultiLabelDataset(
|
281 |
+
audio_tar_filepaths=tarred_audio_filepath,
|
282 |
+
manifest_filepath=manifest_filepath,
|
283 |
+
sample_rate=cfg["sample_rate"],
|
284 |
+
labels=cfg['labels'],
|
285 |
+
shuffle_n=shuffle_n,
|
286 |
+
int_values=cfg.get("int_values", False),
|
287 |
+
augmentor=augmentor,
|
288 |
+
min_duration=cfg.get('min_duration', None),
|
289 |
+
max_duration=cfg.get('max_duration', None),
|
290 |
+
trim_silence=cfg.get('trim_silence', False),
|
291 |
+
is_regression_task=cfg.get('is_regression_task', False),
|
292 |
+
delimiter=cfg.get("delimiter", None),
|
293 |
+
shard_strategy=cfg.get('tarred_shard_strategy', 'scatter'),
|
294 |
+
global_rank=global_rank,
|
295 |
+
world_size=world_size,
|
296 |
+
normalize_audio_db=cfg.get("normalize_audio_db", None),
|
297 |
+
)
|
298 |
+
|
299 |
+
if bucketing_weights:
|
300 |
+
datasets.extend([dataset] * bucketing_weights[dataset_idx])
|
301 |
+
else:
|
302 |
+
datasets.append(dataset)
|
303 |
+
|
304 |
+
return get_chain_dataset(datasets=datasets, ds_config=cfg, rank=global_rank)
|
SoundScribe/SpeakerID/nemo/collections/asr/data/audio_to_text.py
ADDED
@@ -0,0 +1,1366 @@
1 |
+
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
import io
|
15 |
+
import json
|
16 |
+
import math
|
17 |
+
import multiprocessing
|
18 |
+
import os
|
19 |
+
from typing import Callable, Dict, Iterable, List, Optional, Tuple, Union
|
20 |
+
|
21 |
+
import braceexpand
|
22 |
+
import numpy as np
|
23 |
+
import torch
|
24 |
+
import webdataset as wd
|
25 |
+
from torch.utils.data import ChainDataset
|
26 |
+
from tqdm import tqdm
|
27 |
+
|
28 |
+
from nemo.collections.asr.parts.preprocessing.features import WaveformFeaturizer
|
29 |
+
from nemo.collections.asr.parts.utils.audio_utils import ChannelSelectorType
|
30 |
+
from nemo.collections.common import tokenizers
|
31 |
+
from nemo.collections.common.parts.preprocessing import collections, parsers
|
32 |
+
from nemo.core.classes import Dataset, IterableDataset
|
33 |
+
from nemo.core.neural_types import *
|
34 |
+
from nemo.utils import logging
|
35 |
+
from nemo.utils.data_utils import (
|
36 |
+
DataStoreObject,
|
37 |
+
datastore_object_get,
|
38 |
+
datastore_path_to_webdataset_url,
|
39 |
+
is_datastore_cache_shared,
|
40 |
+
is_datastore_path,
|
41 |
+
is_tarred_path,
|
42 |
+
)
|
43 |
+
from nemo.utils.get_rank import is_global_rank_zero
|
44 |
+
|
45 |
+
__all__ = [
|
46 |
+
'AudioToCharDataset',
|
47 |
+
'AudioToBPEDataset',
|
48 |
+
'TarredAudioToCharDataset',
|
49 |
+
'TarredAudioToBPEDataset',
|
50 |
+
]
|
51 |
+
|
52 |
+
|
53 |
+
def _speech_collate_fn(batch, pad_id):
|
54 |
+
"""collate batch of audio sig, audio len, tokens, tokens len
|
55 |
+
Args:
|
56 |
+
batch (Optional[FloatTensor], Optional[LongTensor], LongTensor,
|
57 |
+
LongTensor): A tuple of tuples of signal, signal lengths,
|
58 |
+
encoded tokens, and encoded tokens length. This collate func
|
59 |
+
assumes the signals are 1d torch tensors (i.e. mono audio).
|
60 |
+
"""
|
61 |
+
packed_batch = list(zip(*batch))
|
62 |
+
if len(packed_batch) == 5:
|
63 |
+
_, audio_lengths, _, tokens_lengths, sample_ids = packed_batch
|
64 |
+
elif len(packed_batch) == 4:
|
65 |
+
sample_ids = None
|
66 |
+
_, audio_lengths, _, tokens_lengths = packed_batch
|
67 |
+
else:
|
68 |
+
raise ValueError("Expects 4 or 5 tensors in the batch!")
|
69 |
+
max_audio_len = 0
|
70 |
+
has_audio = audio_lengths[0] is not None
|
71 |
+
if has_audio:
|
72 |
+
max_audio_len = max(audio_lengths).item()
|
73 |
+
max_tokens_len = max(tokens_lengths).item()
|
74 |
+
|
75 |
+
audio_signal, tokens = [], []
|
76 |
+
for b in batch:
|
77 |
+
if len(b) == 5:
|
78 |
+
sig, sig_len, tokens_i, tokens_i_len, _ = b
|
79 |
+
else:
|
80 |
+
sig, sig_len, tokens_i, tokens_i_len = b
|
81 |
+
if has_audio:
|
82 |
+
sig_len = sig_len.item()
|
83 |
+
if sig_len < max_audio_len:
|
84 |
+
pad = (0, max_audio_len - sig_len)
|
85 |
+
sig = torch.nn.functional.pad(sig, pad)
|
86 |
+
audio_signal.append(sig)
|
87 |
+
tokens_i_len = tokens_i_len.item()
|
88 |
+
if tokens_i_len < max_tokens_len:
|
89 |
+
pad = (0, max_tokens_len - tokens_i_len)
|
90 |
+
tokens_i = torch.nn.functional.pad(tokens_i, pad, value=pad_id)
|
91 |
+
tokens.append(tokens_i)
|
92 |
+
|
93 |
+
if has_audio:
|
94 |
+
audio_signal = torch.stack(audio_signal)
|
95 |
+
audio_lengths = torch.stack(audio_lengths)
|
96 |
+
else:
|
97 |
+
audio_signal, audio_lengths = None, None
|
98 |
+
tokens = torch.stack(tokens)
|
99 |
+
tokens_lengths = torch.stack(tokens_lengths)
|
100 |
+
if sample_ids is None:
|
101 |
+
return audio_signal, audio_lengths, tokens, tokens_lengths
|
102 |
+
else:
|
103 |
+
sample_ids = torch.tensor(sample_ids, dtype=torch.int32)
|
104 |
+
return audio_signal, audio_lengths, tokens, tokens_lengths, sample_ids
|
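A minimal sketch of the padding behavior using dummy tensors (shapes are illustrative): the shorter signal is zero-padded and the shorter token sequence is padded with pad_id.

import torch

# Two mono signals and two token sequences of different lengths.
batch = [
    (torch.randn(16000), torch.tensor(16000), torch.tensor([1, 2, 3]), torch.tensor(3)),
    (torch.randn(8000), torch.tensor(8000), torch.tensor([4, 5]), torch.tensor(2)),
]
audio, audio_len, tokens, tokens_len = _speech_collate_fn(batch, pad_id=0)
# audio.shape == (2, 16000); tokens.shape == (2, 3)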
105 |
+
|
106 |
+
|
107 |
+
class ASRManifestProcessor:
|
108 |
+
"""
|
109 |
+
Class that processes a manifest json file containing paths to audio files, transcripts, and durations (in seconds).
|
110 |
+
Each new line is a different sample. Example below:
|
111 |
+
{"audio_filepath": "/path/to/audio.wav", "text_filepath": "/path/to/audio.txt", "duration": 23.147}
|
112 |
+
...
|
113 |
+
{"audio_filepath": "/path/to/audio.wav", "text": "the transcription", "offset": 301.75, "duration": 0.82, "utt":
|
114 |
+
"utterance_id", "ctm_utt": "en_4156", "side": "A"}
|
115 |
+
Args:
|
116 |
+
manifest_filepath: Path to manifest json as described above. Can be comma-separated paths.
|
117 |
+
parser: Str for a language specific preprocessor or a callable.
|
118 |
+
max_duration: If audio exceeds this length, do not include in dataset.
|
119 |
+
min_duration: If audio is less than this length, do not include in dataset.
|
120 |
+
max_utts: Limit number of utterances.
|
121 |
+
bos_id: Id of beginning of sequence symbol to append if not None.
|
122 |
+
eos_id: Id of end of sequence symbol to append if not None.
|
123 |
+
pad_id: Id of pad symbol. Defaults to 0.
|
124 |
+
"""
|
125 |
+
|
126 |
+
def __init__(
|
127 |
+
self,
|
128 |
+
manifest_filepath: str,
|
129 |
+
parser: Union[str, Callable],
|
130 |
+
max_duration: Optional[float] = None,
|
131 |
+
min_duration: Optional[float] = None,
|
132 |
+
max_utts: int = 0,
|
133 |
+
bos_id: Optional[int] = None,
|
134 |
+
eos_id: Optional[int] = None,
|
135 |
+
pad_id: int = 0,
|
136 |
+
index_by_file_id: bool = False,
|
137 |
+
):
|
138 |
+
self.parser = parser
|
139 |
+
|
140 |
+
self.collection = collections.ASRAudioText(
|
141 |
+
manifests_files=manifest_filepath,
|
142 |
+
parser=parser,
|
143 |
+
min_duration=min_duration,
|
144 |
+
max_duration=max_duration,
|
145 |
+
max_number=max_utts,
|
146 |
+
index_by_file_id=index_by_file_id,
|
147 |
+
)
|
148 |
+
|
149 |
+
self.eos_id = eos_id
|
150 |
+
self.bos_id = bos_id
|
151 |
+
self.pad_id = pad_id
|
152 |
+
|
153 |
+
def process_text_by_id(self, index: int) -> Tuple[List[int], int]:
|
154 |
+
sample = self.collection[index]
|
155 |
+
return self.process_text_by_sample(sample)
|
156 |
+
|
157 |
+
def process_text_by_file_id(self, file_id: str) -> Tuple[List[int], int]:
|
158 |
+
manifest_idx = self.collection.mapping[file_id][0]
|
159 |
+
sample = self.collection[manifest_idx]
|
160 |
+
return self.process_text_by_sample(sample)
|
161 |
+
|
162 |
+
def process_text_by_sample(self, sample: collections.ASRAudioText.OUTPUT_TYPE) -> Tuple[List[int], int]:
|
163 |
+
t, tl = sample.text_tokens, len(sample.text_tokens)
|
164 |
+
|
165 |
+
if self.bos_id is not None:
|
166 |
+
t = [self.bos_id] + t
|
167 |
+
tl += 1
|
168 |
+
if self.eos_id is not None:
|
169 |
+
t = t + [self.eos_id]
|
170 |
+
tl += 1
|
171 |
+
|
172 |
+
return t, tl
|
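For reference, a small sketch of producing a manifest in the one-JSON-object-per-line format described above (paths and transcripts are placeholders):

import json

samples = [
    {"audio_filepath": "/path/to/audio1.wav", "text": "hello world", "duration": 1.2},
    {"audio_filepath": "/path/to/audio2.wav", "text": "good morning", "duration": 0.9},
]
with open("train_manifest.json", "w") as f:
    for sample in samples:
        f.write(json.dumps(sample) + "\n")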
173 |
+
|
174 |
+
|
175 |
+
def expand_sharded_filepaths(sharded_filepaths, shard_strategy: str, world_size: int, global_rank: int):
|
176 |
+
valid_shard_strategies = ['scatter', 'replicate']
|
177 |
+
if shard_strategy not in valid_shard_strategies:
|
178 |
+
raise ValueError(f"`shard_strategy` must be one of {valid_shard_strategies}")
|
179 |
+
|
180 |
+
if isinstance(sharded_filepaths, str):
|
181 |
+
# Normalize alternative opening braces ('(', '[', '<', '_OP_') to '{'
|
182 |
+
brace_keys_open = ['(', '[', '<', '_OP_']
|
183 |
+
for bkey in brace_keys_open:
|
184 |
+
if bkey in sharded_filepaths:
|
185 |
+
sharded_filepaths = sharded_filepaths.replace(bkey, "{")
|
186 |
+
|
187 |
+
# Normalize alternative closing braces (')', ']', '>', '_CL_') to '}'
|
188 |
+
brace_keys_close = [')', ']', '>', '_CL_']
|
189 |
+
for bkey in brace_keys_close:
|
190 |
+
if bkey in sharded_filepaths:
|
191 |
+
sharded_filepaths = sharded_filepaths.replace(bkey, "}")
|
192 |
+
|
193 |
+
if isinstance(sharded_filepaths, str):
|
194 |
+
# Brace expand, set escape=False for Windows compatibility
|
195 |
+
sharded_filepaths = list(braceexpand.braceexpand(sharded_filepaths, escape=False))
|
196 |
+
|
197 |
+
# Expand store paths into WebDataset URLs
|
198 |
+
sharded_filepaths = [
|
199 |
+
datastore_path_to_webdataset_url(p) if is_datastore_path(p) and is_tarred_path(p) else p
|
200 |
+
for p in sharded_filepaths
|
201 |
+
]
|
202 |
+
|
203 |
+
# Check for distributed and partition shards accordingly
|
204 |
+
if world_size > 1:
|
205 |
+
if shard_strategy == 'scatter':
|
206 |
+
logging.info("All tarred dataset shards will be scattered evenly across all nodes.")
|
207 |
+
|
208 |
+
if len(sharded_filepaths) % world_size != 0:
|
209 |
+
logging.warning(
|
210 |
+
f"Number of shards in tarred dataset ({len(sharded_filepaths)}) is not divisible "
|
211 |
+
f"by number of distributed workers ({world_size})."
|
212 |
+
)
|
213 |
+
|
214 |
+
begin_idx = (len(sharded_filepaths) // world_size) * global_rank
|
215 |
+
end_idx = begin_idx + len(sharded_filepaths) // world_size
|
216 |
+
sharded_filepaths = sharded_filepaths[begin_idx:end_idx]
|
217 |
+
logging.info(
|
218 |
+
"Partitioning tarred dataset: process (%d) taking shards [%d, %d)", global_rank, begin_idx, end_idx
|
219 |
+
)
|
220 |
+
|
221 |
+
elif shard_strategy == 'replicate':
|
222 |
+
logging.info("All tarred dataset shards will be replicated across all nodes.")
|
223 |
+
else:
|
224 |
+
raise ValueError(f"Invalid shard strategy ! Allowed values are : {valid_shard_strategies}")
|
225 |
+
|
226 |
+
return sharded_filepaths
|
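A small illustration of the brace normalization above (the tar names are hypothetical): '(' / '[' / '<' / '_OP_' map to '{' and their counterparts map to '}', so all of the following specs expand to the same four shards.

import braceexpand

for spec in ("audio__OP_0..3_CL_.tar", "audio_(0..3).tar", "audio_{0..3}.tar"):
    normalized = spec
    for bkey in ('(', '[', '<', '_OP_'):
        normalized = normalized.replace(bkey, '{')
    for bkey in (')', ']', '>', '_CL_'):
        normalized = normalized.replace(bkey, '}')
    # Each spec expands to ['audio_0.tar', 'audio_1.tar', 'audio_2.tar', 'audio_3.tar']
    print(list(braceexpand.braceexpand(normalized, escape=False)))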
227 |
+
|
228 |
+
|
229 |
+
def cache_datastore_manifests(
|
230 |
+
manifest_filepaths: Union[str, List[str]],
|
231 |
+
cache_audio: bool = False,
|
232 |
+
shared_cache: Optional[bool] = None,
|
233 |
+
num_workers: Optional[int] = None,
|
234 |
+
max_num_workers: int = 20,
|
235 |
+
):
|
236 |
+
"""Cache manifests and audio from an object store.
|
237 |
+
It is assumed that remote manifests are using relative paths.
|
238 |
+
|
239 |
+
Args:
|
240 |
+
manifest_filepaths: list of paths to manifest files (list of strings or a string with `,` as separator)
|
241 |
+
cache_audio: If True, audio from manifest will also be cached
|
242 |
+
shared_cache: Optional, True if cache is shared across all nodes
|
243 |
+
num_workers: Optional, number of workers to be used for download
|
244 |
+
max_num_workers: max number of workers to be used for download, used when setting num_workers automatically
|
245 |
+
"""
|
246 |
+
if isinstance(manifest_filepaths, str):
|
247 |
+
manifest_filepaths = manifest_filepaths.split(',')
|
248 |
+
|
249 |
+
num_datastore_manifests = sum([is_datastore_path(f) for f in manifest_filepaths])
|
250 |
+
|
251 |
+
if num_datastore_manifests > 0:
|
252 |
+
# Local utility function
|
253 |
+
def cache_data(manifest_filepaths, cache_audio, num_workers, max_num_workers):
|
254 |
+
"""Cache manifests and audio data from object store.
|
255 |
+
"""
|
256 |
+
# Determine the number of workers to use
|
257 |
+
if num_workers is None:
|
258 |
+
num_workers = os.cpu_count() - 1
|
259 |
+
num_workers = min(num_workers, max_num_workers)
|
260 |
+
|
261 |
+
# Process each manifest file
|
262 |
+
for manifest_file in manifest_filepaths:
|
263 |
+
# If manifest is on a data store, then cache it.
|
264 |
+
# Otherwise, nothing to do.
|
265 |
+
if is_datastore_path(manifest_file):
|
266 |
+
logging.info('Cache manifest file: %s', manifest_file)
|
267 |
+
cached_manifest_file = DataStoreObject(manifest_file).get()
|
268 |
+
logging.info('Cached at: %s', str(cached_manifest_file))
|
269 |
+
|
270 |
+
if cache_audio:
|
271 |
+
# Each audio file from manifest will be cached.
|
272 |
+
logging.info('Cache audio from manifest file: %s', manifest_file)
|
273 |
+
# Assumes that manifest is using relative paths
|
274 |
+
manifest_dir = os.path.dirname(manifest_file)
|
275 |
+
# Prepare all store objects
|
276 |
+
audio_objects = []
|
277 |
+
with open(cached_manifest_file, 'r') as f:
|
278 |
+
for line in f:
|
279 |
+
item = json.loads(line)
|
280 |
+
store_path = os.path.join(manifest_dir, item['audio_filepath'])
|
281 |
+
audio_objects.append(DataStoreObject(store_path=store_path))
|
282 |
+
|
283 |
+
if num_workers is not None and num_workers > 1:
|
284 |
+
logging.debug('Using multiprocessing with num_workers: %d.', num_workers)
|
285 |
+
with multiprocessing.Pool(processes=num_workers) as p:
|
286 |
+
result = list(
|
287 |
+
tqdm(p.imap(datastore_object_get, audio_objects), total=len(audio_objects))
|
288 |
+
)
|
289 |
+
else:
|
290 |
+
logging.debug('Using a single process.')
|
291 |
+
result = []
|
292 |
+
for audio_object in tqdm(audio_objects):
|
293 |
+
result.append(audio_object.get() is not None)
|
294 |
+
|
295 |
+
if not all(result):
|
296 |
+
raise RuntimeError('Some files not downloaded successfully')
|
297 |
+
logging.info('Caching complete')
|
298 |
+
|
299 |
+
else:
|
300 |
+
# Nothing to do here
|
301 |
+
logging.debug('Manifest is not on a data store: %s', manifest_file)
|
302 |
+
|
303 |
+
if torch.distributed.is_available() and torch.distributed.is_initialized():
|
304 |
+
logging.debug('Distributed environment is available and initialized.')
|
305 |
+
|
306 |
+
# Handle distributed environment
|
307 |
+
if shared_cache is None:
|
308 |
+
shared_cache = is_datastore_cache_shared()
|
309 |
+
|
310 |
+
if shared_cache:
|
311 |
+
logging.debug('Cache is shared among nodes, cache data on global rank zero.')
|
312 |
+
is_rank_zero = is_global_rank_zero()
|
313 |
+
else:
|
314 |
+
logging.debug('Cache is not shared among nodes, cache data on local rank zero.')
|
315 |
+
local_rank = int(os.environ.get("LOCAL_RANK", 0))
|
316 |
+
is_rank_zero = local_rank == 0
|
317 |
+
|
318 |
+
if is_rank_zero:
|
319 |
+
logging.info('Cache data from %s rank 0', 'global' if shared_cache else 'local')
|
320 |
+
cache_data(
|
321 |
+
manifest_filepaths=manifest_filepaths,
|
322 |
+
cache_audio=cache_audio,
|
323 |
+
num_workers=num_workers,
|
324 |
+
max_num_workers=max_num_workers,
|
325 |
+
)
|
326 |
+
logging.debug('Reached barrier')
|
327 |
+
torch.distributed.barrier()
|
328 |
+
|
329 |
+
elif is_global_rank_zero():
|
330 |
+
# Handle non-distributed environment, e.g., if running on a single GPU
|
331 |
+
logging.warning(
|
332 |
+
'Torch distributed is not initialized and caching may be prone to data race conditions. '
|
333 |
+
'Now caching data from global rank 0. If there are other ranks and they pass this '
|
334 |
+
'before rank 0, errors might result.'
|
335 |
+
)
|
336 |
+
cache_data(
|
337 |
+
manifest_filepaths=manifest_filepaths,
|
338 |
+
cache_audio=cache_audio,
|
339 |
+
num_workers=num_workers,
|
340 |
+
max_num_workers=max_num_workers,
|
341 |
+
)
|
342 |
+
else:
|
343 |
+
raise RuntimeError(
|
344 |
+
'Torch distributed is not initialized and caching on nodes other than global rank zero is disabled '
|
345 |
+
'to avoid race condition between different ranks. To ensure distributed environment is '
|
346 |
+
'initialized, please update data config to use `defer_setup = True`.'
|
347 |
+
)
|
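A hedged usage sketch (the remote URL below is a hypothetical object-store path; manifests that are not on a data store are simply skipped):

# Hypothetical remote manifest plus a local one; only the remote entry is cached.
cache_datastore_manifests(
    manifest_filepaths='ais://bucket/train_manifest.json,/local/dev_manifest.json',
    cache_audio=True,   # also cache every audio file referenced by the remote manifest
    num_workers=4,      # parallel downloads
)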
348 |
+
|
349 |
+
|
350 |
+
"""Optionally expand / shard the list of manifests
|
351 |
+
This is made to use the same notation as the sharded audio files
|
352 |
+
|
353 |
+
Args:
|
354 |
+
manifest_filepaths: list of manifest files (the sharded notation)
|
355 |
+
shard_strategy: scatter or replicate (scatter by default)
|
356 |
+
shard_manifests: bool, if False, no sharding / manifest filepath expansion will be attempted
|
357 |
+
global_rank: int, the rank of this worker
|
358 |
+
world_size: int, total number of workers
|
359 |
+
"""
|
360 |
+
|
361 |
+
|
362 |
+
def shard_manifests_if_needed(
|
363 |
+
manifest_filepaths: Union[str, List[str]],
|
364 |
+
shard_strategy: str,
|
365 |
+
shard_manifests: bool,
|
366 |
+
global_rank: int,
|
367 |
+
world_size: int,
|
368 |
+
):
|
369 |
+
if shard_manifests:
|
370 |
+
if not torch.distributed.is_available():
|
371 |
+
logging.warning("Not running in torch.distributed mode. Manifest sharding not available")
|
372 |
+
return manifest_filepaths
|
373 |
+
|
374 |
+
if not torch.distributed.is_initialized():
|
375 |
+
logging.warning(
|
376 |
+
'Manifest sharding was requested but torch.distributed is not initialized. '
|
377 |
+
'Did you intend to set the defer_setup flag?'
|
378 |
+
)
|
379 |
+
return manifest_filepaths
|
380 |
+
|
381 |
+
manifest_filepaths = expand_sharded_filepaths(
|
382 |
+
sharded_filepaths=manifest_filepaths,
|
383 |
+
shard_strategy=shard_strategy,
|
384 |
+
world_size=world_size,
|
385 |
+
global_rank=global_rank,
|
386 |
+
)
|
387 |
+
|
388 |
+
return manifest_filepaths
|
389 |
+
|
390 |
+
|
391 |
+
class _AudioTextDataset(Dataset):
|
392 |
+
"""
|
393 |
+
Dataset that loads tensors via a json file containing paths to audio files, transcripts, and durations (in seconds).
|
394 |
+
Each new line is a different sample. Example below:
|
395 |
+
{"audio_filepath": "/path/to/audio.wav", "text_filepath": "/path/to/audio.txt", "duration": 23.147}
|
396 |
+
...
|
397 |
+
{"audio_filepath": "/path/to/audio.wav", "text": "the transcription", "offset": 301.75, "duration": 0.82, "utt":
|
398 |
+
"utterance_id", "ctm_utt": "en_4156", "side": "A"}
|
399 |
+
Args:
|
400 |
+
manifest_filepath: Path to manifest json as described above. Can be comma-separated paths.
|
401 |
+
parser: Str for a language specific preprocessor or a callable.
|
402 |
+
sample_rate (int): Sample rate to resample loaded audio to
|
403 |
+
int_values (bool): If true, load samples as 32-bit integers. Defaults to False.
|
404 |
+
augmentor (nemo.collections.asr.parts.perturb.AudioAugmentor): An AudioAugmentor object used to augment loaded
|
405 |
+
audio
|
406 |
+
max_duration: If audio exceeds this length, do not include in dataset
|
407 |
+
min_duration: If audio is less than this length, do not include in dataset
|
408 |
+
max_utts: Limit number of utterances
|
409 |
+
trim: whether or not to trim silence. Defaults to False
|
410 |
+
bos_id: Id of beginning of sequence symbol to append if not None
|
411 |
+
eos_id: Id of end of sequence symbol to append if not None
|
412 |
+
pad_id: Id of pad symbol. Defaults to 0
|
413 |
+
return_sample_id (bool): whether to return the sample_id as a part of each sample
|
414 |
+
channel_selector (int | Iterable[int] | str): select a single channel or a subset of channels from multi-channel audio. If set to `'average'`, it performs averaging across channels. Disabled if set to `None`. Defaults to `None`. Uses zero-based indexing.
|
415 |
+
"""
|
416 |
+
|
417 |
+
@property
|
418 |
+
def output_types(self) -> Optional[Dict[str, NeuralType]]:
|
419 |
+
"""Returns definitions of module output ports.
|
420 |
+
"""
|
421 |
+
return {
|
422 |
+
'audio_signal': NeuralType(('B', 'T'), AudioSignal()),
|
423 |
+
'a_sig_length': NeuralType(tuple('B'), LengthsType()),
|
424 |
+
'transcripts': NeuralType(('B', 'T'), LabelsType()),
|
425 |
+
'transcript_length': NeuralType(tuple('B'), LengthsType()),
|
426 |
+
'sample_id': NeuralType(tuple('B'), LengthsType(), optional=True),
|
427 |
+
}
|
428 |
+
|
429 |
+
def __init__(
|
430 |
+
self,
|
431 |
+
manifest_filepath: str,
|
432 |
+
parser: Union[str, Callable],
|
433 |
+
sample_rate: int,
|
434 |
+
int_values: bool = False,
|
435 |
+
augmentor: 'nemo.collections.asr.parts.perturb.AudioAugmentor' = None,
|
436 |
+
max_duration: Optional[int] = None,
|
437 |
+
min_duration: Optional[int] = None,
|
438 |
+
max_utts: int = 0,
|
439 |
+
trim: bool = False,
|
440 |
+
bos_id: Optional[int] = None,
|
441 |
+
eos_id: Optional[int] = None,
|
442 |
+
pad_id: int = 0,
|
443 |
+
return_sample_id: bool = False,
|
444 |
+
channel_selector: Optional[ChannelSelectorType] = None,
|
445 |
+
):
|
446 |
+
if isinstance(manifest_filepath, str):
|
447 |
+
manifest_filepath = manifest_filepath.split(",")
|
448 |
+
|
449 |
+
# If necessary, cache manifests and audio from object store
|
450 |
+
cache_datastore_manifests(manifest_filepaths=manifest_filepath, cache_audio=True)
|
451 |
+
|
452 |
+
self.manifest_processor = ASRManifestProcessor(
|
453 |
+
manifest_filepath=manifest_filepath,
|
454 |
+
parser=parser,
|
455 |
+
max_duration=max_duration,
|
456 |
+
min_duration=min_duration,
|
457 |
+
max_utts=max_utts,
|
458 |
+
bos_id=bos_id,
|
459 |
+
eos_id=eos_id,
|
460 |
+
pad_id=pad_id,
|
461 |
+
)
|
462 |
+
self.featurizer = WaveformFeaturizer(sample_rate=sample_rate, int_values=int_values, augmentor=augmentor)
|
463 |
+
self.trim = trim
|
464 |
+
self.return_sample_id = return_sample_id
|
465 |
+
self.channel_selector = channel_selector
|
466 |
+
|
467 |
+
def get_manifest_sample(self, sample_id):
|
468 |
+
return self.manifest_processor.collection[sample_id]
|
469 |
+
|
470 |
+
def __getitem__(self, index):
|
471 |
+
sample = self.manifest_processor.collection[index]
|
472 |
+
offset = sample.offset
|
473 |
+
|
474 |
+
if offset is None:
|
475 |
+
offset = 0
|
476 |
+
|
477 |
+
features = self.featurizer.process(
|
478 |
+
sample.audio_file,
|
479 |
+
offset=offset,
|
480 |
+
duration=sample.duration,
|
481 |
+
trim=self.trim,
|
482 |
+
orig_sr=sample.orig_sr,
|
483 |
+
channel_selector=self.channel_selector,
|
484 |
+
)
|
485 |
+
f, fl = features, torch.tensor(features.shape[0]).long()
|
486 |
+
|
487 |
+
t, tl = self.manifest_processor.process_text_by_sample(sample=sample)
|
488 |
+
|
489 |
+
if self.return_sample_id:
|
490 |
+
output = f, fl, torch.tensor(t).long(), torch.tensor(tl).long(), index
|
491 |
+
else:
|
492 |
+
output = f, fl, torch.tensor(t).long(), torch.tensor(tl).long()
|
493 |
+
|
494 |
+
return output
|
495 |
+
|
496 |
+
def __len__(self):
|
497 |
+
return len(self.manifest_processor.collection)
|
498 |
+
|
499 |
+
def _collate_fn(self, batch):
|
500 |
+
return _speech_collate_fn(batch, pad_id=self.manifest_processor.pad_id)
|
501 |
+
|
502 |
+
|
503 |
+
class AudioToCharDataset(_AudioTextDataset):
|
504 |
+
"""
|
505 |
+
Dataset that loads tensors via a json file containing paths to audio
|
506 |
+
files, transcripts, and durations (in seconds). Each new line is a
|
507 |
+
different sample. Example below:
|
508 |
+
{"audio_filepath": "/path/to/audio.wav", "text_filepath":
|
509 |
+
"/path/to/audio.txt", "duration": 23.147}
|
510 |
+
...
|
511 |
+
{"audio_filepath": "/path/to/audio.wav", "text": "the
|
512 |
+
transcription", "offset": 301.75, "duration": 0.82, "utt":
|
513 |
+
"utterance_id", "ctm_utt": "en_4156", "side": "A"}
|
514 |
+
|
515 |
+
Args:
|
516 |
+
manifest_filepath: Path to manifest json as described above. Can
|
517 |
+
be comma-separated paths.
|
518 |
+
labels: String containing all the possible characters to map to
|
519 |
+
sample_rate (int): Sample rate to resample loaded audio to
|
520 |
+
int_values (bool): If true, load samples as 32-bit integers. Defaults to False.
|
521 |
+
augmentor (nemo.collections.asr.parts.perturb.AudioAugmentor): An AudioAugmentor
|
522 |
+
object used to augment loaded audio
|
523 |
+
max_duration: If audio exceeds this length, do not include in dataset
|
524 |
+
min_duration: If audio is less than this length, do not include
|
525 |
+
in dataset
|
526 |
+
max_utts: Limit number of utterances
|
527 |
+
blank_index: blank character index, default = -1
|
528 |
+
unk_index: unk_character index, default = -1
|
529 |
+
normalize: whether to normalize transcript text. Defaults to True.
|
530 |
+
bos_id: Id of beginning of sequence symbol to append if not None
|
531 |
+
eos_id: Id of end of sequence symbol to append if not None
|
532 |
+
return_sample_id (bool): whether to return the sample_id as a part of each sample
|
533 |
+
channel_selector (int | Iterable[int] | str): select a single channel or a subset of channels from multi-channel audio. If set to `'average'`, it performs averaging across channels. Disabled if set to `None`. Defaults to `None`. Uses zero-based indexing.
|
534 |
+
"""
|
535 |
+
|
536 |
+
@property
|
537 |
+
def output_types(self) -> Optional[Dict[str, NeuralType]]:
|
538 |
+
"""Returns definitions of module output ports.
|
539 |
+
"""
|
540 |
+
return {
|
541 |
+
'audio_signal': NeuralType(('B', 'T'), AudioSignal()),
|
542 |
+
'a_sig_length': NeuralType(tuple('B'), LengthsType()),
|
543 |
+
'transcripts': NeuralType(('B', 'T'), LabelsType()),
|
544 |
+
'transcript_length': NeuralType(tuple('B'), LengthsType()),
|
545 |
+
'sample_id': NeuralType(tuple('B'), LengthsType(), optional=True),
|
546 |
+
}
|
547 |
+
|
548 |
+
def __init__(
|
549 |
+
self,
|
550 |
+
manifest_filepath: str,
|
551 |
+
labels: Union[str, List[str]],
|
552 |
+
sample_rate: int,
|
553 |
+
int_values: bool = False,
|
554 |
+
augmentor: 'nemo.collections.asr.parts.perturb.AudioAugmentor' = None,
|
555 |
+
max_duration: Optional[float] = None,
|
556 |
+
min_duration: Optional[float] = None,
|
557 |
+
max_utts: int = 0,
|
558 |
+
blank_index: int = -1,
|
559 |
+
unk_index: int = -1,
|
560 |
+
normalize: bool = True,
|
561 |
+
trim: bool = False,
|
562 |
+
bos_id: Optional[int] = None,
|
563 |
+
eos_id: Optional[int] = None,
|
564 |
+
pad_id: int = 0,
|
565 |
+
parser: Union[str, Callable] = 'en',
|
566 |
+
return_sample_id: bool = False,
|
567 |
+
channel_selector: Optional[ChannelSelectorType] = None,
|
568 |
+
):
|
569 |
+
self.labels = labels
|
570 |
+
|
571 |
+
parser = parsers.make_parser(
|
572 |
+
labels=labels, name=parser, unk_id=unk_index, blank_id=blank_index, do_normalize=normalize
|
573 |
+
)
|
574 |
+
|
575 |
+
super().__init__(
|
576 |
+
manifest_filepath=manifest_filepath,
|
577 |
+
parser=parser,
|
578 |
+
sample_rate=sample_rate,
|
579 |
+
int_values=int_values,
|
580 |
+
augmentor=augmentor,
|
581 |
+
max_duration=max_duration,
|
582 |
+
min_duration=min_duration,
|
583 |
+
max_utts=max_utts,
|
584 |
+
trim=trim,
|
585 |
+
bos_id=bos_id,
|
586 |
+
eos_id=eos_id,
|
587 |
+
pad_id=pad_id,
|
588 |
+
return_sample_id=return_sample_id,
|
589 |
+
channel_selector=channel_selector,
|
590 |
+
)
|
591 |
+
|
592 |
+
|
593 |
+
class AudioToBPEDataset(_AudioTextDataset):
|
594 |
+
"""
|
595 |
+
Dataset that loads tensors via a json file containing paths to audio
|
596 |
+
files, transcripts, and durations (in seconds). Each new line is a
|
597 |
+
different sample. Example below:
|
598 |
+
{"audio_filepath": "/path/to/audio.wav", "text_filepath":
|
599 |
+
"/path/to/audio.txt", "duration": 23.147}
|
600 |
+
...
|
601 |
+
{"audio_filepath": "/path/to/audio.wav", "text": "the
|
602 |
+
transcription", "offset": 301.75, "duration": 0.82, "utt":
|
603 |
+
"utterance_id", "ctm_utt": "en_4156", "side": "A"}
|
604 |
+
|
605 |
+
In practice, the dataset and manifest used for character encoding and byte pair encoding
|
606 |
+
are exactly the same. The only difference lies in how the dataset tokenizes the text in
|
607 |
+
the manifest.
|
608 |
+
|
609 |
+
Args:
|
610 |
+
manifest_filepath: Path to manifest json as described above. Can
|
611 |
+
be comma-separated paths.
|
612 |
+
tokenizer: A subclass of the Tokenizer wrapper found in the common collection,
|
613 |
+
nemo.collections.common.tokenizers.TokenizerSpec. ASR Models support a subset of
|
614 |
+
all available tokenizers.
|
615 |
+
sample_rate (int): Sample rate to resample loaded audio to
|
616 |
+
int_values (bool): If true, load samples as 32-bit integers. Defaults to False.
|
617 |
+
augmentor (nemo.collections.asr.parts.perturb.AudioAugmentor): An AudioAugmentor
|
618 |
+
object used to augment loaded audio
|
619 |
+
max_duration: If audio exceeds this length, do not include in dataset
|
620 |
+
min_duration: If audio is less than this length, do not include
|
621 |
+
in dataset
|
622 |
+
max_utts: Limit number of utterances
|
623 |
+
trim: Whether to trim silence segments
|
624 |
+
use_start_end_token: Boolean which dictates whether to add [BOS] and [EOS]
|
625 |
+
tokens to beginning and ending of speech respectively.
|
626 |
+
return_sample_id (bool): whether to return the sample_id as a part of each sample
|
627 |
+
channel_selector (int | Iterable[int] | str): select a single channel or a subset of channels from multi-channel audio. If set to `'average'`, it performs averaging across channels. Disabled if set to `None`. Defaults to `None`. Uses zero-based indexing.
|
628 |
+
"""
|
629 |
+
|
630 |
+
@property
|
631 |
+
def output_types(self) -> Optional[Dict[str, NeuralType]]:
|
632 |
+
"""Returns definitions of module output ports.
|
633 |
+
"""
|
634 |
+
return {
|
635 |
+
'audio_signal': NeuralType(('B', 'T'), AudioSignal()),
|
636 |
+
'a_sig_length': NeuralType(tuple('B'), LengthsType()),
|
637 |
+
'transcripts': NeuralType(('B', 'T'), LabelsType()),
|
638 |
+
'transcript_length': NeuralType(tuple('B'), LengthsType()),
|
639 |
+
'sample_id': NeuralType(tuple('B'), LengthsType(), optional=True),
|
640 |
+
}
|
641 |
+
|
642 |
+
def __init__(
|
643 |
+
self,
|
644 |
+
manifest_filepath: str,
|
645 |
+
tokenizer: 'nemo.collections.common.tokenizers.TokenizerSpec',
|
646 |
+
sample_rate: int,
|
647 |
+
int_values: bool = False,
|
648 |
+
augmentor: 'nemo.collections.asr.parts.perturb.AudioAugmentor' = None,
|
649 |
+
max_duration: Optional[int] = None,
|
650 |
+
min_duration: Optional[int] = None,
|
651 |
+
max_utts: int = 0,
|
652 |
+
trim: bool = False,
|
653 |
+
use_start_end_token: bool = True,
|
654 |
+
return_sample_id: bool = False,
|
655 |
+
channel_selector: Optional[ChannelSelectorType] = None,
|
656 |
+
):
|
657 |
+
if use_start_end_token and hasattr(tokenizer, "bos_id") and tokenizer.bos_id > 0:
|
658 |
+
bos_id = tokenizer.bos_id
|
659 |
+
else:
|
660 |
+
bos_id = None
|
661 |
+
|
662 |
+
if use_start_end_token and hasattr(tokenizer, "eos_id") and tokenizer.eos_id > 0:
|
663 |
+
eos_id = tokenizer.eos_id
|
664 |
+
else:
|
665 |
+
eos_id = None
|
666 |
+
|
667 |
+
if hasattr(tokenizer, "pad_id") and tokenizer.pad_id > 0:
|
668 |
+
pad_id = tokenizer.pad_id
|
669 |
+
else:
|
670 |
+
pad_id = 0
|
671 |
+
|
672 |
+
class TokenizerWrapper:
|
673 |
+
def __init__(self, tokenizer):
|
674 |
+
if isinstance(tokenizer, tokenizers.aggregate_tokenizer.AggregateTokenizer):
|
675 |
+
self.is_aggregate = True
|
676 |
+
else:
|
677 |
+
self.is_aggregate = False
|
678 |
+
self._tokenizer = tokenizer
|
679 |
+
|
680 |
+
def __call__(self, *args):
|
681 |
+
if isinstance(args[0], List) and self.is_aggregate:
|
682 |
+
t = []
|
683 |
+
for span in args[0]:
|
684 |
+
t.extend(self._tokenizer.text_to_ids(span['str'], span['lang']))
|
685 |
+
return t
|
686 |
+
|
687 |
+
t = self._tokenizer.text_to_ids(*args)
|
688 |
+
return t
|
689 |
+
|
690 |
+
super().__init__(
|
691 |
+
manifest_filepath=manifest_filepath,
|
692 |
+
parser=TokenizerWrapper(tokenizer),
|
693 |
+
sample_rate=sample_rate,
|
694 |
+
int_values=int_values,
|
695 |
+
augmentor=augmentor,
|
696 |
+
max_duration=max_duration,
|
697 |
+
min_duration=min_duration,
|
698 |
+
max_utts=max_utts,
|
699 |
+
bos_id=bos_id,
|
700 |
+
eos_id=eos_id,
|
701 |
+
pad_id=pad_id,
|
702 |
+
trim=trim,
|
703 |
+
return_sample_id=return_sample_id,
|
704 |
+
channel_selector=channel_selector,
|
705 |
+
)
|
706 |
+
|
707 |
+
|
708 |
+
class _TarredAudioToTextDataset(IterableDataset):
|
709 |
+
"""
|
710 |
+
A similar Dataset to the AudioToCharDataset/AudioToBPEDataset, but which loads tarred audio files.
|
711 |
+
|
712 |
+
Accepts a single comma-separated JSON manifest file (in the same style as for the AudioToCharDataset/AudioToBPEDataset),
|
713 |
+
as well as the path(s) to the tarball(s) containing the wav files. Each line of the manifest should
|
714 |
+
contain the information for one audio file, including at least the transcript and name of the audio
|
715 |
+
file within the tarball.
|
716 |
+
|
717 |
+
Valid formats for the audio_tar_filepaths argument include:
|
718 |
+
(1) a single string that can be brace-expanded, e.g. 'path/to/audio.tar' or 'path/to/audio_{1..100}.tar.gz', or
|
719 |
+
(2) a list of file paths that will not be brace-expanded, e.g. ['audio_1.tar', 'audio_2.tar', ...].
|
720 |
+
|
721 |
+
Note: For brace expansion in (1), there may be cases where `{x..y}` syntax cannot be used due to shell interference.
|
722 |
+
This occurs most commonly inside SLURM scripts. Therefore we provide a few equivalent replacements.
|
723 |
+
Supported opening braces - { <=> (, [, < and the special tag _OP_.
|
724 |
+
Supported closing braces - } <=> ), ], > and the special tag _CL_.
|
725 |
+
For SLURM based tasks, we suggest the use of the special tags for ease of use.
|
726 |
+
|
727 |
+
See the WebDataset documentation for more information about accepted data and input formats.
|
728 |
+
|
729 |
+
If using multiple workers the number of shards should be divisible by world_size to ensure an
|
730 |
+
even split among workers. If it is not divisible, logging will give a warning but training will proceed.
|
731 |
+
In addition, if using mutiprocessing, each shard MUST HAVE THE SAME NUMBER OF ENTRIES after filtering
|
732 |
+
is applied. We currently do not check for this, but your program may hang if the shards are uneven!
|
733 |
+
|
734 |
+
Notice that a few arguments are different from the AudioToCharDataset; for example, shuffle (bool) has been
|
735 |
+
replaced by shuffle_n (int).
|
736 |
+
|
737 |
+
Additionally, please note that the len() of this DataLayer is assumed to be the length of the manifest
|
738 |
+
after filtering. An incorrect manifest length may lead to some DataLoader issues down the line.
|
739 |
+
|
740 |
+
Args:
|
741 |
+
audio_tar_filepaths: Either a list of audio tarball filepaths, or a
|
742 |
+
string (can be brace-expandable).
|
743 |
+
manifest_filepath (str): Path to the manifest.
|
744 |
+
parser (callable): A callable which is used to pre-process the text output.
|
745 |
+
sample_rate (int): Sample rate to resample loaded audio to
|
746 |
+
int_values (bool): If true, load samples as 32-bit integers. Defaults to False.
|
747 |
+
augmentor (nemo.collections.asr.parts.perturb.AudioAugmentor): An AudioAugmentor
|
748 |
+
object used to augment loaded audio
|
749 |
+
shuffle_n (int): How many samples to look ahead and load to be shuffled.
|
750 |
+
See WebDataset documentation for more details.
|
751 |
+
Defaults to 0.
|
752 |
+
min_duration (float): Dataset parameter.
|
753 |
+
All training files which have a duration less than min_duration
|
754 |
+
are dropped. Note: Duration is read from the manifest JSON.
|
755 |
+
Defaults to 0.1.
|
756 |
+
max_duration (float): Dataset parameter.
|
757 |
+
All training files which have a duration more than max_duration
|
758 |
+
are dropped. Note: Duration is read from the manifest JSON.
|
759 |
+
Defaults to None.
|
760 |
+
blank_index (int): Blank character index, defaults to -1.
|
761 |
+
unk_index (int): Unknown character index, defaults to -1.
|
762 |
+
normalize (bool): Dataset parameter.
|
763 |
+
Whether to use automatic text cleaning.
|
764 |
+
It is highly recommended to manually clean text for best results.
|
765 |
+
Defaults to True.
|
766 |
+
trim (bool): Whether to use trim silence from beginning and end
|
767 |
+
of audio signal using librosa.effects.trim().
|
768 |
+
Defaults to False.
|
769 |
+
bos_id (id): Dataset parameter.
|
770 |
+
Beginning of string symbol id used for seq2seq models.
|
771 |
+
Defaults to None.
|
772 |
+
eos_id (id): Dataset parameter.
|
773 |
+
End of string symbol id used for seq2seq models.
|
774 |
+
Defaults to None.
|
775 |
+
pad_id (id): Token used to pad when collating samples in batches.
|
776 |
+
If this is None, pads using 0s.
|
777 |
+
Defaults to None.
|
778 |
+
shard_strategy (str): Tarred dataset shard distribution strategy chosen as a str value during ddp.
|
779 |
+
- `scatter`: The default shard strategy applied by WebDataset, where each node gets
|
780 |
+
a unique set of shards, which are permanently pre-allocated and never changed at runtime.
|
781 |
+
- `replicate`: Optional shard strategy, where each node gets all of the set of shards
|
782 |
+
available in the tarred dataset, which are permanently pre-allocated and never changed at runtime.
|
783 |
+
The benefit of replication is that it allows each node to sample data points from the entire
|
784 |
+
dataset independently of other nodes, and reduces dependence on value of `shuffle_n`.
|
785 |
+
|
786 |
+
.. warning::
|
787 |
+
Replicated strategy allows every node to sample the entire set of available tarfiles,
|
788 |
+
and therefore more than one node may sample the same tarfile, and even sample the same
|
789 |
+
data points! As such, there is no assured guarantee that all samples in the dataset will be
|
790 |
+
sampled at least once during 1 epoch. Scattered strategy, on the other hand, on specific
|
791 |
+
occasions (when the number of shards is not divisible with ``world_size``), will not sample
|
792 |
+
the entire dataset. For these reasons it is not advisable to use tarred datasets as validation
|
793 |
+
or test datasets.
|
794 |
+
shard_manifests (bool): Whether or not to try / shard manifests. Defaults to False.
|
795 |
+
global_rank (int): Worker rank, used for partitioning shards. Defaults to 0.
|
796 |
+
world_size (int): Total number of processes, used for partitioning shards. Defaults to 0.
|
797 |
+
return_sample_id (bool): whether to return the sample_id as a part of each sample
|
798 |
+
"""
|
799 |
+
|
800 |
+
def __init__(
|
801 |
+
self,
|
802 |
+
audio_tar_filepaths: Union[str, List[str]],
|
803 |
+
manifest_filepath: str,
|
804 |
+
parser: Callable,
|
805 |
+
sample_rate: int,
|
806 |
+
int_values: bool = False,
|
807 |
+
augmentor: Optional['nemo.collections.asr.parts.perturb.AudioAugmentor'] = None,
|
808 |
+
shuffle_n: int = 0,
|
809 |
+
min_duration: Optional[float] = None,
|
810 |
+
max_duration: Optional[float] = None,
|
811 |
+
trim: bool = False,
|
812 |
+
bos_id: Optional[int] = None,
|
813 |
+
eos_id: Optional[int] = None,
|
814 |
+
pad_id: int = 0,
|
815 |
+
shard_strategy: str = "scatter",
|
816 |
+
shard_manifests: bool = False,
|
817 |
+
global_rank: int = 0,
|
818 |
+
world_size: int = 0,
|
819 |
+
return_sample_id: bool = False,
|
820 |
+
):
|
821 |
+
self.shard_manifests = shard_manifests
|
822 |
+
|
823 |
+
# Shard manifests if necessary and possible and then expand the paths
|
824 |
+
manifest_filepath = shard_manifests_if_needed(
|
825 |
+
shard_manifests=shard_manifests,
|
826 |
+
shard_strategy=shard_strategy,
|
827 |
+
manifest_filepaths=manifest_filepath,
|
828 |
+
world_size=world_size,
|
829 |
+
global_rank=global_rank,
|
830 |
+
)
|
831 |
+
|
832 |
+
# If necessary, cache manifests from object store
|
833 |
+
cache_datastore_manifests(manifest_filepaths=manifest_filepath)
|
834 |
+
|
835 |
+
self.manifest_processor = ASRManifestProcessor(
|
836 |
+
manifest_filepath=manifest_filepath,
|
837 |
+
parser=parser,
|
838 |
+
max_duration=max_duration,
|
839 |
+
min_duration=min_duration,
|
840 |
+
max_utts=0,
|
841 |
+
bos_id=bos_id,
|
842 |
+
eos_id=eos_id,
|
843 |
+
pad_id=pad_id,
|
844 |
+
index_by_file_id=True, # Must set this so the manifest lines can be indexed by file ID
|
845 |
+
)
|
846 |
+
|
847 |
+
self.len = self._compute_len()
|
848 |
+
|
849 |
+
self.featurizer = WaveformFeaturizer(sample_rate=sample_rate, int_values=int_values, augmentor=augmentor)
|
850 |
+
self.trim = trim
|
851 |
+
self.eos_id = eos_id
|
852 |
+
self.bos_id = bos_id
|
853 |
+
self.pad_id = pad_id
|
854 |
+
self.return_sample_id = return_sample_id
|
855 |
+
|
856 |
+
audio_tar_filepaths = expand_sharded_filepaths(
|
857 |
+
sharded_filepaths=audio_tar_filepaths,
|
858 |
+
shard_strategy=shard_strategy,
|
859 |
+
world_size=world_size,
|
860 |
+
global_rank=global_rank,
|
861 |
+
)
|
862 |
+
|
863 |
+
# Put together WebDataset
|
864 |
+
self._dataset = wd.WebDataset(urls=audio_tar_filepaths, nodesplitter=None)
|
865 |
+
|
866 |
+
if shuffle_n > 0:
|
867 |
+
self._dataset = self._dataset.shuffle(shuffle_n)
|
868 |
+
else:
|
869 |
+
logging.info("WebDataset will not shuffle files within the tar files.")
|
870 |
+
|
871 |
+
self._dataset = (
|
872 |
+
self._dataset.rename(audio='wav;ogg;flac', key='__key__')
|
873 |
+
.to_tuple('audio', 'key')
|
874 |
+
.pipe(self._filter)
|
875 |
+
.pipe(self._loop_offsets)
|
876 |
+
.map(f=self._build_sample)
|
877 |
+
)
|
878 |
+
|
879 |
+
def _filter(self, iterator):
|
880 |
+
"""This function is used to remove samples that have been filtered out by ASRAudioText already.
|
881 |
+
Otherwise, we would get a KeyError as _build_sample attempts to find the manifest entry for a sample
|
882 |
+
that was filtered out (e.g. for duration).
|
883 |
+
Note that if using multi-GPU training, filtering may lead to an imbalance in samples in each shard,
|
884 |
+
which may make your code hang as one process will finish before the other.
|
885 |
+
"""
|
886 |
+
|
887 |
+
class TarredAudioFilter:
|
888 |
+
def __init__(self, collection):
|
889 |
+
self.iterator = iterator
|
890 |
+
self.collection = collection
|
891 |
+
|
892 |
+
def __iter__(self):
|
893 |
+
return self
|
894 |
+
|
895 |
+
def __next__(self):
|
896 |
+
while True:
|
897 |
+
audio_bytes, audio_filename = next(self.iterator)
|
898 |
+
file_id, _ = os.path.splitext(os.path.basename(audio_filename))
|
899 |
+
if file_id in self.collection.mapping:
|
900 |
+
return audio_bytes, audio_filename
|
901 |
+
|
902 |
+
return TarredAudioFilter(self.manifest_processor.collection)
|
903 |
+
|
904 |
+
def _loop_offsets(self, iterator):
|
905 |
+
"""This function is used to iterate through utterances with different offsets for each file.
|
906 |
+
"""
|
907 |
+
|
908 |
+
class TarredAudioLoopOffsets:
|
909 |
+
def __init__(self, collection):
|
910 |
+
self.iterator = iterator
|
911 |
+
self.collection = collection
|
912 |
+
self.current_fn = None
|
913 |
+
self.current_bytes = None
|
914 |
+
self.offset_id = 0
|
915 |
+
|
916 |
+
def __iter__(self):
|
917 |
+
return self
|
918 |
+
|
919 |
+
def __next__(self):
|
920 |
+
if self.current_fn is None:
|
921 |
+
self.current_bytes, self.current_fn = next(self.iterator)
|
922 |
+
self.offset_id = 0
|
923 |
+
else:
|
924 |
+
offset_list = self.collection.mapping[self.current_fn]
|
925 |
+
if len(offset_list) == self.offset_id + 1:
|
926 |
+
self.current_bytes, self.current_fn = next(self.iterator)
|
927 |
+
self.offset_id = 0
|
928 |
+
else:
|
929 |
+
self.offset_id += 1
|
930 |
+
|
931 |
+
return self.current_bytes, self.current_fn, self.offset_id
|
932 |
+
|
933 |
+
return TarredAudioLoopOffsets(self.manifest_processor.collection)
|
934 |
+
|
935 |
+
def _collate_fn(self, batch):
|
936 |
+
return _speech_collate_fn(batch, self.pad_id)
|
937 |
+
|
938 |
+
def _build_sample(self, tup):
|
939 |
+
"""Builds the training sample by combining the data from the WebDataset with the manifest info.
|
940 |
+
"""
|
941 |
+
audio_bytes, audio_filename, offset_id = tup
|
942 |
+
|
943 |
+
# Grab the manifest entry from self.manifest_processor.collection
|
944 |
+
file_id, _ = os.path.splitext(os.path.basename(audio_filename))
|
945 |
+
manifest_idx = self.manifest_processor.collection.mapping[file_id][offset_id]
|
946 |
+
manifest_entry = self.manifest_processor.collection[manifest_idx]
|
947 |
+
|
948 |
+
offset = manifest_entry.offset
|
949 |
+
if offset is None:
|
950 |
+
offset = 0
|
951 |
+
|
952 |
+
# Convert audio bytes to IO stream for processing (for SoundFile to read)
|
953 |
+
audio_filestream = io.BytesIO(audio_bytes)
|
954 |
+
features = self.featurizer.process(
|
955 |
+
audio_filestream,
|
956 |
+
offset=offset,
|
957 |
+
duration=manifest_entry.duration,
|
958 |
+
trim=self.trim,
|
959 |
+
orig_sr=manifest_entry.orig_sr,
|
960 |
+
)
|
961 |
+
audio_filestream.close()
|
962 |
+
|
963 |
+
# Audio features
|
964 |
+
f, fl = features, torch.tensor(features.shape[0]).long()
|
965 |
+
|
966 |
+
# Text features
|
967 |
+
t, tl = manifest_entry.text_tokens, len(manifest_entry.text_tokens)
|
968 |
+
|
969 |
+
self.manifest_processor.process_text_by_sample(sample=manifest_entry)
|
970 |
+
|
971 |
+
if self.bos_id is not None:
|
972 |
+
t = [self.bos_id] + t
|
973 |
+
tl += 1
|
974 |
+
if self.eos_id is not None:
|
975 |
+
t = t + [self.eos_id]
|
976 |
+
tl += 1
|
977 |
+
|
978 |
+
if self.return_sample_id:
|
979 |
+
return f, fl, torch.tensor(t).long(), torch.tensor(tl).long(), manifest_idx
|
980 |
+
else:
|
981 |
+
return f, fl, torch.tensor(t).long(), torch.tensor(tl).long()
|
982 |
+
|
983 |
+
def get_manifest_sample(self, sample_id):
|
984 |
+
return self.manifest_processor.collection[sample_id]
|
985 |
+
|
986 |
+
def __iter__(self):
|
987 |
+
return self._dataset.__iter__()
|
988 |
+
|
989 |
+
def _compute_len(self):
|
990 |
+
if self.shard_manifests and torch.distributed.is_available() and torch.distributed.is_initialized():
|
991 |
+
my_len = torch.tensor(len(self.manifest_processor.collection), dtype=torch.int32).cuda()
|
992 |
+
torch.distributed.all_reduce(my_len)
|
993 |
+
my_len = my_len.int()
|
994 |
+
logging.info(f'Sharded manifests: Total length: {my_len}')
|
995 |
+
else:
|
996 |
+
my_len = len(self.manifest_processor.collection)
|
997 |
+
|
998 |
+
return my_len
|
999 |
+
|
1000 |
+
def __len__(self):
|
1001 |
+
return self.len
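# --- Illustrative sketch (assumption: not the actual NeMo helper) ---
# A conceptual illustration of the two shard strategies described in the docstrings:
# "scatter" hands each rank a disjoint slice of the shard list, while "replicate"
# hands every rank the full list. The real logic lives in expand_sharded_filepaths.
def select_shards(shards, shard_strategy, global_rank, world_size):
    if shard_strategy == "replicate":
        return list(shards)
    if shard_strategy == "scatter":
        # When len(shards) is not divisible by world_size, trailing shards are dropped,
        # which is why an even split is recommended in the docstrings above.
        per_rank = len(shards) // world_size
        return shards[per_rank * global_rank : per_rank * (global_rank + 1)]
    raise ValueError(f"Unknown shard strategy: {shard_strategy}")

# Example: 4 shards scattered over 2 ranks -> rank 0 sees ['audio_0.tar', 'audio_1.tar'].
# select_shards([f"audio_{i}.tar" for i in range(4)], "scatter", global_rank=0, world_size=2)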
|
1002 |
+
|
1003 |
+
|
1004 |
+
class TarredAudioToCharDataset(_TarredAudioToTextDataset):
|
1005 |
+
"""
|
1006 |
+
A similar Dataset to the AudioToCharDataset, but which loads tarred audio files.
|
1007 |
+
|
1008 |
+
Accepts a single comma-separated JSON manifest file (in the same style as for the AudioToCharDataset),
|
1009 |
+
as well as the path(s) to the tarball(s) containing the wav files. Each line of the manifest should
|
1010 |
+
contain the information for one audio file, including at least the transcript and name of the audio
|
1011 |
+
file within the tarball.
|
1012 |
+
|
1013 |
+
Valid formats for the audio_tar_filepaths argument include:
|
1014 |
+
(1) a single string that can be brace-expanded, e.g. 'path/to/audio.tar' or 'path/to/audio_{1..100}.tar.gz', or
|
1015 |
+
(2) a list of file paths that will not be brace-expanded, e.g. ['audio_1.tar', 'audio_2.tar', ...].
|
1016 |
+
|
1017 |
+
See the WebDataset documentation for more information about accepted data and input formats.
|
1018 |
+
|
1019 |
+
If using multiple workers, the number of shards should be divisible by world_size to ensure an
|
1020 |
+
even split among workers. If it is not divisible, logging will give a warning but training will proceed.
|
1021 |
+
In addition, if using multiprocessing, each shard MUST HAVE THE SAME NUMBER OF ENTRIES after filtering
|
1022 |
+
is applied. We currently do not check for this, but your program may hang if the shards are uneven!
|
1023 |
+
|
1024 |
+
Notice that a few arguments are different from the AudioToCharDataset; for example, shuffle (bool) has been
|
1025 |
+
replaced by shuffle_n (int).
|
1026 |
+
|
1027 |
+
Additionally, please note that the len() of this DataLayer is assumed to be the length of the manifest
|
1028 |
+
after filtering. An incorrect manifest length may lead to some DataLoader issues down the line.
|
1029 |
+
|
1030 |
+
Args:
|
1031 |
+
audio_tar_filepaths: Either a list of audio tarball filepaths, or a
|
1032 |
+
string (can be brace-expandable).
|
1033 |
+
manifest_filepath (str): Path to the manifest.
|
1034 |
+
labels (list): List of characters that can be output by the ASR model.
|
1035 |
+
For Jasper, this is the 28 character set {a-z '}. The CTC blank
|
1036 |
+
symbol is automatically added later for models using ctc.
|
1037 |
+
sample_rate (int): Sample rate to resample loaded audio to
|
1038 |
+
int_values (bool): If True, load samples as 32-bit integers. Defaults to False.
|
1039 |
+
augmentor (nemo.collections.asr.parts.perturb.AudioAugmentor): An AudioAugmentor
|
1040 |
+
object used to augment loaded audio
|
1041 |
+
shuffle_n (int): How many samples to look ahead and load to be shuffled.
|
1042 |
+
See WebDataset documentation for more details.
|
1043 |
+
Defaults to 0.
|
1044 |
+
min_duration (float): Dataset parameter.
|
1045 |
+
All training files which have a duration less than min_duration
|
1046 |
+
are dropped. Note: Duration is read from the manifest JSON.
|
1047 |
+
Defaults to 0.1.
|
1048 |
+
max_duration (float): Dataset parameter.
|
1049 |
+
All training files which have a duration more than max_duration
|
1050 |
+
are dropped. Note: Duration is read from the manifest JSON.
|
1051 |
+
Defaults to None.
|
1052 |
+
blank_index (int): Blank character index, defaults to -1.
|
1053 |
+
unk_index (int): Unknown character index, defaults to -1.
|
1054 |
+
normalize (bool): Dataset parameter.
|
1055 |
+
Whether to use automatic text cleaning.
|
1056 |
+
It is highly recommended to manually clean text for best results.
|
1057 |
+
Defaults to True.
|
1058 |
+
trim (bool): Whether to trim silence from the beginning and end
|
1059 |
+
of the audio signal using librosa.effects.trim().
|
1060 |
+
Defaults to False.
|
1061 |
+
bos_id (id): Dataset parameter.
|
1062 |
+
Beginning of string symbol id used for seq2seq models.
|
1063 |
+
Defaults to None.
|
1064 |
+
eos_id (id): Dataset parameter.
|
1065 |
+
End of string symbol id used for seq2seq models.
|
1066 |
+
Defaults to None.
|
1067 |
+
pad_id (id): Token used to pad when collating samples in batches.
|
1068 |
+
If this is None, pads using 0s.
|
1069 |
+
Defaults to None.
|
1070 |
+
shard_strategy (str): Tarred dataset shard distribution strategy chosen as a str value during ddp.
|
1071 |
+
|
1072 |
+
- `scatter`: The default shard strategy applied by WebDataset, where each node gets
|
1073 |
+
a unique set of shards, which are permanently pre-allocated and never changed at runtime.
|
1074 |
+
- `replicate`: Optional shard strategy, where each node gets all of the set of shards
|
1075 |
+
available in the tarred dataset, which are permanently pre-allocated and never changed at runtime.
|
1076 |
+
The benefit of replication is that it allows each node to sample data points from the entire
|
1077 |
+
dataset independently of other nodes, and reduces dependence on value of `shuffle_n`.
|
1078 |
+
|
1079 |
+
.. warning::
|
1080 |
+
|
1081 |
+
Replicated strategy allows every node to sample the entire set of available tarfiles,
|
1082 |
+
and therefore more than one node may sample the same tarfile, and even sample the same
|
1083 |
+
data points! As such, there is no guarantee that all samples in the dataset will be
|
1084 |
+
sampled at least once during 1 epoch. Scattered strategy, on the other hand, on specific
|
1085 |
+
occasions (when the number of shards is not divisible by ``world_size``), will not sample
|
1086 |
+
the entire dataset. For these reasons it is not advisable to use tarred datasets as validation
|
1087 |
+
or test datasets.
|
1088 |
+
|
1089 |
+
global_rank (int): Worker rank, used for partitioning shards. Defaults to 0.
|
1090 |
+
world_size (int): Total number of processes, used for partitioning shards. Defaults to 0.
|
1091 |
+
return_sample_id (bool): whether to return the sample_id as a part of each sample
|
1092 |
+
"""
|
1093 |
+
|
1094 |
+
def __init__(
|
1095 |
+
self,
|
1096 |
+
audio_tar_filepaths: Union[str, List[str]],
|
1097 |
+
manifest_filepath: str,
|
1098 |
+
labels: List[str],
|
1099 |
+
sample_rate: int,
|
1100 |
+
int_values: bool = False,
|
1101 |
+
augmentor: Optional['nemo.collections.asr.parts.perturb.AudioAugmentor'] = None,
|
1102 |
+
shuffle_n: int = 0,
|
1103 |
+
min_duration: Optional[float] = None,
|
1104 |
+
max_duration: Optional[float] = None,
|
1105 |
+
blank_index: int = -1,
|
1106 |
+
unk_index: int = -1,
|
1107 |
+
normalize: bool = True,
|
1108 |
+
trim: bool = False,
|
1109 |
+
bos_id: Optional[int] = None,
|
1110 |
+
eos_id: Optional[int] = None,
|
1111 |
+
parser: Optional[str] = 'en',
|
1112 |
+
pad_id: int = 0,
|
1113 |
+
shard_strategy: str = "scatter",
|
1114 |
+
shard_manifests: bool = False,
|
1115 |
+
global_rank: int = 0,
|
1116 |
+
world_size: int = 0,
|
1117 |
+
return_sample_id: bool = False,
|
1118 |
+
):
|
1119 |
+
self.labels = labels
|
1120 |
+
|
1121 |
+
parser = parsers.make_parser(
|
1122 |
+
labels=labels, name=parser, unk_id=unk_index, blank_id=blank_index, do_normalize=normalize
|
1123 |
+
)
|
1124 |
+
|
1125 |
+
super().__init__(
|
1126 |
+
audio_tar_filepaths=audio_tar_filepaths,
|
1127 |
+
manifest_filepath=manifest_filepath,
|
1128 |
+
parser=parser,
|
1129 |
+
sample_rate=sample_rate,
|
1130 |
+
int_values=int_values,
|
1131 |
+
augmentor=augmentor,
|
1132 |
+
shuffle_n=shuffle_n,
|
1133 |
+
min_duration=min_duration,
|
1134 |
+
max_duration=max_duration,
|
1135 |
+
trim=trim,
|
1136 |
+
bos_id=bos_id,
|
1137 |
+
eos_id=eos_id,
|
1138 |
+
pad_id=pad_id,
|
1139 |
+
shard_strategy=shard_strategy,
|
1140 |
+
shard_manifests=shard_manifests,
|
1141 |
+
global_rank=global_rank,
|
1142 |
+
world_size=world_size,
|
1143 |
+
return_sample_id=return_sample_id,
|
1144 |
+
)
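# --- Illustrative usage (assumption: hypothetical paths and a placeholder label set) ---
# A minimal sketch of constructing the character-level tarred dataset defined above and
# wrapping it in a PyTorch DataLoader. The loader must not shuffle; sample shuffling is
# handled inside the dataset via shuffle_n.
from torch.utils.data import DataLoader

example_labels = [" ", "a", "b", "c"]  # placeholder character set
example_dataset = TarredAudioToCharDataset(
    audio_tar_filepaths="path/to/audio_{0..3}.tar",   # brace-expandable string form
    manifest_filepath="path/to/tarred_manifest.json",
    labels=example_labels,
    sample_rate=16000,
    shuffle_n=128,
)
example_loader = DataLoader(example_dataset, batch_size=4, collate_fn=example_dataset._collate_fn)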
|
1145 |
+
|
1146 |
+
|
1147 |
+
class TarredAudioToBPEDataset(_TarredAudioToTextDataset):
|
1148 |
+
"""
|
1149 |
+
A similar Dataset to the AudioToBPEDataset, but which loads tarred audio files.
|
1150 |
+
|
1151 |
+
Accepts a single comma-separated JSON manifest file (in the same style as for the AudioToBPEDataset),
|
1152 |
+
as well as the path(s) to the tarball(s) containing the wav files. Each line of the manifest should
|
1153 |
+
contain the information for one audio file, including at least the transcript and name of the audio
|
1154 |
+
file within the tarball.
|
1155 |
+
|
1156 |
+
Valid formats for the audio_tar_filepaths argument include:
|
1157 |
+
(1) a single string that can be brace-expanded, e.g. 'path/to/audio.tar' or 'path/to/audio_{1..100}.tar.gz', or
|
1158 |
+
(2) a list of file paths that will not be brace-expanded, e.g. ['audio_1.tar', 'audio_2.tar', ...].
|
1159 |
+
|
1160 |
+
See the WebDataset documentation for more information about accepted data and input formats.
|
1161 |
+
|
1162 |
+
If using multiple workers, the number of shards should be divisible by world_size to ensure an
|
1163 |
+
even split among workers. If it is not divisible, logging will give a warning but training will proceed.
|
1164 |
+
In addition, if using multiprocessing, each shard MUST HAVE THE SAME NUMBER OF ENTRIES after filtering
|
1165 |
+
is applied. We currently do not check for this, but your program may hang if the shards are uneven!
|
1166 |
+
|
1167 |
+
Notice that a few arguments are different from the AudioToBPEDataset; for example, shuffle (bool) has been
|
1168 |
+
replaced by shuffle_n (int).
|
1169 |
+
|
1170 |
+
Additionally, please note that the len() of this DataLayer is assumed to be the length of the manifest
|
1171 |
+
after filtering. An incorrect manifest length may lead to some DataLoader issues down the line.
|
1172 |
+
|
1173 |
+
Args:
|
1174 |
+
audio_tar_filepaths: Either a list of audio tarball filepaths, or a
|
1175 |
+
string (can be brace-expandable).
|
1176 |
+
manifest_filepath (str): Path to the manifest.
|
1177 |
+
tokenizer (TokenizerSpec): Either a Word Piece Encoding tokenizer (BERT),
|
1178 |
+
or a Sentence Piece Encoding tokenizer (BPE). The CTC blank
|
1179 |
+
symbol is automatically added later for models using ctc.
|
1180 |
+
sample_rate (int): Sample rate to resample loaded audio to
|
1181 |
+
int_values (bool): If True, load samples as 32-bit integers. Defaults to False.
|
1182 |
+
augmentor (nemo.collections.asr.parts.perturb.AudioAugmentor): An AudioAugmentor
|
1183 |
+
object used to augment loaded audio
|
1184 |
+
shuffle_n (int): How many samples to look ahead and load to be shuffled.
|
1185 |
+
See WebDataset documentation for more details.
|
1186 |
+
Defaults to 0.
|
1187 |
+
min_duration (float): Dataset parameter.
|
1188 |
+
All training files which have a duration less than min_duration
|
1189 |
+
are dropped. Note: Duration is read from the manifest JSON.
|
1190 |
+
Defaults to 0.1.
|
1191 |
+
max_duration (float): Dataset parameter.
|
1192 |
+
All training files which have a duration more than max_duration
|
1193 |
+
are dropped. Note: Duration is read from the manifest JSON.
|
1194 |
+
Defaults to None.
|
1195 |
+
trim (bool): Whether to trim silence from the beginning and end
|
1196 |
+
of the audio signal using librosa.effects.trim().
|
1197 |
+
Defaults to False.
|
1198 |
+
use_start_end_token: Boolean which dictates whether to add [BOS] and [EOS]
|
1199 |
+
tokens to beginning and ending of speech respectively.
|
1200 |
+
pad_id (id): Token used to pad when collating samples in batches.
|
1201 |
+
If this is None, pads using 0s.
|
1202 |
+
Defaults to None.
|
1203 |
+
shard_strategy (str): Tarred dataset shard distribution strategy chosen as a str value during ddp.
|
1204 |
+
|
1205 |
+
- `scatter`: The default shard strategy applied by WebDataset, where each node gets
|
1206 |
+
a unique set of shards, which are permanently pre-allocated and never changed at runtime.
|
1207 |
+
- `replicate`: Optional shard strategy, where each node gets all of the set of shards
|
1208 |
+
available in the tarred dataset, which are permanently pre-allocated and never changed at runtime.
|
1209 |
+
The benefit of replication is that it allows each node to sample data points from the entire
|
1210 |
+
dataset independently of other nodes, and reduces dependence on value of `shuffle_n`.
|
1211 |
+
|
1212 |
+
.. warning::
|
1213 |
+
|
1214 |
+
Replicated strategy allows every node to sample the entire set of available tarfiles,
|
1215 |
+
and therefore more than one node may sample the same tarfile, and even sample the same
|
1216 |
+
data points! As such, there is no guarantee that all samples in the dataset will be
|
1217 |
+
sampled at least once during 1 epoch. Scattered strategy, on the other hand, on specific
|
1218 |
+
occasions (when the number of shards is not divisible by ``world_size``), will not sample
|
1219 |
+
the entire dataset. For these reasons it is not advisable to use tarred datasets as validation
|
1220 |
+
or test datasets.
|
1221 |
+
|
1222 |
+
global_rank (int): Worker rank, used for partitioning shards. Defaults to 0.
|
1223 |
+
world_size (int): Total number of processes, used for partitioning shards. Defaults to 0.
|
1224 |
+
return_sample_id (bool): whether to return the sample_id as a part of each sample
|
1225 |
+
"""
|
1226 |
+
|
1227 |
+
def __init__(
|
1228 |
+
self,
|
1229 |
+
audio_tar_filepaths: Union[str, List[str]],
|
1230 |
+
manifest_filepath: str,
|
1231 |
+
tokenizer: 'nemo.collections.common.tokenizers.TokenizerSpec',
|
1232 |
+
sample_rate: int,
|
1233 |
+
int_values: bool = False,
|
1234 |
+
augmentor: Optional['nemo.collections.asr.parts.perturb.AudioAugmentor'] = None,
|
1235 |
+
shuffle_n: int = 0,
|
1236 |
+
min_duration: Optional[float] = None,
|
1237 |
+
max_duration: Optional[float] = None,
|
1238 |
+
trim: bool = False,
|
1239 |
+
use_start_end_token: bool = True,
|
1240 |
+
shard_strategy: str = "scatter",
|
1241 |
+
shard_manifests: bool = False,
|
1242 |
+
global_rank: int = 0,
|
1243 |
+
world_size: int = 0,
|
1244 |
+
return_sample_id: bool = False,
|
1245 |
+
):
|
1246 |
+
if use_start_end_token and hasattr(tokenizer, "bos_id") and tokenizer.bos_id > 0:
|
1247 |
+
bos_id = tokenizer.bos_id
|
1248 |
+
else:
|
1249 |
+
bos_id = None
|
1250 |
+
|
1251 |
+
if use_start_end_token and hasattr(tokenizer, "eos_id") and tokenizer.eos_id > 0:
|
1252 |
+
eos_id = tokenizer.eos_id
|
1253 |
+
else:
|
1254 |
+
eos_id = None
|
1255 |
+
|
1256 |
+
if hasattr(tokenizer, "pad_id") and tokenizer.pad_id > 0:
|
1257 |
+
pad_id = tokenizer.pad_id
|
1258 |
+
else:
|
1259 |
+
pad_id = 0
|
1260 |
+
|
1261 |
+
class TokenizerWrapper:
|
1262 |
+
def __init__(self, tokenizer):
|
1263 |
+
if isinstance(tokenizer, tokenizers.aggregate_tokenizer.AggregateTokenizer):
|
1264 |
+
self.is_aggregate = True
|
1265 |
+
else:
|
1266 |
+
self.is_aggregate = False
|
1267 |
+
self._tokenizer = tokenizer
|
1268 |
+
|
1269 |
+
def __call__(self, *args):
|
1270 |
+
if isinstance(args[0], List) and self.is_aggregate:
|
1271 |
+
t = []
|
1272 |
+
for span in args[0]:
|
1273 |
+
t.extend(self._tokenizer.text_to_ids(span['str'], span['lang']))
|
1274 |
+
return t
|
1275 |
+
|
1276 |
+
t = self._tokenizer.text_to_ids(*args)
|
1277 |
+
return t
|
1278 |
+
|
1279 |
+
super().__init__(
|
1280 |
+
audio_tar_filepaths=audio_tar_filepaths,
|
1281 |
+
manifest_filepath=manifest_filepath,
|
1282 |
+
parser=TokenizerWrapper(tokenizer),
|
1283 |
+
sample_rate=sample_rate,
|
1284 |
+
int_values=int_values,
|
1285 |
+
augmentor=augmentor,
|
1286 |
+
shuffle_n=shuffle_n,
|
1287 |
+
min_duration=min_duration,
|
1288 |
+
max_duration=max_duration,
|
1289 |
+
trim=trim,
|
1290 |
+
bos_id=bos_id,
|
1291 |
+
eos_id=eos_id,
|
1292 |
+
pad_id=pad_id,
|
1293 |
+
shard_strategy=shard_strategy,
|
1294 |
+
shard_manifests=shard_manifests,
|
1295 |
+
global_rank=global_rank,
|
1296 |
+
world_size=world_size,
|
1297 |
+
return_sample_id=return_sample_id,
|
1298 |
+
)
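# --- Illustrative sketch (assumption: a toy stand-in tokenizer, arbitrary ids) ---
# The constructor above only adopts bos/eos/pad ids from the tokenizer when the attribute
# exists and is positive; otherwise it falls back to None (no BOS/EOS) or 0 (padding).
class _ToyTokenizer:
    bos_id = 1
    eos_id = 2
    pad_id = 0  # non-positive, so the dataset would fall back to pad_id = 0

    def text_to_ids(self, text):
        return [3 + (ord(c) % 10) for c in text]  # hypothetical mapping

_tok = _ToyTokenizer()
_bos = _tok.bos_id if hasattr(_tok, "bos_id") and _tok.bos_id > 0 else None  # -> 1
_eos = _tok.eos_id if hasattr(_tok, "eos_id") and _tok.eos_id > 0 else None  # -> 2
_pad = _tok.pad_id if hasattr(_tok, "pad_id") and _tok.pad_id > 0 else 0     # -> 0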
|
1299 |
+
|
1300 |
+
|
1301 |
+
class BucketingDataset(IterableDataset):
|
1302 |
+
"""
|
1303 |
+
A Dataset which wraps another IterableDataset and adapts it for bucketing
|
1304 |
+
Args:
|
1305 |
+
dataset (IterableDataset): The IterableDataset to be wrapped
|
1306 |
+
bucketing_batch_size (int): Number of samples used to build a batch
|
1307 |
+
"""
|
1308 |
+
|
1309 |
+
def __init__(
|
1310 |
+
self, dataset: IterableDataset, bucketing_batch_size: int,
|
1311 |
+
):
|
1312 |
+
self.wrapped_dataset = dataset
|
1313 |
+
self.bucketing_batch_size = bucketing_batch_size
|
1314 |
+
super().__init__()
|
1315 |
+
|
1316 |
+
def _collate_fn(self, batch):
|
1317 |
+
return _speech_collate_fn(batch[0], self.wrapped_dataset.pad_id)
|
1318 |
+
|
1319 |
+
def __iter__(self):
|
1320 |
+
return BucketingIterator(
|
1321 |
+
wrapped_ds=self.wrapped_dataset._dataset, bucketing_batch_size=self.bucketing_batch_size
|
1322 |
+
).__iter__()
|
1323 |
+
|
1324 |
+
def __len__(self):
|
1325 |
+
return int(math.ceil(len(self.wrapped_dataset) / float(self.bucketing_batch_size)))
|
1326 |
+
|
1327 |
+
|
1328 |
+
class BucketingIterator:
|
1329 |
+
def __init__(self, wrapped_ds, bucketing_batch_size):
|
1330 |
+
self.wrapped_ds = wrapped_ds
|
1331 |
+
self.wrapped_iter = None
|
1332 |
+
self.bucketing_batch_size = bucketing_batch_size
|
1333 |
+
|
1334 |
+
def __iter__(self):
|
1335 |
+
self.wrapped_iter = iter(self.wrapped_ds)
|
1336 |
+
return self
|
1337 |
+
|
1338 |
+
def __next__(self):
|
1339 |
+
batches = []
|
1340 |
+
for idx in range(self.bucketing_batch_size):
|
1341 |
+
try:
|
1342 |
+
sample = next(self.wrapped_iter)
|
1343 |
+
except StopIteration:
|
1344 |
+
break
|
1345 |
+
batches.append(sample)
|
1346 |
+
if len(batches) == 0:
|
1347 |
+
raise StopIteration
|
1348 |
+
return batches
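# --- Illustrative sketch: how BucketingIterator groups samples ---
# Demonstrates the grouping behaviour on a plain Python list standing in for the wrapped
# dataset: items come out in buckets of bucketing_batch_size, with a smaller final bucket
# when the length is not divisible.
_example_buckets = list(BucketingIterator(wrapped_ds=list(range(7)), bucketing_batch_size=3))
# _example_buckets == [[0, 1, 2], [3, 4, 5], [6]]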
|
1349 |
+
|
1350 |
+
|
1351 |
+
class RandomizedChainDataset(ChainDataset):
|
1352 |
+
def __init__(self, datasets: Iterable[Dataset], rnd_seed=0) -> None:
|
1353 |
+
super(RandomizedChainDataset, self).__init__(list(datasets))
|
1354 |
+
self.rnd_gen = np.random.RandomState(rnd_seed)
|
1355 |
+
|
1356 |
+
def __iter__(self):
|
1357 |
+
shuffled_order = self.rnd_gen.permutation(len(self.datasets))
|
1358 |
+
for dataset_idx in shuffled_order:
|
1359 |
+
d = self.datasets[dataset_idx]
|
1360 |
+
assert isinstance(d, IterableDataset), "ChainDataset only supports IterableDataset"
|
1361 |
+
for idx, x in enumerate(d):
|
1362 |
+
yield x
|
1363 |
+
# in case d is an infinite dataset, we want to break the loop
|
1364 |
+
# so that the other datasets get a chance to yield too
|
1365 |
+
if idx >= len(d) - 1:
|
1366 |
+
break
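# --- Illustrative sketch (assumption: two toy child datasets) ---
# RandomizedChainDataset walks its child datasets in an order fixed by rnd_seed; each child
# must be an IterableDataset (the same base class used elsewhere in this module) with a
# defined __len__ so that the early-break guard above can stop infinite children.
class _RangeDataset(IterableDataset):
    def __init__(self, start, stop):
        self.start, self.stop = start, stop

    def __iter__(self):
        return iter(range(self.start, self.stop))

    def __len__(self):
        return self.stop - self.start

# Yields one child completely, then the other, in a seed-determined order:
# list(RandomizedChainDataset([_RangeDataset(0, 3), _RangeDataset(10, 13)], rnd_seed=0))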
|
SoundScribe/SpeakerID/nemo/collections/asr/data/audio_to_text_dali.py
ADDED
@@ -0,0 +1,772 @@
1 |
+
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
|
15 |
+
import math
|
16 |
+
import operator
|
17 |
+
import os.path
|
18 |
+
import time
|
19 |
+
from collections.abc import Iterator
|
20 |
+
from typing import Callable, List, Optional, Union
|
21 |
+
|
22 |
+
import torch
|
23 |
+
from omegaconf import DictConfig
|
24 |
+
|
25 |
+
from nemo.collections.asr.data.audio_to_text import ASRManifestProcessor, expand_sharded_filepaths
|
26 |
+
from nemo.collections.common.parts.preprocessing import parsers
|
27 |
+
from nemo.utils import logging, model_utils
|
28 |
+
|
29 |
+
try:
|
30 |
+
import nvidia.dali as dali
|
31 |
+
from nvidia.dali.pipeline import Pipeline
|
32 |
+
from nvidia.dali.plugin.pytorch import DALIGenericIterator as DALIPytorchIterator
|
33 |
+
from nvidia.dali.plugin.pytorch import LastBatchPolicy as LastBatchPolicy
|
34 |
+
|
35 |
+
HAVE_DALI = True
|
36 |
+
except (ImportError, ModuleNotFoundError):
|
37 |
+
HAVE_DALI = False
|
38 |
+
|
39 |
+
__all__ = [
|
40 |
+
'AudioToCharDALIDataset',
|
41 |
+
'AudioToBPEDALIDataset',
|
42 |
+
]
|
43 |
+
|
44 |
+
"""
|
45 |
+
Below minimum version is required to access the "read_idxs" argument in
|
46 |
+
dali.fn.readers.nemo_asr
|
47 |
+
"""
|
48 |
+
__DALI_MINIMUM_VERSION__ = "1.11"
|
49 |
+
|
50 |
+
DALI_INSTALLATION_MESSAGE = (
|
51 |
+
"Could not import `nvidia.dali`.\n"
|
52 |
+
"Please install DALI by following the steps provided here - \n"
|
53 |
+
"https://docs.nvidia.com/deeplearning/dali/user-guide/docs/installation.html"
|
54 |
+
)
|
55 |
+
|
56 |
+
|
57 |
+
def is_dali_supported(min_version: str, verbose: bool = False) -> bool:
|
58 |
+
"""
|
59 |
+
Checks if DALI is installed and its version is >= min_version.
|
60 |
+
|
61 |
+
Args:
|
62 |
+
min_version: A semver str that is the minimum requirement.
|
63 |
+
verbose: Whether to log the installation instructions if DALI is not found.
|
64 |
+
|
65 |
+
Returns:
|
66 |
+
bool - whether DALI could be imported or not.
|
67 |
+
"""
|
68 |
+
module_available, _ = model_utils.check_lib_version(
|
69 |
+
'nvidia.dali', checked_version=min_version, operator=operator.ge
|
70 |
+
)
|
71 |
+
|
72 |
+
# If DALI is not installed
|
73 |
+
if module_available is None:
|
74 |
+
if verbose:
|
75 |
+
logging.info(DALI_INSTALLATION_MESSAGE)
|
76 |
+
|
77 |
+
return False
|
78 |
+
|
79 |
+
return module_available
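# Example usage (a minimal sketch): callers can guard DALI-dependent code paths on the
# minimum supported version; verbose=True logs installation instructions when DALI is absent.
def _require_dali():
    if not is_dali_supported(__DALI_MINIMUM_VERSION__, verbose=True):
        raise ModuleNotFoundError(f"NVIDIA DALI >= {__DALI_MINIMUM_VERSION__} is required.")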
|
80 |
+
|
81 |
+
|
82 |
+
class DALIOutputs(object):
|
83 |
+
def __init__(self, out_dict):
|
84 |
+
self._has_processed_signal = 'processed_signal' in out_dict and 'processed_signal_len' in out_dict
|
85 |
+
if not self._has_processed_signal:
|
86 |
+
assert 'audio' in out_dict and 'audio_len' in out_dict
|
87 |
+
assert 'transcript' in out_dict and 'transcript_len' in out_dict
|
88 |
+
if self._has_processed_signal:
|
89 |
+
self._outs = (
|
90 |
+
out_dict['processed_signal'],
|
91 |
+
out_dict['processed_signal_len'].reshape(-1),
|
92 |
+
out_dict['transcript'],
|
93 |
+
out_dict['transcript_len'].reshape(-1),
|
94 |
+
)
|
95 |
+
else:
|
96 |
+
self._outs = (
|
97 |
+
out_dict['audio'],
|
98 |
+
out_dict['audio_len'].reshape(-1),
|
99 |
+
out_dict['transcript'],
|
100 |
+
out_dict['transcript_len'].reshape(-1),
|
101 |
+
)
|
102 |
+
|
103 |
+
@property
|
104 |
+
def has_processed_signal(self):
|
105 |
+
return self._has_processed_signal
|
106 |
+
|
107 |
+
def __getitem__(self, key):
|
108 |
+
return self._outs[key]
|
109 |
+
|
110 |
+
def __len__(self):
|
111 |
+
return len(self._outs)
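# --- Illustrative sketch: how DALIOutputs is consumed (placeholder tensors) ---
# The wrapper exposes the pipeline outputs as a fixed 4-tuple, so downstream code can unpack
# it like a regular batch; has_processed_signal tells whether features or raw audio came back.
_example_out = DALIOutputs(
    {
        'audio': torch.zeros(2, 16000),
        'audio_len': torch.tensor([[16000], [12000]]),
        'transcript': torch.zeros(2, 10, dtype=torch.long),
        'transcript_len': torch.tensor([[10], [8]]),
    }
)
_audio, _audio_len, _transcript, _transcript_len = _example_out
assert not _example_out.has_processed_signal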
|
112 |
+
|
113 |
+
|
114 |
+
class _AudioTextDALIDataset(Iterator):
|
115 |
+
"""
|
116 |
+
NVIDIA DALI pipeline that loads tensors via one or more manifest files where each line contains a sample descriptor in JSON,
|
117 |
+
including audio files, transcripts, and durations (in seconds).
|
118 |
+
Here's an example:
|
119 |
+
{"audio_filepath": "/path/to/audio.wav", "text_filepath": "/path/to/audio.txt", "duration": 23.147}
|
120 |
+
...
|
121 |
+
{"audio_filepath": "/path/to/audio.wav", "text": "the transcription", "offset": 301.75, "duration": 0.82, "utt":
|
122 |
+
"utterance_id", "ctm_utt": "en_4156", "side": "A"}
|
123 |
+
|
124 |
+
Args:
|
125 |
+
manifest_filepath: Path to manifest file with the format described above. Can be comma-separated paths.
|
126 |
+
device (str): Determines the device type to be used for preprocessing. Allowed values are: 'cpu', 'gpu'.
|
127 |
+
batch_size (int): Number of samples in a batch.
|
128 |
+
parser (str, callable): A str for an inbuilt parser, or a callable with signature f(str) -> List[int].
|
129 |
+
sample_rate (int): Sample rate to resample loaded audio to.
|
130 |
+
num_threads (int): Number of CPU processing threads to be created by the DALI pipeline.
|
131 |
+
max_duration (float): Determines the maximum allowed duration, in seconds, of the loaded audio files.
|
132 |
+
min_duration (float): Determines the minimum allowed duration, in seconds, of the loaded audio files.
|
133 |
+
bos_id (int): Id of beginning of sequence symbol to append if not None
|
134 |
+
eos_id (int): Id of end of sequence symbol to append if not None
|
135 |
+
pad_id (int): Id used to pad the input. Defaults to 0 if not provided.
|
136 |
+
trim (bool): If True, it will extract the nonsilent region of the loaded audio signal.
|
137 |
+
shuffle (bool): If set to True, the dataset will be shuffled after loading.
|
138 |
+
drop_last (bool): If set to True, the last batch will be dropped if incomplete. This will be the case when the shard size is not divisible by the batch size.
|
139 |
+
If set to False and the size of dataset is not divisible by the batch size, then the last batch will be smaller.
|
140 |
+
device_id (int): Index of the GPU to be used (local_rank). Only applicable when device == 'gpu'. Defaults to 0.
|
141 |
+
global_rank (int): Worker rank, used for partitioning shards. Defaults to 0.
|
142 |
+
world_size (int): Total number of processes, used for partitioning shards. Defaults to 1.
|
143 |
+
preprocessor_cfg (DictConfig): Preprocessor configuration. Supports AudioToMelSpectrogramPreprocessor and AudioToMFCCPreprocessor.
|
144 |
+
return_sample_id (bool): whether to return the sample_id as a part of each sample (not supported yet).
|
145 |
+
"""
|
146 |
+
|
147 |
+
def __init__(
|
148 |
+
self,
|
149 |
+
manifest_filepath: str,
|
150 |
+
device: str,
|
151 |
+
batch_size: int,
|
152 |
+
parser: Union[str, Callable],
|
153 |
+
audio_tar_filepaths: Optional[Union[str, List[str]]] = None,
|
154 |
+
audio_tar_index_filepaths: Optional[Union[str, List[str]]] = None,
|
155 |
+
sample_rate: int = 16000,
|
156 |
+
num_threads: int = 4,
|
157 |
+
max_duration: float = 0.0,
|
158 |
+
min_duration: float = 0.0,
|
159 |
+
bos_id: Optional[int] = None,
|
160 |
+
eos_id: Optional[int] = None,
|
161 |
+
pad_id: int = 0,
|
162 |
+
trim: bool = False,
|
163 |
+
shuffle: bool = False,
|
164 |
+
drop_last: bool = False,
|
165 |
+
shard_strategy: str = "scatter",
|
166 |
+
device_id: int = 0,
|
167 |
+
global_rank: int = 0,
|
168 |
+
world_size: int = 1,
|
169 |
+
preprocessor_cfg: DictConfig = None,
|
170 |
+
return_sample_id: bool = False,
|
171 |
+
):
|
172 |
+
self.drop_last = drop_last # used by lr_scheduler
|
173 |
+
if return_sample_id:
|
174 |
+
raise ValueError(
|
175 |
+
"Currently DALI data layers don't support returning the sample_id and return_sample_id can not be enabled."
|
176 |
+
)
|
177 |
+
self.return_sample_id = return_sample_id
|
178 |
+
|
179 |
+
if not HAVE_DALI:
|
180 |
+
raise ModuleNotFoundError(
|
181 |
+
f"{self} requires NVIDIA DALI to be installed. "
|
182 |
+
f"See: https://docs.nvidia.com/deeplearning/dali/user-guide/docs/installation.html#id1"
|
183 |
+
)
|
184 |
+
|
185 |
+
if device not in ('cpu', 'gpu'):
|
186 |
+
raise ValueError(
|
187 |
+
f"{self} received an unexpected device argument {device}. Supported values are: 'cpu', 'gpu'"
|
188 |
+
)
|
189 |
+
|
190 |
+
device_id = device_id if device == 'gpu' else None
|
191 |
+
|
192 |
+
self.batch_size = batch_size # Used by NeMo
|
193 |
+
|
194 |
+
self.device = device
|
195 |
+
self.device_id = device_id
|
196 |
+
|
197 |
+
if world_size > 1:
|
198 |
+
self.shard_id = global_rank
|
199 |
+
self.num_shards = world_size
|
200 |
+
else:
|
201 |
+
self.shard_id = None
|
202 |
+
self.num_shards = None
|
203 |
+
|
204 |
+
self.eos_id = eos_id
|
205 |
+
self.bos_id = bos_id
|
206 |
+
self.sample_rate = sample_rate
|
207 |
+
|
208 |
+
self.pipe = Pipeline(
|
209 |
+
batch_size=batch_size,
|
210 |
+
num_threads=num_threads,
|
211 |
+
device_id=self.device_id,
|
212 |
+
exec_async=True,
|
213 |
+
exec_pipelined=True,
|
214 |
+
)
|
215 |
+
|
216 |
+
has_preprocessor = preprocessor_cfg is not None
|
217 |
+
if has_preprocessor:
|
218 |
+
if preprocessor_cfg._target_ == "nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor":
|
219 |
+
feature_type = "mel_spectrogram"
|
220 |
+
elif preprocessor_cfg._target_ == "nemo.collections.asr.modules.AudioToMFCCPreprocessor":
|
221 |
+
feature_type = "mfcc"
|
222 |
+
else:
|
223 |
+
raise ValueError(
|
224 |
+
f"{self} received an unexpected preprocessor configuration: {preprocessor_cfg._target_}."
|
225 |
+
f" Supported preprocessors are: AudioToMelSpectrogramPreprocessor, AudioToMFCCPreprocessor"
|
226 |
+
)
|
227 |
+
|
228 |
+
# Default values taken from AudioToMelSpectrogramPreprocessor
|
229 |
+
params = preprocessor_cfg
|
230 |
+
self.dither = params['dither'] if 'dither' in params else 0.0
|
231 |
+
self.preemph = params['preemph'] if 'preemph' in params else 0.97
|
232 |
+
self.window_size_sec = params['window_size'] if 'window_size' in params else 0.02
|
233 |
+
self.window_stride_sec = params['window_stride'] if 'window_stride' in params else 0.01
|
234 |
+
self.sample_rate = params['sample_rate'] if 'sample_rate' in params else sample_rate
|
235 |
+
self.window_size = int(self.window_size_sec * self.sample_rate)
|
236 |
+
self.window_stride = int(self.window_stride_sec * self.sample_rate)
|
237 |
+
|
238 |
+
normalize = params['normalize'] if 'normalize' in params else 'per_feature'
|
239 |
+
if normalize == 'per_feature': # Each freq channel independently
|
240 |
+
self.normalization_axes = (1,)
|
241 |
+
elif normalize == 'all_features':
|
242 |
+
self.normalization_axes = (0, 1)
|
243 |
+
else:
|
244 |
+
raise ValueError(
|
245 |
+
f"{self} received {normalize} for the normalize parameter."
|
246 |
+
f" It must be either 'per_feature' or 'all_features'."
|
247 |
+
)
|
248 |
+
|
249 |
+
self.window = None
|
250 |
+
window_name = params['window'] if 'window' in params else 'hann'
|
251 |
+
torch_windows = {
|
252 |
+
'hann': torch.hann_window,
|
253 |
+
'hamming': torch.hamming_window,
|
254 |
+
'blackman': torch.blackman_window,
|
255 |
+
'bartlett': torch.bartlett_window,
|
256 |
+
'none': None,
|
257 |
+
}
|
258 |
+
|
259 |
+
if window_name == 'ones':
|
260 |
+
window_tensor = torch.ones(self.window_size)
|
261 |
+
else:
|
262 |
+
try:
|
263 |
+
window_fn = torch_windows.get(window_name, None)
|
264 |
+
except:
|
265 |
+
raise ValueError(
|
266 |
+
f"{self} received '{window_name}' for the window parameter."
|
267 |
+
f" It must be one of: ('hann', 'ones', 'hamming', 'blackman', 'bartlett', None)."
|
268 |
+
f" None is equivalent to 'hann'."
|
269 |
+
)
|
270 |
+
window_tensor = window_fn(self.window_size, periodic=False) if window_fn else None
|
271 |
+
self.window = window_tensor.numpy().tolist() if window_tensor is not None else None
|
272 |
+
|
273 |
+
self.n_fft = params['n_fft'] if 'n_fft' in params else 2 ** math.ceil(math.log2(self.window_size))
|
274 |
+
self.n_mels = params['n_mels'] if 'n_mels' in params else 64
|
275 |
+
self.n_mfcc = params['n_mfcc'] if 'n_mfcc' in params else 64
|
276 |
+
|
277 |
+
features = params['features'] if 'features' in params else 0
|
278 |
+
if features > 0:
|
279 |
+
if feature_type == 'mel_spectrogram':
|
280 |
+
self.n_mels = features
|
281 |
+
elif feature_type == 'mfcc':
|
282 |
+
self.n_mfcc = features
|
283 |
+
|
284 |
+
# TODO Implement frame splicing
|
285 |
+
if 'frame_splicing' in params:
|
286 |
+
assert params['frame_splicing'] == 1, "Frame splicing is not implemented"
|
287 |
+
|
288 |
+
self.freq_low = params['lowfreq'] if 'lowfreq' in params else 0.0
|
289 |
+
self.freq_high = params['highfreq'] if 'highfreq' in params else self.sample_rate / 2.0
|
290 |
+
self.log_features = params['log'] if 'log' in params else True
|
291 |
+
|
292 |
+
# We want to avoid taking the log of zero
|
293 |
+
# There are two options: either adding or clamping to a small value
|
294 |
+
|
295 |
+
self.log_zero_guard_type = params['log_zero_guard_type'] if 'log_zero_guard_type' in params else 'add'
|
296 |
+
if self.log_zero_guard_type not in ["add", "clamp"]:
|
297 |
+
raise ValueError(
|
298 |
+
f"{self} received {self.log_zero_guard_type} for the "
|
299 |
+
f"log_zero_guard_type parameter. It must be either 'add' or "
|
300 |
+
f"'clamp'."
|
301 |
+
)
|
302 |
+
|
303 |
+
self.log_zero_guard_value = (
|
304 |
+
params['log_zero_guard_value'] if 'log_zero_guard_value' in params else 2 ** -24
|
305 |
+
)
|
306 |
+
if isinstance(self.log_zero_guard_value, str):
|
307 |
+
if self.log_zero_guard_value == "tiny":
|
308 |
+
self.log_zero_guard_value = torch.finfo(torch.float32).tiny
|
309 |
+
elif self.log_zero_guard_value == "eps":
|
310 |
+
self.log_zero_guard_value = torch.finfo(torch.float32).eps
|
311 |
+
else:
|
312 |
+
raise ValueError(
|
313 |
+
f"{self} received {self.log_zero_guard_value} for the log_zero_guard_type parameter."
|
314 |
+
f"It must be either a number, 'tiny', or 'eps'"
|
315 |
+
)
|
316 |
+
|
317 |
+
self.mag_power = params['mag_power'] if 'mag_power' in params else 2
|
318 |
+
if self.mag_power != 1.0 and self.mag_power != 2.0:
|
319 |
+
raise ValueError(
|
320 |
+
f"{self} received {self.mag_power} for the mag_power parameter." f" It must be either 1.0 or 2.0."
|
321 |
+
)
|
322 |
+
|
323 |
+
self.pad_to = max(params['pad_to'], 1) if 'pad_to' in params else 16
|
324 |
+
self.pad_value = params['pad_value'] if 'pad_value' in params else 0.0
|
325 |
+
|
326 |
+
with self.pipe:
|
327 |
+
if audio_tar_filepaths is None and audio_tar_index_filepaths is None:
|
328 |
+
audio, indices = dali.fn.readers.nemo_asr(
|
329 |
+
name="Reader",
|
330 |
+
manifest_filepaths=manifest_filepath.split(','),
|
331 |
+
dtype=dali.types.FLOAT,
|
332 |
+
downmix=True,
|
333 |
+
sample_rate=float(self.sample_rate),
|
334 |
+
min_duration=min_duration,
|
335 |
+
max_duration=max_duration,
|
336 |
+
read_sample_rate=False,
|
337 |
+
read_text=False,
|
338 |
+
read_idxs=True,
|
339 |
+
random_shuffle=shuffle,
|
340 |
+
shard_id=self.shard_id,
|
341 |
+
num_shards=self.num_shards,
|
342 |
+
pad_last_batch=True,
|
343 |
+
)
|
344 |
+
|
345 |
+
self.is_tarred_dataset = False
|
346 |
+
|
347 |
+
elif audio_tar_filepaths is not None and audio_tar_index_filepaths is not None:
|
348 |
+
audio_tar_filepaths = expand_sharded_filepaths(
|
349 |
+
audio_tar_filepaths, shard_strategy=shard_strategy, world_size=world_size, global_rank=global_rank
|
350 |
+
)
|
351 |
+
audio_tar_index_filepaths = expand_sharded_filepaths(
|
352 |
+
audio_tar_index_filepaths,
|
353 |
+
shard_strategy=shard_strategy,
|
354 |
+
world_size=world_size,
|
355 |
+
global_rank=global_rank,
|
356 |
+
)
|
357 |
+
|
358 |
+
if len(audio_tar_filepaths) != len(audio_tar_index_filepaths) and len(audio_tar_index_filepaths) != 0:
|
359 |
+
raise ValueError(
|
360 |
+
f"Number of filepaths provided for `audio_tar_filepaths` must match "
|
361 |
+
f"`audio_tar_index_filepaths`. Got {len(audio_tar_filepaths)} audio_tar_filepaths and "
|
362 |
+
f"{len(audio_tar_index_filepaths)} audio_tar_index_filepaths."
|
363 |
+
)
|
364 |
+
|
365 |
+
tar_file = dali.fn.readers.webdataset(
|
366 |
+
paths=audio_tar_filepaths,
|
367 |
+
index_paths=audio_tar_index_filepaths,
|
368 |
+
name="Reader",
|
369 |
+
ext=["wav"],
|
370 |
+
missing_component_behavior="error",
|
371 |
+
random_shuffle=shuffle,
|
372 |
+
shard_id=self.shard_id,
|
373 |
+
num_shards=self.num_shards,
|
374 |
+
pad_last_batch=True,
|
375 |
+
)
|
376 |
+
audio, _ = dali.fn.decoders.audio(
|
377 |
+
tar_file, dtype=dali.types.FLOAT, downmix=True, sample_rate=float(self.sample_rate),
|
378 |
+
)
|
379 |
+
indices = dali.fn.get_property(tar_file, key="source_info")
|
380 |
+
indices = dali.fn.pad(indices)
|
381 |
+
|
382 |
+
self.is_tarred_dataset = True
|
383 |
+
|
384 |
+
else:
|
385 |
+
raise RuntimeError(
|
386 |
+
"When using DALI datasets, either `audio_tar_filepaths` "
|
387 |
+
"and `audio_tar_index_filepaths` should either both be None (sequential dataset)"
|
388 |
+
"or provided (tarred dataset)."
|
389 |
+
)
|
390 |
+
|
391 |
+
# Extract nonsilent region, if necessary
|
392 |
+
if trim:
|
393 |
+
# Need to extract non-silent region before moving to the GPU
|
394 |
+
roi_start, roi_len = dali.fn.nonsilent_region(audio, cutoff_db=-60)
|
395 |
+
audio = audio.gpu() if self.device == 'gpu' else audio
|
396 |
+
audio = dali.fn.slice(
|
397 |
+
audio, roi_start, roi_len, normalized_anchor=False, normalized_shape=False, axes=[0]
|
398 |
+
)
|
399 |
+
else:
|
400 |
+
audio = audio.gpu() if self.device == 'gpu' else audio
|
401 |
+
|
402 |
+
if not has_preprocessor:
|
403 |
+
# No preprocessing, the output is the audio signal
|
404 |
+
audio_len = dali.fn.shapes(dali.fn.reshape(audio, shape=[-1]))
|
405 |
+
audio = dali.fn.pad(audio)
|
406 |
+
self.pipe.set_outputs(audio, audio_len, indices)
|
407 |
+
else:
|
408 |
+
# Additive gaussian noise (dither)
|
409 |
+
if self.dither > 0.0:
|
410 |
+
gaussian_noise = dali.fn.random.normal(audio)
|
411 |
+
audio = audio + self.dither * gaussian_noise
|
412 |
+
|
413 |
+
# Preemphasis filter
|
414 |
+
if self.preemph > 0.0:
|
415 |
+
audio = dali.fn.preemphasis_filter(audio, preemph_coeff=self.preemph, border='zero')
|
416 |
+
|
417 |
+
# Power spectrogram
|
418 |
+
spec = dali.fn.spectrogram(
|
419 |
+
audio,
|
420 |
+
nfft=self.n_fft,
|
421 |
+
window_length=self.window_size,
|
422 |
+
window_step=self.window_stride,
|
423 |
+
window_fn=self.window,
|
424 |
+
)
|
425 |
+
|
426 |
+
if feature_type == 'mel_spectrogram' or feature_type == 'mfcc':
|
427 |
+
# Spectrogram to Mel Spectrogram
|
428 |
+
spec = dali.fn.mel_filter_bank(
|
429 |
+
spec,
|
430 |
+
sample_rate=self.sample_rate,
|
431 |
+
nfilter=self.n_mels,
|
432 |
+
normalize=True,
|
433 |
+
freq_low=self.freq_low,
|
434 |
+
freq_high=self.freq_high,
|
435 |
+
)
|
436 |
+
# Mel Spectrogram to MFCC
|
437 |
+
if feature_type == 'mfcc':
|
438 |
+
spec = dali.fn.mfcc(spec, n_mfcc=self.n_mfcc)
|
439 |
+
|
440 |
+
# Logarithm
|
441 |
+
if self.log_zero_guard_type == 'add':
|
442 |
+
spec = spec + self.log_zero_guard_value
|
443 |
+
|
444 |
+
spec = dali.fn.to_decibels(
|
445 |
+
spec, multiplier=math.log(10), reference=1.0, cutoff_db=math.log(self.log_zero_guard_value)
|
446 |
+
)
|
447 |
+
|
448 |
+
# Normalization
|
449 |
+
spec = dali.fn.normalize(spec, axes=self.normalization_axes, epsilon=1e-5 ** 2, ddof=1)
|
450 |
+
|
451 |
+
# Extracting the length of the spectrogram
|
452 |
+
spec_len = dali.fn.slice(dali.fn.shapes(spec), 1, 1, axes=(0,))
|
453 |
+
|
454 |
+
# Pads feature dimension to be a multiple of `pad_to` and the temporal dimension to be as big as the largest sample (shape -1)
|
455 |
+
spec = dali.fn.pad(spec, fill_value=self.pad_value, axes=(0, 1), align=(self.pad_to, 1), shape=(1, -1))
|
456 |
+
self.pipe.set_outputs(spec, spec_len, indices)
|
457 |
+
|
458 |
+
x = time.time()
|
459 |
+
# Building DALI pipeline
|
460 |
+
self.pipe.build()
|
461 |
+
y = time.time()
|
462 |
+
|
463 |
+
logging.info(f"Time for pipe.build() : {(y - x)} seconds")
|
464 |
+
|
465 |
+
if has_preprocessor:
|
466 |
+
output_names = ['processed_signal', 'processed_signal_len', 'manifest_indices']
|
467 |
+
else:
|
468 |
+
output_names = ['audio', 'audio_len', 'manifest_indices']
|
469 |
+
|
470 |
+
x = time.time()
|
471 |
+
last_batch_policy = LastBatchPolicy.DROP if drop_last else LastBatchPolicy.PARTIAL
|
472 |
+
self._iter = DALIPytorchIterator(
|
473 |
+
[self.pipe],
|
474 |
+
output_map=output_names,
|
475 |
+
reader_name="Reader",
|
476 |
+
last_batch_policy=last_batch_policy,
|
477 |
+
dynamic_shape=True,
|
478 |
+
auto_reset=True,
|
479 |
+
)
|
480 |
+
y = time.time()
|
481 |
+
logging.info(f"Time for DALIPytorchIterator to initialize : {(y - x)} seconds")
|
482 |
+
|
483 |
+
# TODO come up with a better solution
|
484 |
+
class DummyDataset:
|
485 |
+
def __init__(self, parent):
|
486 |
+
self.parent = parent
|
487 |
+
|
488 |
+
def __len__(self):
|
489 |
+
return self.parent.size
|
490 |
+
|
491 |
+
self.dataset = DummyDataset(self) # Used by NeMo
|
492 |
+
|
493 |
+
x = time.time()
|
494 |
+
self.manifest_processor = ASRManifestProcessor(
|
495 |
+
manifest_filepath=manifest_filepath,
|
496 |
+
parser=parser,
|
497 |
+
max_duration=max_duration,
|
498 |
+
min_duration=min_duration,
|
499 |
+
max_utts=0,
|
500 |
+
bos_id=bos_id,
|
501 |
+
eos_id=eos_id,
|
502 |
+
pad_id=pad_id,
|
503 |
+
index_by_file_id=self.is_tarred_dataset,
|
504 |
+
)
|
505 |
+
y = time.time()
|
506 |
+
logging.info(f"Time to build nemo manifest processor - {(y - x)} seconds")
|
507 |
+
|
508 |
+
def reset(self):
|
509 |
+
self._iter.reset()
|
510 |
+
|
511 |
+
def __iter__(self):
|
512 |
+
return self
|
513 |
+
|
514 |
+
def next(self):
|
515 |
+
return self.__next__()
|
516 |
+
|
517 |
+
@property
|
518 |
+
def size(self):
|
519 |
+
return self._iter.size
|
520 |
+
|
521 |
+
def __len__(self):
|
522 |
+
return len(self._iter)
|
523 |
+
|
524 |
+
def __next__(self):
|
525 |
+
outputs = self._iter.next()
|
526 |
+
assert len(outputs) == 1
|
527 |
+
dali_out = outputs[0]
|
528 |
+
manifest_indices = dali_out['manifest_indices'].numpy()
|
529 |
+
|
530 |
+
out = {}
|
531 |
+
out_names = ['processed_signal', 'processed_signal_len', 'audio', 'audio_len']
|
532 |
+
for out_name in out_names:
|
533 |
+
if out_name in dali_out:
|
534 |
+
out[out_name] = dali_out[out_name].detach().clone()
|
535 |
+
|
536 |
+
text_tokens = []
|
537 |
+
text_tokens_len = []
|
538 |
+
max_len = 0
|
539 |
+
batch_size = manifest_indices.shape[0]
|
540 |
+
for i, manifest_index in enumerate(manifest_indices):
|
541 |
+
|
542 |
+
if not self.is_tarred_dataset:
|
543 |
+
# Loose-file dataset. Index is integer based.
|
544 |
+
manifest_index = manifest_index[0]
|
545 |
+
text, text_length = self.manifest_processor.process_text_by_id(manifest_index)
|
546 |
+
else:
|
547 |
+
# Tarred-file dataset. Index is filename based.
|
548 |
+
resolved_manifest_indices = manifest_index.tobytes().decode().split(":")
|
549 |
+
resolved_manifest_index = resolved_manifest_indices[2] # we require just the filename segment
|
550 |
+
resolved_manifest_index = os.path.splitext(resolved_manifest_index)[0] # we don't need the file extension
|
551 |
+
text, text_length = self.manifest_processor.process_text_by_file_id(resolved_manifest_index)
|
552 |
+
|
553 |
+
text_tokens_len.append(text_length)
|
554 |
+
text_tokens.append(text)
|
555 |
+
if text_length > max_len:
|
556 |
+
max_len = text_length
|
557 |
+
|
558 |
+
transcript_out = torch.full([batch_size, max_len], fill_value=self.manifest_processor.pad_id, dtype=torch.long)
|
559 |
+
for i, n in enumerate(text_tokens_len):
|
560 |
+
transcript_out[i, :n] = torch.tensor(text_tokens[i], dtype=torch.long)
|
561 |
+
transcript_len_out = torch.tensor(text_tokens_len, dtype=torch.long)
|
562 |
+
|
563 |
+
out['transcript'] = transcript_out
|
564 |
+
out['transcript_len'] = transcript_len_out
|
565 |
+
return DALIOutputs(out)
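# --- Illustrative sketch: the transcript padding performed in __next__ above ---
# Variable-length token lists are written into a pad_id-filled [batch, max_len] LongTensor,
# mirroring the loop above; the token ids here are arbitrary placeholders.
def _pad_token_batch(token_lists, pad_id=0):
    lengths = torch.tensor([len(t) for t in token_lists], dtype=torch.long)
    padded = torch.full((len(token_lists), int(lengths.max())), fill_value=pad_id, dtype=torch.long)
    for i, tokens in enumerate(token_lists):
        padded[i, : len(tokens)] = torch.tensor(tokens, dtype=torch.long)
    return padded, lengths

# _pad_token_batch([[5, 6, 7], [8]]) -> (tensor([[5, 6, 7], [8, 0, 0]]), tensor([3, 1]))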
|
566 |
+
|
567 |
+
|
568 |
+
class AudioToCharDALIDataset(_AudioTextDALIDataset):
|
569 |
+
"""
|
570 |
+
Character-based NVIDIA DALI pipeline that loads tensors via one or more manifest files where each line contains a
|
571 |
+
sample descriptor in JSON, including audio files, transcripts, and durations (in seconds).
|
572 |
+
Here's an example:
|
573 |
+
{"audio_filepath": "/path/to/audio.wav", "text_filepath": "/path/to/audio.txt", "duration": 23.147}
|
574 |
+
...
|
575 |
+
{"audio_filepath": "/path/to/audio.wav", "text": "the transcription", "offset": 301.75, "duration": 0.82, "utt":
|
576 |
+
"utterance_id", "ctm_utt": "en_4156", "side": "A"}
|
577 |
+
|
578 |
+
Args:
|
579 |
+
manifest_filepath: Path to manifest file with the format described above. Can be comma-separated paths.
|
580 |
+
device (str): Determines the device type to be used for preprocessing. Allowed values are: 'cpu', 'gpu'.
|
581 |
+
batch_size (int): Number of samples in a batch.
|
582 |
+
labels (Union[str, List[str]]): String or list containing all the possible characters to map to.
|
583 |
+
sample_rate (int): Sample rate to resample loaded audio to.
|
584 |
+
num_threads (int): Number of CPU processing threads to be created by the DALI pipeline.
|
585 |
+
max_duration (float): Determines the maximum allowed duration, in seconds, of the loaded audio files.
|
586 |
+
min_duration (float): Determines the minimum allowed duration, in seconds, of the loaded audio files.
|
587 |
+
blank_index (int): blank character index, default = -1
|
588 |
+
unk_index (int): unknown character index, default = -1
|
589 |
+
normalize (bool): whether to normalize transcript text. Defaults to True.
|
590 |
+
bos_id (int): Id of beginning of sequence symbol to append if not None
|
591 |
+
eos_id (int): Id of end of sequence symbol to append if not None
|
592 |
+
pad_id (int): Id used to pad the input. Defaults to 0 if not provided.
|
593 |
+
trim (bool): If True, it will extract the nonsilent region of the loaded audio signal.
|
594 |
+
shuffle (bool): If set to True, the dataset will be shuffled after loading.
|
595 |
+
drop_last (bool): If set to True, the last batch will be dropped if incomplete. This will be the case when the shard size is not divisible by the batch size.
|
596 |
+
If set to False and the size of dataset is not divisible by the batch size, then the last batch will be smaller.
|
597 |
+
parser (str, callable): A str for an inbuilt parser, or a callable with signature f(str) -> List[int].
|
598 |
+
device_id (int): Index of the GPU to be used (local_rank). Only applicable when device == 'gpu'. Defaults to 0.
|
599 |
+
global_rank (int): Worker rank, used for partitioning shards. Defaults to 0.
|
600 |
+
world_size (int): Total number of processes, used for partitioning shards. Defaults to 1.
|
601 |
+
preprocessor_cfg (DictConfig): Preprocessor configuration. Supports AudioToMelSpectrogramPreprocessor and AudioToMFCCPreprocessor.
|
602 |
+
return_sample_id (bool): whether to return the sample_id as a part of each sample (not supported yet).
|
603 |
+
"""
|
604 |
+
|
605 |
+
def __init__(
|
606 |
+
self,
|
607 |
+
manifest_filepath: str,
|
608 |
+
device: str,
|
609 |
+
batch_size: int,
|
610 |
+
labels: Union[str, List[str]],
|
611 |
+
sample_rate: int = 16000,
|
612 |
+
audio_tar_filepaths: Optional[Union[str, List[str]]] = None,
|
613 |
+
audio_tar_index_filepaths: Optional[Union[str, List[str]]] = None,
|
614 |
+
num_threads: int = 4,
|
615 |
+
max_duration: float = 0.0,
|
616 |
+
min_duration: float = 0.0,
|
617 |
+
blank_index: int = -1,
|
618 |
+
unk_index: int = -1,
|
619 |
+
normalize: bool = True,
|
620 |
+
bos_id: Optional[int] = None,
|
621 |
+
eos_id: Optional[int] = None,
|
622 |
+
pad_id: int = 0,
|
623 |
+
trim: bool = False,
|
624 |
+
shuffle: bool = False,
|
625 |
+
drop_last: bool = False,
|
626 |
+
parser: Union[str, Callable] = 'en',
|
627 |
+
shard_strategy: str = "scatter",
|
628 |
+
device_id: int = 0,
|
629 |
+
global_rank: int = 0,
|
630 |
+
world_size: int = 1,
|
631 |
+
preprocessor_cfg: DictConfig = None,
|
632 |
+
return_sample_id: bool = False,
|
633 |
+
):
|
634 |
+
self.labels = labels
|
635 |
+
|
636 |
+
parser = parsers.make_parser(
|
637 |
+
labels=labels, name=parser, unk_id=unk_index, blank_id=blank_index, do_normalize=normalize
|
638 |
+
)
|
639 |
+
|
640 |
+
super().__init__(
|
641 |
+
manifest_filepath=manifest_filepath,
|
642 |
+
device=device,
|
643 |
+
batch_size=batch_size,
|
644 |
+
audio_tar_filepaths=audio_tar_filepaths,
|
645 |
+
audio_tar_index_filepaths=audio_tar_index_filepaths,
|
646 |
+
sample_rate=sample_rate,
|
647 |
+
num_threads=num_threads,
|
648 |
+
max_duration=max_duration,
|
649 |
+
min_duration=min_duration,
|
650 |
+
bos_id=bos_id,
|
651 |
+
eos_id=eos_id,
|
652 |
+
pad_id=pad_id,
|
653 |
+
trim=trim,
|
654 |
+
shuffle=shuffle,
|
655 |
+
drop_last=drop_last,
|
656 |
+
parser=parser,
|
657 |
+
shard_strategy=shard_strategy,
|
658 |
+
device_id=device_id,
|
659 |
+
global_rank=global_rank,
|
660 |
+
world_size=world_size,
|
661 |
+
preprocessor_cfg=preprocessor_cfg,
|
662 |
+
return_sample_id=return_sample_id,
|
663 |
+
)
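# --- Illustrative usage (assumption: hypothetical manifest path, placeholder labels) ---
# A minimal sketch of constructing the character-level DALI data layer with a mel-spectrogram
# preprocessor config; only keys actually read in _AudioTextDALIDataset.__init__ are set,
# everything else falls back to the defaults handled above.
from omegaconf import OmegaConf

_example_preproc_cfg = OmegaConf.create(
    {
        "_target_": "nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor",
        "sample_rate": 16000,
        "window_size": 0.02,
        "window_stride": 0.01,
        "features": 80,             # number of mel bands
        "normalize": "per_feature",
        "window": "hann",
    }
)

_example_layer = AudioToCharDALIDataset(
    manifest_filepath="train_manifest.json",
    device="gpu" if torch.cuda.is_available() else "cpu",
    batch_size=8,
    labels=[" ", "a", "b", "c"],    # placeholder character set
    sample_rate=16000,
    shuffle=True,
    preprocessor_cfg=_example_preproc_cfg,
)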
|
664 |
+
|
665 |
+
|
666 |
+
class AudioToBPEDALIDataset(_AudioTextDALIDataset):
|
667 |
+
"""
|
668 |
+
Subword-based NVIDIA DALI pipeline that loads tensors via one or more manifest files where each line contains a
|
669 |
+
sample descriptor in JSON, including audio files, transcripts, and durations (in seconds).
|
670 |
+
Here's an example:
|
671 |
+
{"audio_filepath": "/path/to/audio.wav", "text_filepath": "/path/to/audio.txt", "duration": 23.147}
|
672 |
+
...
|
673 |
+
{"audio_filepath": "/path/to/audio.wav", "text": "the transcription", "offset": 301.75, "duration": 0.82, "utt":
|
674 |
+
"utterance_id", "ctm_utt": "en_4156", "side": "A"}
|
675 |
+
|
676 |
+
Args:
|
677 |
+
manifest_filepath: Path to manifest file with the format described above. Can be comma-separated paths.
|
678 |
+
tokenizer (TokenizerSpec): A TokenizerSpec implementation that wraps a tokenization implementation.
|
679 |
+
device (str): Determines the device type to be used for preprocessing. Allowed values are: 'cpu', 'gpu'.
|
680 |
+
batch_size (int): Number of samples in a batch.
|
681 |
+
sample_rate (int): Sample rate to resample loaded audio to.
|
682 |
+
num_threads (int): Number of CPU processing threads to be created by the DALI pipeline.
|
683 |
+
max_duration (float): Determines the maximum allowed duration, in seconds, of the loaded audio files.
|
684 |
+
min_duration (float): Determines the minimum allowed duration, in seconds, of the loaded audio files.
|
685 |
+
bos_id (int): Id of beginning of sequence symbol to append if not None. Injected from the tokenizer.
|
686 |
+
eos_id (int): Id of end of sequence symbol to append if not None. Injected from the tokenizer.
|
687 |
+
pad_id (int): Id used to pad the input. Defaults to 0 if not provided. Injected from the tokenizer.
|
688 |
+
trim (bool): If True, it will extract the nonsilent region of the loaded audio signal.
|
689 |
+
shuffle (bool): If set to True, the dataset will shuffled after loading.
|
690 |
+
drop_last (bool): If set to True, the last batch will be dropped if incomplete. This will be the case when the shard size is not divisible by the batch size.
|
691 |
+
If set to False and the size of dataset is not divisible by the batch size, then the last batch will be smaller.
|
692 |
+
|
693 |
+
device_id (int): Index of the GPU to be used (local_rank). Only applicable when device == 'gpu'. Defaults to 0.
|
694 |
+
global_rank (int): Worker rank, used for partitioning shards. Defaults to 0.
|
695 |
+
world_size (int): Total number of processes, used for partitioning shards. Defaults to 1.
|
696 |
+
preprocessor_cfg (DictConfig): Preprocessor configuration. Supports AudioToMelSpectrogramPreprocessor and AudioToMFCCPreprocessor.
|
697 |
+
use_start_end_token (bool): Boolean which dictates whether to add [BOS] and [EOS] tokens to beginning and
|
698 |
+
ending of speech respectively.
|
699 |
+
return_sample_id (bool): whether to return the sample_id as a part of each sample (not supported yet).
|
700 |
+
"""
|
701 |
+
|
702 |
+
def __init__(
|
703 |
+
self,
|
704 |
+
manifest_filepath: str,
|
705 |
+
tokenizer: 'nemo.collections.common.tokenizers.TokenizerSpec',
|
706 |
+
device: str,
|
707 |
+
batch_size: int,
|
708 |
+
sample_rate: int = 16000,
|
709 |
+
audio_tar_filepaths: Optional[Union[str, List[str]]] = None,
|
710 |
+
audio_tar_index_filepaths: Optional[Union[str, List[str]]] = None,
|
711 |
+
num_threads: int = 4,
|
712 |
+
max_duration: float = 0.0,
|
713 |
+
min_duration: float = 0.0,
|
714 |
+
trim: bool = False,
|
715 |
+
shuffle: bool = False,
|
716 |
+
drop_last: bool = False,
|
717 |
+
shard_strategy: str = "scatter",
|
718 |
+
device_id: int = 0,
|
719 |
+
global_rank: int = 0,
|
720 |
+
world_size: int = 1,
|
721 |
+
preprocessor_cfg: DictConfig = None,
|
722 |
+
use_start_end_token: bool = True,
|
723 |
+
return_sample_id: bool = False,
|
724 |
+
):
|
725 |
+
|
726 |
+
if use_start_end_token and hasattr(tokenizer, 'bos_token'):
|
727 |
+
bos_id = tokenizer.bos_id
|
728 |
+
else:
|
729 |
+
bos_id = None
|
730 |
+
|
731 |
+
if use_start_end_token and hasattr(tokenizer, 'eos_token'):
|
732 |
+
eos_id = tokenizer.eos_id
|
733 |
+
else:
|
734 |
+
eos_id = None
|
735 |
+
|
736 |
+
if hasattr(tokenizer, 'pad_token'):
|
737 |
+
pad_id = tokenizer.pad_id
|
738 |
+
else:
|
739 |
+
pad_id = 0
|
740 |
+
|
741 |
+
class TokenizerWrapper:
|
742 |
+
def __init__(self, tokenizer):
|
743 |
+
self._tokenizer = tokenizer
|
744 |
+
|
745 |
+
def __call__(self, text):
|
746 |
+
t = self._tokenizer.text_to_ids(text)
|
747 |
+
return t
|
748 |
+
|
749 |
+
super().__init__(
|
750 |
+
manifest_filepath=manifest_filepath,
|
751 |
+
device=device,
|
752 |
+
batch_size=batch_size,
|
753 |
+
sample_rate=sample_rate,
|
754 |
+
audio_tar_filepaths=audio_tar_filepaths,
|
755 |
+
audio_tar_index_filepaths=audio_tar_index_filepaths,
|
756 |
+
num_threads=num_threads,
|
757 |
+
max_duration=max_duration,
|
758 |
+
min_duration=min_duration,
|
759 |
+
bos_id=bos_id,
|
760 |
+
eos_id=eos_id,
|
761 |
+
pad_id=pad_id,
|
762 |
+
trim=trim,
|
763 |
+
shuffle=shuffle,
|
764 |
+
drop_last=drop_last,
|
765 |
+
parser=TokenizerWrapper(tokenizer),
|
766 |
+
shard_strategy=shard_strategy,
|
767 |
+
device_id=device_id,
|
768 |
+
global_rank=global_rank,
|
769 |
+
world_size=world_size,
|
770 |
+
preprocessor_cfg=preprocessor_cfg,
|
771 |
+
return_sample_id=return_sample_id,
|
772 |
+
)
|
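A minimal usage sketch for the DALI pipelines defined above; it assumes NVIDIA DALI is installed, and the manifest path and SentencePiece model below are hypothetical placeholders rather than files in this commit:

# Hypothetical sketch -- manifest and tokenizer model paths are placeholders.
from nemo.collections.asr.data.audio_to_text_dali import AudioToBPEDALIDataset
from nemo.collections.common.tokenizers.sentencepiece_tokenizer import SentencePieceTokenizer

tokenizer = SentencePieceTokenizer(model_path="tokenizer.model")  # assumed pre-trained tokenizer
train_ds = AudioToBPEDALIDataset(
    manifest_filepath="train_manifest.json",  # JSON-lines manifest in the format described in the docstring
    tokenizer=tokenizer,
    device='gpu',        # or 'cpu' when no GPU is available
    batch_size=16,
    sample_rate=16000,
    shuffle=True,
)
# NeMo's ASR models use an object like this in place of a regular DataLoader: the DALI
# pipeline itself handles audio decoding, resampling and preprocessing and yields batches.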
SoundScribe/SpeakerID/nemo/collections/asr/data/audio_to_text_dataset.py
ADDED
@@ -0,0 +1,950 @@
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import copy
import json
import random
from math import isclose
from typing import Any, List, Optional, Union

import torch
from omegaconf import DictConfig, OmegaConf, open_dict
from omegaconf.listconfig import ListConfig
from pytorch_lightning.callbacks import BasePredictionWriter
from torch.utils.data import ChainDataset

from nemo.collections.asr.data import audio_to_text, audio_to_text_dali
from nemo.collections.asr.parts.preprocessing.perturb import process_augmentations
from nemo.collections.common.data.dataset import CodeSwitchedDataset, ConcatDataset
from nemo.utils import logging


def inject_dataloader_value_from_model_config(model_cfg: dict, dataloader_cfg: DictConfig, key: str):
    """
    Extracts the label set provided at the top level of the model, and propagates it to the dataloader
    config.

    Args:
        model_cfg: A DictConfig representing the model's config.
        dataloader_cfg: A DictConfig representing the individual data loader.
        key: A str value representing a key in the model_cfg whose value will be propagated to the
            dataloader config.
    """
    if key not in model_cfg:
        logging.info(
            f"Model level config does not contain `{key}`, please explicitly provide `{key}` to the dataloaders."
        )
        return

    if not isinstance(dataloader_cfg, DictConfig):
        dataloader_cfg = DictConfig(dataloader_cfg)

    # If key exists in the data loader config (either set explicitly or as a placeholder (via None))
    if key in dataloader_cfg:
        # Dataloader `labels` is provided and is non-null
        if dataloader_cfg[key] is not None and model_cfg[key] != dataloader_cfg[key]:
            # Model level `labels` don't match Dataloader level `labels`
            logging.warning(
                f'`{key}` is explicitly provided to the data loader, and is different from '
                f'the `{key}` provided at the model level config.\n'
                f'If this is incorrect, please set the dataloader\'s `{key}` to None.'
            )

        else:
            # Dataloader `key` is None or values match
            # Propagate from model level `key` (even if they match)
            with open_dict(dataloader_cfg):
                dataloader_cfg[key] = model_cfg[key]

    else:
        # If the key doesn't even exist in dataloader_cfg, inject it explicitly
        with open_dict(dataloader_cfg):
            dataloader_cfg[key] = model_cfg[key]


def get_concat_char_dataset(
    config: dict, global_rank: int, world_size: int, augmentor: Optional['AudioAugmentor'] = None
) -> ConcatDataset:
    """
    Instantiates an instance of ConcatDataset containing one or more instances of
    Character Encoding based AudioToCharDataset.

    Args:
        config: Config of the AudioToCharDataset.
        global_rank: Global rank of this device.
        world_size: Global world size in the training method.
        augmentor: Optional AudioAugmentor object for augmentations on audio data.

    Returns:
        An instance of ConcatDataset containing one or more instances of AudioToCharDataset.
    """
    if 'labels' not in config:
        logging.warning(f"dataset does not have explicitly defined labels")

    manifest_filepaths = config['manifest_filepath']
    datasets = []

    # needed to support validation Concat Datasets that arrive here as
    # [[dataset1,dataset2]] otherwise ModelPT would interfere
    if len(manifest_filepaths) == 1 and not isinstance(manifest_filepaths[0], str):
        logging.info(f"removing an extra nesting level from {manifest_filepaths}")
        manifest_filepaths = config['manifest_filepath'][0]

    for manifest_filepath in manifest_filepaths:
        conf = copy.deepcopy(config)
        conf['manifest_filepath'] = manifest_filepath

        dataset = get_char_dataset(config=conf, augmentor=augmentor)
        datasets.append(dataset)

    dataset = ConcatDataset(
        datasets,
        sampling_technique=config.get('concat_sampling_technique', 'temperature'),
        sampling_temperature=config.get('concat_sampling_temperature', 5),
        sampling_scale=config.get('concat_sampling_scale', 1),
        sampling_probabilities=config.get('concat_sampling_probabilities', None),
        shuffle=config.get('concat_shuffle', True),
        seed=config.get('concat_sampling_seed', None),
        global_rank=global_rank,
        world_size=world_size,
    )
    return dataset


def get_char_dataset(config: dict, augmentor: Optional['AudioAugmentor'] = None) -> audio_to_text.AudioToCharDataset:
    """
    Instantiates a Character Encoding based AudioToCharDataset.

    Args:
        config: Config of the AudioToCharDataset.
        augmentor: Optional AudioAugmentor object for augmentations on audio data.

    Returns:
        An instance of AudioToCharDataset.
    """
    if 'labels' not in config:
        logging.warning(f"dataset does not have explicitly defined labels")

    dataset = audio_to_text.AudioToCharDataset(
        manifest_filepath=config['manifest_filepath'],
        labels=config.get('labels', None),
        sample_rate=config['sample_rate'],
        int_values=config.get('int_values', False),
        augmentor=augmentor,
        max_duration=config.get('max_duration', None),
        min_duration=config.get('min_duration', None),
        max_utts=config.get('max_utts', 0),
        blank_index=config.get('blank_index', -1),
        unk_index=config.get('unk_index', -1),
        normalize=config.get('normalize_transcripts', False),
        trim=config.get('trim_silence', False),
        parser=config.get('parser', 'en'),
        return_sample_id=config.get('return_sample_id', False),
        channel_selector=config.get('channel_selector', None),
    )
    return dataset


def get_concat_bpe_dataset(
    config: dict,
    tokenizer: 'TokenizerSpec',
    global_rank: int,
    world_size: int,
    augmentor: Optional['AudioAugmentor'] = None,
) -> ConcatDataset:
    """
    Instantiates a ConcatDataset based on several Byte Pair Encoding / Word Piece Encoding based AudioToBPEDatasets.

    Args:
        config: Config of the AudioToBPEDataset.
        tokenizer: An instance of a TokenizerSpec object.
        global_rank: Global rank of this device.
        world_size: Global world size in the training method.
        augmentor: Optional AudioAugmentor object for augmentations on audio data.

    Returns:
        An instance of ConcatDataset containing several instances of AudioToBPEDataset.
    """
    manifest_filepaths = config['manifest_filepath']
    datasets = []

    # needed to support validation Concat Datasets that arrive here as
    # [[dataset1,dataset2]] otherwise ModelPT would interfere
    if len(manifest_filepaths) == 1 and not isinstance(manifest_filepaths[0], str):
        logging.info(f"removing an extra nesting level from {manifest_filepaths}")
        manifest_filepaths = config['manifest_filepath'][0]

    for manifest_filepath in manifest_filepaths:
        conf = copy.deepcopy(config)
        conf['manifest_filepath'] = manifest_filepath
        dataset = get_bpe_dataset(config=conf, tokenizer=tokenizer, augmentor=augmentor)
        datasets.append(dataset)

    dataset = ConcatDataset(
        datasets,
        sampling_technique=config.get('concat_sampling_technique', 'temperature'),
        sampling_temperature=config.get('concat_sampling_temperature', 5),
        sampling_scale=config.get('concat_sampling_scale', 1),
        sampling_probabilities=config.get('concat_sampling_probabilities', None),
        shuffle=config.get('concat_shuffle', True),
        seed=config.get('concat_sampling_seed', None),
        global_rank=global_rank,
        world_size=world_size,
    )
    return dataset


def get_bpe_dataset(
    config: dict, tokenizer: 'TokenizerSpec', augmentor: Optional['AudioAugmentor'] = None
) -> audio_to_text.AudioToBPEDataset:
    """
    Instantiates a Byte Pair Encoding / Word Piece Encoding based AudioToBPEDataset.

    Args:
        config: Config of the AudioToBPEDataset.
        tokenizer: An instance of a TokenizerSpec object.
        augmentor: Optional AudioAugmentor object for augmentations on audio data.

    Returns:
        An instance of AudioToBPEDataset.
    """
    dataset = audio_to_text.AudioToBPEDataset(
        manifest_filepath=config['manifest_filepath'],
        tokenizer=tokenizer,
        sample_rate=config['sample_rate'],
        int_values=config.get('int_values', False),
        augmentor=augmentor,
        max_duration=config.get('max_duration', None),
        min_duration=config.get('min_duration', None),
        max_utts=config.get('max_utts', 0),
        trim=config.get('trim_silence', False),
        use_start_end_token=config.get('use_start_end_token', True),
        return_sample_id=config.get('return_sample_id', False),
        channel_selector=config.get('channel_selector', None),
    )
    return dataset


def get_concat_tarred_dataset(
    config: dict,
    shuffle_n: int,
    global_rank: int,
    world_size: int,
    tokenizer: Optional['TokenizerSpec'] = None,
    augmentor: Optional['AudioAugmentor'] = None,
) -> ConcatDataset:
    """
    Instantiates a ConcatDataset containing multiple Word Piece/BPE Encoding based TarredAudioToBPEDataset or a char based TarredAudioToCharDataset.

    Args:
        config: Config of the TarredAudioToBPEDataset or TarredAudioToCharDataset.
        shuffle_n: How many samples to look ahead and load to be shuffled.
            See WebDataset documentation for more details.
        tokenizer: An instance of a TokenizerSpec object if a BPE dataset is needed.
            Passing None would return a char-based dataset.
        global_rank: Global rank of this device.
        world_size: Global world size in the training method.
        augmentor: Optional AudioAugmentor object for augmentations on audio data.

    Returns:
        An instance of ConcatDataset containing one or more TarredAudioToBPEDatasets or TarredAudioToCharDatasets.
    """

    tarred_audio_filepaths = config['tarred_audio_filepaths']
    manifest_filepaths = config['manifest_filepath']
    datasets = []
    for dataset_idx, (tarred_audio_filepath, manifest_filepath) in enumerate(
        zip(tarred_audio_filepaths, manifest_filepaths)
    ):
        conf = copy.deepcopy(config)
        conf['manifest_filepath'] = manifest_filepath
        conf['tarred_audio_filepaths'] = tarred_audio_filepath
        dataset = get_tarred_dataset(
            config=conf,
            tokenizer=tokenizer,
            shuffle_n=shuffle_n,
            global_rank=global_rank,
            world_size=world_size,
            augmentor=augmentor,
        )
        datasets.append(dataset)

    dataset = ConcatDataset(
        datasets,
        sampling_technique=config.get('concat_sampling_technique', 'temperature'),
        sampling_temperature=config.get('concat_sampling_temperature', 5),
        sampling_scale=config.get('concat_sampling_scale', 1),
        sampling_probabilities=config.get('concat_sampling_probabilities', None),
        shuffle=config.get('concat_shuffle', True),
        seed=config.get('concat_sampling_seed', None),
        global_rank=global_rank,
        world_size=world_size,
    )
    return dataset


def get_tarred_dataset(
    config: dict,
    shuffle_n: int,
    global_rank: int,
    world_size: int,
    tokenizer: Optional['TokenizerSpec'] = None,
    augmentor: Optional['AudioAugmentor'] = None,
) -> Union[audio_to_text.TarredAudioToBPEDataset, audio_to_text.TarredAudioToCharDataset]:
    """
    Instantiates a Word Piece/BPE Encoding based TarredAudioToBPEDataset or a char based TarredAudioToCharDataset.

    Args:
        config: Config of the TarredAudioToBPEDataset or TarredAudioToCharDataset.
        shuffle_n: How many samples to look ahead and load to be shuffled.
            See WebDataset documentation for more details.
        tokenizer: An instance of a TokenizerSpec object if a BPE dataset is needed.
            Passing None would return a char-based dataset.
        global_rank: Global rank of this device.
        world_size: Global world size in the training method.
        augmentor: Optional AudioAugmentor object for augmentations on audio data.

    Returns:
        An instance of TarredAudioToBPEDataset or TarredAudioToCharDataset.
    """
    tarred_audio_filepaths = config['tarred_audio_filepaths']
    manifest_filepaths = config['manifest_filepath']
    datasets = []
    tarred_audio_filepaths = convert_to_config_list(tarred_audio_filepaths)
    manifest_filepaths = convert_to_config_list(manifest_filepaths)

    bucketing_weights = config.get('bucketing_weights', None)  # For upsampling buckets
    if bucketing_weights:
        for idx, weight in enumerate(bucketing_weights):
            if not isinstance(weight, int) or weight <= 0:
                raise ValueError(f"bucket weights must be positive integers")

    if len(manifest_filepaths) != len(tarred_audio_filepaths):
        raise ValueError(
            f"manifest_filepaths (length={len(manifest_filepaths)}) and tarred_audio_filepaths (length={len(tarred_audio_filepaths)}) need to have the same number of buckets."
        )

    if 'labels' not in config:
        logging.warning(f"dataset does not have explicitly defined labels")

    if 'max_utts' in config:
        raise ValueError('"max_utts" parameter is not supported for tarred datasets')

    for dataset_idx, (tarred_audio_filepath, manifest_filepath) in enumerate(
        zip(tarred_audio_filepaths, manifest_filepaths)
    ):
        if len(tarred_audio_filepath) == 1:
            tarred_audio_filepath = tarred_audio_filepath[0]
        if len(manifest_filepath) == 1:
            manifest_filepath = manifest_filepath[0]

        if tokenizer is None:
            dataset = audio_to_text.TarredAudioToCharDataset(
                audio_tar_filepaths=tarred_audio_filepath,
                manifest_filepath=manifest_filepath,
                labels=config.get('labels', None),
                sample_rate=config['sample_rate'],
                int_values=config.get('int_values', False),
                augmentor=augmentor,
                shuffle_n=shuffle_n,
                max_duration=config.get('max_duration', None),
                min_duration=config.get('min_duration', None),
                blank_index=config.get('blank_index', -1),
                unk_index=config.get('unk_index', -1),
                normalize=config.get('normalize_transcripts', False),
                trim=config.get('trim_silence', False),
                parser=config.get('parser', 'en'),
                shard_strategy=config.get('tarred_shard_strategy', 'scatter'),
                shard_manifests=config.get('shard_manifests', False),
                global_rank=global_rank,
                world_size=world_size,
                return_sample_id=config.get('return_sample_id', False),
            )
        else:
            dataset = audio_to_text.TarredAudioToBPEDataset(
                audio_tar_filepaths=tarred_audio_filepath,
                manifest_filepath=manifest_filepath,
                tokenizer=tokenizer,
                sample_rate=config['sample_rate'],
                int_values=config.get('int_values', False),
                augmentor=augmentor,
                shuffle_n=shuffle_n,
                max_duration=config.get('max_duration', None),
                min_duration=config.get('min_duration', None),
                trim=config.get('trim_silence', False),
                use_start_end_token=config.get('use_start_end_token', True),
                shard_strategy=config.get('tarred_shard_strategy', 'scatter'),
                shard_manifests=config.get('shard_manifests', False),
                global_rank=global_rank,
                world_size=world_size,
                return_sample_id=config.get('return_sample_id', False),
            )
        if bucketing_weights:
            [datasets.append(dataset) for _ in range(bucketing_weights[dataset_idx])]
        else:
            datasets.append(dataset)

    return get_chain_dataset(datasets=datasets, ds_config=config, rank=global_rank)


def get_code_switched_dataset(
    config: dict,
    shuffle_n: int,
    global_rank: int,
    world_size: int,
    tokenizer: Optional['TokenizerSpec'] = None,
    augmentor: Optional['AudioAugmentor'] = None,
) -> CodeSwitchedDataset:

    if 'manifest_filepath' not in config:
        raise ValueError("`manifest_filepath` must be provided in the dataset config if `is_code_switched=True`")
    if 'code_switched' not in config:
        raise ValueError("`code_switched` param group must be in the dataset config if `is_code_switched=True`")

    manifest_filepaths = config['manifest_filepath']
    tarred_audio_filepaths = config.get('tarred_audio_filepaths', None)

    cs_config = OmegaConf.to_container(config['code_switched'])

    # needed to support validation Datasets that arrive here as
    # [[dataset1,dataset2]] otherwise ModelPT would interfere
    if len(manifest_filepaths) == 1 and not isinstance(manifest_filepaths[0], str):
        manifest_filepaths = config['manifest_filepath'][0]
    if tarred_audio_filepaths is None:
        tarred_audio_filepaths = [None] * len(manifest_filepaths)

    if len(manifest_filepaths) != len(tarred_audio_filepaths):
        raise ValueError(
            f"manifest_filepaths (length={len(manifest_filepaths)}) and tarred_audio_filepaths (length={len(tarred_audio_filepaths)}) need to have the same number of items."
        )

    datasets = []
    for dataset_idx, (tarred_audio_filepath, manifest_filepath) in enumerate(
        zip(tarred_audio_filepaths, manifest_filepaths)
    ):
        conf = copy.deepcopy(config)
        conf['manifest_filepath'] = manifest_filepath
        with open_dict(conf):
            conf['tarred_audio_filepaths'] = tarred_audio_filepath
        if tarred_audio_filepath is None or len(tarred_audio_filepath) == 0:
            if tokenizer is None:
                dataset = get_char_dataset(config=conf, augmentor=None)
            else:
                dataset = get_bpe_dataset(config=conf, tokenizer=tokenizer, augmentor=None)
        else:
            dataset = get_tarred_dataset(
                config=conf,
                tokenizer=tokenizer,
                shuffle_n=shuffle_n,
                global_rank=global_rank,
                world_size=world_size,
                augmentor=None,
            )
        datasets.append(dataset)

    config = OmegaConf.to_container(config)

    dataset = CodeSwitchedDataset(
        datasets,
        shuffle=cs_config.get('shuffle', True),
        min_duration=cs_config.get('min_duration', 4),
        max_duration=cs_config.get('max_duration', 20),
        min_monolingual=cs_config.get('min_monolingual', 0.3),
        lang_probs=cs_config.get('probs', None),
        db_norm=cs_config.get('db_norm', -25.0),
        pause_start=cs_config.get('pause_start', 0),
        pause_join=cs_config.get('pause_join', 0),
        pause_end=cs_config.get('pause_end', 0),
        sampling_scales=cs_config.get('sampling_scales', None),
        seed=cs_config.get('seed', None),
        global_rank=global_rank,
        world_size=world_size,
        pure_random=cs_config.get('pure_random', False),
        force_monochannel=cs_config.get('force_monochannel', True),
        infinity_mode=cs_config.get('infinity_mode', False),
        sample_rate=config['sample_rate'],
        augmentor=augmentor,
    )

    return dataset


def get_dali_char_dataset(
    config: dict,
    shuffle: bool,
    device_id: int,
    global_rank: int,
    world_size: int,
    preprocessor_cfg: Optional[DictConfig] = None,
) -> audio_to_text_dali.AudioToCharDALIDataset:
    """
    Instantiates a Character Encoding based AudioToCharDALIDataset.

    Args:
        config: Config of the AudioToCharDALIDataset.
        shuffle: Bool flag whether to shuffle the dataset.
        device_id: Index of the GPU to be used (local_rank). Only applicable when device == 'gpu'. Defaults to 0.
        global_rank: Global rank of this device.
        world_size: Global world size in the training method.
        preprocessor_cfg: Preprocessor configuration. Supports AudioToMelSpectrogramPreprocessor and AudioToMFCCPreprocessor.

    Returns:
        An instance of AudioToCharDALIDataset.
    """
    device = 'gpu' if torch.cuda.is_available() else 'cpu'
    dataset = audio_to_text_dali.AudioToCharDALIDataset(
        manifest_filepath=config['manifest_filepath'],
        device=device,
        batch_size=config['batch_size'],
        labels=config['labels'],
        sample_rate=config['sample_rate'],
        audio_tar_filepaths=config.get('tarred_audio_filepaths', None),
        audio_tar_index_filepaths=config.get('tarred_audio_index_filepaths', None),
        max_duration=config.get('max_duration', None),
        min_duration=config.get('min_duration', None),
        blank_index=config.get('blank_index', -1),
        unk_index=config.get('unk_index', -1),
        normalize=config.get('normalize_transcripts', False),
        trim=config.get('trim_silence', False),
        parser=config.get('parser', 'en'),
        shuffle=shuffle,
        shard_strategy=config.get('tarred_shard_strategy', 'scatter'),
        device_id=device_id,
        global_rank=global_rank,
        world_size=world_size,
        preprocessor_cfg=preprocessor_cfg,
        return_sample_id=config.get('return_sample_id', False),
    )
    return dataset


def get_dali_bpe_dataset(
    config: dict,
    tokenizer,
    shuffle: bool,
    device_id: int,
    global_rank: int,
    world_size: int,
    preprocessor_cfg: Optional[DictConfig] = None,
) -> audio_to_text_dali.AudioToBPEDALIDataset:
    """
    Instantiates a Subword Encoding based AudioToBPEDALIDataset.

    Args:
        config: Config of the AudioToBPEDALIDataset.
        tokenizer: An implementation of NeMo TokenizerSpec.
        shuffle: Bool flag whether to shuffle the dataset.
        device_id: Index of the GPU to be used (local_rank). Only applicable when device == 'gpu'. Defaults to 0.
        global_rank: Global rank of this device.
        world_size: Global world size in the training method.
        preprocessor_cfg: Preprocessor configuration. Supports AudioToMelSpectrogramPreprocessor and AudioToMFCCPreprocessor.

    Returns:
        An instance of AudioToBPEDALIDataset.
    """
    device = 'gpu' if torch.cuda.is_available() else 'cpu'
    dataset = audio_to_text_dali.AudioToBPEDALIDataset(
        manifest_filepath=config['manifest_filepath'],
        tokenizer=tokenizer,
        device=device,
        batch_size=config['batch_size'],
        sample_rate=config['sample_rate'],
        audio_tar_filepaths=config.get('tarred_audio_filepaths', None),
        audio_tar_index_filepaths=config.get('tarred_audio_index_filepaths', None),
        max_duration=config.get('max_duration', None),
        min_duration=config.get('min_duration', None),
        trim=config.get('trim_silence', False),
        use_start_end_token=config.get('use_start_end_token', True),
        shuffle=shuffle,
        shard_strategy=config.get('tarred_shard_strategy', 'scatter'),
        device_id=device_id,
        global_rank=global_rank,
        world_size=world_size,
        preprocessor_cfg=preprocessor_cfg,
        return_sample_id=config.get('return_sample_id', False),
    )
    return dataset


def get_audio_to_text_char_dataset_from_config(
    config, local_rank: int, global_rank: int, world_size: int, preprocessor_cfg: Optional[DictConfig] = None
):
    """
    Construct Audio-To-Text Char dataset from a config.
    Args:
        config: dataset config
        local_rank: model local rank
        global_rank: model global rank
        world_size: world size
        preprocessor_cfg: preprocessor config, for DALI dataset

    Returns:
        constructed dataset or None if dataset config is invalid or nothing to load
    """
    if 'augmentor' in config:
        augmentor = process_augmentations(config['augmentor'], global_rank=global_rank, world_size=world_size)
    else:
        augmentor = None

    is_concat = config.get('is_concat', False)
    if is_concat:
        if 'concat_sampling_technique' in config and config['concat_sampling_technique'] is None:
            logging.warning(
                f"Concat dataset requires `concat_sampling_technique` but it was not provided. Config: {config}"
            )
            return None
        if config['concat_sampling_technique'] == 'random':
            if not 'concat_sampling_probabilities' in config:
                logging.warning(f"Concat dataset requires `concat_sampling_probabilities` list. Config: {config}")
                return None
            else:
                if not isclose(sum(config['concat_sampling_probabilities']), 1, abs_tol=1e-6):
                    logging.warning(f"`concat_sampling_probabilities` need to sum to 1. Config: {config}")
                    return None

    shuffle = config['shuffle']
    device = 'gpu' if torch.cuda.is_available() else 'cpu'
    if config.get('use_dali', False):
        device_id = local_rank if device == 'gpu' else None
        dataset = get_dali_char_dataset(
            config=config,
            shuffle=shuffle,
            device_id=device_id,
            global_rank=global_rank,
            world_size=world_size,
            preprocessor_cfg=preprocessor_cfg,
        )
        return dataset

    # Instantiate a code-switched dataset if config is present
    if config.get('is_code_switched', False):
        if 'manifest_filepath' in config and config['manifest_filepath'] is None:
            logging.warning(f"Could not load dataset as `manifest_filepath` was None. Provided config : {config}")
            return None
        if not ('code_switched' in config and config['code_switched'] is not None):
            logging.warning(
                f"Code switched dataset requires `*_ds.code_switched.*` dict but it was not provided. Config: {config}"
            )
            return None
        if (
            ('probs' in config['code_switched'])
            and (config['code_switched']['probs'] is not None)
            and (not isclose(sum(config['code_switched']['probs']), 1, abs_tol=1e-6))
        ):
            logging.warning(f"`.code_switched.probs` need to sum to 1. Config: {config['code_switched']}")
            return None

        shuffle_n = config.get('shuffle_n', 4 * config['batch_size']) if shuffle else 0
        dataset = get_code_switched_dataset(
            config=config,
            shuffle_n=shuffle_n,
            global_rank=global_rank,
            world_size=world_size,
            tokenizer=None,
            augmentor=augmentor,
        )
    # Instantiate tarred dataset loader or normal dataset loader
    elif config.get('is_tarred', False):
        if ('tarred_audio_filepaths' in config and config['tarred_audio_filepaths'] is None) or (
            'manifest_filepath' in config and config['manifest_filepath'] is None
        ):
            logging.warning(
                "Could not load dataset as `manifest_filepath` was None or "
                f"`tarred_audio_filepaths` is None. Provided config : {config}"
            )
            return None

        shuffle_n = config.get('shuffle_n', 4 * config['batch_size']) if shuffle else 0
        if is_concat:
            dataset = get_concat_tarred_dataset(
                config=config,
                shuffle_n=shuffle_n,
                global_rank=global_rank,
                world_size=world_size,
                augmentor=augmentor,
            )
        else:
            dataset = get_tarred_dataset(
                config=config,
                shuffle_n=shuffle_n,
                global_rank=global_rank,
                world_size=world_size,
                augmentor=augmentor,
            )
    else:
        if 'manifest_filepath' in config and config['manifest_filepath'] is None:
            logging.warning(f"Could not load dataset as `manifest_filepath` was None. Provided config : {config}")
            return None
        if is_concat:
            dataset = get_concat_char_dataset(
                config=config, global_rank=global_rank, world_size=world_size, augmentor=augmentor
            )
        else:
            dataset = get_char_dataset(config=config, augmentor=augmentor)
    return dataset


def get_audio_to_text_bpe_dataset_from_config(
    config,
    local_rank: int,
    global_rank: int,
    world_size: int,
    tokenizer,
    preprocessor_cfg: Optional[DictConfig] = None,
):
    """
    Construct Audio-To-Text BPE dataset from a config.
    Args:
        config: BPE dataset config
        local_rank: model local rank
        global_rank: model global rank
        world_size: world size
        tokenizer: BPE tokenizer
        preprocessor_cfg: preprocessor config, for DALI BPE dataset

    Returns:
        constructed dataset or None if dataset config is invalid or nothing to load
    """
    if 'augmentor' in config:
        augmentor = process_augmentations(config['augmentor'], global_rank=global_rank, world_size=world_size)
    else:
        augmentor = None

    is_concat = config.get('is_concat', False)
    if is_concat:
        if 'concat_sampling_technique' in config and config['concat_sampling_technique'] is None:
            logging.warning(
                f"Concat dataset requires `concat_sampling_technique` but it was not provided. Config: {config}"
            )
            return None

        if config['concat_sampling_technique'] == 'random':
            if not 'concat_sampling_probabilities' in config:
                logging.warning(f"Concat dataset requires `concat_sampling_probabilities` list. Config: {config}")
                return None
            else:
                if not isclose(sum(config['concat_sampling_probabilities']), 1, abs_tol=1e-6):
                    logging.warning(f"`concat_sampling_probabilities` need to sum to 1. Config: {config}")
                    return None

    shuffle = config['shuffle']
    device = 'gpu' if torch.cuda.is_available() else 'cpu'
    if config.get('use_dali', False):
        device_id = local_rank if device == 'gpu' else None
        dataset = get_dali_bpe_dataset(
            config=config,
            tokenizer=tokenizer,
            shuffle=shuffle,
            device_id=device_id,
            global_rank=global_rank,
            world_size=world_size,
            preprocessor_cfg=preprocessor_cfg,
        )
        return dataset

    # Instantiate a code-switched dataset if config is present
    if config.get('is_code_switched', False):
        if 'manifest_filepath' in config and config['manifest_filepath'] is None:
            logging.warning(f"Could not load dataset as `manifest_filepath` was None. Provided config : {config}")
            return None
        if not ('code_switched' in config and config['code_switched'] is not None):
            logging.warning(
                f"Code switched dataset requires `*_ds.code_switched.*` dict but it was not provided. Config: {config}"
            )
            return None
        if (
            ('probs' in config['code_switched'])
            and (config['code_switched']['probs'] is not None)
            and (not isclose(sum(config['code_switched']['probs']), 1, abs_tol=1e-6))
        ):
            logging.warning(f"`.code_switched.probs` need to sum to 1. Config: {config['code_switched']}")
            return None

        shuffle_n = config.get('shuffle_n', 4 * config['batch_size']) if shuffle else 0
        dataset = get_code_switched_dataset(
            config=config,
            shuffle_n=shuffle_n,
            global_rank=global_rank,
            world_size=world_size,
            tokenizer=tokenizer,
            augmentor=augmentor,
        )
    # Instantiate tarred dataset loader or normal dataset loader
    elif config.get('is_tarred', False):
        if ('tarred_audio_filepaths' in config and config['tarred_audio_filepaths'] is None) or (
            'manifest_filepath' in config and config['manifest_filepath'] is None
        ):
            logging.warning(
                "Could not load dataset as `manifest_filepath` was None or "
                f"`tarred_audio_filepaths` is None. Provided config : {config}"
            )
            return None

        shuffle_n = config.get('shuffle_n', 4 * config['batch_size']) if shuffle else 0
        if is_concat:
            dataset = get_concat_tarred_dataset(
                config=config,
                tokenizer=tokenizer,
                shuffle_n=shuffle_n,
                global_rank=global_rank,
                world_size=world_size,
                augmentor=augmentor,
            )
        else:
            dataset = get_tarred_dataset(
                config=config,
                tokenizer=tokenizer,
                shuffle_n=shuffle_n,
                global_rank=global_rank,
                world_size=world_size,
                augmentor=augmentor,
            )
    else:
        if 'manifest_filepath' in config and config['manifest_filepath'] is None:
            logging.warning(f"Could not load dataset as `manifest_filepath` was None. Provided config : {config}")
            return None
        if is_concat:
            dataset = get_concat_bpe_dataset(
                config=config,
                global_rank=global_rank,
                world_size=world_size,
                tokenizer=tokenizer,
                augmentor=augmentor,
            )
        else:
            dataset = get_bpe_dataset(config=config, tokenizer=tokenizer, augmentor=augmentor)
    return dataset


class ASRPredictionWriter(BasePredictionWriter):
    def __init__(self, dataset, output_file: str):
        super().__init__(write_interval="batch")
        self.outf = open(output_file, 'w', encoding='utf-8')
        self.dataset = dataset
        self.samples_num = 0

    def write_on_batch_end(
        self,
        trainer,
        pl_module: 'LightningModule',
        prediction: Any,
        batch_indices: List[int],
        batch: Any,
        batch_idx: int,
        dataloader_idx: int,
    ):
        for sample_id, transcribed_text in prediction:
            item = {}
            sample = self.dataset.get_manifest_sample(sample_id)
            item["audio_filepath"] = sample.audio_file
            item["offset"] = sample.offset
            item["duration"] = sample.duration
            item["text"] = sample.text_raw
            item["pred_text"] = transcribed_text
            self.outf.write(json.dumps(item) + "\n")
            self.samples_num += 1
        return

    def close_output_file(self):
        self.outf.close()
        return self.samples_num


def convert_to_config_list(initial_list):
    if type(initial_list) is str:
        initial_list = initial_list.split(",")
    if initial_list is None or initial_list == []:
        raise ValueError("manifest_filepaths and tarred_audio_filepaths must not be empty.")
    if not isinstance(initial_list, ListConfig):
        initial_list = ListConfig([initial_list])

    for list_idx, list_val in enumerate(initial_list):
        if type(list_val) != type(initial_list[0]):
            raise ValueError(
                "manifest_filepaths and tarred_audio_filepaths need to be a list of lists for bucketing or just a list of strings"
            )
    if type(initial_list[0]) is not ListConfig:
        initial_list = ListConfig([initial_list])
    return initial_list


def get_chain_dataset(datasets, ds_config, rank=0):
    if len(datasets) > 1:
        if ds_config.get('bucketing_batch_size', None) is not None:
            bucketing_batch_sizes = calc_bucketing_batch_sizes(ds_config, len(datasets))
            logging.info(
                f"Batch bucketing is enabled for {len(datasets)} buckets with adaptive batch sizes of {bucketing_batch_sizes}!"
            )
            for idx, dataset in enumerate(datasets):
                datasets[idx] = audio_to_text.BucketingDataset(
                    dataset=dataset, bucketing_batch_size=bucketing_batch_sizes[idx]
                )
        else:
            logging.info(
                f"Batch bucketing is enabled for {len(datasets)} buckets with fixed batch size of {ds_config['batch_size']}!"
            )

    if len(datasets) == 1:
        return datasets[0]
    bucketing_strategy = ds_config.get('bucketing_strategy', 'synced_randomized')
    if bucketing_strategy == 'fixed_order':
        return ChainDataset(datasets)
    elif bucketing_strategy == 'synced_randomized':
        return audio_to_text.RandomizedChainDataset(datasets=datasets, rnd_seed=0)
    elif bucketing_strategy == 'fully_randomized':
        return audio_to_text.RandomizedChainDataset(datasets=datasets, rnd_seed=random.randint(0, 30000) + rank)
    else:
        raise ValueError(
            f'bucketing_strategy={bucketing_strategy} is not supported! Supported strategies are [fixed_order, fully_randomized, synced_randomized].'
        )


def calc_bucketing_batch_sizes(ds_config, datasets_len):
    bucketing_batch_size = ds_config['bucketing_batch_size']
    bucketing_weights = ds_config.get('bucketing_weights', None)  # To adjust for upsampled buckets

    bucketing_batch_sizes = []

    if ds_config['batch_size'] != 1:
        raise ValueError(
            f"batch_size should be set to one when bucketing_batch_size is set and adaptive bucketing is enabled (batch_size={ds_config['batch_size']})!"
        )
    if type(bucketing_batch_size) == int:  # linear scaling
        if bucketing_weights:  # Want the same batch size for the same duplicated bucket
            for idx, weight in enumerate(bucketing_weights):
                scale_factor = datasets_len - idx
                [bucketing_batch_sizes.append(scale_factor * bucketing_batch_size) for _ in range(weight)]
        else:
            for idx in range(datasets_len):
                scale_factor = datasets_len - idx
                bucketing_batch_sizes.append(scale_factor * bucketing_batch_size)
    elif isinstance(bucketing_batch_size, ListConfig) or isinstance(
        bucketing_batch_size, list
    ):  # assigned bucket sizes
        if bucketing_weights:  # Want the same batch size for the same duplicated bucket
            for idx, weight in enumerate(bucketing_weights):
                [bucketing_batch_sizes.append(bucketing_batch_size[idx]) for _ in range(weight)]
        else:
            bucketing_batch_sizes = bucketing_batch_size
    else:
        raise ValueError(
            f"bucketing_batch_size should be an integer or a list (bucketing_batch_size={bucketing_batch_size})!"
        )

    if len(bucketing_batch_sizes) != datasets_len:
        raise ValueError(
            f"batch_size should have the same length as the number of buckets ({len(bucketing_batch_sizes)}!={datasets_len}) "
        )
    return bucketing_batch_sizes
SoundScribe/SpeakerID/nemo/collections/asr/data/data_simulation.py
ADDED
The diff for this file is too large to render.
See raw diff
SoundScribe/SpeakerID/nemo/collections/asr/data/feature_to_label.py
ADDED
@@ -0,0 +1,497 @@
1 |
+
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
from typing import Dict, List, Optional
|
15 |
+
|
16 |
+
import torch
|
17 |
+
|
18 |
+
from nemo.collections.asr.parts.preprocessing.feature_loader import ExternalFeatureLoader
|
19 |
+
from nemo.collections.common.parts.preprocessing import collections
|
20 |
+
from nemo.core.classes import Dataset
|
21 |
+
from nemo.core.neural_types import AcousticEncodedRepresentation, LabelsType, LengthsType, NeuralType
|
22 |
+
from nemo.utils import logging
|
23 |
+
|
24 |
+
|
25 |
+
def _feature_collate_fn(batch):
|
26 |
+
"""collate batch of feat sig, feat len, labels, labels len, assuming all features have the same shape.
|
27 |
+
Args:
|
28 |
+
batch (FloatTensor, LongTensor, LongTensor, LongTensor): A tuple of tuples of feature, feature lengths,
|
29 |
+
encoded labels, and encoded labels length.
|
30 |
+
"""
|
31 |
+
packed_batch = list(zip(*batch))
|
32 |
+
if len(packed_batch) == 5:
|
33 |
+
_, feat_lengths, _, labels_lengths, sample_ids = packed_batch
|
34 |
+
elif len(packed_batch) == 4:
|
35 |
+
sample_ids = None
|
36 |
+
_, feat_lengths, _, labels_lengths = packed_batch
|
37 |
+
else:
|
38 |
+
raise ValueError("Expects 4 or 5 tensors in the batch!")
|
39 |
+
|
40 |
+
features, labels = [], []
|
41 |
+
for b in batch:
|
42 |
+
feat_i, labels_i = b[0], b[2]
|
43 |
+
features.append(feat_i)
|
44 |
+
labels.append(labels_i)
|
45 |
+
|
46 |
+
features = torch.stack(features)
|
47 |
+
feat_lengths = torch.stack(feat_lengths)
|
48 |
+
|
49 |
+
labels = torch.stack(labels)
|
50 |
+
labels_lengths = torch.stack(labels_lengths)
|
51 |
+
|
52 |
+
if sample_ids is None:
|
53 |
+
return features, feat_lengths, labels, labels_lengths
|
54 |
+
else:
|
55 |
+
sample_ids = torch.tensor(sample_ids, dtype=torch.int32)
|
56 |
+
return features, feat_lengths, labels, labels_lengths, sample_ids
|
57 |
+
|
58 |
+
|
59 |
+
def _audio_feature_collate_fn(batch, feat_pad_val, label_pad_id):
|
60 |
+
"""collate batch of audio feature, audio len, labels, labels len
|
61 |
+
Args:
|
62 |
+
batch (Optional[FloatTensor], Optional[LongTensor], LongTensor,
|
63 |
+
LongTensor): A tuple of tuples of feature, feature lengths,
|
64 |
+
labels, and label lengths. This collate func assumes the
|
65 |
+
features are torch tensors of Log-Melspectrogram (i.e. [N_MEL, T]).
|
66 |
+
"""
|
67 |
+
packed_batch = list(zip(*batch))
|
68 |
+
if len(packed_batch) == 5:
|
69 |
+
_, feat_lengths, _, labels_lengths, sample_ids = packed_batch
|
70 |
+
elif len(packed_batch) == 4:
|
71 |
+
sample_ids = None
|
72 |
+
_, feat_lengths, _, labels_lengths = packed_batch
|
73 |
+
else:
|
74 |
+
raise ValueError("Expects 4 or 5 tensors in the batch!")
|
75 |
+
max_feat_len = 0
|
76 |
+
has_feat = feat_lengths[0] is not None
|
77 |
+
if has_feat:
|
78 |
+
max_feat_len = max(feat_lengths).item()
|
79 |
+
max_labels_len = max(labels_lengths).item()
|
80 |
+
|
81 |
+
features, labels = [], []
|
82 |
+
for b in batch:
|
83 |
+
feat_i, feat_i_len, label_i, label_i_len = b[0], b[1], b[2], b[3]
|
84 |
+
|
85 |
+
if has_feat:
|
86 |
+
feat_i_len = feat_i_len.item()
|
87 |
+
if feat_i_len < max_feat_len:
|
88 |
+
pad = (0, max_feat_len - feat_i_len)
|
89 |
+
feat_i = torch.nn.functional.pad(feat_i, pad, value=feat_pad_val)
|
90 |
+
features.append(feat_i)
|
91 |
+
|
92 |
+
label_i_len = label_i_len.item()
|
93 |
+
if label_i_len < max_labels_len:
|
94 |
+
pad = (0, max_labels_len - label_i_len)
|
95 |
+
label_i = torch.nn.functional.pad(label_i, pad, value=label_pad_id)
|
96 |
+
labels.append(label_i)
|
97 |
+
|
98 |
+
if has_feat:
|
99 |
+
features = torch.stack(features)
|
100 |
+
feature_lengths = torch.stack(feat_lengths)
|
101 |
+
else:
|
102 |
+
features, feat_lengths = None, None
|
103 |
+
labels = torch.stack(labels)
|
104 |
+
labels_lengths = torch.stack(labels_lengths)
|
105 |
+
|
106 |
+
if sample_ids is None:
|
107 |
+
return features, feature_lengths, labels, labels_lengths
|
108 |
+
else:
|
109 |
+
sample_ids = torch.tensor(sample_ids, dtype=torch.int32)
|
110 |
+
return features, feature_lengths, labels, labels_lengths, sample_ids
|
111 |
+
|
112 |
+
|
113 |
+
def _vad_feature_segment_collate_fn(batch, window_length_in_sec, shift_length_in_sec, frame_unit_in_sec):
    """Collate a batch of (audio features, feature lengths, tokens, token lengths) tuples for VAD.

    Args:
        batch (Optional[FloatTensor], Optional[LongTensor], LongTensor,
            LongTensor): A tuple of tuples of signal, signal lengths,
            encoded tokens, and encoded token lengths. This collate func
            assumes the signals are 1d torch tensors (i.e. mono audio) and
            that the batch size equals 1.
    """
    slice_length = int(window_length_in_sec / frame_unit_in_sec)
    audio_features, feat_lengths, _, tokens_lengths = zip(*batch)

    slice_length = int(min(slice_length, max(feat_lengths)))
    shift = int(shift_length_in_sec / frame_unit_in_sec)
    has_audio = feat_lengths[0] is not None

    f_dim = audio_features[0].shape[0]
    audio_features, num_slices, tokens, feat_lengths = [], [], [], []
    append_len_start = torch.div(slice_length, 2, rounding_mode='trunc')
    append_len_end = slice_length - torch.div(slice_length, 2, rounding_mode='trunc')
    for feat_i, feat_i_len, tokens_i, _ in batch:
        start = torch.zeros(f_dim, append_len_start)
        end = torch.zeros(f_dim, append_len_end)
        feat_i = torch.cat((start, feat_i, end), dim=1)
        feat_i_len += slice_length

        if has_audio:
            slices = max(1, torch.div(feat_i_len - slice_length, shift, rounding_mode='trunc'))

            for slice_id in range(slices):
                start_idx = slice_id * shift
                end_idx = start_idx + slice_length
                feat_slice = feat_i[:, start_idx:end_idx]
                audio_features.append(feat_slice)

            num_slices.append(slices)
            tokens.extend([tokens_i] * slices)
            feat_lengths.extend([slice_length] * slices)

    if has_audio:
        audio_features = torch.stack(audio_features)
        feat_lengths = torch.tensor(feat_lengths)
    else:
        audio_features, feat_lengths = None, None

    tokens = torch.stack(tokens)
    tokens_lengths = torch.tensor(num_slices)
    return audio_features, feat_lengths, tokens, tokens_lengths

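# Illustrative usage sketch (not part of the original NeMo file): the VAD segment
# collate pads a single [D, T] feature matrix by one window length and cuts it into
# fixed-size windows. With window_length_in_sec=0.63, shift_length_in_sec=0.01 and
# frame_unit_in_sec=0.01 (i.e. ~63-frame windows with a 1-frame shift), a [80, 200]
# feature yields roughly 200 windows of about 63 frames each, and the token is
# repeated once per window.
def _example_vad_segment_collate():
    feat = torch.randn(80, 200)
    batch = [(feat, torch.tensor(200), torch.tensor(1), torch.tensor(1))]
    feats, feat_lens, tokens, num_slices = _vad_feature_segment_collate_fn(
        batch, window_length_in_sec=0.63, shift_length_in_sec=0.01, frame_unit_in_sec=0.01
    )
    # feats: [num_windows, 80, window_frames]
    return feats, feat_lens, tokens, num_slices
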
class _FeatureSeqSpeakerLabelDataset(Dataset):
    """
    Dataset that loads tensors via a json file containing paths to feature files and sequences of labels.
    Each new line is a different sample. JSON files should be of the following format:
        {"feature_filepath": "/path/to/feature_0.p", "seq_label": speakerA speakerB SpeakerA ....} \
        ...
        {"feature_filepath": "/path/to/feature_n.p", "seq_label": target_seq_label_n}
    target_seq_label_n is the string of sequence of speaker labels, separated by spaces.

    Args:
        manifest_filepath (str): Dataset parameter. Path to JSON containing data.
        labels (Optional[list]): Dataset parameter. List of unique labels collected from all samples.
        feature_loader: Dataset parameter. Feature loader to load (external) features.
    """

    @property
    def output_types(self) -> Optional[Dict[str, NeuralType]]:
        """Returns definitions of module output ports.
        """
        # TODO output type for external features
        output_types = {
            'external_feat': NeuralType(('B', 'D', 'T'), AcousticEncodedRepresentation()),
            'feat_length': NeuralType(tuple('B'), LengthsType()),
        }

        if self.is_speaker_emb:
            output_types.update(
                {
                    'embs': NeuralType(('B', 'D', 'T'), AcousticEncodedRepresentation()),
                    'embs_length': NeuralType(tuple('B'), LengthsType()),
                    'label': NeuralType(('B', 'T'), LabelsType()),
                    'label_length': NeuralType(tuple('B'), LengthsType()),
                }
            )
        else:
            output_types.update(
                {'label': NeuralType(('B', 'T'), LabelsType()), 'label_length': NeuralType(tuple('B'), LengthsType()),}
            )

        return output_types

    def __init__(
        self, *, manifest_filepath: str, labels: List[str], feature_loader, is_speaker_emb: bool = False,
    ):
        super().__init__()
        self.collection = collections.ASRFeatureSequenceLabel(manifests_files=manifest_filepath.split(','),)

        self.feature_loader = feature_loader
        self.labels = labels if labels else self.collection.uniq_labels
        self.is_speaker_emb = is_speaker_emb

        self.label2id, self.id2label = {}, {}
        for label_id, label in enumerate(self.labels):
            self.label2id[label] = label_id
            self.id2label[label_id] = label

        for idx in range(len(self.labels[:5])):
            logging.debug(" label id {} and its mapped label {}".format(idx, self.id2label[idx]))

    def __len__(self):
        return len(self.collection)

    def __getitem__(self, index):
        sample = self.collection[index]

        features = self.feature_loader.process(sample.feature_file)
        f, fl = features, torch.tensor(features.shape[0]).long()

        t = torch.tensor(sample.seq_label).float()
        tl = torch.tensor(len(sample.seq_label)).long()

        return f, fl, t, tl

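# Illustrative note (not part of the original NeMo file): for a manifest entry such as
#   {"feature_filepath": "/path/to/feature_0.p", "seq_label": "speakerA speakerB speakerA"}
# an item from `_FeatureSeqSpeakerLabelDataset.__getitem__` is a 4-tuple:
#   f  - feature tensor returned by `feature_loader.process(sample.feature_file)`
#   fl - number of feature frames, i.e. features.shape[0]
#   t  - float tensor of the encoded speaker-label sequence (sample.seq_label)
#   tl - length of that sequence
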
class FeatureToSeqSpeakerLabelDataset(_FeatureSeqSpeakerLabelDataset):
    """
    Dataset that loads tensors via a json file containing paths to feature
    files and sequences of speakers. Each new line is a
    different sample. Example below:
        {"feature_filepath": "/path/to/feature_0.p", "seq_label": speakerA speakerB SpeakerA ....} \
        ...
        {"feature_filepath": "/path/to/feature_n.p", "seq_label": target_seq_label_n}
    target_seq_label_n is the string of sequence of speaker labels, separated by spaces.

    Args:
        manifest_filepath (str): Path to manifest json as described above. Can be comma-separated paths.
        labels (Optional[list]): String containing all the possible labels to map to;
            if None, then automatically picked from the ASRFeatureSequenceLabel collection.
        feature_loader: Feature loader to load (external) features.

    """

    def _collate_fn(self, batch):
        return _feature_collate_fn(batch)

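# Illustrative usage sketch (not part of the original NeMo file); the manifest path
# and the label list below are hypothetical placeholders.
def _example_feature_to_seq_speaker_label_dataset():
    feature_loader = ExternalFeatureLoader(augmentor=None)
    dataset = FeatureToSeqSpeakerLabelDataset(
        manifest_filepath='/path/to/seq_label_manifest.json',
        labels=['speakerA', 'speakerB'],
        feature_loader=feature_loader,
    )
    # Each item is (feature, feature_length, seq_label, seq_label_length);
    # batches are built with `_feature_collate_fn` via `_collate_fn`.
    return dataset
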
class FeatureToLabelDataset(Dataset):
    """
    Dataset that loads tensors via a json file containing paths to feature files and their labels.
    Each new line is a different sample. JSON files should be of the following format:
        {"feature_filepath": "/path/to/audio_feature.pt", "label": "1"}
        ...
        {"feature_filepath": "/path/to/audio_feature.pt", "label": "0"}
    Args:
        manifest_filepath (str): Path to JSON containing data.
        labels (Optional[list]): List of unique labels collected from all samples.
        augmentor (Optional): feature augmentation
        window_length_in_sec (float): Window length in seconds.
        shift_length_in_sec (float): Shift length in seconds.
        is_regression_task (bool): if True, the labels are treated as for a regression task.
        cal_labels_occurrence (bool): if True, the labels occurrence will be calculated.
        zero_spec_db_val (float): Value to replace non-speech signals in log-melspectrogram.
        min_duration (float): Minimum duration of the audio file in seconds.
        max_duration (float): Maximum duration of the audio file in seconds.
    """

    ZERO_LEVEL_SPEC_DB_VAL = -16.635  # Log-Melspectrogram value for zero signal
    FRAME_UNIT_TIME_SECS = 0.01

    @property
    def output_types(self) -> Optional[Dict[str, NeuralType]]:
        """Returns definitions of module output ports.
        """
        output_types = {
            'audio_feat': NeuralType(('B', 'D', 'T'), AcousticEncodedRepresentation()),
            'feat_length': NeuralType(tuple('B'), LengthsType()),
            'labels': NeuralType(('B'), LabelsType()),
            'labels_length': NeuralType(tuple('B'), LengthsType()),
        }

        return output_types

    def __init__(
        self,
        *,
        manifest_filepath: str,
        labels: List[str] = None,
        augmentor: 'nemo.collections.asr.parts.perturb.AudioAugmentor' = None,
        window_length_in_sec: float = 0.63,
        shift_length_in_sec: float = 0.01,
        is_regression_task: bool = False,
        cal_labels_occurrence: Optional[bool] = False,
        zero_spec_db_val: float = -16.635,
        min_duration: Optional[float] = None,
        max_duration: Optional[float] = None,
    ):
        super().__init__()
        self.window_length_in_sec = window_length_in_sec
        self.shift_length_in_sec = shift_length_in_sec
        self.zero_spec_db_val = zero_spec_db_val

        if isinstance(manifest_filepath, str):
            manifest_filepath = manifest_filepath.split(',')

        self.collection = collections.ASRFeatureLabel(
            manifests_files=manifest_filepath,
            is_regression_task=is_regression_task,
            cal_labels_occurrence=cal_labels_occurrence,
            min_duration=min_duration,
            max_duration=max_duration,
        )

        self.feature_loader = ExternalFeatureLoader(augmentor=augmentor)
        self.labels = labels if labels else self.collection.uniq_labels

        self.is_regression_task = is_regression_task

        if not is_regression_task:
            self.labels = labels if labels else self.collection.uniq_labels
            self.num_classes = len(self.labels) if self.labels is not None else 1
            self.label2id, self.id2label = {}, {}
            self.id2occurrence, self.labels_occurrence = {}, []

            for label_id, label in enumerate(self.labels):
                self.label2id[label] = label_id
                self.id2label[label_id] = label
                if cal_labels_occurrence:
                    self.id2occurrence[label_id] = self.collection.labels_occurrence[label]

            if cal_labels_occurrence:
                self.labels_occurrence = [self.id2occurrence[k] for k in sorted(self.id2occurrence)]

            for idx in range(len(self.labels[:5])):
                logging.debug(" label id {} and its mapped label {}".format(idx, self.id2label[idx]))
        else:
            self.labels = []
            self.num_classes = 1

    def __len__(self):
        return len(self.collection)

    def __getitem__(self, index):
        sample = self.collection[index]

        features = self.feature_loader.process(sample.feature_file)
        f, fl = features, torch.tensor(features.shape[1]).long()

        t = torch.tensor(self.label2id[sample.label])
        tl = torch.tensor(1).long()

        return f, fl, t, tl

    def _collate_fn(self, batch):
        return _audio_feature_collate_fn(batch, self.zero_spec_db_val, 0)

    def _vad_segment_collate_fn(self, batch):
        return _vad_feature_segment_collate_fn(
            batch, self.window_length_in_sec, self.shift_length_in_sec, self.FRAME_UNIT_TIME_SECS
        )

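# Illustrative usage sketch (not part of the original NeMo file): wiring
# FeatureToLabelDataset into a PyTorch DataLoader with the dataset's own collate
# function. The manifest path and labels are hypothetical placeholders.
def _example_feature_to_label_dataloader():
    from torch.utils.data import DataLoader

    dataset = FeatureToLabelDataset(
        manifest_filepath='/path/to/feature_label_manifest.json', labels=['0', '1'],
    )
    # `_collate_fn` pads features with `zero_spec_db_val` and labels with 0.
    return DataLoader(dataset, batch_size=4, collate_fn=dataset._collate_fn)
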
class FeatureToMultiLabelDataset(Dataset):
    """
    Dataset that loads tensors via a json file containing paths to feature files and their labels.
    Each new line is a different sample. JSON files should be of the following format:
        {"feature_filepath": "/path/to/audio_feature.pt", "label": "1 1 0 0 1"}
        ...
        {"feature_filepath": "/path/to/audio_feature.pt", "label": "0 1 0 0"}
    Args:
        manifest_filepath (str): Path to JSON containing data.
        labels (Optional[list]): List of unique labels collected from all samples.
        augmentor (Optional): feature augmentation
        delimiter (str): delimiter to split the labels.
        is_regression_task (bool): if True, the labels are treated as for a regression task.
        cal_labels_occurrence (bool): if True, the labels occurrence will be calculated.
        zero_spec_db_val (float): Value to replace non-speech signals in log-melspectrogram.
        min_duration (float): Minimum duration of the audio file in seconds.
        max_duration (float): Maximum duration of the audio file in seconds.
    """

    ZERO_LEVEL_SPEC_DB_VAL = -16.635  # Log-Melspectrogram value for zero signal

    @property
    def output_types(self) -> Optional[Dict[str, NeuralType]]:
        """Returns definitions of module output ports.
        """
        output_types = {
            'audio_feat': NeuralType(('B', 'D', 'T'), AcousticEncodedRepresentation()),
            'feat_length': NeuralType(tuple('B'), LengthsType()),
            'labels': NeuralType(('B', 'T'), LabelsType()),
            'labels_length': NeuralType(tuple('B'), LengthsType()),
        }

        return output_types

    def __init__(
        self,
        *,
        manifest_filepath: str,
        labels: List[str] = None,
        augmentor: 'nemo.collections.asr.parts.perturb.AudioAugmentor' = None,
        delimiter: Optional[str] = None,
        is_regression_task: bool = False,
        cal_labels_occurrence: Optional[bool] = False,
        zero_spec_db_val: float = -16.635,
        min_duration: Optional[float] = None,
        max_duration: Optional[float] = None,
    ):
        super().__init__()
        self.delimiter = delimiter
        self.zero_spec_db_val = zero_spec_db_val

        if isinstance(manifest_filepath, str):
            manifest_filepath = manifest_filepath.split(',')

        self.collection = collections.ASRFeatureLabel(
            manifests_files=manifest_filepath,
            is_regression_task=is_regression_task,
            cal_labels_occurrence=cal_labels_occurrence,
            delimiter=delimiter,
            min_duration=min_duration,
            max_duration=max_duration,
        )

        self.is_regression_task = is_regression_task
        self.feature_loader = ExternalFeatureLoader(augmentor=augmentor)
        self.labels = labels if labels else self.collection.uniq_labels

        self.label2id, self.id2label = {}, {}
        if not is_regression_task:
            self.labels = labels if labels else self._get_label_set()
            self.num_classes = len(self.labels) if self.labels is not None else 1
            self.label2id, self.id2label = {}, {}
            self.id2occurrence, self.labels_occurrence = {}, []
            for label_id, label in enumerate(self.labels):
                self.label2id[label] = label_id
                self.id2label[label_id] = label
                if cal_labels_occurrence:
                    self.id2occurrence[label_id] = self.collection.labels_occurrence[label]
                    self.labels_occurrence.append(self.id2occurrence[label_id])

            for idx in range(len(self.labels[:5])):
                logging.debug(" label id {} and its mapped label {}".format(idx, self.id2label[idx]))
        else:
            self.labels = []
            self.num_classes = 1

    def _get_label_set(self):
        labels = []
        for sample in self.collection:
            label_str = sample.label
            if label_str:
                label_str_list = label_str.split(self.delimiter) if self.delimiter else label_str.split()
                labels.extend(label_str_list)
        return sorted(set(labels))

    def _label_str_to_tensor(self, label_str: str):
        labels = label_str.split(self.delimiter) if self.delimiter else label_str.split()

        if self.is_regression_task:
            labels = [float(s) for s in labels]
            labels = torch.tensor(labels).float()
        else:
            labels = [self.label2id[s] for s in labels]
            labels = torch.tensor(labels).long()
        return labels

    def __len__(self):
        return len(self.collection)

    def __getitem__(self, index):
        sample = self.collection[index]

        features = self.feature_loader.process(sample.feature_file)
        f, fl = features, torch.tensor(features.shape[1]).long()

        t = self._label_str_to_tensor(sample.label)
        tl = torch.tensor(t.size(0)).long()

        return f, fl, t, tl

    def _collate_fn(self, batch):
        return _audio_feature_collate_fn(batch, self.zero_spec_db_val, 0)
SoundScribe/SpeakerID/nemo/collections/asr/data/feature_to_label_dataset.py
ADDED
@@ -0,0 +1,68 @@
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Optional

from nemo.collections.asr.data import feature_to_label


def get_feature_seq_speakerlabel_dataset(
    feature_loader, config: dict
) -> feature_to_label.FeatureToSeqSpeakerLabelDataset:
    """
    Instantiates a FeatureToSeqSpeakerLabelDataset.
    Args:
        feature_loader: Feature loader to load (external) features.
        config: Config of the FeatureToSeqSpeakerLabelDataset.

    Returns:
        An instance of FeatureToSeqSpeakerLabelDataset.
    """
    dataset = feature_to_label.FeatureToSeqSpeakerLabelDataset(
        manifest_filepath=config['manifest_filepath'], labels=config['labels'], feature_loader=feature_loader,
    )
    return dataset


def get_feature_label_dataset(
    config: dict, augmentor: Optional['FeatureAugmentor'] = None
) -> feature_to_label.FeatureToLabelDataset:
    dataset = feature_to_label.FeatureToLabelDataset(
        manifest_filepath=config['manifest_filepath'],
        labels=config['labels'],
        augmentor=augmentor,
        window_length_in_sec=config.get("window_length_in_sec", 0.63),
        shift_length_in_sec=config.get("shift_length_in_sec", 0.08),
        is_regression_task=config.get("is_regression_task", False),
        cal_labels_occurrence=config.get("cal_labels_occurrence", False),
        zero_spec_db_val=config.get("zero_spec_db_val", -16.635),
        max_duration=config.get('max_duration', None),
        min_duration=config.get('min_duration', None),
    )
    return dataset


def get_feature_multi_label_dataset(
    config: dict, augmentor: Optional['FeatureAugmentor'] = None
) -> feature_to_label.FeatureToMultiLabelDataset:
    dataset = feature_to_label.FeatureToMultiLabelDataset(
        manifest_filepath=config['manifest_filepath'],
        labels=config['labels'],
        augmentor=augmentor,
        delimiter=config.get('delimiter', None),
        is_regression_task=config.get("is_regression_task", False),
        cal_labels_occurrence=config.get("cal_labels_occurrence", False),
        zero_spec_db_val=config.get("zero_spec_db_val", -16.635),
        max_duration=config.get('max_duration', None),
        min_duration=config.get('min_duration', None),
    )
    return dataset
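# Illustrative usage sketch (not part of the original NeMo file): a minimal config for
# `get_feature_label_dataset`. Only 'manifest_filepath' and 'labels' are read directly;
# every other key falls back to the config.get() defaults above. The manifest path and
# label values are hypothetical placeholders.
def _example_get_feature_label_dataset():
    config = {
        'manifest_filepath': '/path/to/feature_label_manifest.json',
        'labels': ['0', '1'],
        'window_length_in_sec': 0.63,
        'shift_length_in_sec': 0.08,
    }
    return get_feature_label_dataset(config=config, augmentor=None)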
SoundScribe/SpeakerID/nemo/collections/asr/data/feature_to_text.py
ADDED
@@ -0,0 +1,488 @@
1 |
+
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
|
15 |
+
from typing import Callable, Dict, List, Optional, Tuple, Union
|
16 |
+
|
17 |
+
import torch
|
18 |
+
|
19 |
+
from nemo.collections.asr.data.feature_to_label import _audio_feature_collate_fn
|
20 |
+
from nemo.collections.asr.parts.preprocessing.feature_loader import ExternalFeatureLoader
|
21 |
+
from nemo.collections.asr.parts.preprocessing.features import normalize_batch
|
22 |
+
from nemo.collections.asr.parts.utils.audio_utils import ChannelSelectorType
|
23 |
+
from nemo.collections.asr.parts.utils.vad_utils import load_speech_segments_from_rttm
|
24 |
+
from nemo.collections.common import tokenizers
|
25 |
+
from nemo.collections.common.parts.preprocessing import collections, parsers
|
26 |
+
from nemo.core.classes import Dataset
|
27 |
+
from nemo.core.neural_types import AcousticEncodedRepresentation, LabelsType, LengthsType, NeuralType
|
28 |
+
|
29 |
+
|
30 |
+
class ASRFeatureManifestProcessor:
|
31 |
+
def __init__(
|
32 |
+
self,
|
33 |
+
manifest_filepath: str,
|
34 |
+
parser: Union[str, Callable],
|
35 |
+
max_duration: Optional[float] = None,
|
36 |
+
min_duration: Optional[float] = None,
|
37 |
+
max_utts: int = 0,
|
38 |
+
bos_id: Optional[int] = None,
|
39 |
+
eos_id: Optional[int] = None,
|
40 |
+
pad_id: int = 0,
|
41 |
+
index_by_file_id: bool = False,
|
42 |
+
):
|
43 |
+
self.parser = parser
|
44 |
+
self.collection = collections.ASRFeatureText(
|
45 |
+
manifests_files=manifest_filepath,
|
46 |
+
parser=parser,
|
47 |
+
min_duration=min_duration,
|
48 |
+
max_duration=max_duration,
|
49 |
+
max_number=max_utts,
|
50 |
+
index_by_file_id=index_by_file_id,
|
51 |
+
)
|
52 |
+
|
53 |
+
self.eos_id = eos_id
|
54 |
+
self.bos_id = bos_id
|
55 |
+
self.pad_id = pad_id
|
56 |
+
|
57 |
+
def process_text_by_id(self, index: int) -> Tuple[List[int], int]:
|
58 |
+
sample = self.collection[index]
|
59 |
+
return self.process_text_by_sample(sample)
|
60 |
+
|
61 |
+
def process_text_by_file_id(self, file_id: str) -> Tuple[List[int], int]:
|
62 |
+
manifest_idx = self.collection.mapping[file_id][0]
|
63 |
+
sample = self.collection[manifest_idx]
|
64 |
+
return self.process_text_by_sample(sample)
|
65 |
+
|
66 |
+
def process_text_by_sample(self, sample: collections.ASRAudioText.OUTPUT_TYPE) -> Tuple[List[int], int]:
|
67 |
+
t, tl = sample.text_tokens, len(sample.text_tokens)
|
68 |
+
|
69 |
+
if self.bos_id is not None:
|
70 |
+
t = [self.bos_id] + t
|
71 |
+
tl += 1
|
72 |
+
if self.eos_id is not None:
|
73 |
+
t = t + [self.eos_id]
|
74 |
+
tl += 1
|
75 |
+
|
76 |
+
return t, tl
|
77 |
+
|
78 |
+
|
79 |
+
class _FeatureTextDataset(Dataset):
|
80 |
+
"""
|
81 |
+
Dataset that loads tensors via a json file containing paths to audio feature files, transcripts,
|
82 |
+
durations (in seconds) and optional RTTM files. Each new line is a different sample. Example below:
|
83 |
+
{"feature_filepath": "/path/to/audio_feature.pt", "text_filepath": "/path/to/audio.txt",
|
84 |
+
"rttm_filepath": "/path/to/audio_rttm.rttm", "duration": 23.147}
|
85 |
+
...
|
86 |
+
{"feature_filepath": "/path/to/audio_feature.pt", "text": "the transcription", "offset": 301.75, "duration": 0.82, "utt":
|
87 |
+
"utterance_id", "ctm_utt": "en_4156", "side": "A"}
|
88 |
+
Args:
|
89 |
+
manifest_filepath (str): Path to manifest json as described above. Can be comma-separated paths.
|
90 |
+
parser: Str for a language specific preprocessor or a callable.
|
91 |
+
normalize (str): whether and where to normalize the features; must be one of [None, "post_norm", "pre_norm"]
|
92 |
+
normalize_type (Union[str, dict]): how to normalize feature, see `nemo.collections.asr.parts.preprocessing.features.normalize_batch`
|
93 |
+
use_rttm (bool): whether to use RTTM files if there is any, default to False
|
94 |
+
rttm_mode (str): how to use RTTM files, must be one of ['mask', 'drop'], default to 'mask'
|
95 |
+
feat_min_len (int): minimum length of the features when rttm_mode='drop', default to 4.
|
96 |
+
feat_mask_val (Optional[float]): value used to mask features with RTTM files, default to None to use the zero mel-spectrogram value
|
97 |
+
frame_unit_time_secs (float): time in seconds for each frame
|
98 |
+
sample_rate (int): Sample rate to resample loaded audio to
|
99 |
+
int_values (bool): If true, load samples as 32-bit integers. Defaults to False.
|
100 |
+
augmentor (nemo.collections.asr.parts.perturb.AudioAugmentor): An AudioAugmentor object used to augment loaded audio
|
101 |
+
max_duration (float): If audio exceeds this length, do not include in dataset
|
102 |
+
min_duration (float): If audio is less than this length, do not include in dataset
|
103 |
+
max_utts (int): Limit number of utterances
|
104 |
+
trim (bool): whether or not to trim silence. Defaults to False
|
105 |
+
bos_id (int): Id of beginning of sequence symbol to append if not None
|
106 |
+
eos_id (int): Id of end of sequence symbol to append if not None
|
107 |
+
pad_id (int): Id of pad symbol. Defaults to 0
|
108 |
+
return_sample_id (bool): whether to return the sample_id as a part of each sample
|
109 |
+
channel_selector (int | Iterable[int] | str): select a single channel or a subset of channels from multi-channel audio. If set to `'average'`, it performs averaging across channels. Disabled if set to `None`. Defaults to `None`. Uses zero-based indexing.
|
110 |
+
"""
|
111 |
+
|
112 |
+
ZERO_LEVEL_SPEC_DB_VAL = -16.635 # Log-Melspectrogram value for zero signal
|
113 |
+
NORM_MODES = ["pre_norm", "post_norm"]
|
114 |
+
RTTM_MODES = ["mask", "drop"]
|
115 |
+
|
116 |
+
@property
|
117 |
+
def output_types(self) -> Optional[Dict[str, NeuralType]]:
|
118 |
+
"""Returns definitions of module output ports.
|
119 |
+
"""
|
120 |
+
return {
|
121 |
+
'features': NeuralType(('B', 'D', 'T'), AcousticEncodedRepresentation()),
|
122 |
+
'feature_length': NeuralType(tuple('B'), LengthsType()),
|
123 |
+
'transcripts': NeuralType(('B', 'T'), LabelsType()),
|
124 |
+
'transcript_length': NeuralType(tuple('B'), LengthsType()),
|
125 |
+
'sample_id': NeuralType(tuple('B'), LengthsType(), optional=True),
|
126 |
+
}
|
127 |
+
|
128 |
+
def __init__(
|
129 |
+
self,
|
130 |
+
manifest_filepath: str,
|
131 |
+
parser: Union[str, Callable],
|
132 |
+
normalize: Optional[str] = "post_norm",
|
133 |
+
normalize_type: Union[str, dict] = "per_feature",
|
134 |
+
use_rttm: bool = False,
|
135 |
+
rttm_mode: str = "mask",
|
136 |
+
feat_min_len: int = 4,
|
137 |
+
feat_mask_val: Optional[float] = None,
|
138 |
+
frame_unit_time_secs: float = 0.01,
|
139 |
+
sample_rate: Optional[int] = 16000,
|
140 |
+
augmentor: 'nemo.collections.asr.parts.perturb.FeatureAugmentor' = None,
|
141 |
+
max_duration: Optional[int] = None,
|
142 |
+
min_duration: Optional[int] = None,
|
143 |
+
max_utts: int = 0,
|
144 |
+
trim: bool = False,
|
145 |
+
bos_id: Optional[int] = None,
|
146 |
+
eos_id: Optional[int] = None,
|
147 |
+
pad_id: int = 0,
|
148 |
+
return_sample_id: bool = False,
|
149 |
+
channel_selector: Optional[ChannelSelectorType] = None,
|
150 |
+
):
|
151 |
+
if type(manifest_filepath) == str:
|
152 |
+
manifest_filepath = manifest_filepath.split(",")
|
153 |
+
|
154 |
+
self.sample_rate = sample_rate
|
155 |
+
self.normalize = normalize
|
156 |
+
self.normalize_type = normalize_type
|
157 |
+
self.use_rttm = use_rttm
|
158 |
+
self.rttm_mode = rttm_mode
|
159 |
+
if self.use_rttm and self.rttm_mode not in self.RTTM_MODES:
|
160 |
+
raise ValueError(f"`rttm_mode` must be one of {self.RTTM_MODES}, got `{rttm_mode}` instead")
|
161 |
+
|
162 |
+
self.feat_min_len = feat_min_len
|
163 |
+
if feat_mask_val is not None:
|
164 |
+
self.feat_mask_val = feat_mask_val
|
165 |
+
elif normalize == "pre_norm":
|
166 |
+
self.feat_mask_val = 0.0 # similar to SpectralAugmentation
|
167 |
+
else:
|
168 |
+
self.feat_mask_val = self.ZERO_LEVEL_SPEC_DB_VAL
|
169 |
+
|
170 |
+
if normalize is not None and normalize not in self.NORM_MODES:
|
171 |
+
raise ValueError(f"`normalize` must be one of {self.NORM_MODES}, got `{normalize}` instead")
|
172 |
+
|
173 |
+
self.frame_unit_time_secs = frame_unit_time_secs
|
174 |
+
|
175 |
+
self.manifest_processor = ASRFeatureManifestProcessor(
|
176 |
+
manifest_filepath=manifest_filepath,
|
177 |
+
parser=parser,
|
178 |
+
max_duration=max_duration,
|
179 |
+
min_duration=min_duration,
|
180 |
+
max_utts=max_utts,
|
181 |
+
bos_id=bos_id,
|
182 |
+
eos_id=eos_id,
|
183 |
+
pad_id=pad_id,
|
184 |
+
)
|
185 |
+
self.featurizer = ExternalFeatureLoader(augmentor=augmentor)
|
186 |
+
self.trim = trim
|
187 |
+
self.return_sample_id = return_sample_id
|
188 |
+
self.channel_selector = channel_selector
|
189 |
+
|
190 |
+
def get_manifest_sample(self, sample_id):
|
191 |
+
return self.manifest_processor.collection[sample_id]
|
192 |
+
|
193 |
+
def __getitem__(self, index):
|
194 |
+
sample = self.manifest_processor.collection[index]
|
195 |
+
offset = sample.offset
|
196 |
+
|
197 |
+
if offset is None:
|
198 |
+
offset = 0
|
199 |
+
|
200 |
+
features = self.featurizer.process(sample.feature_file)
|
201 |
+
|
202 |
+
f, fl = features, torch.tensor(features.shape[1]).long()
|
203 |
+
|
204 |
+
t, tl = self.manifest_processor.process_text_by_sample(sample=sample)
|
205 |
+
|
206 |
+
# Feature normalization
|
207 |
+
if self.normalize is None:
|
208 |
+
if self.use_rttm and sample.rttm_file:
|
209 |
+
f = self.process_features_with_rttm(f, offset, sample.rttm_file, self.feat_mask_val)
|
210 |
+
elif self.normalize == "post_norm":
|
211 |
+
# (Optional) Masking based on RTTM file
|
212 |
+
if self.use_rttm and sample.rttm_file:
|
213 |
+
f = self.process_features_with_rttm(f, offset, sample.rttm_file, self.feat_mask_val)
|
214 |
+
|
215 |
+
f = self.normalize_feature(f)
|
216 |
+
else: # pre-norm
|
217 |
+
f = self.normalize_feature(f)
|
218 |
+
# (Optional) Masking based on RTTM file
|
219 |
+
if self.use_rttm and sample.rttm_file:
|
220 |
+
f = self.process_features_with_rttm(f, offset, sample.rttm_file, self.feat_mask_val)
|
221 |
+
|
222 |
+
if self.return_sample_id:
|
223 |
+
output = f, fl, torch.tensor(t).long(), torch.tensor(tl).long(), index
|
224 |
+
else:
|
225 |
+
output = f, fl, torch.tensor(t).long(), torch.tensor(tl).long()
|
226 |
+
|
227 |
+
return output
|
228 |
+
|
229 |
+
def process_features_with_rttm(self, features, offset, rttm_file, mask_val):
|
230 |
+
segments = load_speech_segments_from_rttm(rttm_file)
|
231 |
+
new_features = features.clone()
|
232 |
+
sid, fid = 0, 0
|
233 |
+
for i in range(features.size(1)):
|
234 |
+
t = offset + i * self.frame_unit_time_secs
|
235 |
+
while sid < len(segments) - 1 and segments[sid][1] < t:
|
236 |
+
sid += 1
|
237 |
+
if segments[sid][1] == 0 or t < segments[sid][0] or t > segments[sid][1]:
|
238 |
+
# not in speech segment
|
239 |
+
if self.rttm_mode == "drop":
|
240 |
+
# drop the frame
|
241 |
+
continue
|
242 |
+
else:
|
243 |
+
# mask the frame with specified value
|
244 |
+
new_features[:, i] = mask_val
|
245 |
+
fid += 1
|
246 |
+
else:
|
247 |
+
# in speech segment
|
248 |
+
new_features[:, fid] = features[:, i]
|
249 |
+
fid += 1
|
250 |
+
|
251 |
+
if fid < self.feat_min_len and self.rttm_mode == "drop":
|
252 |
+
new_features[:, : self.feat_min_len] = mask_val
|
253 |
+
return new_features[:, : self.feat_min_len]
|
254 |
+
return new_features[:, :fid]
|
255 |
+
|
256 |
+
def __len__(self):
|
257 |
+
return len(self.manifest_processor.collection)
|
258 |
+
|
259 |
+
def _collate_fn(self, batch):
|
260 |
+
return _audio_feature_collate_fn(
|
261 |
+
batch, feat_pad_val=self.feat_mask_val, label_pad_id=self.manifest_processor.pad_id
|
262 |
+
)
|
263 |
+
|
264 |
+
def normalize_feature(self, feat):
|
265 |
+
"""
|
266 |
+
Args:
|
267 |
+
feat: feature tensor of shape [M, T]
|
268 |
+
"""
|
269 |
+
feat = feat.unsqueeze(0) # add batch dim
|
270 |
+
feat, _, _ = normalize_batch(feat, torch.tensor([feat.size(-1)]), self.normalize_type)
|
271 |
+
return feat.squeeze(0) # delete batch dim
|
272 |
+
|
273 |
+
|
274 |
+
class FeatureToCharDataset(_FeatureTextDataset):
|
275 |
+
"""
|
276 |
+
Dataset that loads tensors via a json file containing paths to audio feature
|
277 |
+
files, transcripts, durations (in seconds) and optional RTTM files. Each new line is a
|
278 |
+
different sample. Example below:
|
279 |
+
{"feature_filepath": "/path/to/audio_feature.pt", "text_filepath":
|
280 |
+
"/path/to/audio.txt", "duration": 23.147, "rttm_filepath": "/path/to/audio_rttm.rttm",}
|
281 |
+
...
|
282 |
+
{"feature_filepath": "/path/to/audio_feature.pt", "text": "the
|
283 |
+
transcription", "offset": 301.75, "duration": 0.82, "utt":
|
284 |
+
"utterance_id", "ctm_utt": "en_4156", "side": "A"}
|
285 |
+
|
286 |
+
Args:
|
287 |
+
manifest_filepath (str): Path to manifest json as described above. Can
|
288 |
+
be comma-separated paths.
|
289 |
+
labels (str): String containing all the possible characters to map to
|
290 |
+
normalize (str): how to normalize feature, must be one of [None, "post_norm", "pre_norm"]
|
291 |
+
normalize_type (Union[str, dict]): how to normalize feature, see `nemo.collections.asr.parts.preprocessing.features.normalize_batch`
|
292 |
+
use_rttm (bool): whether to use RTTM files if there is any, default to False
|
293 |
+
rttm_mode (str): how to use RTTM files, must be one of ['mask', 'drop'], default to 'mask'
|
294 |
+
feat_min_len (int): minimum length of feature, default to 4
|
295 |
+
feat_mask_val (Optional[float]): value used to mask features with RTTM files, default to None to use the zero mel-spectrogram value
|
296 |
+
frame_unit_time_secs: time in seconds for each frame
|
297 |
+
sample_rate (int): Sample rate to resample loaded audio to
|
298 |
+
int_values (bool): If true, load samples as 32-bit integers. Defaults to False.
|
299 |
+
augmentor (nemo.collections.asr.parts.perturb.AudioAugmentor): An AudioAugmentor
|
300 |
+
object used to augment loaded audio
|
301 |
+
max_duration: If audio exceeds this length, do not include in dataset
|
302 |
+
min_duration: If audio is less than this length, do not include
|
303 |
+
in dataset
|
304 |
+
max_utts: Limit number of utterances
|
305 |
+
blank_index: blank character index, default = -1
|
306 |
+
unk_index: unk_character index, default = -1
|
307 |
+
bos_id: Id of beginning of sequence symbol to append if not None
|
308 |
+
eos_id: Id of end of sequence symbol to append if not None
|
309 |
+
return_sample_id (bool): whether to return the sample_id as a part of each sample
|
310 |
+
channel_selector (int | Iterable[int] | str): select a single channel or a subset of channels from multi-channel audio. If set to `'average'`, it performs averaging across channels. Disabled if set to `None`. Defaults to `None`. Uses zero-based indexing.
|
311 |
+
"""
|
312 |
+
|
313 |
+
def __init__(
|
314 |
+
self,
|
315 |
+
manifest_filepath: str,
|
316 |
+
labels: Union[str, List[str]],
|
317 |
+
normalize: Optional[str] = "post_norm",
|
318 |
+
normalize_type: Union[str, dict] = "per_feature",
|
319 |
+
use_rttm: bool = False,
|
320 |
+
rttm_mode: str = "mask",
|
321 |
+
feat_min_len: int = 4,
|
322 |
+
feat_mask_val: Optional[float] = None,
|
323 |
+
frame_unit_time_secs: float = 0.01,
|
324 |
+
sample_rate: Optional[int] = 16000,
|
325 |
+
augmentor: 'nemo.collections.asr.parts.perturb.FeatureAugmentor' = None,
|
326 |
+
max_duration: Optional[int] = None,
|
327 |
+
min_duration: Optional[int] = None,
|
328 |
+
max_utts: int = 0,
|
329 |
+
blank_index: int = -1,
|
330 |
+
unk_index: int = -1,
|
331 |
+
trim: bool = False,
|
332 |
+
bos_id: Optional[int] = None,
|
333 |
+
eos_id: Optional[int] = None,
|
334 |
+
pad_id: int = 0,
|
335 |
+
parser: Union[str, Callable] = 'en',
|
336 |
+
return_sample_id: bool = False,
|
337 |
+
channel_selector: Optional[ChannelSelectorType] = None,
|
338 |
+
):
|
339 |
+
self.labels = labels
|
340 |
+
|
341 |
+
parser = parsers.make_parser(
|
342 |
+
labels=labels, name=parser, unk_id=unk_index, blank_id=blank_index, do_normalize=normalize
|
343 |
+
)
|
344 |
+
|
345 |
+
super().__init__(
|
346 |
+
manifest_filepath=manifest_filepath,
|
347 |
+
parser=parser,
|
348 |
+
normalize=normalize,
|
349 |
+
normalize_type=normalize_type,
|
350 |
+
use_rttm=use_rttm,
|
351 |
+
rttm_mode=rttm_mode,
|
352 |
+
feat_min_len=feat_min_len,
|
353 |
+
feat_mask_val=feat_mask_val,
|
354 |
+
frame_unit_time_secs=frame_unit_time_secs,
|
355 |
+
sample_rate=sample_rate,
|
356 |
+
augmentor=augmentor,
|
357 |
+
max_duration=max_duration,
|
358 |
+
min_duration=min_duration,
|
359 |
+
max_utts=max_utts,
|
360 |
+
trim=trim,
|
361 |
+
bos_id=bos_id,
|
362 |
+
eos_id=eos_id,
|
363 |
+
pad_id=pad_id,
|
364 |
+
return_sample_id=return_sample_id,
|
365 |
+
channel_selector=channel_selector,
|
366 |
+
)
|
367 |
+
|
368 |
+
|
369 |
+
class FeatureToBPEDataset(_FeatureTextDataset):
|
370 |
+
"""
|
371 |
+
Dataset that loads tensors via a json file containing paths to audio feature
|
372 |
+
files, transcripts, durations (in seconds) and optional RTTM files. Each new line is a different sample.
|
373 |
+
Example below:
|
374 |
+
{"audio_filepath": "/path/to/audio.wav", "text_filepath":
|
375 |
+
"/path/to/audio.txt", "duration": 23.147, "rttm_filepath": "/path/to/audio_rttm.rttm",}
|
376 |
+
...
|
377 |
+
{"audio_filepath": "/path/to/audio.wav", "text": "the
|
378 |
+
transcription", "offset": 301.75, "duration": 0.82, "utt":
|
379 |
+
"utterance_id", "ctm_utt": "en_4156", "side": "A"}
|
380 |
+
|
381 |
+
In practice, the dataset and manifest used for character encoding and byte pair encoding
|
382 |
+
are exactly the same. The only difference lies in how the dataset tokenizes the text in
|
383 |
+
the manifest.
|
384 |
+
|
385 |
+
Args:
|
386 |
+
manifest_filepath (str): Path to manifest json as described above. Can
|
387 |
+
be comma-separated paths.
|
388 |
+
tokenizer: A subclass of the Tokenizer wrapper found in the common collection,
|
389 |
+
nemo.collections.common.tokenizers.TokenizerSpec. ASR Models support a subset of
|
390 |
+
all available tokenizers.
|
391 |
+
normalize (str): how to normalize feature, must be one of [None, "post_norm", "pre_norm"]
|
392 |
+
normalize_type (Union[str, dict]): how to normalize feature, see `nemo.collections.asr.parts.preprocessing.features.normalize_batch`
|
393 |
+
use_rttm (bool): whether to use RTTM files if there is any, default to False
|
394 |
+
rttm_mode (str): how to use RTTM files, must be one of ['mask', 'drop'], default to 'mask'
|
395 |
+
feat_min_len (int): minimum length of feature, default to 4
|
396 |
+
feat_mask_val (Optional[float]): value used to mask features with RTTM files, default to None to use the zero mel-spectrogram value
|
397 |
+
frame_unit_time_secs: time in seconds for each frame
|
398 |
+
sample_rate (int): Sample rate to resample loaded audio to
|
399 |
+
int_values (bool): If true, load samples as 32-bit integers. Defaults to False.
|
400 |
+
augmentor (nemo.collections.asr.parts.perturb.AudioAugmentor): An AudioAugmentor
|
401 |
+
object used to augment loaded audio
|
402 |
+
max_duration: If audio exceeds this length, do not include in dataset
|
403 |
+
min_duration: If audio is less than this length, do not include
|
404 |
+
in dataset
|
405 |
+
max_utts: Limit number of utterances
|
406 |
+
trim: Whether to trim silence segments
|
407 |
+
use_start_end_token: Boolean which dictates whether to add [BOS] and [EOS]
|
408 |
+
tokens to beginning and ending of speech respectively.
|
409 |
+
return_sample_id (bool): whether to return the sample_id as a part of each sample
|
410 |
+
channel_selector (int | Iterable[int] | str): select a single channel or a subset of channels from multi-channel audio. If set to `'average'`, it performs averaging across channels. Disabled if set to `None`. Defaults to `None`. Uses zero-based indexing.
|
411 |
+
"""
|
412 |
+
|
413 |
+
def __init__(
|
414 |
+
self,
|
415 |
+
manifest_filepath: str,
|
416 |
+
tokenizer: 'nemo.collections.common.tokenizers.TokenizerSpec',
|
417 |
+
normalize: Optional[str] = "post_norm",
|
418 |
+
normalize_type: Union[str, dict] = "per_feature",
|
419 |
+
use_rttm: bool = False,
|
420 |
+
rttm_mode: str = "mask",
|
421 |
+
feat_min_len: int = 4,
|
422 |
+
feat_mask_val: Optional[float] = None,
|
423 |
+
frame_unit_time_secs: float = 0.01,
|
424 |
+
sample_rate: Optional[int] = 16000,
|
425 |
+
augmentor: 'nemo.collections.asr.parts.perturb.FeatureAugmentor' = None,
|
426 |
+
max_duration: Optional[int] = None,
|
427 |
+
min_duration: Optional[int] = None,
|
428 |
+
max_utts: int = 0,
|
429 |
+
use_start_end_token: bool = True,
|
430 |
+
trim: bool = False,
|
431 |
+
return_sample_id: bool = False,
|
432 |
+
channel_selector: Optional[ChannelSelectorType] = None,
|
433 |
+
):
|
434 |
+
if use_start_end_token and hasattr(tokenizer, "bos_id") and tokenizer.bos_id > 0:
|
435 |
+
bos_id = tokenizer.bos_id
|
436 |
+
else:
|
437 |
+
bos_id = None
|
438 |
+
|
439 |
+
if use_start_end_token and hasattr(tokenizer, "eos_id") and tokenizer.eos_id > 0:
|
440 |
+
eos_id = tokenizer.eos_id
|
441 |
+
else:
|
442 |
+
eos_id = None
|
443 |
+
|
444 |
+
if hasattr(tokenizer, "pad_id") and tokenizer.pad_id > 0:
|
445 |
+
pad_id = tokenizer.pad_id
|
446 |
+
else:
|
447 |
+
pad_id = 0
|
448 |
+
|
449 |
+
class TokenizerWrapper:
|
450 |
+
def __init__(self, tokenizer):
|
451 |
+
if isinstance(tokenizer, tokenizers.aggregate_tokenizer.AggregateTokenizer):
|
452 |
+
self.is_aggregate = True
|
453 |
+
else:
|
454 |
+
self.is_aggregate = False
|
455 |
+
self._tokenizer = tokenizer
|
456 |
+
|
457 |
+
def __call__(self, *args):
|
458 |
+
if isinstance(args[0], List) and self.is_aggregate:
|
459 |
+
t = []
|
460 |
+
for span in args[0]:
|
461 |
+
t.extend(self._tokenizer.text_to_ids(span['str'], span['lang']))
|
462 |
+
return t
|
463 |
+
|
464 |
+
t = self._tokenizer.text_to_ids(*args)
|
465 |
+
return t
|
466 |
+
|
467 |
+
super().__init__(
|
468 |
+
manifest_filepath=manifest_filepath,
|
469 |
+
parser=TokenizerWrapper(tokenizer),
|
470 |
+
normalize=normalize,
|
471 |
+
normalize_type=normalize_type,
|
472 |
+
use_rttm=use_rttm,
|
473 |
+
rttm_mode=rttm_mode,
|
474 |
+
feat_min_len=feat_min_len,
|
475 |
+
feat_mask_val=feat_mask_val,
|
476 |
+
frame_unit_time_secs=frame_unit_time_secs,
|
477 |
+
sample_rate=sample_rate,
|
478 |
+
augmentor=augmentor,
|
479 |
+
max_duration=max_duration,
|
480 |
+
min_duration=min_duration,
|
481 |
+
max_utts=max_utts,
|
482 |
+
trim=trim,
|
483 |
+
bos_id=bos_id,
|
484 |
+
eos_id=eos_id,
|
485 |
+
pad_id=pad_id,
|
486 |
+
return_sample_id=return_sample_id,
|
487 |
+
channel_selector=channel_selector,
|
488 |
+
)
|
SoundScribe/SpeakerID/nemo/collections/asr/data/feature_to_text_dataset.py
ADDED
@@ -0,0 +1,94 @@
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Optional

from nemo.collections.asr.data.feature_to_text import FeatureToBPEDataset, FeatureToCharDataset
from nemo.utils import logging


def get_char_dataset(config: dict, augmentor: Optional['FeatureAugmentor'] = None) -> FeatureToCharDataset:
    """
    Instantiates a Character Encoding based FeatureToCharDataset.

    Args:
        config: Config of the FeatureToCharDataset.
        augmentor: Optional AudioAugmentor object for augmentations on audio data.

    Returns:
        An instance of FeatureToCharDataset.
    """
    if 'labels' not in config:
        logging.warning("dataset does not have explicitly defined labels")

    dataset = FeatureToCharDataset(
        manifest_filepath=config['manifest_filepath'],
        labels=config.get('labels', None),
        normalize=config.get('normalize', 'post_norm'),
        normalize_type=config.get('normalize_type', 'per_feature'),
        use_rttm=config.get('use_rttm', False),
        rttm_mode=config.get('rttm_mode', 'mask'),
        feat_min_len=config.get('feat_min_len', 4),
        feat_mask_val=config.get('feat_mask_val', None),
        frame_unit_time_secs=config.get('frame_unit_time_secs', 0.01),
        sample_rate=config.get('sample_rate', 16000),
        augmentor=augmentor,
        max_duration=config.get('max_duration', None),
        min_duration=config.get('min_duration', None),
        max_utts=config.get('max_utts', 0),
        blank_index=config.get('blank_index', -1),
        unk_index=config.get('unk_index', -1),
        trim=config.get('trim_silence', False),
        parser=config.get('parser', 'en'),
        return_sample_id=config.get('return_sample_id', False),
        channel_selector=config.get('channel_selector', None),
    )
    return dataset


def get_bpe_dataset(
    config: dict, tokenizer: 'TokenizerSpec', augmentor: Optional['FeatureAugmentor'] = None
) -> FeatureToBPEDataset:
    """
    Instantiates a Byte Pair Encoding / Word Piece Encoding based FeatureToBPEDataset.

    Args:
        config: Config of the FeatureToBPEDataset.
        tokenizer: An instance of a TokenizerSpec object.
        augmentor: Optional FeatureAugmentor object for augmentations on audio features.

    Returns:
        An instance of FeatureToBPEDataset.
    """
    dataset = FeatureToBPEDataset(
        manifest_filepath=config['manifest_filepath'],
        tokenizer=tokenizer,
        normalize=config.get('normalize', 'post_norm'),
        normalize_type=config.get('normalize_type', 'per_feature'),
        use_rttm=config.get('use_rttm', False),
        rttm_mode=config.get('rttm_mode', 'mask'),
        feat_min_len=config.get('feat_min_len', 4),
        feat_mask_val=config.get('feat_mask_val', None),
        frame_unit_time_secs=config.get('frame_unit_time_secs', 0.01),
        sample_rate=config.get('sample_rate', 16000),
        augmentor=augmentor,
        max_duration=config.get('max_duration', None),
        min_duration=config.get('min_duration', None),
        max_utts=config.get('max_utts', 0),
        trim=config.get('trim_silence', False),
        use_start_end_token=config.get('use_start_end_token', True),
        return_sample_id=config.get('return_sample_id', False),
        channel_selector=config.get('channel_selector', None),
    )
    return dataset
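# Illustrative usage sketch (not part of the original NeMo file): a minimal config for
# `get_char_dataset`. The manifest path and label set are hypothetical placeholders;
# unspecified keys fall back to the config.get() defaults above.
def _example_get_char_dataset():
    config = {
        'manifest_filepath': '/path/to/feature_text_manifest.json',
        'labels': [" ", "a", "b", "c"],
        'normalize': 'post_norm',
        'use_rttm': False,
    }
    return get_char_dataset(config=config, augmentor=None)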
SoundScribe/SpeakerID/nemo/collections/asr/data/text_to_text.py
ADDED
@@ -0,0 +1,482 @@
1 |
+
# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
|
15 |
+
from __future__ import annotations
|
16 |
+
|
17 |
+
import concurrent.futures
|
18 |
+
import copy
|
19 |
+
import gc
|
20 |
+
import json
|
21 |
+
import math
|
22 |
+
import random
|
23 |
+
from pathlib import Path
|
24 |
+
from typing import Any, Callable, Dict, Iterable, List, NamedTuple, Optional, Set, Union
|
25 |
+
|
26 |
+
import numpy as np
|
27 |
+
import torch
|
28 |
+
import torch.utils.data
|
29 |
+
from torch.nn.utils.rnn import pad_sequence
|
30 |
+
from tqdm.auto import tqdm
|
31 |
+
|
32 |
+
from nemo.collections.asr.data.audio_to_text import _speech_collate_fn
|
33 |
+
from nemo.collections.common.tokenizers import TokenizerSpec
|
34 |
+
from nemo.core.classes import Dataset, IterableDataset
|
35 |
+
from nemo.utils import logging
|
36 |
+
|
37 |
+
try:
|
38 |
+
from nemo_text_processing.text_normalization.normalize import Normalizer
|
39 |
+
except Exception as e:
|
40 |
+
pass # Normalizer imported only for annotation purposes, error can be ignored
|
41 |
+
|
42 |
+
AnyPath = Union[Path, str]
|
43 |
+
|
44 |
+
|
45 |
+
class TextToTextItem(NamedTuple):
|
46 |
+
tts_text: torch.Tensor # normalized and tokenized text for TTS
|
47 |
+
transcript: torch.Tensor # tokenized text for ASR
|
48 |
+
speaker: int # speaker id for multi-speaker TTS
|
49 |
+
|
50 |
+
|
51 |
+
class TextToTextBatch(NamedTuple):
|
52 |
+
tts_texts: torch.Tensor # tokenized texts for tts
|
53 |
+
tts_text_lengths: torch.Tensor
|
54 |
+
transcripts: torch.Tensor # tokenized texts for ASR
|
55 |
+
transcript_lengths: torch.Tensor
|
56 |
+
speakers: torch.Tensor # speaker ids for multi-speaker TTS
|
57 |
+
|
58 |
+
@staticmethod
|
59 |
+
def collate_fn(batch: List[TextToTextItem], asr_pad_id: int, tts_text_pad_id: int) -> TextToTextBatch:
|
60 |
+
return TextToTextBatch(
|
61 |
+
tts_texts=pad_sequence([item.tts_text for item in batch], batch_first=True, padding_value=tts_text_pad_id),
|
62 |
+
tts_text_lengths=torch.tensor([item.tts_text.shape[0] for item in batch]).long(),
|
63 |
+
transcripts=pad_sequence([item.transcript for item in batch], batch_first=True, padding_value=asr_pad_id),
|
64 |
+
transcript_lengths=torch.tensor([item.transcript.shape[0] for item in batch]).long(),
|
65 |
+
speakers=torch.tensor([item.speaker for item in batch]).long(),
|
66 |
+
)
|
67 |
+
|
68 |
+
|
69 |
+
class TextOrAudioToTextBatch(NamedTuple):
    audio_signals: torch.Tensor
    audio_signal_lengths: torch.Tensor
    tts_texts: torch.Tensor
    tts_text_lengths: torch.Tensor
    speakers: torch.Tensor
    transcripts: torch.Tensor
    transcript_lengths: torch.Tensor

    @staticmethod
    def collate_fn(
        batch: List[Union[TextToTextItem, tuple]], tts_text_pad_id: int, asr_pad_id: int
    ) -> Union[TextToTextBatch, TextOrAudioToTextBatch, tuple]:
        """
        Collate function for the dataloader.
        Accepts a mixed batch of text-to-text items and audio-text items (typical for ASR).
        """
        text_items: List[TextToTextItem] = [item for item in batch if isinstance(item, TextToTextItem)]
        if not text_items:
            # pure audio-text batch
            return _speech_collate_fn(batch=batch, pad_id=asr_pad_id)

        asr_items = [item for item in batch if not isinstance(item, TextToTextItem)]

        if not asr_items:
            # pure text-to-text batch
            return TextToTextBatch.collate_fn(batch=text_items, asr_pad_id=asr_pad_id, tts_text_pad_id=tts_text_pad_id)

        # mixed batch

        # each asr item is a tuple:
        # audio_signal (0), audio_length (1), transcript (2), transcript_length (3), sample_id (4, optional)
        audio_signals = pad_sequence([item[0] for item in asr_items], batch_first=True, padding_value=0.0)
        audio_signal_lengths = torch.tensor([item[1] for item in asr_items]).long()

        tts_texts = pad_sequence(
            [item.tts_text for item in text_items], batch_first=True, padding_value=tts_text_pad_id
        )
        tts_text_lengths = torch.tensor([item.tts_text.shape[0] for item in text_items]).long()
        speakers = torch.tensor([item.speaker for item in text_items]).long()

        transcripts = pad_sequence(
            [item.transcript for item in text_items] + [item[2] for item in asr_items],
            batch_first=True,
            padding_value=asr_pad_id,
        )
        transcript_lengths = torch.tensor(
            [item.transcript.shape[0] for item in text_items] + [item[3] for item in asr_items]
        ).long()

        return TextOrAudioToTextBatch(
            audio_signals=audio_signals,
            audio_signal_lengths=audio_signal_lengths,
            tts_texts=tts_texts,
            tts_text_lengths=tts_text_lengths,
            speakers=speakers,
            transcripts=transcripts,
            transcript_lengths=transcript_lengths,
        )


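# Minimal sketch of the mixed-batch path handled above: one plain ASR tuple
# (audio_signal, audio_length, transcript, transcript_length) plus one TextToTextItem.
# All values are made up for illustration; lengths may also arrive as 0-dim tensors in practice.
def _example_collate_mixed() -> TextOrAudioToTextBatch:
    asr_sample = (torch.zeros(16000), 16000, torch.tensor([1, 2, 3]), 3)
    text_sample = TextToTextItem(tts_text=torch.tensor([4, 5]), transcript=torch.tensor([6]), speaker=0)
    # Transcripts from both sources are padded into a single tensor; audio fields cover only ASR items.
    return TextOrAudioToTextBatch.collate_fn(batch=[text_sample, asr_sample], tts_text_pad_id=0, asr_pad_id=0)

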
def _asr_text_to_tokens(text: str) -> np.ndarray:
    """
    Helper function for ASR tokenization, used only with a multiprocessing pool.
    Must be defined at module (top) level.
    Expects asr_tokenizer_global, asr_bos_id_global, asr_eos_id_global to exist in the current pool process.
    """
    ids = asr_tokenizer_global.text_to_ids(text)
    if asr_bos_id_global is not None:
        ids = [asr_bos_id_global] + ids
    if asr_eos_id_global is not None:
        ids.append(asr_eos_id_global)
    return np.asarray(ids)


def _tts_text_to_tokens(text: str) -> np.ndarray:
    """
    Helper function for TTS tokenization, used only with a multiprocessing pool.
    Must be defined at module (top) level.
    Expects tts_tokenizer_global to exist in the current pool process.
    """
    return np.asarray(tts_tokenizer_global(text))


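# Minimal sketch of the pool pattern the helpers above rely on: an initializer stores a deep copy
# of the tokenizer in a process-global variable, so pool.map only has to pickle the input strings.
# The helper below is illustrative; it assumes the default "fork" start method (as on Linux),
# otherwise the nested initializer would itself need to live at module scope.
def _example_parallel_tts_tokenization(texts: List[str], tts_parser: Callable, workers: int = 2) -> List[np.ndarray]:
    def _init(parser):
        global tts_tokenizer_global
        tts_tokenizer_global = copy.deepcopy(parser)

    with concurrent.futures.ProcessPoolExecutor(
        initializer=_init, initargs=(tts_parser,), max_workers=workers
    ) as pool:
        return list(pool.map(_tts_text_to_tokens, texts, chunksize=100))

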
def _iterate_manifest(filepath: AnyPath) -> Iterable[Dict[str, Any]]:
    """
    Helper function to iterate over a manifest file (one JSON record per line).
    """
    with open(filepath, "r", encoding="utf-8") as f:
        for line in f:
            record = json.loads(line)
            yield record


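# Sketch of the manifest fields this module relies on (values are illustrative; real NeMo
# manifests typically carry extra keys, e.g. audio paths, which are simply ignored here):
#     {"text": "twenty seven dollars", "tts_text": "$27", "tts_text_normalized": "twenty seven dollars"}
# "text" is tokenized for ASR; "tts_text_normalized" is preferred for TTS, and a bare "tts_text"
# triggers slow on-the-fly normalization in TextToTextDatasetBase below.

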
class TextToTextDatasetBase:
    """
    Base class for loading text-to-text manifests.
    Map-style and iterable datasets should inherit from this class.
    """

    asr_pad_id: int
    tts_text_pad_id: int
    asr_bos_id: Optional[int] = None
    asr_eos_id: Optional[int] = None
    data: List[Dict[str, Any]]

    def __init__(
        self,
        manifest_filepath: Union[AnyPath, List[AnyPath]],
        speakers_filepath: Union[AnyPath, List[AnyPath]],
        asr_tokenizer: TokenizerSpec,
        asr_use_start_end_token: bool,
        tts_parser: Callable,
        tts_text_pad_id: int,
        tts_text_normalizer: "Normalizer",
        tts_text_normalizer_call_kwargs: Dict,
        min_words: int = 1,
        max_words: int = 1_000_000,
        tokenizer_workers: int = 1,
        num_parts: int = 1,
        current_part_index: int = 0,
    ):
        super().__init__()
        # ASR tokenizer setup
        if asr_use_start_end_token and hasattr(asr_tokenizer, 'bos_token'):
            self.asr_bos_id = asr_tokenizer.bos_id

        if asr_use_start_end_token and hasattr(asr_tokenizer, 'eos_token'):
            self.asr_eos_id = asr_tokenizer.eos_id

        if hasattr(asr_tokenizer, 'pad_token'):
            self.asr_pad_id = asr_tokenizer.pad_id
        else:
            self.asr_pad_id = 0

        self.asr_tokenizer = asr_tokenizer

        # TTS tokenizer setup
        self.tts_parser = tts_parser
        self.tts_normalizer = tts_text_normalizer
        self.tts_normalizer_kwargs = tts_text_normalizer_call_kwargs
        self.tts_text_pad_id = tts_text_pad_id

        # Load speakers
        if isinstance(speakers_filepath, str):
            speakers_filepath = speakers_filepath.split(",")
        elif isinstance(speakers_filepath, Path):
            speakers_filepath = [speakers_filepath]
        speakers: Set[int] = set()
        for filepath in speakers_filepath:
            with open(Path(filepath).expanduser(), "r") as f:
                speakers.update(map(int, f.read().split()))
        self.speakers = np.asarray(sorted(speakers))
        logging.info(f"Loaded {len(self.speakers)} speakers")

        # Load manifest
        if isinstance(manifest_filepath, str):
            manifest_filepath = manifest_filepath.split(",")
        elif isinstance(manifest_filepath, Path):
            manifest_filepath = [manifest_filepath]
        self.manifest_paths = [Path(filepath) for filepath in manifest_filepath]

        num_skipped_words = 0
        num_skipped_utterances = 0
        asr_texts = []
        tts_texts = []
        need_normalization = False

        for manifest_path in self.manifest_paths:
            for tmp_item in tqdm(_iterate_manifest(manifest_path)):
                text = tmp_item["text"]
                num_words = len(text.split())
                # skip if the number of words is not in the desired range
                # TODO: maybe it would be valuable to sample sub-utterances from long utterances
                if not (min_words <= num_words <= max_words):
                    num_skipped_words += num_words
                    num_skipped_utterances += 1
                    continue
                asr_texts.append(tmp_item["text"])
                if "tts_text_normalized" in tmp_item:
                    tts_texts.append(tmp_item["tts_text_normalized"])
                else:
                    tts_texts.append(tmp_item["tts_text"])
                    need_normalization = True

        if need_normalization:
            logging.warning("TTS normalization is extremely slow! It is recommended to normalize TTS text in advance")

        if num_skipped_utterances:
            logging.warning(f"Skipped {num_skipped_utterances} utterances with {num_skipped_words} words")

        num_utterances = len(asr_texts)
        # preprocessing is very costly; if only a part of the data is needed, remove unnecessary utterances
        if num_parts > 1:
            # NB: floor division, the full dataset can contain fewer utterances than the original, like in a tarred dataset
            num_utterances_part = num_utterances // num_parts
            start = num_utterances_part * current_part_index
            end = start + num_utterances_part
            logging.info(
                f"Taking part {current_part_index} of {num_parts} of the dataset: utterances {start} to {end}"
            )
            asr_texts = asr_texts[start:end]
            tts_texts = tts_texts[start:end]
            num_utterances = num_utterances_part

        self.data = [dict() for _ in range(num_utterances)]

        if len(asr_texts) == 0:
            # no data was loaded
            logging.warning("Text-to-text dataset is empty")
            return

        if tokenizer_workers == 1:
            logging.warning(
                "Preprocessing large text with tokenizer_workers=1 may be slow with TTS tokenizer. "
                "Prefer tokenizer_workers=(num_cpu_cores/num_gpus_per_node)"
            )
            for i, tokenized_text in enumerate(
                tqdm((self._asr_text_to_tokens(text) for text in asr_texts), total=len(asr_texts))
            ):
                self.data[i]["asr_text_tokens"] = tokenized_text
        else:
            # Multiprocessing hack: use global variables for every worker process (per-process, not truly global to the program)
            def _init_asr_tokenize_process(tokenizer, bos_id, eos_id):
                global asr_tokenizer_global, asr_bos_id_global, asr_eos_id_global  # process-global
                # deepcopy to avoid serialization of parent models
                asr_tokenizer_global = copy.deepcopy(tokenizer)
                asr_bos_id_global = copy.deepcopy(bos_id)
                asr_eos_id_global = copy.deepcopy(eos_id)

            with concurrent.futures.ProcessPoolExecutor(
                initializer=_init_asr_tokenize_process,
                initargs=(asr_tokenizer, self.asr_bos_id, self.asr_eos_id),
                max_workers=tokenizer_workers,
            ) as pool:
                # chunk size for pool map is empirically chosen as a trade-off between speed and responsiveness
                for i, tokenized_text in enumerate(
                    tqdm(pool.map(_asr_text_to_tokens, asr_texts, chunksize=1000), total=len(asr_texts))
                ):
                    self.data[i]["asr_text_tokens"] = tokenized_text
        # force free memory
        del asr_texts
        gc.collect()

        if tokenizer_workers == 1:
            logging.warning(
                "Preprocessing large text with tokenizer_workers=1 may be slow with TTS tokenizer. "
                "Prefer tokenizer_workers=(num_cpu_cores/num_gpus_per_node)"
            )
            for i, tokenized_text in enumerate(
                tqdm(
                    (self._tts_text_to_tokens(text, normalize=need_normalization) for text in tts_texts),
                    total=len(tts_texts),
                )
            ):
                self.data[i]["tts_text_tokens"] = tokenized_text
        else:
            if need_normalization:
                # TODO: implement, if we really need normalization in place
                raise NotImplementedError(
                    "Normalization with tokenizer_workers > 1 is not implemented. "
                    "It is not recommended to use normalization on the fly at all, since it's extremely slow"
                )

            def _init_tts_tokenize_process(tokenizer):
                global tts_tokenizer_global  # process-global
                tts_tokenizer_global = copy.deepcopy(tokenizer)

            with concurrent.futures.ProcessPoolExecutor(
                initializer=_init_tts_tokenize_process, initargs=(tts_parser,), max_workers=tokenizer_workers,
            ) as pool:
                # chunk size for pool map is empirically chosen as a trade-off between speed and responsiveness
                for i, tokenized_text in enumerate(
                    tqdm(pool.map(_tts_text_to_tokens, tts_texts, chunksize=1000), total=len(tts_texts))
                ):
                    self.data[i]["tts_text_tokens"] = tokenized_text
        # force free memory
        del tts_texts
        gc.collect()

    def _asr_text_to_tokens(self, text: str) -> np.ndarray:
        ids = self.asr_tokenizer.text_to_ids(text)
        if self.asr_bos_id is not None:
            ids = [self.asr_bos_id] + ids
        if self.asr_eos_id is not None:
            ids.append(self.asr_eos_id)
        return np.asarray(ids)

    def _tts_text_to_tokens(self, text: str, normalize=True) -> np.ndarray:
        if normalize:
            text = self.tts_normalizer.normalize(text, **self.tts_normalizer_kwargs)
        tokens = self.tts_parser(text)
        return np.asarray(tokens)

    def __getitem__(self, index):
        item = self.data[index]
        return TextToTextItem(
            transcript=torch.from_numpy(item["asr_text_tokens"]).long(),
            tts_text=torch.from_numpy(item["tts_text_tokens"]).long(),
            speaker=random.choice(self.speakers),
        )

    def __len__(self):
        return len(self.data)


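# Sketch of how one might choose tokenizer_workers, following the warning emitted above
# ("Prefer tokenizer_workers=(num_cpu_cores/num_gpus_per_node)"). The helper name and the
# fallbacks are illustrative, not part of any NeMo API.
def _example_num_tokenizer_workers() -> int:
    import os

    num_cpus = os.cpu_count() or 1
    num_gpus = torch.cuda.device_count() if torch.cuda.is_available() else 1
    return max(1, num_cpus // max(1, num_gpus))

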
class TextToTextDataset(TextToTextDatasetBase, Dataset):
    """Text-to-Text Map-style Dataset for hybrid ASR-TTS models"""

    def __init__(
        self,
        manifest_filepath: Union[AnyPath, List[AnyPath]],
        speakers_filepath: Union[AnyPath, List[AnyPath]],
        asr_tokenizer: TokenizerSpec,
        asr_use_start_end_token: bool,
        tts_parser: Callable,
        tts_text_pad_id: int,
        tts_text_normalizer: "Normalizer",
        tts_text_normalizer_call_kwargs: Dict,
        min_words: int = 1,
        max_words: int = 1_000_000,
        tokenizer_workers: int = 1,
    ):
        super().__init__(
            manifest_filepath=manifest_filepath,
            speakers_filepath=speakers_filepath,
            asr_tokenizer=asr_tokenizer,
            asr_use_start_end_token=asr_use_start_end_token,
            tts_parser=tts_parser,
            tts_text_pad_id=tts_text_pad_id,
            tts_text_normalizer=tts_text_normalizer,
            tts_text_normalizer_call_kwargs=tts_text_normalizer_call_kwargs,
            min_words=min_words,
            max_words=max_words,
            tokenizer_workers=tokenizer_workers,
            num_parts=1,
        )

    def collate_fn(
        self, batch: List[Union[TextToTextItem, tuple]]
    ) -> Union[TextToTextBatch, TextOrAudioToTextBatch, tuple]:
        """
        Collate function for the dataloader.
        Accepts a mixed batch of text-to-text items and audio-text items (typical for ASR).
        """
        return TextOrAudioToTextBatch.collate_fn(
            batch=batch, asr_pad_id=self.asr_pad_id, tts_text_pad_id=self.tts_text_pad_id
        )


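# Minimal sketch of wiring the map-style dataset into a DataLoader; "dataset" stands for an
# already constructed TextToTextDataset and the batch size is arbitrary. The key detail is
# passing dataset.collate_fn so that mixed text/audio batches are padded consistently.
def _example_dataloader(dataset: TextToTextDataset) -> torch.utils.data.DataLoader:
    return torch.utils.data.DataLoader(dataset, batch_size=16, shuffle=True, collate_fn=dataset.collate_fn)

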
class TextToTextIterableDataset(TextToTextDatasetBase, IterableDataset):
    """
    Text-to-Text Iterable Dataset for hybrid ASR-TTS models.
    Only the part necessary for the current process should be loaded and stored.
    """

    def __init__(
        self,
        manifest_filepath: Union[AnyPath, List[AnyPath]],
        speakers_filepath: Union[AnyPath, List[AnyPath]],
        asr_tokenizer: TokenizerSpec,
        asr_use_start_end_token: bool,
        tts_parser: Callable,
        tts_text_pad_id: int,
        tts_text_normalizer: "Normalizer",
        tts_text_normalizer_call_kwargs: Dict,
        min_words: int = 1,
        max_words: int = 1_000_000,
        tokenizer_workers: int = 1,
        num_parts: int = 1,
        current_part_index: int = 0,
    ):
        super().__init__(
            manifest_filepath=manifest_filepath,
            speakers_filepath=speakers_filepath,
            asr_tokenizer=asr_tokenizer,
            asr_use_start_end_token=asr_use_start_end_token,
            tts_parser=tts_parser,
            tts_text_pad_id=tts_text_pad_id,
            tts_text_normalizer=tts_text_normalizer,
            tts_text_normalizer_call_kwargs=tts_text_normalizer_call_kwargs,
            min_words=min_words,
            max_words=max_words,
            tokenizer_workers=tokenizer_workers,
            num_parts=num_parts,
            current_part_index=current_part_index,
        )

    def __iter__(self):
        # Implementation based on docs: https://pytorch.org/docs/stable/data.html#torch.utils.data.IterableDataset
        worker_info = torch.utils.data.get_worker_info()
        if worker_info is None:  # single-process data loading, return the full iterator
            start = 0
            end = len(self)
        else:  # in a worker process
            # split the workload across workers
            per_worker = int(math.ceil(len(self) / float(worker_info.num_workers)))
            worker_id = worker_info.id
            start = worker_id * per_worker
            end = min(start + per_worker, len(self))
        indices = np.arange(start, end)
        np.random.shuffle(indices)
        return map(self.__getitem__, indices)

    def collate_fn(
        self, batch: List[Union[TextToTextItem, tuple]]
    ) -> Union[TextToTextBatch, TextOrAudioToTextBatch, tuple]:
        """
        Collate function for the dataloader.
        Accepts a mixed batch of text-to-text items and audio-text items (typical for ASR).
        """
        return TextOrAudioToTextBatch.collate_fn(
            batch=batch, asr_pad_id=self.asr_pad_id, tts_text_pad_id=self.tts_text_pad_id
        )


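# Sketch of how the iterable variant is typically sharded: each data-parallel rank loads only its
# part via num_parts/current_part_index, and __iter__ then splits that part further across
# DataLoader workers. The rank/world-size lookups and the kwargs passthrough are illustrative.
def _example_rank_sharded_dataset(**common_kwargs) -> TextToTextIterableDataset:
    import torch.distributed as dist

    if dist.is_available() and dist.is_initialized():
        world_size, rank = dist.get_world_size(), dist.get_rank()
    else:
        world_size, rank = 1, 0
    return TextToTextIterableDataset(num_parts=world_size, current_part_index=rank, **common_kwargs)
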
SoundScribe/SpeakerID/nemo/collections/asr/losses/__init__.py
ADDED
@@ -0,0 +1,22 @@
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from nemo.collections.asr.losses.angularloss import AngularSoftmaxLoss
from nemo.collections.asr.losses.audio_losses import SDRLoss
from nemo.collections.asr.losses.ctc import CTCLoss
from nemo.collections.asr.losses.lattice_losses import LatticeLoss
from nemo.collections.asr.losses.ssl_losses.contrastive import ContrastiveLoss
from nemo.collections.asr.losses.ssl_losses.ctc import CTCLossForSSL
from nemo.collections.asr.losses.ssl_losses.mlm import MLMLoss
from nemo.collections.asr.losses.ssl_losses.rnnt import RNNTLossForSSL
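# The package __init__ above re-exports the loss classes, so downstream code can import them
# from the package root; a minimal sketch (constructor arguments intentionally omitted):
#
#     from nemo.collections.asr.losses import AngularSoftmaxLoss, CTCLoss
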
SoundScribe/SpeakerID/nemo/collections/asr/losses/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (784 Bytes).
SoundScribe/SpeakerID/nemo/collections/asr/losses/__pycache__/angularloss.cpython-310.pyc
ADDED
Binary file (2.43 kB).
SoundScribe/SpeakerID/nemo/collections/asr/losses/__pycache__/audio_losses.cpython-310.pyc
ADDED
Binary file (10.9 kB).
SoundScribe/SpeakerID/nemo/collections/asr/losses/__pycache__/ctc.cpython-310.pyc
ADDED
Binary file (2.29 kB).