diff --git a/README.md b/README.md index 7b916f7bc09b9102fe9a0b1709701d106c0606e5..ea8958397bb5b5fdcb96cd966bd040050ece6fd6 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,10 @@ --- -title: Vakyansh Bengali TTS -emoji: 💩 -colorFrom: gray -colorTo: blue +title: Vakyansh Hindi TTS +emoji: 🐨 +colorFrom: indigo +colorTo: purple sdk: gradio -sdk_version: 2.9.1 +sdk_version: 2.8.13 app_file: app.py pinned: false license: apache-2.0 diff --git a/app.py b/app.py new file mode 100644 index 0000000000000000000000000000000000000000..17d266c2853bd1b137ed4e818a565f77348b2f43 --- /dev/null +++ b/app.py @@ -0,0 +1,28 @@ +import os +os.system('wget -q https://storage.googleapis.com/vakyansh-open-models/tts/bengali/bn-IN/female_voice_0/glow.zip && unzip -q glow.zip -d ttsv/checkpoints/female') +os.system('wget -q https://storage.googleapis.com/vakyansh-open-models/tts/bengali/bn-IN/female_voice_0/hifi.zip && unzip -q hifi.zip -d ttsv/checkpoints/female') +os.system('rm glow.zip && rm hifi.zip') +os.system('wget -q https://storage.googleapis.com/vakyansh-open-models/tts/bengali/bn-IN/male_voice_1/glow.zip && unzip -q glow.zip -d ttsv/checkpoints/male') +os.system('wget -q https://storage.googleapis.com/vakyansh-open-models/tts/bengali/bn-IN/male_voice_1/hifi.zip && unzip -q hifi.zip -d ttsv/checkpoints/male') +os.system('wget -q https://storage.googleapis.com/vakyansh-open-models/translit_models.zip -P ttsv/checkpoints/ && unzip -q ttsv/checkpoints/translit_models.zip -d ttsv/checkpoints/') + + +for path, subdirs, files in os.walk('ttsv/checkpoints/'): + print(subdirs) + for name in files: + print(os.path.join(path, name)) + +from ttsv.utils.inference.run_gradio import * +from argparse import Namespace + +#os.system('python ttsv/utils/inference/run_gradio.py -a ttsv/checkpoints/glow/male -v ttsv/checkpoints/hifi/male -d cpu -L hi') + + +args = { + 'acoustic':'/home/user/app/ttsv/checkpoints/female/glow_ckp,/home/user/app/ttsv/checkpoints/male/glow_ckp', + 'vocoder':'/home/user/app/ttsv/checkpoints/female/hifi_ckp,/home/user/app/ttsv/checkpoints/male/hifi_ckp', + 'device':'cpu', + 'lang':'hi' +} + +build_gradio(Namespace(**args)) \ No newline at end of file diff --git a/packages.txt b/packages.txt new file mode 100644 index 0000000000000000000000000000000000000000..327dcda5b104f81b324065a8386d143f1afb44a1 --- /dev/null +++ b/packages.txt @@ -0,0 +1 @@ +libsndfile1 \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..448a87a8c839c5bce20bbbebce120fcf1eff5027 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,19 @@ +Cython==0.29.24 +layers==0.1.5 +librosa==0.8.1 +matplotlib==3.3.4 +numpy==1.21.0 +scipy==1.5.4 +tensorboardX==2.4 +tensorboard==2.7.0 +tqdm==4.62.3 +fastapi==0.70.0 +uvicorn==0.15.0 +gradio==2.5.2 +wavio==0.0.4 +mosestokenizer==1.2.1 +indic-nlp-library==0.81 +inflect==5.3.0 +Unidecode==1.3.2 +torch +pydload \ No newline at end of file diff --git a/ttsv/.gitignore b/ttsv/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..45bf260a4a2f452da0b71f9adaacd7b49a6b51de --- /dev/null +++ b/ttsv/.gitignore @@ -0,0 +1,132 @@ +# Byte-compiled / optimized / DLL files +.DS_Store +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually 
these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +.idea/ diff --git a/ttsv/LICENSE.md b/ttsv/LICENSE.md new file mode 100644 index 0000000000000000000000000000000000000000..5fd2e54913fd05b69de2874ec8f9a10c7f4e8d3f --- /dev/null +++ b/ttsv/LICENSE.md @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2022 Open-Speech-EkStep + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/ttsv/README.md b/ttsv/README.md new file mode 100644 index 0000000000000000000000000000000000000000..02892bc9dd4344e550596d238e2b71870cfc7dd3 --- /dev/null +++ b/ttsv/README.md @@ -0,0 +1,220 @@ +# vakyansh-tts +Text to Speech for Indic languages + +## 1. 
Installation and Setup for training + +Clone repo +Note : for multspeaker glow-tts training use branch [multispeaker](https://github.com/Open-Speech-EkStep/vakyansh-tts/tree/multispeaker) +``` +git clone https://github.com/Open-Speech-EkStep/vakyansh-tts +``` +Build conda virtual environment +``` +cd ./vakyansh-tts +conda create --name python=3.7 +conda activate +pip install -r requirements.txt +``` +Install [apex](https://github.com/NVIDIA/apex); commit: 37cdaf4 for Mixed-precision training + +Note : used only for glow-tts +``` +cd .. +git clone https://github.com/NVIDIA/apex +cd apex +git checkout 37cdaf4 +pip install -v --disable-pip-version-check --no-cache-dir ./ +cd ../vakyansh-tts +``` +Build Monotonic Alignment Search Code (Cython) + +Note : used only for glow-tts +``` +bash install.sh +``` + +## 2. Data Resampling + +The data format should have a folder containing all the .wav files for glow-tts and a text file containing filenames with their sentences. + +Directory structure: + +langauge_folder_name +``` +language_folder_name +|-- ./wav/*.wav +|-- ./text_file_name.txt +``` +The format for text_file_name.txt (Text file is only needed for glow-tts training) + +``` +( audio1.wav "Sentence1." ) +( audio2.wav "Sentence2." ) +``` + +To resample the .wav files to 22050 sample rate, change the following parameters in the vakyansh-tts/scripts/data/resample.sh + +``` +input_wav_path : absolute path to wav file folder in vakyansh_tts/data/ +output_wav_path : absolute path to vakyansh_tts/data/resampled_wav_folder_name +output_sample_rate : 22050 (or any other desired sample rate) +``` + +To run: +```bash +cd scripts/data/ +bash resample.sh +``` + + +## 3. Spectogram Training (glow-tts) + +### 3.1 Data Preparation + + +To prepare the data edit the vakyansh-tts/scripts/glow/prepare_data.sh file and change the following parameters +``` +input_text_path : absolute path to vakyansh_tts/data/text_file_name.txt +input_wav_path : absolute path to vakyansh_tts/data/resampled_wav_folder_name +gender : female or male voice +``` +To run: +```bash +cd scripts/glow/ +bash prepare_data.sh +``` +### 3.2 Training glow-tts + +To start the spectogram-training edit the vakyansh-tts/scripts/glow/train_glow.sh file and change the following parameter: +``` +gender : female or male voice +``` +Make sure that the gender is same as that of the prepare_data.sh file + +To start the training, run: +```bash +cd scripts/glow/ +bash train_glow.sh +``` +## 4. Vocoder Training (hifi-gan) + +### 4.1 Data Preparation + +To prepare the data edit the vakyansh-tts/scripts/hifi/prepare_data.sh file and change the following parameters +``` +input_wav_path : absolute path to vakyansh_tts/data/resampled_wav_folder_name +gender : female or male voice +``` +To run: +```bash +cd scripts/hifi/ +bash prepare_data.sh +``` +### 4.2 Training hifi-gan + +To start the spectogram-training edit the vakyansh-tts/scripts/hifi/train_hifi.sh file and change the following parameter: +``` +gender : female or male voice +``` +Make sure that the gender is same as that of the prepare_data.sh file + +To start the training, run: +```bash +cd scripts/hifi/ +bash train_hifi.sh +``` + +## 5. 
Inference + +### 5.1 Using Gradio + +To use the gradio link edit the following parameters in the vakyansh-tts/scripts/inference/gradio.sh file: +``` +gender : female or male voice +device : cpu or cuda +lang : langauge code +``` + +To run: +```bash +cd scripts/inference/ +bash gradio.sh +``` +### 5.2 Using fast API +To use the fast api link edit the parameters in the vakyansh-tts/scripts/inference/api.sh file similar to section 5.1 + +To run: +```bash +cd scripts/inference/ +bash api.sh +``` + +### 5.3 Direct Inference using text +To infer, edit the parameters in the vakyansh-tts/scripts/inference/infer.sh file similar to section 5.1 and set the text to the text variable + +To run: +```bash +cd scripts/inference/ +bash infer.sh +``` + +To configure other parameters there is a version that runs the advanced inference as well. Additional Parameters: +``` +noise_scale : can vary from 0 to 1 for noise factor +length_scale : can vary from 0 to 2 for changing the speed of the generated audio +transliteration : whether to switch on/off transliteration. 1: ON, 0: OFF +number_conversion : whether to switch on/off number to words conversion. 1: ON, 0: OFF +split_sentences : whether to switch on/off splitting of sentences. 1: ON, 0: OFF +``` +To run: +``` +cd scripts/inference/ +bash advanced_infer.sh +``` + +### 5.4 Installation of tts_infer package + +In tts_infer package, we currently have two components: + + 1. Transliteration (AI4bharat's open sourced models) (Languages supported: {'hi', 'gu', 'mr', 'bn', 'te', 'ta', 'kn', 'pa', 'gom', 'mai', 'ml', 'sd', 'si', 'ur'} ) + + 2. Num to Word (Languages supported: {'en', 'hi', 'gu', 'mr', 'bn', 'te', 'ta', 'kn', 'or', 'pa'} ) +``` +git clone https://github.com/Open-Speech-EkStep/vakyansh-tts +cd vakyansh-tts +bash install.sh +python setup.py bdist_wheel +pip install -e . +cd tts_infer +gsutil -m cp -r gs://vakyaansh-open-models/translit_models . 
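# Note: an illustrative alternative, not part of the original instructions -- if gsutil
# is unavailable, the same transliteration assets are also served over HTTPS (these are
# the URLs used by the demo notebook bundled with this repo):
# wget https://storage.googleapis.com/vakyaansh-open-models/translit_models/default_lineup.json
# wget https://storage.googleapis.com/vakyaansh-open-models/translit_models/hindi/hindi_transliteration.zip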
+``` + +Usage: Refer to example file in tts_infer/ +``` +from tts_infer.tts import TextToMel, MelToWav +from tts_infer.transliterate import XlitEngine +from tts_infer.num_to_word_on_sent import normalize_nums + +import re +from scipy.io.wavfile import write + +text_to_mel = TextToMel(glow_model_dir='/path/to/glow-tts/checkpoint/dir', device='cuda') +mel_to_wav = MelToWav(hifi_model_dir='/path/to/hifi/checkpoint/dir', device='cuda') + +def translit(text, lang): + reg = re.compile(r'[a-zA-Z]') + engine = XlitEngine(lang) + words = [engine.translit_word(word, topk=1)[lang][0] if reg.match(word) else word for word in text.split()] + updated_sent = ' '.join(words) + return updated_sent + +def run_tts(text, lang): + text = text.replace('।', '.') # only for hindi models + text_num_to_word = normalize_nums(text, lang) # converting numbers to words in lang + text_num_to_word_and_transliterated = translit(text_num_to_word, lang) # transliterating english words to lang + + mel = text_to_mel.generate_mel(text_num_to_word_and_transliterated) + audio, sr = mel_to_wav.generate_wav(mel) + write(filename='temp.wav', rate=sr, data=audio) # for saving wav file, if needed + return (sr, audio) +``` diff --git a/ttsv/__init__.py b/ttsv/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/ttsv/checkpoints/glow/.gitkeep b/ttsv/checkpoints/glow/.gitkeep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/ttsv/checkpoints/hifi/.gitkeep b/ttsv/checkpoints/hifi/.gitkeep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/ttsv/config/.gitkeep b/ttsv/config/.gitkeep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/ttsv/config/glow/base.json b/ttsv/config/glow/base.json new file mode 100644 index 0000000000000000000000000000000000000000..c87165196faa226bcef5f995113281489aea0de7 --- /dev/null +++ b/ttsv/config/glow/base.json @@ -0,0 +1,54 @@ +{ + "train": { + "use_cuda": true, + "log_interval": 20, + "seed": 1234, + "epochs": 10000, + "learning_rate": 1e0, + "betas": [0.9, 0.98], + "eps": 1e-9, + "warmup_steps": 4000, + "scheduler": "noam", + "batch_size": 16, + "ddi": true, + "fp16_run": true, + "save_epoch": 1 + }, + "data": { + "load_mel_from_disk": false, + "training_files":"../data/training/train.txt", + "validation_files":"../data/training/valid.txt", + "chars":"", + "punc":"", + "text_cleaners":["basic_indic_cleaners"], + "max_wav_value": 32768.0, + "sampling_rate": 22050, + "filter_length": 1024, + "hop_length": 256, + "win_length": 1024, + "n_mel_channels": 80, + "mel_fmin": 80.0, + "mel_fmax": 7600.0, + "add_noise": true + }, + "model": { + "hidden_channels": 192, + "filter_channels": 768, + "filter_channels_dp": 256, + "kernel_size": 3, + "p_dropout": 0.1, + "n_blocks_dec": 12, + "n_layers_enc": 6, + "n_heads": 2, + "p_dropout_dec": 0.05, + "dilation_rate": 1, + "kernel_size_dec": 5, + "n_block_layers": 4, + "n_sqz": 2, + "prenet": true, + "mean_only": true, + "hidden_channels_enc": 192, + "hidden_channels_dec": 192, + "window_size": 4 + } +} diff --git a/ttsv/config/glow/base_blank.json b/ttsv/config/glow/base_blank.json new file mode 100644 index 0000000000000000000000000000000000000000..2c359a73317e7769f51cedcac2127affbf40b7ff --- /dev/null +++ b/ttsv/config/glow/base_blank.json @@ -0,0 +1,55 @@ +{ + "train": { + 
"use_cuda": true, + "log_interval": 20, + "seed": 1234, + "epochs": 10000, + "learning_rate": 1e0, + "betas": [0.9, 0.98], + "eps": 1e-9, + "warmup_steps": 4000, + "scheduler": "noam", + "batch_size": 16, + "ddi": true, + "fp16_run": true, + "save_epoch": 1 + }, + "data": { + "load_mel_from_disk": false, + "training_files":"../data/training/train.txt", + "validation_files":"../data/training/valid.txt", + "chars":"", + "punc":"", + "text_cleaners":["basic_indic_cleaners"], + "max_wav_value": 32768.0, + "sampling_rate": 22050, + "filter_length": 1024, + "hop_length": 256, + "win_length": 1024, + "n_mel_channels": 80, + "mel_fmin": 80.0, + "mel_fmax": 7600.0, + "add_noise": true, + "add_blank": true + }, + "model": { + "hidden_channels": 192, + "filter_channels": 768, + "filter_channels_dp": 256, + "kernel_size": 3, + "p_dropout": 0.1, + "n_blocks_dec": 12, + "n_layers_enc": 6, + "n_heads": 2, + "p_dropout_dec": 0.05, + "dilation_rate": 1, + "kernel_size_dec": 5, + "n_block_layers": 4, + "n_sqz": 2, + "prenet": true, + "mean_only": true, + "hidden_channels_enc": 192, + "hidden_channels_dec": 192, + "window_size": 4 + } +} diff --git a/ttsv/config/hifi/config_v1.json b/ttsv/config/hifi/config_v1.json new file mode 100644 index 0000000000000000000000000000000000000000..cb82eda796ce9f9e60de119cd503b617d4efdba2 --- /dev/null +++ b/ttsv/config/hifi/config_v1.json @@ -0,0 +1,37 @@ +{ + "resblock": "1", + "num_gpus": 0, + "batch_size": 24, + "learning_rate": 0.0002, + "adam_b1": 0.8, + "adam_b2": 0.99, + "lr_decay": 0.999, + "seed": 1234, + + "upsample_rates": [8,8,2,2], + "upsample_kernel_sizes": [16,16,4,4], + "upsample_initial_channel": 512, + "resblock_kernel_sizes": [3,7,11], + "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], + + "segment_size": 8192, + "num_mels": 80, + "num_freq": 1025, + "n_fft": 1024, + "hop_size": 256, + "win_size": 1024, + + "sampling_rate": 22050, + + "fmin": 80, + "fmax": 7600, + "fmax_for_loss": null, + + "num_workers": 4, + + "dist_config": { + "dist_backend": "nccl", + "dist_url": "tcp://localhost:54321", + "world_size": 1 + } +} diff --git a/ttsv/config/hifi/config_v2.json b/ttsv/config/hifi/config_v2.json new file mode 100644 index 0000000000000000000000000000000000000000..b5a85ef874ed03d4002258ab5901f9bdf9a4f07b --- /dev/null +++ b/ttsv/config/hifi/config_v2.json @@ -0,0 +1,37 @@ +{ + "resblock": "1", + "num_gpus": 0, + "batch_size": 24, + "learning_rate": 0.0002, + "adam_b1": 0.8, + "adam_b2": 0.99, + "lr_decay": 0.999, + "seed": 1234, + + "upsample_rates": [8,8,2,2], + "upsample_kernel_sizes": [16,16,4,4], + "upsample_initial_channel": 128, + "resblock_kernel_sizes": [3,7,11], + "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], + + "segment_size": 8192, + "num_mels": 80, + "num_freq": 1025, + "n_fft": 1024, + "hop_size": 256, + "win_size": 1024, + + "sampling_rate": 22050, + + "fmin": 80, + "fmax": 7600, + "fmax_for_loss": null, + + "num_workers": 4, + + "dist_config": { + "dist_backend": "nccl", + "dist_url": "tcp://localhost:54321", + "world_size": 1 + } +} diff --git a/ttsv/config/hifi/config_v3.json b/ttsv/config/hifi/config_v3.json new file mode 100644 index 0000000000000000000000000000000000000000..7d6bafd26a180906df23e38f8ff59ce6f3469a03 --- /dev/null +++ b/ttsv/config/hifi/config_v3.json @@ -0,0 +1,37 @@ +{ + "resblock": "2", + "num_gpus": 0, + "batch_size": 24, + "learning_rate": 0.0002, + "adam_b1": 0.8, + "adam_b2": 0.99, + "lr_decay": 0.999, + "seed": 1234, + + "upsample_rates": [8,8,4], + "upsample_kernel_sizes": [16,16,8], + 
"upsample_initial_channel": 256, + "resblock_kernel_sizes": [3,5,7], + "resblock_dilation_sizes": [[1,2], [2,6], [3,12]], + + "segment_size": 8192, + "num_mels": 80, + "num_freq": 1025, + "n_fft": 1024, + "hop_size": 256, + "win_size": 1024, + + "sampling_rate": 22050, + + "fmin": 80, + "fmax": 7600, + "fmax_for_loss": null, + + "num_workers": 4, + + "dist_config": { + "dist_backend": "nccl", + "dist_url": "tcp://localhost:54321", + "world_size": 1 + } +} diff --git a/ttsv/data/.gitkeep b/ttsv/data/.gitkeep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/ttsv/install.sh b/ttsv/install.sh new file mode 100644 index 0000000000000000000000000000000000000000..51e038d5a0098f21d4efd8051a15b7f0cdeb4b73 --- /dev/null +++ b/ttsv/install.sh @@ -0,0 +1,6 @@ +cd src/glow_tts/monotonic_align/ +pip install . +cd ../../../ + +# torch +pip install torch==1.7.1+cu110 torchvision==0.8.2+cu110 torchaudio==0.7.2 -f https://download.pytorch.org/whl/torch_stable.html diff --git a/ttsv/logs/glow/.gitkeep b/ttsv/logs/glow/.gitkeep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/ttsv/logs/hifi/.gitkeep b/ttsv/logs/hifi/.gitkeep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/ttsv/notebooks/vakyansh_tts_demo.ipynb b/ttsv/notebooks/vakyansh_tts_demo.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..a39c80d91a8a2f1edef63bf09e7337712c54d9d3 --- /dev/null +++ b/ttsv/notebooks/vakyansh_tts_demo.ipynb @@ -0,0 +1,546 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "name": "vakyansh_tts_demo.ipynb", + "provenance": [], + "authorship_tag": "ABX9TyNhhwduU9+eajfOP6r1Y98A", + "include_colab_link": true + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "view-in-github", + "colab_type": "text" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "markdown", + "source": [ + "## Installing Dependencies" + ], + "metadata": { + "id": "oyoFPN29HrRt" + } + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "id": "5x4wJQGUaysK", + "outputId": "90d49030-311e-4100-b42a-3849df217887" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Cloning into 'vakyansh-tts'...\n", + "remote: Enumerating objects: 466, done.\u001b[K\n", + "remote: Counting objects: 100% (201/201), done.\u001b[K\n", + "remote: Compressing objects: 100% (175/175), done.\u001b[K\n", + "remote: Total 466 (delta 89), reused 64 (delta 22), pack-reused 265\u001b[K\n", + "Receiving objects: 100% (466/466), 259.27 KiB | 1.39 MiB/s, done.\n", + "Resolving deltas: 100% (229/229), done.\n", + "Processing /content/vakyansh-tts/src/glow_tts/monotonic_align\n", + "\u001b[33m DEPRECATION: A future pip version will change local packages to be built in-place without first copying to a temporary directory. We recommend you use --use-feature=in-tree-build to test your packages with this new behavior before it becomes the default.\n", + " pip 21.3 will remove support for this functionality. You can find discussion regarding this at https://github.com/pypa/pip/issues/7555.\u001b[0m\n", + " Installing build dependencies ... 
\u001b[?25l\u001b[?25hdone\n", + " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n", + " Preparing wheel metadata ... \u001b[?25l\u001b[?25hdone\n", + "Requirement already satisfied: numpy in /usr/local/lib/python3.7/dist-packages (from monotonic-align==1.1) (1.19.5)\n", + "Building wheels for collected packages: monotonic-align\n", + " Building wheel for monotonic-align (PEP 517) ... \u001b[?25l\u001b[?25hdone\n", + " Created wheel for monotonic-align: filename=monotonic_align-1.1-cp37-cp37m-linux_x86_64.whl size=237012 sha256=3ffba87629daf17ecf86f538ead38094792d74d16b36cf691371c36f2e2c8ead\n", + " Stored in directory: /tmp/pip-ephem-wheel-cache-m1jlgsel/wheels/3a/e4/2d/953a66d439600fcb1836ffba5ef6915b944df396e8228909cb\n", + "Successfully built monotonic-align\n", + "Installing collected packages: monotonic-align\n", + "Successfully installed monotonic-align-1.1\n", + "running bdist_wheel\n", + "running build\n", + "running build_py\n", + "creating build\n", + "creating build/lib\n", + "creating build/lib/tts_infer\n", + "copying tts_infer/tts.py -> build/lib/tts_infer\n", + "copying tts_infer/num_to_word_on_sent.py -> build/lib/tts_infer\n", + "copying tts_infer/transliterate.py -> build/lib/tts_infer\n", + "copying tts_infer/__init__.py -> build/lib/tts_infer\n", + "running egg_info\n", + "creating vakyansh_tts.egg-info\n", + "writing vakyansh_tts.egg-info/PKG-INFO\n", + "writing dependency_links to vakyansh_tts.egg-info/dependency_links.txt\n", + "writing requirements to vakyansh_tts.egg-info/requires.txt\n", + "writing top-level names to vakyansh_tts.egg-info/top_level.txt\n", + "writing manifest file 'vakyansh_tts.egg-info/SOURCES.txt'\n", + "adding license file 'LICENSE.md'\n", + "writing manifest file 'vakyansh_tts.egg-info/SOURCES.txt'\n", + "copying tts_infer/requirements.txt -> build/lib/tts_infer\n", + "installing to build/bdist.linux-x86_64/wheel\n", + "running install\n", + "running install_lib\n", + "creating build/bdist.linux-x86_64\n", + "creating build/bdist.linux-x86_64/wheel\n", + "creating build/bdist.linux-x86_64/wheel/tts_infer\n", + "copying build/lib/tts_infer/tts.py -> build/bdist.linux-x86_64/wheel/tts_infer\n", + "copying build/lib/tts_infer/num_to_word_on_sent.py -> build/bdist.linux-x86_64/wheel/tts_infer\n", + "copying build/lib/tts_infer/transliterate.py -> build/bdist.linux-x86_64/wheel/tts_infer\n", + "copying build/lib/tts_infer/__init__.py -> build/bdist.linux-x86_64/wheel/tts_infer\n", + "copying build/lib/tts_infer/requirements.txt -> build/bdist.linux-x86_64/wheel/tts_infer\n", + "running install_egg_info\n", + "Copying vakyansh_tts.egg-info to build/bdist.linux-x86_64/wheel/vakyansh_tts-0.0.1-py3.7.egg-info\n", + "running install_scripts\n", + "adding license file \"LICENSE.md\" (matched pattern \"LICEN[CS]E*\")\n", + "creating build/bdist.linux-x86_64/wheel/vakyansh_tts-0.0.1.dist-info/WHEEL\n", + "creating 'dist/vakyansh_tts-0.0.1-py3-none-any.whl' and adding 'build/bdist.linux-x86_64/wheel' to it\n", + "adding 'tts_infer/__init__.py'\n", + "adding 'tts_infer/num_to_word_on_sent.py'\n", + "adding 'tts_infer/requirements.txt'\n", + "adding 'tts_infer/transliterate.py'\n", + "adding 'tts_infer/tts.py'\n", + "adding 'vakyansh_tts-0.0.1.dist-info/LICENSE.md'\n", + "adding 'vakyansh_tts-0.0.1.dist-info/METADATA'\n", + "adding 'vakyansh_tts-0.0.1.dist-info/WHEEL'\n", + "adding 'vakyansh_tts-0.0.1.dist-info/top_level.txt'\n", + "adding 'vakyansh_tts-0.0.1.dist-info/RECORD'\n", + "removing build/bdist.linux-x86_64/wheel\n", + 
"Obtaining file:///content/vakyansh-tts\n", + "Requirement already satisfied: Cython==0.29.24 in /usr/local/lib/python3.7/dist-packages (from vakyansh-tts==0.0.1) (0.29.24)\n", + "Collecting inflect==5.3.0\n", + " Downloading inflect-5.3.0-py3-none-any.whl (32 kB)\n", + "Collecting layers==0.1.5\n", + " Downloading layers-0.1.5.tar.gz (5.5 kB)\n", + "Requirement already satisfied: librosa==0.8.1 in /usr/local/lib/python3.7/dist-packages (from vakyansh-tts==0.0.1) (0.8.1)\n", + "Collecting matplotlib==3.3.4\n", + " Downloading matplotlib-3.3.4-cp37-cp37m-manylinux1_x86_64.whl (11.5 MB)\n", + "\u001b[K |████████████████████████████████| 11.5 MB 11.3 MB/s \n", + "\u001b[?25hRequirement already satisfied: numpy==1.19.5 in /usr/local/lib/python3.7/dist-packages (from vakyansh-tts==0.0.1) (1.19.5)\n", + "Collecting scipy==1.5.4\n", + " Downloading scipy-1.5.4-cp37-cp37m-manylinux1_x86_64.whl (25.9 MB)\n", + "\u001b[K |████████████████████████████████| 25.9 MB 1.2 MB/s \n", + "\u001b[?25hCollecting tensorboardX==2.4\n", + " Downloading tensorboardX-2.4-py2.py3-none-any.whl (124 kB)\n", + "\u001b[K |████████████████████████████████| 124 kB 57.6 MB/s \n", + "\u001b[?25hRequirement already satisfied: tensorboard==2.7.0 in /usr/local/lib/python3.7/dist-packages (from vakyansh-tts==0.0.1) (2.7.0)\n", + "Collecting torch==1.5.1\n", + " Downloading torch-1.5.1-cp37-cp37m-manylinux1_x86_64.whl (753.2 MB)\n", + "\u001b[K |████████████████████████████████| 753.2 MB 13 kB/s \n", + "\u001b[?25hCollecting Unidecode==1.3.2\n", + " Downloading Unidecode-1.3.2-py3-none-any.whl (235 kB)\n", + "\u001b[K |████████████████████████████████| 235 kB 65.7 MB/s \n", + "\u001b[?25hRequirement already satisfied: tqdm==4.62.3 in /usr/local/lib/python3.7/dist-packages (from vakyansh-tts==0.0.1) (4.62.3)\n", + "Collecting fastapi==0.70.0\n", + " Downloading fastapi-0.70.0-py3-none-any.whl (51 kB)\n", + "\u001b[K |████████████████████████████████| 51 kB 706 kB/s \n", + "\u001b[?25hCollecting uvicorn==0.15.0\n", + " Downloading uvicorn-0.15.0-py3-none-any.whl (54 kB)\n", + "\u001b[K |████████████████████████████████| 54 kB 3.2 MB/s \n", + "\u001b[?25hCollecting gradio==2.5.2\n", + " Downloading gradio-2.5.2-py3-none-any.whl (982 kB)\n", + "\u001b[K |████████████████████████████████| 982 kB 61.2 MB/s \n", + "\u001b[?25hCollecting wavio==0.0.4\n", + " Downloading wavio-0.0.4-py2.py3-none-any.whl (9.0 kB)\n", + "Collecting pydload==1.0.9\n", + " Downloading pydload-1.0.9-py2.py3-none-any.whl (16 kB)\n", + "Collecting pydantic!=1.7,!=1.7.1,!=1.7.2,!=1.7.3,!=1.8,!=1.8.1,<2.0.0,>=1.6.2\n", + " Downloading pydantic-1.9.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (10.9 MB)\n", + "\u001b[K |████████████████████████████████| 10.9 MB 35.1 MB/s \n", + "\u001b[?25hCollecting starlette==0.16.0\n", + " Downloading starlette-0.16.0-py3-none-any.whl (61 kB)\n", + "\u001b[K |████████████████████████████████| 61 kB 298 kB/s \n", + "\u001b[?25hCollecting Flask-Login\n", + " Downloading Flask_Login-0.5.0-py2.py3-none-any.whl (16 kB)\n", + "Collecting flask-cachebuster\n", + " Downloading Flask-CacheBuster-1.0.0.tar.gz (3.1 kB)\n", + "Collecting ffmpy\n", + " Downloading ffmpy-0.3.0.tar.gz (4.8 kB)\n", + "Collecting pydub\n", + " Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)\n", + "Requirement already satisfied: pandas in /usr/local/lib/python3.7/dist-packages (from gradio==2.5.2->vakyansh-tts==0.0.1) (1.1.5)\n", + "Requirement already satisfied: requests in /usr/local/lib/python3.7/dist-packages (from 
gradio==2.5.2->vakyansh-tts==0.0.1) (2.23.0)\n", + "Requirement already satisfied: Flask>=1.1.1 in /usr/local/lib/python3.7/dist-packages (from gradio==2.5.2->vakyansh-tts==0.0.1) (1.1.4)\n", + "Collecting Flask-Cors>=3.0.8\n", + " Downloading Flask_Cors-3.0.10-py2.py3-none-any.whl (14 kB)\n", + "Requirement already satisfied: pillow in /usr/local/lib/python3.7/dist-packages (from gradio==2.5.2->vakyansh-tts==0.0.1) (7.1.2)\n", + "Collecting markdown2\n", + " Downloading markdown2-2.4.2-py2.py3-none-any.whl (34 kB)\n", + "Collecting analytics-python\n", + " Downloading analytics_python-1.4.0-py2.py3-none-any.whl (15 kB)\n", + "Collecting paramiko\n", + " Downloading paramiko-2.9.1-py2.py3-none-any.whl (210 kB)\n", + "\u001b[K |████████████████████████████████| 210 kB 61.1 MB/s \n", + "\u001b[?25hCollecting pycryptodome\n", + " Downloading pycryptodome-3.12.0-cp35-abi3-manylinux2010_x86_64.whl (2.0 MB)\n", + "\u001b[K |████████████████████████████████| 2.0 MB 42.3 MB/s \n", + "\u001b[?25hRequirement already satisfied: PyYaml in /usr/local/lib/python3.7/dist-packages (from layers==0.1.5->vakyansh-tts==0.0.1) (3.13)\n", + "Collecting bashutils\n", + " Downloading Bashutils-0.0.4.tar.gz (4.2 kB)\n", + "Requirement already satisfied: resampy>=0.2.2 in /usr/local/lib/python3.7/dist-packages (from librosa==0.8.1->vakyansh-tts==0.0.1) (0.2.2)\n", + "Requirement already satisfied: pooch>=1.0 in /usr/local/lib/python3.7/dist-packages (from librosa==0.8.1->vakyansh-tts==0.0.1) (1.5.2)\n", + "Requirement already satisfied: numba>=0.43.0 in /usr/local/lib/python3.7/dist-packages (from librosa==0.8.1->vakyansh-tts==0.0.1) (0.51.2)\n", + "Requirement already satisfied: audioread>=2.0.0 in /usr/local/lib/python3.7/dist-packages (from librosa==0.8.1->vakyansh-tts==0.0.1) (2.1.9)\n", + "Requirement already satisfied: scikit-learn!=0.19.0,>=0.14.0 in /usr/local/lib/python3.7/dist-packages (from librosa==0.8.1->vakyansh-tts==0.0.1) (1.0.1)\n", + "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.7/dist-packages (from librosa==0.8.1->vakyansh-tts==0.0.1) (21.3)\n", + "Requirement already satisfied: soundfile>=0.10.2 in /usr/local/lib/python3.7/dist-packages (from librosa==0.8.1->vakyansh-tts==0.0.1) (0.10.3.post1)\n", + "Requirement already satisfied: joblib>=0.14 in /usr/local/lib/python3.7/dist-packages (from librosa==0.8.1->vakyansh-tts==0.0.1) (1.1.0)\n", + "Requirement already satisfied: decorator>=3.0.0 in /usr/local/lib/python3.7/dist-packages (from librosa==0.8.1->vakyansh-tts==0.0.1) (4.4.2)\n", + "Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.3 in /usr/local/lib/python3.7/dist-packages (from matplotlib==3.3.4->vakyansh-tts==0.0.1) (3.0.6)\n", + "Requirement already satisfied: python-dateutil>=2.1 in /usr/local/lib/python3.7/dist-packages (from matplotlib==3.3.4->vakyansh-tts==0.0.1) (2.8.2)\n", + "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.7/dist-packages (from matplotlib==3.3.4->vakyansh-tts==0.0.1) (0.11.0)\n", + "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.7/dist-packages (from matplotlib==3.3.4->vakyansh-tts==0.0.1) (1.3.2)\n", + "Requirement already satisfied: progressbar2 in /usr/local/lib/python3.7/dist-packages (from pydload==1.0.9->vakyansh-tts==0.0.1) (3.38.0)\n", + "Requirement already satisfied: typing-extensions in /usr/local/lib/python3.7/dist-packages (from starlette==0.16.0->fastapi==0.70.0->vakyansh-tts==0.0.1) (3.10.0.2)\n", + "Collecting anyio<4,>=3.0.0\n", + " Downloading 
anyio-3.4.0-py3-none-any.whl (78 kB)\n", + "\u001b[K |████████████████████████████████| 78 kB 7.8 MB/s \n", + "\u001b[?25hRequirement already satisfied: markdown>=2.6.8 in /usr/local/lib/python3.7/dist-packages (from tensorboard==2.7.0->vakyansh-tts==0.0.1) (3.3.6)\n", + "Requirement already satisfied: google-auth-oauthlib<0.5,>=0.4.1 in /usr/local/lib/python3.7/dist-packages (from tensorboard==2.7.0->vakyansh-tts==0.0.1) (0.4.6)\n", + "Requirement already satisfied: tensorboard-data-server<0.7.0,>=0.6.0 in /usr/local/lib/python3.7/dist-packages (from tensorboard==2.7.0->vakyansh-tts==0.0.1) (0.6.1)\n", + "Requirement already satisfied: protobuf>=3.6.0 in /usr/local/lib/python3.7/dist-packages (from tensorboard==2.7.0->vakyansh-tts==0.0.1) (3.17.3)\n", + "Requirement already satisfied: absl-py>=0.4 in /usr/local/lib/python3.7/dist-packages (from tensorboard==2.7.0->vakyansh-tts==0.0.1) (0.12.0)\n", + "Requirement already satisfied: grpcio>=1.24.3 in /usr/local/lib/python3.7/dist-packages (from tensorboard==2.7.0->vakyansh-tts==0.0.1) (1.42.0)\n", + "Requirement already satisfied: wheel>=0.26 in /usr/local/lib/python3.7/dist-packages (from tensorboard==2.7.0->vakyansh-tts==0.0.1) (0.37.0)\n", + "Requirement already satisfied: werkzeug>=0.11.15 in /usr/local/lib/python3.7/dist-packages (from tensorboard==2.7.0->vakyansh-tts==0.0.1) (1.0.1)\n", + "Requirement already satisfied: google-auth<3,>=1.6.3 in /usr/local/lib/python3.7/dist-packages (from tensorboard==2.7.0->vakyansh-tts==0.0.1) (1.35.0)\n", + "Requirement already satisfied: setuptools>=41.0.0 in /usr/local/lib/python3.7/dist-packages (from tensorboard==2.7.0->vakyansh-tts==0.0.1) (57.4.0)\n", + "Requirement already satisfied: tensorboard-plugin-wit>=1.6.0 in /usr/local/lib/python3.7/dist-packages (from tensorboard==2.7.0->vakyansh-tts==0.0.1) (1.8.0)\n", + "Requirement already satisfied: future in /usr/local/lib/python3.7/dist-packages (from torch==1.5.1->vakyansh-tts==0.0.1) (0.16.0)\n", + "Collecting asgiref>=3.4.0\n", + " Downloading asgiref-3.4.1-py3-none-any.whl (25 kB)\n", + "Requirement already satisfied: click>=7.0 in /usr/local/lib/python3.7/dist-packages (from uvicorn==0.15.0->vakyansh-tts==0.0.1) (7.1.2)\n", + "Collecting h11>=0.8\n", + " Downloading h11-0.12.0-py3-none-any.whl (54 kB)\n", + "\u001b[K |████████████████████████████████| 54 kB 3.7 MB/s \n", + "\u001b[?25hRequirement already satisfied: six in /usr/local/lib/python3.7/dist-packages (from absl-py>=0.4->tensorboard==2.7.0->vakyansh-tts==0.0.1) (1.15.0)\n", + "Collecting sniffio>=1.1\n", + " Downloading sniffio-1.2.0-py3-none-any.whl (10 kB)\n", + "Requirement already satisfied: idna>=2.8 in /usr/local/lib/python3.7/dist-packages (from anyio<4,>=3.0.0->starlette==0.16.0->fastapi==0.70.0->vakyansh-tts==0.0.1) (2.10)\n", + "Requirement already satisfied: itsdangerous<2.0,>=0.24 in /usr/local/lib/python3.7/dist-packages (from Flask>=1.1.1->gradio==2.5.2->vakyansh-tts==0.0.1) (1.1.0)\n", + "Requirement already satisfied: Jinja2<3.0,>=2.10.1 in /usr/local/lib/python3.7/dist-packages (from Flask>=1.1.1->gradio==2.5.2->vakyansh-tts==0.0.1) (2.11.3)\n", + "Requirement already satisfied: rsa<5,>=3.1.4 in /usr/local/lib/python3.7/dist-packages (from google-auth<3,>=1.6.3->tensorboard==2.7.0->vakyansh-tts==0.0.1) (4.8)\n", + "Requirement already satisfied: pyasn1-modules>=0.2.1 in /usr/local/lib/python3.7/dist-packages (from google-auth<3,>=1.6.3->tensorboard==2.7.0->vakyansh-tts==0.0.1) (0.2.8)\n", + "Requirement already satisfied: cachetools<5.0,>=2.0.0 in 
/usr/local/lib/python3.7/dist-packages (from google-auth<3,>=1.6.3->tensorboard==2.7.0->vakyansh-tts==0.0.1) (4.2.4)\n", + "Requirement already satisfied: requests-oauthlib>=0.7.0 in /usr/local/lib/python3.7/dist-packages (from google-auth-oauthlib<0.5,>=0.4.1->tensorboard==2.7.0->vakyansh-tts==0.0.1) (1.3.0)\n", + "Requirement already satisfied: MarkupSafe>=0.23 in /usr/local/lib/python3.7/dist-packages (from Jinja2<3.0,>=2.10.1->Flask>=1.1.1->gradio==2.5.2->vakyansh-tts==0.0.1) (2.0.1)\n", + "Requirement already satisfied: importlib-metadata>=4.4 in /usr/local/lib/python3.7/dist-packages (from markdown>=2.6.8->tensorboard==2.7.0->vakyansh-tts==0.0.1) (4.8.2)\n", + "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata>=4.4->markdown>=2.6.8->tensorboard==2.7.0->vakyansh-tts==0.0.1) (3.6.0)\n", + "Requirement already satisfied: llvmlite<0.35,>=0.34.0.dev0 in /usr/local/lib/python3.7/dist-packages (from numba>=0.43.0->librosa==0.8.1->vakyansh-tts==0.0.1) (0.34.0)\n", + "Requirement already satisfied: appdirs in /usr/local/lib/python3.7/dist-packages (from pooch>=1.0->librosa==0.8.1->vakyansh-tts==0.0.1) (1.4.4)\n", + "Requirement already satisfied: pyasn1<0.5.0,>=0.4.6 in /usr/local/lib/python3.7/dist-packages (from pyasn1-modules>=0.2.1->google-auth<3,>=1.6.3->tensorboard==2.7.0->vakyansh-tts==0.0.1) (0.4.8)\n", + "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests->gradio==2.5.2->vakyansh-tts==0.0.1) (3.0.4)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests->gradio==2.5.2->vakyansh-tts==0.0.1) (2021.10.8)\n", + "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests->gradio==2.5.2->vakyansh-tts==0.0.1) (1.24.3)\n", + "Requirement already satisfied: oauthlib>=3.0.0 in /usr/local/lib/python3.7/dist-packages (from requests-oauthlib>=0.7.0->google-auth-oauthlib<0.5,>=0.4.1->tensorboard==2.7.0->vakyansh-tts==0.0.1) (3.1.1)\n", + "Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.7/dist-packages (from scikit-learn!=0.19.0,>=0.14.0->librosa==0.8.1->vakyansh-tts==0.0.1) (3.0.0)\n", + "Requirement already satisfied: cffi>=1.0 in /usr/local/lib/python3.7/dist-packages (from soundfile>=0.10.2->librosa==0.8.1->vakyansh-tts==0.0.1) (1.15.0)\n", + "Requirement already satisfied: pycparser in /usr/local/lib/python3.7/dist-packages (from cffi>=1.0->soundfile>=0.10.2->librosa==0.8.1->vakyansh-tts==0.0.1) (2.21)\n", + "Collecting monotonic>=1.5\n", + " Downloading monotonic-1.6-py2.py3-none-any.whl (8.2 kB)\n", + "Collecting backoff==1.10.0\n", + " Downloading backoff-1.10.0-py2.py3-none-any.whl (31 kB)\n", + "Requirement already satisfied: pytz>=2017.2 in /usr/local/lib/python3.7/dist-packages (from pandas->gradio==2.5.2->vakyansh-tts==0.0.1) (2018.9)\n", + "Collecting bcrypt>=3.1.3\n", + " Downloading bcrypt-3.2.0-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl (61 kB)\n", + "\u001b[K |████████████████████████████████| 61 kB 386 kB/s \n", + "\u001b[?25hCollecting cryptography>=2.5\n", + " Downloading cryptography-36.0.1-cp36-abi3-manylinux_2_24_x86_64.whl (3.6 MB)\n", + "\u001b[K |████████████████████████████████| 3.6 MB 40.6 MB/s \n", + "\u001b[?25hCollecting pynacl>=1.0.1\n", + " Downloading PyNaCl-1.4.0-cp35-abi3-manylinux1_x86_64.whl (961 kB)\n", + "\u001b[K |████████████████████████████████| 961 kB 
49.8 MB/s \n", + "\u001b[?25hRequirement already satisfied: python-utils>=2.3.0 in /usr/local/lib/python3.7/dist-packages (from progressbar2->pydload==1.0.9->vakyansh-tts==0.0.1) (2.5.6)\n", + "Building wheels for collected packages: layers, bashutils, ffmpy, flask-cachebuster\n", + " Building wheel for layers (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + " Created wheel for layers: filename=layers-0.1.5-py3-none-any.whl size=5379 sha256=759f381849c193619d4e1d46982ad55fd081f3359d2b70d3fede9092d81d6b24\n", + " Stored in directory: /root/.cache/pip/wheels/75/6f/32/757f357608178c55254f10906905e7f8cd63b566173377c819\n", + " Building wheel for bashutils (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + " Created wheel for bashutils: filename=Bashutils-0.0.4-py3-none-any.whl size=5472 sha256=60c44cb259b33784163362297bfeb8a6c349296e3acb89196eb4d9cab2274c08\n", + " Stored in directory: /root/.cache/pip/wheels/c7/a0/9a/b99da313eb952e5d8ab2622528c0102544d5cddca1ffc9b15e\n", + " Building wheel for ffmpy (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + " Created wheel for ffmpy: filename=ffmpy-0.3.0-py3-none-any.whl size=4710 sha256=a2f3fdb0f222e1f7efb4cec778da16dc98a2dd5504bc1aa55f8d9210904764bf\n", + " Stored in directory: /root/.cache/pip/wheels/13/e4/6c/e8059816e86796a597c6e6b0d4c880630f51a1fcfa0befd5e6\n", + " Building wheel for flask-cachebuster (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + " Created wheel for flask-cachebuster: filename=Flask_CacheBuster-1.0.0-py3-none-any.whl size=3371 sha256=ed02a328b3fdd4faad60c78aff1cdd40efd352cf5bcd5a15e0783d47789aaf19\n", + " Stored in directory: /root/.cache/pip/wheels/28/c0/c4/44687421dab41455be93112bd1b0dee1f3c5a9aa27bee63708\n", + "Successfully built layers bashutils ffmpy flask-cachebuster\n", + "Installing collected packages: sniffio, scipy, pynacl, monotonic, cryptography, bcrypt, backoff, anyio, starlette, pydub, pydantic, pycryptodome, paramiko, matplotlib, markdown2, h11, Flask-Login, Flask-Cors, flask-cachebuster, ffmpy, bashutils, asgiref, analytics-python, wavio, uvicorn, Unidecode, torch, tensorboardX, pydload, layers, inflect, gradio, fastapi, vakyansh-tts\n", + " Attempting uninstall: scipy\n", + " Found existing installation: scipy 1.4.1\n", + " Uninstalling scipy-1.4.1:\n", + " Successfully uninstalled scipy-1.4.1\n", + " Attempting uninstall: matplotlib\n", + " Found existing installation: matplotlib 3.2.2\n", + " Uninstalling matplotlib-3.2.2:\n", + " Successfully uninstalled matplotlib-3.2.2\n", + " Attempting uninstall: torch\n", + " Found existing installation: torch 1.10.0+cu111\n", + " Uninstalling torch-1.10.0+cu111:\n", + " Successfully uninstalled torch-1.10.0+cu111\n", + " Attempting uninstall: inflect\n", + " Found existing installation: inflect 2.1.0\n", + " Uninstalling inflect-2.1.0:\n", + " Successfully uninstalled inflect-2.1.0\n", + " Running setup.py develop for vakyansh-tts\n", + "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. 
This behaviour is the source of the following dependency conflicts.\n", + "torchvision 0.11.1+cu111 requires torch==1.10.0, but you have torch 1.5.1 which is incompatible.\n", + "torchtext 0.11.0 requires torch==1.10.0, but you have torch 1.5.1 which is incompatible.\n", + "torchaudio 0.10.0+cu111 requires torch==1.10.0, but you have torch 1.5.1 which is incompatible.\n", + "albumentations 0.1.12 requires imgaug<0.2.7,>=0.2.5, but you have imgaug 0.2.9 which is incompatible.\u001b[0m\n", + "Successfully installed Flask-Cors-3.0.10 Flask-Login-0.5.0 Unidecode-1.3.2 analytics-python-1.4.0 anyio-3.4.0 asgiref-3.4.1 backoff-1.10.0 bashutils-0.0.4 bcrypt-3.2.0 cryptography-36.0.1 fastapi-0.70.0 ffmpy-0.3.0 flask-cachebuster-1.0.0 gradio-2.5.2 h11-0.12.0 inflect-5.3.0 layers-0.1.5 markdown2-2.4.2 matplotlib-3.3.4 monotonic-1.6 paramiko-2.9.1 pycryptodome-3.12.0 pydantic-1.9.0 pydload-1.0.9 pydub-0.25.1 pynacl-1.4.0 scipy-1.5.4 sniffio-1.2.0 starlette-0.16.0 tensorboardX-2.4 torch-1.5.1 uvicorn-0.15.0 vakyansh-tts-0.0.1 wavio-0.0.4\n" + ] + }, + { + "output_type": "display_data", + "data": { + "application/vnd.colab-display-data+json": { + "pip_warning": { + "packages": [ + "matplotlib", + "mpl_toolkits" + ] + } + } + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "--2022-01-04 08:20:03-- https://storage.googleapis.com/vakyaansh-open-models/translit_models/default_lineup.json\n", + "Resolving storage.googleapis.com (storage.googleapis.com)... 142.250.141.128, 2607:f8b0:4023:c0b::80\n", + "Connecting to storage.googleapis.com (storage.googleapis.com)|142.250.141.128|:443... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 3422 (3.3K) [application/json]\n", + "Saving to: ‘default_lineup.json’\n", + "\n", + "\rdefault_lineup.json 0%[ ] 0 --.-KB/s \rdefault_lineup.json 100%[===================>] 3.34K --.-KB/s in 0s \n", + "\n", + "2022-01-04 08:20:03 (44.3 MB/s) - ‘default_lineup.json’ saved [3422/3422]\n", + "\n", + "--2022-01-04 08:20:03-- https://storage.googleapis.com/vakyaansh-open-models/translit_models/hindi/hindi_transliteration.zip\n", + "Resolving storage.googleapis.com (storage.googleapis.com)... 142.250.141.128, 2607:f8b0:4023:c0b::80\n", + "Connecting to storage.googleapis.com (storage.googleapis.com)|142.250.141.128|:443... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 45018357 (43M) [application/zip]\n", + "Saving to: ‘hindi_transliteration.zip’\n", + "\n", + "hindi_transliterati 100%[===================>] 42.93M 113MB/s in 0.4s \n", + "\n", + "2022-01-04 08:20:04 (113 MB/s) - ‘hindi_transliteration.zip’ saved [45018357/45018357]\n", + "\n", + "Archive: hindi_transliteration.zip\n", + " inflating: hi_111_model.pth \n", + " inflating: hi_scripts.json \n", + " inflating: hi_words_a4b.json \n", + "--2022-01-04 08:20:05-- https://storage.googleapis.com/vakyansh-open-models/tts/hindi/hi-IN/female_voice_0/glow.zip\n", + "Resolving storage.googleapis.com (storage.googleapis.com)... 142.250.141.128, 2607:f8b0:4023:c0b::80\n", + "Connecting to storage.googleapis.com (storage.googleapis.com)|142.250.141.128|:443... connected.\n", + "HTTP request sent, awaiting response... 
200 OK\n", + "Length: 313981548 (299M) [application/zip]\n", + "Saving to: ‘glow.zip’\n", + "\n", + "glow.zip 100%[===================>] 299.44M 109MB/s in 2.7s \n", + "\n", + "2022-01-04 08:20:08 (109 MB/s) - ‘glow.zip’ saved [313981548/313981548]\n", + "\n", + "Archive: glow.zip\n", + " creating: glow_ckp/\n", + " inflating: glow_ckp/config.json \n", + " inflating: glow_ckp/G_250.pth \n", + "--2022-01-04 08:20:12-- https://storage.googleapis.com/vakyansh-open-models/tts/hindi/hi-IN/female_voice_0/hifi.zip\n", + "Resolving storage.googleapis.com (storage.googleapis.com)... 142.250.141.128, 2607:f8b0:4023:c0b::80\n", + "Connecting to storage.googleapis.com (storage.googleapis.com)|142.250.141.128|:443... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 51788492 (49M) [application/zip]\n", + "Saving to: ‘hifi.zip’\n", + "\n", + "hifi.zip 100%[===================>] 49.39M 88.9MB/s in 0.6s \n", + "\n", + "2022-01-04 08:20:13 (88.9 MB/s) - ‘hifi.zip’ saved [51788492/51788492]\n", + "\n", + "Archive: hifi.zip\n", + " creating: hifi_ckp/\n", + " inflating: hifi_ckp/config.json \n", + " inflating: hifi_ckp/g_00100000 \n" + ] + } + ], + "source": [ + "import os\n", + "!git clone https://github.com/Open-Speech-EkStep/vakyansh-tts\n", + "os.chdir('vakyansh-tts') \n", + "!bash install.sh\n", + "!python setup.py bdist_wheel\n", + "!pip install -e .\n", + "os.chdir('tts_infer')\n", + "!mkdir translit_models\n", + "os.chdir('translit_models')\n", + "!wget https://storage.googleapis.com/vakyaansh-open-models/translit_models/default_lineup.json\n", + "!mkdir hindi\n", + "os.chdir('hindi')\n", + "!wget https://storage.googleapis.com/vakyaansh-open-models/translit_models/hindi/hindi_transliteration.zip\n", + "!unzip hindi_transliteration\n", + "\n", + "!wget https://storage.googleapis.com/vakyansh-open-models/tts/hindi/hi-IN/female_voice_0/glow.zip\n", + "!unzip glow.zip\n", + "\n", + "!wget https://storage.googleapis.com/vakyansh-open-models/tts/hindi/hi-IN/female_voice_0/hifi.zip\n", + "!unzip hifi.zip\n", + "\n", + "!rm glow.zip\n", + "!rm hifi.zip\n", + "\n", + "os.chdir('/content/vakyansh-tts/')" + ] + }, + { + "cell_type": "markdown", + "source": [ + "## Inference Code" + ], + "metadata": { + "id": "NvQoCgYzKbWN" + } + }, + { + "cell_type": "code", + "source": [ + "from tts_infer.tts import TextToMel, MelToWav\n", + "from tts_infer.transliterate import XlitEngine\n", + "from tts_infer.num_to_word_on_sent import normalize_nums\n", + "\n", + "import re\n", + "from scipy.io.wavfile import write\n", + "device = 'cpu'\n", + "\n", + "text_to_mel = TextToMel(glow_model_dir='/content/vakyansh-tts/tts_infer/translit_models/hindi/glow_ckp', device=device)\n", + "mel_to_wav = MelToWav(hifi_model_dir='/content/vakyansh-tts/tts_infer/translit_models/hindi/hifi_ckp', device=device)\n", + "\n", + "def translit(text, lang):\n", + " reg = re.compile(r'[a-zA-Z]')\n", + " engine = XlitEngine(lang)\n", + " words = [engine.translit_word(word, topk=1)[lang][0] if reg.match(word) else word for word in text.split()]\n", + " updated_sent = ' '.join(words)\n", + " return updated_sent\n", + " \n", + "def run_tts(text, lang):\n", + " text = text.replace('।', '.') # only for hindi models\n", + " text_num_to_word = normalize_nums(text, lang) # converting numbers to words in lang\n", + " text_num_to_word_and_transliterated = translit(text_num_to_word, lang) # transliterating english words to lang\n", + " \n", + " mel = text_to_mel.generate_mel(text_num_to_word_and_transliterated)\n", + " audio, sr = 
mel_to_wav.generate_wav(mel)\n", + " write(filename='temp.wav', rate=sr, data=audio) # for saving wav file, if needed\n", + " return (sr, audio)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "TVW_x9L0b5W4", + "outputId": "28f0a3b9-8f72-4562-db4b-af49699d6cc3" + }, + "execution_count": 2, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "/content/vakyansh-tts/tts_infer/translit_models/hindi/glow_ckp/G_250.pth\n", + "INFO:root:Loaded checkpoint '/content/vakyansh-tts/tts_infer/translit_models/hindi/glow_ckp/G_250.pth' (iteration 250)\n", + "/content/vakyansh-tts/tts_infer/translit_models/hindi/hifi_ckp/g_00100000\n", + "Loading '/content/vakyansh-tts/tts_infer/translit_models/hindi/hifi_ckp/g_00100000'\n", + "Complete.\n", + "Removing weight norm...\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "_, audio = run_tts('hello my name is harveen', 'hi')" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "aqZ5xOVidczp", + "outputId": "bdf8f92b-c673-4738-860e-0cbf3f339d6e" + }, + "execution_count": 3, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Loading hi...\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "## Results" + ], + "metadata": { + "id": "jaFjD59HKghg" + } + }, + { + "cell_type": "code", + "source": [ + "import IPython.display as ipd\n", + "ipd.Audio('temp.wav')" + ], + "metadata": { + "id": "zC9I2Zt5fijp", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 75 + }, + "outputId": "86d09807-41a8-48e7-ec71-4734b6ccbdc8" + }, + "execution_count": 4, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "execution_count": 4 + } + ] + } + ] +} \ No newline at end of file diff --git a/ttsv/results/api/.gitkeep b/ttsv/results/api/.gitkeep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/ttsv/scripts/data/duration.sh b/ttsv/scripts/data/duration.sh new file mode 100644 index 0000000000000000000000000000000000000000..6fc586c05259d3d576fa4437dea5f650fe5f5031 --- /dev/null +++ b/ttsv/scripts/data/duration.sh @@ -0,0 +1,9 @@ +wav_path='/home/harveen/en/iitm_data/english/wav_22k' +####################### + +dir=$PWD +parentdir="$(dirname "$dir")" +parentdir="$(dirname "$parentdir")" + + +python $parentdir/utils/data/duration.py $wav_path diff --git a/ttsv/scripts/data/resample.sh b/ttsv/scripts/data/resample.sh new file mode 100644 index 0000000000000000000000000000000000000000..8489b0a0056d46a93d24db8dba173ad7a4b8a44a --- /dev/null +++ b/ttsv/scripts/data/resample.sh @@ -0,0 +1,14 @@ +input_wav_path='/home/harveen/en/iitm_data/english/wav/' +output_wav_path='/home/harveen/en/iitm_data/english/wav_22k/' +output_sample_rate=22050 + +####################### + +dir=$PWD +parentdir="$(dirname "$dir")" +parentdir="$(dirname "$parentdir")" + +mkdir -p $output_wav_path +python $parentdir/utils/data/resample.py -i $input_wav_path -o $output_wav_path -s $output_sample_rate + +python $parentdir/utils/data/duration.py $output_wav_path diff --git a/ttsv/scripts/glow/prepare_data.sh b/ttsv/scripts/glow/prepare_data.sh new file mode 100644 index 0000000000000000000000000000000000000000..2357eeebd0fb7e6fba858242af44e8b8aa87fdf9 --- /dev/null +++ b/ttsv/scripts/glow/prepare_data.sh @@ -0,0 +1,12 @@ 
+input_text_path='/home/harveen/en/iitm_data/english/txt.done.data' +input_wav_path='/home/harveen/en/iitm_data/english/wav_22k' +gender='male' + + +output_data_path='../../data/glow/'$gender + +valid_samples=100 +test_samples=10 + +mkdir -p $output_data_path +python ../../utils/glow/prepare_iitm_data_glow_en.py -i $input_text_path -o $output_data_path -w $input_wav_path -v $valid_samples -t $test_samples diff --git a/ttsv/scripts/glow/train_glow.sh b/ttsv/scripts/glow/train_glow.sh new file mode 100755 index 0000000000000000000000000000000000000000..f12939d5d4563de555bf49408fa7a27397e0dae3 --- /dev/null +++ b/ttsv/scripts/glow/train_glow.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +gender='male' + +config='../../config/glow/'$gender'.json' +modeldir='../../checkpoints/glow/'$gender +logdir='../../logs/glow/'$gender +init=1 # 1 if start from scratch. 0 if start from last checkpoint + + +#################################################### + +if [[ $init -eq 1 ]] +then + python ../../src/glow_tts/init.py -c $config -m $modeldir -l $logdir +fi +python ../../src/glow_tts/train.py -c $config -m $modeldir -l $logdir diff --git a/ttsv/scripts/hifi/prepare_data.sh b/ttsv/scripts/hifi/prepare_data.sh new file mode 100644 index 0000000000000000000000000000000000000000..d620cfeb93d8de9b2f750ad9bd52a937b0b88c33 --- /dev/null +++ b/ttsv/scripts/hifi/prepare_data.sh @@ -0,0 +1,10 @@ +input_wav_path='/home/harveen/en/iitm_data/english/wav_22k' #give multiple folders separated by comma(,) +gender='male' + +output_data_path='../../data/hifi/'$gender + +valid_samples=100 +test_samples=10 + +mkdir -p $output_data_path +python ../../utils/hifi/prepare_iitm_data_hifi.py -i $input_wav_path -v $valid_samples -t $test_samples -d $output_data_path diff --git a/ttsv/scripts/hifi/train_hifi.sh b/ttsv/scripts/hifi/train_hifi.sh new file mode 100644 index 0000000000000000000000000000000000000000..287ca1159b5bf8f779d66885197fadbcd23b911e --- /dev/null +++ b/ttsv/scripts/hifi/train_hifi.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +gender='male' + +config='../../config/hifi/config_v1.json' +modeldir='../../checkpoints/hifi/'$gender +logdir='../../logs/hifi/'$gender + + +#################################################### + + + +python ../../src/hifi_gan/train.py \ + --config $config \ + --input_training_file '../../data/hifi/'$gender'/train.txt' \ + --input_validation_file '../../data/hifi/'$gender'/valid.txt' \ + --checkpoint_path $modeldir \ + --logs_path $logdir \ + --checkpoint_interval 10000 \ + --stdout_interval 50 diff --git a/ttsv/scripts/inference/advanced_infer.sh b/ttsv/scripts/inference/advanced_infer.sh new file mode 100644 index 0000000000000000000000000000000000000000..6bbd53454331f0bd5157aa4e38ae4d329fba05fd --- /dev/null +++ b/ttsv/scripts/inference/advanced_infer.sh @@ -0,0 +1,22 @@ +gender='male' +glowdir='../../checkpoints/glow/'$gender'/' +hifidir='../../checkpoints/hifi/'$gender'/' +device='cpu' +text='Hey mr. I am testing this one. Now on multiple sentences. Just want to see the flow.' 
+noise_scale='0.667' +length_scale='1.0' +transliteration=1 +number_conversion=1 +split_sentences=1 +lang='en' + + +timestamp=$(date +%s) +wav='../../results/'$gender'/' +wav_file=$wav/$timestamp'.wav' + + +mkdir -p $wav + +python ../../utils/inference/advanced_tts.py -a $glowdir -v $hifidir -d $device -t "$text" -w $wav_file -L $lang -n $noise_scale -l $length_scale -T $transliteration -N $number_conversion -S $split_sentences +echo "File saved at: "$wav_file diff --git a/ttsv/scripts/inference/api.sh b/ttsv/scripts/inference/api.sh new file mode 100644 index 0000000000000000000000000000000000000000..4f6ce2a2147f69e5b3da851c8222bef830056338 --- /dev/null +++ b/ttsv/scripts/inference/api.sh @@ -0,0 +1,8 @@ +gender='male' +glowdir='../../checkpoints/glow/'$gender'/' +hifidir='../../checkpoints/hifi/'$gender'/' +device='cpu' +lang='en' + + +python ../../utils/inference/api.py -a $glowdir -v $hifidir -d $device -L $lang diff --git a/ttsv/scripts/inference/gradio.sh b/ttsv/scripts/inference/gradio.sh new file mode 100644 index 0000000000000000000000000000000000000000..2b6657952c21ca7821a9a82ed0a38f7dcf78b8e1 --- /dev/null +++ b/ttsv/scripts/inference/gradio.sh @@ -0,0 +1,8 @@ +gender='male' +glowdir='../../checkpoints/glow/'$gender'/' +hifidir='../../checkpoints/hifi/'$gender'/' +device='cpu' +lang='en' + + +python ../../utils/inference/run_gradio.py -a $glowdir -v $hifidir -d $device -L $lang \ No newline at end of file diff --git a/ttsv/scripts/inference/infer.sh b/ttsv/scripts/inference/infer.sh new file mode 100644 index 0000000000000000000000000000000000000000..dec70e1f30fb80f6957f4f3382b4c0963827cf43 --- /dev/null +++ b/ttsv/scripts/inference/infer.sh @@ -0,0 +1,15 @@ +gender='male' +glowdir='../../checkpoints/glow/'$gender'/' +hifidir='../../checkpoints/hifi/'$gender'/' +device='cpu' +text='testing this one' + + +timestamp=$(date +%s) +wav='../../results/'$gender'/' +wav_file=$wav/$timestamp'.wav' + + +mkdir -p $wav +python ../../utils/inference/tts.py -a $glowdir -v $hifidir -d $device -t "$text" -w $wav_file +echo "File saved at: "$wav_file diff --git a/ttsv/setup.py b/ttsv/setup.py new file mode 100644 index 0000000000000000000000000000000000000000..9d2c73345b8406195aaa6327cb3148bb92b65190 --- /dev/null +++ b/ttsv/setup.py @@ -0,0 +1,55 @@ +from setuptools import setup, find_packages + +with open("README.md", "r") as f: + long_description = f.read() + +setup( + name="vakyansh-tts", + version="0.0.5", + description="Text to speech for Indic languages", + long_description=long_description, + long_description_content_type="text/markdown", + url="https://github.com/Open-Speech-EkStep/vakyansh-tts.git", + keywords="nlp, tts, Indic languages, deep learning, text to speech", + # package_dir={'': 'src'}, + # packages=find_packages(where='src'), + packages=["tts_infer"], + python_requires=">=3.7, <4", + install_requires=[ + "Cython==0.29.24", + "layers==0.1.5", + "librosa==0.8.1", + "matplotlib==3.3.4", + "numpy==1.20.2", + "scipy==1.5.4", + "tensorboardX==2.4", + "tensorboard==2.7.0", + "tqdm==4.62.3", + "fastapi==0.70.0", + "uvicorn==0.15.0", + "gradio==2.5.2", + "wavio==0.0.4", + "pydload==1.0.9", + "mosestokenizer==1.2.1", + "indic-nlp-library==0.81" + ], + classifiers=[ + # How mature is this project? 
Common values are + # 3 - Alpha + # 4 - Beta + # 5 - Production/Stable + "Development Status :: 3 - Alpha", + # Indicate who your project is intended for + "Intended Audience :: Developers", + "Intended Audience :: Education", + "Intended Audience :: Science/Research", + "Topic :: Scientific/Engineering :: Artificial Intelligence", + "Topic :: Text Processing :: Linguistic", + # Pick your license as you wish (should match "license" above) + "License :: OSI Approved :: MIT License", + # Specify the Python versions you support here. In particular, ensure + # that you indicate whether you support Python 2, Python 3 or both. + "Programming Language :: Python :: 3.7", + ], + include_package_data=True, +) diff --git a/ttsv/src/glow_tts/attentions.py b/ttsv/src/glow_tts/attentions.py new file mode 100644 index 0000000000000000000000000000000000000000..62b8c83acbd3150b6d6686f21f3627781107c1ba --- /dev/null +++ b/ttsv/src/glow_tts/attentions.py @@ -0,0 +1,378 @@ +import copy +import math +import numpy as np +import torch +from torch import nn +from torch.nn import functional as F + +import commons +import modules +from modules import LayerNorm + + +class Encoder(nn.Module): + def __init__( + self, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size=1, + p_dropout=0.0, + window_size=None, + block_length=None, + **kwargs + ): + super().__init__() + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.window_size = window_size + self.block_length = block_length + + self.drop = nn.Dropout(p_dropout) + self.attn_layers = nn.ModuleList() + self.norm_layers_1 = nn.ModuleList() + self.ffn_layers = nn.ModuleList() + self.norm_layers_2 = nn.ModuleList() + for i in range(self.n_layers): + self.attn_layers.append( + MultiHeadAttention( + hidden_channels, + hidden_channels, + n_heads, + window_size=window_size, + p_dropout=p_dropout, + block_length=block_length, + ) + ) + self.norm_layers_1.append(LayerNorm(hidden_channels)) + self.ffn_layers.append( + FFN( + hidden_channels, + hidden_channels, + filter_channels, + kernel_size, + p_dropout=p_dropout, + ) + ) + self.norm_layers_2.append(LayerNorm(hidden_channels)) + + def forward(self, x, x_mask): + attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1) + for i in range(self.n_layers): + x = x * x_mask + y = self.attn_layers[i](x, x, attn_mask) + y = self.drop(y) + x = self.norm_layers_1[i](x + y) + + y = self.ffn_layers[i](x, x_mask) + y = self.drop(y) + x = self.norm_layers_2[i](x + y) + x = x * x_mask + return x + + +class CouplingBlock(nn.Module): + def __init__( + self, + in_channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + gin_channels=0, + p_dropout=0, + sigmoid_scale=False, + ): + super().__init__() + self.in_channels = in_channels + self.hidden_channels = hidden_channels + self.kernel_size = kernel_size + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.gin_channels = gin_channels + self.p_dropout = p_dropout + self.sigmoid_scale = sigmoid_scale + + start = torch.nn.Conv1d(in_channels // 2, hidden_channels, 1) + start = torch.nn.utils.weight_norm(start) + self.start = start + # Initializing last layer to 0 makes the affine coupling layers + # do nothing at first. It helps to stabilze training. 
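+        # The block splits its input in half along the channel axis: the first
+        # half passes through unchanged while the WN network below predicts an
+        # affine transform (shift m, log-scale logs) for the second half, which
+        # keeps the flow invertible with a cheap log-determinant.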
+ end = torch.nn.Conv1d(hidden_channels, in_channels, 1) + end.weight.data.zero_() + end.bias.data.zero_() + self.end = end + + self.wn = modules.WN( + in_channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + gin_channels, + p_dropout, + ) + + def forward(self, x, x_mask=None, reverse=False, g=None, **kwargs): + b, c, t = x.size() + if x_mask is None: + x_mask = 1 + x_0, x_1 = x[:, : self.in_channels // 2], x[:, self.in_channels // 2 :] + + x = self.start(x_0) * x_mask + x = self.wn(x, x_mask, g) + out = self.end(x) + + z_0 = x_0 + m = out[:, : self.in_channels // 2, :] + logs = out[:, self.in_channels // 2 :, :] + if self.sigmoid_scale: + logs = torch.log(1e-6 + torch.sigmoid(logs + 2)) + + if reverse: + z_1 = (x_1 - m) * torch.exp(-logs) * x_mask + logdet = None + else: + z_1 = (m + torch.exp(logs) * x_1) * x_mask + logdet = torch.sum(logs * x_mask, [1, 2]) + + z = torch.cat([z_0, z_1], 1) + return z, logdet + + def store_inverse(self): + self.wn.remove_weight_norm() + + +class MultiHeadAttention(nn.Module): + def __init__( + self, + channels, + out_channels, + n_heads, + window_size=None, + heads_share=True, + p_dropout=0.0, + block_length=None, + proximal_bias=False, + proximal_init=False, + ): + super().__init__() + assert channels % n_heads == 0 + + self.channels = channels + self.out_channels = out_channels + self.n_heads = n_heads + self.window_size = window_size + self.heads_share = heads_share + self.block_length = block_length + self.proximal_bias = proximal_bias + self.p_dropout = p_dropout + self.attn = None + + self.k_channels = channels // n_heads + self.conv_q = nn.Conv1d(channels, channels, 1) + self.conv_k = nn.Conv1d(channels, channels, 1) + self.conv_v = nn.Conv1d(channels, channels, 1) + if window_size is not None: + n_heads_rel = 1 if heads_share else n_heads + rel_stddev = self.k_channels ** -0.5 + self.emb_rel_k = nn.Parameter( + torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) + * rel_stddev + ) + self.emb_rel_v = nn.Parameter( + torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) + * rel_stddev + ) + self.conv_o = nn.Conv1d(channels, out_channels, 1) + self.drop = nn.Dropout(p_dropout) + + nn.init.xavier_uniform_(self.conv_q.weight) + nn.init.xavier_uniform_(self.conv_k.weight) + if proximal_init: + self.conv_k.weight.data.copy_(self.conv_q.weight.data) + self.conv_k.bias.data.copy_(self.conv_q.bias.data) + nn.init.xavier_uniform_(self.conv_v.weight) + + def forward(self, x, c, attn_mask=None): + q = self.conv_q(x) + k = self.conv_k(c) + v = self.conv_v(c) + + x, self.attn = self.attention(q, k, v, mask=attn_mask) + + x = self.conv_o(x) + return x + + def attention(self, query, key, value, mask=None): + # reshape [b, d, t] -> [b, n_h, t, d_k] + b, d, t_s, t_t = (*key.size(), query.size(2)) + query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3) + key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) + value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) + + scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(self.k_channels) + if self.window_size is not None: + assert ( + t_s == t_t + ), "Relative attention is only available for self-attention." 
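+            # Add a relative-position term to the content scores: queries are
+            # matched against learned key embeddings for offsets clipped to
+            # [-window_size, window_size], then mapped to absolute positions
+            # before being summed with the regular attention scores.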
+ key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s) + rel_logits = self._matmul_with_relative_keys(query, key_relative_embeddings) + rel_logits = self._relative_position_to_absolute_position(rel_logits) + scores_local = rel_logits / math.sqrt(self.k_channels) + scores = scores + scores_local + if self.proximal_bias: + assert t_s == t_t, "Proximal bias is only available for self-attention." + scores = scores + self._attention_bias_proximal(t_s).to( + device=scores.device, dtype=scores.dtype + ) + if mask is not None: + scores = scores.masked_fill(mask == 0, -1e4) + if self.block_length is not None: + block_mask = ( + torch.ones_like(scores) + .triu(-self.block_length) + .tril(self.block_length) + ) + scores = scores * block_mask + -1e4 * (1 - block_mask) + p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s] + p_attn = self.drop(p_attn) + output = torch.matmul(p_attn, value) + if self.window_size is not None: + relative_weights = self._absolute_position_to_relative_position(p_attn) + value_relative_embeddings = self._get_relative_embeddings( + self.emb_rel_v, t_s + ) + output = output + self._matmul_with_relative_values( + relative_weights, value_relative_embeddings + ) + output = ( + output.transpose(2, 3).contiguous().view(b, d, t_t) + ) # [b, n_h, t_t, d_k] -> [b, d, t_t] + return output, p_attn + + def _matmul_with_relative_values(self, x, y): + """ + x: [b, h, l, m] + y: [h or 1, m, d] + ret: [b, h, l, d] + """ + ret = torch.matmul(x, y.unsqueeze(0)) + return ret + + def _matmul_with_relative_keys(self, x, y): + """ + x: [b, h, l, d] + y: [h or 1, m, d] + ret: [b, h, l, m] + """ + ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1)) + return ret + + def _get_relative_embeddings(self, relative_embeddings, length): + max_relative_position = 2 * self.window_size + 1 + # Pad first before slice to avoid using cond ops. + pad_length = max(length - (self.window_size + 1), 0) + slice_start_position = max((self.window_size + 1) - length, 0) + slice_end_position = slice_start_position + 2 * length - 1 + if pad_length > 0: + padded_relative_embeddings = F.pad( + relative_embeddings, + commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]), + ) + else: + padded_relative_embeddings = relative_embeddings + used_relative_embeddings = padded_relative_embeddings[ + :, slice_start_position:slice_end_position + ] + return used_relative_embeddings + + def _relative_position_to_absolute_position(self, x): + """ + x: [b, h, l, 2*l-1] + ret: [b, h, l, l] + """ + batch, heads, length, _ = x.size() + # Concat columns of pad to shift from relative to absolute indexing. + x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]])) + + # Concat extra elements so to add up to shape (len+1, 2*len-1). + x_flat = x.view([batch, heads, length * 2 * length]) + x_flat = F.pad( + x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [0, length - 1]]) + ) + + # Reshape and slice out the padded elements. 
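+        # This is the usual "skewing" trick: the padding added above offsets each
+        # row once the tensor is flattened, so a single view plus slice converts
+        # the [b, h, l, 2*l-1] relative logits into [b, h, l, l] absolute scores
+        # without any gather ops.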
+ x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[ + :, :, :length, length - 1 : + ] + return x_final + + def _absolute_position_to_relative_position(self, x): + """ + x: [b, h, l, l] + ret: [b, h, l, 2*l-1] + """ + batch, heads, length, _ = x.size() + # padd along column + x = F.pad( + x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]]) + ) + x_flat = x.view([batch, heads, length ** 2 + length * (length - 1)]) + # add 0's in the beginning that will skew the elements after reshape + x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]])) + x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:] + return x_final + + def _attention_bias_proximal(self, length): + """Bias for self-attention to encourage attention to close positions. + Args: + length: an integer scalar. + Returns: + a Tensor with shape [1, 1, length, length] + """ + r = torch.arange(length, dtype=torch.float32) + diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1) + return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0) + + +class FFN(nn.Module): + def __init__( + self, + in_channels, + out_channels, + filter_channels, + kernel_size, + p_dropout=0.0, + activation=None, + ): + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.filter_channels = filter_channels + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.activation = activation + + self.conv_1 = nn.Conv1d( + in_channels, filter_channels, kernel_size, padding=kernel_size // 2 + ) + self.conv_2 = nn.Conv1d( + filter_channels, out_channels, kernel_size, padding=kernel_size // 2 + ) + self.drop = nn.Dropout(p_dropout) + + def forward(self, x, x_mask): + x = self.conv_1(x * x_mask) + if self.activation == "gelu": + x = x * torch.sigmoid(1.702 * x) + else: + x = torch.relu(x) + x = self.drop(x) + x = self.conv_2(x * x_mask) + return x * x_mask diff --git a/ttsv/src/glow_tts/audio_processing.py b/ttsv/src/glow_tts/audio_processing.py new file mode 100644 index 0000000000000000000000000000000000000000..3a4467355952fefaba117b6014864139ac319c6b --- /dev/null +++ b/ttsv/src/glow_tts/audio_processing.py @@ -0,0 +1,100 @@ +import torch +import numpy as np +from scipy.signal import get_window +import librosa.util as librosa_util + + +def window_sumsquare( + window, + n_frames, + hop_length=200, + win_length=800, + n_fft=800, + dtype=np.float32, + norm=None, +): + """ + # from librosa 0.6 + Compute the sum-square envelope of a window function at a given hop length. + + This is used to estimate modulation effects induced by windowing + observations in short-time fourier transforms. + + Parameters + ---------- + window : string, tuple, number, callable, or list-like + Window specification, as in `get_window` + + n_frames : int > 0 + The number of analysis frames + + hop_length : int > 0 + The number of samples to advance between frames + + win_length : [optional] + The length of the window function. By default, this matches `n_fft`. + + n_fft : int > 0 + The length of each analysis frame. 
+ + dtype : np.dtype + The data type of the output + + Returns + ------- + wss : np.ndarray, shape=`(n_fft + hop_length * (n_frames - 1))` + The sum-squared envelope of the window function + """ + if win_length is None: + win_length = n_fft + + n = n_fft + hop_length * (n_frames - 1) + x = np.zeros(n, dtype=dtype) + + # Compute the squared window at the desired length + win_sq = get_window(window, win_length, fftbins=True) + win_sq = librosa_util.normalize(win_sq, norm=norm) ** 2 + win_sq = librosa_util.pad_center(win_sq, n_fft) + + # Fill the envelope + for i in range(n_frames): + sample = i * hop_length + x[sample : min(n, sample + n_fft)] += win_sq[: max(0, min(n_fft, n - sample))] + return x + + +def griffin_lim(magnitudes, stft_fn, n_iters=30): + """ + PARAMS + ------ + magnitudes: spectrogram magnitudes + stft_fn: STFT class with transform (STFT) and inverse (ISTFT) methods + """ + + angles = np.angle(np.exp(2j * np.pi * np.random.rand(*magnitudes.size()))) + angles = angles.astype(np.float32) + angles = torch.autograd.Variable(torch.from_numpy(angles)) + signal = stft_fn.inverse(magnitudes, angles).squeeze(1) + + for i in range(n_iters): + _, angles = stft_fn.transform(signal) + signal = stft_fn.inverse(magnitudes, angles).squeeze(1) + return signal + + +def dynamic_range_compression(x, C=1, clip_val=1e-5): + """ + PARAMS + ------ + C: compression factor + """ + return torch.log(torch.clamp(x, min=clip_val) * C) + + +def dynamic_range_decompression(x, C=1): + """ + PARAMS + ------ + C: compression factor used to compress + """ + return torch.exp(x) / C diff --git a/ttsv/src/glow_tts/commons.py b/ttsv/src/glow_tts/commons.py new file mode 100644 index 0000000000000000000000000000000000000000..8da7b35049d768a29de6f66cbe8795a825967818 --- /dev/null +++ b/ttsv/src/glow_tts/commons.py @@ -0,0 +1,273 @@ +import math +import numpy as np +import torch +from torch import nn +from torch.nn import functional as F + +from librosa.filters import mel as librosa_mel_fn +from audio_processing import dynamic_range_compression +from audio_processing import dynamic_range_decompression +from stft import STFT + + +def intersperse(lst, item): + result = [item] * (len(lst) * 2 + 1) + result[1::2] = lst + return result + + +def mle_loss(z, m, logs, logdet, mask): + l = torch.sum(logs) + 0.5 * torch.sum( + torch.exp(-2 * logs) * ((z - m) ** 2) + ) # neg normal likelihood w/o the constant term + l = l - torch.sum(logdet) # log jacobian determinant + l = l / torch.sum( + torch.ones_like(z) * mask + ) # averaging across batch, channel and time axes + l = l + 0.5 * math.log(2 * math.pi) # add the remaining constant term + return l + + +def duration_loss(logw, logw_, lengths): + l = torch.sum((logw - logw_) ** 2) / torch.sum(lengths) + return l + + +@torch.jit.script +def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): + n_channels_int = n_channels[0] + in_act = input_a + input_b + t_act = torch.tanh(in_act[:, :n_channels_int, :]) + s_act = torch.sigmoid(in_act[:, n_channels_int:, :]) + acts = t_act * s_act + return acts + + +def convert_pad_shape(pad_shape): + l = pad_shape[::-1] + pad_shape = [item for sublist in l for item in sublist] + return pad_shape + + +def shift_1d(x): + x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1] + return x + + +def sequence_mask(length, max_length=None): + if max_length is None: + max_length = length.max() + x = torch.arange(max_length, dtype=length.dtype, device=length.device) + return x.unsqueeze(0) < length.unsqueeze(1) + + +def 
maximum_path(value, mask, max_neg_val=-np.inf): + """Numpy-friendly version. It's about 4 times faster than torch version. + value: [b, t_x, t_y] + mask: [b, t_x, t_y] + """ + value = value * mask + + device = value.device + dtype = value.dtype + value = value.cpu().detach().numpy() + mask = mask.cpu().detach().numpy().astype(np.bool) + + b, t_x, t_y = value.shape + direction = np.zeros(value.shape, dtype=np.int64) + v = np.zeros((b, t_x), dtype=np.float32) + x_range = np.arange(t_x, dtype=np.float32).reshape(1, -1) + for j in range(t_y): + v0 = np.pad(v, [[0, 0], [1, 0]], mode="constant", constant_values=max_neg_val)[ + :, :-1 + ] + v1 = v + max_mask = v1 >= v0 + v_max = np.where(max_mask, v1, v0) + direction[:, :, j] = max_mask + + index_mask = x_range <= j + v = np.where(index_mask, v_max + value[:, :, j], max_neg_val) + direction = np.where(mask, direction, 1) + + path = np.zeros(value.shape, dtype=np.float32) + index = mask[:, :, 0].sum(1).astype(np.int64) - 1 + index_range = np.arange(b) + for j in reversed(range(t_y)): + path[index_range, index, j] = 1 + index = index + direction[index_range, index, j] - 1 + path = path * mask.astype(np.float32) + path = torch.from_numpy(path).to(device=device, dtype=dtype) + return path + + +def generate_path(duration, mask): + """ + duration: [b, t_x] + mask: [b, t_x, t_y] + """ + device = duration.device + + b, t_x, t_y = mask.shape + cum_duration = torch.cumsum(duration, 1) + path = torch.zeros(b, t_x, t_y, dtype=mask.dtype).to(device=device) + + cum_duration_flat = cum_duration.view(b * t_x) + path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype) + path = path.view(b, t_x, t_y) + path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1] + path = path * mask + return path + + +class Adam: + def __init__( + self, + params, + scheduler, + dim_model, + warmup_steps=4000, + lr=1e0, + betas=(0.9, 0.98), + eps=1e-9, + ): + self.params = params + self.scheduler = scheduler + self.dim_model = dim_model + self.warmup_steps = warmup_steps + self.lr = lr + self.betas = betas + self.eps = eps + + self.step_num = 1 + self.cur_lr = lr * self._get_lr_scale() + + self._optim = torch.optim.Adam(params, lr=self.cur_lr, betas=betas, eps=eps) + + def _get_lr_scale(self): + if self.scheduler == "noam": + return np.power(self.dim_model, -0.5) * np.min( + [ + np.power(self.step_num, -0.5), + self.step_num * np.power(self.warmup_steps, -1.5), + ] + ) + else: + return 1 + + def _update_learning_rate(self): + self.step_num += 1 + if self.scheduler == "noam": + self.cur_lr = self.lr * self._get_lr_scale() + for param_group in self._optim.param_groups: + param_group["lr"] = self.cur_lr + + def get_lr(self): + return self.cur_lr + + def step(self): + self._optim.step() + self._update_learning_rate() + + def zero_grad(self): + self._optim.zero_grad() + + def load_state_dict(self, d): + self._optim.load_state_dict(d) + + def state_dict(self): + return self._optim.state_dict() + + +class TacotronSTFT(nn.Module): + def __init__( + self, + filter_length=1024, + hop_length=256, + win_length=1024, + n_mel_channels=80, + sampling_rate=22050, + mel_fmin=0.0, + mel_fmax=8000.0, + ): + super(TacotronSTFT, self).__init__() + self.n_mel_channels = n_mel_channels + self.sampling_rate = sampling_rate + self.stft_fn = STFT(filter_length, hop_length, win_length) + mel_basis = librosa_mel_fn( + sampling_rate, filter_length, n_mel_channels, mel_fmin, mel_fmax + ) + mel_basis = torch.from_numpy(mel_basis).float() + self.register_buffer("mel_basis", mel_basis) + + def 
spectral_normalize(self, magnitudes): + output = dynamic_range_compression(magnitudes) + return output + + def spectral_de_normalize(self, magnitudes): + output = dynamic_range_decompression(magnitudes) + return output + + def mel_spectrogram(self, y): + """Computes mel-spectrograms from a batch of waves + PARAMS + ------ + y: Variable(torch.FloatTensor) with shape (B, T) in range [-1, 1] + + RETURNS + ------- + mel_output: torch.FloatTensor of shape (B, n_mel_channels, T) + """ + assert torch.min(y.data) >= -1 + assert torch.max(y.data) <= 1 + + magnitudes, phases = self.stft_fn.transform(y) + magnitudes = magnitudes.data + mel_output = torch.matmul(self.mel_basis, magnitudes) + mel_output = self.spectral_normalize(mel_output) + return mel_output + + +def clip_grad_value_(parameters, clip_value, norm_type=2): + if isinstance(parameters, torch.Tensor): + parameters = [parameters] + parameters = list(filter(lambda p: p.grad is not None, parameters)) + norm_type = float(norm_type) + clip_value = float(clip_value) + + total_norm = 0 + for p in parameters: + param_norm = p.grad.data.norm(norm_type) + total_norm += param_norm.item() ** norm_type + + p.grad.data.clamp_(min=-clip_value, max=clip_value) + total_norm = total_norm ** (1.0 / norm_type) + return total_norm + + +def squeeze(x, x_mask=None, n_sqz=2): + b, c, t = x.size() + + t = (t // n_sqz) * n_sqz + x = x[:, :, :t] + x_sqz = x.view(b, c, t // n_sqz, n_sqz) + x_sqz = x_sqz.permute(0, 3, 1, 2).contiguous().view(b, c * n_sqz, t // n_sqz) + + if x_mask is not None: + x_mask = x_mask[:, :, n_sqz - 1 :: n_sqz] + else: + x_mask = torch.ones(b, 1, t // n_sqz).to(device=x.device, dtype=x.dtype) + return x_sqz * x_mask, x_mask + + +def unsqueeze(x, x_mask=None, n_sqz=2): + b, c, t = x.size() + + x_unsqz = x.view(b, n_sqz, c // n_sqz, t) + x_unsqz = x_unsqz.permute(0, 2, 3, 1).contiguous().view(b, c // n_sqz, t * n_sqz) + + if x_mask is not None: + x_mask = x_mask.unsqueeze(-1).repeat(1, 1, 1, n_sqz).view(b, 1, t * n_sqz) + else: + x_mask = torch.ones(b, 1, t * n_sqz).to(device=x.device, dtype=x.dtype) + return x_unsqz * x_mask, x_mask diff --git a/ttsv/src/glow_tts/data_utils.py b/ttsv/src/glow_tts/data_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..b58d84b3df3de3afb0a6a3bb8fadfd7a592dd602 --- /dev/null +++ b/ttsv/src/glow_tts/data_utils.py @@ -0,0 +1,274 @@ +import random +import numpy as np +import torch +import torch.utils.data + +import commons +from utils import load_wav_to_torch, load_filepaths_and_text +from text import text_to_sequence + +class TextMelLoader(torch.utils.data.Dataset): + """ + 1) loads audio,text pairs + 2) normalizes text and converts them to sequences of one-hot vectors + 3) computes mel-spectrograms from audio files. 
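+    4) optionally intersperses a blank token between symbol ids when
+       hparams.add_blank is set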
+ """ + + def __init__(self, audiopaths_and_text, hparams): + self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text) + self.text_cleaners = hparams.text_cleaners + self.max_wav_value = hparams.max_wav_value + self.sampling_rate = hparams.sampling_rate + self.load_mel_from_disk = hparams.load_mel_from_disk + self.add_noise = hparams.add_noise + self.symbols = hparams.punc + hparams.chars + self.add_blank = getattr(hparams, "add_blank", False) # improved version + self.stft = commons.TacotronSTFT( + hparams.filter_length, + hparams.hop_length, + hparams.win_length, + hparams.n_mel_channels, + hparams.sampling_rate, + hparams.mel_fmin, + hparams.mel_fmax, + ) + random.seed(1234) + random.shuffle(self.audiopaths_and_text) + + def get_mel_text_pair(self, audiopath_and_text): + # separate filename and text + audiopath, text = audiopath_and_text[0], audiopath_and_text[1] + text = self.get_text(text) + mel = self.get_mel(audiopath) + return (text, mel) + + def get_mel(self, filename): + if not self.load_mel_from_disk: + audio, sampling_rate = load_wav_to_torch(filename) + if sampling_rate != self.stft.sampling_rate: + raise ValueError( + "{} {} SR doesn't match target {} SR".format( + sampling_rate, self.stft.sampling_rate + ) + ) + if self.add_noise: + audio = audio + torch.rand_like(audio) + audio_norm = audio / self.max_wav_value + audio_norm = audio_norm.unsqueeze(0) + melspec = self.stft.mel_spectrogram(audio_norm) + melspec = torch.squeeze(melspec, 0) + else: + melspec = torch.from_numpy(np.load(filename)) + assert ( + melspec.size(0) == self.stft.n_mel_channels + ), "Mel dimension mismatch: given {}, expected {}".format( + melspec.size(0), self.stft.n_mel_channels + ) + + return melspec + + def get_text(self, text): + text_norm = text_to_sequence(text, self.symbols, self.text_cleaners) + if self.add_blank: + text_norm = commons.intersperse( + text_norm, len(self.symbols) + ) # add a blank token, whose id number is len(symbols) + text_norm = torch.IntTensor(text_norm) + return text_norm + + def __getitem__(self, index): + return self.get_mel_text_pair(self.audiopaths_and_text[index]) + + def __len__(self): + return len(self.audiopaths_and_text) + + +class TextMelCollate: + """Zero-pads model inputs and targets based on number of frames per step""" + + def __init__(self, n_frames_per_step=1): + self.n_frames_per_step = n_frames_per_step + + def __call__(self, batch): + """Collate's training batch from normalized text and mel-spectrogram + PARAMS + ------ + batch: [text_normalized, mel_normalized] + """ + # Right zero-pad all one-hot text sequences to max input length + input_lengths, ids_sorted_decreasing = torch.sort( + torch.LongTensor([len(x[0]) for x in batch]), dim=0, descending=True + ) + max_input_len = input_lengths[0] + + text_padded = torch.LongTensor(len(batch), max_input_len) + text_padded.zero_() + for i in range(len(ids_sorted_decreasing)): + text = batch[ids_sorted_decreasing[i]][0] + text_padded[i, : text.size(0)] = text + + # Right zero-pad mel-spec + num_mels = batch[0][1].size(0) + max_target_len = max([x[1].size(1) for x in batch]) + if max_target_len % self.n_frames_per_step != 0: + max_target_len += ( + self.n_frames_per_step - max_target_len % self.n_frames_per_step + ) + assert max_target_len % self.n_frames_per_step == 0 + + # include mel padded + mel_padded = torch.FloatTensor(len(batch), num_mels, max_target_len) + mel_padded.zero_() + output_lengths = torch.LongTensor(len(batch)) + for i in range(len(ids_sorted_decreasing)): + mel = 
batch[ids_sorted_decreasing[i]][1] + mel_padded[i, :, : mel.size(1)] = mel + output_lengths[i] = mel.size(1) + + return text_padded, input_lengths, mel_padded, output_lengths + + +"""Multi speaker version""" + + +class TextMelSpeakerLoader(torch.utils.data.Dataset): + """ + 1) loads audio, speaker_id, text pairs + 2) normalizes text and converts them to sequences of one-hot vectors + 3) computes mel-spectrograms from audio files. + """ + + def __init__(self, audiopaths_sid_text, hparams): + self.audiopaths_sid_text = load_filepaths_and_text(audiopaths_sid_text) + self.text_cleaners = hparams.text_cleaners + self.max_wav_value = hparams.max_wav_value + self.sampling_rate = hparams.sampling_rate + self.load_mel_from_disk = hparams.load_mel_from_disk + self.add_noise = hparams.add_noise + self.symbols = hparams.punc + hparams.chars + self.add_blank = getattr(hparams, "add_blank", False) # improved version + self.min_text_len = getattr(hparams, "min_text_len", 1) + self.max_text_len = getattr(hparams, "max_text_len", 190) + self.stft = commons.TacotronSTFT( + hparams.filter_length, + hparams.hop_length, + hparams.win_length, + hparams.n_mel_channels, + hparams.sampling_rate, + hparams.mel_fmin, + hparams.mel_fmax, + ) + + self._filter_text_len() + random.seed(1234) + random.shuffle(self.audiopaths_sid_text) + + def _filter_text_len(self): + audiopaths_sid_text_new = [] + for audiopath, sid, text in self.audiopaths_sid_text: + if self.min_text_len <= len(text) and len(text) <= self.max_text_len: + audiopaths_sid_text_new.append([audiopath, sid, text]) + self.audiopaths_sid_text = audiopaths_sid_text_new + + def get_mel_text_speaker_pair(self, audiopath_sid_text): + # separate filename, speaker_id and text + audiopath, sid, text = ( + audiopath_sid_text[0], + audiopath_sid_text[1], + audiopath_sid_text[2], + ) + text = self.get_text(text) + mel = self.get_mel(audiopath) + sid = self.get_sid(sid) + return (text, mel, sid) + + def get_mel(self, filename): + if not self.load_mel_from_disk: + audio, sampling_rate = load_wav_to_torch(filename) + if sampling_rate != self.stft.sampling_rate: + raise ValueError( + "{} {} SR doesn't match target {} SR".format( + sampling_rate, self.stft.sampling_rate + ) + ) + if self.add_noise: + audio = audio + torch.rand_like(audio) + audio_norm = audio / self.max_wav_value + audio_norm = audio_norm.unsqueeze(0) + melspec = self.stft.mel_spectrogram(audio_norm) + melspec = torch.squeeze(melspec, 0) + else: + melspec = torch.from_numpy(np.load(filename)) + assert ( + melspec.size(0) == self.stft.n_mel_channels + ), "Mel dimension mismatch: given {}, expected {}".format( + melspec.size(0), self.stft.n_mel_channels + ) + + return melspec + + def get_text(self, text): + text_norm = text_to_sequence(text, self.symbols, self.text_cleaners) + if self.add_blank: + text_norm = commons.intersperse( + text_norm, len(self.symbols) + ) # add a blank token, whose id number is len(symbols) + text_norm = torch.IntTensor(text_norm) + return text_norm + + def get_sid(self, sid): + sid = torch.IntTensor([int(sid)]) + return sid + + def __getitem__(self, index): + return self.get_mel_text_speaker_pair(self.audiopaths_sid_text[index]) + + def __len__(self): + return len(self.audiopaths_sid_text) + + +class TextMelSpeakerCollate: + """Zero-pads model inputs and targets based on number of frames per step""" + + def __init__(self, n_frames_per_step=1): + self.n_frames_per_step = n_frames_per_step + + def __call__(self, batch): + """Collate's training batch from normalized text and 
mel-spectrogram + PARAMS + ------ + batch: [text_normalized, mel_normalized] + """ + # Right zero-pad all one-hot text sequences to max input length + input_lengths, ids_sorted_decreasing = torch.sort( + torch.LongTensor([len(x[0]) for x in batch]), dim=0, descending=True + ) + max_input_len = input_lengths[0] + + text_padded = torch.LongTensor(len(batch), max_input_len) + text_padded.zero_() + for i in range(len(ids_sorted_decreasing)): + text = batch[ids_sorted_decreasing[i]][0] + text_padded[i, : text.size(0)] = text + + # Right zero-pad mel-spec + num_mels = batch[0][1].size(0) + max_target_len = max([x[1].size(1) for x in batch]) + if max_target_len % self.n_frames_per_step != 0: + max_target_len += ( + self.n_frames_per_step - max_target_len % self.n_frames_per_step + ) + assert max_target_len % self.n_frames_per_step == 0 + + # include mel padded & sid + mel_padded = torch.FloatTensor(len(batch), num_mels, max_target_len) + mel_padded.zero_() + output_lengths = torch.LongTensor(len(batch)) + sid = torch.LongTensor(len(batch)) + for i in range(len(ids_sorted_decreasing)): + mel = batch[ids_sorted_decreasing[i]][1] + mel_padded[i, :, : mel.size(1)] = mel + output_lengths[i] = mel.size(1) + sid[i] = batch[ids_sorted_decreasing[i]][2] + + return text_padded, input_lengths, mel_padded, output_lengths, sid diff --git a/ttsv/src/glow_tts/generate_mels.py b/ttsv/src/glow_tts/generate_mels.py new file mode 100644 index 0000000000000000000000000000000000000000..a3d331aef019cfd8cf45d6264db88d0fa26e5c0f --- /dev/null +++ b/ttsv/src/glow_tts/generate_mels.py @@ -0,0 +1,70 @@ +import numpy as np +import os +import torch +import commons + +import models +import utils +from argparse import ArgumentParser +from tqdm import tqdm +from text import text_to_sequence + +if __name__ == "__main__": + parser = ArgumentParser() + parser.add_argument("-m", "--model_dir", required=True, type=str) + parser.add_argument("-s", "--mels_dir", required=True, type=str) + args = parser.parse_args() + MODEL_DIR = args.model_dir # path to model dir + SAVE_MELS_DIR = args.mels_dir # path to save generated mels + + if not os.path.exists(SAVE_MELS_DIR): + os.makedirs(SAVE_MELS_DIR) + + hps = utils.get_hparams_from_dir(MODEL_DIR) + symbols = list(hps.data.punc) + list(hps.data.chars) + checkpoint_path = utils.latest_checkpoint_path(MODEL_DIR) + cleaner = hps.data.text_cleaners + + model = models.FlowGenerator( + len(symbols) + getattr(hps.data, "add_blank", False), + out_channels=hps.data.n_mel_channels, + **hps.model + ).to("cuda") + + utils.load_checkpoint(checkpoint_path, model) + model.decoder.store_inverse() # do not calcuate jacobians for fast decoding + _ = model.eval() + + def get_mel(text, fpath): + if getattr(hps.data, "add_blank", False): + text_norm = text_to_sequence(text, symbols, cleaner) + text_norm = commons.intersperse(text_norm, len(symbols)) + else: # If not using "add_blank" option during training, adding spaces at the beginning and the end of utterance improves quality + text = " " + text.strip() + " " + text_norm = text_to_sequence(text, symbols, cleaner) + + sequence = np.array(text_norm)[None, :] + + x_tst = torch.autograd.Variable(torch.from_numpy(sequence)).cuda().long() + x_tst_lengths = torch.tensor([x_tst.shape[1]]).cuda() + + with torch.no_grad(): + noise_scale = 0.667 + length_scale = 1.0 + (y_gen_tst, *_), *_, (attn_gen, *_) = model( + x_tst, + x_tst_lengths, + gen=True, + noise_scale=noise_scale, + length_scale=length_scale, + ) + + np.save(os.path.join(SAVE_MELS_DIR, fpath), 
y_gen_tst.cpu().detach().numpy()) + + for f in [hps.data.training_files, hps.data.validation_files]: + file_lines = open(f).read().splitlines() + + for line in tqdm(file_lines): + fname, text = line.split("|") + fname = os.path.basename(fname).replace(".wav", ".npy") + get_mel(text, fname) diff --git a/ttsv/src/glow_tts/hifi/__init__.py b/ttsv/src/glow_tts/hifi/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..0323b35a0fc2ef21ac417857d9336cc7c8a3b717 --- /dev/null +++ b/ttsv/src/glow_tts/hifi/__init__.py @@ -0,0 +1,5 @@ +from .env import AttrDict +from .models import Generator + +if __name__ == "__main__": + pass diff --git a/ttsv/src/glow_tts/hifi/env.py b/ttsv/src/glow_tts/hifi/env.py new file mode 100644 index 0000000000000000000000000000000000000000..2bdbc95d4f7a8bad8fd4f5eef657e2b51d946056 --- /dev/null +++ b/ttsv/src/glow_tts/hifi/env.py @@ -0,0 +1,15 @@ +import os +import shutil + + +class AttrDict(dict): + def __init__(self, *args, **kwargs): + super(AttrDict, self).__init__(*args, **kwargs) + self.__dict__ = self + + +def build_env(config, config_name, path): + t_path = os.path.join(path, config_name) + if config != t_path: + os.makedirs(path, exist_ok=True) + shutil.copyfile(config, os.path.join(path, config_name)) diff --git a/ttsv/src/glow_tts/hifi/models.py b/ttsv/src/glow_tts/hifi/models.py new file mode 100644 index 0000000000000000000000000000000000000000..aaf911836119d69129abe22aa4fc875f2ba3d53c --- /dev/null +++ b/ttsv/src/glow_tts/hifi/models.py @@ -0,0 +1,403 @@ +import torch +import torch.nn.functional as F +import torch.nn as nn +from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d +from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm +from .utils import init_weights, get_padding + +LRELU_SLOPE = 0.1 + + +class ResBlock1(torch.nn.Module): + def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5)): + super(ResBlock1, self).__init__() + self.h = h + self.convs1 = nn.ModuleList( + [ + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=dilation[0], + padding=get_padding(kernel_size, dilation[0]), + ) + ), + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=dilation[1], + padding=get_padding(kernel_size, dilation[1]), + ) + ), + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=dilation[2], + padding=get_padding(kernel_size, dilation[2]), + ) + ), + ] + ) + self.convs1.apply(init_weights) + + self.convs2 = nn.ModuleList( + [ + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=1, + padding=get_padding(kernel_size, 1), + ) + ), + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=1, + padding=get_padding(kernel_size, 1), + ) + ), + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=1, + padding=get_padding(kernel_size, 1), + ) + ), + ] + ) + self.convs2.apply(init_weights) + + def forward(self, x): + for c1, c2 in zip(self.convs1, self.convs2): + xt = F.leaky_relu(x, LRELU_SLOPE) + xt = c1(xt) + xt = F.leaky_relu(xt, LRELU_SLOPE) + xt = c2(xt) + x = xt + x + return x + + def remove_weight_norm(self): + for l in self.convs1: + remove_weight_norm(l) + for l in self.convs2: + remove_weight_norm(l) + + +class ResBlock2(torch.nn.Module): + def __init__(self, h, channels, kernel_size=3, dilation=(1, 3)): + super(ResBlock2, self).__init__() + self.h = h + self.convs = nn.ModuleList( + [ + weight_norm( + Conv1d( + channels, + channels, + 
kernel_size, + 1, + dilation=dilation[0], + padding=get_padding(kernel_size, dilation[0]), + ) + ), + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=dilation[1], + padding=get_padding(kernel_size, dilation[1]), + ) + ), + ] + ) + self.convs.apply(init_weights) + + def forward(self, x): + for c in self.convs: + xt = F.leaky_relu(x, LRELU_SLOPE) + xt = c(xt) + x = xt + x + return x + + def remove_weight_norm(self): + for l in self.convs: + remove_weight_norm(l) + + +class Generator(torch.nn.Module): + def __init__(self, h): + super(Generator, self).__init__() + self.h = h + self.num_kernels = len(h.resblock_kernel_sizes) + self.num_upsamples = len(h.upsample_rates) + self.conv_pre = weight_norm( + Conv1d(80, h.upsample_initial_channel, 7, 1, padding=3) + ) + resblock = ResBlock1 if h.resblock == "1" else ResBlock2 + + self.ups = nn.ModuleList() + for i, (u, k) in enumerate(zip(h.upsample_rates, h.upsample_kernel_sizes)): + self.ups.append( + weight_norm( + ConvTranspose1d( + h.upsample_initial_channel // (2 ** i), + h.upsample_initial_channel // (2 ** (i + 1)), + k, + u, + padding=(k - u) // 2, + ) + ) + ) + + self.resblocks = nn.ModuleList() + for i in range(len(self.ups)): + ch = h.upsample_initial_channel // (2 ** (i + 1)) + for j, (k, d) in enumerate( + zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes) + ): + self.resblocks.append(resblock(h, ch, k, d)) + + self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3)) + self.ups.apply(init_weights) + self.conv_post.apply(init_weights) + + def forward(self, x): + x = self.conv_pre(x) + for i in range(self.num_upsamples): + x = F.leaky_relu(x, LRELU_SLOPE) + x = self.ups[i](x) + xs = None + for j in range(self.num_kernels): + if xs is None: + xs = self.resblocks[i * self.num_kernels + j](x) + else: + xs += self.resblocks[i * self.num_kernels + j](x) + x = xs / self.num_kernels + x = F.leaky_relu(x) + x = self.conv_post(x) + x = torch.tanh(x) + + return x + + def remove_weight_norm(self): + print("Removing weight norm...") + for l in self.ups: + remove_weight_norm(l) + for l in self.resblocks: + l.remove_weight_norm() + remove_weight_norm(self.conv_pre) + remove_weight_norm(self.conv_post) + + +class DiscriminatorP(torch.nn.Module): + def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False): + super(DiscriminatorP, self).__init__() + self.period = period + norm_f = weight_norm if use_spectral_norm == False else spectral_norm + self.convs = nn.ModuleList( + [ + norm_f( + Conv2d( + 1, + 32, + (kernel_size, 1), + (stride, 1), + padding=(get_padding(5, 1), 0), + ) + ), + norm_f( + Conv2d( + 32, + 128, + (kernel_size, 1), + (stride, 1), + padding=(get_padding(5, 1), 0), + ) + ), + norm_f( + Conv2d( + 128, + 512, + (kernel_size, 1), + (stride, 1), + padding=(get_padding(5, 1), 0), + ) + ), + norm_f( + Conv2d( + 512, + 1024, + (kernel_size, 1), + (stride, 1), + padding=(get_padding(5, 1), 0), + ) + ), + norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(2, 0))), + ] + ) + self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0))) + + def forward(self, x): + fmap = [] + + # 1d to 2d + b, c, t = x.shape + if t % self.period != 0: # pad first + n_pad = self.period - (t % self.period) + x = F.pad(x, (0, n_pad), "reflect") + t = t + n_pad + x = x.view(b, c, t // self.period, self.period) + + for l in self.convs: + x = l(x) + x = F.leaky_relu(x, LRELU_SLOPE) + fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + x = torch.flatten(x, 1, -1) + + return x, fmap + + +class 
MultiPeriodDiscriminator(torch.nn.Module): + def __init__(self): + super(MultiPeriodDiscriminator, self).__init__() + self.discriminators = nn.ModuleList( + [ + DiscriminatorP(2), + DiscriminatorP(3), + DiscriminatorP(5), + DiscriminatorP(7), + DiscriminatorP(11), + ] + ) + + def forward(self, y, y_hat): + y_d_rs = [] + y_d_gs = [] + fmap_rs = [] + fmap_gs = [] + for i, d in enumerate(self.discriminators): + y_d_r, fmap_r = d(y) + y_d_g, fmap_g = d(y_hat) + y_d_rs.append(y_d_r) + fmap_rs.append(fmap_r) + y_d_gs.append(y_d_g) + fmap_gs.append(fmap_g) + + return y_d_rs, y_d_gs, fmap_rs, fmap_gs + + +class DiscriminatorS(torch.nn.Module): + def __init__(self, use_spectral_norm=False): + super(DiscriminatorS, self).__init__() + norm_f = weight_norm if use_spectral_norm == False else spectral_norm + self.convs = nn.ModuleList( + [ + norm_f(Conv1d(1, 128, 15, 1, padding=7)), + norm_f(Conv1d(128, 128, 41, 2, groups=4, padding=20)), + norm_f(Conv1d(128, 256, 41, 2, groups=16, padding=20)), + norm_f(Conv1d(256, 512, 41, 4, groups=16, padding=20)), + norm_f(Conv1d(512, 1024, 41, 4, groups=16, padding=20)), + norm_f(Conv1d(1024, 1024, 41, 1, groups=16, padding=20)), + norm_f(Conv1d(1024, 1024, 5, 1, padding=2)), + ] + ) + self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1)) + + def forward(self, x): + fmap = [] + for l in self.convs: + x = l(x) + x = F.leaky_relu(x, LRELU_SLOPE) + fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + x = torch.flatten(x, 1, -1) + + return x, fmap + + +class MultiScaleDiscriminator(torch.nn.Module): + def __init__(self): + super(MultiScaleDiscriminator, self).__init__() + self.discriminators = nn.ModuleList( + [ + DiscriminatorS(use_spectral_norm=True), + DiscriminatorS(), + DiscriminatorS(), + ] + ) + self.meanpools = nn.ModuleList( + [AvgPool1d(4, 2, padding=2), AvgPool1d(4, 2, padding=2)] + ) + + def forward(self, y, y_hat): + y_d_rs = [] + y_d_gs = [] + fmap_rs = [] + fmap_gs = [] + for i, d in enumerate(self.discriminators): + if i != 0: + y = self.meanpools[i - 1](y) + y_hat = self.meanpools[i - 1](y_hat) + y_d_r, fmap_r = d(y) + y_d_g, fmap_g = d(y_hat) + y_d_rs.append(y_d_r) + fmap_rs.append(fmap_r) + y_d_gs.append(y_d_g) + fmap_gs.append(fmap_g) + + return y_d_rs, y_d_gs, fmap_rs, fmap_gs + + +def feature_loss(fmap_r, fmap_g): + loss = 0 + for dr, dg in zip(fmap_r, fmap_g): + for rl, gl in zip(dr, dg): + loss += torch.mean(torch.abs(rl - gl)) + + return loss * 2 + + +def discriminator_loss(disc_real_outputs, disc_generated_outputs): + loss = 0 + r_losses = [] + g_losses = [] + for dr, dg in zip(disc_real_outputs, disc_generated_outputs): + r_loss = torch.mean((1 - dr) ** 2) + g_loss = torch.mean(dg ** 2) + loss += r_loss + g_loss + r_losses.append(r_loss.item()) + g_losses.append(g_loss.item()) + + return loss, r_losses, g_losses + + +def generator_loss(disc_outputs): + loss = 0 + gen_losses = [] + for dg in disc_outputs: + l = torch.mean((1 - dg) ** 2) + gen_losses.append(l) + loss += l + + return loss, gen_losses diff --git a/ttsv/src/glow_tts/hifi/utils.py b/ttsv/src/glow_tts/hifi/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..71e9b2c99e053e2d4239074a67d64b834898c348 --- /dev/null +++ b/ttsv/src/glow_tts/hifi/utils.py @@ -0,0 +1,57 @@ +import glob +import os +import matplotlib +import torch +from torch.nn.utils import weight_norm + +matplotlib.use("Agg") +import matplotlib.pylab as plt + + +def plot_spectrogram(spectrogram): + fig, ax = plt.subplots(figsize=(10, 2)) + im = ax.imshow(spectrogram, aspect="auto", 
origin="lower", interpolation="none") + plt.colorbar(im, ax=ax) + + fig.canvas.draw() + plt.close() + + return fig + + +def init_weights(m, mean=0.0, std=0.01): + classname = m.__class__.__name__ + if classname.find("Conv") != -1: + m.weight.data.normal_(mean, std) + + +def apply_weight_norm(m): + classname = m.__class__.__name__ + if classname.find("Conv") != -1: + weight_norm(m) + + +def get_padding(kernel_size, dilation=1): + return int((kernel_size * dilation - dilation) / 2) + + +def load_checkpoint(filepath, device): + assert os.path.isfile(filepath) + print("Loading '{}'".format(filepath)) + checkpoint_dict = torch.load(filepath, map_location=device) + print("Complete.") + return checkpoint_dict + + +def save_checkpoint(filepath, obj): + print("Saving checkpoint to {}".format(filepath)) + torch.save(obj, filepath) + print("Complete.") + + +def scan_checkpoint(cp_dir, prefix): + pattern = os.path.join(cp_dir, prefix + "????????") + cp_list = glob.glob(pattern) + if len(cp_list) == 0: + return None + return sorted(cp_list)[-1] diff --git a/ttsv/src/glow_tts/init.py b/ttsv/src/glow_tts/init.py new file mode 100644 index 0000000000000000000000000000000000000000..39dd83dbd55475d562a3f54d951cb822800d2e0f --- /dev/null +++ b/ttsv/src/glow_tts/init.py @@ -0,0 +1,79 @@ +import os +import json +import argparse +import math +import torch +from torch import nn, optim +from torch.nn import functional as F +from torch.utils.data import DataLoader + +from data_utils import TextMelLoader, TextMelCollate +import models +import commons +import utils + + +class FlowGenerator_DDI(models.FlowGenerator): + """A helper for Data-dependent Initialization""" + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + for f in self.decoder.flows: + if getattr(f, "set_ddi", False): + f.set_ddi(True) + + +def main(): + hps = utils.get_hparams() + logger = utils.get_logger(hps.log_dir) + logger.info(hps) + utils.check_git_hash(hps.log_dir) + + torch.manual_seed(hps.train.seed) + + train_dataset = TextMelLoader(hps.data.training_files, hps.data) + collate_fn = TextMelCollate(1) + train_loader = DataLoader( + train_dataset, + num_workers=8, + shuffle=True, + batch_size=hps.train.batch_size, + pin_memory=True, + drop_last=True, + collate_fn=collate_fn, + ) + symbols = hps.data.punc + hps.data.chars + generator = FlowGenerator_DDI( + len(symbols) + getattr(hps.data, "add_blank", False), + out_channels=hps.data.n_mel_channels, + **hps.model + ).cuda() + optimizer_g = commons.Adam( + generator.parameters(), + scheduler=hps.train.scheduler, + dim_model=hps.model.hidden_channels, + warmup_steps=hps.train.warmup_steps, + lr=hps.train.learning_rate, + betas=hps.train.betas, + eps=hps.train.eps, + ) + + generator.train() + for batch_idx, (x, x_lengths, y, y_lengths) in enumerate(train_loader): + x, x_lengths = x.cuda(), x_lengths.cuda() + y, y_lengths = y.cuda(), y_lengths.cuda() + + _ = generator(x, x_lengths, y, y_lengths, gen=False) + break + + utils.save_checkpoint( + generator, + optimizer_g, + hps.train.learning_rate, + 0, + os.path.join(hps.model_dir, "ddi_G.pth"), + ) + + +if __name__ == "__main__": + main() diff --git a/ttsv/src/glow_tts/models.py b/ttsv/src/glow_tts/models.py new file mode 100644 index 0000000000000000000000000000000000000000..a77596153fa2e7e6fdd52ee0028a0c8ce02050b4 --- /dev/null +++ b/ttsv/src/glow_tts/models.py @@ -0,0 +1,403 @@ +import math +import torch +from torch import nn +from torch.nn import functional as F + +import modules +import commons +import attentions +import 
monotonic_align + + +class DurationPredictor(nn.Module): + def __init__(self, in_channels, filter_channels, kernel_size, p_dropout): + super().__init__() + + self.in_channels = in_channels + self.filter_channels = filter_channels + self.kernel_size = kernel_size + self.p_dropout = p_dropout + + self.drop = nn.Dropout(p_dropout) + self.conv_1 = nn.Conv1d( + in_channels, filter_channels, kernel_size, padding=kernel_size // 2 + ) + self.norm_1 = attentions.LayerNorm(filter_channels) + self.conv_2 = nn.Conv1d( + filter_channels, filter_channels, kernel_size, padding=kernel_size // 2 + ) + self.norm_2 = attentions.LayerNorm(filter_channels) + self.proj = nn.Conv1d(filter_channels, 1, 1) + + def forward(self, x, x_mask): + x = self.conv_1(x * x_mask) + x = torch.relu(x) + x = self.norm_1(x) + x = self.drop(x) + x = self.conv_2(x * x_mask) + x = torch.relu(x) + x = self.norm_2(x) + x = self.drop(x) + x = self.proj(x * x_mask) + return x * x_mask + + +class TextEncoder(nn.Module): + def __init__( + self, + n_vocab, + out_channels, + hidden_channels, + filter_channels, + filter_channels_dp, + n_heads, + n_layers, + kernel_size, + p_dropout, + window_size=None, + block_length=None, + mean_only=False, + prenet=False, + gin_channels=0, + ): + + super().__init__() + + self.n_vocab = n_vocab + self.out_channels = out_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.filter_channels_dp = filter_channels_dp + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.window_size = window_size + self.block_length = block_length + self.mean_only = mean_only + self.prenet = prenet + self.gin_channels = gin_channels + + self.emb = nn.Embedding(n_vocab, hidden_channels) + nn.init.normal_(self.emb.weight, 0.0, hidden_channels ** -0.5) + + if prenet: + self.pre = modules.ConvReluNorm( + hidden_channels, + hidden_channels, + hidden_channels, + kernel_size=5, + n_layers=3, + p_dropout=0.5, + ) + self.encoder = attentions.Encoder( + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + window_size=window_size, + block_length=block_length, + ) + + self.proj_m = nn.Conv1d(hidden_channels, out_channels, 1) + if not mean_only: + self.proj_s = nn.Conv1d(hidden_channels, out_channels, 1) + self.proj_w = DurationPredictor( + hidden_channels + gin_channels, filter_channels_dp, kernel_size, p_dropout + ) + + def forward(self, x, x_lengths, g=None): + x = self.emb(x) * math.sqrt(self.hidden_channels) # [b, t, h] + x = torch.transpose(x, 1, -1) # [b, h, t] + x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to( + x.dtype + ) + + if self.prenet: + x = self.pre(x, x_mask) + x = self.encoder(x, x_mask) + + if g is not None: + g_exp = g.expand(-1, -1, x.size(-1)) + x_dp = torch.cat([torch.detach(x), g_exp], 1) + else: + x_dp = torch.detach(x) + + x_m = self.proj_m(x) * x_mask + if not self.mean_only: + x_logs = self.proj_s(x) * x_mask + else: + x_logs = torch.zeros_like(x_m) + + logw = self.proj_w(x_dp, x_mask) + return x_m, x_logs, logw, x_mask + + +class FlowSpecDecoder(nn.Module): + def __init__( + self, + in_channels, + hidden_channels, + kernel_size, + dilation_rate, + n_blocks, + n_layers, + p_dropout=0.0, + n_split=4, + n_sqz=2, + sigmoid_scale=False, + gin_channels=0, + ): + super().__init__() + + self.in_channels = in_channels + self.hidden_channels = hidden_channels + self.kernel_size = kernel_size + self.dilation_rate = dilation_rate + self.n_blocks = 
n_blocks + self.n_layers = n_layers + self.p_dropout = p_dropout + self.n_split = n_split + self.n_sqz = n_sqz + self.sigmoid_scale = sigmoid_scale + self.gin_channels = gin_channels + + self.flows = nn.ModuleList() + for b in range(n_blocks): + self.flows.append(modules.ActNorm(channels=in_channels * n_sqz)) + self.flows.append( + modules.InvConvNear(channels=in_channels * n_sqz, n_split=n_split) + ) + self.flows.append( + attentions.CouplingBlock( + in_channels * n_sqz, + hidden_channels, + kernel_size=kernel_size, + dilation_rate=dilation_rate, + n_layers=n_layers, + gin_channels=gin_channels, + p_dropout=p_dropout, + sigmoid_scale=sigmoid_scale, + ) + ) + + def forward(self, x, x_mask, g=None, reverse=False): + if not reverse: + flows = self.flows + logdet_tot = 0 + else: + flows = reversed(self.flows) + logdet_tot = None + + if self.n_sqz > 1: + x, x_mask = commons.squeeze(x, x_mask, self.n_sqz) + for f in flows: + if not reverse: + x, logdet = f(x, x_mask, g=g, reverse=reverse) + logdet_tot += logdet + else: + x, logdet = f(x, x_mask, g=g, reverse=reverse) + if self.n_sqz > 1: + x, x_mask = commons.unsqueeze(x, x_mask, self.n_sqz) + return x, logdet_tot + + def store_inverse(self): + for f in self.flows: + f.store_inverse() + + +class FlowGenerator(nn.Module): + def __init__( + self, + n_vocab, + hidden_channels, + filter_channels, + filter_channels_dp, + out_channels, + kernel_size=3, + n_heads=2, + n_layers_enc=6, + p_dropout=0.0, + n_blocks_dec=12, + kernel_size_dec=5, + dilation_rate=5, + n_block_layers=4, + p_dropout_dec=0.0, + n_speakers=0, + gin_channels=0, + n_split=4, + n_sqz=1, + sigmoid_scale=False, + window_size=None, + block_length=None, + mean_only=False, + hidden_channels_enc=None, + hidden_channels_dec=None, + prenet=False, + **kwargs + ): + + super().__init__() + self.n_vocab = n_vocab + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.filter_channels_dp = filter_channels_dp + self.out_channels = out_channels + self.kernel_size = kernel_size + self.n_heads = n_heads + self.n_layers_enc = n_layers_enc + self.p_dropout = p_dropout + self.n_blocks_dec = n_blocks_dec + self.kernel_size_dec = kernel_size_dec + self.dilation_rate = dilation_rate + self.n_block_layers = n_block_layers + self.p_dropout_dec = p_dropout_dec + self.n_speakers = n_speakers + self.gin_channels = gin_channels + self.n_split = n_split + self.n_sqz = n_sqz + self.sigmoid_scale = sigmoid_scale + self.window_size = window_size + self.block_length = block_length + self.mean_only = mean_only + self.hidden_channels_enc = hidden_channels_enc + self.hidden_channels_dec = hidden_channels_dec + self.prenet = prenet + + self.encoder = TextEncoder( + n_vocab, + out_channels, + hidden_channels_enc or hidden_channels, + filter_channels, + filter_channels_dp, + n_heads, + n_layers_enc, + kernel_size, + p_dropout, + window_size=window_size, + block_length=block_length, + mean_only=mean_only, + prenet=prenet, + gin_channels=gin_channels, + ) + + self.decoder = FlowSpecDecoder( + out_channels, + hidden_channels_dec or hidden_channels, + kernel_size_dec, + dilation_rate, + n_blocks_dec, + n_block_layers, + p_dropout=p_dropout_dec, + n_split=n_split, + n_sqz=n_sqz, + sigmoid_scale=sigmoid_scale, + gin_channels=gin_channels, + ) + + if n_speakers > 1: + self.emb_g = nn.Embedding(n_speakers, gin_channels) + nn.init.uniform_(self.emb_g.weight, -0.1, 0.1) + + def forward( + self, + x, + x_lengths, + y=None, + y_lengths=None, + g=None, + gen=False, + noise_scale=1.0, + 
length_scale=1.0, + ): + if g is not None: + g = F.normalize(self.emb_g(g)).unsqueeze(-1) # [b, h] + x_m, x_logs, logw, x_mask = self.encoder(x, x_lengths, g=g) + + if gen: + w = torch.exp(logw) * x_mask * length_scale + w_ceil = torch.ceil(w) + y_lengths = torch.clamp_min(torch.sum(w_ceil, [1, 2]), 1).long() + y_max_length = None + else: + y_max_length = y.size(2) + y, y_lengths, y_max_length = self.preprocess(y, y_lengths, y_max_length) + z_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, y_max_length), 1).to( + x_mask.dtype + ) + attn_mask = torch.unsqueeze(x_mask, -1) * torch.unsqueeze(z_mask, 2) + + if gen: + attn = commons.generate_path( + w_ceil.squeeze(1), attn_mask.squeeze(1) + ).unsqueeze(1) + z_m = torch.matmul( + attn.squeeze(1).transpose(1, 2), x_m.transpose(1, 2) + ).transpose( + 1, 2 + ) # [b, t', t], [b, t, d] -> [b, d, t'] + z_logs = torch.matmul( + attn.squeeze(1).transpose(1, 2), x_logs.transpose(1, 2) + ).transpose( + 1, 2 + ) # [b, t', t], [b, t, d] -> [b, d, t'] + logw_ = torch.log(1e-8 + torch.sum(attn, -1)) * x_mask + + z = (z_m + torch.exp(z_logs) * torch.randn_like(z_m) * noise_scale) * z_mask + y, logdet = self.decoder(z, z_mask, g=g, reverse=True) + return ( + (y, z_m, z_logs, logdet, z_mask), + (x_m, x_logs, x_mask), + (attn, logw, logw_), + ) + else: + z, logdet = self.decoder(y, z_mask, g=g, reverse=False) + with torch.no_grad(): + x_s_sq_r = torch.exp(-2 * x_logs) + logp1 = torch.sum(-0.5 * math.log(2 * math.pi) - x_logs, [1]).unsqueeze( + -1 + ) # [b, t, 1] + logp2 = torch.matmul( + x_s_sq_r.transpose(1, 2), -0.5 * (z ** 2) + ) # [b, t, d] x [b, d, t'] = [b, t, t'] + logp3 = torch.matmul( + (x_m * x_s_sq_r).transpose(1, 2), z + ) # [b, t, d] x [b, d, t'] = [b, t, t'] + logp4 = torch.sum(-0.5 * (x_m ** 2) * x_s_sq_r, [1]).unsqueeze( + -1 + ) # [b, t, 1] + logp = logp1 + logp2 + logp3 + logp4 # [b, t, t'] + + attn = ( + monotonic_align.maximum_path(logp, attn_mask.squeeze(1)) + .unsqueeze(1) + .detach() + ) + z_m = torch.matmul( + attn.squeeze(1).transpose(1, 2), x_m.transpose(1, 2) + ).transpose( + 1, 2 + ) # [b, t', t], [b, t, d] -> [b, d, t'] + z_logs = torch.matmul( + attn.squeeze(1).transpose(1, 2), x_logs.transpose(1, 2) + ).transpose( + 1, 2 + ) # [b, t', t], [b, t, d] -> [b, d, t'] + logw_ = torch.log(1e-8 + torch.sum(attn, -1)) * x_mask + return ( + (z, z_m, z_logs, logdet, z_mask), + (x_m, x_logs, x_mask), + (attn, logw, logw_), + ) + + def preprocess(self, y, y_lengths, y_max_length): + if y_max_length is not None: + y_max_length = (y_max_length // self.n_sqz) * self.n_sqz + y = y[:, :, :y_max_length] + y_lengths = (y_lengths // self.n_sqz) * self.n_sqz + return y, y_lengths, y_max_length + + def store_inverse(self): + self.decoder.store_inverse() diff --git a/ttsv/src/glow_tts/modules.py b/ttsv/src/glow_tts/modules.py new file mode 100644 index 0000000000000000000000000000000000000000..a192251aaccb036780d77d6c8b538b652a5e24e2 --- /dev/null +++ b/ttsv/src/glow_tts/modules.py @@ -0,0 +1,276 @@ +import copy +import math +import numpy as np +import scipy +import torch +from torch import nn +from torch.nn import functional as F + +import commons + + +class LayerNorm(nn.Module): + def __init__(self, channels, eps=1e-4): + super().__init__() + self.channels = channels + self.eps = eps + + self.gamma = nn.Parameter(torch.ones(channels)) + self.beta = nn.Parameter(torch.zeros(channels)) + + def forward(self, x): + n_dims = len(x.shape) + mean = torch.mean(x, 1, keepdim=True) + variance = torch.mean((x - mean) ** 2, 1, keepdim=True) + + x = (x - 
mean) * torch.rsqrt(variance + self.eps) + + shape = [1, -1] + [1] * (n_dims - 2) + x = x * self.gamma.view(*shape) + self.beta.view(*shape) + return x + + +class ConvReluNorm(nn.Module): + def __init__( + self, + in_channels, + hidden_channels, + out_channels, + kernel_size, + n_layers, + p_dropout, + ): + super().__init__() + self.in_channels = in_channels + self.hidden_channels = hidden_channels + self.out_channels = out_channels + self.kernel_size = kernel_size + self.n_layers = n_layers + self.p_dropout = p_dropout + assert n_layers > 1, "Number of layers should be larger than 0." + + self.conv_layers = nn.ModuleList() + self.norm_layers = nn.ModuleList() + self.conv_layers.append( + nn.Conv1d( + in_channels, hidden_channels, kernel_size, padding=kernel_size // 2 + ) + ) + self.norm_layers.append(LayerNorm(hidden_channels)) + self.relu_drop = nn.Sequential(nn.ReLU(), nn.Dropout(p_dropout)) + for _ in range(n_layers - 1): + self.conv_layers.append( + nn.Conv1d( + hidden_channels, + hidden_channels, + kernel_size, + padding=kernel_size // 2, + ) + ) + self.norm_layers.append(LayerNorm(hidden_channels)) + self.proj = nn.Conv1d(hidden_channels, out_channels, 1) + self.proj.weight.data.zero_() + self.proj.bias.data.zero_() + + def forward(self, x, x_mask): + x_org = x + for i in range(self.n_layers): + x = self.conv_layers[i](x * x_mask) + x = self.norm_layers[i](x) + x = self.relu_drop(x) + x = x_org + self.proj(x) + return x * x_mask + + +class WN(torch.nn.Module): + def __init__( + self, + in_channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + gin_channels=0, + p_dropout=0, + ): + super(WN, self).__init__() + assert kernel_size % 2 == 1 + assert hidden_channels % 2 == 0 + self.in_channels = in_channels + self.hidden_channels = hidden_channels + self.kernel_size = (kernel_size,) + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.gin_channels = gin_channels + self.p_dropout = p_dropout + + self.in_layers = torch.nn.ModuleList() + self.res_skip_layers = torch.nn.ModuleList() + self.drop = nn.Dropout(p_dropout) + + if gin_channels != 0: + cond_layer = torch.nn.Conv1d( + gin_channels, 2 * hidden_channels * n_layers, 1 + ) + self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name="weight") + + for i in range(n_layers): + dilation = dilation_rate ** i + padding = int((kernel_size * dilation - dilation) / 2) + in_layer = torch.nn.Conv1d( + hidden_channels, + 2 * hidden_channels, + kernel_size, + dilation=dilation, + padding=padding, + ) + in_layer = torch.nn.utils.weight_norm(in_layer, name="weight") + self.in_layers.append(in_layer) + + # last one is not necessary + if i < n_layers - 1: + res_skip_channels = 2 * hidden_channels + else: + res_skip_channels = hidden_channels + + res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1) + res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name="weight") + self.res_skip_layers.append(res_skip_layer) + + def forward(self, x, x_mask=None, g=None, **kwargs): + output = torch.zeros_like(x) + n_channels_tensor = torch.IntTensor([self.hidden_channels]) + + if g is not None: + g = self.cond_layer(g) + + for i in range(self.n_layers): + x_in = self.in_layers[i](x) + x_in = self.drop(x_in) + if g is not None: + cond_offset = i * 2 * self.hidden_channels + g_l = g[:, cond_offset : cond_offset + 2 * self.hidden_channels, :] + else: + g_l = torch.zeros_like(x_in) + + acts = commons.fused_add_tanh_sigmoid_multiply(x_in, g_l, n_channels_tensor) + + res_skip_acts = 
self.res_skip_layers[i](acts) + if i < self.n_layers - 1: + x = (x + res_skip_acts[:, : self.hidden_channels, :]) * x_mask + output = output + res_skip_acts[:, self.hidden_channels :, :] + else: + output = output + res_skip_acts + return output * x_mask + + def remove_weight_norm(self): + if self.gin_channels != 0: + torch.nn.utils.remove_weight_norm(self.cond_layer) + for l in self.in_layers: + torch.nn.utils.remove_weight_norm(l) + for l in self.res_skip_layers: + torch.nn.utils.remove_weight_norm(l) + + +class ActNorm(nn.Module): + def __init__(self, channels, ddi=False, **kwargs): + super().__init__() + self.channels = channels + self.initialized = not ddi + + self.logs = nn.Parameter(torch.zeros(1, channels, 1)) + self.bias = nn.Parameter(torch.zeros(1, channels, 1)) + + def forward(self, x, x_mask=None, reverse=False, **kwargs): + if x_mask is None: + x_mask = torch.ones(x.size(0), 1, x.size(2)).to( + device=x.device, dtype=x.dtype + ) + x_len = torch.sum(x_mask, [1, 2]) + if not self.initialized: + self.initialize(x, x_mask) + self.initialized = True + + if reverse: + z = (x - self.bias) * torch.exp(-self.logs) * x_mask + logdet = None + else: + z = (self.bias + torch.exp(self.logs) * x) * x_mask + logdet = torch.sum(self.logs) * x_len # [b] + + return z, logdet + + def store_inverse(self): + pass + + def set_ddi(self, ddi): + self.initialized = not ddi + + def initialize(self, x, x_mask): + with torch.no_grad(): + denom = torch.sum(x_mask, [0, 2]) + m = torch.sum(x * x_mask, [0, 2]) / denom + m_sq = torch.sum(x * x * x_mask, [0, 2]) / denom + v = m_sq - (m ** 2) + logs = 0.5 * torch.log(torch.clamp_min(v, 1e-6)) + + bias_init = ( + (-m * torch.exp(-logs)).view(*self.bias.shape).to(dtype=self.bias.dtype) + ) + logs_init = (-logs).view(*self.logs.shape).to(dtype=self.logs.dtype) + + self.bias.data.copy_(bias_init) + self.logs.data.copy_(logs_init) + + +class InvConvNear(nn.Module): + def __init__(self, channels, n_split=4, no_jacobian=False, **kwargs): + super().__init__() + assert n_split % 2 == 0 + self.channels = channels + self.n_split = n_split + self.no_jacobian = no_jacobian + + w_init = torch.qr(torch.FloatTensor(self.n_split, self.n_split).normal_())[0] + if torch.det(w_init) < 0: + w_init[:, 0] = -1 * w_init[:, 0] + self.weight = nn.Parameter(w_init) + + def forward(self, x, x_mask=None, reverse=False, **kwargs): + b, c, t = x.size() + assert c % self.n_split == 0 + if x_mask is None: + x_mask = 1 + x_len = torch.ones((b,), dtype=x.dtype, device=x.device) * t + else: + x_len = torch.sum(x_mask, [1, 2]) + + x = x.view(b, 2, c // self.n_split, self.n_split // 2, t) + x = ( + x.permute(0, 1, 3, 2, 4) + .contiguous() + .view(b, self.n_split, c // self.n_split, t) + ) + + if reverse: + if hasattr(self, "weight_inv"): + weight = self.weight_inv + else: + weight = torch.inverse(self.weight.float()).to(dtype=self.weight.dtype) + logdet = None + else: + weight = self.weight + if self.no_jacobian: + logdet = 0 + else: + logdet = torch.logdet(self.weight) * (c / self.n_split) * x_len # [b] + + weight = weight.view(self.n_split, self.n_split, 1, 1) + z = F.conv2d(x, weight) + + z = z.view(b, 2, self.n_split // 2, c // self.n_split, t) + z = z.permute(0, 1, 3, 2, 4).contiguous().view(b, c, t) * x_mask + return z, logdet + + def store_inverse(self): + self.weight_inv = torch.inverse(self.weight.float()).to(dtype=self.weight.dtype) diff --git a/ttsv/src/glow_tts/monotonic_align/monotonic_align/__init__.py b/ttsv/src/glow_tts/monotonic_align/monotonic_align/__init__.py new file mode 100644 
index 0000000000000000000000000000000000000000..47a4dbf3177302af6b8e7d08b0b78343b1329efa --- /dev/null +++ b/ttsv/src/glow_tts/monotonic_align/monotonic_align/__init__.py @@ -0,0 +1,5 @@ +import pkg_resources + +__version__ = pkg_resources.get_distribution("monotonic_align").version + +from monotonic_align.mas import * diff --git a/ttsv/src/glow_tts/monotonic_align/monotonic_align/core.pyx b/ttsv/src/glow_tts/monotonic_align/monotonic_align/core.pyx new file mode 100644 index 0000000000000000000000000000000000000000..6aabccc4c408cb1b555e2abb4d73e0d1ce4d346e --- /dev/null +++ b/ttsv/src/glow_tts/monotonic_align/monotonic_align/core.pyx @@ -0,0 +1,45 @@ +import numpy as np +cimport numpy as np +cimport cython +from cython.parallel import prange + + +@cython.boundscheck(False) +@cython.wraparound(False) +cdef void maximum_path_each(int[:,::1] path, float[:,::1] value, int t_x, int t_y, float max_neg_val) nogil: + cdef int x + cdef int y + cdef float v_prev + cdef float v_cur + cdef float tmp + cdef int index = t_x - 1 + + for y in range(t_y): + for x in range(max(0, t_x + y - t_y), min(t_x, y + 1)): + if x == y: + v_cur = max_neg_val + else: + v_cur = value[x, y-1] + if x == 0: + if y == 0: + v_prev = 0. + else: + v_prev = max_neg_val + else: + v_prev = value[x-1, y-1] + value[x, y] = max(v_cur, v_prev) + value[x, y] + + for y in range(t_y - 1, -1, -1): + path[index, y] = 1 + if index != 0 and (index == y or value[index, y-1] < value[index-1, y-1]): + index = index - 1 + + +@cython.boundscheck(False) +@cython.wraparound(False) +cpdef void maximum_path_c(int[:,:,::1] paths, float[:,:,::1] values, int[::1] t_xs, int[::1] t_ys, float max_neg_val=-1e9) nogil: + cdef int b = values.shape[0] + + cdef int i + for i in prange(b, nogil=True): + maximum_path_each(paths[i], values[i], t_xs[i], t_ys[i], max_neg_val) diff --git a/ttsv/src/glow_tts/monotonic_align/monotonic_align/mas.py b/ttsv/src/glow_tts/monotonic_align/monotonic_align/mas.py new file mode 100644 index 0000000000000000000000000000000000000000..207ab3e858389ec06c902fd6f5bec6c5da2996af --- /dev/null +++ b/ttsv/src/glow_tts/monotonic_align/monotonic_align/mas.py @@ -0,0 +1,57 @@ +from typing import overload +import numpy as np +import torch +from monotonic_align.core import maximum_path_c + + +def mask_from_len(lens: torch.Tensor, max_len=None): + """ + Make a `mask` from lens. + + :param inputs: (B, T, D) + :param lens: (B) + + :return: + `mask`: (B, T) + """ + if max_len is None: + max_len = lens.max() + index = torch.arange(max_len).to(lens).view(1, -1) + return index < lens.unsqueeze(1) # (B, T) + + +def mask_from_lens( + similarity: torch.Tensor, + symbol_lens: torch.Tensor, + mel_lens: torch.Tensor, +): + """ + :param similarity: (B, S, T) + :param symbol_lens: (B,) + :param mel_lens: (B,) + """ + _, S, T = similarity.size() + mask_S = mask_from_len(symbol_lens, S) + mask_T = mask_from_len(mel_lens, T) + mask_ST = mask_S.unsqueeze(2) * mask_T.unsqueeze(1) + return mask_ST.to(similarity) + + +def maximum_path(value, mask=None): + """Cython optimised version. 
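+    Runs a batched monotonic alignment search: a forward max-sum pass over
+    value followed by backtracking, delegated to the Cython kernel
+    maximum_path_c defined in core.pyx.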
+ value: [b, t_x, t_y] + mask: [b, t_x, t_y] + """ + if mask is None: + mask = torch.zeros_like(value) + + value = value * mask + device = value.device + dtype = value.dtype + value = value.data.cpu().numpy().astype(np.float32) + path = np.zeros_like(value).astype(np.int32) + mask = mask.data.cpu().numpy() + t_x_max = mask.sum(1)[:, 0].astype(np.int32) + t_y_max = mask.sum(2)[:, 0].astype(np.int32) + maximum_path_c(path, value, t_x_max, t_y_max) + return torch.from_numpy(path).to(device=device, dtype=dtype) diff --git a/ttsv/src/glow_tts/monotonic_align/pyproject.toml b/ttsv/src/glow_tts/monotonic_align/pyproject.toml new file mode 100644 index 0000000000000000000000000000000000000000..ea6358a08fd8d6fc177e2361a82e1a5cc7b837d9 --- /dev/null +++ b/ttsv/src/glow_tts/monotonic_align/pyproject.toml @@ -0,0 +1,7 @@ +[build-system] +requires = [ + "wheel", + "setuptools", + "cython>=0.24.0", + "numpy= win_length + # get window and zero center pad it to filter_length + fft_window = get_window(window, win_length, fftbins=True) + fft_window = pad_center(fft_window, filter_length) + fft_window = torch.from_numpy(fft_window).float() + + # window the bases + forward_basis *= fft_window + inverse_basis *= fft_window + + self.register_buffer("forward_basis", forward_basis.float()) + self.register_buffer("inverse_basis", inverse_basis.float()) + + def transform(self, input_data): + num_batches = input_data.size(0) + num_samples = input_data.size(1) + + self.num_samples = num_samples + + if input_data.device.type == "cuda": + # similar to librosa, reflect-pad the input + input_data = input_data.view(num_batches, 1, num_samples) + input_data = F.pad( + input_data.unsqueeze(1), + (int(self.filter_length / 2), int(self.filter_length / 2), 0, 0), + mode="reflect", + ) + input_data = input_data.squeeze(1) + + forward_transform = F.conv1d( + input_data, self.forward_basis, stride=self.hop_length, padding=0 + ) + + cutoff = int((self.filter_length / 2) + 1) + real_part = forward_transform[:, :cutoff, :] + imag_part = forward_transform[:, cutoff:, :] + else: + x = input_data.detach().numpy() + real_part = [] + imag_part = [] + for y in x: + y_ = stft( + y, self.filter_length, self.hop_length, self.win_length, self.window + ) + real_part.append(y_.real[None, :, :]) + imag_part.append(y_.imag[None, :, :]) + real_part = np.concatenate(real_part, 0) + imag_part = np.concatenate(imag_part, 0) + + real_part = torch.from_numpy(real_part).to(input_data.dtype) + imag_part = torch.from_numpy(imag_part).to(input_data.dtype) + + magnitude = torch.sqrt(real_part ** 2 + imag_part ** 2) + phase = torch.atan2(imag_part.data, real_part.data) + + return magnitude, phase + + def inverse(self, magnitude, phase): + recombine_magnitude_phase = torch.cat( + [magnitude * torch.cos(phase), magnitude * torch.sin(phase)], dim=1 + ) + + if magnitude.device.type == "cuda": + inverse_transform = F.conv_transpose1d( + recombine_magnitude_phase, + self.inverse_basis, + stride=self.hop_length, + padding=0, + ) + + if self.window is not None: + window_sum = window_sumsquare( + self.window, + magnitude.size(-1), + hop_length=self.hop_length, + win_length=self.win_length, + n_fft=self.filter_length, + dtype=np.float32, + ) + # remove modulation effects + approx_nonzero_indices = torch.from_numpy( + np.where(window_sum > tiny(window_sum))[0] + ) + window_sum = torch.from_numpy(window_sum).to(inverse_transform.device) + inverse_transform[:, :, approx_nonzero_indices] /= window_sum[ + approx_nonzero_indices + ] + + # scale by hop ratio + 
inverse_transform *= float(self.filter_length) / self.hop_length + + inverse_transform = inverse_transform[:, :, int(self.filter_length / 2) :] + inverse_transform = inverse_transform[ + :, :, : -int(self.filter_length / 2) : + ] + inverse_transform = inverse_transform.squeeze(1) + else: + x_org = recombine_magnitude_phase.detach().numpy() + n_b, n_f, n_t = x_org.shape + x = np.empty([n_b, n_f // 2, n_t], dtype=np.complex64) + x.real = x_org[:, : n_f // 2] + x.imag = x_org[:, n_f // 2 :] + inverse_transform = [] + for y in x: + y_ = istft(y, self.hop_length, self.win_length, self.window) + inverse_transform.append(y_[None, :]) + inverse_transform = np.concatenate(inverse_transform, 0) + inverse_transform = torch.from_numpy(inverse_transform).to( + recombine_magnitude_phase.dtype + ) + + return inverse_transform + + def forward(self, input_data): + self.magnitude, self.phase = self.transform(input_data) + reconstruction = self.inverse(self.magnitude, self.phase) + return reconstruction diff --git a/ttsv/src/glow_tts/t2s_fastapi.py b/ttsv/src/glow_tts/t2s_fastapi.py new file mode 100644 index 0000000000000000000000000000000000000000..e034fc01a4a5bcd54b365a49dad2e907b57504a1 --- /dev/null +++ b/ttsv/src/glow_tts/t2s_fastapi.py @@ -0,0 +1,63 @@ +from starlette.responses import StreamingResponse +from texttospeech import MelToWav, TextToMel +from typing import Optional +from pydantic import BaseModel +from fastapi import FastAPI, HTTPException +import uvicorn +import base64 + +app = FastAPI() + + +class TextJson(BaseModel): + text: str + lang: Optional[str] = "hi" + gender: Optional[str] = "male" + + +glow_hi_male = TextToMel(glow_model_dir="", device="") +glow_hi_female = TextToMel(glow_model_dir="", device="") +hifi_hi = MelToWav(hifi_model_dir="", device="") + + +available_choice = { + "hi_male": [glow_hi_male, hifi_hi], + "hi_female": [glow_hi_female, hifi_hi], +} + + +@app.post("/TTS/") +async def tts(input: TextJson): + text = input.text + lang = input.lang + gender = input.gender + + choice = lang + "_" + gender + if choice in available_choice.keys(): + t2s = available_choice[choice] + else: + raise HTTPException( + status_code=400, detail={"error": "Requested model not found"} + ) + + if text: + mel = t2s[0].generate_mel(text) + data, sr = t2s[1].generate_wav(mel) + t2s.save_audio("out.wav", data, sr) + else: + raise HTTPException(status_code=400, detail={"error": "No text"}) + + ## to return outpur as a file + # audio = open('out.wav', mode='rb') + # return StreamingResponse(audio, media_type="audio/wav") + + with open("out.wav", "rb") as audio_file: + encoded_bytes = base64.b64encode(audio_file.read()) + encoded_string = encoded_bytes.decode() + return {"encoding": "base64", "data": encoded_string, "sr": sr} + + +if __name__ == "__main__": + uvicorn.run( + "t2s_fastapi:app", host="127.0.0.1", port=5000, log_level="info", reload=True + ) diff --git a/ttsv/src/glow_tts/t2s_gradio.py b/ttsv/src/glow_tts/t2s_gradio.py new file mode 100644 index 0000000000000000000000000000000000000000..bd9acbe68761759ff259f4476bb3df57a75c78ff --- /dev/null +++ b/ttsv/src/glow_tts/t2s_gradio.py @@ -0,0 +1,24 @@ +import gradio as gr +from texttospeech import TextToMel, MelToWav + +text_to_mel = TextToMel( + glow_model_dir="/path/to/glow-tts/checkpoint/dir", device="cuda" +) +mel_to_wav = MelToWav(hifi_model_dir="/path/to/glow-tts/checkpoint/dir", device="cuda") + + +def run_tts(text): + mel = text_to_mel.generate_mel(text) + audio, sr = mel_to_wav.generate_wav(mel) + return (sr, audio) + + +# text = " 
सीआईएसएफ में उप-निरीक्षक महावीर प्रसाद गोदरा को मरणोपरांत 'शौर्य चक्र' से सम्मानित किया गया। " +# run_tts(text) + +textbox = gr.inputs.Textbox( + placeholder="Enter Telugu text here", default="", label="TTS" +) +op = gr.outputs.Audio(type="numpy", label=None) +iface = gr.Interface(fn=run_tts, inputs=textbox, outputs=op) +iface.launch(share=True) diff --git a/ttsv/src/glow_tts/text/__init__.py b/ttsv/src/glow_tts/text/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..3f5aa62bfcd56165b85d064f5ca0ba59fbe34a72 --- /dev/null +++ b/ttsv/src/glow_tts/text/__init__.py @@ -0,0 +1,84 @@ +""" from https://github.com/keithito/tacotron """ +import re +from text import cleaners + +# Regular expression matching text enclosed in curly braces: +_curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)') + + +def get_arpabet(word, dictionary): + word_arpabet = dictionary.lookup(word) + if word_arpabet is not None: + return "{" + word_arpabet[0] + "}" + else: + return word + + +def text_to_sequence(text, symbols, cleaner_names, dictionary=None): + '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text. + + The text can optionally have ARPAbet sequences enclosed in curly braces embedded + in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street." + + Args: + text: string to convert to a sequence + cleaner_names: names of the cleaner functions to run the text through + dictionary: arpabet class with arpabet dictionary + + Returns: + List of integers corresponding to the symbols in the text + ''' + # Mappings from symbol to numeric ID and vice versa: + global _id_to_symbol, _symbol_to_id + _symbol_to_id = {s: i for i, s in enumerate(symbols)} + _id_to_symbol = {i: s for i, s in enumerate(symbols)} + + sequence = [] + + space = _symbols_to_sequence(' ') + # Check for curly braces and treat their contents as ARPAbet: + while len(text): + m = _curly_re.match(text) + if not m: + clean_text = _clean_text(text, cleaner_names) + if dictionary is not None: + clean_text = [get_arpabet(w, dictionary) for w in clean_text.split(" ")] + for i in range(len(clean_text)): + t = clean_text[i] + if t.startswith("{"): + sequence += _arpabet_to_sequence(t[1:-1]) + else: + sequence += _symbols_to_sequence(t) + sequence += space + else: + sequence += _symbols_to_sequence(clean_text) + break + sequence += _symbols_to_sequence(_clean_text(m.group(1), cleaner_names)) + sequence += _arpabet_to_sequence(m.group(2)) + text = m.group(3) + + # remove trailing space + if dictionary is not None: + sequence = sequence[:-1] if sequence[-1] == space[0] else sequence + return sequence + + +def _clean_text(text, cleaner_names): + for name in cleaner_names: + cleaner = getattr(cleaners, name) + if not cleaner: + raise Exception('Unknown cleaner: %s' % name) + text = cleaner(text) + return text + + +def _symbols_to_sequence(symbols): + return [_symbol_to_id[s] for s in symbols if _should_keep_symbol(s)] + + +def _arpabet_to_sequence(text): + return _symbols_to_sequence(['@' + s for s in text.split()]) + + +def _should_keep_symbol(s): + return s in _symbol_to_id and s is not '_' and s is not '~' \ No newline at end of file diff --git a/ttsv/src/glow_tts/text/cleaners.py b/ttsv/src/glow_tts/text/cleaners.py new file mode 100644 index 0000000000000000000000000000000000000000..a7d4e029baa436e88e4d68090e886afdd998a68d --- /dev/null +++ b/ttsv/src/glow_tts/text/cleaners.py @@ -0,0 +1,78 @@ +import re + +from unidecode import unidecode +from .numbers import normalize_numbers + + + + +# Regular 
expression matching whitespace: +_whitespace_re = re.compile(r"\s+") + +def lowercase(text): + return text.lower() + +def collapse_whitespace(text): + return re.sub(_whitespace_re, " ", text) + +def basic_indic_cleaners(text): + """Basic pipeline that collapses whitespace without transliteration.""" + text = collapse_whitespace(text) + return text + + +def english_cleaner(text): + text = text.lower().replace('‘','\'').replace('’','\'') + return text + + +def lowercase(text): + return text.lower() + +def convert_to_ascii(text): + return unidecode(text) + +def expand_numbers(text): + return normalize_numbers(text) + +def expand_abbreviations(text): + for regex, replacement in _abbreviations: + text = re.sub(regex, replacement, text) + return text + +_abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [ + ('mrs', 'missus'), + ('mr', 'mister'), + ('dr', 'doctor'), + ('st', 'saint'), + ('co', 'company'), + ('jr', 'junior'), + ('maj', 'major'), + ('gen', 'general'), + ('drs', 'doctors'), + ('rev', 'reverend'), + ('lt', 'lieutenant'), + ('hon', 'honorable'), + ('sgt', 'sergeant'), + ('capt', 'captain'), + ('esq', 'esquire'), + ('ltd', 'limited'), + ('col', 'colonel'), + ('ft', 'fort'), + ('pvt', 'private'), + ('rs', 'Rupees') +]] + + + + + + +def english_cleaners(text): + '''Pipeline for English text, including number and abbreviation expansion.''' + text = convert_to_ascii(text) + text = lowercase(text) + text = expand_numbers(text) + text = expand_abbreviations(text) + text = collapse_whitespace(text) + return text diff --git a/ttsv/src/glow_tts/text/numbers.py b/ttsv/src/glow_tts/text/numbers.py new file mode 100644 index 0000000000000000000000000000000000000000..491634d692ee71e7ea0e5213b513e15be825c9b2 --- /dev/null +++ b/ttsv/src/glow_tts/text/numbers.py @@ -0,0 +1,69 @@ +import inflect +import re + + +_inflect = inflect.engine() +_comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])') +_decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)') +_pounds_re = re.compile(r'£([0-9\,]*[0-9]+)') +_dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)') +_ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)') +_number_re = re.compile(r'[0-9]+') + + +def _remove_commas(m): + return m.group(1).replace(',', '') + + +def _expand_decimal_point(m): + return m.group(1).replace('.', ' point ') + + +def _expand_dollars(m): + match = m.group(1) + parts = match.split('.') + if len(parts) > 2: + return match + ' dollars' # Unexpected format + dollars = int(parts[0]) if parts[0] else 0 + cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0 + if dollars and cents: + dollar_unit = 'dollar' if dollars == 1 else 'dollars' + cent_unit = 'cent' if cents == 1 else 'cents' + return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit) + elif dollars: + dollar_unit = 'dollar' if dollars == 1 else 'dollars' + return '%s %s' % (dollars, dollar_unit) + elif cents: + cent_unit = 'cent' if cents == 1 else 'cents' + return '%s %s' % (cents, cent_unit) + else: + return 'zero dollars' + + +def _expand_ordinal(m): + return _inflect.number_to_words(m.group(0)) + + +def _expand_number(m): + num = int(m.group(0)) + if num > 1000 and num < 3000: + if num == 2000: + return 'two thousand' + elif num > 2000 and num < 2010: + return 'two thousand ' + _inflect.number_to_words(num % 100) + elif num % 100 == 0: + return _inflect.number_to_words(num // 100) + ' hundred' + else: + return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ') + else: + return _inflect.number_to_words(num, 
andword='') + + +def normalize_numbers(text): + text = re.sub(_comma_number_re, _remove_commas, text) + text = re.sub(_pounds_re, r'\1 pounds', text) + text = re.sub(_dollars_re, _expand_dollars, text) + text = re.sub(_decimal_number_re, _expand_decimal_point, text) + text = re.sub(_ordinal_re, _expand_ordinal, text) + text = re.sub(_number_re, _expand_number, text) + return text \ No newline at end of file diff --git a/ttsv/src/glow_tts/texttospeech.py b/ttsv/src/glow_tts/texttospeech.py new file mode 100644 index 0000000000000000000000000000000000000000..3c88925cac0c56e52d35acfa5d6d7e5ce51329c7 --- /dev/null +++ b/ttsv/src/glow_tts/texttospeech.py @@ -0,0 +1,146 @@ +from __future__ import absolute_import, division, print_function, unicode_literals +from typing import Tuple + +from scipy.io.wavfile import write +from hifi.env import AttrDict +from hifi.models import Generator + +import numpy as np +import os +import json + +import torch +from text import text_to_sequence +import commons +import models +import utils +import sys +from argparse import ArgumentParser + + +def check_directory(dir): + if not os.path.exists(dir): + sys.exit("Error: {} directory does not exist".format(dir)) + + +class TextToMel: + def __init__(self, glow_model_dir, device="cuda"): + self.glow_model_dir = glow_model_dir + check_directory(self.glow_model_dir) + self.device = device + self.hps, self.glow_tts_model = self.load_glow_tts() + pass + + def load_glow_tts(self): + hps = utils.get_hparams_from_dir(self.glow_model_dir) + checkpoint_path = utils.latest_checkpoint_path(self.glow_model_dir) + symbols = list(hps.data.punc) + list(hps.data.chars) + glow_tts_model = models.FlowGenerator( + len(symbols) + getattr(hps.data, "add_blank", False), + out_channels=hps.data.n_mel_channels, + **hps.model + ) # .to(self.device) + + if self.device == "cuda": + glow_tts_model.to("cuda") + + utils.load_checkpoint(checkpoint_path, glow_tts_model) + glow_tts_model.decoder.store_inverse() + _ = glow_tts_model.eval() + + return hps, glow_tts_model + + def generate_mel(self, text, noise_scale=0.667, length_scale=1.0): + symbols = list(self.hps.data.punc) + list(self.hps.data.chars) + cleaner = self.hps.data.text_cleaners + if getattr(self.hps.data, "add_blank", False): + text_norm = text_to_sequence(text, symbols, cleaner) + text_norm = commons.intersperse(text_norm, len(symbols)) + else: # If not using "add_blank" option during training, adding spaces at the beginning and the end of utterance improves quality + text = " " + text.strip() + " " + text_norm = text_to_sequence(text, symbols, cleaner) + + sequence = np.array(text_norm)[None, :] + + if self.device == "cuda": + x_tst = torch.autograd.Variable(torch.from_numpy(sequence)).cuda().long() + x_tst_lengths = torch.tensor([x_tst.shape[1]]).cuda() + else: + x_tst = torch.autograd.Variable(torch.from_numpy(sequence)).long() + x_tst_lengths = torch.tensor([x_tst.shape[1]]) + + with torch.no_grad(): + (y_gen_tst, *_), *_, (attn_gen, *_) = self.glow_tts_model( + x_tst, + x_tst_lengths, + gen=True, + noise_scale=noise_scale, + length_scale=length_scale, + ) + + return y_gen_tst + #return y_gen_tst.cpu().detach().numpy() + + +class MelToWav: + def __init__(self, hifi_model_dir, device="cuda"): + self.hifi_model_dir = hifi_model_dir + check_directory(self.hifi_model_dir) + self.device = device + self.h, self.hifi_gan_generator = self.load_hifi_gan() + pass + + def load_hifi_gan(self): + checkpoint_path = utils.latest_checkpoint_path(self.hifi_model_dir, regex="g_*") + config_file = 
os.path.join(self.hifi_model_dir, "config.json") + data = open(config_file).read() + json_config = json.loads(data) + h = AttrDict(json_config) + torch.manual_seed(h.seed) + + generator = Generator(h).to(self.device) + + assert os.path.isfile(checkpoint_path) + print("Loading '{}'".format(checkpoint_path)) + state_dict_g = torch.load(checkpoint_path, map_location=self.device) + print("Complete.") + + generator.load_state_dict(state_dict_g["generator"]) + + generator.eval() + generator.remove_weight_norm() + + return h, generator + + def generate_wav(self, mel): + #mel = torch.FloatTensor(mel).to(self.device) + + y_g_hat = self.hifi_gan_generator(mel.to(self.device)) # passing through vocoder + audio = y_g_hat.squeeze() + audio = audio * 32768.0 + audio = audio.cpu().detach().numpy().astype("int16") + + return audio, self.h.sampling_rate + + + + + +if __name__ == "__main__": + + parser = ArgumentParser() + parser.add_argument("-m", "--model", required=True, type=str) + parser.add_argument("-g", "--gan", required=True, type=str) + parser.add_argument("-d", "--device", type=str, default="cpu") + parser.add_argument("-t", "--text", type=str, required=True) + parser.add_argument("-w", "--wav", type=str, required=True) + + args = parser.parse_args() + + text_to_mel = TextToMel(glow_model_dir=args.model, device=args.device) + mel_to_wav = MelToWav(hifi_model_dir=args.gan, device=args.device) + + mel = text_to_mel.generate_mel(args.text) + audio, sr = mel_to_wav.generate_wav(mel) + + write(filename=args.wav, rate=sr, data=audio) \ No newline at end of file diff --git a/ttsv/src/glow_tts/train.py b/ttsv/src/glow_tts/train.py new file mode 100644 index 0000000000000000000000000000000000000000..79bf515a707b309e82e9686c140658f23acf1b91 --- /dev/null +++ b/ttsv/src/glow_tts/train.py @@ -0,0 +1,286 @@ +import os +import json +import argparse +import math +import torch +from torch import nn, optim +from torch.nn import functional as F +from torch.utils.data import DataLoader +from torch.utils.tensorboard import SummaryWriter +import torch.multiprocessing as mp +import torch.distributed as dist +from apex.parallel import DistributedDataParallel as DDP +from apex import amp + +from data_utils import TextMelLoader, TextMelCollate +import models +import commons +import utils + + +global_step = 0 + + +def main(): + """Assume Single Node Multi GPUs Training Only""" + assert torch.cuda.is_available(), "CPU training is not allowed." 
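+    # Spawn one training process per visible GPU; each rank runs train_and_eval
+    # and joins the NCCL process group via the MASTER_ADDR/MASTER_PORT set below.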
+ + n_gpus = torch.cuda.device_count() + os.environ["MASTER_ADDR"] = "localhost" + os.environ["MASTER_PORT"] = "80000" + + hps = utils.get_hparams() + mp.spawn( + train_and_eval, + nprocs=n_gpus, + args=( + n_gpus, + hps, + ), + ) + + +def train_and_eval(rank, n_gpus, hps): + global global_step + if rank == 0: + logger = utils.get_logger(hps.log_dir) + logger.info(hps) + utils.check_git_hash(hps.log_dir) + writer = SummaryWriter(log_dir=hps.log_dir) + writer_eval = SummaryWriter(log_dir=os.path.join(hps.log_dir, "eval")) + + dist.init_process_group( + backend="nccl", init_method="env://", world_size=n_gpus, rank=rank + ) + torch.manual_seed(hps.train.seed) + torch.cuda.set_device(rank) + + train_dataset = TextMelLoader(hps.data.training_files, hps.data) + train_sampler = torch.utils.data.distributed.DistributedSampler( + train_dataset, num_replicas=n_gpus, rank=rank, shuffle=True + ) + collate_fn = TextMelCollate(1) + train_loader = DataLoader( + train_dataset, + num_workers=8, + shuffle=False, + batch_size=hps.train.batch_size, + pin_memory=True, + drop_last=True, + collate_fn=collate_fn, + sampler=train_sampler, + ) + if rank == 0: + val_dataset = TextMelLoader(hps.data.validation_files, hps.data) + val_loader = DataLoader( + val_dataset, + num_workers=8, + shuffle=False, + batch_size=hps.train.batch_size, + pin_memory=True, + drop_last=True, + collate_fn=collate_fn, + ) + symbols = hps.data.punc + hps.data.chars + generator = models.FlowGenerator( + n_vocab=len(symbols) + getattr(hps.data, "add_blank", False), + out_channels=hps.data.n_mel_channels, + **hps.model + ).cuda(rank) + optimizer_g = commons.Adam( + generator.parameters(), + scheduler=hps.train.scheduler, + dim_model=hps.model.hidden_channels, + warmup_steps=hps.train.warmup_steps, + lr=hps.train.learning_rate, + betas=hps.train.betas, + eps=hps.train.eps, + ) + if hps.train.fp16_run: + generator, optimizer_g._optim = amp.initialize( + generator, optimizer_g._optim, opt_level="O1" + ) + generator = DDP(generator) + epoch_str = 1 + global_step = 0 + try: + _, _, _, epoch_str = utils.load_checkpoint( + utils.latest_checkpoint_path(hps.model_dir, "G_*.pth"), + generator, + optimizer_g, + ) + epoch_str += 1 + optimizer_g.step_num = (epoch_str - 1) * len(train_loader) + optimizer_g._update_learning_rate() + global_step = (epoch_str - 1) * len(train_loader) + except: + if hps.train.ddi and os.path.isfile(os.path.join(hps.model_dir, "ddi_G.pth")): + _ = utils.load_checkpoint( + os.path.join(hps.model_dir, "ddi_G.pth"), generator, optimizer_g + ) + + for epoch in range(epoch_str, hps.train.epochs + 1): + if rank == 0: + train( + rank, epoch, hps, generator, optimizer_g, train_loader, logger, writer + ) + evaluate( + rank, + epoch, + hps, + generator, + optimizer_g, + val_loader, + logger, + writer_eval, + ) + if epoch % hps.train.save_epoch == 0: + utils.save_checkpoint( + generator, + optimizer_g, + hps.train.learning_rate, + epoch, + os.path.join(hps.model_dir, "G_{}.pth".format(epoch)), + ) + else: + train(rank, epoch, hps, generator, optimizer_g, train_loader, None, None) + + +def train(rank, epoch, hps, generator, optimizer_g, train_loader, logger, writer): + train_loader.sampler.set_epoch(epoch) + global global_step + + generator.train() + for batch_idx, (x, x_lengths, y, y_lengths) in enumerate(train_loader): + x, x_lengths = x.cuda(rank, non_blocking=True), x_lengths.cuda( + rank, non_blocking=True + ) + y, y_lengths = y.cuda(rank, non_blocking=True), y_lengths.cuda( + rank, non_blocking=True + ) + + # Train Generator + 
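+        # One optimization step: flow MLE loss on the latent z plus the
+        # duration-predictor loss on logw, with optional mixed-precision scaling.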
optimizer_g.zero_grad() + + ( + (z, z_m, z_logs, logdet, z_mask), + (x_m, x_logs, x_mask), + (attn, logw, logw_), + ) = generator(x, x_lengths, y, y_lengths, gen=False) + l_mle = commons.mle_loss(z, z_m, z_logs, logdet, z_mask) + l_length = commons.duration_loss(logw, logw_, x_lengths) + + loss_gs = [l_mle, l_length] + loss_g = sum(loss_gs) + + if hps.train.fp16_run: + with amp.scale_loss(loss_g, optimizer_g._optim) as scaled_loss: + scaled_loss.backward() + grad_norm = commons.clip_grad_value_( + amp.master_params(optimizer_g._optim), 5 + ) + else: + loss_g.backward() + grad_norm = commons.clip_grad_value_(generator.parameters(), 5) + optimizer_g.step() + + if rank == 0: + if batch_idx % hps.train.log_interval == 0: + (y_gen, *_), *_ = generator.module(x[:1], x_lengths[:1], gen=True) + logger.info( + "Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}".format( + epoch, + batch_idx * len(x), + len(train_loader.dataset), + 100.0 * batch_idx / len(train_loader), + loss_g.item(), + ) + ) + logger.info( + [x.item() for x in loss_gs] + [global_step, optimizer_g.get_lr()] + ) + + scalar_dict = { + "loss/g/total": loss_g, + "learning_rate": optimizer_g.get_lr(), + "grad_norm": grad_norm, + } + scalar_dict.update( + {"loss/g/{}".format(i): v for i, v in enumerate(loss_gs)} + ) + utils.summarize( + writer=writer, + global_step=global_step, + images={ + "y_org": utils.plot_spectrogram_to_numpy( + y[0].data.cpu().numpy() + ), + "y_gen": utils.plot_spectrogram_to_numpy( + y_gen[0].data.cpu().numpy() + ), + "attn": utils.plot_alignment_to_numpy( + attn[0, 0].data.cpu().numpy() + ), + }, + scalars=scalar_dict, + ) + global_step += 1 + + if rank == 0: + logger.info("====> Epoch: {}".format(epoch)) + + +def evaluate(rank, epoch, hps, generator, optimizer_g, val_loader, logger, writer_eval): + if rank == 0: + global global_step + generator.eval() + losses_tot = [] + with torch.no_grad(): + for batch_idx, (x, x_lengths, y, y_lengths) in enumerate(val_loader): + x, x_lengths = x.cuda(rank, non_blocking=True), x_lengths.cuda( + rank, non_blocking=True + ) + y, y_lengths = y.cuda(rank, non_blocking=True), y_lengths.cuda( + rank, non_blocking=True + ) + + ( + (z, z_m, z_logs, logdet, z_mask), + (x_m, x_logs, x_mask), + (attn, logw, logw_), + ) = generator(x, x_lengths, y, y_lengths, gen=False) + l_mle = commons.mle_loss(z, z_m, z_logs, logdet, z_mask) + l_length = commons.duration_loss(logw, logw_, x_lengths) + + loss_gs = [l_mle, l_length] + loss_g = sum(loss_gs) + + if batch_idx == 0: + losses_tot = loss_gs + else: + losses_tot = [x + y for (x, y) in zip(losses_tot, loss_gs)] + + if batch_idx % hps.train.log_interval == 0: + logger.info( + "Eval Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}".format( + epoch, + batch_idx * len(x), + len(val_loader.dataset), + 100.0 * batch_idx / len(val_loader), + loss_g.item(), + ) + ) + logger.info([x.item() for x in loss_gs]) + + losses_tot = [x / len(val_loader) for x in losses_tot] + loss_tot = sum(losses_tot) + scalar_dict = {"loss/g/total": loss_tot} + scalar_dict.update({"loss/g/{}".format(i): v for i, v in enumerate(losses_tot)}) + utils.summarize( + writer=writer_eval, global_step=global_step, scalars=scalar_dict + ) + logger.info("====> Epoch: {}".format(epoch)) + + +if __name__ == "__main__": + main() diff --git a/ttsv/src/glow_tts/utils.py b/ttsv/src/glow_tts/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..a591aa319ccb264110111cda55c4a232b41aae74 --- /dev/null +++ b/ttsv/src/glow_tts/utils.py @@ -0,0 +1,282 @@ +import os +import glob +import 
sys +import argparse +import logging +import json +import subprocess +import numpy as np +from scipy.io.wavfile import read +import torch + +MATPLOTLIB_FLAG = False + +logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) +logger = logging + + +def load_checkpoint(checkpoint_path, model, optimizer=None): + assert os.path.isfile(checkpoint_path) + checkpoint_dict = torch.load(checkpoint_path, map_location="cpu") + iteration = 1 + if "iteration" in checkpoint_dict.keys(): + iteration = checkpoint_dict["iteration"] + if "learning_rate" in checkpoint_dict.keys(): + learning_rate = checkpoint_dict["learning_rate"] + if optimizer is not None and "optimizer" in checkpoint_dict.keys(): + optimizer.load_state_dict(checkpoint_dict["optimizer"]) + saved_state_dict = checkpoint_dict["model"] + if hasattr(model, "module"): + state_dict = model.module.state_dict() + else: + state_dict = model.state_dict() + new_state_dict = {} + for k, v in state_dict.items(): + try: + new_state_dict[k] = saved_state_dict[k] + except: + logger.info("%s is not in the checkpoint" % k) + new_state_dict[k] = v + if hasattr(model, "module"): + model.module.load_state_dict(new_state_dict) + else: + model.load_state_dict(new_state_dict) + logger.info( + "Loaded checkpoint '{}' (iteration {})".format(checkpoint_path, iteration) + ) + return model, optimizer, learning_rate, iteration + + +def save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path): + logger.info( + "Saving model and optimizer state at iteration {} to {}".format( + iteration, checkpoint_path + ) + ) + if hasattr(model, "module"): + state_dict = model.module.state_dict() + else: + state_dict = model.state_dict() + torch.save( + { + "model": state_dict, + "iteration": iteration, + "optimizer": optimizer.state_dict(), + "learning_rate": learning_rate, + }, + checkpoint_path, + ) + + +def summarize(writer, global_step, scalars={}, histograms={}, images={}): + for k, v in scalars.items(): + writer.add_scalar(k, v, global_step) + for k, v in histograms.items(): + writer.add_histogram(k, v, global_step) + for k, v in images.items(): + writer.add_image(k, v, global_step, dataformats="HWC") + + +def latest_checkpoint_path(dir_path, regex="G_*.pth"): + f_list = glob.glob(os.path.join(dir_path, regex)) + f_list.sort(key=lambda f: int("".join(filter(str.isdigit, f)))) + x = f_list[-1] + print(x) + return x + + +def plot_spectrogram_to_numpy(spectrogram): + global MATPLOTLIB_FLAG + if not MATPLOTLIB_FLAG: + import matplotlib + + matplotlib.use("Agg") + MATPLOTLIB_FLAG = True + mpl_logger = logging.getLogger("matplotlib") + mpl_logger.setLevel(logging.WARNING) + import matplotlib.pylab as plt + import numpy as np + + fig, ax = plt.subplots() + im = ax.imshow(spectrogram, aspect="auto", origin="lower", interpolation="none") + plt.colorbar(im, ax=ax) + plt.xlabel("Frames") + plt.ylabel("Channels") + plt.tight_layout() + + fig.canvas.draw() + data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep="") + data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,)) + plt.close() + return data + + +def plot_alignment_to_numpy(alignment, info=None): + global MATPLOTLIB_FLAG + if not MATPLOTLIB_FLAG: + import matplotlib + + matplotlib.use("Agg") + MATPLOTLIB_FLAG = True + mpl_logger = logging.getLogger("matplotlib") + mpl_logger.setLevel(logging.WARNING) + import matplotlib.pylab as plt + import numpy as np + + fig, ax = plt.subplots(figsize=(6, 4)) + im = ax.imshow(alignment, aspect="auto", origin="lower", interpolation="none") + 
fig.colorbar(im, ax=ax) + xlabel = "Decoder timestep" + if info is not None: + xlabel += "\n\n" + info + plt.xlabel(xlabel) + plt.ylabel("Encoder timestep") + plt.tight_layout() + + fig.canvas.draw() + data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep="") + data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,)) + plt.close() + return data + + +def load_wav_to_torch(full_path): + sampling_rate, data = read(full_path) + return torch.FloatTensor(data.astype(np.float32)), sampling_rate + + +def load_filepaths_and_text(filename, split="|"): + with open(filename, encoding="utf-8") as f: + filepaths_and_text = [line.strip().split(split) for line in f] + return filepaths_and_text + + +def get_hparams(init=True): + parser = argparse.ArgumentParser() + parser.add_argument("-c", "--config", type=str, help="JSON file for configuration") + parser.add_argument("-m", "--model", type=str, help="Model name") + # parser.add_argument('-g', '--gan', type=str, + # help='Model name') + parser.add_argument("-l", "--logs", type=str, help="logs name") + # parser.add_argument('-s', '--mels', type=str, + # help='logs name') + + args = parser.parse_args() + # model_dir = os.path.join("./logs", args.model) + model_dir = args.model + if not os.path.exists(model_dir): + os.makedirs(model_dir) + + config_path = args.config + config_save_path = os.path.join(model_dir, "config.json") + + # if not config_path : config_path = config_save_path + + if init: + with open(config_path, "r") as f: + data = f.read() + with open(config_save_path, "w") as f: + f.write(data) + else: + with open(config_save_path, "r") as f: + data = f.read() + config = json.loads(data) + + hparams = HParams(**config) + hparams.model_dir = model_dir + hparams.log_dir = args.logs + # hparams.mels_dir = args.mels + # hparams.gan_dir = args.gan + return hparams + + +def get_hparams_from_dir(model_dir): + config_save_path = os.path.join(model_dir, "config.json") + with open(config_save_path, "r") as f: + data = f.read() + config = json.loads(data) + + hparams = HParams(**config) + hparams.model_dir = model_dir + return hparams + + +def get_hparams_from_file(config_path): + with open(config_path, "r") as f: + data = f.read() + config = json.loads(data) + + hparams = HParams(**config) + return hparams + + +def check_git_hash(model_dir): + source_dir = os.path.dirname(os.path.realpath(__file__)) + if not os.path.exists(os.path.join(source_dir, ".git")): + logger.warn( + "{} is not a git repository, therefore hash value comparison will be ignored.".format( + source_dir + ) + ) + return + + cur_hash = subprocess.getoutput("git rev-parse HEAD") + + path = os.path.join(model_dir, "githash") + if os.path.exists(path): + saved_hash = open(path).read() + if saved_hash != cur_hash: + logger.warn( + "git hash values are different. 
{}(saved) != {}(current)".format( + saved_hash[:8], cur_hash[:8] + ) + ) + else: + open(path, "w").write(cur_hash) + + +def get_logger(model_dir, filename="train.log"): + global logger + logger = logging.getLogger(os.path.basename(model_dir)) + logger.setLevel(logging.DEBUG) + + formatter = logging.Formatter("%(asctime)s\t%(name)s\t%(levelname)s\t%(message)s") + if not os.path.exists(model_dir): + os.makedirs(model_dir) + h = logging.FileHandler(os.path.join(model_dir, filename)) + h.setLevel(logging.DEBUG) + h.setFormatter(formatter) + logger.addHandler(h) + return logger + + +class HParams: + def __init__(self, **kwargs): + for k, v in kwargs.items(): + if type(v) == dict: + v = HParams(**v) + self[k] = v + + def keys(self): + return self.__dict__.keys() + + def items(self): + return self.__dict__.items() + + def values(self): + return self.__dict__.values() + + def __len__(self): + return len(self.__dict__) + + def __getitem__(self, key): + return getattr(self, key) + + def __setitem__(self, key, value): + return setattr(self, key, value) + + def __contains__(self, key): + return key in self.__dict__ + + def __repr__(self): + return self.__dict__.__repr__() diff --git a/ttsv/src/hifi_gan/env.py b/ttsv/src/hifi_gan/env.py new file mode 100644 index 0000000000000000000000000000000000000000..2bdbc95d4f7a8bad8fd4f5eef657e2b51d946056 --- /dev/null +++ b/ttsv/src/hifi_gan/env.py @@ -0,0 +1,15 @@ +import os +import shutil + + +class AttrDict(dict): + def __init__(self, *args, **kwargs): + super(AttrDict, self).__init__(*args, **kwargs) + self.__dict__ = self + + +def build_env(config, config_name, path): + t_path = os.path.join(path, config_name) + if config != t_path: + os.makedirs(path, exist_ok=True) + shutil.copyfile(config, os.path.join(path, config_name)) diff --git a/ttsv/src/hifi_gan/inference.py b/ttsv/src/hifi_gan/inference.py new file mode 100644 index 0000000000000000000000000000000000000000..c70ee09b4110677b7cf9732d76a5e6ca93c8860c --- /dev/null +++ b/ttsv/src/hifi_gan/inference.py @@ -0,0 +1,98 @@ +from __future__ import absolute_import, division, print_function, unicode_literals + +import glob +import os +import argparse +import json +import torch +from scipy.io.wavfile import write +from env import AttrDict +from meldataset import mel_spectrogram, MAX_WAV_VALUE, load_wav +from models import Generator + +h = None +device = None + + +def load_checkpoint(filepath, device): + assert os.path.isfile(filepath) + print("Loading '{}'".format(filepath)) + checkpoint_dict = torch.load(filepath, map_location=device) + print("Complete.") + return checkpoint_dict + + +def get_mel(x): + return mel_spectrogram( + x, h.n_fft, h.num_mels, h.sampling_rate, h.hop_size, h.win_size, h.fmin, h.fmax + ) + + +def scan_checkpoint(cp_dir, prefix): + pattern = os.path.join(cp_dir, prefix + "*") + cp_list = glob.glob(pattern) + if len(cp_list) == 0: + return "" + return sorted(cp_list)[-1] + + +def inference(a): + generator = Generator(h).to(device) + + state_dict_g = load_checkpoint(a.checkpoint_file, device) + generator.load_state_dict(state_dict_g["generator"]) + + filelist = os.listdir(a.input_wavs_dir) + + os.makedirs(a.output_dir, exist_ok=True) + + generator.eval() + generator.remove_weight_norm() + with torch.no_grad(): + for i, filname in enumerate(filelist): + wav, sr = load_wav(os.path.join(a.input_wavs_dir, filname)) + wav = wav / MAX_WAV_VALUE + wav = torch.FloatTensor(wav).to(device) + x = get_mel(wav.unsqueeze(0)) + y_g_hat = generator(x) + audio = y_g_hat.squeeze() + audio = audio * 
MAX_WAV_VALUE + audio = audio.cpu().numpy().astype("int16") + + output_file = os.path.join( + a.output_dir, os.path.splitext(filname)[0] + "_generated.wav" + ) + write(output_file, h.sampling_rate, audio) + print(output_file) + + +def main(): + print("Initializing Inference Process..") + + parser = argparse.ArgumentParser() + parser.add_argument("--input_wavs_dir", default="test_files") + parser.add_argument("--output_dir", default="generated_files") + parser.add_argument("--checkpoint_file", required=True) + a = parser.parse_args() + + config_file = os.path.join(os.path.split(a.checkpoint_file)[0], "config.json") + with open(config_file) as f: + data = f.read() + + global h + json_config = json.loads(data) + h = AttrDict(json_config) + + torch.manual_seed(h.seed) + global device + if torch.cuda.is_available(): + torch.cuda.manual_seed(h.seed) + device = torch.device("cuda") + else: + device = torch.device("cpu") + + inference(a) + + +if __name__ == "__main__": + main() diff --git a/ttsv/src/hifi_gan/inference_e2e.py b/ttsv/src/hifi_gan/inference_e2e.py new file mode 100644 index 0000000000000000000000000000000000000000..062aecd4280925336ab1d36420d2cd47febf661c --- /dev/null +++ b/ttsv/src/hifi_gan/inference_e2e.py @@ -0,0 +1,91 @@ +from __future__ import absolute_import, division, print_function, unicode_literals + +import glob +import os +import numpy as np +import argparse +import json +import torch +from scipy.io.wavfile import write +from env import AttrDict +from meldataset import MAX_WAV_VALUE +from models import Generator + +h = None +device = None + + +def load_checkpoint(filepath, device): + assert os.path.isfile(filepath) + print("Loading '{}'".format(filepath)) + checkpoint_dict = torch.load(filepath, map_location=device) + print("Complete.") + return checkpoint_dict + + +def scan_checkpoint(cp_dir, prefix): + pattern = os.path.join(cp_dir, prefix + "*") + cp_list = glob.glob(pattern) + if len(cp_list) == 0: + return "" + return sorted(cp_list)[-1] + + +def inference(a): + generator = Generator(h).to(device) + + state_dict_g = load_checkpoint(a.checkpoint_file, device) + generator.load_state_dict(state_dict_g["generator"]) + + filelist = os.listdir(a.input_mels_dir) + + os.makedirs(a.output_dir, exist_ok=True) + + generator.eval() + generator.remove_weight_norm() + with torch.no_grad(): + for i, filname in enumerate(filelist): + x = np.load(os.path.join(a.input_mels_dir, filname)) + x = torch.FloatTensor(x).to(device) + y_g_hat = generator(x) + audio = y_g_hat.squeeze() + audio = audio * MAX_WAV_VALUE + audio = audio.cpu().numpy().astype("int16") + + output_file = os.path.join( + a.output_dir, os.path.splitext(filname)[0] + "_generated_e2e.wav" + ) + write(output_file, h.sampling_rate, audio) + print(output_file) + + +def main(): + print("Initializing Inference Process..") + + parser = argparse.ArgumentParser() + parser.add_argument("--input_mels_dir", default="test_mel_files") + parser.add_argument("--output_dir", default="generated_files_from_mel") + parser.add_argument("--checkpoint_file", required=True) + a = parser.parse_args() + + config_file = os.path.join(os.path.split(a.checkpoint_file)[0], "config.json") + with open(config_file) as f: + data = f.read() + + global h + json_config = json.loads(data) + h = AttrDict(json_config) + + torch.manual_seed(h.seed) + global device + if torch.cuda.is_available(): + torch.cuda.manual_seed(h.seed) + device = torch.device("cuda") + else: + device = torch.device("cpu") + + inference(a) + + +if __name__ == "__main__": + main() diff 
--git a/ttsv/src/hifi_gan/meldataset.py b/ttsv/src/hifi_gan/meldataset.py new file mode 100644 index 0000000000000000000000000000000000000000..8c6ca9ec8a6cc6408a77492e795bffef7f86b611 --- /dev/null +++ b/ttsv/src/hifi_gan/meldataset.py @@ -0,0 +1,233 @@ +import math +import os +import random +import torch +import torch.utils.data +import numpy as np +from librosa.util import normalize +from scipy.io.wavfile import read +from librosa.filters import mel as librosa_mel_fn + +MAX_WAV_VALUE = 32768.0 + + +def load_wav(full_path): + sampling_rate, data = read(full_path) + return data, sampling_rate + + +def dynamic_range_compression(x, C=1, clip_val=1e-5): + return np.log(np.clip(x, a_min=clip_val, a_max=None) * C) + + +def dynamic_range_decompression(x, C=1): + return np.exp(x) / C + + +def dynamic_range_compression_torch(x, C=1, clip_val=1e-5): + return torch.log(torch.clamp(x, min=clip_val) * C) + + +def dynamic_range_decompression_torch(x, C=1): + return torch.exp(x) / C + + +def spectral_normalize_torch(magnitudes): + output = dynamic_range_compression_torch(magnitudes) + return output + + +def spectral_de_normalize_torch(magnitudes): + output = dynamic_range_decompression_torch(magnitudes) + return output + + +mel_basis = {} +hann_window = {} + + +def mel_spectrogram( + y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False +): + if torch.min(y) < -1.0: + print("min value is ", torch.min(y)) + if torch.max(y) > 1.0: + print("max value is ", torch.max(y)) + + global mel_basis, hann_window + if fmax not in mel_basis: + mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax) + mel_basis[str(fmax) + "_" + str(y.device)] = ( + torch.from_numpy(mel).float().to(y.device) + ) + hann_window[str(y.device)] = torch.hann_window(win_size).to(y.device) + + y = torch.nn.functional.pad( + y.unsqueeze(1), + (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), + mode="reflect", + ) + y = y.squeeze(1) + + spec = torch.stft( + y, + n_fft, + hop_length=hop_size, + win_length=win_size, + window=hann_window[str(y.device)], + center=center, + pad_mode="reflect", + normalized=False, + onesided=True, + ) + + spec = torch.sqrt(spec.pow(2).sum(-1) + (1e-9)) + + spec = torch.matmul(mel_basis[str(fmax) + "_" + str(y.device)], spec) + spec = spectral_normalize_torch(spec) + + return spec + + +def get_dataset_filelist(a): + with open(a.input_training_file, "r", encoding="utf-8") as fi: + training_files = [x for x in fi.read().split("\n") if len(x) > 0] + + with open(a.input_validation_file, "r", encoding="utf-8") as fi: + validation_files = [x for x in fi.read().split("\n") if len(x) > 0] + return training_files, validation_files + + +class MelDataset(torch.utils.data.Dataset): + def __init__( + self, + training_files, + segment_size, + n_fft, + num_mels, + hop_size, + win_size, + sampling_rate, + fmin, + fmax, + split=True, + shuffle=True, + n_cache_reuse=1, + device=None, + fmax_loss=None, + fine_tuning=False, + base_mels_path=None, + ): + self.audio_files = training_files + random.seed(1234) + if shuffle: + random.shuffle(self.audio_files) + self.segment_size = segment_size + self.sampling_rate = sampling_rate + self.split = split + self.n_fft = n_fft + self.num_mels = num_mels + self.hop_size = hop_size + self.win_size = win_size + self.fmin = fmin + self.fmax = fmax + self.fmax_loss = fmax_loss + self.cached_wav = None + self.n_cache_reuse = n_cache_reuse + self._cache_ref_count = 0 + self.device = device + self.fine_tuning = fine_tuning + self.base_mels_path = base_mels_path 
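+    # The item loader below returns (mel, audio, filename, mel_for_loss): it loads
+    # (or reuses the cached) wav, crops/pads a random segment when split=True, and
+    # computes the mel spectrogram (or loads a precomputed one when fine-tuning).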
+ + def __getitem__(self, index): + filename = self.audio_files[index] + if self._cache_ref_count == 0: + audio, sampling_rate = load_wav(filename) + audio = audio / MAX_WAV_VALUE + if not self.fine_tuning: + audio = normalize(audio) * 0.95 + self.cached_wav = audio + if sampling_rate != self.sampling_rate: + raise ValueError( + "{} SR doesn't match target {} SR".format( + sampling_rate, self.sampling_rate + ) + ) + self._cache_ref_count = self.n_cache_reuse + else: + audio = self.cached_wav + self._cache_ref_count -= 1 + + audio = torch.FloatTensor(audio) + audio = audio.unsqueeze(0) + + if not self.fine_tuning: + if self.split: + if audio.size(1) >= self.segment_size: + max_audio_start = audio.size(1) - self.segment_size + audio_start = random.randint(0, max_audio_start) + audio = audio[:, audio_start : audio_start + self.segment_size] + else: + audio = torch.nn.functional.pad( + audio, (0, self.segment_size - audio.size(1)), "constant" + ) + + mel = mel_spectrogram( + audio, + self.n_fft, + self.num_mels, + self.sampling_rate, + self.hop_size, + self.win_size, + self.fmin, + self.fmax, + center=False, + ) + else: + mel = np.load( + os.path.join( + self.base_mels_path, + os.path.splitext(os.path.split(filename)[-1])[0] + ".npy", + ) + ) + mel = torch.from_numpy(mel) + + if len(mel.shape) < 3: + mel = mel.unsqueeze(0) + + if self.split: + frames_per_seg = math.ceil(self.segment_size / self.hop_size) + + if audio.size(1) >= self.segment_size: + mel_start = random.randint(0, mel.size(2) - frames_per_seg - 1) + mel = mel[:, :, mel_start : mel_start + frames_per_seg] + audio = audio[ + :, + mel_start + * self.hop_size : (mel_start + frames_per_seg) + * self.hop_size, + ] + else: + mel = torch.nn.functional.pad( + mel, (0, frames_per_seg - mel.size(2)), "constant" + ) + audio = torch.nn.functional.pad( + audio, (0, self.segment_size - audio.size(1)), "constant" + ) + + mel_loss = mel_spectrogram( + audio, + self.n_fft, + self.num_mels, + self.sampling_rate, + self.hop_size, + self.win_size, + self.fmin, + self.fmax_loss, + center=False, + ) + + return (mel.squeeze(), audio.squeeze(0), filename, mel_loss.squeeze()) + + def __len__(self): + return len(self.audio_files) diff --git a/ttsv/src/hifi_gan/models.py b/ttsv/src/hifi_gan/models.py new file mode 100644 index 0000000000000000000000000000000000000000..be51fa51407e6ce1daaee5e8d090f6acdbee0db9 --- /dev/null +++ b/ttsv/src/hifi_gan/models.py @@ -0,0 +1,403 @@ +import torch +import torch.nn.functional as F +import torch.nn as nn +from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d +from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm +from utils import init_weights, get_padding + +LRELU_SLOPE = 0.1 + + +class ResBlock1(torch.nn.Module): + def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5)): + super(ResBlock1, self).__init__() + self.h = h + self.convs1 = nn.ModuleList( + [ + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=dilation[0], + padding=get_padding(kernel_size, dilation[0]), + ) + ), + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=dilation[1], + padding=get_padding(kernel_size, dilation[1]), + ) + ), + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=dilation[2], + padding=get_padding(kernel_size, dilation[2]), + ) + ), + ] + ) + self.convs1.apply(init_weights) + + self.convs2 = nn.ModuleList( + [ + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=1, + 
padding=get_padding(kernel_size, 1), + ) + ), + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=1, + padding=get_padding(kernel_size, 1), + ) + ), + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=1, + padding=get_padding(kernel_size, 1), + ) + ), + ] + ) + self.convs2.apply(init_weights) + + def forward(self, x): + for c1, c2 in zip(self.convs1, self.convs2): + xt = F.leaky_relu(x, LRELU_SLOPE) + xt = c1(xt) + xt = F.leaky_relu(xt, LRELU_SLOPE) + xt = c2(xt) + x = xt + x + return x + + def remove_weight_norm(self): + for l in self.convs1: + remove_weight_norm(l) + for l in self.convs2: + remove_weight_norm(l) + + +class ResBlock2(torch.nn.Module): + def __init__(self, h, channels, kernel_size=3, dilation=(1, 3)): + super(ResBlock2, self).__init__() + self.h = h + self.convs = nn.ModuleList( + [ + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=dilation[0], + padding=get_padding(kernel_size, dilation[0]), + ) + ), + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=dilation[1], + padding=get_padding(kernel_size, dilation[1]), + ) + ), + ] + ) + self.convs.apply(init_weights) + + def forward(self, x): + for c in self.convs: + xt = F.leaky_relu(x, LRELU_SLOPE) + xt = c(xt) + x = xt + x + return x + + def remove_weight_norm(self): + for l in self.convs: + remove_weight_norm(l) + + +class Generator(torch.nn.Module): + def __init__(self, h): + super(Generator, self).__init__() + self.h = h + self.num_kernels = len(h.resblock_kernel_sizes) + self.num_upsamples = len(h.upsample_rates) + self.conv_pre = weight_norm( + Conv1d(80, h.upsample_initial_channel, 7, 1, padding=3) + ) + resblock = ResBlock1 if h.resblock == "1" else ResBlock2 + + self.ups = nn.ModuleList() + for i, (u, k) in enumerate(zip(h.upsample_rates, h.upsample_kernel_sizes)): + self.ups.append( + weight_norm( + ConvTranspose1d( + h.upsample_initial_channel // (2 ** i), + h.upsample_initial_channel // (2 ** (i + 1)), + k, + u, + padding=(k - u) // 2, + ) + ) + ) + + self.resblocks = nn.ModuleList() + for i in range(len(self.ups)): + ch = h.upsample_initial_channel // (2 ** (i + 1)) + for j, (k, d) in enumerate( + zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes) + ): + self.resblocks.append(resblock(h, ch, k, d)) + + self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3)) + self.ups.apply(init_weights) + self.conv_post.apply(init_weights) + + def forward(self, x): + x = self.conv_pre(x) + for i in range(self.num_upsamples): + x = F.leaky_relu(x, LRELU_SLOPE) + x = self.ups[i](x) + xs = None + for j in range(self.num_kernels): + if xs is None: + xs = self.resblocks[i * self.num_kernels + j](x) + else: + xs += self.resblocks[i * self.num_kernels + j](x) + x = xs / self.num_kernels + x = F.leaky_relu(x) + x = self.conv_post(x) + x = torch.tanh(x) + + return x + + def remove_weight_norm(self): + print("Removing weight norm...") + for l in self.ups: + remove_weight_norm(l) + for l in self.resblocks: + l.remove_weight_norm() + remove_weight_norm(self.conv_pre) + remove_weight_norm(self.conv_post) + + +class DiscriminatorP(torch.nn.Module): + def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False): + super(DiscriminatorP, self).__init__() + self.period = period + norm_f = weight_norm if use_spectral_norm == False else spectral_norm + self.convs = nn.ModuleList( + [ + norm_f( + Conv2d( + 1, + 32, + (kernel_size, 1), + (stride, 1), + padding=(get_padding(5, 1), 0), + ) + ), + norm_f( + Conv2d( 
+ 32, + 128, + (kernel_size, 1), + (stride, 1), + padding=(get_padding(5, 1), 0), + ) + ), + norm_f( + Conv2d( + 128, + 512, + (kernel_size, 1), + (stride, 1), + padding=(get_padding(5, 1), 0), + ) + ), + norm_f( + Conv2d( + 512, + 1024, + (kernel_size, 1), + (stride, 1), + padding=(get_padding(5, 1), 0), + ) + ), + norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(2, 0))), + ] + ) + self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0))) + + def forward(self, x): + fmap = [] + + # 1d to 2d + b, c, t = x.shape + if t % self.period != 0: # pad first + n_pad = self.period - (t % self.period) + x = F.pad(x, (0, n_pad), "reflect") + t = t + n_pad + x = x.view(b, c, t // self.period, self.period) + + for l in self.convs: + x = l(x) + x = F.leaky_relu(x, LRELU_SLOPE) + fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + x = torch.flatten(x, 1, -1) + + return x, fmap + + +class MultiPeriodDiscriminator(torch.nn.Module): + def __init__(self): + super(MultiPeriodDiscriminator, self).__init__() + self.discriminators = nn.ModuleList( + [ + DiscriminatorP(2), + DiscriminatorP(3), + DiscriminatorP(5), + DiscriminatorP(7), + DiscriminatorP(11), + ] + ) + + def forward(self, y, y_hat): + y_d_rs = [] + y_d_gs = [] + fmap_rs = [] + fmap_gs = [] + for i, d in enumerate(self.discriminators): + y_d_r, fmap_r = d(y) + y_d_g, fmap_g = d(y_hat) + y_d_rs.append(y_d_r) + fmap_rs.append(fmap_r) + y_d_gs.append(y_d_g) + fmap_gs.append(fmap_g) + + return y_d_rs, y_d_gs, fmap_rs, fmap_gs + + +class DiscriminatorS(torch.nn.Module): + def __init__(self, use_spectral_norm=False): + super(DiscriminatorS, self).__init__() + norm_f = weight_norm if use_spectral_norm == False else spectral_norm + self.convs = nn.ModuleList( + [ + norm_f(Conv1d(1, 128, 15, 1, padding=7)), + norm_f(Conv1d(128, 128, 41, 2, groups=4, padding=20)), + norm_f(Conv1d(128, 256, 41, 2, groups=16, padding=20)), + norm_f(Conv1d(256, 512, 41, 4, groups=16, padding=20)), + norm_f(Conv1d(512, 1024, 41, 4, groups=16, padding=20)), + norm_f(Conv1d(1024, 1024, 41, 1, groups=16, padding=20)), + norm_f(Conv1d(1024, 1024, 5, 1, padding=2)), + ] + ) + self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1)) + + def forward(self, x): + fmap = [] + for l in self.convs: + x = l(x) + x = F.leaky_relu(x, LRELU_SLOPE) + fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + x = torch.flatten(x, 1, -1) + + return x, fmap + + +class MultiScaleDiscriminator(torch.nn.Module): + def __init__(self): + super(MultiScaleDiscriminator, self).__init__() + self.discriminators = nn.ModuleList( + [ + DiscriminatorS(use_spectral_norm=True), + DiscriminatorS(), + DiscriminatorS(), + ] + ) + self.meanpools = nn.ModuleList( + [AvgPool1d(4, 2, padding=2), AvgPool1d(4, 2, padding=2)] + ) + + def forward(self, y, y_hat): + y_d_rs = [] + y_d_gs = [] + fmap_rs = [] + fmap_gs = [] + for i, d in enumerate(self.discriminators): + if i != 0: + y = self.meanpools[i - 1](y) + y_hat = self.meanpools[i - 1](y_hat) + y_d_r, fmap_r = d(y) + y_d_g, fmap_g = d(y_hat) + y_d_rs.append(y_d_r) + fmap_rs.append(fmap_r) + y_d_gs.append(y_d_g) + fmap_gs.append(fmap_g) + + return y_d_rs, y_d_gs, fmap_rs, fmap_gs + + +def feature_loss(fmap_r, fmap_g): + loss = 0 + for dr, dg in zip(fmap_r, fmap_g): + for rl, gl in zip(dr, dg): + loss += torch.mean(torch.abs(rl - gl)) + + return loss * 2 + + +def discriminator_loss(disc_real_outputs, disc_generated_outputs): + loss = 0 + r_losses = [] + g_losses = [] + for dr, dg in zip(disc_real_outputs, disc_generated_outputs): + r_loss = 
torch.mean((1 - dr) ** 2) + g_loss = torch.mean(dg ** 2) + loss += r_loss + g_loss + r_losses.append(r_loss.item()) + g_losses.append(g_loss.item()) + + return loss, r_losses, g_losses + + +def generator_loss(disc_outputs): + loss = 0 + gen_losses = [] + for dg in disc_outputs: + l = torch.mean((1 - dg) ** 2) + gen_losses.append(l) + loss += l + + return loss, gen_losses diff --git a/ttsv/src/hifi_gan/train.py b/ttsv/src/hifi_gan/train.py new file mode 100644 index 0000000000000000000000000000000000000000..709e085d019eb98006b26555f7fe2582d759efa6 --- /dev/null +++ b/ttsv/src/hifi_gan/train.py @@ -0,0 +1,400 @@ +import warnings + +warnings.simplefilter(action="ignore", category=FutureWarning) +import itertools +import os +import time +import argparse +import json +import torch +import torch.nn.functional as F +from torch.utils.tensorboard import SummaryWriter +from torch.utils.data import DistributedSampler, DataLoader +import torch.multiprocessing as mp +from torch.distributed import init_process_group +from torch.nn.parallel import DistributedDataParallel +from env import AttrDict, build_env +from meldataset import MelDataset, mel_spectrogram, get_dataset_filelist +from models import ( + Generator, + MultiPeriodDiscriminator, + MultiScaleDiscriminator, + feature_loss, + generator_loss, + discriminator_loss, +) +from utils import plot_spectrogram, scan_checkpoint, load_checkpoint, save_checkpoint + +torch.backends.cudnn.benchmark = True + + +def train(rank, a, h): + if h.num_gpus > 1: + init_process_group( + backend=h.dist_config["dist_backend"], + init_method=h.dist_config["dist_url"], + world_size=h.dist_config["world_size"] * h.num_gpus, + rank=rank, + ) + + torch.cuda.manual_seed(h.seed) + device = torch.device("cuda:{:d}".format(rank)) + + generator = Generator(h).to(device) + mpd = MultiPeriodDiscriminator().to(device) + msd = MultiScaleDiscriminator().to(device) + + if rank == 0: + print(generator) + os.makedirs(a.checkpoint_path, exist_ok=True) + print("checkpoints directory : ", a.checkpoint_path) + + if os.path.isdir(a.checkpoint_path): + cp_g = scan_checkpoint(a.checkpoint_path, "g_") + cp_do = scan_checkpoint(a.checkpoint_path, "do_") + + steps = 0 + if cp_g is None or cp_do is None: + state_dict_do = None + last_epoch = -1 + else: + state_dict_g = load_checkpoint(cp_g, device) + state_dict_do = load_checkpoint(cp_do, device) + generator.load_state_dict(state_dict_g["generator"]) + mpd.load_state_dict(state_dict_do["mpd"]) + msd.load_state_dict(state_dict_do["msd"]) + steps = state_dict_do["steps"] + 1 + last_epoch = state_dict_do["epoch"] + + if h.num_gpus > 1: + generator = DistributedDataParallel(generator, device_ids=[rank]).to(device) + mpd = DistributedDataParallel(mpd, device_ids=[rank]).to(device) + msd = DistributedDataParallel(msd, device_ids=[rank]).to(device) + + optim_g = torch.optim.AdamW( + generator.parameters(), h.learning_rate, betas=[h.adam_b1, h.adam_b2] + ) + optim_d = torch.optim.AdamW( + itertools.chain(msd.parameters(), mpd.parameters()), + h.learning_rate, + betas=[h.adam_b1, h.adam_b2], + ) + + if state_dict_do is not None: + optim_g.load_state_dict(state_dict_do["optim_g"]) + optim_d.load_state_dict(state_dict_do["optim_d"]) + + scheduler_g = torch.optim.lr_scheduler.ExponentialLR( + optim_g, gamma=h.lr_decay, last_epoch=last_epoch + ) + scheduler_d = torch.optim.lr_scheduler.ExponentialLR( + optim_d, gamma=h.lr_decay, last_epoch=last_epoch + ) + + training_filelist, validation_filelist = get_dataset_filelist(a) + + trainset = MelDataset( + 
training_filelist, + h.segment_size, + h.n_fft, + h.num_mels, + h.hop_size, + h.win_size, + h.sampling_rate, + h.fmin, + h.fmax, + n_cache_reuse=0, + shuffle=False if h.num_gpus > 1 else True, + fmax_loss=h.fmax_for_loss, + device=device, + fine_tuning=a.fine_tuning, + base_mels_path=a.input_mels_dir, + ) + + train_sampler = DistributedSampler(trainset) if h.num_gpus > 1 else None + + train_loader = DataLoader( + trainset, + num_workers=h.num_workers, + shuffle=False, + sampler=train_sampler, + batch_size=h.batch_size, + pin_memory=True, + drop_last=True, + ) + + if rank == 0: + validset = MelDataset( + validation_filelist, + h.segment_size, + h.n_fft, + h.num_mels, + h.hop_size, + h.win_size, + h.sampling_rate, + h.fmin, + h.fmax, + False, + False, + n_cache_reuse=0, + fmax_loss=h.fmax_for_loss, + device=device, + fine_tuning=a.fine_tuning, + base_mels_path=a.input_mels_dir, + ) + validation_loader = DataLoader( + validset, + num_workers=1, + shuffle=False, + sampler=None, + batch_size=1, + pin_memory=True, + drop_last=True, + ) + + sw = SummaryWriter(os.path.join(a.logs_path)) + + generator.train() + mpd.train() + msd.train() + for epoch in range(max(0, last_epoch), a.training_epochs): + if rank == 0: + start = time.time() + print("Epoch: {}".format(epoch + 1)) + + if h.num_gpus > 1: + train_sampler.set_epoch(epoch) + + for i, batch in enumerate(train_loader): + if rank == 0: + start_b = time.time() + x, y, _, y_mel = batch + x = torch.autograd.Variable(x.to(device, non_blocking=True)) + y = torch.autograd.Variable(y.to(device, non_blocking=True)) + y_mel = torch.autograd.Variable(y_mel.to(device, non_blocking=True)) + y = y.unsqueeze(1) + + y_g_hat = generator(x) + y_g_hat_mel = mel_spectrogram( + y_g_hat.squeeze(1), + h.n_fft, + h.num_mels, + h.sampling_rate, + h.hop_size, + h.win_size, + h.fmin, + h.fmax_for_loss, + ) + + optim_d.zero_grad() + + # MPD + y_df_hat_r, y_df_hat_g, _, _ = mpd(y, y_g_hat.detach()) + loss_disc_f, losses_disc_f_r, losses_disc_f_g = discriminator_loss( + y_df_hat_r, y_df_hat_g + ) + + # MSD + y_ds_hat_r, y_ds_hat_g, _, _ = msd(y, y_g_hat.detach()) + loss_disc_s, losses_disc_s_r, losses_disc_s_g = discriminator_loss( + y_ds_hat_r, y_ds_hat_g + ) + + loss_disc_all = loss_disc_s + loss_disc_f + + loss_disc_all.backward() + optim_d.step() + + # Generator + optim_g.zero_grad() + + # L1 Mel-Spectrogram Loss + loss_mel = F.l1_loss(y_mel, y_g_hat_mel) * 45 + + y_df_hat_r, y_df_hat_g, fmap_f_r, fmap_f_g = mpd(y, y_g_hat) + y_ds_hat_r, y_ds_hat_g, fmap_s_r, fmap_s_g = msd(y, y_g_hat) + loss_fm_f = feature_loss(fmap_f_r, fmap_f_g) + loss_fm_s = feature_loss(fmap_s_r, fmap_s_g) + loss_gen_f, losses_gen_f = generator_loss(y_df_hat_g) + loss_gen_s, losses_gen_s = generator_loss(y_ds_hat_g) + loss_gen_all = loss_gen_s + loss_gen_f + loss_fm_s + loss_fm_f + loss_mel + + loss_gen_all.backward() + optim_g.step() + + if rank == 0: + # STDOUT logging + if steps % a.stdout_interval == 0: + with torch.no_grad(): + mel_error = F.l1_loss(y_mel, y_g_hat_mel).item() + + print( + "Steps : {:d}, Gen Loss Total : {:4.3f}, Mel-Spec. 
Error : {:4.3f}, s/b : {:4.3f}".format( + steps, loss_gen_all, mel_error, time.time() - start_b + ) + ) + + # checkpointing + if steps % a.checkpoint_interval == 0 and steps != 0: + checkpoint_path = "{}/g_{:08d}".format(a.checkpoint_path, steps) + save_checkpoint( + checkpoint_path, + { + "generator": ( + generator.module if h.num_gpus > 1 else generator + ).state_dict() + }, + ) + checkpoint_path = "{}/do_{:08d}".format(a.checkpoint_path, steps) + save_checkpoint( + checkpoint_path, + { + "mpd": (mpd.module if h.num_gpus > 1 else mpd).state_dict(), + "msd": (msd.module if h.num_gpus > 1 else msd).state_dict(), + "optim_g": optim_g.state_dict(), + "optim_d": optim_d.state_dict(), + "steps": steps, + "epoch": epoch, + }, + ) + + # Tensorboard summary logging + if steps % a.summary_interval == 0: + sw.add_scalar("training/gen_loss_total", loss_gen_all, steps) + sw.add_scalar("training/mel_spec_error", mel_error, steps) + + # Validation + if steps % a.validation_interval == 0: # and steps != 0: + generator.eval() + torch.cuda.empty_cache() + val_err_tot = 0 + with torch.no_grad(): + for j, batch in enumerate(validation_loader): + x, y, _, y_mel = batch + y_g_hat = generator(x.to(device)) + y_mel = torch.autograd.Variable( + y_mel.to(device, non_blocking=True) + ) + y_g_hat_mel = mel_spectrogram( + y_g_hat.squeeze(1), + h.n_fft, + h.num_mels, + h.sampling_rate, + h.hop_size, + h.win_size, + h.fmin, + h.fmax_for_loss, + ) + val_err_tot += F.l1_loss(y_mel, y_g_hat_mel).item() + + if j <= 4: + if steps == 0: + sw.add_audio( + "gt/y_{}".format(j), + y[0], + steps, + h.sampling_rate, + ) + sw.add_figure( + "gt/y_spec_{}".format(j), + plot_spectrogram(x[0]), + steps, + ) + + sw.add_audio( + "generated/y_hat_{}".format(j), + y_g_hat[0], + steps, + h.sampling_rate, + ) + y_hat_spec = mel_spectrogram( + y_g_hat.squeeze(1), + h.n_fft, + h.num_mels, + h.sampling_rate, + h.hop_size, + h.win_size, + h.fmin, + h.fmax, + ) + sw.add_figure( + "generated/y_hat_spec_{}".format(j), + plot_spectrogram( + y_hat_spec.squeeze(0).cpu().numpy() + ), + steps, + ) + + val_err = val_err_tot / (j + 1) + sw.add_scalar("validation/mel_spec_error", val_err, steps) + + generator.train() + + steps += 1 + + scheduler_g.step() + scheduler_d.step() + + if rank == 0: + print( + "Time taken for epoch {} is {} sec\n".format( + epoch + 1, int(time.time() - start) + ) + ) + + +def main(): + print("Initializing Training Process..") + + parser = argparse.ArgumentParser() + + parser.add_argument("--group_name", default=None) + parser.add_argument("--input_wavs_dir", default="LJSpeech-1.1/wavs") + parser.add_argument("--input_mels_dir", default="ft_dataset") + parser.add_argument("--input_training_file", default="LJSpeech-1.1/training.txt") + parser.add_argument( + "--input_validation_file", default="LJSpeech-1.1/validation.txt" + ) + parser.add_argument("--checkpoint_path", default="cp_hifigan") + parser.add_argument("--logs_path", default="") + parser.add_argument("--config", default="") + parser.add_argument("--training_epochs", default=3100, type=int) + parser.add_argument("--stdout_interval", default=5, type=int) + parser.add_argument("--checkpoint_interval", default=5000, type=int) + parser.add_argument("--summary_interval", default=100, type=int) + parser.add_argument("--validation_interval", default=1000, type=int) + parser.add_argument("--fine_tuning", default=False, type=bool) + + a = parser.parse_args() + + with open(a.config) as f: + data = f.read() + + json_config = json.loads(data) + h = AttrDict(json_config) + 
build_env(a.config, "config.json", a.checkpoint_path) + + torch.manual_seed(h.seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed(h.seed) + h.num_gpus = torch.cuda.device_count() + h.batch_size = int(h.batch_size / h.num_gpus) + print("Batch size per GPU :", h.batch_size) + else: + pass + + if h.num_gpus > 1: + mp.spawn( + train, + nprocs=h.num_gpus, + args=( + a, + h, + ), + ) + else: + train(0, a, h) + + +if __name__ == "__main__": + main() diff --git a/ttsv/src/hifi_gan/utils.py b/ttsv/src/hifi_gan/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..71e9b2c99e053e2d4239074a67d64b834898c348 --- /dev/null +++ b/ttsv/src/hifi_gan/utils.py @@ -0,0 +1,57 @@ +import glob +import os +import matplotlib +import torch +from torch.nn.utils import weight_norm + +matplotlib.use("Agg") +import matplotlib.pylab as plt + + +def plot_spectrogram(spectrogram): + fig, ax = plt.subplots(figsize=(10, 2)) + im = ax.imshow(spectrogram, aspect="auto", origin="lower", interpolation="none") + plt.colorbar(im, ax=ax) + + fig.canvas.draw() + plt.close() + + return fig + + +def init_weights(m, mean=0.0, std=0.01): + classname = m.__class__.__name__ + if classname.find("Conv") != -1: + m.weight.data.normal_(mean, std) + + +def apply_weight_norm(m): + classname = m.__class__.__name__ + if classname.find("Conv") != -1: + weight_norm(m) + + +def get_padding(kernel_size, dilation=1): + return int((kernel_size * dilation - dilation) / 2) + + +def load_checkpoint(filepath, device): + assert os.path.isfile(filepath) + print("Loading '{}'".format(filepath)) + checkpoint_dict = torch.load(filepath, map_location=device) + print("Complete.") + return checkpoint_dict + + +def save_checkpoint(filepath, obj): + print("Saving checkpoint to {}".format(filepath)) + torch.save(obj, filepath) + print("Complete.") + + +def scan_checkpoint(cp_dir, prefix): + pattern = os.path.join(cp_dir, prefix + "????????") + cp_list = glob.glob(pattern) + if len(cp_list) == 0: + return None + return sorted(cp_list)[-1] diff --git a/ttsv/tts_infer/__init__.py b/ttsv/tts_infer/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/ttsv/tts_infer/example_inference.py b/ttsv/tts_infer/example_inference.py new file mode 100644 index 0000000000000000000000000000000000000000..676718fff3c6a7120cea91b0cfc95f8872929da7 --- /dev/null +++ b/ttsv/tts_infer/example_inference.py @@ -0,0 +1,79 @@ +''' Example file to test tts_infer after installing it. Refer to section 1.1 in README.md for steps of installation. 
''' + +from tts_infer.tts import TextToMel, MelToWav +from tts_infer.transliterate import XlitEngine +from tts_infer.num_to_word_on_sent import normalize_nums + +import re +import numpy as np +from scipy.io.wavfile import write + +from mosestokenizer import * +from indicnlp.tokenize import sentence_tokenize + +INDIC = ["as", "bn", "gu", "hi", "kn", "ml", "mr", "or", "pa", "ta", "te"] + +def split_sentences(paragraph, language): + if language == "en": + with MosesSentenceSplitter(language) as splitter: + return splitter([paragraph]) + elif language in INDIC: + return sentence_tokenize.sentence_split(paragraph, lang=language) + + +device='cpu' +text_to_mel = TextToMel(glow_model_dir='/path/to/glow_ckp', device=device) +mel_to_wav = MelToWav(hifi_model_dir='/path/to/hifi_ckp', device=device) + +lang='hi' # transliteration from En to Hi +engine = XlitEngine(lang) # loading translit model globally + +def translit(text, lang): + reg = re.compile(r'[a-zA-Z]') + words = [engine.translit_word(word, topk=1)[lang][0] if reg.match(word) else word for word in text.split()] + updated_sent = ' '.join(words) + return updated_sent + +def run_tts(text, lang): + text = text.replace('।', '.') # only for hindi models + text_num_to_word = normalize_nums(text, lang) # converting numbers to words in lang + text_num_to_word_and_transliterated = translit(text_num_to_word, lang) # transliterating english words to lang + final_text = ' ' + text_num_to_word_and_transliterated + + mel = text_to_mel.generate_mel(final_text) + audio, sr = mel_to_wav.generate_wav(mel) + write(filename='temp.wav', rate=sr, data=audio) # for saving wav file, if needed + return (sr, audio) + +def run_tts_paragraph(text, lang): + audio_list = [] + split_sentences_list = split_sentences(text, language='hi') + + for sent in split_sentences_list: + sr, audio = run_tts(sent, lang) + audio_list.append(audio) + + concatenated_audio = np.concatenate([i for i in audio_list]) + write(filename='temp_long.wav', rate=sr, data=concatenated_audio) + return (sr, concatenated_audio) + +if __name__ == "__main__": + _, audio = run_tts('mera naam neeraj hai', 'hi') + + para = ''' + भारत मेरा देश है और मुझे भारतीय होने पर गर्व है। ये विश्व का सातवाँ सबसे बड़ा और विश्व में दूसरा सबसे अधिक जनसंख्या वाला देश है। + इसे भारत, हिन्दुस्तान और आर्यव्रत के नाम से भी जाना जाता है। ये एक प्रायद्वीप है जो पूरब में बंगाल की खाड़ी, + पश्चिम में अरेबियन सागर और दक्षिण में भारतीय महासागर जैसे तीन महासगरों से घिरा हुआ है। + भारत का राष्ट्रीय पशु चीता, राष्ट्रीय पक्षी मोर, राष्ट्रीय फूल कमल, और राष्ट्रीय फल आम है। + भारत मेरा देश है और मुझे भारतीय होने पर गर्व है। ये विश्व का सातवाँ सबसे बड़ा और विश्व में दूसरा सबसे अधिक जनसंख्या वाला देश है। + इसे भारत, हिन्दुस्तान और आर्यव्रत के नाम से भी जाना जाता है। ये एक प्रायद्वीप है जो पूरब में बंगाल की खाड़ी, + पश्चिम में अरेबियन सागर और दक्षिण में भारतीय महासागर जैसे तीन महासगरों से घिरा हुआ है। + भारत का राष्ट्रीय पशु चीता, राष्ट्रीय पक्षी मोर, राष्ट्रीय फूल कमल, और राष्ट्रीय फल आम है। + भारत मेरा देश है और मुझे भारतीय होने पर गर्व है। ये विश्व का सातवाँ सबसे बड़ा और विश्व में दूसरा सबसे अधिक जनसंख्या वाला देश है। + इसे भारत, हिन्दुस्तान और आर्यव्रत के नाम से भी जाना जाता है। ये एक प्रायद्वीप है जो पूरब में बंगाल की खाड़ी, + पश्चिम में अरेबियन सागर और दक्षिण में भारतीय महासागर जैसे तीन महासगरों से घिरा हुआ है। + भारत का राष्ट्रीय पशु चीता, राष्ट्रीय पक्षी मोर, राष्ट्रीय फूल कमल, और राष्ट्रीय फल आम है। + ''' + + print('Num chars in paragraph: ', len(para)) + _, audio_long = run_tts_paragraph(para, 'hi') diff --git 
a/ttsv/tts_infer/num_to_word_on_sent.py b/ttsv/tts_infer/num_to_word_on_sent.py new file mode 100644 index 0000000000000000000000000000000000000000..de571c2be63fa467491d01daf0e2f38dada67de9 --- /dev/null +++ b/ttsv/tts_infer/num_to_word_on_sent.py @@ -0,0 +1,1319 @@ +import re +import string + +# ----------------------------- indic_num.py ----------------------------- +supported_lang = {"en", "hi", "gu", "mr", "bn", "te", "ta", "kn", "or", "pa"} +# supported_lang = {'eng', 'hin', 'guj', 'mar', 'ben', 'tel', 'tam', 'kan', 'ori', 'pan'} # Three alphabet lang code + +all_num = { + "en": ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"], + "hi": ["०", "१", "२", "३", "४", "५", "६", "७", "८", "९"], + "gu": ["૦", "૧", "૨", "૩", "૪", "૫", "૬", "૭", "૮", "૯"], + "mr": ["०", "१", "२", "३", "४", "५", "६", "७", "८", "९"], + "bn": ["০", "১", "২", "৩", "৪", "৫", "৬", "৭", "৮", "৯"], + "te": ["౦", "౧", "౨", "౩", "౪", "౫", "౬", "౭", "౮", "౯"], + "ta": ["0", "௧", "௨", "௩", "௪", "௫", "௬", "௭", "௮", "௯", "௰"], + "kn": ["೦", "೧", "೨", "೩", "೪", "೫", "೬", "೭", "೮", "೯"], + "or": ["୦", "୧", "୨", "୩", "୪", "୫", "୬", "୭", "୮", "୯"], + "pa": ["੦", "੧", "੨", "੩", "੪", "੫", "੬", "੭", "੮", "੯"], +} + +num_dict = dict() +num_dict["en"] = { + "0": "zero", + "1": "one", + "2": "two", + "3": "three", + "4": "four", + "5": "five", + "6": "six", + "7": "seven", + "8": "eight", + "9": "nine", + "10": "ten", + "11": "eleven", + "12": "twelve", + "13": "thirteen", + "14": "fourteen", + "15": "fifteen", + "16": "sixteen", + "17": "seventeen", + "18": "eighteen", + "19": "nineteen", + "20": "twenty", + "21": "twenty-one", + "22": "twenty-two", + "23": "twenty-three", + "24": "twenty-four", + "25": "twenty-five", + "26": "twenty-six", + "27": "twenty-seven", + "28": "twenty-eight", + "29": "twenty-nine", + "30": "thirty", + "31": "thirty-one", + "32": "thirty-two", + "33": "thirty-three", + "34": "thirty-four", + "35": "thirty-five", + "36": "thirty-six", + "37": "thirty-seven", + "38": "thirty-eight", + "39": "thirty-nine", + "40": "forty", + "41": "forty-one", + "42": "forty-two", + "43": "forty-three", + "44": "forty-four", + "45": "forty-five", + "46": "forty-six", + "47": "forty-seven", + "48": "forty-eight", + "49": "forty-nine", + "50": "fifty", + "51": "fifty-one", + "52": "fifty-two", + "53": "fifty-three", + "54": "fifty-four", + "55": "fifty-five", + "56": "fifty-six", + "57": "fifty-seven", + "58": "fifty-eight", + "59": "fifty-nine", + "60": "sixty", + "61": "sixty-one", + "62": "sixty-two", + "63": "sixty-three", + "64": "sixty-four", + "65": "sixty-five", + "66": "sixty-six", + "67": "sixty-seven", + "68": "sixty-eight", + "69": "sixty-nine", + "70": "seventy", + "71": "seventy-one", + "72": "seventy-two", + "73": "seventy-three", + "74": "seventy-four", + "75": "seventy-five", + "76": "seventy-six", + "77": "seventy-seven", + "78": "seventy-eight", + "79": "seventy-nine", + "80": "eighty", + "81": "eighty-one", + "82": "eighty-two", + "83": "eighty-three", + "84": "eighty-four", + "85": "eighty-five", + "86": "eighty-six", + "87": "eighty-seven", + "88": "eighty-eight", + "89": "eighty-nine", + "90": "ninety", + "91": "ninety-one", + "92": "ninety-two", + "93": "ninety-three", + "94": "ninety-four", + "95": "ninety-five", + "96": "ninety-six", + "97": "ninety-seven", + "98": "ninety-eight", + "99": "ninety-nine", + "100": "hundred", + "1000": "thousand", + "100000": "lac", + "10000000": "crore", + "1000000000": "arab", +} # English-India +num_dict["hi"] = { + "0": "शून्य", + "1": "एक", + "2": "दो", + "3": "तीन", 
+ "4": "चार", + "5": "पाँच", + "6": "छः", + "7": "सात", + "8": "आठ", + "9": "नौ", + "10": "दस", + "11": "ग्यारह", + "12": "बारह", + "13": "तेरह", + "14": "चौदह", + "15": "पंद्रह", + "16": "सोलह", + "17": "सत्रह", + "18": "अट्ठारह", + "19": "उन्नीस", + "20": "बीस", + "21": "इक्कीस", + "22": "बाईस", + "23": "तेईस", + "24": "चौबिस", + "25": "पच्चीस", + "26": "छब्बीस", + "27": "सत्ताईस", + "28": "अट्ठाईस", + "29": "उनतीस", + "30": "तीस", + "31": "इकतीस", + "32": "बत्तीस", + "33": "तैंतीस", + "34": "चौंतीस", + "35": "पैंतीस", + "36": "छत्तीस", + "37": "सैंतीस", + "38": "अड़तीस", + "39": "उनतालीस", + "40": "चालीस", + "41": "इकतालीस", + "42": "बयालीस", + "43": "तैंतालीस", + "44": "चौंतालीस", + "45": "पैंतालीस", + "46": "छियालीस", + "47": "सैंतालीस", + "48": "अड़तालीस", + "49": "उनचास", + "50": "पचास", + "51": "इक्यावन​", + "52": "बावन", + "53": "तिरेपन", + "54": "चौवन", + "55": "पचपन", + "56": "छप्पन", + "57": "सत्तावन", + "58": "अट्ठावन", + "59": "उनसठ", + "60": "साठ", + "61": "इकसठ", + "62": "बासठ", + "63": "तिरेसठ", + "64": "चौंसठ", + "65": "पैंसठ", + "66": "छयासठ", + "67": "सरसठ​", + "68": "अड़सठ", + "69": "उनहत्तर", + "70": "सत्तर", + "71": "इकहत्तर", + "72": "बहत्तर", + "73": "तिहत्तर", + "74": "चौहत्तर", + "75": "पचहत्तर", + "76": "छिहत्तर", + "77": "सतहत्तर", + "78": "अठहत्तर", + "79": "उन्यासी", + "80": "अस्सी", + "81": "इक्यासी", + "82": "बयासी", + "83": "तिरासी", + "84": "चौरासी", + "85": "पचासी", + "86": "छियासी", + "87": "सत्तासी", + "88": "अठासी", + "89": "नवासी", + "90": "नब्बे", + "91": "इक्यानवे", + "92": "बानवे", + "93": "तिरानवे", + "94": "चौरानवे", + "95": "पचानवे", + "96": "छियानवे", + "97": "सत्तानवे", + "98": "अट्ठानवे", + "99": "निन्यानवे", + "100": "सौ", + "1000": "हज़ार", + "100000": "लाख", + "10000000": "करोड़", + "1000000000": "अरब", +} # Hindi +num_dict["gu"] = { + "0": "શૂન્ય", + "1": "એક", + "2": "બે", + "3": "ત્રણ", + "4": "ચાર", + "5": "પાંચ", + "6": "છ", + "7": "સાત", + "8": "આઠ", + "9": "નવ", + "10": "દસ", + "11": "અગિયાર", + "12": "બાર", + "13": "તેર", + "14": "ચૌદ", + "15": "પંદર", + "16": "સોળ", + "17": "સત્તર", + "18": "અઢાર", + "19": "ઓગણિસ", + "20": "વીસ", + "21": "એકવીસ", + "22": "બાવીસ", + "23": "તેવીસ", + "24": "ચોવીસ", + "25": "પચ્ચીસ", + "26": "છવીસ", + "27": "સત્તાવીસ", + "28": "અઠ્ઠાવીસ", + "29": "ઓગણત્રીસ", + "30": "ત્રીસ", + "31": "એકત્રીસ", + "32": "બત્રીસ", + "33": "તેત્રીસ", + "34": "ચોત્રીસ", + "35": "પાંત્રીસ", + "36": "છત્રીસ", + "37": "સડત્રીસ", + "38": "અડત્રીસ", + "39": "ઓગણચાલીસ", + "40": "ચાલીસ", + "41": "એકતાલીસ", + "42": "બેતાલીસ", + "43": "ત્રેતાલીસ", + "44": "ચુંમાલીસ", + "45": "પિસ્તાલીસ", + "46": "છેતાલીસ", + "47": "સુડતાલીસ", + "48": "અડતાલીસ", + "49": "ઓગણપચાસ", + "50": "પચાસ", + "51": "એકાવન", + "52": "બાવન", + "53": "ત્રેપન", + "54": "ચોપન", + "55": "પંચાવન", + "56": "છપ્પન", + "57": "સત્તાવન", + "58": "અઠ્ઠાવન", + "59": "ઓગણસાઠ", + "60": "સાઈઠ", + "61": "એકસઠ", + "62": "બાસઠ", + "63": "ત્રેસઠ", + "64": "ચોસઠ", + "65": "પાંસઠ", + "66": "છાસઠ", + "67": "સડસઠ", + "68": "અડસઠ", + "69": "અગણોસિત્તેર", + "70": "સિત્તેર", + "71": "એકોતેર", + "72": "બોતેર", + "73": "તોતેર", + "74": "ચુમોતેર", + "75": "પંચોતેર", + "76": "છોતેર", + "77": "સિત્યોતેર", + "78": "ઇઠ્યોતેર", + "79": "ઓગણાએંસી", + "80": "એંસી", + "81": "એક્યાસી", + "82": "બ્યાસી", + "83": "ત્યાસી", + "84": "ચોર્યાસી", + "85": "પંચાસી", + "86": "છ્યાસી", + "87": "સિત્યાસી", + "88": "ઈઠ્યાસી", + "89": "નેવ્યાસી", + "90": "નેવું", + "91": "એકાણું", + "92": "બાણું", + "93": "ત્રાણું", + "94": "ચોરાણું", + "95": "પંચાણું", + "96": "છન્નું", + "97": "સત્તાણું", + "98": "અઠ્ઠાણું", 
+ "99": "નવ્વાણું", + "100": "સો", + "1000": "હજાર", + "100000": "લાખ", + "1000000": "દસ લાખ", + "10000000": "કરોડ઼", +} # Gujarati +num_dict["mr"] = { + "0": "शून्य", + "1": "एक", + "2": "दोन", + "3": "तीन", + "4": "चार", + "5": "पाच", + "6": "सहा", + "7": "सात", + "8": "आठ", + "9": "नऊ", + "10": "दहा", + "11": "अकरा", + "12": "बारा", + "13": "तेरा", + "14": "चौदा", + "15": "पंधरा", + "16": "सोळा", + "17": "सतरा", + "18": "अठरा", + "19": "एकोणीस", + "20": "वीस", + "21": "एकवीस", + "22": "बावीस", + "23": "तेवीस", + "24": "चोवीस", + "25": "पंचवीस", + "26": "सव्वीस", + "27": "सत्तावीस", + "28": "अठ्ठावीस", + "29": "एकोणतीस", + "30": "तीस", + "31": "एकतीस", + "32": "बत्तीस", + "33": "तेहेतीस", + "34": "चौतीस", + "35": "पस्तीस", + "36": "छत्तीस", + "37": "सदतीस", + "38": "अडतीस", + "39": "एकोणचाळीस", + "40": "चाळीस", + "41": "एक्केचाळीस", + "42": "बेचाळीस", + "43": "त्रेचाळीस", + "44": "चव्वेचाळीस", + "45": "पंचेचाळीस", + "46": "सेहेचाळीस", + "47": "सत्तेचाळीस", + "48": "अठ्ठेचाळीस", + "49": "एकोणपन्नास", + "50": "पन्नास", + "51": "एक्कावन्न", + "52": "बावन्न", + "53": "त्रेपन्न", + "54": "चोपन्न", + "55": "पंचावन्न", + "56": "छप्पन्न", + "57": "सत्तावन्न", + "58": "अठ्ठावन्न", + "59": "एकोणसाठ", + "60": "साठ", + "61": "एकसष्ठ", + "62": "बासष्ठ", + "63": "त्रेसष्ठ", + "64": "चौसष्ठ", + "65": "पासष्ठ", + "66": "सहासष्ठ", + "67": "सदुसष्ठ", + "68": "अडुसष्ठ", + "69": "एकोणसत्तर", + "70": "सत्तर", + "71": "एक्काहत्तर", + "72": "बाहत्तर", + "73": "त्र्याहत्तर", + "74": "चौर्‍याहत्तर", + "75": "पंच्याहत्तर", + "76": "शहात्तर", + "77": "सत्याहत्तर", + "78": "अठ्ठ्याहत्तर", + "79": "एकोण ऐंशी", + "80": "ऐंशी", + "81": "एक्क्याऐंशी", + "82": "ब्याऐंशी", + "83": "त्र्याऐंशी", + "84": "चौऱ्याऐंशी", + "85": "पंच्याऐंशी", + "86": "शहाऐंशी", + "87": "सत्त्याऐंशी", + "88": "अठ्ठ्याऐंशी", + "89": "एकोणनव्वद", + "90": "नव्वद", + "91": "एक्क्याण्णव", + "92": "ब्याण्णव", + "93": "त्र्याण्णव", + "94": "चौऱ्याण्णव", + "95": "पंच्याण्णव", + "96": "शहाण्णव", + "97": "सत्त्याण्णव", + "98": "अठ्ठ्याण्णव", + "99": "नव्व्याण्णव", + "100": "शे", + "1000": "हजार", + "100000": "लाख", + "10000000": "कोटी", + "1000000000": "अब्ज", +} # Marathi +num_dict["bn"] = { + "0": "শূন্য", + "1": "এক", + "2": "দুই", + "3": "তিন", + "4": "চার", + "5": "পাঁচ", + "6": "ছয়", + "7": "সাত", + "8": "আট", + "9": "নয়", + "10": "দশ", + "11": "এগার", + "12": "বার", + "13": "তের", + "14": "চৌদ্দ", + "15": "পনের", + "16": "ষোল", + "17": "সতের", + "18": "আঠার", + "19": "ঊনিশ", + "20": "বিশ", + "21": "একুশ", + "22": "বাইশ", + "23": "তেইশ", + "24": "চব্বিশ", + "25": "পঁচিশ", + "26": "ছাব্বিশ", + "27": "সাতাশ", + "28": "আঠাশ", + "29": "ঊনত্রিশ", + "30": "ত্রিশ", + "31": "একত্রিশ", + "32": "বত্রিশ", + "33": "তেত্রিশ", + "34": "চৌত্রিশ", + "35": "পঁয়ত্রিশ", + "36": "ছত্রিশ", + "37": "সাঁইত্রিশ", + "38": "আটত্রিশ", + "39": "ঊনচল্লিশ", + "40": "চল্লিশ", + "41": "একচল্লিশ", + "42": "বিয়াল্লিশ", + "43": "তেতাল্লিশ", + "44": "চুয়াল্লিশ", + "45": "পঁয়তাল্লিশ", + "46": "ছেচল্লিশ", + "47": "সাতচল্লিশ", + "48": "আটচল্লিশ", + "49": "ঊনপঞ্চাশ", + "50": "পঞ্চাশ", + "51": "একান্ন", + "52": "বায়ান্ন", + "53": "তিপ্পান্ন", + "54": "চুয়ান্ন", + "55": "পঞ্চান্ন", + "56": "ছাপ্পান্ন", + "57": "সাতান্ন", + "58": "আটান্ন", + "59": "ঊনষাট", + "60": "ষাট", + "61": "একষট্টি", + "62": "বাষট্টি", + "63": "তেষট্টি", + "64": "চৌষট্টি", + "65": "পঁয়ষট্টি", + "66": "ছেষট্টি", + "67": "সাতষট্টি", + "68": "আটষট্টি", + "69": "ঊনসত্তর", + "70": "সত্তর", + "71": "একাত্তর", + "72": "বাহাত্তর", + "73": "তিয়াত্তর", + "74": "চুয়াত্তর", + "75": "পঁচাত্তর", + "76": "ছিয়াত্তর", + 
"77": "সাতাত্তর", + "78": "আটাত্তর", + "79": "ঊনআশি", + "80": "আশি", + "81": "একাশি", + "82": "বিরাশি", + "83": "তিরাশি", + "84": "চুরাশি", + "85": "পঁচাশি", + "86": "ছিয়াশি", + "87": "সাতাশি", + "88": "আটাশি", + "89": "ঊননব্বই", + "90": "নব্বই", + "91": "একানব্বই", + "92": "বিরানব্বই", + "93": "তিরানব্বই", + "94": "চুরানব্বই", + "95": "পঁচানব্বই", + "96": "ছিয়ানব্বই", + "97": "সাতানব্বই", + "98": "আটানব্বই", + "99": "নিরানব্বই", + "100": "শো", + "1000": "হাজার", + "100000": "লাখ", + "10000000": "কোটি", + "1000000000": "একশ’ কোটি", +} # Bengali +num_dict["te"] = { + "0": "సున్నా", + "1": "ఒకటి", + "2": "రెండు", + "3": "మూడు", + "4": "నాలుగు", + "5": "ఐదు", + "6": "ఆరు", + "7": "ఏడు", + "8": "ఎనిమిది", + "9": "తొమ్మిది", + "10": "పది", + "11": "పదకొండు", + "12": "పన్నెండు", + "13": "పదమూడు", + "14": "పద్నాలుగు", + "15": "పదిహేను", + "16": "పదహారు", + "17": "పదిహేడు", + "18": "పద్దెనిమిది", + "19": "పందొమ్మిది", + "20": "ఇరవై", + "21": "ఇరవై ఒకటి", + "22": "ఇరవై రెండు", + "23": "ఇరవై మూడు", + "24": "ఇరవై నాలుగు", + "25": "ఇరవై ఐదు", + "26": "ఇరవై ఆరు", + "27": "ఇరవై ఏడు", + "28": "ఇరవై ఎనిమిది", + "29": "ఇరవై తొమ్మిది", + "30": "ముప్పై", + "31": "ముప్పై ఒకటి", + "32": "ముప్పై రెండు", + "33": "ముప్పై మూడు", + "34": "ముప్పై నాలుగు", + "35": "ముప్పై ఐదు", + "36": "ముప్పై ఆరు", + "37": "ముప్పై ఏడు", + "38": "ముప్పై ఎనిమిది", + "39": "ముప్పై తొమ్మిది", + "40": "నలభై", + "41": "నలభై ఒకటి", + "42": "నలభై రెండు", + "43": "నలభై మూడు", + "44": "నలభై నాలుగు", + "45": "నలభై ఐదు", + "46": "నలభై ఆరు", + "47": "నలభై ఏడు", + "48": "నలభై ఎనిమిది", + "49": "నలభై తొమ్మిది", + "50": "యాభై", + "51": "యాభై ఒకటి", + "52": "యాభై రెండు", + "53": "యాభై మూడు", + "54": "యాభై నాలుగు", + "55": "యాభై ఐదు", + "56": "యాభై ఆరు", + "57": "యాభై ఏడు", + "58": "యాభై ఎనిమిది", + "59": "యాభై తొమ్మిది", + "60": "అరవై", + "61": "అరవై ఒకటి", + "62": "అరవై రెండు", + "63": "అరవై మూడు", + "64": "అరవై నాలుగు", + "65": "అరవై ఐదు", + "66": "అరవై ఆరు", + "67": "అరవై ఏడు", + "68": "అరవై ఎనిమిది", + "69": "అరవై తొమ్మిది", + "70": "డెబ్బై", + "71": "డెబ్బై ఒకటి", + "72": "డెబ్బై రెండు", + "73": "డెబ్బై మూడు", + "74": "డెబ్బై నాలుగు", + "75": "డెబ్బై ఐదు", + "76": "డెబ్బై ఆరు", + "77": "డెబ్బై ఏడు", + "78": "డెబ్బై ఎనిమిది", + "79": "డెబ్బై తొమ్మిది", + "80": "ఎనభై", + "81": "ఎనభై ఒకటి", + "82": "ఎనభై రెండు", + "83": "ఎనభై మూడు", + "84": "ఎనభై నాలుగు", + "85": "ఎనభై ఐదు", + "86": "ఎనభై ఆరు", + "87": "ఎనభై ఏడు", + "88": "ఎనభై ఎనిమిది", + "89": "ఎనభై తొమ్మిది", + "90": "తొంభై", + "91": "తొంభై ఒకటి", + "92": "తొంభై రెండు", + "93": "తొంభై మూడు", + "94": "తొంభై నాలుగు", + "95": "తొంభై ఐదు", + "96": "తొంభై ఆరు", + "97": "తొంభై ఏడు", + "98": "తొంభై ఎనిమిది", + "99": "తొంభై తొమ్మిది", + "100": "వందల", + "1000": "వేల", + "100000": "లక్షల", + "10000000": "కోట్ల", + "1000000000": "బిలియన్", +} # Telugu +num_dict["ta"] = { + "0": "பூஜ்ஜியம்", + "1": "ஒன்று", + "2": "இரண்டு", + "3": "மூன்று", + "4": "நான்கு", + "5": "ஐந்து", + "6": "ஆறு", + "7": "ஏழு", + "8": "எட்டு", + "9": "ஒன்பது", + "10": "பத்து", + "11": "பதினொன்று", + "12": "பன்னிரண்டு", + "13": "பதிமூன்று", + "14": "பதினான்கு", + "15": "பதினைந்து", + "16": "பதினாறு", + "17": "பதினேழு", + "18": "பதினெட்டு", + "19": "பத்தொன்பது", + "20": "இருபது", + "21": "இருபது ஒன்று", + "22": "இருபத்து இரண்டு", + "23": "இருபத்து மூன்று", + "24": "இருபத்து நான்கு", + "25": "இருபத்து ஐந்து", + "26": "இருபத்து ஆறு", + "27": "இருபத்து ஏழு", + "28": "இருபத்து எட்டு", + "29": "இருபத்து ஒன்பது", + "30": "முப்பது", + "31": "முப்பத்து ஒன்று", + "32": "முப்பத்து இரண்டு", + "33": "முப்பத்து மூன்று", + "34": "முப்பத்து 
நான்கு", + "35": "முப்பத்து ஐந்து", + "36": "முப்பத்து ஆறு", + "37": "முப்பத்து ஏழு", + "38": "முப்பத்து எட்டு", + "39": "முப்பத்து ஒன்பது", + "40": "நாற்பது", + "41": "நாற்பத்து ஒன்று", + "42": "நாற்பத்து இரண்டு", + "43": "நாற்பத்து மூன்று", + "44": "நாற்பத்து நான்கு", + "45": "நாற்பத்து ஐந்து", + "46": "நாற்பத்து ஆறு", + "47": " நாற்பத்து ஏழு", + "48": "நாற்பத்து எட்டு", + "49": "நாற்பத்து ஒன்பது", + "50": "ஐம்பது", + "51": "ஐம்பத்து ஒன்று", + "52": "ஐம்பத்து இரண்டு", + "53": "ஐம்பத்து மூன்று", + "54": "ஐம்பத்து நான்கு", + "55": "ஐம்பத்து ஐந்து", + "56": "ஐம்பத்து ஆறு", + "57": "ஐம்பத்து ஏழு", + "58": "ஐம்பத்து எட்டு", + "59": "ஐம்பத்து ஒன்பது", + "60": "அறுபது", + "61": "அறுபத்து ஒன்று", + "62": "அறுபத்து இரண்டு", + "63": "அறுபத்து மூன்று", + "64": "அறுபத்து நான்கு", + "65": "அறுபத்து ஐந்து", + "66": "அறுபத்து ஆறு", + "67": "அறுபத்து ஏழு", + "68": "அறுபத்து எட்டு", + "69": "அறுபத்து ஒன்பது", + "70": "எழுபது", + "71": "எழுபத்தி ஒன்று", + "72": "எழுபத்தி இரண்டு", + "73": "எழுபத்தி முச்சக்கர", + "74": "எழுபத்தி நான்கு", + "75": "எழுபத்தி ஐந்து", + "76": "எழுபத்தி ஆறு", + "77": "எழுபத்தி ஏழு", + "78": "எழுபத்தி எட்டு", + "79": "எழுபத்தி ஒன்பது", + "80": "எண்பது", + "81": "எண்பத்தியொன்று", + "82": "எண்பத்திரண்டு", + "83": "எண்பத்திமூன்று", + "84": "என்பதினான்கு", + "85": "என்பதினைந்து", + "86": "எண்பத்திஆறு", + "87": "எண்பத்திஏழு", + "88": "எண்பத்தியெட்டு", + "89": "எண்பத்தியொன்பது", + "90": "தொன்னூறு", + "91": "தொண்ணூற்றியொன்று", + "92": "தொண்ணூற்றிரண்டு", + "93": "தொண்ணூற்றிமூன்று", + "94": "தொண்ணூற்றிநான்கு", + "95": "தொண்ணூற்றிஐந்து", + "96": "தொண்ணூற்றியாறு", + "97": "தொண்ணூற்றியேழு", + "98": "தொண்ணூற்றியெட்டு", + "99": "தொண்ணூற்றிஒன்பது", + "100": "நூறு", + "1000": "ஆயிரம்", + "100000": "இலட்சம்", + "10000000": "கோடி", + "1000000000": "பில்லியன்", +} # Tamil +num_dict["kn"] = { + "0": "ಸೊನ್ನೆ", + "1": "ಒಂದು", + "2": "ಎರಡು", + "3": "ಮೂರು", + "4": "ನಾಲ್ಕು", + "5": "ಅಯ್ದು", + "6": "ಆರು", + "7": "ಏಳು", + "8": "ಎಂಟು", + "9": "ಒಂಬತ್ತು", + "10": "ಹತ್ತು", + "11": "ಹನ್ನೊಂದು", + "12": "ಹನ್ನೆರಡು", + "13": "ಹದಿಮೂರು", + "14": "ಹದಿನಾಲ್ಕು", + "15": "ಹದಿನೈದು", + "16": "ಹದಿನಾರು", + "17": "ಹದಿನೇಳು", + "18": "ಹದಿನೆಂಟು", + "19": "ಹತ್ತೊಂಬತ್ತು", + "20": "ಇಪ್ಪತ್ತು", + "21": "ಇಪ್ಪತ್ತ್’ಒಂದು", + "22": "ಇಪ್ಪತ್ತ್’ಎರಡು", + "23": "ಇಪ್ಪತ್ತ್’ಮೂರು", + "24": "ಇಪ್ಪತ್ತ್’ನಾಲ್ಕು", + "25": "ಇಪ್ಪತ್ತ್’ಐದು", + "26": "ಇಪ್ಪತ್ತ್’ಆರು", + "27": "ಇಪ್ಪತ್ತ್’ಏಳು", + "28": "ಇಪ್ಪತ್ತ್’ಎಂಟು", + "29": "ಇಪ್ಪತ್ತ್’ಒಂಬತ್ತು", + "30": "ಮೂವತ್ತು", + "31": "ಮುವತ್ತ್’ಒಂದು", + "32": "ಮುವತ್ತ್’ಎರಡು", + "33": "ಮುವತ್ತ್’ಮೂರು", + "34": "ಮೂವತ್ತ್’ನಾಲ್ಕು", + "35": "ಮೂವತ್ತ್’ಐದು", + "36": "ಮೂವತ್ತ್’ಆರು", + "37": "ಮೂವತ್ತ್’ಏಳು", + "38": "ಮೂವತ್ತ್’ಎಂಟು", + "39": "ಮೂವತ್ತ್’ಒಂಬತ್ತು", + "40": "ನಲವತ್ತು", + "41": "ನಲವತ್ತೊಂದು", + "42": "ನಲವತ್ತ್ ಎರಡು", + "43": "ನಲವತ್ತ್ ಮೂರು", + "44": "ನಲವತ್ತ್ ನಾಲ್ಕು", + "45": "ನಲವತ್ತೈದು", + "46": "ನಲವತ್ತಾರು", + "47": "ನಲವತ್ತೇಳು", + "48": "ನಲವತ್ತೆಂಟು", + "49": "ನಲವತ್ತೊಂಬತ್ತು", + "50": "ಐವತ್ತು", + "51": "ಐವತ್ತೊಂದು", + "52": "ಐವತ್ತೆರಡು", + "53": "ಐವತ್ತಮೂರು", + "54": "ಐವತ್ತ್ನಾಲ್ಕು", + "55": "ಐವತ್ತೈದು", + "56": "ಐವತ್ತಾರು", + "57": "ಐವತ್ತೇಳು", + "58": "ಐವತ್ತೆಂಟು", + "59": "ಐವತ್ತೊಂಬತ್ತು", + "60": "ಅರವತ್ತು", + "61": "ಅರವತ್ತೊಂದು", + "62": "ಅರವತ್ತೆರಡು", + "63": "ಅರವತ್ತ್ ಮೂರು", + "64": "ಅರವತ್ತ್ ನಾಲ್ಕು", + "65": "ಅರವತ್ತೈದು", + "66": "ಅರವತ್ತಾರು", + "67": "ಅರವತ್ತೇಳು", + "68": "ಅರವತ್ತೆಂಟು", + "69": "ಅರವತ್ತೊಂಬತ್ತು", + "70": "ಎಪ್ಪತ್ತು", + "71": "ಎಪ್ಪತ್ತೊಂದು", + "72": "ಎಪ್ಪತ್ತೆರಡು", + "73": "ಎಪ್ಪತ್ತ್ ಮೂರು", + "74": "ಎಪ್ಪತ್ತ್ ನಾಲ್ಕು", + "75": "ಎಪ್ಪತ್ತೈದು", + "76": "ಎಪ್ಪತ್ತಾರು", + "77": "ಎಪ್ಪತ್ತೇಳು", + "78": "ಎಪ್ಪತ್ತೆಂಟು", + "79": "ಎಪ್ಪತ್ತೊಂಬತ್ತು", + "80": 
"ಎಂಬತ್ತು", + "81": "ಎಂಬತ್ತೊಂದು", + "82": "ಎಂಬತ್ತೆರಡು", + "83": "ಎಂಬತ್ತ್ ಮೂರು", + "84": "ಎಂಬತ್ತ್ ನಾಲ್ಕು", + "85": "ಎಂಬತ್ತೈದು", + "86": "ಎಂಬತ್ತಾರು", + "87": "ಎಂಬತ್ತೇಳು", + "88": "ಎಂಬತ್ತೆಂಟು", + "89": "ಎಂಬತ್ತೊಂಬತ್ತು", + "90": "ತೊಂಬತ್ತು", + "91": "ತೊಂಬತ್ತೊಂದು", + "92": "ತೊಂಬತ್ತೆರಡು", + "93": "ತೊಂಬತ್ತ ಮೂರು", + "94": "ತೊಂಬತ್ತ ನಾಲ್ಕು", + "95": "ತೊಂಬತ್ತೈದು", + "96": "ತೊಂಬತ್ತಾರು", + "97": "ತೊಂಬತ್ತೇಳು", + "98": "ತೊಂಬತ್ತೆಂಟು", + "99": "ತೊಂಬತ್ತೊಂಬತ್ತು", + "100": "ನೂರ", + "1000": "ಸಾವಿರದ", + "100000": "ಲಕ್ಷದ", + "10000000": "ಕೋಟಿ", + "1000000000": "ಶತಕೋಟಿ", +} # Kannada +num_dict["or"] = { + "0": "ଶୁନ୍ୟ", + "1": "ଏକ", + "2": "ଦୁଇ", + "3": "ତିନି", + "4": "ଚାରି", + "5": "ପାଞ୍ଚ", + "6": "ଛଅ", + "7": "ସାତ", + "8": "ଆଠ", + "9": "ନଅ", + "10": "ନଅ", + "11": "ଏଗାର", + "12": "ବାର", + "13": "ତେର", + "14": "ଚଉଦ", + "15": "ପନ୍ଦର", + "16": "ଷୋହଳ", + "17": "ସତର", + "18": "ଅଠର", + "19": "ଊଣାଇଶ", + "20": "କୋଡିଏ", + "21": "ଏକୋଇଶି", + "22": "ବାଇଶି", + "23": "ତେଇଶି", + "24": "ଚବିଶି", + "25": "ପଚିଶି", + "26": "ଛବିଶି", + "27": "ସତାଇଶି", + "28": "ଅଠାଇଶି", + "29": "ଅଣତିରିଶି", + "30": "ତିରିଶି", + "31": "ଏକତିରିଶି", + "32": "ବତିଶି", + "33": "ତେତିଶି", + "34": "ଚଉତିରିଶି", + "35": "ପଞ୍ଚତିରିଶି", + "36": "ଛତିଶି", + "37": "ସଂଇତିରିଶି", + "38": "ଅଠତିରିଶି", + "39": "ଅଣଚାଳିଶି", + "40": "ଚାଳିଶି", + "41": "ଏକଚାଳିଶି", + "42": "ବୟାଳିଶି", + "43": "ତେୟାଳିଶି", + "44": "ଚଉରାଳିଶି", + "45": "ପଞ୍ଚଚାଳିଶି", + "46": "ଛୟାଳିଶି", + "47": "ସତଚାଳିଶି", + "48": "ଅଠଚାଳିଶି", + "49": "ଅଣଚାଶ", + "50": "ପଚାଶ", + "51": "ଏକାବନ", + "52": "ବାଉନ", + "53": "ତେପନ", + "54": "ଚଉବନ", + "55": "ପଞ୍ଚାବନ", + "56": "ଛପନ", + "57": "ସତାବନ", + "58": "ଅଠାବନ", + "59": "ଅଣଷଠି", + "60": "ଷାଠିଏ", + "61": "ଏକଷଠି", + "62": "ବାଷଠି", + "63": "ତେଷଠି", + "64": "ଚଉଷଠି", + "65": "ପଞ୍ଚଷଠି", + "66": "ଛଅଷଠି", + "67": "ସତଷଠି", + "68": "ଅଠଷଠି", + "69": "ଅଣସ୍ତରୀ", + "70": "ସତୂରୀ", + "71": "ଏକସ୍ତରୀ", + "72": "ବାସ୍ତରୀ", + "73": "ତେସ୍ତରୀ", + "74": "ଚଉସ୍ତରୀ", + "75": "ପଞ୍ଚସ୍ତରୀ", + "76": "ଛଅସ୍ତରୀ", + "77": "ସତସ୍ତରୀ", + "78": "ଅଠସ୍ତରୀ", + "79": "ଅଣାଅଶୀ", + "80": "ଅଶୀ", + "81": "ଏକାଅଶୀ", + "82": "ବୟାଅଶୀ", + "83": "ତେୟାଅଶୀ", + "84": "ଚଉରାଅଶୀ", + "85": "ପଞ୍ଚାଅଶୀ", + "86": "ଛୟାଅଶୀ", + "87": "ସତାଅଶୀ", + "88": "ଅଠାଅଶୀ", + "89": "ଅଣାନବେ", + "90": "ନବେ", + "91": "ଏକାନବେ", + "92": "ବୟାନବେ", + "93": "ତେୟାନବେ", + "94": "ଚଉରାନବେ", + "95": "ପଞ୍ଚାନବେ", + "96": "ଛୟାନବେ", + "97": "ସତାନବେ", + "98": "ଅଠାନବେ", + "99": "ଅନେଶତ", + "100": "ଶହେ", + "1000": "ହଜାର", + "100000": "ଲକ୍ଷ", + "10000000": "କୋଟି", + "1000000000": "କୋଟି", +} # Oriya +num_dict["pa"] = { + "0": "ਸਿਫਰ ", + "1": "ਇੱਕ", + "2": "ਦੋ", + "3": "ਤਿੰਨ", + "4": "ਚਾਰ", + "5": "ਪੰਜ", + "6": "ਛੇ", + "7": "ਸੱਤ", + "8": "ਅੱਠ", + "9": "ਨੌਂ", + "10": "ਦੱਸ", + "11": "ਗਿਆਰਾਂ", + "12": "ਬਾਰਾਂ", + "13": "ਤੇਰਾਂ", + "14": "ਚੌਦਾਂ", + "15": "ਪੰਦਰਾਂ", + "16": "ਸੋਲ਼ਾਂ", + "17": "ਸਤਾਰਾਂ", + "18": "ਅਠਾਰਾਂ", + "19": "ਉਨੀ", + "20": "ਵੀਹ", + "21": "ਇੱਕੀ", + "22": "ਬਾਈ", + "23": "ਤੇਈ", + "24": "ਚੌਵੀ", + "25": "ਪੰਝੀ", + "26": "ਛੱਬੀ", + "27": "ਸਤਾਈ", + "28": "ਅਠਾਈ", + "29": "ਉਨੱਤੀ", + "30": "ਤੀਹ", + "31": "ਇਕੱਤੀ", + "32": "ਬੱਤੀ", + "33": "ਤੇਤੀ", + "34": "ਚੌਂਤੀ", + "35": "ਪੈਂਤੀ", + "36": "ਛੱਤੀ", + "37": "ਸੈਂਤੀ", + "38": "ਅਠੱਤੀ", + "39": "ਉਨਤਾਲੀ", + "40": "ਚਾਲੀ", + "41": "ਇਕਤਾਲੀ", + "42": "ਬਤਾਲੀ", + "43": "ਤਰਤਾਲੀ", + "44": "ਚੌਤਾਲੀ", + "45": "ਪੰਜਤਾਲੀ", + "46": "ਛਿਆਲੀ", + "47": "ਸੰਤਾਲੀ", + "48": "ਅੱਠਤਾਲੀ", + "49": "ਉਣਿੰਜਾ", + "50": "ਪੰਜਾਹ", + "51": "ਇਕਵਿੰਜਾ", + "52": "ਬਵਿੰਜਾ", + "53": "ਤਰਵਿੰਜਾ", + "54": "ਚਰਿੰਜਾ", + "55": "ਪਚਵਿੰਜਾ", + "56": "ਛਪਿੰਜਾ", + "57": "ਸਤਵਿੰਜਾ", + "58": "ਅੱਠਵਿੰਜਾ", + "59": "ਉਣਾਠ", + "60": "ਸੱਠ", + "61": "ਇਕਾਠ", + "62": "ਬਾਠ੍ਹ", + "63": "ਤਰੇਠ੍ਹ", + "64": "ਚੌਠ੍ਹ", + "65": 
"ਪੈਂਠ", + "66": "ਛਿਆਠ", + "67": "ਸਤਾਹਠ", + "68": "ਅੱਠਾਠ", + "69": "ਉਣੱਤਰ", + "70": "ਸੱਤਰ", + "71": "ਇਕ੍ਹੱਤਰ", + "72": "ਬਹੱਤਰ", + "73": "ਤਹੱਤਰ", + "74": "ਚੌਹੱਤਰ", + "75": "ਪੰਜੱਤਰ", + "76": "ਛਿਹੱਤਰ", + "77": "ਸਤੱਤਰ", + "78": "ਅਠੱਤਰ", + "79": "ਉਣਾਸੀ", + "80": "ਅੱਸੀ", + "81": "ਇਕਾਸੀ", + "82": "ਬਿਆਸੀ", + "83": "ਤਰਾਸੀ", + "84": "ਚਰਾਸੀ", + "85": "ਪੰਜਾਸੀ", + "86": "ਛਿਆਸੀ", + "87": "ਸਤਾਸੀ", + "88": "ਅਠਾਸੀ", + "89": "ਉਣਾਨਵੇਂ", + "90": "ਨੱਬੇ", + "91": "ਇਕਾਨਵੇਂ", + "92": "ਬਿਆਨਵੇਂ", + "93": "ਤਰਾਨਵੇਂ", + "94": "ਚਰਾਨਵੇਂ", + "95": "ਪਚਾਨਵੇਂ", + "96": "ਛਿਆਨਵੇਂ", + "97": "ਸਤਾਨਵੇਂ", + "98": "ਅਠਾਨਵੇਂ", + "99": "ਨਿੜਾਨਵੇਂ", + "100": "ਸੌ", + "1000": "ਹਜਾਰ", + "100000": "ਲੱਖ", + "10000000": "ਕਰੋੜ", + "1000000000": "ਅਰਬ", +} # Punjabi + +# --------------------------- num_to_word.py ------------------------------ +""" +Method to convert Numbers to Words +for indian languages + +Use cases:- +1) Speech recognition pre-processing +2) Language modeling Data pre-processing + +------------------------- +check indic_numbers.py to add support +for any indian language +""" + + +def language_specific_exception(words, lang, combiner): + """ + Language Specific Exception will come here + """ + + def occurs_at_end(piece): + return words[-len(piece) :] == piece + + if lang == "mr": + words = words.replace("एक" + combiner + "शे", "शंभर") + elif lang == "gu": + words = words.replace("બે" + combiner + "સો", "બસ્સો") + elif lang == "te": + exception_dict = { + "1": "ఒక", + "100": "వంద", + "100+": "వందలు", + "1000": "వెయ్యి", + "1000+": "వేలు", + "100000": "లక్ష", + "100000+": "లక్షలు", + "10000000": "కోటి", + "10000000+": "కోట్లు", + } + + test_case = ["100", "1000", "100000", "10000000"] + for test in test_case: + test_word = num_dict["te"][test] + match = num_dict["te"]["1"] + combiner + test_word + # for numbers like : 100, 1000, 100000 + if words == match: + return exception_dict[test] + # for numbers like : 200, 4000, 800000 + elif occurs_at_end(test_word): + words = words.replace(test_word, exception_dict[test + "+"]) + # for numbers like : 105, 1076, 123993 + elif not occurs_at_end(match): + replacement = exception_dict["1"] + combiner + exception_dict[test] + words = words.replace(match, replacement) + + # Exception case for 101...199 + special_case = "ఒక" + combiner + "వంద" + words = words.replace(special_case, "నూట") + elif lang == "kn": + # special case for 100 + if words == ("ಒಂದು" + combiner + "ನೂರ"): + return "ನೂರು" + exception_dict = { + "ನೂರ": "ನೂರು", + "ಸಾವಿರದ": "ಸಾವಿರ", + "ಲಕ್ಷದ": "ಲಕ್ಷ", + "ಕೋಟಿಯ": "ಕೋಟಿ", + } + for expt in exception_dict: + if occurs_at_end(expt): + words = words.replace(expt, exception_dict[expt]) + return words + + +def num_to_word(num, lang, separator=", ", combiner=" "): + """ + Main Method + :param num: Number digits from any indian language + :param lang: Language Code from supported Language + :param separator: Separator character i.e. separator = '-' --> 'two hundred-sixty' + :param combiner: combine number with position i.e. 
combiner = '-' --> 'two-hundred sixty' + :return: UTF-8 String of numbers in words + """ + lang = lang.lower() + num = str(num) + + # Load dictionary according to language code + assert lang in supported_lang, "Language not supported" + num_dic = num_dict[lang] + + # dash default combiner for english-india + if (lang == "en") & (combiner == " "): + combiner = "-" + + # Remove punctuations from numbers + num = str(num).replace(",", "").replace(" ", "") + + # return word as it is if not number + if not num.isdecimal(): + return num + + # Replace native language numbers with english digits + for language in supported_lang: + for num_index in range(10): + num = num.replace(all_num[language][num_index], all_num["en"][num_index]) + + # Assert that input contains only integer number + for digit in num: + assert digit in all_num["en"], "Give proper input" + + # Process + # For Number longer than 9 digits + def all_two_digit(digits_2): + if len(digits_2) <= 1: # Provided only one/zero digit + return num_dic.get(digits_2, "") + elif digits_2 == "00": # Two Zero provided + return num_dic["0"] + separator + num_dic["0"] + elif digits_2[0] == "0": # First digit is zero + return num_dic["0"] + separator + num_dic[digits_2[1]] + else: # Both digit provided + return num_dic[digits_2] + + # For Number less than 9 digits + def two_digit(digits_2): + digits_2 = digits_2.lstrip("0") + if len(digits_2) != 0: + return num_dic[digits_2] + else: + return "" + + def all_digit(digits): + digits = digits.lstrip("0") + digit_len = len(digits) + if digit_len > 3: + num_of_digits_to_process = (digit_len % 2) + 1 + process_digits = digits[:num_of_digits_to_process] + base = str(10 ** (int(digit_len / 2) * 2 - 1)) + remain_digits = digits[num_of_digits_to_process:] + return ( + num_dic[process_digits] + + combiner + + num_dic[base] + + separator + + all_digit(remain_digits) + ) + elif len(digits) == 3: + return ( + num_dic[digits[:1]] + + combiner + + num_dic["100"] + + separator + + two_digit(digits[1:]) + ) + else: + return two_digit(digits) + + num = num.lstrip("0") + full_digit_len = len(num) + + if full_digit_len == 0: + output = num_dic["0"] + elif full_digit_len <= 9: + output = all_digit(num) + else: + iteration = round(full_digit_len / 2) + output = all_two_digit(num[:2]) # First to digit + for i in range(1, iteration): + output = ( + output + separator + all_two_digit(num[i * 2 : (i + 1) * 2]) + ) # Next two digit pairs + remaining_digits = num[iteration * 2 :] + if not all_two_digit(remaining_digits) == "": + output = ( + output + separator + all_two_digit(remaining_digits) + ) # remaining Last one/two digits + + output = output.strip(separator) + + output = language_specific_exception(output, lang, combiner) + + return output + + +# --------------------------------- num_to_word_on_a_sent --------------------------------- + + +def is_digit(word, digit_pattern): + return re.search(digit_pattern, word) + + +def remove_punct(sent): + clean = re.sub("[%s]" % re.escape(string.punctuation), " ", sent) + return " ".join([word for word in clean.split() if word]) + + +def normalize_nums(text, lang): + """ + text: str (eg) + lang: lang code ['en', 'hi'] + + returns: str + (eg) + """ + + if lang in supported_lang: + text = text.replace('-',' - ') # space separate hyphen + words = text.split() + lang_digits = [str(i) for i in range(0, 10)] + + digit_pattern = "[" + "".join(lang_digits) + "]" + num_indices = [ + ind for ind, word in enumerate(words) if is_digit(word, digit_pattern) + ] + + words_up = [ + num_to_word(word, 
lang, separator=" ", combiner=" ") + if ind in num_indices + else word + for ind, word in enumerate(words) + ] + return " ".join(words_up) + else: + return text + + +if __name__ == "__main__": + print(normalize_nums("रीटा के पास 16 बिल्लियाँ हैं।", "hi")) diff --git a/ttsv/tts_infer/requirements.txt b/ttsv/tts_infer/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..cbbb8fff6386b8b986fd69a4328cbd93fdc4ce6a --- /dev/null +++ b/ttsv/tts_infer/requirements.txt @@ -0,0 +1,6 @@ +# will be installed with main setup.py, no need to reinstall + +ai4bharat-transliteration==0.5.0.3 +numpy==1.19.5 +pandas +pydload \ No newline at end of file diff --git a/ttsv/tts_infer/transliterate.py b/ttsv/tts_infer/transliterate.py new file mode 100644 index 0000000000000000000000000000000000000000..575430562683434cd44fd8d2e77d26dab9ced73b --- /dev/null +++ b/ttsv/tts_infer/transliterate.py @@ -0,0 +1,919 @@ +import torch +import torch.nn as nn +import numpy as np +import pandas as pd +import random +import sys +import os +import json +import enum +import traceback +import re + +F_DIR = os.path.dirname(os.environ.get('translit_model_base_path', os.path.realpath(__file__))) + + +class XlitError(enum.Enum): + lang_err = "Unsupported langauge ID requested ;( Please check available languages." + string_err = "String passed is incompatable ;(" + internal_err = "Internal crash ;(" + unknown_err = "Unknown Failure" + loading_err = "Loading failed ;( Check if metadata/paths are correctly configured." + + +##=================== Network ================================================== + + +class Encoder(nn.Module): + def __init__( + self, + input_dim, + embed_dim, + hidden_dim, + rnn_type="gru", + layers=1, + bidirectional=False, + dropout=0, + device="cpu", + ): + super(Encoder, self).__init__() + + self.input_dim = input_dim # src_vocab_sz + self.enc_embed_dim = embed_dim + self.enc_hidden_dim = hidden_dim + self.enc_rnn_type = rnn_type + self.enc_layers = layers + self.enc_directions = 2 if bidirectional else 1 + self.device = device + + self.embedding = nn.Embedding(self.input_dim, self.enc_embed_dim) + + if self.enc_rnn_type == "gru": + self.enc_rnn = nn.GRU( + input_size=self.enc_embed_dim, + hidden_size=self.enc_hidden_dim, + num_layers=self.enc_layers, + bidirectional=bidirectional, + ) + elif self.enc_rnn_type == "lstm": + self.enc_rnn = nn.LSTM( + input_size=self.enc_embed_dim, + hidden_size=self.enc_hidden_dim, + num_layers=self.enc_layers, + bidirectional=bidirectional, + ) + else: + raise Exception("XlitError: unknown RNN type mentioned") + + def forward(self, x, x_sz, hidden=None): + """ + x_sz: (batch_size, 1) - Unpadded sequence lengths used for pack_pad + """ + batch_sz = x.shape[0] + # x: batch_size, max_length, enc_embed_dim + x = self.embedding(x) + + ## pack the padded data + # x: max_length, batch_size, enc_embed_dim -> for pack_pad + x = x.permute(1, 0, 2) + x = nn.utils.rnn.pack_padded_sequence(x, x_sz, enforce_sorted=False) # unpad + + # output: packed_size, batch_size, enc_embed_dim + # hidden: n_layer**num_directions, batch_size, hidden_dim | if LSTM (h_n, c_n) + output, hidden = self.enc_rnn( + x + ) # gru returns hidden state of all timesteps as well as hidden state at last timestep + + ## pad the sequence to the max length in the batch + # output: max_length, batch_size, enc_emb_dim*directions) + output, _ = nn.utils.rnn.pad_packed_sequence(output) + + # output: batch_size, max_length, hidden_dim + output = output.permute(1, 0, 2) + + return output, hidden + 
+ def get_word_embedding(self, x): + """ """ + x_sz = torch.tensor([len(x)]) + x_ = torch.tensor(x).unsqueeze(0).to(dtype=torch.long) + # x: 1, max_length, enc_embed_dim + x = self.embedding(x_) + + ## pack the padded data + # x: max_length, 1, enc_embed_dim -> for pack_pad + x = x.permute(1, 0, 2) + x = nn.utils.rnn.pack_padded_sequence(x, x_sz, enforce_sorted=False) # unpad + + # output: packed_size, 1, enc_embed_dim + # hidden: n_layer**num_directions, 1, hidden_dim | if LSTM (h_n, c_n) + output, hidden = self.enc_rnn( + x + ) # gru returns hidden state of all timesteps as well as hidden state at last timestep + + out_embed = hidden[0].squeeze() + + return out_embed + + +class Decoder(nn.Module): + def __init__( + self, + output_dim, + embed_dim, + hidden_dim, + rnn_type="gru", + layers=1, + use_attention=True, + enc_outstate_dim=None, # enc_directions * enc_hidden_dim + dropout=0, + device="cpu", + ): + super(Decoder, self).__init__() + + self.output_dim = output_dim # tgt_vocab_sz + self.dec_hidden_dim = hidden_dim + self.dec_embed_dim = embed_dim + self.dec_rnn_type = rnn_type + self.dec_layers = layers + self.use_attention = use_attention + self.device = device + if self.use_attention: + self.enc_outstate_dim = enc_outstate_dim if enc_outstate_dim else hidden_dim + else: + self.enc_outstate_dim = 0 + + self.embedding = nn.Embedding(self.output_dim, self.dec_embed_dim) + + if self.dec_rnn_type == "gru": + self.dec_rnn = nn.GRU( + input_size=self.dec_embed_dim + + self.enc_outstate_dim, # to concat attention_output + hidden_size=self.dec_hidden_dim, # previous Hidden + num_layers=self.dec_layers, + batch_first=True, + ) + elif self.dec_rnn_type == "lstm": + self.dec_rnn = nn.LSTM( + input_size=self.dec_embed_dim + + self.enc_outstate_dim, # to concat attention_output + hidden_size=self.dec_hidden_dim, # previous Hidden + num_layers=self.dec_layers, + batch_first=True, + ) + else: + raise Exception("XlitError: unknown RNN type mentioned") + + self.fc = nn.Sequential( + nn.Linear(self.dec_hidden_dim, self.dec_embed_dim), + nn.LeakyReLU(), + # nn.Linear(self.dec_embed_dim, self.dec_embed_dim), nn.LeakyReLU(), # removing to reduce size + nn.Linear(self.dec_embed_dim, self.output_dim), + ) + + ##----- Attention ---------- + if self.use_attention: + self.W1 = nn.Linear(self.enc_outstate_dim, self.dec_hidden_dim) + self.W2 = nn.Linear(self.dec_hidden_dim, self.dec_hidden_dim) + self.V = nn.Linear(self.dec_hidden_dim, 1) + + def attention(self, x, hidden, enc_output): + """ + x: (batch_size, 1, dec_embed_dim) -> after Embedding + enc_output: batch_size, max_length, enc_hidden_dim *num_directions + hidden: n_layers, batch_size, hidden_size | if LSTM (h_n, c_n) + """ + + ## perform addition to calculate the score + + # hidden_with_time_axis: batch_size, 1, hidden_dim + ## hidden_with_time_axis = hidden.permute(1, 0, 2) ## replaced with below 2lines + hidden_with_time_axis = ( + torch.sum(hidden, axis=0) + if self.dec_rnn_type != "lstm" + else torch.sum(hidden[0], axis=0) + ) # h_n + + hidden_with_time_axis = hidden_with_time_axis.unsqueeze(1) + + # score: batch_size, max_length, hidden_dim + score = torch.tanh(self.W1(enc_output) + self.W2(hidden_with_time_axis)) + + # attention_weights: batch_size, max_length, 1 + # we get 1 at the last axis because we are applying score to self.V + attention_weights = torch.softmax(self.V(score), dim=1) + + # context_vector shape after sum == (batch_size, hidden_dim) + context_vector = attention_weights * enc_output + context_vector = 
torch.sum(context_vector, dim=1) + # context_vector: batch_size, 1, hidden_dim + context_vector = context_vector.unsqueeze(1) + + # attend_out (batch_size, 1, dec_embed_dim + hidden_size) + attend_out = torch.cat((context_vector, x), -1) + + return attend_out, attention_weights + + def forward(self, x, hidden, enc_output): + """ + x: (batch_size, 1) + enc_output: batch_size, max_length, dec_embed_dim + hidden: n_layer, batch_size, hidden_size | lstm: (h_n, c_n) + """ + if (hidden is None) and (self.use_attention is False): + raise Exception( + "XlitError: No use of a decoder with No attention and No Hidden" + ) + + batch_sz = x.shape[0] + + if hidden is None: + # hidden: n_layers, batch_size, hidden_dim + hid_for_att = torch.zeros( + (self.dec_layers, batch_sz, self.dec_hidden_dim) + ).to(self.device) + elif self.dec_rnn_type == "lstm": + hid_for_att = hidden[1] # c_n + + # x (batch_size, 1, dec_embed_dim) -> after embedding + x = self.embedding(x) + + if self.use_attention: + # x (batch_size, 1, dec_embed_dim + hidden_size) -> after attention + # aw: (batch_size, max_length, 1) + x, aw = self.attention(x, hidden, enc_output) + else: + x, aw = x, 0 + + # passing the concatenated vector to the GRU + # output: (batch_size, n_layers, hidden_size) + # hidden: n_layers, batch_size, hidden_size | if LSTM (h_n, c_n) + output, hidden = ( + self.dec_rnn(x, hidden) if hidden is not None else self.dec_rnn(x) + ) + + # output :shp: (batch_size * 1, hidden_size) + output = output.view(-1, output.size(2)) + + # output :shp: (batch_size * 1, output_dim) + output = self.fc(output) + + return output, hidden, aw + + +class Seq2Seq(nn.Module): + """ + Class dependency: Encoder, Decoder + """ + + def __init__( + self, encoder, decoder, pass_enc2dec_hid=False, dropout=0, device="cpu" + ): + super(Seq2Seq, self).__init__() + + self.encoder = encoder + self.decoder = decoder + self.device = device + self.pass_enc2dec_hid = pass_enc2dec_hid + _force_en2dec_hid_conv = False + + if self.pass_enc2dec_hid: + assert ( + decoder.dec_hidden_dim == encoder.enc_hidden_dim + ), "Hidden Dimension of encoder and decoder must be same, or unset `pass_enc2dec_hid`" + if decoder.use_attention: + assert ( + decoder.enc_outstate_dim + == encoder.enc_directions * encoder.enc_hidden_dim + ), "Set `enc_out_dim` correctly in decoder" + assert ( + self.pass_enc2dec_hid or decoder.use_attention + ), "No use of a decoder with No attention and No Hidden from Encoder" + + self.use_conv_4_enc2dec_hid = False + if ( + self.pass_enc2dec_hid + and (encoder.enc_directions * encoder.enc_layers != decoder.dec_layers) + ) or _force_en2dec_hid_conv: + if encoder.enc_rnn_type == "lstm" or encoder.enc_rnn_type == "lstm": + raise Exception( + "XlitError: conv for enc2dec_hid not implemented; Change the layer numbers appropriately" + ) + + self.use_conv_4_enc2dec_hid = True + self.enc_hid_1ax = encoder.enc_directions * encoder.enc_layers + self.dec_hid_1ax = decoder.dec_layers + self.e2d_hidden_conv = nn.Conv1d(self.enc_hid_1ax, self.dec_hid_1ax, 1) + + def enc2dec_hidden(self, enc_hidden): + """ + enc_hidden: n_layer, batch_size, hidden_dim*num_directions + TODO: Implement the logic for LSTm bsed model + """ + # hidden: batch_size, enc_layer*num_directions, enc_hidden_dim + hidden = enc_hidden.permute(1, 0, 2).contiguous() + # hidden: batch_size, dec_layers, dec_hidden_dim -> [N,C,Tstep] + hidden = self.e2d_hidden_conv(hidden) + + # hidden: dec_layers, batch_size , dec_hidden_dim + hidden_for_dec = hidden.permute(1, 0, 2).contiguous() + + return 
hidden_for_dec + + def active_beam_inference(self, src, beam_width=3, max_tgt_sz=50): + """Search based decoding + src: (sequence_len) + """ + + def _avg_score(p_tup): + """Used for Sorting + TODO: Dividing by length of sequence power alpha as hyperparam + """ + return p_tup[0] + + import sys + + batch_size = 1 + start_tok = src[0] + end_tok = src[-1] + src_sz = torch.tensor([len(src)]) + src_ = src.unsqueeze(0) + + # enc_output: (batch_size, padded_seq_length, enc_hidden_dim*num_direction) + # enc_hidden: (enc_layers*num_direction, batch_size, hidden_dim) + enc_output, enc_hidden = self.encoder(src_, src_sz) + + if self.pass_enc2dec_hid: + # dec_hidden: dec_layers, batch_size , dec_hidden_dim + if self.use_conv_4_enc2dec_hid: + init_dec_hidden = self.enc2dec_hidden(enc_hidden) + else: + init_dec_hidden = enc_hidden + else: + # dec_hidden -> Will be initialized to zeros internally + init_dec_hidden = None + + # top_pred[][0] = Σ-log_softmax + # top_pred[][1] = sequence torch.tensor shape: (1) + # top_pred[][2] = dec_hidden + top_pred_list = [(0, start_tok.unsqueeze(0), init_dec_hidden)] + + for t in range(max_tgt_sz): + cur_pred_list = [] + + for p_tup in top_pred_list: + if p_tup[1][-1] == end_tok: + cur_pred_list.append(p_tup) + continue + + # dec_hidden: dec_layers, 1, hidden_dim + # dec_output: 1, output_dim + dec_output, dec_hidden, _ = self.decoder( + x=p_tup[1][-1].view(1, 1), # dec_input: (1,1) + hidden=p_tup[2], + enc_output=enc_output, + ) + + ## π{prob} = Σ{log(prob)} -> to prevent diminishing + # dec_output: (1, output_dim) + dec_output = nn.functional.log_softmax(dec_output, dim=1) + # pred_topk.values & pred_topk.indices: (1, beam_width) + pred_topk = torch.topk(dec_output, k=beam_width, dim=1) + + for i in range(beam_width): + sig_logsmx_ = p_tup[0] + pred_topk.values[0][i] + # seq_tensor_ : (seq_len) + seq_tensor_ = torch.cat((p_tup[1], pred_topk.indices[0][i].view(1))) + + cur_pred_list.append((sig_logsmx_, seq_tensor_, dec_hidden)) + + cur_pred_list.sort(key=_avg_score, reverse=True) # Maximized order + top_pred_list = cur_pred_list[:beam_width] + + # check if end_tok of all topk + end_flags_ = [1 if t[1][-1] == end_tok else 0 for t in top_pred_list] + if beam_width == sum(end_flags_): + break + + pred_tnsr_list = [t[1] for t in top_pred_list] + + return pred_tnsr_list + + +##===================== Glyph handlers ======================================= + + +class GlyphStrawboss: + def __init__(self, glyphs="en"): + """list of letters in a language in unicode + lang: ISO Language code + glyphs: json file with script information + """ + if glyphs == "en": + # Smallcase alone + self.glyphs = [chr(alpha) for alpha in range(97, 122 + 1)] + else: + self.dossier = json.load(open(glyphs, encoding="utf-8")) + self.glyphs = self.dossier["glyphs"] + self.numsym_map = self.dossier["numsym_map"] + + self.char2idx = {} + self.idx2char = {} + self._create_index() + + def _create_index(self): + + self.char2idx["_"] = 0 # pad + self.char2idx["$"] = 1 # start + self.char2idx["#"] = 2 # end + self.char2idx["*"] = 3 # Mask + self.char2idx["'"] = 4 # apostrophe U+0027 + self.char2idx["%"] = 5 # unused + self.char2idx["!"] = 6 # unused + + # letter to index mapping + for idx, char in enumerate(self.glyphs): + self.char2idx[char] = idx + 7 # +7 token initially + + # index to letter mapping + for char, idx in self.char2idx.items(): + self.idx2char[idx] = char + + def size(self): + return len(self.char2idx) + + def word2xlitvec(self, word): + """Converts given string of gyphs(word) to 
vector(numpy) + Also adds tokens for start and end + """ + try: + vec = [self.char2idx["$"]] # start token + for i in list(word): + vec.append(self.char2idx[i]) + vec.append(self.char2idx["#"]) # end token + + vec = np.asarray(vec, dtype=np.int64) + return vec + + except Exception as error: + print("XlitError: In word:", word, "Error Char not in Token:", error) + sys.exit() + + def xlitvec2word(self, vector): + """Converts vector(numpy) to string of glyphs(word)""" + char_list = [] + for i in vector: + char_list.append(self.idx2char[i]) + + word = "".join(char_list).replace("$", "").replace("#", "") # remove tokens + word = word.replace("_", "").replace("*", "") # remove tokens + return word + + +class VocabSanitizer: + def __init__(self, data_file): + """ + data_file: path to file conatining vocabulary list + """ + extension = os.path.splitext(data_file)[-1] + if extension == ".json": + self.vocab_set = set(json.load(open(data_file, encoding="utf-8"))) + elif extension == ".csv": + self.vocab_df = pd.read_csv(data_file).set_index("WORD") + self.vocab_set = set(self.vocab_df.index) + else: + print("XlitError: Only Json/CSV file extension supported") + + def reposition(self, word_list): + """Reorder Words in list""" + new_list = [] + temp_ = word_list.copy() + for v in word_list: + if v in self.vocab_set: + new_list.append(v) + temp_.remove(v) + new_list.extend(temp_) + + return new_list + + +##=============== INSTANTIATION ================================================ + + +class XlitPiston: + """ + For handling prediction & post-processing of transliteration for a single language + Class dependency: Seq2Seq, GlyphStrawboss, VocabSanitizer + Global Variables: F_DIR + """ + + def __init__( + self, + weight_path, + vocab_file, + tglyph_cfg_file, + iglyph_cfg_file="en", + device="cpu", + ): + + self.device = device + self.in_glyph_obj = GlyphStrawboss(iglyph_cfg_file) + self.tgt_glyph_obj = GlyphStrawboss(glyphs=tglyph_cfg_file) + self.voc_sanity = VocabSanitizer(vocab_file) + + self._numsym_set = set( + json.load(open(tglyph_cfg_file, encoding="utf-8"))["numsym_map"].keys() + ) + self._inchar_set = set("abcdefghijklmnopqrstuvwxyz") + self._natscr_set = set().union( + self.tgt_glyph_obj.glyphs, sum(self.tgt_glyph_obj.numsym_map.values(), []) + ) + + ## Model Config Static TODO: add defining in json support + input_dim = self.in_glyph_obj.size() + output_dim = self.tgt_glyph_obj.size() + enc_emb_dim = 300 + dec_emb_dim = 300 + enc_hidden_dim = 512 + dec_hidden_dim = 512 + rnn_type = "lstm" + enc2dec_hid = True + attention = True + enc_layers = 1 + dec_layers = 2 + m_dropout = 0 + enc_bidirect = True + enc_outstate_dim = enc_hidden_dim * (2 if enc_bidirect else 1) + + enc = Encoder( + input_dim=input_dim, + embed_dim=enc_emb_dim, + hidden_dim=enc_hidden_dim, + rnn_type=rnn_type, + layers=enc_layers, + dropout=m_dropout, + device=self.device, + bidirectional=enc_bidirect, + ) + dec = Decoder( + output_dim=output_dim, + embed_dim=dec_emb_dim, + hidden_dim=dec_hidden_dim, + rnn_type=rnn_type, + layers=dec_layers, + dropout=m_dropout, + use_attention=attention, + enc_outstate_dim=enc_outstate_dim, + device=self.device, + ) + self.model = Seq2Seq(enc, dec, pass_enc2dec_hid=enc2dec_hid, device=self.device) + self.model = self.model.to(self.device) + weights = torch.load(weight_path, map_location=torch.device(self.device)) + + self.model.load_state_dict(weights) + self.model.eval() + + def character_model(self, word, beam_width=1): + in_vec = 
torch.from_numpy(self.in_glyph_obj.word2xlitvec(word)).to(self.device) + ## change to active or passive beam + p_out_list = self.model.active_beam_inference(in_vec, beam_width=beam_width) + p_result = [ + self.tgt_glyph_obj.xlitvec2word(out.cpu().numpy()) for out in p_out_list + ] + + result = self.voc_sanity.reposition(p_result) + + # List type + return result + + def numsym_model(self, seg): + """tgt_glyph_obj.numsym_map[x] returns a list object""" + if len(seg) == 1: + return [seg] + self.tgt_glyph_obj.numsym_map[seg] + + a = [self.tgt_glyph_obj.numsym_map[n][0] for n in seg] + return [seg] + ["".join(a)] + + def _word_segementer(self, sequence): + + sequence = sequence.lower() + accepted = set().union(self._numsym_set, self._inchar_set, self._natscr_set) + # sequence = ''.join([i for i in sequence if i in accepted]) + + segment = [] + idx = 0 + seq_ = list(sequence) + while len(seq_): + # for Number-Symbol + temp = "" + while len(seq_) and seq_[0] in self._numsym_set: + temp += seq_[0] + seq_.pop(0) + if temp != "": + segment.append(temp) + + # for Target Chars + temp = "" + while len(seq_) and seq_[0] in self._natscr_set: + temp += seq_[0] + seq_.pop(0) + if temp != "": + segment.append(temp) + + # for Input-Roman Chars + temp = "" + while len(seq_) and seq_[0] in self._inchar_set: + temp += seq_[0] + seq_.pop(0) + if temp != "": + segment.append(temp) + + temp = "" + while len(seq_) and seq_[0] not in accepted: + temp += seq_[0] + seq_.pop(0) + if temp != "": + segment.append(temp) + + return segment + + def inferencer(self, sequence, beam_width=10): + + seg = self._word_segementer(sequence[:120]) + lit_seg = [] + + p = 0 + while p < len(seg): + if seg[p][0] in self._natscr_set: + lit_seg.append([seg[p]]) + p += 1 + + elif seg[p][0] in self._inchar_set: + lit_seg.append(self.character_model(seg[p], beam_width=beam_width)) + p += 1 + + elif seg[p][0] in self._numsym_set: # num & punc + lit_seg.append(self.numsym_model(seg[p])) + p += 1 + else: + lit_seg.append([seg[p]]) + p += 1 + + ## IF segment less/equal to 2 then return combinotorial, + ## ELSE only return top1 of each result concatenated + if len(lit_seg) == 1: + final_result = lit_seg[0] + + elif len(lit_seg) == 2: + final_result = [""] + for seg in lit_seg: + new_result = [] + for s in seg: + for f in final_result: + new_result.append(f + s) + final_result = new_result + + else: + new_result = [] + for seg in lit_seg: + new_result.append(seg[0]) + final_result = ["".join(new_result)] + + return final_result + + +from collections.abc import Iterable +from pydload import dload +import zipfile + +MODEL_DOWNLOAD_URL_PREFIX = "https://github.com/AI4Bharat/IndianNLP-Transliteration/releases/download/xlit_v0.5.0/" + + +def is_folder_writable(folder): + try: + os.makedirs(folder, exist_ok=True) + tmp_file = os.path.join(folder, ".write_test") + with open(tmp_file, "w") as f: + f.write("Permission Check") + os.remove(tmp_file) + return True + except: + return False + + +def is_directory_writable(path): + if os.name == "nt": + return is_folder_writable(path) + return os.access(path, os.W_OK | os.X_OK) + + +class XlitEngine: + """ + For Managing the top level tasks and applications of transliteration + Global Variables: F_DIR + """ + + def __init__( + self, lang2use="all", config_path="translit_models/default_lineup.json" + ): + + lineup = json.load(open(os.path.join(F_DIR, config_path), encoding="utf-8")) + self.lang_config = {} + if isinstance(lang2use, str): + if lang2use == "all": + self.lang_config = lineup + elif lang2use in lineup: 
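                # A single language code keeps only that entry from the lineup.
                # Illustrative usage (mirrors the __main__ example further down
                # in this file; argument values are only examples):
                #   engine = XlitEngine("hi")
                #   engine.translit_word("hello", lang_code="hi", topk=3)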
+ self.lang_config[lang2use] = lineup[lang2use] + else: + raise Exception( + "XlitError: The entered Langauge code not found. Available are {}".format( + lineup.keys() + ) + ) + + elif isinstance(lang2use, Iterable): + for l in lang2use: + try: + self.lang_config[l] = lineup[l] + except: + print( + "XlitError: Language code {} not found, Skipping...".format(l) + ) + else: + raise Exception( + "XlitError: lang2use must be a list of language codes (or) string of single language code" + ) + + if is_directory_writable(F_DIR): + models_path = os.path.join(F_DIR, "translit_models") + else: + user_home = os.path.expanduser("~") + models_path = os.path.join(user_home, ".AI4Bharat_Xlit_Models") + os.makedirs(models_path, exist_ok=True) + self.download_models(models_path) + + self.langs = {} + self.lang_model = {} + for la in self.lang_config: + try: + print("Loading {}...".format(la)) + self.lang_model[la] = XlitPiston( + weight_path=os.path.join( + models_path, self.lang_config[la]["weight"] + ), + vocab_file=os.path.join(models_path, self.lang_config[la]["vocab"]), + tglyph_cfg_file=os.path.join( + models_path, self.lang_config[la]["script"] + ), + iglyph_cfg_file="en", + ) + self.langs[la] = self.lang_config[la]["name"] + except Exception as error: + print("XlitError: Failure in loading {} \n".format(la), error) + print(XlitError.loading_err.value) + + def download_models(self, models_path): + """ + Download models from GitHub Releases if not exists + """ + for l in self.lang_config: + lang_name = self.lang_config[l]["eng_name"] + lang_model_path = os.path.join(models_path, lang_name) + if not os.path.isdir(lang_model_path): + print("Downloading model for language: %s" % lang_name) + remote_url = MODEL_DOWNLOAD_URL_PREFIX + lang_name + ".zip" + downloaded_zip_path = os.path.join(models_path, lang_name + ".zip") + dload(url=remote_url, save_to_path=downloaded_zip_path, max_time=None) + + if not os.path.isfile(downloaded_zip_path): + exit( + f"ERROR: Unable to download model from {remote_url} into {models_path}" + ) + + with zipfile.ZipFile(downloaded_zip_path, "r") as zip_ref: + zip_ref.extractall(models_path) + + if os.path.isdir(lang_model_path): + os.remove(downloaded_zip_path) + else: + exit( + f"ERROR: Unable to find models in {lang_model_path} after download" + ) + return + + def translit_word(self, eng_word, lang_code="default", topk=7, beam_width=10): + if eng_word == "": + return [] + + if lang_code in self.langs: + try: + res_list = self.lang_model[lang_code].inferencer( + eng_word, beam_width=beam_width + ) + return res_list[:topk] + + except Exception as error: + print("XlitError:", traceback.format_exc()) + print(XlitError.internal_err.value) + return XlitError.internal_err + + elif lang_code == "default": + try: + res_dict = {} + for la in self.lang_model: + res = self.lang_model[la].inferencer( + eng_word, beam_width=beam_width + ) + res_dict[la] = res[:topk] + return res_dict + + except Exception as error: + print("XlitError:", traceback.format_exc()) + print(XlitError.internal_err.value) + return XlitError.internal_err + + else: + print("XlitError: Unknown Langauge requested", lang_code) + print(XlitError.lang_err.value) + return XlitError.lang_err + + def translit_sentence(self, eng_sentence, lang_code="default", beam_width=10): + if eng_sentence == "": + return [] + + if lang_code in self.langs: + try: + out_str = "" + for word in eng_sentence.split(): + res_ = self.lang_model[lang_code].inferencer( + word, beam_width=beam_width + ) + out_str = out_str + res_[0] + " " + return 
out_str[:-1] + + except Exception as error: + print("XlitError:", traceback.format_exc()) + print(XlitError.internal_err.value) + return XlitError.internal_err + + elif lang_code == "default": + try: + res_dict = {} + for la in self.lang_model: + out_str = "" + for word in eng_sentence.split(): + res_ = self.lang_model[la].inferencer( + word, beam_width=beam_width + ) + out_str = out_str + res_[0] + " " + res_dict[la] = out_str[:-1] + return res_dict + + except Exception as error: + print("XlitError:", traceback.format_exc()) + print(XlitError.internal_err.value) + return XlitError.internal_err + + else: + print("XlitError: Unknown Langauge requested", lang_code) + print(XlitError.lang_err.value) + return XlitError.lang_err + + +if __name__ == "__main__": + + available_lang = [ + "bn", + "gu", + "hi", + "kn", + "gom", + "mai", + "ml", + "mr", + "pa", + "sd", + "si", + "ta", + "te", + "ur", + ] + + reg = re.compile(r"[a-zA-Z]") + lang = "hi" + engine = XlitEngine( + lang + ) # if you don't specify lang code here, this will give results in all langs available + sent = "Hello World! ABCD क्या हाल है आपका?" + words = [ + engine.translit_word(word, topk=1)[lang][0] if reg.match(word) else word + for word in sent.split() + ] # only transliterated en words, leaves rest as it is + updated_sent = " ".join(words) + + print(updated_sent) + + # output : हेलो वर्ल्ड! क्या हाल है आपका? + + # y = engine.translit_sentence("Hello World !")['hi'] + # print(y) diff --git a/ttsv/tts_infer/tts.py b/ttsv/tts_infer/tts.py new file mode 100644 index 0000000000000000000000000000000000000000..b373de8d62ce4aeb6ba5db5a07e8b018c347217b --- /dev/null +++ b/ttsv/tts_infer/tts.py @@ -0,0 +1,158 @@ +from __future__ import absolute_import, division, print_function, unicode_literals +from typing import Tuple +import sys +from argparse import ArgumentParser + +import torch +import numpy as np +import os +import json +import torch + +sys.path.append(os.path.join(os.path.dirname(__file__), "../src/glow_tts")) + +from scipy.io.wavfile import write +from hifi.env import AttrDict +from hifi.models import Generator + + +from text import text_to_sequence +import commons +import models +import utils + + +def check_directory(dir): + if not os.path.exists(dir): + sys.exit("Error: {} directory does not exist".format(dir)) + + +class TextToMel: + def __init__(self, glow_model_dir, device="cuda"): + self.glow_model_dir = glow_model_dir + check_directory(self.glow_model_dir) + self.device = device + self.hps, self.glow_tts_model = self.load_glow_tts() + pass + + def load_glow_tts(self): + hps = utils.get_hparams_from_dir(self.glow_model_dir) + checkpoint_path = utils.latest_checkpoint_path(self.glow_model_dir) + symbols = list(hps.data.punc) + list(hps.data.chars) + glow_tts_model = models.FlowGenerator( + len(symbols) + getattr(hps.data, "add_blank", False), + out_channels=hps.data.n_mel_channels, + **hps.model + ) # .to(self.device) + + if self.device == "cuda": + glow_tts_model.to("cuda") + + utils.load_checkpoint(checkpoint_path, glow_tts_model) + glow_tts_model.decoder.store_inverse() + _ = glow_tts_model.eval() + + return hps, glow_tts_model + + def generate_mel(self, text, noise_scale=0.667, length_scale=1.0): + symbols = list(self.hps.data.punc) + list(self.hps.data.chars) + cleaner = self.hps.data.text_cleaners + if getattr(self.hps.data, "add_blank", False): + text_norm = text_to_sequence(text, symbols, cleaner) + text_norm = commons.intersperse(text_norm, len(symbols)) + else: # If not using "add_blank" option during training, 
adding spaces at the beginning and the end of utterance improves quality + text = " " + text.strip() + " " + text_norm = text_to_sequence(text, symbols, cleaner) + + sequence = np.array(text_norm)[None, :] + + del symbols + del cleaner + del text + del text_norm + + if self.device == "cuda": + x_tst = torch.autograd.Variable(torch.from_numpy(sequence)).cuda().long() + x_tst_lengths = torch.tensor([x_tst.shape[1]]).cuda() + else: + x_tst = torch.autograd.Variable(torch.from_numpy(sequence)).long() + x_tst_lengths = torch.tensor([x_tst.shape[1]]) + + with torch.no_grad(): + (y_gen_tst, *_), *_, (attn_gen, *_) = self.glow_tts_model( + x_tst, + x_tst_lengths, + gen=True, + noise_scale=noise_scale, + length_scale=length_scale, + ) + del x_tst + del x_tst_lengths + torch.cuda.empty_cache() + return y_gen_tst + #return y_gen_tst.cpu().detach().numpy() + + +class MelToWav: + def __init__(self, hifi_model_dir, device="cuda"): + self.hifi_model_dir = hifi_model_dir + check_directory(self.hifi_model_dir) + self.device = device + self.h, self.hifi_gan_generator = self.load_hifi_gan() + pass + + def load_hifi_gan(self): + checkpoint_path = utils.latest_checkpoint_path(self.hifi_model_dir, regex="g_*") + config_file = os.path.join(self.hifi_model_dir, "config.json") + data = open(config_file).read() + json_config = json.loads(data) + h = AttrDict(json_config) + torch.manual_seed(h.seed) + + generator = Generator(h).to(self.device) + + assert os.path.isfile(checkpoint_path) + print("Loading '{}'".format(checkpoint_path)) + state_dict_g = torch.load(checkpoint_path, map_location=self.device) + print("Complete.") + + generator.load_state_dict(state_dict_g["generator"]) + + generator.eval() + generator.remove_weight_norm() + + return h, generator + + def generate_wav(self, mel): + #mel = torch.FloatTensor(mel).to(self.device) + + y_g_hat = self.hifi_gan_generator(mel.to(self.device)) # passing through vocoder + audio = y_g_hat.squeeze() + audio = audio * 32768.0 + audio = audio.cpu().detach().numpy().astype("int16") + + del y_g_hat + del mel + torch.cuda.empty_cache() + return audio, self.h.sampling_rate + + +if __name__ == "__main__": + + parser = ArgumentParser() + parser.add_argument("-m", "--model", required=True, type=str) + parser.add_argument("-g", "--gan", required=True, type=str) + parser.add_argument("-d", "--device", type=str, default="cpu") + parser.add_argument("-t", "--text", type=str, required=True) + parser.add_argument("-w", "--wav", type=str, required=True) + args = parser.parse_args() + + text_to_mel = TextToMel(glow_model_dir=args.model, device=args.device) + mel_to_wav = MelToWav(hifi_model_dir=args.gan, device=args.device) + + mel = text_to_mel.generate_mel(args.text) + audio, sr = mel_to_wav.generate_wav(mel) + + write(filename=args.wav, rate=sr, data=audio) + + pass diff --git a/ttsv/utils/__init__.py b/ttsv/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/ttsv/utils/data/duration.py b/ttsv/utils/data/duration.py new file mode 100644 index 0000000000000000000000000000000000000000..c3b5e112b72dd5a07ea2463f604d98bb8d961496 --- /dev/null +++ b/ttsv/utils/data/duration.py @@ -0,0 +1,33 @@ +# Usage -> python duration.py /src/folder/path + +import soundfile as sf +import sys +import os +from glob import glob +from joblib import Parallel, delayed +from tqdm import tqdm + + +def get_duration(fpath): + w = sf.SoundFile(fpath) + sr = w.samplerate + assert 22050 == sr, "Sample rate is not 22050" + return 
len(w) / sr + + +def main(folder, ext="wav"): + file_list = glob(folder + "/**/*." + ext, recursive=True) + print(f"\n\tTotal number of wav files {len(file_list)}") + duration_list = Parallel(n_jobs=1)( + delayed(get_duration)(i) for i in tqdm(file_list) + ) + print( + f"\n\tMin Duration {min(duration_list):.2f} Max Duration {max(duration_list):.2f} in secs" + ) + print(f"\n\tTotal Duration {sum(duration_list)/3600:.2f} in hours") + + +if __name__ == "__main__": + folder = sys.argv[1] + folder = os.path.abspath(folder) + main(folder) diff --git a/ttsv/utils/data/resample.py b/ttsv/utils/data/resample.py new file mode 100644 index 0000000000000000000000000000000000000000..c77109ef4d5142cd9094f46dd186a17571071ab8 --- /dev/null +++ b/ttsv/utils/data/resample.py @@ -0,0 +1,59 @@ +import argparse +import librosa +import numpy as np +import os +import scipy +import scipy.io.wavfile +import sys + +from glob import glob +from tqdm import tqdm +from joblib import Parallel, delayed + + +def check_directories(dir_input, dir_output): + if not os.path.exists(dir_input): + sys.exit("Error: Input directory does not exist: {}".format(dir_input)) + if not os.path.exists(dir_output): + sys.exit("Error: Output directory does not exist: {}".format(dir_output)) + abs_a = os.path.abspath(dir_input) + abs_b = os.path.abspath(dir_output) + if abs_a == abs_b: + sys.exit("Error: Paths are the same: {}".format(abs_a)) + + +def resample_file(input_filename, output_filename, sample_rate): + mono = ( + True # librosa converts signal to mono by default, so I'm just surfacing this + ) + audio, existing_rate = librosa.load(input_filename, sr=sample_rate, mono=mono) + audio /= 1.414 # Scale to [-1.0, 1.0] + audio *= 32767 # Scale to int16 + audio = audio.astype(np.int16) + scipy.io.wavfile.write(output_filename, sample_rate, audio) + + +def downsample_wav_files(input_dir, output_dir, output_sample_rate): + check_directories(input_dir, output_dir) + inp_wav_paths = glob(input_dir + "/*.wav") + out_wav_paths = [ + os.path.join(output_dir, os.path.basename(p)) for p in inp_wav_paths + ] + _ = Parallel(n_jobs=-1)( + delayed(resample_file)(i, o, output_sample_rate) + for i, o in tqdm(zip(inp_wav_paths, out_wav_paths)) + ) + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--input_dir", "-i", type=str, required=True) + parser.add_argument("--output_dir", "-o", type=str, required=True) + parser.add_argument("--output_sample_rate", "-s", type=int, required=True) + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + downsample_wav_files(args.input_dir, args.output_dir, args.output_sample_rate) + print(f"\n\tCompleted") diff --git a/ttsv/utils/glow/prepare_iitm_data_glow.py b/ttsv/utils/glow/prepare_iitm_data_glow.py new file mode 100644 index 0000000000000000000000000000000000000000..9e1e5cb8cd85c88892371851917ec721c2c4b08e --- /dev/null +++ b/ttsv/utils/glow/prepare_iitm_data_glow.py @@ -0,0 +1,134 @@ +import os +from glob import glob +import re +import string +import argparse + +import random +random.seed(42) + +def replace_extra_chars(line): + line = line.replace("(", "").replace( + ")", "" + ) # .replace('\u200d', ' ').replace('\ufeff', ' ').replace('\u200c', ' ').replace('\u200e', ' ') + # line = line.replace('“', ' ').replace('”', ' ').replace(':', ' ') + + return line.strip() + + +def write_txt(content, filename): + with open(filename, "w+", encoding="utf-8") as f: + f.write(content) + + +def save_train_test_valid_split(annotations_txt, num_samples_valid, 
num_samples_test): + with open(annotations_txt, encoding="utf-8") as f: + all_lines = [line.strip() for line in f.readlines()] + test_val_indices = random.sample( + range(len(all_lines)), num_samples_valid + num_samples_test + ) + valid_ix = test_val_indices[:num_samples_valid] + test_ix = test_val_indices[num_samples_valid:] + train = [line for i, line in enumerate(all_lines) if i not in test_val_indices] + valid = [line for i, line in enumerate(all_lines) if i in valid_ix] + test = [line for i, line in enumerate(all_lines) if i in test_ix] + + print(f"Num samples in train: {len(train)}") + print(f"Num samples in valid: {len(valid)}") + print(f"Num samples in test: {len(test)}") + + out_dir_path = "/".join(annotations_txt.split("/")[:-1]) + with open(os.path.join(out_dir_path, "train.txt"), "w+", encoding="utf-8") as f: + for line in train: + print(line, file=f) + with open(os.path.join(out_dir_path, "valid.txt"), "w+", encoding="utf-8") as f: + for line in valid: + print(line, file=f) + with open(os.path.join(out_dir_path, "test.txt"), "w+", encoding="utf-8") as f: + for line in test: + print(line, file=f) + print(f"train, test and valid txts saved in {out_dir_path}") + + +def save_txts_from_txt_done_data( + text_path, + wav_path_for_annotations_txt, + out_path_for_txts, + num_samples_valid, + num_samples_test, +): + outfile = os.path.join(out_path_for_txts, "annotations.txt") + with open(text_path) as file: + file_lines = file.readlines() + + # print(file_lines[0]) + + file_lines = [replace_extra_chars(line) for line in file_lines] + # print(file_lines[0]) + + fnames, ftexts = [], [] + for line in file_lines: + elems = line.split('"') + fnames.append(elems[0].strip()) + ftexts.append(elems[1].strip()) + + all_chars = list(set("".join(ftexts))) + punct_with_space = [i for i in all_chars if i in list(string.punctuation)] + [" "] + chars = [i for i in all_chars if i not in punct_with_space if i.strip()] + chars = "".join(chars) + punct_with_space = "".join(punct_with_space) + + with open('../../config/glow/base_blank.json', 'r') as jfile: + json_config = json.load(jfile) + + json_config["data"]["chars"] = chars + json_config["data"]["punc"] = punct_with_space + json_config["data"]["training_files"]=out_path_for_txts + '/train.txt' + json_config["data"]["validation_files"] = out_path_for_txts + '/valid.txt' + new_config_name = out_path_for_txts.split('/')[-1] + with open(f'../../config/glow/{new_config_name}.json','w+') as jfile: + json.dump(json_config, jfile) + + print(f"Characters: {chars}") + print(f"Punctuation: {punct_with_space}") + print(f"Config file is stored at ../../config/glow/{new_config_name}.json") + + outfile_f = open(outfile, "w+", encoding="utf-8") + for f, t in zip(fnames, ftexts): + print( + os.path.join(wav_path_for_annotations_txt, f) + ".wav", + t, + sep="|", + file=outfile_f, + ) + outfile_f.close() + write_txt(punct_with_space, os.path.join(out_path_for_txts, "punc.txt")) + write_txt(chars, os.path.join(out_path_for_txts, "chars.txt")) + + save_train_test_valid_split( + annotations_txt=outfile, + num_samples_valid=num_samples_valid, + num_samples_test=num_samples_test, + ) + + + + +if __name__ == "__main__": + + + parser = argparse.ArgumentParser() + parser.add_argument("-i", "--text-path", type=str, required=True) + parser.add_argument("-o", "--output-path", type=str, required=True) + parser.add_argument("-w", "--wav-path", type=str, required=True) + parser.add_argument("-v", "--valid-samples", type=int, default = 100) + parser.add_argument("-t", "--test-samples", 
type=int, default = 10) + args = parser.parse_args() + + save_txts_from_txt_done_data( + args.text_path, + args.wav_path, + args.output_path, + args.valid_samples, + args.test_samples, + ) diff --git a/ttsv/utils/glow/prepare_iitm_data_glow_en.py b/ttsv/utils/glow/prepare_iitm_data_glow_en.py new file mode 100644 index 0000000000000000000000000000000000000000..827bdc98f2d84090cc445d786ff8fc1e5ff3d829 --- /dev/null +++ b/ttsv/utils/glow/prepare_iitm_data_glow_en.py @@ -0,0 +1,135 @@ +import os +from glob import glob +import re +import string +import argparse +import json +import random +random.seed(42) + +def replace_extra_chars(line): + line = line.replace("(", "").replace( + ")", "" + ) # .replace('\u200d', ' ').replace('\ufeff', ' ').replace('\u200c', ' ').replace('\u200e', ' ') + # line = line.replace('“', ' ').replace('”', ' ').replace(':', ' ') + + return line.strip() + + +def write_txt(content, filename): + with open(filename, "w+", encoding="utf-8") as f: + f.write(content) + + +def save_train_test_valid_split(annotations_txt, num_samples_valid, num_samples_test): + with open(annotations_txt, encoding="utf-8") as f: + all_lines = [line.strip() for line in f.readlines()] + test_val_indices = random.sample( + range(len(all_lines)), num_samples_valid + num_samples_test + ) + valid_ix = test_val_indices[:num_samples_valid] + test_ix = test_val_indices[num_samples_valid:] + train = [line for i, line in enumerate(all_lines) if i not in test_val_indices] + valid = [line for i, line in enumerate(all_lines) if i in valid_ix] + test = [line for i, line in enumerate(all_lines) if i in test_ix] + + print(f"Num samples in train: {len(train)}") + print(f"Num samples in valid: {len(valid)}") + print(f"Num samples in test: {len(test)}") + + out_dir_path = "/".join(annotations_txt.split("/")[:-1]) + with open(os.path.join(out_dir_path, "train.txt"), "w+", encoding="utf-8") as f: + for line in train: + print(line, file=f) + with open(os.path.join(out_dir_path, "valid.txt"), "w+", encoding="utf-8") as f: + for line in valid: + print(line, file=f) + with open(os.path.join(out_dir_path, "test.txt"), "w+", encoding="utf-8") as f: + for line in test: + print(line, file=f) + print(f"train, test and valid txts saved in {out_dir_path}") + + +def save_txts_from_txt_done_data( + text_path, + wav_path_for_annotations_txt, + out_path_for_txts, + num_samples_valid, + num_samples_test, +): + outfile = os.path.join(out_path_for_txts, "annotations.txt") + with open(text_path) as file: + file_lines = file.readlines() + + # print(file_lines[0]) + + file_lines = [replace_extra_chars(line) for line in file_lines] + # print(file_lines[0]) + + fnames, ftexts = [], [] + for line in file_lines: + elems = line.split('"') + fnames.append(elems[0].strip()) + ftexts.append(elems[1].strip().lower().replace('‘','\'').replace('’','\'')) + + all_chars = list(set("".join(ftexts))) + punct_with_space = [i for i in all_chars if i in list(string.punctuation)] + [" "] + chars = [i for i in all_chars if i not in punct_with_space if i.strip()] + chars = "".join(chars) + punct_with_space = "".join(punct_with_space)#.replace("'",r"\'") + + with open('../../config/glow/base_blank.json', 'r') as jfile: + json_config = json.load(jfile) + + json_config["data"]["chars"] = chars + json_config["data"]["punc"] = punct_with_space + json_config["data"]["training_files"]=out_path_for_txts + '/train.txt' + json_config["data"]["validation_files"] = out_path_for_txts + '/valid.txt' + new_config_name = out_path_for_txts.split('/')[-1] + with 
open(f'../../config/glow/{new_config_name}.json','w+') as jfile: + json.dump(json_config, jfile) + + print(f"Characters: {chars}") + print(f"Len of vocab: {len(chars)}") + print(f"Punctuation: {punct_with_space}") + print(f"Config file is stored at ../../config/glow/{new_config_name}.json") + + outfile_f = open(outfile, "w+", encoding="utf-8") + for f, t in zip(fnames, ftexts): + print( + os.path.join(wav_path_for_annotations_txt, f) + ".wav", + t, + sep="|", + file=outfile_f, + ) + outfile_f.close() + write_txt(punct_with_space, os.path.join(out_path_for_txts, "punc.txt")) + write_txt(chars, os.path.join(out_path_for_txts, "chars.txt")) + + save_train_test_valid_split( + annotations_txt=outfile, + num_samples_valid=num_samples_valid, + num_samples_test=num_samples_test, + ) + + + + +if __name__ == "__main__": + + + parser = argparse.ArgumentParser() + parser.add_argument("-i", "--text-path", type=str, required=True) + parser.add_argument("-o", "--output-path", type=str, required=True) + parser.add_argument("-w", "--wav-path", type=str, required=True) + parser.add_argument("-v", "--valid-samples", type=int, default = 100) + parser.add_argument("-t", "--test-samples", type=int, default = 10) + args = parser.parse_args() + + save_txts_from_txt_done_data( + args.text_path, + args.wav_path, + args.output_path, + args.valid_samples, + args.test_samples, + ) diff --git a/ttsv/utils/hifi/prepare_iitm_data_hifi.py b/ttsv/utils/hifi/prepare_iitm_data_hifi.py new file mode 100644 index 0000000000000000000000000000000000000000..1e1de2e28735143aeef8ddb10bc5a4672c02564b --- /dev/null +++ b/ttsv/utils/hifi/prepare_iitm_data_hifi.py @@ -0,0 +1,64 @@ + +import glob +import random +import sys +import os +import argparse + + + + +def process_data(args): + + path = args.input_path + valid_files = args.valid_files + test_files = args.test_files + dest_path = args.dest_path + + list_paths = path.split(',') + + valid_set = [] + training_set = [] + test_set = [] + + for local_path in list_paths: + files = glob.glob(local_path+'/*.wav') + print(f"Total files: {len(files)}") + + valid_set_local = random.sample(files, valid_files) + + test_set_local = random.sample(valid_set_local, test_files) + valid_set.extend(list(set(valid_set_local) - set(test_set_local))) + test_set.extend(test_set_local) + + print(len(valid_set_local)) + + training_set_local = set(files) - set(valid_set_local) + print(len(training_set_local)) + training_set.extend(training_set_local) + + + valid_set = random.sample(valid_set, len(valid_set)) + test_set = random.sample(test_set, len(test_set)) + training_set = random.sample(training_set, len(training_set)) + + with open(os.path.join(dest_path , 'valid.txt'), mode = 'w+') as file: + file.write("\n".join(list(valid_set))) + + with open(os.path.join(dest_path , 'train.txt'), mode = 'w+') as file: + file.write("\n".join(list(training_set))) + + with open(os.path.join(dest_path , 'test.txt'), mode = 'w+') as file: + file.write("\n".join(list(test_set))) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument('-i','--input-path',type=str,help='path to input wav files') + parser.add_argument('-v','--valid-files',type=int,help='number of valid files') + parser.add_argument('-t','--test-files',type=int,help='number of test files') + parser.add_argument('-d','--dest-path',type=str,help='destination path to output filelists') + + args = parser.parse_args() + + process_data(args) \ No newline at end of file diff --git a/ttsv/utils/inference/__init__.py 
b/ttsv/utils/inference/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/ttsv/utils/inference/advanced_tts.py b/ttsv/utils/inference/advanced_tts.py new file mode 100644 index 0000000000000000000000000000000000000000..6f8e2f5870e0f7dcd28c35c71cde58de6f1ae415 --- /dev/null +++ b/ttsv/utils/inference/advanced_tts.py @@ -0,0 +1,155 @@ + +from .tts import TextToMel, MelToWav +from .transliterate import XlitEngine +from .num_to_word_on_sent import normalize_nums + +import re +import numpy as np +from scipy.io.wavfile import write + +from mosestokenizer import * +from indicnlp.tokenize import sentence_tokenize +import argparse + +_INDIC = ["as", "bn", "gu", "hi", "kn", "ml", "mr", "or", "pa", "ta", "te"] +_PURAM_VIRAM_LANGUAGES = ["hi", "or", "bn", "as"] +_TRANSLITERATION_NOT_AVAILABLE_IN = ["en","or"] +#_NUM2WORDS_NOT_AVAILABLE_IN = [] + +def normalize_text(text, lang): + if lang in _PURAM_VIRAM_LANGUAGES: + text = text.replace('|', '।') + text = text.replace('.', '।') + return text + +def split_sentences(paragraph, language): + if language == "en": + with MosesSentenceSplitter(language) as splitter: + return splitter([paragraph]) + elif language in _INDIC: + return sentence_tokenize.sentence_split(paragraph, lang=language) + + + +def load_models(acoustic, vocoder, device): + text_to_mel = TextToMel(glow_model_dir=acoustic, device=device) + mel_to_wav = MelToWav(hifi_model_dir=vocoder, device=device) + return text_to_mel, mel_to_wav + + +def translit(text, lang): + reg = re.compile(r'[a-zA-Z]') + words = [engine.translit_word(word, topk=1)[lang][0] if reg.match(word) else word for word in text.split()] + updated_sent = ' '.join(words) + return updated_sent + + + +def run_tts(text, lang, args): + if lang == 'hi': + text = text.replace('।', '.') # only for hindi models + + if lang == 'en' and text[-1] != '.': + text = text + '. 
' + + if args.number_conversion == 1 and lang!='en': + print("Doing number conversion") + text_num_to_word = normalize_nums(text, lang) # converting numbers to words in lang + else: + text_num_to_word = text + + + if args.transliteration == 1 and lang not in _TRANSLITERATION_NOT_AVAILABLE_IN: + print("Doing transliteration") + text_num_to_word_and_transliterated = translit(text_num_to_word, lang) # transliterating english words to lang + else: + text_num_to_word_and_transliterated = text_num_to_word + + final_text = ' ' + text_num_to_word_and_transliterated + print(final_text) + mel = text_to_mel.generate_mel(final_text, args.noise_scale, args.length_scale) + audio, sr = mel_to_wav.generate_wav(mel) + return sr, audio + +def run_tts_paragraph(args): + audio_list = [] + + global text_to_mel + global mel_to_wav + + if args.gender == 'Male': + text_to_mel = text_to_mel_list[1] + mel_to_wav = mel_to_wav_list[1] + else: + text_to_mel = text_to_mel_list[0] + mel_to_wav = mel_to_wav_list[0] + + + if args.split_sentences == 1: + text = normalize_text(args.text, args.lang) + split_sentences_list = split_sentences(text, args.lang) + + for sent in split_sentences_list: + + sr, audio = run_tts(sent, args.lang, args) + audio_list.append(audio) + + concatenated_audio = np.concatenate([i for i in audio_list]) + if args.wav: + write(filename=args.wav, rate=sr, data=concatenated_audio) + return (sr, concatenated_audio) + else: + sr, audio = run_tts(args.text, args.lang, args) + if args.wav: + write(filename=args.wav, rate=sr, data=audio) + return (sr, audio) + + +def load_all_models(args): + global engine + if args.lang not in _TRANSLITERATION_NOT_AVAILABLE_IN: + engine = XlitEngine(args.lang) # loading translit model globally + + global text_to_mel_list + global mel_to_wav_list + + + text_to_mel_list = [] + mel_to_wav_list = [] + + for acoustic, vocoder in zip( args.acoustic.split(',') , args.vocoder.split(',') ): + ttm, mtw = load_models(acoustic, vocoder, args.device) + text_to_mel_list.append(ttm) + mel_to_wav_list.append(mtw) + + try: + args.noise_scale = float(args.noise_scale) + args.length_scale = float(args.length_scale) + except: + pass + + print(args) + + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("-a", "--acoustic", required=True, type=str) + parser.add_argument("-v", "--vocoder", required=True, type=str) + parser.add_argument("-d", "--device", type=str, default="cpu") + parser.add_argument("-t", "--text", type=str, required=True) + parser.add_argument("-w", "--wav", type=str, required=True) + parser.add_argument("-n", "--noise-scale", default='0.667', type=str ) + parser.add_argument("-l", "--length-scale", default='1.0', type=str) + + parser.add_argument("-T", "--transliteration", default=1, type=int) + parser.add_argument("-N", "--number-conversion", default=1, type=int) + parser.add_argument("-S", "--split-sentences", default=1, type=int) + parser.add_argument("-L", "--lang", type=str, required=True) + + args = parser.parse_args() + + load_all_models(args) + run_tts_paragraph(args) + + diff --git a/ttsv/utils/inference/api.py b/ttsv/utils/inference/api.py new file mode 100644 index 0000000000000000000000000000000000000000..d6bcabd194a4531801941d5e1d248dc134ce255f --- /dev/null +++ b/ttsv/utils/inference/api.py @@ -0,0 +1,66 @@ +from starlette.responses import StreamingResponse +from tts import MelToWav, TextToMel +from advanced_tts import load_all_models, run_tts_paragraph +from typing import Optional +from pydantic import BaseModel +from 
fastapi import FastAPI, HTTPException +import uvicorn +import base64 +import argparse +import json +import time +from argparse import Namespace + +app = FastAPI() + + +class TextJson(BaseModel): + text: str + lang: Optional[str] = "hi" + noise_scale: Optional[float]=0.667 + length_scale: Optional[float]=1.0 + transliteration: Optional[int]=1 + number_conversion: Optional[int]=1 + split_sentences: Optional[int]=1 + + + + +@app.post("/TTS/") +async def tts(input: TextJson): + text = input.text + lang = input.lang + + args = Namespace(**input.dict()) + + args.wav = '../../results/api/'+str(int(time.time())) + '.wav' + + if text: + sr, audio = run_tts_paragraph(args) + else: + raise HTTPException(status_code=400, detail={"error": "No text"}) + + ## to return outpur as a file + audio = open(args.wav, mode='rb') + return StreamingResponse(audio, media_type="audio/wav") + + # with open(args.wav, "rb") as audio_file: + # encoded_bytes = base64.b64encode(audio_file.read()) + # encoded_string = encoded_bytes.decode() + # return {"encoding": "base64", "data": encoded_string, "sr": sr} + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("-a", "--acoustic", required=True, type=str) + parser.add_argument("-v", "--vocoder", required=True, type=str) + parser.add_argument("-d", "--device", type=str, default="cpu") + parser.add_argument("-L", "--lang", type=str, required=True) + + args = parser.parse_args() + + load_all_models(args) + + uvicorn.run( + "api:app", host="0.0.0.0", port=6006, log_level="debug" + ) diff --git a/ttsv/utils/inference/num_to_word_on_sent.py b/ttsv/utils/inference/num_to_word_on_sent.py new file mode 100644 index 0000000000000000000000000000000000000000..ce878a8c3ee6f5ef629abeaee418d5959f7179ed --- /dev/null +++ b/ttsv/utils/inference/num_to_word_on_sent.py @@ -0,0 +1,1314 @@ +import re +import string + +# ----------------------------- indic_num.py ----------------------------- +supported_lang = {"en", "hi", "gu", "mr", "bn", "te", "ta", "kn", "or", "pa"} +# supported_lang = {'eng', 'hin', 'guj', 'mar', 'ben', 'tel', 'tam', 'kan', 'ori', 'pan'} # Three alphabet lang code + +all_num = { + "en": ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"], + "hi": ["०", "१", "२", "३", "४", "५", "६", "७", "८", "९"], + "gu": ["૦", "૧", "૨", "૩", "૪", "૫", "૬", "૭", "૮", "૯"], + "mr": ["०", "१", "२", "३", "४", "५", "६", "७", "८", "९"], + "bn": ["০", "১", "২", "৩", "৪", "৫", "৬", "৭", "৮", "৯"], + "te": ["౦", "౧", "౨", "౩", "౪", "౫", "౬", "౭", "౮", "౯"], + "ta": ["0", "௧", "௨", "௩", "௪", "௫", "௬", "௭", "௮", "௯", "௰"], + "kn": ["೦", "೧", "೨", "೩", "೪", "೫", "೬", "೭", "೮", "೯"], + "or": ["୦", "୧", "୨", "୩", "୪", "୫", "୬", "୭", "୮", "୯"], + "pa": ["੦", "੧", "੨", "੩", "੪", "੫", "੬", "੭", "੮", "੯"], +} + +num_dict = dict() +num_dict["en"] = { + "0": "zero", + "1": "one", + "2": "two", + "3": "three", + "4": "four", + "5": "five", + "6": "six", + "7": "seven", + "8": "eight", + "9": "nine", + "10": "ten", + "11": "eleven", + "12": "twelve", + "13": "thirteen", + "14": "fourteen", + "15": "fifteen", + "16": "sixteen", + "17": "seventeen", + "18": "eighteen", + "19": "nineteen", + "20": "twenty", + "21": "twenty-one", + "22": "twenty-two", + "23": "twenty-three", + "24": "twenty-four", + "25": "twenty-five", + "26": "twenty-six", + "27": "twenty-seven", + "28": "twenty-eight", + "29": "twenty-nine", + "30": "thirty", + "31": "thirty-one", + "32": "thirty-two", + "33": "thirty-three", + "34": "thirty-four", + "35": "thirty-five", + "36": "thirty-six", + "37": 
"thirty-seven", + "38": "thirty-eight", + "39": "thirty-nine", + "40": "forty", + "41": "forty-one", + "42": "forty-two", + "43": "forty-three", + "44": "forty-four", + "45": "forty-five", + "46": "forty-six", + "47": "forty-seven", + "48": "forty-eight", + "49": "forty-nine", + "50": "fifty", + "51": "fifty-one", + "52": "fifty-two", + "53": "fifty-three", + "54": "fifty-four", + "55": "fifty-five", + "56": "fifty-six", + "57": "fifty-seven", + "58": "fifty-eight", + "59": "fifty-nine", + "60": "sixty", + "61": "sixty-one", + "62": "sixty-two", + "63": "sixty-three", + "64": "sixty-four", + "65": "sixty-five", + "66": "sixty-six", + "67": "sixty-seven", + "68": "sixty-eight", + "69": "sixty-nine", + "70": "seventy", + "71": "seventy-one", + "72": "seventy-two", + "73": "seventy-three", + "74": "seventy-four", + "75": "seventy-five", + "76": "seventy-six", + "77": "seventy-seven", + "78": "seventy-eight", + "79": "seventy-nine", + "80": "eighty", + "81": "eighty-one", + "82": "eighty-two", + "83": "eighty-three", + "84": "eighty-four", + "85": "eighty-five", + "86": "eighty-six", + "87": "eighty-seven", + "88": "eighty-eight", + "89": "eighty-nine", + "90": "ninety", + "91": "ninety-one", + "92": "ninety-two", + "93": "ninety-three", + "94": "ninety-four", + "95": "ninety-five", + "96": "ninety-six", + "97": "ninety-seven", + "98": "ninety-eight", + "99": "ninety-nine", + "100": "hundred", + "1000": "thousand", + "100000": "lac", + "10000000": "crore", + "1000000000": "arab", +} # English-India +num_dict["hi"] = { + "0": "शून्य", + "1": "एक", + "2": "दो", + "3": "तीन", + "4": "चार", + "5": "पाँच", + "6": "छः", + "7": "सात", + "8": "आठ", + "9": "नौ", + "10": "दस", + "11": "ग्यारह", + "12": "बारह", + "13": "तेरह", + "14": "चौदह", + "15": "पंद्रह", + "16": "सोलह", + "17": "सत्रह", + "18": "अट्ठारह", + "19": "उन्नीस", + "20": "बीस", + "21": "इक्कीस", + "22": "बाईस", + "23": "तेईस", + "24": "चौबिस", + "25": "पच्चीस", + "26": "छब्बीस", + "27": "सत्ताईस", + "28": "अट्ठाईस", + "29": "उनतीस", + "30": "तीस", + "31": "इकतीस", + "32": "बत्तीस", + "33": "तैंतीस", + "34": "चौंतीस", + "35": "पैंतीस", + "36": "छत्तीस", + "37": "सैंतीस", + "38": "अड़तीस", + "39": "उनतालीस", + "40": "चालीस", + "41": "इकतालीस", + "42": "बयालीस", + "43": "तैंतालीस", + "44": "चौंतालीस", + "45": "पैंतालीस", + "46": "छियालीस", + "47": "सैंतालीस", + "48": "अड़तालीस", + "49": "उनचास", + "50": "पचास", + "51": "इक्यावन​", + "52": "बावन", + "53": "तिरेपन", + "54": "चौवन", + "55": "पचपन", + "56": "छप्पन", + "57": "सत्तावन", + "58": "अट्ठावन", + "59": "उनसठ", + "60": "साठ", + "61": "इकसठ", + "62": "बासठ", + "63": "तिरेसठ", + "64": "चौंसठ", + "65": "पैंसठ", + "66": "छयासठ", + "67": "सरसठ​", + "68": "अड़सठ", + "69": "उनहत्तर", + "70": "सत्तर", + "71": "इकहत्तर", + "72": "बहत्तर", + "73": "तिहत्तर", + "74": "चौहत्तर", + "75": "पचहत्तर", + "76": "छिहत्तर", + "77": "सतहत्तर", + "78": "अठहत्तर", + "79": "उन्यासी", + "80": "अस्सी", + "81": "इक्यासी", + "82": "बयासी", + "83": "तिरासी", + "84": "चौरासी", + "85": "पचासी", + "86": "छियासी", + "87": "सत्तासी", + "88": "अठासी", + "89": "नवासी", + "90": "नब्बे", + "91": "इक्यानवे", + "92": "बानवे", + "93": "तिरानवे", + "94": "चौरानवे", + "95": "पचानवे", + "96": "छियानवे", + "97": "सत्तानवे", + "98": "अट्ठानवे", + "99": "निन्यानवे", + "100": "सौ", + "1000": "हज़ार", + "100000": "लाख", + "10000000": "करोड़", + "1000000000": "अरब", +} # Hindi +num_dict["gu"] = { + "0": "શૂન્ય", + "1": "એક", + "2": "બે", + "3": "ત્રણ", + "4": "ચાર", + "5": "પાંચ", + "6": "છ", + "7": "સાત", + "8": "આઠ", + "9": "નવ", + 
"10": "દસ", + "11": "અગિયાર", + "12": "બાર", + "13": "તેર", + "14": "ચૌદ", + "15": "પંદર", + "16": "સોળ", + "17": "સત્તર", + "18": "અઢાર", + "19": "ઓગણિસ", + "20": "વીસ", + "21": "એકવીસ", + "22": "બાવીસ", + "23": "તેવીસ", + "24": "ચોવીસ", + "25": "પચ્ચીસ", + "26": "છવીસ", + "27": "સત્તાવીસ", + "28": "અઠ્ઠાવીસ", + "29": "ઓગણત્રીસ", + "30": "ત્રીસ", + "31": "એકત્રીસ", + "32": "બત્રીસ", + "33": "તેત્રીસ", + "34": "ચોત્રીસ", + "35": "પાંત્રીસ", + "36": "છત્રીસ", + "37": "સડત્રીસ", + "38": "અડત્રીસ", + "39": "ઓગણચાલીસ", + "40": "ચાલીસ", + "41": "એકતાલીસ", + "42": "બેતાલીસ", + "43": "ત્રેતાલીસ", + "44": "ચુંમાલીસ", + "45": "પિસ્તાલીસ", + "46": "છેતાલીસ", + "47": "સુડતાલીસ", + "48": "અડતાલીસ", + "49": "ઓગણપચાસ", + "50": "પચાસ", + "51": "એકાવન", + "52": "બાવન", + "53": "ત્રેપન", + "54": "ચોપન", + "55": "પંચાવન", + "56": "છપ્પન", + "57": "સત્તાવન", + "58": "અઠ્ઠાવન", + "59": "ઓગણસાઠ", + "60": "સાઈઠ", + "61": "એકસઠ", + "62": "બાસઠ", + "63": "ત્રેસઠ", + "64": "ચોસઠ", + "65": "પાંસઠ", + "66": "છાસઠ", + "67": "સડસઠ", + "68": "અડસઠ", + "69": "અગણોસિત્તેર", + "70": "સિત્તેર", + "71": "એકોતેર", + "72": "બોતેર", + "73": "તોતેર", + "74": "ચુમોતેર", + "75": "પંચોતેર", + "76": "છોતેર", + "77": "સિત્યોતેર", + "78": "ઇઠ્યોતેર", + "79": "ઓગણાએંસી", + "80": "એંસી", + "81": "એક્યાસી", + "82": "બ્યાસી", + "83": "ત્યાસી", + "84": "ચોર્યાસી", + "85": "પંચાસી", + "86": "છ્યાસી", + "87": "સિત્યાસી", + "88": "ઈઠ્યાસી", + "89": "નેવ્યાસી", + "90": "નેવું", + "91": "એકાણું", + "92": "બાણું", + "93": "ત્રાણું", + "94": "ચોરાણું", + "95": "પંચાણું", + "96": "છન્નું", + "97": "સત્તાણું", + "98": "અઠ્ઠાણું", + "99": "નવ્વાણું", + "100": "સો", + "1000": "હજાર", + "100000": "લાખ", + "1000000": "દસ લાખ", + "10000000": "કરોડ઼", +} # Gujarati +num_dict["mr"] = { + "0": "शून्य", + "1": "एक", + "2": "दोन", + "3": "तीन", + "4": "चार", + "5": "पाच", + "6": "सहा", + "7": "सात", + "8": "आठ", + "9": "नऊ", + "10": "दहा", + "11": "अकरा", + "12": "बारा", + "13": "तेरा", + "14": "चौदा", + "15": "पंधरा", + "16": "सोळा", + "17": "सतरा", + "18": "अठरा", + "19": "एकोणीस", + "20": "वीस", + "21": "एकवीस", + "22": "बावीस", + "23": "तेवीस", + "24": "चोवीस", + "25": "पंचवीस", + "26": "सव्वीस", + "27": "सत्तावीस", + "28": "अठ्ठावीस", + "29": "एकोणतीस", + "30": "तीस", + "31": "एकतीस", + "32": "बत्तीस", + "33": "तेहेतीस", + "34": "चौतीस", + "35": "पस्तीस", + "36": "छत्तीस", + "37": "सदतीस", + "38": "अडतीस", + "39": "एकोणचाळीस", + "40": "चाळीस", + "41": "एक्केचाळीस", + "42": "बेचाळीस", + "43": "त्रेचाळीस", + "44": "चव्वेचाळीस", + "45": "पंचेचाळीस", + "46": "सेहेचाळीस", + "47": "सत्तेचाळीस", + "48": "अठ्ठेचाळीस", + "49": "एकोणपन्नास", + "50": "पन्नास", + "51": "एक्कावन्न", + "52": "बावन्न", + "53": "त्रेपन्न", + "54": "चोपन्न", + "55": "पंचावन्न", + "56": "छप्पन्न", + "57": "सत्तावन्न", + "58": "अठ्ठावन्न", + "59": "एकोणसाठ", + "60": "साठ", + "61": "एकसष्ठ", + "62": "बासष्ठ", + "63": "त्रेसष्ठ", + "64": "चौसष्ठ", + "65": "पासष्ठ", + "66": "सहासष्ठ", + "67": "सदुसष्ठ", + "68": "अडुसष्ठ", + "69": "एकोणसत्तर", + "70": "सत्तर", + "71": "एक्काहत्तर", + "72": "बाहत्तर", + "73": "त्र्याहत्तर", + "74": "चौर्‍याहत्तर", + "75": "पंच्याहत्तर", + "76": "शहात्तर", + "77": "सत्याहत्तर", + "78": "अठ्ठ्याहत्तर", + "79": "एकोण ऐंशी", + "80": "ऐंशी", + "81": "एक्क्याऐंशी", + "82": "ब्याऐंशी", + "83": "त्र्याऐंशी", + "84": "चौऱ्याऐंशी", + "85": "पंच्याऐंशी", + "86": "शहाऐंशी", + "87": "सत्त्याऐंशी", + "88": "अठ्ठ्याऐंशी", + "89": "एकोणनव्वद", + "90": "नव्वद", + "91": "एक्क्याण्णव", + "92": "ब्याण्णव", + "93": "त्र्याण्णव", + "94": "चौऱ्याण्णव", + "95": "पंच्याण्णव", + "96": 
"शहाण्णव", + "97": "सत्त्याण्णव", + "98": "अठ्ठ्याण्णव", + "99": "नव्व्याण्णव", + "100": "शे", + "1000": "हजार", + "100000": "लाख", + "10000000": "कोटी", + "1000000000": "अब्ज", +} # Marathi +num_dict["bn"] = { + "0": "শূন্য", + "1": "এক", + "2": "দুই", + "3": "তিন", + "4": "চার", + "5": "পাঁচ", + "6": "ছয়", + "7": "সাত", + "8": "আট", + "9": "নয়", + "10": "দশ", + "11": "এগার", + "12": "বার", + "13": "তের", + "14": "চৌদ্দ", + "15": "পনের", + "16": "ষোল", + "17": "সতের", + "18": "আঠার", + "19": "ঊনিশ", + "20": "বিশ", + "21": "একুশ", + "22": "বাইশ", + "23": "তেইশ", + "24": "চব্বিশ", + "25": "পঁচিশ", + "26": "ছাব্বিশ", + "27": "সাতাশ", + "28": "আঠাশ", + "29": "ঊনত্রিশ", + "30": "ত্রিশ", + "31": "একত্রিশ", + "32": "বত্রিশ", + "33": "তেত্রিশ", + "34": "চৌত্রিশ", + "35": "পঁয়ত্রিশ", + "36": "ছত্রিশ", + "37": "সাঁইত্রিশ", + "38": "আটত্রিশ", + "39": "ঊনচল্লিশ", + "40": "চল্লিশ", + "41": "একচল্লিশ", + "42": "বিয়াল্লিশ", + "43": "তেতাল্লিশ", + "44": "চুয়াল্লিশ", + "45": "পঁয়তাল্লিশ", + "46": "ছেচল্লিশ", + "47": "সাতচল্লিশ", + "48": "আটচল্লিশ", + "49": "ঊনপঞ্চাশ", + "50": "পঞ্চাশ", + "51": "একান্ন", + "52": "বায়ান্ন", + "53": "তিপ্পান্ন", + "54": "চুয়ান্ন", + "55": "পঞ্চান্ন", + "56": "ছাপ্পান্ন", + "57": "সাতান্ন", + "58": "আটান্ন", + "59": "ঊনষাট", + "60": "ষাট", + "61": "একষট্টি", + "62": "বাষট্টি", + "63": "তেষট্টি", + "64": "চৌষট্টি", + "65": "পঁয়ষট্টি", + "66": "ছেষট্টি", + "67": "সাতষট্টি", + "68": "আটষট্টি", + "69": "ঊনসত্তর", + "70": "সত্তর", + "71": "একাত্তর", + "72": "বাহাত্তর", + "73": "তিয়াত্তর", + "74": "চুয়াত্তর", + "75": "পঁচাত্তর", + "76": "ছিয়াত্তর", + "77": "সাতাত্তর", + "78": "আটাত্তর", + "79": "ঊনআশি", + "80": "আশি", + "81": "একাশি", + "82": "বিরাশি", + "83": "তিরাশি", + "84": "চুরাশি", + "85": "পঁচাশি", + "86": "ছিয়াশি", + "87": "সাতাশি", + "88": "আটাশি", + "89": "ঊননব্বই", + "90": "নব্বই", + "91": "একানব্বই", + "92": "বিরানব্বই", + "93": "তিরানব্বই", + "94": "চুরানব্বই", + "95": "পঁচানব্বই", + "96": "ছিয়ানব্বই", + "97": "সাতানব্বই", + "98": "আটানব্বই", + "99": "নিরানব্বই", + "100": "শো", + "1000": "হাজার", + "100000": "লাখ", + "10000000": "কোটি", + "1000000000": "একশ’ কোটি", +} # Bengali +num_dict["te"] = { + "0": "సున్నా", + "1": "ఒకటి", + "2": "రెండు", + "3": "మూడు", + "4": "నాలుగు", + "5": "ఐదు", + "6": "ఆరు", + "7": "ఏడు", + "8": "ఎనిమిది", + "9": "తొమ్మిది", + "10": "పది", + "11": "పదకొండు", + "12": "పన్నెండు", + "13": "పదమూడు", + "14": "పద్నాలుగు", + "15": "పదిహేను", + "16": "పదహారు", + "17": "పదిహేడు", + "18": "పద్దెనిమిది", + "19": "పందొమ్మిది", + "20": "ఇరవై", + "21": "ఇరవై ఒకటి", + "22": "ఇరవై రెండు", + "23": "ఇరవై మూడు", + "24": "ఇరవై నాలుగు", + "25": "ఇరవై ఐదు", + "26": "ఇరవై ఆరు", + "27": "ఇరవై ఏడు", + "28": "ఇరవై ఎనిమిది", + "29": "ఇరవై తొమ్మిది", + "30": "ముప్పై", + "31": "ముప్పై ఒకటి", + "32": "ముప్పై రెండు", + "33": "ముప్పై మూడు", + "34": "ముప్పై నాలుగు", + "35": "ముప్పై ఐదు", + "36": "ముప్పై ఆరు", + "37": "ముప్పై ఏడు", + "38": "ముప్పై ఎనిమిది", + "39": "ముప్పై తొమ్మిది", + "40": "నలభై", + "41": "నలభై ఒకటి", + "42": "నలభై రెండు", + "43": "నలభై మూడు", + "44": "నలభై నాలుగు", + "45": "నలభై ఐదు", + "46": "నలభై ఆరు", + "47": "నలభై ఏడు", + "48": "నలభై ఎనిమిది", + "49": "నలభై తొమ్మిది", + "50": "యాభై", + "51": "యాభై ఒకటి", + "52": "యాభై రెండు", + "53": "యాభై మూడు", + "54": "యాభై నాలుగు", + "55": "యాభై ఐదు", + "56": "యాభై ఆరు", + "57": "యాభై ఏడు", + "58": "యాభై ఎనిమిది", + "59": "యాభై తొమ్మిది", + "60": "అరవై", + "61": "అరవై ఒకటి", + "62": "అరవై రెండు", + "63": "అరవై మూడు", + "64": "అరవై నాలుగు", + "65": "అరవై ఐదు", + "66": "అరవై ఆరు", + "67": "అరవై ఏడు", 
+ "68": "అరవై ఎనిమిది", + "69": "అరవై తొమ్మిది", + "70": "డెబ్బై", + "71": "డెబ్బై ఒకటి", + "72": "డెబ్బై రెండు", + "73": "డెబ్బై మూడు", + "74": "డెబ్బై నాలుగు", + "75": "డెబ్బై ఐదు", + "76": "డెబ్బై ఆరు", + "77": "డెబ్బై ఏడు", + "78": "డెబ్బై ఎనిమిది", + "79": "డెబ్బై తొమ్మిది", + "80": "ఎనభై", + "81": "ఎనభై ఒకటి", + "82": "ఎనభై రెండు", + "83": "ఎనభై మూడు", + "84": "ఎనభై నాలుగు", + "85": "ఎనభై ఐదు", + "86": "ఎనభై ఆరు", + "87": "ఎనభై ఏడు", + "88": "ఎనభై ఎనిమిది", + "89": "ఎనభై తొమ్మిది", + "90": "తొంభై", + "91": "తొంభై ఒకటి", + "92": "తొంభై రెండు", + "93": "తొంభై మూడు", + "94": "తొంభై నాలుగు", + "95": "తొంభై ఐదు", + "96": "తొంభై ఆరు", + "97": "తొంభై ఏడు", + "98": "తొంభై ఎనిమిది", + "99": "తొంభై తొమ్మిది", + "100": "వందల", + "1000": "వేల", + "100000": "లక్షల", + "10000000": "కోట్ల", + "1000000000": "బిలియన్", +} # Telugu +num_dict["ta"] = { + "0": "பூஜ்ஜியம்", + "1": "ஒன்று", + "2": "இரண்டு", + "3": "மூன்று", + "4": "நான்கு", + "5": "ஐந்து", + "6": "ஆறு", + "7": "ஏழு", + "8": "எட்டு", + "9": "ஒன்பது", + "10": "பத்து", + "11": "பதினொன்று", + "12": "பன்னிரண்டு", + "13": "பதிமூன்று", + "14": "பதினான்கு", + "15": "பதினைந்து", + "16": "பதினாறு", + "17": "பதினேழு", + "18": "பதினெட்டு", + "19": "பத்தொன்பது", + "20": "இருபது", + "21": "இருபது ஒன்று", + "22": "இருபத்து இரண்டு", + "23": "இருபத்து மூன்று", + "24": "இருபத்து நான்கு", + "25": "இருபத்து ஐந்து", + "26": "இருபத்து ஆறு", + "27": "இருபத்து ஏழு", + "28": "இருபத்து எட்டு", + "29": "இருபத்து ஒன்பது", + "30": "முப்பது", + "31": "முப்பத்து ஒன்று", + "32": "முப்பத்து இரண்டு", + "33": "முப்பத்து மூன்று", + "34": "முப்பத்து நான்கு", + "35": "முப்பத்து ஐந்து", + "36": "முப்பத்து ஆறு", + "37": "முப்பத்து ஏழு", + "38": "முப்பத்து எட்டு", + "39": "முப்பத்து ஒன்பது", + "40": "நாற்பது", + "41": "நாற்பத்து ஒன்று", + "42": "நாற்பத்து இரண்டு", + "43": "நாற்பத்து மூன்று", + "44": "நாற்பத்து நான்கு", + "45": "நாற்பத்து ஐந்து", + "46": "நாற்பத்து ஆறு", + "47": " நாற்பத்து ஏழு", + "48": "நாற்பத்து எட்டு", + "49": "நாற்பத்து ஒன்பது", + "50": "ஐம்பது", + "51": "ஐம்பத்து ஒன்று", + "52": "ஐம்பத்து இரண்டு", + "53": "ஐம்பத்து மூன்று", + "54": "ஐம்பத்து நான்கு", + "55": "ஐம்பத்து ஐந்து", + "56": "ஐம்பத்து ஆறு", + "57": "ஐம்பத்து ஏழு", + "58": "ஐம்பத்து எட்டு", + "59": "ஐம்பத்து ஒன்பது", + "60": "அறுபது", + "61": "அறுபத்து ஒன்று", + "62": "அறுபத்து இரண்டு", + "63": "அறுபத்து மூன்று", + "64": "அறுபத்து நான்கு", + "65": "அறுபத்து ஐந்து", + "66": "அறுபத்து ஆறு", + "67": "அறுபத்து ஏழு", + "68": "அறுபத்து எட்டு", + "69": "அறுபத்து ஒன்பது", + "70": "எழுபது", + "71": "எழுபத்தி ஒன்று", + "72": "எழுபத்தி இரண்டு", + "73": "எழுபத்தி முச்சக்கர", + "74": "எழுபத்தி நான்கு", + "75": "எழுபத்தி ஐந்து", + "76": "எழுபத்தி ஆறு", + "77": "எழுபத்தி ஏழு", + "78": "எழுபத்தி எட்டு", + "79": "எழுபத்தி ஒன்பது", + "80": "எண்பது", + "81": "எண்பத்தியொன்று", + "82": "எண்பத்திரண்டு", + "83": "எண்பத்திமூன்று", + "84": "என்பதினான்கு", + "85": "என்பதினைந்து", + "86": "எண்பத்திஆறு", + "87": "எண்பத்திஏழு", + "88": "எண்பத்தியெட்டு", + "89": "எண்பத்தியொன்பது", + "90": "தொன்னூறு", + "91": "தொண்ணூற்றியொன்று", + "92": "தொண்ணூற்றிரண்டு", + "93": "தொண்ணூற்றிமூன்று", + "94": "தொண்ணூற்றிநான்கு", + "95": "தொண்ணூற்றிஐந்து", + "96": "தொண்ணூற்றியாறு", + "97": "தொண்ணூற்றியேழு", + "98": "தொண்ணூற்றியெட்டு", + "99": "தொண்ணூற்றிஒன்பது", + "100": "நூறு", + "1000": "ஆயிரம்", + "100000": "இலட்சம்", + "10000000": "கோடி", + "1000000000": "பில்லியன்", +} # Tamil +num_dict["kn"] = { + "0": "ಸೊನ್ನೆ", + "1": "ಒಂದು", + "2": "ಎರಡು", + "3": "ಮೂರು", + "4": "ನಾಲ್ಕು", + "5": "ಅಯ್ದು", + "6": "ಆರು", + "7": "ಏಳು", + "8": "ಎಂಟು", + "9": 
"ಒಂಬತ್ತು", + "10": "ಹತ್ತು", + "11": "ಹನ್ನೊಂದು", + "12": "ಹನ್ನೆರಡು", + "13": "ಹದಿಮೂರು", + "14": "ಹದಿನಾಲ್ಕು", + "15": "ಹದಿನೈದು", + "16": "ಹದಿನಾರು", + "17": "ಹದಿನೇಳು", + "18": "ಹದಿನೆಂಟು", + "19": "ಹತ್ತೊಂಬತ್ತು", + "20": "ಇಪ್ಪತ್ತು", + "21": "ಇಪ್ಪತ್ತ್’ಒಂದು", + "22": "ಇಪ್ಪತ್ತ್’ಎರಡು", + "23": "ಇಪ್ಪತ್ತ್’ಮೂರು", + "24": "ಇಪ್ಪತ್ತ್’ನಾಲ್ಕು", + "25": "ಇಪ್ಪತ್ತ್’ಐದು", + "26": "ಇಪ್ಪತ್ತ್’ಆರು", + "27": "ಇಪ್ಪತ್ತ್’ಏಳು", + "28": "ಇಪ್ಪತ್ತ್’ಎಂಟು", + "29": "ಇಪ್ಪತ್ತ್’ಒಂಬತ್ತು", + "30": "ಮೂವತ್ತು", + "31": "ಮುವತ್ತ್’ಒಂದು", + "32": "ಮುವತ್ತ್’ಎರಡು", + "33": "ಮುವತ್ತ್’ಮೂರು", + "34": "ಮೂವತ್ತ್’ನಾಲ್ಕು", + "35": "ಮೂವತ್ತ್’ಐದು", + "36": "ಮೂವತ್ತ್’ಆರು", + "37": "ಮೂವತ್ತ್’ಏಳು", + "38": "ಮೂವತ್ತ್’ಎಂಟು", + "39": "ಮೂವತ್ತ್’ಒಂಬತ್ತು", + "40": "ನಲವತ್ತು", + "41": "ನಲವತ್ತೊಂದು", + "42": "ನಲವತ್ತ್ ಎರಡು", + "43": "ನಲವತ್ತ್ ಮೂರು", + "44": "ನಲವತ್ತ್ ನಾಲ್ಕು", + "45": "ನಲವತ್ತೈದು", + "46": "ನಲವತ್ತಾರು", + "47": "ನಲವತ್ತೇಳು", + "48": "ನಲವತ್ತೆಂಟು", + "49": "ನಲವತ್ತೊಂಬತ್ತು", + "50": "ಐವತ್ತು", + "51": "ಐವತ್ತೊಂದು", + "52": "ಐವತ್ತೆರಡು", + "53": "ಐವತ್ತಮೂರು", + "54": "ಐವತ್ತ್ನಾಲ್ಕು", + "55": "ಐವತ್ತೈದು", + "56": "ಐವತ್ತಾರು", + "57": "ಐವತ್ತೇಳು", + "58": "ಐವತ್ತೆಂಟು", + "59": "ಐವತ್ತೊಂಬತ್ತು", + "60": "ಅರವತ್ತು", + "61": "ಅರವತ್ತೊಂದು", + "62": "ಅರವತ್ತೆರಡು", + "63": "ಅರವತ್ತ್ ಮೂರು", + "64": "ಅರವತ್ತ್ ನಾಲ್ಕು", + "65": "ಅರವತ್ತೈದು", + "66": "ಅರವತ್ತಾರು", + "67": "ಅರವತ್ತೇಳು", + "68": "ಅರವತ್ತೆಂಟು", + "69": "ಅರವತ್ತೊಂಬತ್ತು", + "70": "ಎಪ್ಪತ್ತು", + "71": "ಎಪ್ಪತ್ತೊಂದು", + "72": "ಎಪ್ಪತ್ತೆರಡು", + "73": "ಎಪ್ಪತ್ತ್ ಮೂರು", + "74": "ಎಪ್ಪತ್ತ್ ನಾಲ್ಕು", + "75": "ಎಪ್ಪತ್ತೈದು", + "76": "ಎಪ್ಪತ್ತಾರು", + "77": "ಎಪ್ಪತ್ತೇಳು", + "78": "ಎಪ್ಪತ್ತೆಂಟು", + "79": "ಎಪ್ಪತ್ತೊಂಬತ್ತು", + "80": "ಎಂಬತ್ತು", + "81": "ಎಂಬತ್ತೊಂದು", + "82": "ಎಂಬತ್ತೆರಡು", + "83": "ಎಂಬತ್ತ್ ಮೂರು", + "84": "ಎಂಬತ್ತ್ ನಾಲ್ಕು", + "85": "ಎಂಬತ್ತೈದು", + "86": "ಎಂಬತ್ತಾರು", + "87": "ಎಂಬತ್ತೇಳು", + "88": "ಎಂಬತ್ತೆಂಟು", + "89": "ಎಂಬತ್ತೊಂಬತ್ತು", + "90": "ತೊಂಬತ್ತು", + "91": "ತೊಂಬತ್ತೊಂದು", + "92": "ತೊಂಬತ್ತೆರಡು", + "93": "ತೊಂಬತ್ತ ಮೂರು", + "94": "ತೊಂಬತ್ತ ನಾಲ್ಕು", + "95": "ತೊಂಬತ್ತೈದು", + "96": "ತೊಂಬತ್ತಾರು", + "97": "ತೊಂಬತ್ತೇಳು", + "98": "ತೊಂಬತ್ತೆಂಟು", + "99": "ತೊಂಬತ್ತೊಂಬತ್ತು", + "100": "ನೂರ", + "1000": "ಸಾವಿರದ", + "100000": "ಲಕ್ಷದ", + "10000000": "ಕೋಟಿ", + "1000000000": "ಶತಕೋಟಿ", +} # Kannada +num_dict["or"] = { + "0": "ଶୁନ୍ୟ", + "1": "ଏକ", + "2": "ଦୁଇ", + "3": "ତିନି", + "4": "ଚାରି", + "5": "ପାଞ୍ଚ", + "6": "ଛଅ", + "7": "ସାତ", + "8": "ଆଠ", + "9": "ନଅ", + "10": "ନଅ", + "11": "ଏଗାର", + "12": "ବାର", + "13": "ତେର", + "14": "ଚଉଦ", + "15": "ପନ୍ଦର", + "16": "ଷୋହଳ", + "17": "ସତର", + "18": "ଅଠର", + "19": "ଊଣାଇଶ", + "20": "କୋଡିଏ", + "21": "ଏକୋଇଶି", + "22": "ବାଇଶି", + "23": "ତେଇଶି", + "24": "ଚବିଶି", + "25": "ପଚିଶି", + "26": "ଛବିଶି", + "27": "ସତାଇଶି", + "28": "ଅଠାଇଶି", + "29": "ଅଣତିରିଶି", + "30": "ତିରିଶି", + "31": "ଏକତିରିଶି", + "32": "ବତିଶି", + "33": "ତେତିଶି", + "34": "ଚଉତିରିଶି", + "35": "ପଞ୍ଚତିରିଶି", + "36": "ଛତିଶି", + "37": "ସଂଇତିରିଶି", + "38": "ଅଠତିରିଶି", + "39": "ଅଣଚାଳିଶି", + "40": "ଚାଳିଶି", + "41": "ଏକଚାଳିଶି", + "42": "ବୟାଳିଶି", + "43": "ତେୟାଳିଶି", + "44": "ଚଉରାଳିଶି", + "45": "ପଞ୍ଚଚାଳିଶି", + "46": "ଛୟାଳିଶି", + "47": "ସତଚାଳିଶି", + "48": "ଅଠଚାଳିଶି", + "49": "ଅଣଚାଶ", + "50": "ପଚାଶ", + "51": "ଏକାବନ", + "52": "ବାଉନ", + "53": "ତେପନ", + "54": "ଚଉବନ", + "55": "ପଞ୍ଚାବନ", + "56": "ଛପନ", + "57": "ସତାବନ", + "58": "ଅଠାବନ", + "59": "ଅଣଷଠି", + "60": "ଷାଠିଏ", + "61": "ଏକଷଠି", + "62": "ବାଷଠି", + "63": "ତେଷଠି", + "64": "ଚଉଷଠି", + "65": "ପଞ୍ଚଷଠି", + "66": "ଛଅଷଠି", + "67": "ସତଷଠି", + "68": "ଅଠଷଠି", + "69": "ଅଣସ୍ତରୀ", + "70": "ସତୂରୀ", + "71": "ଏକସ୍ତରୀ", + "72": "ବାସ୍ତରୀ", + "73": "ତେସ୍ତରୀ", + "74": "ଚଉସ୍ତରୀ", + "75": "ପଞ୍ଚସ୍ତରୀ", + "76": "ଛଅସ୍ତରୀ", + "77": "ସତସ୍ତରୀ", + "78": "ଅଠସ୍ତରୀ", + "79": 
"ଅଣାଅଶୀ", + "80": "ଅଶୀ", + "81": "ଏକାଅଶୀ", + "82": "ବୟାଅଶୀ", + "83": "ତେୟାଅଶୀ", + "84": "ଚଉରାଅଶୀ", + "85": "ପଞ୍ଚାଅଶୀ", + "86": "ଛୟାଅଶୀ", + "87": "ସତାଅଶୀ", + "88": "ଅଠାଅଶୀ", + "89": "ଅଣାନବେ", + "90": "ନବେ", + "91": "ଏକାନବେ", + "92": "ବୟାନବେ", + "93": "ତେୟାନବେ", + "94": "ଚଉରାନବେ", + "95": "ପଞ୍ଚାନବେ", + "96": "ଛୟାନବେ", + "97": "ସତାନବେ", + "98": "ଅଠାନବେ", + "99": "ଅନେଶତ", + "100": "ଶହେ", + "1000": "ହଜାର", + "100000": "ଲକ୍ଷ", + "10000000": "କୋଟି", + "1000000000": "କୋଟି", +} # Oriya +num_dict["pa"] = { + "0": "ਸਿਫਰ ", + "1": "ਇੱਕ", + "2": "ਦੋ", + "3": "ਤਿੰਨ", + "4": "ਚਾਰ", + "5": "ਪੰਜ", + "6": "ਛੇ", + "7": "ਸੱਤ", + "8": "ਅੱਠ", + "9": "ਨੌਂ", + "10": "ਦੱਸ", + "11": "ਗਿਆਰਾਂ", + "12": "ਬਾਰਾਂ", + "13": "ਤੇਰਾਂ", + "14": "ਚੌਦਾਂ", + "15": "ਪੰਦਰਾਂ", + "16": "ਸੋਲ਼ਾਂ", + "17": "ਸਤਾਰਾਂ", + "18": "ਅਠਾਰਾਂ", + "19": "ਉਨੀ", + "20": "ਵੀਹ", + "21": "ਇੱਕੀ", + "22": "ਬਾਈ", + "23": "ਤੇਈ", + "24": "ਚੌਵੀ", + "25": "ਪੰਝੀ", + "26": "ਛੱਬੀ", + "27": "ਸਤਾਈ", + "28": "ਅਠਾਈ", + "29": "ਉਨੱਤੀ", + "30": "ਤੀਹ", + "31": "ਇਕੱਤੀ", + "32": "ਬੱਤੀ", + "33": "ਤੇਤੀ", + "34": "ਚੌਂਤੀ", + "35": "ਪੈਂਤੀ", + "36": "ਛੱਤੀ", + "37": "ਸੈਂਤੀ", + "38": "ਅਠੱਤੀ", + "39": "ਉਨਤਾਲੀ", + "40": "ਚਾਲੀ", + "41": "ਇਕਤਾਲੀ", + "42": "ਬਤਾਲੀ", + "43": "ਤਰਤਾਲੀ", + "44": "ਚੌਤਾਲੀ", + "45": "ਪੰਜਤਾਲੀ", + "46": "ਛਿਆਲੀ", + "47": "ਸੰਤਾਲੀ", + "48": "ਅੱਠਤਾਲੀ", + "49": "ਉਣਿੰਜਾ", + "50": "ਪੰਜਾਹ", + "51": "ਇਕਵਿੰਜਾ", + "52": "ਬਵਿੰਜਾ", + "53": "ਤਰਵਿੰਜਾ", + "54": "ਚਰਿੰਜਾ", + "55": "ਪਚਵਿੰਜਾ", + "56": "ਛਪਿੰਜਾ", + "57": "ਸਤਵਿੰਜਾ", + "58": "ਅੱਠਵਿੰਜਾ", + "59": "ਉਣਾਠ", + "60": "ਸੱਠ", + "61": "ਇਕਾਠ", + "62": "ਬਾਠ੍ਹ", + "63": "ਤਰੇਠ੍ਹ", + "64": "ਚੌਠ੍ਹ", + "65": "ਪੈਂਠ", + "66": "ਛਿਆਠ", + "67": "ਸਤਾਹਠ", + "68": "ਅੱਠਾਠ", + "69": "ਉਣੱਤਰ", + "70": "ਸੱਤਰ", + "71": "ਇਕ੍ਹੱਤਰ", + "72": "ਬਹੱਤਰ", + "73": "ਤਹੱਤਰ", + "74": "ਚੌਹੱਤਰ", + "75": "ਪੰਜੱਤਰ", + "76": "ਛਿਹੱਤਰ", + "77": "ਸਤੱਤਰ", + "78": "ਅਠੱਤਰ", + "79": "ਉਣਾਸੀ", + "80": "ਅੱਸੀ", + "81": "ਇਕਾਸੀ", + "82": "ਬਿਆਸੀ", + "83": "ਤਰਾਸੀ", + "84": "ਚਰਾਸੀ", + "85": "ਪੰਜਾਸੀ", + "86": "ਛਿਆਸੀ", + "87": "ਸਤਾਸੀ", + "88": "ਅਠਾਸੀ", + "89": "ਉਣਾਨਵੇਂ", + "90": "ਨੱਬੇ", + "91": "ਇਕਾਨਵੇਂ", + "92": "ਬਿਆਨਵੇਂ", + "93": "ਤਰਾਨਵੇਂ", + "94": "ਚਰਾਨਵੇਂ", + "95": "ਪਚਾਨਵੇਂ", + "96": "ਛਿਆਨਵੇਂ", + "97": "ਸਤਾਨਵੇਂ", + "98": "ਅਠਾਨਵੇਂ", + "99": "ਨਿੜਾਨਵੇਂ", + "100": "ਸੌ", + "1000": "ਹਜਾਰ", + "100000": "ਲੱਖ", + "10000000": "ਕਰੋੜ", + "1000000000": "ਅਰਬ", +} # Punjabi + +# --------------------------- num_to_word.py ------------------------------ +""" +Method to convert Numbers to Words +for indian languages + +Use cases:- +1) Speech recognition pre-processing +2) Language modeling Data pre-processing + +------------------------- +check indic_numbers.py to add support +for any indian language +""" + + +def language_specific_exception(words, lang, combiner): + """ + Language Specific Exception will come here + """ + + def occurs_at_end(piece): + return words[-len(piece) :] == piece + + if lang == "mr": + words = words.replace("एक" + combiner + "शे", "शंभर") + elif lang == "gu": + words = words.replace("બે" + combiner + "સો", "બસ્સો") + elif lang == "te": + exception_dict = { + "1": "ఒక", + "100": "వంద", + "100+": "వందలు", + "1000": "వెయ్యి", + "1000+": "వేలు", + "100000": "లక్ష", + "100000+": "లక్షలు", + "10000000": "కోటి", + "10000000+": "కోట్లు", + } + + test_case = ["100", "1000", "100000", "10000000"] + for test in test_case: + test_word = num_dict["te"][test] + match = num_dict["te"]["1"] + combiner + test_word + # for numbers like : 100, 1000, 100000 + if words == match: + return exception_dict[test] + # for numbers like : 200, 4000, 800000 + elif occurs_at_end(test_word): + words = 
words.replace(test_word, exception_dict[test + "+"]) + # for numbers like : 105, 1076, 123993 + elif not occurs_at_end(match): + replacement = exception_dict["1"] + combiner + exception_dict[test] + words = words.replace(match, replacement) + + # Exception case for 101...199 + special_case = "ఒక" + combiner + "వంద" + words = words.replace(special_case, "నూట") + elif lang == "kn": + # special case for 100 + if words == ("ಒಂದು" + combiner + "ನೂರ"): + return "ನೂರು" + exception_dict = { + "ನೂರ": "ನೂರು", + "ಸಾವಿರದ": "ಸಾವಿರ", + "ಲಕ್ಷದ": "ಲಕ್ಷ", + "ಕೋಟಿಯ": "ಕೋಟಿ", + } + for expt in exception_dict: + if occurs_at_end(expt): + words = words.replace(expt, exception_dict[expt]) + return words + + +def num_to_word(num, lang, separator=", ", combiner=" "): + """ + Main Method + :param num: Number digits from any indian language + :param lang: Language Code from supported Language + :param separator: Separator character i.e. separator = '-' --> 'two hundred-sixty' + :param combiner: combine number with position i.e. combiner = '-' --> 'two-hundred sixty' + :return: UTF-8 String of numbers in words + """ + lang = lang.lower() + num = str(num) + + # Load dictionary according to language code + assert lang in supported_lang, "Language not supported" + num_dic = num_dict[lang] + + # dash default combiner for english-india + if (lang == "en") & (combiner == " "): + combiner = "-" + + # Remove punctuations from numbers + num = str(num).replace(",", "").replace(" ", "") + + # Replace native language numbers with english digits + for language in supported_lang: + for num_index in range(10): + num = num.replace(all_num[language][num_index], all_num["en"][num_index]) + + # Assert that input contains only integer number + for digit in num: + assert digit in all_num["en"], "Give proper input" + + # Process + # For Number longer than 9 digits + def all_two_digit(digits_2): + if len(digits_2) <= 1: # Provided only one/zero digit + return num_dic.get(digits_2, "") + elif digits_2 == "00": # Two Zero provided + return num_dic["0"] + separator + num_dic["0"] + elif digits_2[0] == "0": # First digit is zero + return num_dic["0"] + separator + num_dic[digits_2[1]] + else: # Both digit provided + return num_dic[digits_2] + + # For Number less than 9 digits + def two_digit(digits_2): + digits_2 = digits_2.lstrip("0") + if len(digits_2) != 0: + return num_dic[digits_2] + else: + return "" + + def all_digit(digits): + digits = digits.lstrip("0") + digit_len = len(digits) + if digit_len > 3: + num_of_digits_to_process = (digit_len % 2) + 1 + process_digits = digits[:num_of_digits_to_process] + base = str(10 ** (int(digit_len / 2) * 2 - 1)) + remain_digits = digits[num_of_digits_to_process:] + return ( + num_dic[process_digits] + + combiner + + num_dic[base] + + separator + + all_digit(remain_digits) + ) + elif len(digits) == 3: + return ( + num_dic[digits[:1]] + + combiner + + num_dic["100"] + + separator + + two_digit(digits[1:]) + ) + else: + return two_digit(digits) + + num = num.lstrip("0") + full_digit_len = len(num) + + if full_digit_len == 0: + output = num_dic["0"] + elif full_digit_len <= 9: + output = all_digit(num) + else: + iteration = round(full_digit_len / 2) + output = all_two_digit(num[:2]) # First to digit + for i in range(1, iteration): + output = ( + output + separator + all_two_digit(num[i * 2 : (i + 1) * 2]) + ) # Next two digit pairs + remaining_digits = num[iteration * 2 :] + if not all_two_digit(remaining_digits) == "": + output = ( + output + separator + all_two_digit(remaining_digits) + ) # remaining 
Last one/two digits + + output = output.strip(separator) + + output = language_specific_exception(output, lang, combiner) + + return output + + +# --------------------------------- num_to_word_on_a_sent --------------------------------- + + +def is_digit(word, digit_pattern): + return re.search(digit_pattern, word) + + +def remove_punct(sent): + clean = re.sub("[%s]" % re.escape(string.punctuation), " ", sent) + return " ".join([word for word in clean.split() if word]) + + +def normalize_nums(text, lang): + """ + text: str (eg) + lang: lang code ['en', 'hi'] + + returns: str + (eg) + """ + + if lang in supported_lang: + words = text.split() + lang_digits = [str(i) for i in range(0, 10)] + + digit_pattern = "[" + "".join(lang_digits) + "]" + num_indices = [ + ind for ind, word in enumerate(words) if is_digit(word, digit_pattern) + ] + + words_up = [ + num_to_word(word, lang, separator=" ", combiner=" ") + if ind in num_indices + else word + for ind, word in enumerate(words) + ] + return " ".join(words_up) + else: + return text + + +if __name__ == "__main__": + print(normalize_nums("रीटा के पास 16 बिल्लियाँ हैं।", "hi")) diff --git a/ttsv/utils/inference/run_gradio.py b/ttsv/utils/inference/run_gradio.py new file mode 100644 index 0000000000000000000000000000000000000000..7e272ee26581d01f4534ba3f336399d7d061d7b6 --- /dev/null +++ b/ttsv/utils/inference/run_gradio.py @@ -0,0 +1,60 @@ +import gradio as gr +import argparse +import numpy as np +from argparse import Namespace +from .advanced_tts import load_all_models, run_tts_paragraph + + +def hit_tts(textbox, gender, slider_noise_scale, slider_length_sclae, choice_transliteration, choice_number_conversion, choice_split_sentences): + inputs_to_gradio = {'text' : textbox, + 'gender' : gender, + 'noise_scale': slider_noise_scale, + 'length_scale': slider_length_sclae, + 'transliteration' : 1 if choice_transliteration else 0, + 'number_conversion' : 1 if choice_number_conversion else 0, + 'split_sentences' : 1 if choice_split_sentences else 0 + } + + args = Namespace(**inputs_to_gradio) + args.wav = None + args.lang = lang + args.gender = gender + + if args.text: + sr, audio = run_tts_paragraph(args) + return (sr, audio) + +def build_gradio(args): + global lang + lang = args.lang + load_all_models(args) + textbox = gr.inputs.Textbox(placeholder="Enter Text to run", default="", label="Enter Input Text") + gender = gr.inputs.Radio(choices = ['Female', 'Male'], default='Female', label='Gender') + slider_noise_scale = gr.inputs.Slider(minimum=0, maximum=1.0, step=0.001, default=0.667, label='Noise Scale') + slider_length_sclae = gr.inputs.Slider(minimum=0, maximum=2.0, step=0.1, default=1.0, label='Length Scale') + + choice_transliteration = gr.inputs.Checkbox(default=True, label="Transliteration") + choice_number_conversion = gr.inputs.Checkbox(default=True, label="Number Conversion") + choice_split_sentences = gr.inputs.Checkbox(default=True, label="Split Sentences") + + examples = [['ভারত আমার দেশ এবং আমি একজন ভারতীয় হিসেবে গর্বিত।', 'Male', 0.667, 1, 0, 1, 1]] + + op = gr.outputs.Audio(type="numpy", label=None) + + inputs_to_gradio = [textbox, gender, slider_noise_scale, slider_length_sclae, choice_transliteration, choice_number_conversion, choice_split_sentences] + iface = gr.Interface(fn=hit_tts, examples = examples, inputs=inputs_to_gradio, outputs=op, theme='huggingface', title='Vakyansh Hindi TTS', article = 'Note: Transliteration models may not work well in some scenarios which can hamper the TTS quality, to evaluate the model in better 
sense it is advisable to provide input in the required language and switch off transliteration. Contact @harveenchadha on twitter for any issues.') + iface.launch(enable_queue=True) + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("-a", "--acoustic", required=True, type=str) + parser.add_argument("-v", "--vocoder", required=True, type=str) + parser.add_argument("-d", "--device", type=str, default="cpu") + parser.add_argument("-L", "--lang", type=str, required=True) + + global lang + + args = parser.parse_args() + lang = args.lang + + build_gradio(args) \ No newline at end of file diff --git a/ttsv/utils/inference/transliterate.py b/ttsv/utils/inference/transliterate.py new file mode 100644 index 0000000000000000000000000000000000000000..de1ccab4426659552a019b593c4766522efff616 --- /dev/null +++ b/ttsv/utils/inference/transliterate.py @@ -0,0 +1,919 @@ +import torch +import torch.nn as nn +import numpy as np +import pandas as pd +import random +import sys +import os +import json +import enum +import traceback +import re + +#F_DIR = os.path.dirname(os.path.realpath(__file__)) +F_DIR = '/home/user/app/ttsv/checkpoints/' + +class XlitError(enum.Enum): + lang_err = "Unsupported langauge ID requested ;( Please check available languages." + string_err = "String passed is incompatable ;(" + internal_err = "Internal crash ;(" + unknown_err = "Unknown Failure" + loading_err = "Loading failed ;( Check if metadata/paths are correctly configured." + + +##=================== Network ================================================== + + +class Encoder(nn.Module): + def __init__( + self, + input_dim, + embed_dim, + hidden_dim, + rnn_type="gru", + layers=1, + bidirectional=False, + dropout=0, + device="cpu", + ): + super(Encoder, self).__init__() + + self.input_dim = input_dim # src_vocab_sz + self.enc_embed_dim = embed_dim + self.enc_hidden_dim = hidden_dim + self.enc_rnn_type = rnn_type + self.enc_layers = layers + self.enc_directions = 2 if bidirectional else 1 + self.device = device + + self.embedding = nn.Embedding(self.input_dim, self.enc_embed_dim) + + if self.enc_rnn_type == "gru": + self.enc_rnn = nn.GRU( + input_size=self.enc_embed_dim, + hidden_size=self.enc_hidden_dim, + num_layers=self.enc_layers, + bidirectional=bidirectional, + ) + elif self.enc_rnn_type == "lstm": + self.enc_rnn = nn.LSTM( + input_size=self.enc_embed_dim, + hidden_size=self.enc_hidden_dim, + num_layers=self.enc_layers, + bidirectional=bidirectional, + ) + else: + raise Exception("XlitError: unknown RNN type mentioned") + + def forward(self, x, x_sz, hidden=None): + """ + x_sz: (batch_size, 1) - Unpadded sequence lengths used for pack_pad + """ + batch_sz = x.shape[0] + # x: batch_size, max_length, enc_embed_dim + x = self.embedding(x) + + ## pack the padded data + # x: max_length, batch_size, enc_embed_dim -> for pack_pad + x = x.permute(1, 0, 2) + x = nn.utils.rnn.pack_padded_sequence(x, x_sz, enforce_sorted=False) # unpad + + # output: packed_size, batch_size, enc_embed_dim + # hidden: n_layer**num_directions, batch_size, hidden_dim | if LSTM (h_n, c_n) + output, hidden = self.enc_rnn( + x + ) # gru returns hidden state of all timesteps as well as hidden state at last timestep + + ## pad the sequence to the max length in the batch + # output: max_length, batch_size, enc_emb_dim*directions) + output, _ = nn.utils.rnn.pad_packed_sequence(output) + + # output: batch_size, max_length, hidden_dim + output = output.permute(1, 0, 2) + + return output, hidden + + def 
get_word_embedding(self, x): + """ """ + x_sz = torch.tensor([len(x)]) + x_ = torch.tensor(x).unsqueeze(0).to(dtype=torch.long) + # x: 1, max_length, enc_embed_dim + x = self.embedding(x_) + + ## pack the padded data + # x: max_length, 1, enc_embed_dim -> for pack_pad + x = x.permute(1, 0, 2) + x = nn.utils.rnn.pack_padded_sequence(x, x_sz, enforce_sorted=False) # unpad + + # output: packed_size, 1, enc_embed_dim + # hidden: n_layer**num_directions, 1, hidden_dim | if LSTM (h_n, c_n) + output, hidden = self.enc_rnn( + x + ) # gru returns hidden state of all timesteps as well as hidden state at last timestep + + out_embed = hidden[0].squeeze() + + return out_embed + + +class Decoder(nn.Module): + def __init__( + self, + output_dim, + embed_dim, + hidden_dim, + rnn_type="gru", + layers=1, + use_attention=True, + enc_outstate_dim=None, # enc_directions * enc_hidden_dim + dropout=0, + device="cpu", + ): + super(Decoder, self).__init__() + + self.output_dim = output_dim # tgt_vocab_sz + self.dec_hidden_dim = hidden_dim + self.dec_embed_dim = embed_dim + self.dec_rnn_type = rnn_type + self.dec_layers = layers + self.use_attention = use_attention + self.device = device + if self.use_attention: + self.enc_outstate_dim = enc_outstate_dim if enc_outstate_dim else hidden_dim + else: + self.enc_outstate_dim = 0 + + self.embedding = nn.Embedding(self.output_dim, self.dec_embed_dim) + + if self.dec_rnn_type == "gru": + self.dec_rnn = nn.GRU( + input_size=self.dec_embed_dim + + self.enc_outstate_dim, # to concat attention_output + hidden_size=self.dec_hidden_dim, # previous Hidden + num_layers=self.dec_layers, + batch_first=True, + ) + elif self.dec_rnn_type == "lstm": + self.dec_rnn = nn.LSTM( + input_size=self.dec_embed_dim + + self.enc_outstate_dim, # to concat attention_output + hidden_size=self.dec_hidden_dim, # previous Hidden + num_layers=self.dec_layers, + batch_first=True, + ) + else: + raise Exception("XlitError: unknown RNN type mentioned") + + self.fc = nn.Sequential( + nn.Linear(self.dec_hidden_dim, self.dec_embed_dim), + nn.LeakyReLU(), + # nn.Linear(self.dec_embed_dim, self.dec_embed_dim), nn.LeakyReLU(), # removing to reduce size + nn.Linear(self.dec_embed_dim, self.output_dim), + ) + + ##----- Attention ---------- + if self.use_attention: + self.W1 = nn.Linear(self.enc_outstate_dim, self.dec_hidden_dim) + self.W2 = nn.Linear(self.dec_hidden_dim, self.dec_hidden_dim) + self.V = nn.Linear(self.dec_hidden_dim, 1) + + def attention(self, x, hidden, enc_output): + """ + x: (batch_size, 1, dec_embed_dim) -> after Embedding + enc_output: batch_size, max_length, enc_hidden_dim *num_directions + hidden: n_layers, batch_size, hidden_size | if LSTM (h_n, c_n) + """ + + ## perform addition to calculate the score + + # hidden_with_time_axis: batch_size, 1, hidden_dim + ## hidden_with_time_axis = hidden.permute(1, 0, 2) ## replaced with below 2lines + hidden_with_time_axis = ( + torch.sum(hidden, axis=0) + if self.dec_rnn_type != "lstm" + else torch.sum(hidden[0], axis=0) + ) # h_n + + hidden_with_time_axis = hidden_with_time_axis.unsqueeze(1) + + # score: batch_size, max_length, hidden_dim + score = torch.tanh(self.W1(enc_output) + self.W2(hidden_with_time_axis)) + + # attention_weights: batch_size, max_length, 1 + # we get 1 at the last axis because we are applying score to self.V + attention_weights = torch.softmax(self.V(score), dim=1) + + # context_vector shape after sum == (batch_size, hidden_dim) + context_vector = attention_weights * enc_output + context_vector = torch.sum(context_vector, 
dim=1) + # context_vector: batch_size, 1, hidden_dim + context_vector = context_vector.unsqueeze(1) + + # attend_out (batch_size, 1, dec_embed_dim + hidden_size) + attend_out = torch.cat((context_vector, x), -1) + + return attend_out, attention_weights + + def forward(self, x, hidden, enc_output): + """ + x: (batch_size, 1) + enc_output: batch_size, max_length, dec_embed_dim + hidden: n_layer, batch_size, hidden_size | lstm: (h_n, c_n) + """ + if (hidden is None) and (self.use_attention is False): + raise Exception( + "XlitError: No use of a decoder with No attention and No Hidden" + ) + + batch_sz = x.shape[0] + + if hidden is None: + # hidden: n_layers, batch_size, hidden_dim + hid_for_att = torch.zeros( + (self.dec_layers, batch_sz, self.dec_hidden_dim) + ).to(self.device) + elif self.dec_rnn_type == "lstm": + hid_for_att = hidden[1] # c_n + + # x (batch_size, 1, dec_embed_dim) -> after embedding + x = self.embedding(x) + + if self.use_attention: + # x (batch_size, 1, dec_embed_dim + hidden_size) -> after attention + # aw: (batch_size, max_length, 1) + x, aw = self.attention(x, hidden, enc_output) + else: + x, aw = x, 0 + + # passing the concatenated vector to the GRU + # output: (batch_size, n_layers, hidden_size) + # hidden: n_layers, batch_size, hidden_size | if LSTM (h_n, c_n) + output, hidden = ( + self.dec_rnn(x, hidden) if hidden is not None else self.dec_rnn(x) + ) + + # output :shp: (batch_size * 1, hidden_size) + output = output.view(-1, output.size(2)) + + # output :shp: (batch_size * 1, output_dim) + output = self.fc(output) + + return output, hidden, aw + + +class Seq2Seq(nn.Module): + """ + Class dependency: Encoder, Decoder + """ + + def __init__( + self, encoder, decoder, pass_enc2dec_hid=False, dropout=0, device="cpu" + ): + super(Seq2Seq, self).__init__() + + self.encoder = encoder + self.decoder = decoder + self.device = device + self.pass_enc2dec_hid = pass_enc2dec_hid + _force_en2dec_hid_conv = False + + if self.pass_enc2dec_hid: + assert ( + decoder.dec_hidden_dim == encoder.enc_hidden_dim + ), "Hidden Dimension of encoder and decoder must be same, or unset `pass_enc2dec_hid`" + if decoder.use_attention: + assert ( + decoder.enc_outstate_dim + == encoder.enc_directions * encoder.enc_hidden_dim + ), "Set `enc_out_dim` correctly in decoder" + assert ( + self.pass_enc2dec_hid or decoder.use_attention + ), "No use of a decoder with No attention and No Hidden from Encoder" + + self.use_conv_4_enc2dec_hid = False + if ( + self.pass_enc2dec_hid + and (encoder.enc_directions * encoder.enc_layers != decoder.dec_layers) + ) or _force_en2dec_hid_conv: + if encoder.enc_rnn_type == "lstm" or encoder.enc_rnn_type == "lstm": + raise Exception( + "XlitError: conv for enc2dec_hid not implemented; Change the layer numbers appropriately" + ) + + self.use_conv_4_enc2dec_hid = True + self.enc_hid_1ax = encoder.enc_directions * encoder.enc_layers + self.dec_hid_1ax = decoder.dec_layers + self.e2d_hidden_conv = nn.Conv1d(self.enc_hid_1ax, self.dec_hid_1ax, 1) + + def enc2dec_hidden(self, enc_hidden): + """ + enc_hidden: n_layer, batch_size, hidden_dim*num_directions + TODO: Implement the logic for LSTm bsed model + """ + # hidden: batch_size, enc_layer*num_directions, enc_hidden_dim + hidden = enc_hidden.permute(1, 0, 2).contiguous() + # hidden: batch_size, dec_layers, dec_hidden_dim -> [N,C,Tstep] + hidden = self.e2d_hidden_conv(hidden) + + # hidden: dec_layers, batch_size , dec_hidden_dim + hidden_for_dec = hidden.permute(1, 0, 2).contiguous() + + return hidden_for_dec + + def 
active_beam_inference(self, src, beam_width=3, max_tgt_sz=50): + """Search based decoding + src: (sequence_len) + """ + + def _avg_score(p_tup): + """Used for Sorting + TODO: Dividing by length of sequence power alpha as hyperparam + """ + return p_tup[0] + + import sys + + batch_size = 1 + start_tok = src[0] + end_tok = src[-1] + src_sz = torch.tensor([len(src)]) + src_ = src.unsqueeze(0) + + # enc_output: (batch_size, padded_seq_length, enc_hidden_dim*num_direction) + # enc_hidden: (enc_layers*num_direction, batch_size, hidden_dim) + enc_output, enc_hidden = self.encoder(src_, src_sz) + + if self.pass_enc2dec_hid: + # dec_hidden: dec_layers, batch_size , dec_hidden_dim + if self.use_conv_4_enc2dec_hid: + init_dec_hidden = self.enc2dec_hidden(enc_hidden) + else: + init_dec_hidden = enc_hidden + else: + # dec_hidden -> Will be initialized to zeros internally + init_dec_hidden = None + + # top_pred[][0] = Σ-log_softmax + # top_pred[][1] = sequence torch.tensor shape: (1) + # top_pred[][2] = dec_hidden + top_pred_list = [(0, start_tok.unsqueeze(0), init_dec_hidden)] + + for t in range(max_tgt_sz): + cur_pred_list = [] + + for p_tup in top_pred_list: + if p_tup[1][-1] == end_tok: + cur_pred_list.append(p_tup) + continue + + # dec_hidden: dec_layers, 1, hidden_dim + # dec_output: 1, output_dim + dec_output, dec_hidden, _ = self.decoder( + x=p_tup[1][-1].view(1, 1), # dec_input: (1,1) + hidden=p_tup[2], + enc_output=enc_output, + ) + + ## π{prob} = Σ{log(prob)} -> to prevent diminishing + # dec_output: (1, output_dim) + dec_output = nn.functional.log_softmax(dec_output, dim=1) + # pred_topk.values & pred_topk.indices: (1, beam_width) + pred_topk = torch.topk(dec_output, k=beam_width, dim=1) + + for i in range(beam_width): + sig_logsmx_ = p_tup[0] + pred_topk.values[0][i] + # seq_tensor_ : (seq_len) + seq_tensor_ = torch.cat((p_tup[1], pred_topk.indices[0][i].view(1))) + + cur_pred_list.append((sig_logsmx_, seq_tensor_, dec_hidden)) + + cur_pred_list.sort(key=_avg_score, reverse=True) # Maximized order + top_pred_list = cur_pred_list[:beam_width] + + # check if end_tok of all topk + end_flags_ = [1 if t[1][-1] == end_tok else 0 for t in top_pred_list] + if beam_width == sum(end_flags_): + break + + pred_tnsr_list = [t[1] for t in top_pred_list] + + return pred_tnsr_list + + +##===================== Glyph handlers ======================================= + + +class GlyphStrawboss: + def __init__(self, glyphs="en"): + """list of letters in a language in unicode + lang: ISO Language code + glyphs: json file with script information + """ + if glyphs == "en": + # Smallcase alone + self.glyphs = [chr(alpha) for alpha in range(97, 122 + 1)] + else: + self.dossier = json.load(open(glyphs, encoding="utf-8")) + self.glyphs = self.dossier["glyphs"] + self.numsym_map = self.dossier["numsym_map"] + + self.char2idx = {} + self.idx2char = {} + self._create_index() + + def _create_index(self): + + self.char2idx["_"] = 0 # pad + self.char2idx["$"] = 1 # start + self.char2idx["#"] = 2 # end + self.char2idx["*"] = 3 # Mask + self.char2idx["'"] = 4 # apostrophe U+0027 + self.char2idx["%"] = 5 # unused + self.char2idx["!"] = 6 # unused + + # letter to index mapping + for idx, char in enumerate(self.glyphs): + self.char2idx[char] = idx + 7 # +7 token initially + + # index to letter mapping + for char, idx in self.char2idx.items(): + self.idx2char[idx] = char + + def size(self): + return len(self.char2idx) + + def word2xlitvec(self, word): + """Converts given string of gyphs(word) to vector(numpy) + Also adds tokens 
for start and end + """ + try: + vec = [self.char2idx["$"]] # start token + for i in list(word): + vec.append(self.char2idx[i]) + vec.append(self.char2idx["#"]) # end token + + vec = np.asarray(vec, dtype=np.int64) + return vec + + except Exception as error: + print("XlitError: In word:", word, "Error Char not in Token:", error) + sys.exit() + + def xlitvec2word(self, vector): + """Converts vector(numpy) to string of glyphs(word)""" + char_list = [] + for i in vector: + char_list.append(self.idx2char[i]) + + word = "".join(char_list).replace("$", "").replace("#", "") # remove tokens + word = word.replace("_", "").replace("*", "") # remove tokens + return word + + +class VocabSanitizer: + def __init__(self, data_file): + """ + data_file: path to file conatining vocabulary list + """ + extension = os.path.splitext(data_file)[-1] + if extension == ".json": + self.vocab_set = set(json.load(open(data_file, encoding="utf-8"))) + elif extension == ".csv": + self.vocab_df = pd.read_csv(data_file).set_index("WORD") + self.vocab_set = set(self.vocab_df.index) + else: + print("XlitError: Only Json/CSV file extension supported") + + def reposition(self, word_list): + """Reorder Words in list""" + new_list = [] + temp_ = word_list.copy() + for v in word_list: + if v in self.vocab_set: + new_list.append(v) + temp_.remove(v) + new_list.extend(temp_) + + return new_list + + +##=============== INSTANTIATION ================================================ + + +class XlitPiston: + """ + For handling prediction & post-processing of transliteration for a single language + Class dependency: Seq2Seq, GlyphStrawboss, VocabSanitizer + Global Variables: F_DIR + """ + + def __init__( + self, + weight_path, + vocab_file, + tglyph_cfg_file, + iglyph_cfg_file="en", + device="cpu", + ): + + self.device = device + self.in_glyph_obj = GlyphStrawboss(iglyph_cfg_file) + self.tgt_glyph_obj = GlyphStrawboss(glyphs=tglyph_cfg_file) + self.voc_sanity = VocabSanitizer(vocab_file) + + self._numsym_set = set( + json.load(open(tglyph_cfg_file, encoding="utf-8"))["numsym_map"].keys() + ) + self._inchar_set = set("abcdefghijklmnopqrstuvwxyz") + self._natscr_set = set().union( + self.tgt_glyph_obj.glyphs, sum(self.tgt_glyph_obj.numsym_map.values(), []) + ) + + ## Model Config Static TODO: add defining in json support + input_dim = self.in_glyph_obj.size() + output_dim = self.tgt_glyph_obj.size() + enc_emb_dim = 300 + dec_emb_dim = 300 + enc_hidden_dim = 512 + dec_hidden_dim = 512 + rnn_type = "lstm" + enc2dec_hid = True + attention = True + enc_layers = 1 + dec_layers = 2 + m_dropout = 0 + enc_bidirect = True + enc_outstate_dim = enc_hidden_dim * (2 if enc_bidirect else 1) + + enc = Encoder( + input_dim=input_dim, + embed_dim=enc_emb_dim, + hidden_dim=enc_hidden_dim, + rnn_type=rnn_type, + layers=enc_layers, + dropout=m_dropout, + device=self.device, + bidirectional=enc_bidirect, + ) + dec = Decoder( + output_dim=output_dim, + embed_dim=dec_emb_dim, + hidden_dim=dec_hidden_dim, + rnn_type=rnn_type, + layers=dec_layers, + dropout=m_dropout, + use_attention=attention, + enc_outstate_dim=enc_outstate_dim, + device=self.device, + ) + self.model = Seq2Seq(enc, dec, pass_enc2dec_hid=enc2dec_hid, device=self.device) + self.model = self.model.to(self.device) + weights = torch.load(weight_path, map_location=torch.device(self.device)) + + self.model.load_state_dict(weights) + self.model.eval() + + def character_model(self, word, beam_width=1): + in_vec = torch.from_numpy(self.in_glyph_obj.word2xlitvec(word)).to(self.device) + ## change to 
active or passive beam + p_out_list = self.model.active_beam_inference(in_vec, beam_width=beam_width) + p_result = [ + self.tgt_glyph_obj.xlitvec2word(out.cpu().numpy()) for out in p_out_list + ] + + result = self.voc_sanity.reposition(p_result) + + # List type + return result + + def numsym_model(self, seg): + """tgt_glyph_obj.numsym_map[x] returns a list object""" + if len(seg) == 1: + return [seg] + self.tgt_glyph_obj.numsym_map[seg] + + a = [self.tgt_glyph_obj.numsym_map[n][0] for n in seg] + return [seg] + ["".join(a)] + + def _word_segementer(self, sequence): + + sequence = sequence.lower() + accepted = set().union(self._numsym_set, self._inchar_set, self._natscr_set) + # sequence = ''.join([i for i in sequence if i in accepted]) + + segment = [] + idx = 0 + seq_ = list(sequence) + while len(seq_): + # for Number-Symbol + temp = "" + while len(seq_) and seq_[0] in self._numsym_set: + temp += seq_[0] + seq_.pop(0) + if temp != "": + segment.append(temp) + + # for Target Chars + temp = "" + while len(seq_) and seq_[0] in self._natscr_set: + temp += seq_[0] + seq_.pop(0) + if temp != "": + segment.append(temp) + + # for Input-Roman Chars + temp = "" + while len(seq_) and seq_[0] in self._inchar_set: + temp += seq_[0] + seq_.pop(0) + if temp != "": + segment.append(temp) + + temp = "" + while len(seq_) and seq_[0] not in accepted: + temp += seq_[0] + seq_.pop(0) + if temp != "": + segment.append(temp) + + return segment + + def inferencer(self, sequence, beam_width=10): + + seg = self._word_segementer(sequence[:120]) + lit_seg = [] + + p = 0 + while p < len(seg): + if seg[p][0] in self._natscr_set: + lit_seg.append([seg[p]]) + p += 1 + + elif seg[p][0] in self._inchar_set: + lit_seg.append(self.character_model(seg[p], beam_width=beam_width)) + p += 1 + + elif seg[p][0] in self._numsym_set: # num & punc + lit_seg.append(self.numsym_model(seg[p])) + p += 1 + else: + lit_seg.append([seg[p]]) + p += 1 + + ## IF segment less/equal to 2 then return combinotorial, + ## ELSE only return top1 of each result concatenated + if len(lit_seg) == 1: + final_result = lit_seg[0] + + elif len(lit_seg) == 2: + final_result = [""] + for seg in lit_seg: + new_result = [] + for s in seg: + for f in final_result: + new_result.append(f + s) + final_result = new_result + + else: + new_result = [] + for seg in lit_seg: + new_result.append(seg[0]) + final_result = ["".join(new_result)] + + return final_result + + +from collections.abc import Iterable +from pydload import dload +import zipfile + +MODEL_DOWNLOAD_URL_PREFIX = "https://github.com/AI4Bharat/IndianNLP-Transliteration/releases/download/xlit_v0.5.0/" + + +def is_folder_writable(folder): + try: + os.makedirs(folder, exist_ok=True) + tmp_file = os.path.join(folder, ".write_test") + with open(tmp_file, "w") as f: + f.write("Permission Check") + os.remove(tmp_file) + return True + except: + return False + + +def is_directory_writable(path): + if os.name == "nt": + return is_folder_writable(path) + return os.access(path, os.W_OK | os.X_OK) + + +class XlitEngine: + """ + For Managing the top level tasks and applications of transliteration + Global Variables: F_DIR + """ + + def __init__( + self, lang2use="all", config_path="translit_models/default_lineup.json" + ): + + lineup = json.load(open(os.path.join(F_DIR, config_path), encoding="utf-8")) + self.lang_config = {} + if isinstance(lang2use, str): + if lang2use == "all": + self.lang_config = lineup + elif lang2use in lineup: + self.lang_config[lang2use] = lineup[lang2use] + else: + raise Exception( + 
"XlitError: The entered Langauge code not found. Available are {}".format( + lineup.keys() + ) + ) + + elif isinstance(lang2use, Iterable): + for l in lang2use: + try: + self.lang_config[l] = lineup[l] + except: + print( + "XlitError: Language code {} not found, Skipping...".format(l) + ) + else: + raise Exception( + "XlitError: lang2use must be a list of language codes (or) string of single language code" + ) + + if is_directory_writable(F_DIR): + models_path = os.path.join(F_DIR, "translit_models") + else: + user_home = os.path.expanduser("~") + models_path = os.path.join(user_home, ".AI4Bharat_Xlit_Models") + os.makedirs(models_path, exist_ok=True) + self.download_models(models_path) + + self.langs = {} + self.lang_model = {} + for la in self.lang_config: + try: + print("Loading {}...".format(la)) + self.lang_model[la] = XlitPiston( + weight_path=os.path.join( + models_path, self.lang_config[la]["weight"] + ), + vocab_file=os.path.join(models_path, self.lang_config[la]["vocab"]), + tglyph_cfg_file=os.path.join( + models_path, self.lang_config[la]["script"] + ), + iglyph_cfg_file="en", + ) + self.langs[la] = self.lang_config[la]["name"] + except Exception as error: + print("XlitError: Failure in loading {} \n".format(la), error) + print(XlitError.loading_err.value) + + def download_models(self, models_path): + """ + Download models from GitHub Releases if not exists + """ + for l in self.lang_config: + lang_name = self.lang_config[l]["eng_name"] + lang_model_path = os.path.join(models_path, lang_name) + if not os.path.isdir(lang_model_path): + print("Downloading model for language: %s" % lang_name) + remote_url = MODEL_DOWNLOAD_URL_PREFIX + lang_name + ".zip" + downloaded_zip_path = os.path.join(models_path, lang_name + ".zip") + dload(url=remote_url, save_to_path=downloaded_zip_path, max_time=None) + + if not os.path.isfile(downloaded_zip_path): + exit( + f"ERROR: Unable to download model from {remote_url} into {models_path}" + ) + + with zipfile.ZipFile(downloaded_zip_path, "r") as zip_ref: + zip_ref.extractall(models_path) + + if os.path.isdir(lang_model_path): + os.remove(downloaded_zip_path) + else: + exit( + f"ERROR: Unable to find models in {lang_model_path} after download" + ) + return + + def translit_word(self, eng_word, lang_code="default", topk=7, beam_width=10): + if eng_word == "": + return [] + + if lang_code in self.langs: + try: + res_list = self.lang_model[lang_code].inferencer( + eng_word, beam_width=beam_width + ) + return res_list[:topk] + + except Exception as error: + print("XlitError:", traceback.format_exc()) + print(XlitError.internal_err.value) + return XlitError.internal_err + + elif lang_code == "default": + try: + res_dict = {} + for la in self.lang_model: + res = self.lang_model[la].inferencer( + eng_word, beam_width=beam_width + ) + res_dict[la] = res[:topk] + return res_dict + + except Exception as error: + print("XlitError:", traceback.format_exc()) + print(XlitError.internal_err.value) + return XlitError.internal_err + + else: + print("XlitError: Unknown Langauge requested", lang_code) + print(XlitError.lang_err.value) + return XlitError.lang_err + + def translit_sentence(self, eng_sentence, lang_code="default", beam_width=10): + if eng_sentence == "": + return [] + + if lang_code in self.langs: + try: + out_str = "" + for word in eng_sentence.split(): + res_ = self.lang_model[lang_code].inferencer( + word, beam_width=beam_width + ) + out_str = out_str + res_[0] + " " + return out_str[:-1] + + except Exception as error: + print("XlitError:", 
traceback.format_exc()) + print(XlitError.internal_err.value) + return XlitError.internal_err + + elif lang_code == "default": + try: + res_dict = {} + for la in self.lang_model: + out_str = "" + for word in eng_sentence.split(): + res_ = self.lang_model[la].inferencer( + word, beam_width=beam_width + ) + out_str = out_str + res_[0] + " " + res_dict[la] = out_str[:-1] + return res_dict + + except Exception as error: + print("XlitError:", traceback.format_exc()) + print(XlitError.internal_err.value) + return XlitError.internal_err + + else: + print("XlitError: Unknown Langauge requested", lang_code) + print(XlitError.lang_err.value) + return XlitError.lang_err + + +if __name__ == "__main__": + + available_lang = [ + "bn", + "gu", + "hi", + "kn", + "gom", + "mai", + "ml", + "mr", + "pa", + "sd", + "si", + "ta", + "te", + "ur", + ] + + reg = re.compile(r"[a-zA-Z]") + lang = "hi" + engine = XlitEngine( + lang + ) # if you don't specify lang code here, this will give results in all langs available + sent = "Hello World! ABCD क्या हाल है आपका?" + words = [ + engine.translit_word(word, topk=1)[lang][0] if reg.match(word) else word + for word in sent.split() + ] # only transliterated en words, leaves rest as it is + updated_sent = " ".join(words) + + print(updated_sent) + + # output : हेलो वर्ल्ड! क्या हाल है आपका? + + # y = engine.translit_sentence("Hello World !")['hi'] + # print(y) diff --git a/ttsv/utils/inference/tts.py b/ttsv/utils/inference/tts.py new file mode 100644 index 0000000000000000000000000000000000000000..dc485ec44dbf34ddbb69c15ad524c0fab189c3c5 --- /dev/null +++ b/ttsv/utils/inference/tts.py @@ -0,0 +1,167 @@ +from __future__ import absolute_import, division, print_function, unicode_literals +from typing import Tuple +import sys +from argparse import ArgumentParser + +import torch +import numpy as np +import os +import json +import torch + +sys.path.append(os.path.join(os.path.dirname(__file__), "../../src/glow_tts")) + +from scipy.io.wavfile import write +from hifi.env import AttrDict +from hifi.models import Generator + + +from text import text_to_sequence +import commons +import models +import utils + + +def check_directory(dir): + if not os.path.exists(dir): + sys.exit("Error: {} directory does not exist".format(dir)) + + +class TextToMel: + def __init__(self, glow_model_dir, device="cuda"): + self.glow_model_dir = glow_model_dir + check_directory(self.glow_model_dir) + self.device = device + self.hps, self.glow_tts_model = self.load_glow_tts() + + def load_glow_tts(self): + hps = utils.get_hparams_from_dir(self.glow_model_dir) + checkpoint_path = utils.latest_checkpoint_path(self.glow_model_dir) + symbols = list(hps.data.punc) + list(hps.data.chars) + glow_tts_model = models.FlowGenerator( + len(symbols) + getattr(hps.data, "add_blank", False), + out_channels=hps.data.n_mel_channels, + **hps.model + ) # .to(self.device) + + if self.device == "cuda": + glow_tts_model.to("cuda") + + utils.load_checkpoint(checkpoint_path, glow_tts_model) + glow_tts_model.decoder.store_inverse() + _ = glow_tts_model.eval() + + return hps, glow_tts_model + + def generate_mel(self, text, noise_scale=0.667, length_scale=1.0): + print(f"Noise scale: {noise_scale} and Length scale: {length_scale}") + symbols = list(self.hps.data.punc) + list(self.hps.data.chars) + cleaner = self.hps.data.text_cleaners + if getattr(self.hps.data, "add_blank", False): + text_norm = text_to_sequence(text, symbols, cleaner) + text_norm = commons.intersperse(text_norm, len(symbols)) + else: # If not using "add_blank" 
option during training, adding spaces at the beginning and the end of utterance improves quality + text = " " + text.strip() + " " + text_norm = text_to_sequence(text, symbols, cleaner) + + sequence = np.array(text_norm)[None, :] + + del symbols + del cleaner + del text + del text_norm + + if self.device == "cuda": + x_tst = torch.autograd.Variable(torch.from_numpy(sequence)).cuda().long() + x_tst_lengths = torch.tensor([x_tst.shape[1]]).cuda() + else: + x_tst = torch.autograd.Variable(torch.from_numpy(sequence)).long() + x_tst_lengths = torch.tensor([x_tst.shape[1]]) + + with torch.no_grad(): + (y_gen_tst, *_), *_, (attn_gen, *_) = self.glow_tts_model( + x_tst, + x_tst_lengths, + gen=True, + noise_scale=noise_scale, + length_scale=length_scale, + ) + del x_tst + del x_tst_lengths + torch.cuda.empty_cache() + return y_gen_tst.cpu().detach().numpy() + + +class MelToWav: + def __init__(self, hifi_model_dir, device="cuda"): + self.hifi_model_dir = hifi_model_dir + check_directory(self.hifi_model_dir) + self.device = device + self.h, self.hifi_gan_generator = self.load_hifi_gan() + + def load_hifi_gan(self): + checkpoint_path = utils.latest_checkpoint_path(self.hifi_model_dir, regex="g_*") + config_file = os.path.join(self.hifi_model_dir, "config.json") + data = open(config_file).read() + json_config = json.loads(data) + h = AttrDict(json_config) + torch.manual_seed(h.seed) + + generator = Generator(h).to(self.device) + + assert os.path.isfile(checkpoint_path) + print("Loading '{}'".format(checkpoint_path)) + state_dict_g = torch.load(checkpoint_path, map_location=self.device) + print("Complete.") + + generator.load_state_dict(state_dict_g["generator"]) + + generator.eval() + generator.remove_weight_norm() + + return h, generator + + def generate_wav(self, mel): + mel = torch.FloatTensor(mel).to(self.device) + + y_g_hat = self.hifi_gan_generator(mel) # passing through vocoder + audio = y_g_hat.squeeze() + audio = audio * 32768.0 + audio = audio.cpu().detach().numpy().astype("int16") + + del y_g_hat + del mel + torch.cuda.empty_cache() + return audio, self.h.sampling_rate + +def restricted_float(x): + try: + x = float(x) + except ValueError: + raise argparse.ArgumentTypeError("%r not a floating-point literal" % (x,)) + + if x < 0.0 or x > 1.0: + raise argparse.ArgumentTypeError("%r not in range [0.0, 1.0]"%(x,)) + return x + + +if __name__ == "__main__": + parser = ArgumentParser() + parser.add_argument("-a", "--acoustic", required=True, type=str) + parser.add_argument("-v", "--vocoder", required=True, type=str) + parser.add_argument("-d", "--device", type=str, default="cpu") + parser.add_argument("-t", "--text", type=str, required=True) + parser.add_argument("-w", "--wav", type=str, required=True) + parser.add_argument("-n", "--noise-scale", default=0.667, type=restricted_float ) + parser.add_argument("-l", "--length-scale", default=1.0, type=float) + + args = parser.parse_args() + + text_to_mel = TextToMel(glow_model_dir=args.acoustic, device=args.device) + mel_to_wav = MelToWav(hifi_model_dir=args.vocoder, device=args.device) + + mel = text_to_mel.generate_mel(args.text, args.noise_scale, args.length_scale) + audio, sr = mel_to_wav.generate_wav(mel) + + write(filename=args.wav, rate=sr, data=audio) +
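
The `tts.py` module above splits inference into two stages: `TextToMel` wraps the Glow-TTS acoustic model and `MelToWav` wraps the HiFi-GAN vocoder, with the `__main__` block wiring them together from the command line. As a quick illustration, here is a minimal programmatic sketch of that same pipeline; the checkpoint directories, the import path, and the output filename are placeholders and assumptions, not values defined by this repository.

```python
# Minimal usage sketch of the two-stage pipeline defined in ttsv/utils/inference/tts.py.
# Assumptions: the module is importable as `tts` (e.g. run from its directory), and the
# two placeholder directories contain Glow-TTS / HiFi-GAN checkpoints plus their configs.
from scipy.io.wavfile import write

from tts import TextToMel, MelToWav  # import path is an assumption

glow_dir = "/path/to/glow_checkpoint_dir"  # placeholder
hifi_dir = "/path/to/hifi_checkpoint_dir"  # placeholder

# Build the acoustic model and the vocoder on CPU.
text_to_mel = TextToMel(glow_model_dir=glow_dir, device="cpu")
mel_to_wav = MelToWav(hifi_model_dir=hifi_dir, device="cpu")

# Text -> mel-spectrogram -> int16 waveform (text must be in the model's language).
text = "ভারত আমার দেশ এবং আমি একজন ভারতীয় হিসেবে গর্বিত।"
mel = text_to_mel.generate_mel(text, noise_scale=0.667, length_scale=1.0)
audio, sr = mel_to_wav.generate_wav(mel)

write("sample.wav", sr, audio)
```

The same two classes are what `run_gradio.py` drives (via `advanced_tts`) behind the web UI, so this sketch mirrors what happens on every request, minus the transliteration and number-normalization preprocessing shown earlier in `transliterate.py` and the `num_to_word` utilities.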