File size: 5,458 Bytes
c8a0bb3 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 |
#!/usr/bin/env python3
import hashlib
import json
import unittest
from pathlib import Path
_DIR = Path(__file__).parent
_REPO_DIR = _DIR.parent
class VoiceTest(unittest.TestCase):
def test_voices(self):
for onnx_path in _REPO_DIR.rglob("*.onnx"):
with self.subTest(onnx_path=onnx_path):
self.assertGreater(onnx_path.stat().st_size, 0, "Empty onnx file")
# Load JSON config for voice
config_path = onnx_path.parent / f"{onnx_path.name}.json"
with open(config_path, "r", encoding="utf-8") as config_file:
config = json.load(config_file)
# Verify config
self.assertIn(
"piper_version", config, "Missing piper_version in config"
)
self.assertIn("language", config, "Missing language in config")
self.assertIn("dataset", config, "Missing dataset in config")
self.assertIn(
"quality", config["audio"], "Missing audio quality in config"
)
# Verify directory structure
# <lang_family>/<lang_code>/<dataset>/<quality>/
quality_dir = onnx_path.parent
dataset_dir = quality_dir.parent
lang_code_dir = dataset_dir.parent
lang_family_dir = lang_code_dir.parent
self.assertEqual(
lang_family_dir.name,
config["language"]["family"],
"Wrong lang family dir",
)
self.assertEqual(
lang_code_dir.name,
config["language"]["code"],
"Wrong lang code dir",
)
self.assertEqual(
dataset_dir.name, config["dataset"], "Wrong dataset dir"
)
self.assertEqual(
quality_dir.name, config["audio"]["quality"], "Wrong quality dir"
)
# Verify file names
file_lang_code, file_dataset, file_quality = onnx_path.stem.split("-")
file_lang_family = file_lang_code.split("_", maxsplit=1)[0]
self.assertEqual(
file_lang_family,
config["language"]["family"],
"Wrong lang family file",
)
self.assertEqual(
file_lang_code, config["language"]["code"], "Wrong lang code file"
)
self.assertEqual(file_dataset, config["dataset"], "Wrong dataset file")
self.assertEqual(
file_quality, config["audio"]["quality"], "Wrong quality file"
)
def run_tests():
runner = unittest.TextTestRunner()
runner.run(unittest.makeSuite(VoiceTest))
# -----------------------------------------------------------------------------
def write_voices_json():
# {
# "<family>_<region>-<dataset>-<quality>": {
# "name": "<dataset>",
# "language": "<family>_<region>",
# "quality": "<quality>", // x_low, low, medium, high
# "num_speakers": int,
# "speaker_id_map": {
# "name": int,
# ...
# }
# "files": {
# "relative/path/to/file": {
# "size_bytes": int,
# "md5_digest": str, // hex
# },
# ...
# }
# },
# ...
# }
voices = {}
for onnx_path in _REPO_DIR.rglob("*.onnx"):
voice_dir = onnx_path.parent
config_path = voice_dir / f"{onnx_path.name}.json"
with open(config_path, "r", encoding="utf-8") as config_file:
config = json.load(config_file)
quality = config["audio"]["quality"]
dataset = config["dataset"]
lang_code = config["language"]["code"]
voice_key = f"{lang_code}-{dataset}-{quality}"
model_card_path = voice_dir / "MODEL_CARD"
assert model_card_path.exists(), f"Missing {model_card_path}"
voices[voice_key] = {
"name": dataset,
"language": lang_code,
"quality": quality,
"num_speakers": config["num_speakers"],
"speaker_id_map": config.get("speaker_id_map", {}),
"files": {
str(file_path.relative_to(_REPO_DIR)): {
"size_bytes": file_path.stat().st_size,
"md5_digest": get_file_hash(file_path),
}
for file_path in (
onnx_path,
config_path,
model_card_path,
)
},
}
with open(_REPO_DIR / "voices.json", "w", encoding="utf-8") as voices_file:
json.dump(voices, voices_file, indent=4, ensure_ascii=False)
def get_file_hash(path, bytes_per_chunk: int = 8192) -> str:
"""Hash a file in chunks using md5."""
path_hash = hashlib.md5()
with open(path, "rb") as path_file:
chunk = path_file.read(bytes_per_chunk)
while chunk:
path_hash.update(chunk)
chunk = path_file.read(bytes_per_chunk)
return path_hash.hexdigest()
# -----------------------------------------------------------------------------
if __name__ == "__main__":
run_tests()
print("Writing voices.json")
write_voices_json()
|