Spaces:
Paused
Paused
Fix text clean.py
Browse files
- app.py +3 -0
- fish_speech/text/clean.py +9 -47
app.py
CHANGED
@@ -1,4 +1,7 @@
|
|
1 |
import os
|
|
|
|
|
|
|
2 |
import queue
|
3 |
from huggingface_hub import snapshot_download
|
4 |
import hydra
|
|
|
1 |
import os
|
2 |
+
|
3 |
+
os.environ["TORCHAUDIO_USE_FFMPEG"] = "1"
|
4 |
+
|
5 |
import queue
|
6 |
from huggingface_hub import snapshot_download
|
7 |
import hydra
|
fish_speech/text/clean.py
CHANGED
@@ -1,61 +1,24 @@
|
|
1 |
-
import itertools
|
2 |
import re
|
3 |
|
4 |
-
LANGUAGE_UNICODE_RANGE_MAP = {
|
5 |
-
"ZH": [(0x4E00, 0x9FFF)],
|
6 |
-
"JP": [(0x4E00, 0x9FFF), (0x3040, 0x309F), (0x30A0, 0x30FF), (0x31F0, 0x31FF)],
|
7 |
-
"EN": [(0x0000, 0x007F)],
|
8 |
-
}
|
9 |
-
|
10 |
SYMBOLS_MAPPING = {
|
11 |
-
"：": ",",
|
12 |
-
"；": ",",
|
13 |
-
"，": ",",
|
14 |
-
"。": ".",
|
15 |
-
"！": "!",
|
16 |
-
"？": "?",
|
17 |
-
"\n": ".",
|
18 |
-
"·": ",",
|
19 |
-
"、": ",",
|
20 |
-
"...": "…",
|
21 |
"“": "'",
|
22 |
"”": "'",
|
23 |
"‘": "'",
|
24 |
"’": "'",
|
25 |
-
"
|
26 |
-
"
|
27 |
-
"
|
28 |
-
"
|
29 |
-
"
|
30 |
-
"
|
31 |
-
"
|
32 |
-
"
|
33 |
-
"
|
34 |
-
"]": "'",
|
35 |
-
"—": "-",
|
36 |
-
"～": "-",
|
37 |
-
"~": "-",
|
38 |
-
"・": "-",
|
39 |
-
"「": "'",
|
40 |
-
"」": "'",
|
41 |
-
";": ",",
|
42 |
-
":": ",",
|
43 |
}
|
44 |
|
45 |
REPLACE_SYMBOL_REGEX = re.compile(
|
46 |
"|".join(re.escape(p) for p in SYMBOLS_MAPPING.keys())
|
47 |
)
|
48 |
-
ALL_KNOWN_UTF8_RANGE = list(
|
49 |
-
itertools.chain.from_iterable(LANGUAGE_UNICODE_RANGE_MAP.values())
|
50 |
-
)
|
51 |
-
REMOVE_UNKNOWN_SYMBOL_REGEX = re.compile(
|
52 |
-
"[^"
|
53 |
-
+ "".join(
|
54 |
-
f"{re.escape(chr(start))}-{re.escape(chr(end))}"
|
55 |
-
for start, end in ALL_KNOWN_UTF8_RANGE
|
56 |
-
)
|
57 |
-
+ "]"
|
58 |
-
)
|
59 |
|
60 |
|
61 |
def clean_text(text):
|
@@ -64,6 +27,5 @@ def clean_text(text):
|
|
64 |
|
65 |
# Replace all chinese symbols with their english counterparts
|
66 |
text = REPLACE_SYMBOL_REGEX.sub(lambda x: SYMBOLS_MAPPING[x.group()], text)
|
67 |
-
# text = REMOVE_UNKNOWN_SYMBOL_REGEX.sub("", text)
|
68 |
|
69 |
return text
|
|
|
|
|
1 |
import re
|
2 |
|
|
|
|
|
|
|
|
|
|
|
|
|
3 |
SYMBOLS_MAPPING = {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4 |
"“": "'",
|
5 |
"”": "'",
|
6 |
"‘": "'",
|
7 |
"’": "'",
|
8 |
+
"【": "",
|
9 |
+
"】": "",
|
10 |
+
"[": "",
|
11 |
+
"]": "",
|
12 |
+
"（": "",
|
13 |
+
"）": "",
|
14 |
+
"(": "",
|
15 |
+
")": "",
|
16 |
+
"・": "·",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
17 |
}
|
18 |
|
19 |
REPLACE_SYMBOL_REGEX = re.compile(
|
20 |
"|".join(re.escape(p) for p in SYMBOLS_MAPPING.keys())
|
21 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
22 |
|
23 |
|
24 |
def clean_text(text):
|
|
|
27 |
|
28 |
# Replace all chinese symbols with their english counterparts
|
29 |
text = REPLACE_SYMBOL_REGEX.sub(lambda x: SYMBOLS_MAPPING[x.group()], text)
|
|
|
30 |
|
31 |
return text
|