princepride
committed on
Commit
•
e64bf30
1
Parent(s):
5f97682
Update model.py
Browse files
model.py
CHANGED
@@ -60,15 +60,15 @@ class SpecialTokenFilter(Filter):
|
|
60 |
|
61 |
class SperSignFilter(Filter):
|
62 |
def __init__(self):
|
63 |
-
self.name = 's
|
64 |
self.code = []
|
65 |
|
66 |
def encoder(self, inputs):
|
67 |
encoded_inputs = []
|
68 |
self.code = [] # 清空 self.code
|
69 |
for i, input_str in enumerate(inputs):
|
70 |
-
if 's
|
71 |
-
encoded_str = input_str.replace('s
|
72 |
self.code.append(i) # 将包含 's%' 的字符串的索引存储到 self.code 中
|
73 |
else:
|
74 |
encoded_str = input_str
|
@@ -78,7 +78,33 @@ class SperSignFilter(Filter):
|
|
78 |
def decoder(self, inputs):
|
79 |
decoded_inputs = inputs.copy()
|
80 |
for i in self.code:
|
81 |
-
decoded_inputs[i] = decoded_inputs[i].replace('*', 's
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
82 |
return decoded_inputs
|
83 |
|
84 |
class SimilarFilter(Filter):
|
@@ -121,6 +147,65 @@ class SimilarFilter(Filter):
|
|
121 |
decoded_inputs.insert(i+j, new_str)
|
122 |
return decoded_inputs
|
123 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
124 |
script_dir = os.path.dirname(os.path.abspath(__file__))
|
125 |
parent_dir = os.path.dirname(os.path.dirname(os.path.dirname(script_dir)))
|
126 |
|
|
|
60 |
|
61 |
class SperSignFilter(Filter):
|
62 |
def __init__(self):
|
63 |
+
self.name = 's percentage sign filter'
|
64 |
self.code = []
|
65 |
|
66 |
def encoder(self, inputs):
|
67 |
encoded_inputs = []
|
68 |
self.code = [] # 清空 self.code
|
69 |
for i, input_str in enumerate(inputs):
|
70 |
+
if '%s' in input_str:
|
71 |
+
encoded_str = input_str.replace('%s', '*')
|
72 |
self.code.append(i) # 将包含 's%' 的字符串的索引存储到 self.code 中
|
73 |
else:
|
74 |
encoded_str = input_str
|
|
|
78 |
def decoder(self, inputs):
|
79 |
decoded_inputs = inputs.copy()
|
80 |
for i in self.code:
|
81 |
+
decoded_inputs[i] = decoded_inputs[i].replace('*', '%s') # 使用 self.code 中的索引还原原始字符串
|
82 |
+
return decoded_inputs
|
83 |
+
|
84 |
+
class ChevronsFilter(Filter):
    """Mask ``<...>`` spans with ``#`` placeholders and restore them later.

    ``encoder`` replaces every non-greedy ``<.*?>`` match in each input
    string with a single ``#`` and remembers, per affected string, its index
    and the list of removed spans.  ``decoder`` puts the remembered spans
    back, one per ``#`` placeholder, from left to right.

    Known limitation: a literal ``#`` that was already present in an input
    string is indistinguishable from a placeholder, so such strings may be
    restored incorrectly.
    """

    def __init__(self):
        self.name = 'chevrons filter'
        # List of (index, matches) tuples recorded by the last encoder() call.
        self.code = []

    def encoder(self, inputs):
        """Return a copy of ``inputs`` with each ``<...>`` span replaced by ``#``.

        Resets ``self.code`` and then records ``(index, matches)`` for every
        string that contained at least one span, where ``matches`` is the
        list of removed spans in order of appearance.
        """
        encoded_inputs = []
        self.code = []  # Discard state left over from a previous call.
        pattern = re.compile(r'<.*?>')
        for i, input_str in enumerate(inputs):
            matches = pattern.findall(input_str)
            if matches:
                encoded_str = pattern.sub('#', input_str)
                self.code.append((i, matches))
            else:
                encoded_str = input_str
            encoded_inputs.append(encoded_str)
        return encoded_inputs

    def decoder(self, inputs):
        """Return a copy of ``inputs`` with the recorded spans restored.

        For every ``(index, matches)`` entry in ``self.code`` the k-th ``#``
        of ``inputs[index]`` is replaced by ``matches[k]``.  Placeholders
        beyond the number of recorded spans are left as ``#``; recorded spans
        beyond the number of placeholders are dropped.
        """
        decoded_inputs = list(inputs)
        for i, matches in self.code:
            # Split once on the placeholder and re-join in a single pass so
            # that a '#' inside an already restored span (for example
            # '<a href="#top">') is never mistaken for the next placeholder.
            segments = decoded_inputs[i].split('#')
            rebuilt = [segments[0]]
            for position, segment in enumerate(segments[1:]):
                rebuilt.append(matches[position] if position < len(matches) else '#')
                rebuilt.append(segment)
            decoded_inputs[i] = ''.join(rebuilt)
        return decoded_inputs
|
109 |
|
110 |
class SimilarFilter(Filter):
|
|
|
147 |
decoded_inputs.insert(i+j, new_str)
|
148 |
return decoded_inputs
|
149 |
|
150 |
+
class ChineseFilter:
    """Remove capitalised pinyin words from a list and re-insert them later.

    ``encoder`` drops every element that is a single capitalised word made
    up entirely of syllables from the pinyin library and remembers its
    index; ``decoder`` inserts the remembered words back at those indices.
    """

    # Longest syllable length tried when segmenting a word.
    MAX_SYLLABLE_LEN = 6

    def __init__(self, pinyin_lib_file='./pinyin.txt'):
        """Load the syllable library from ``pinyin_lib_file``.

        The default path is relative to the current working directory.
        """
        self.name = 'chinese filter'
        # List of (index, word) tuples recorded by the last encoder() call.
        self.code = []
        self.pinyin_lib = self.load_pinyin_lib(pinyin_lib_file)

    def load_pinyin_lib(self, file_path):
        """Return the set of lower-cased, stripped, non-empty lines of ``file_path``."""
        with open(file_path, 'r', encoding='utf-8') as f:
            stripped_lines = (line.strip().lower() for line in f)
            # Skip blank lines so the empty string never becomes a "syllable".
            return {syllable for syllable in stripped_lines if syllable}

    def is_valid_chinese(self, word):
        """Return True if ``word`` is one capitalised word that parses as pinyin.

        The word must contain no internal whitespace, start with an
        upper-case character, and be accepted by ``is_pinyin``.
        """
        if len(word.split()) == 1 and word[0].isupper():
            return self.is_pinyin(word.lower())
        return False

    def encoder(self, inputs):
        """Return ``inputs`` without the words accepted by ``is_valid_chinese``.

        Resets ``self.code`` and records ``(index, word)`` for every removed
        word, where ``index`` is the word's position in ``inputs``.
        """
        encoded_inputs = []
        self.code = []  # Discard state left over from a previous call.
        for i, word in enumerate(inputs):
            if self.is_valid_chinese(word):
                self.code.append((i, word))
            else:
                encoded_inputs.append(word)
        return encoded_inputs

    def decoder(self, inputs):
        """Return a copy of ``inputs`` with the removed words re-inserted.

        Entries of ``self.code`` are stored in ascending index order, so
        inserting them in that order puts each word back at its original
        position as long as ``inputs`` has the length encoder() returned.
        """
        decoded_inputs = list(inputs)
        for i, word in self.code:
            decoded_inputs.insert(i, word)
        return decoded_inputs

    def is_pinyin(self, string):
        """Return True if ``string`` can be split entirely into library syllables.

        The comparison is case-insensitive.  Every possible split is
        considered, so a word such as "jianguo" is accepted as
        "jian" + "guo" even though its longest matching prefix "jiang"
        leaves the unparseable remainder "uo".  An empty string yields True.
        """
        string = string.lower()
        length = len(string)
        max_len = self.MAX_SYLLABLE_LEN
        # reachable[k] is True when string[:k] is a concatenation of syllables.
        reachable = [False] * (length + 1)
        reachable[0] = True
        for end in range(1, length + 1):
            for start in range(max(0, end - max_len), end):
                if reachable[start] and string[start:end] in self.pinyin_lib:
                    reachable[end] = True
                    break
        return reachable[length]
|
208 |
+
|
209 |
script_dir = os.path.dirname(os.path.abspath(__file__))
|
210 |
parent_dir = os.path.dirname(os.path.dirname(os.path.dirname(script_dir)))
|
211 |
|