princepride committed on
Commit
e64bf30
1 Parent(s): 5f97682

Update model.py

Browse files
Files changed (1) hide show
  1. model.py +89 -4
model.py CHANGED
@@ -60,15 +60,15 @@ class SpecialTokenFilter(Filter):
60
 
61
  class SperSignFilter(Filter):
62
  def __init__(self):
63
- self.name = 's persign filter'
64
  self.code = []
65
 
66
  def encoder(self, inputs):
67
  encoded_inputs = []
68
  self.code = [] # 清空 self.code
69
  for i, input_str in enumerate(inputs):
70
- if 's%' in input_str:
71
- encoded_str = input_str.replace('s%', '*')
72
  self.code.append(i) # 将包含 's%' 的字符串的索引存储到 self.code 中
73
  else:
74
  encoded_str = input_str
@@ -78,7 +78,33 @@ class SperSignFilter(Filter):
78
  def decoder(self, inputs):
79
  decoded_inputs = inputs.copy()
80
  for i in self.code:
81
- decoded_inputs[i] = decoded_inputs[i].replace('*', 's%') # 使用 self.code 中的索引还原原始字符串
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
  return decoded_inputs
83
 
84
  class SimilarFilter(Filter):
@@ -121,6 +147,65 @@ class SimilarFilter(Filter):
121
  decoded_inputs.insert(i+j, new_str)
122
  return decoded_inputs
123
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
124
  script_dir = os.path.dirname(os.path.abspath(__file__))
125
  parent_dir = os.path.dirname(os.path.dirname(os.path.dirname(script_dir)))
126
 
 
60
 
61
  class SperSignFilter(Filter):
62
  def __init__(self):
63
+ self.name = 's percentage sign filter'
64
  self.code = []
65
 
66
  def encoder(self, inputs):
67
  encoded_inputs = []
68
  self.code = [] # 清空 self.code
69
  for i, input_str in enumerate(inputs):
70
+ if '%s' in input_str:
71
+ encoded_str = input_str.replace('%s', '*')
72
  self.code.append(i) # 将包含 's%' 的字符串的索引存储到 self.code 中
73
  else:
74
  encoded_str = input_str
 
78
  def decoder(self, inputs):
79
  decoded_inputs = inputs.copy()
80
  for i in self.code:
81
+ decoded_inputs[i] = decoded_inputs[i].replace('*', '%s') # 使用 self.code 中的索引还原原始字符串
82
+ return decoded_inputs
83
+
84
class ChevronsFilter(Filter):
    """Swap every ``<...>`` span for a ``#`` placeholder and restore it later.

    ``encoder`` remembers, per input string, the spans it removed;
    ``decoder`` puts them back in the same left-to-right order.

    NOTE(review): a literal ``#`` already present in an input string cannot
    be told apart from a placeholder when decoding; the first ``#`` characters
    found are treated as placeholders. Confirm callers never pass such text.
    """

    def __init__(self):
        self.name = 'chevrons filter'
        # List of (index, [removed spans]) tuples filled in by ``encoder``.
        self.code = []

    def encoder(self, inputs):
        """Replace each ``<...>`` span in every string with ``#``.

        Args:
            inputs: iterable of strings.

        Returns:
            A new list with the same length and order as ``inputs``.
            ``self.code`` is reset and then records ``(index, spans)`` for
            every string that contained at least one span.
        """
        encoded_inputs = []
        self.code = []  # forget whatever a previous call recorded
        # Non-greedy, so each match is the shortest "<...>" run; "." does not
        # cross newlines.
        pattern = re.compile(r'<.*?>')
        for index, text in enumerate(inputs):
            spans = pattern.findall(text)
            if spans:
                self.code.append((index, spans))
                text = pattern.sub('#', text)
            encoded_inputs.append(text)
        return encoded_inputs

    def decoder(self, inputs):
        """Put the spans recorded by the last ``encoder`` call back.

        The placeholders are filled in one left-to-right pass, so a restored
        span that itself contains ``#`` (for example ``<a href="#top">``) is
        never mistaken for a later placeholder.

        Args:
            inputs: list of strings, indexed like the list given to
                ``encoder``.

        Returns:
            A shallow copy of ``inputs`` with placeholders replaced. When a
            string holds fewer ``#`` than recorded spans the surplus spans
            are dropped; surplus ``#`` characters are left untouched.

        Raises:
            IndexError: if ``inputs`` is shorter than a recorded index.
        """
        decoded_inputs = inputs.copy()
        for index, spans in self.code:
            # Split on at most len(spans) placeholders, then weave the spans
            # back in between the resulting pieces.
            pieces = decoded_inputs[index].split('#', len(spans))
            restored = [pieces[0]]
            for span, tail in zip(spans, pieces[1:]):
                restored.append(span)
                restored.append(tail)
            decoded_inputs[index] = ''.join(restored)
        return decoded_inputs
109
 
110
  class SimilarFilter(Filter):
 
147
  decoded_inputs.insert(i+j, new_str)
148
  return decoded_inputs
149
 
150
class ChineseFilter:
    """Remove capitalised pinyin words from a batch and re-insert them later.

    A word is filtered when it is a single capitalised token that can be
    split completely into syllables from the pinyin library file.
    """

    # Longest syllable length tried while segmenting a word.
    MAX_SYLLABLE_LEN = 6

    def __init__(self, pinyin_lib_file='./pinyin.txt'):
        self.name = 'chinese filter'
        # List of (original index, word) tuples filled in by ``encoder``.
        self.code = []
        self.pinyin_lib = self.load_pinyin_lib(pinyin_lib_file)

    def load_pinyin_lib(self, file_path):
        """Read one syllable per line and return them as a lower-cased set.

        Blank lines are ignored.

        Raises:
            OSError: if ``file_path`` cannot be opened.
        """
        with open(file_path, 'r', encoding='utf-8') as f:
            stripped = (line.strip().lower() for line in f)
            return {syllable for syllable in stripped if syllable}

    def is_valid_chinese(self, word):
        """Return True for a single capitalised token made only of pinyin."""
        # An empty or whitespace-only string splits into zero tokens, so the
        # first test also protects the ``word[0]`` access.
        if len(word.split()) != 1 or not word[0].isupper():
            return False
        return self.is_pinyin(word)

    def encoder(self, inputs):
        """Drop every word accepted by ``is_valid_chinese``.

        Args:
            inputs: iterable of strings.

        Returns:
            A new list holding the remaining words in their original order.
            ``self.code`` is reset and then records ``(index, word)`` for
            each dropped word, where ``index`` is its position in ``inputs``.
        """
        encoded_inputs = []
        self.code = []  # forget whatever a previous call recorded
        for index, word in enumerate(inputs):
            if self.is_valid_chinese(word):
                self.code.append((index, word))
            else:
                encoded_inputs.append(word)
        return encoded_inputs

    def decoder(self, inputs):
        """Re-insert the words removed by the last ``encoder`` call.

        Args:
            inputs: list with one entry per word that ``encoder`` kept.

        Returns:
            A copy of ``inputs`` with each recorded word inserted at its
            original index.
        """
        decoded_inputs = inputs.copy()
        # ``self.code`` is in ascending index order, so each insert lands on
        # the word's original position once the earlier ones are back.
        for index, word in self.code:
            decoded_inputs.insert(index, word)
        return decoded_inputs

    def is_pinyin(self, string):
        """Tell whether ``string`` is a concatenation of library syllables.

        The comparison is case-insensitive. Every possible segmentation is
        considered, so "mingu" is accepted as "min" + "gu" even though
        "ming" is also a syllable. The empty string is not pinyin.
        """
        string = string.lower()
        length = len(string)
        if length == 0:
            return False
        # reachable[k] is True when string[:k] splits fully into syllables.
        reachable = [True] + [False] * length
        for end in range(1, length + 1):
            first_start = max(0, end - self.MAX_SYLLABLE_LEN)
            reachable[end] = any(
                reachable[start] and string[start:end] in self.pinyin_lib
                for start in range(first_start, end)
            )
        return reachable[length]
208
+
209
  script_dir = os.path.dirname(os.path.abspath(__file__))
210
  parent_dir = os.path.dirname(os.path.dirname(os.path.dirname(script_dir)))
211