yonyou-sg
/

nllb-200-distilled-1.3B

Text2Text Generation

Transformers

Safetensors

m2m_100

Inference Endpoints

Model card Files Files and versions Community

princepride committed on May 9

Commit

e64bf30

•

1 Parent(s): 5f97682

Update model.py

Browse files

Files changed (1) hide show

model.py +89 -4

model.py CHANGED Viewed

@@ -60,15 +60,15 @@ class SpecialTokenFilter(Filter):
 class SperSignFilter(Filter):
     def __init__(self):
-        self.name = 's persign filter'
         self.code = []
     def encoder(self, inputs):
         encoded_inputs = []
         self.code = []  # 清空 self.code
         for i, input_str in enumerate(inputs):
-            if 's%' in input_str:
-                encoded_str = input_str.replace('s%', '*')
                 self.code.append(i)  # 将包含 's%' 的字符串的索引存储到 self.code 中
             else:
                 encoded_str = input_str
@@ -78,7 +78,33 @@ class SperSignFilter(Filter):
     def decoder(self, inputs):
         decoded_inputs = inputs.copy()
         for i in self.code:
-            decoded_inputs[i] = decoded_inputs[i].replace('*', 's%')  # 使用 self.code 中的索引还原原始字符串
         return decoded_inputs
 class SimilarFilter(Filter):
@@ -121,6 +147,65 @@ class SimilarFilter(Filter):
                     decoded_inputs.insert(i+j, new_str)
         return decoded_inputs
 # Absolute path of the directory that contains this file.
 script_dir = os.path.dirname(os.path.abspath(__file__))
 # Three directory levels above script_dir (dirname applied three times).
 parent_dir = os.path.dirname(os.path.dirname(os.path.dirname(script_dir)))

 class SperSignFilter(Filter):
     def __init__(self):
+        self.name = 's percentage sign filter'
         self.code = []
     def encoder(self, inputs):
         encoded_inputs = []
         self.code = []  # 清空 self.code
         for i, input_str in enumerate(inputs):
+            if '%s' in input_str:
+                encoded_str = input_str.replace('%s', '*')
                 self.code.append(i)  # 将包含 's%' 的字符串的索引存储到 self.code 中
             else:
                 encoded_str = input_str
     def decoder(self, inputs):
         decoded_inputs = inputs.copy()
         for i in self.code:
+            decoded_inputs[i] = decoded_inputs[i].replace('*', '%s')  # 使用 self.code 中的索引还原原始字符串
+        return decoded_inputs
class ChevronsFilter(Filter):
    """Swap ``<...>`` spans for a ``#`` placeholder and restore them afterwards.

    ``encoder`` replaces every non-greedy ``<.*?>`` match in each input string
    with a single ``#`` and remembers, per string index, the list of spans it
    removed. ``decoder`` puts those spans back, left to right, in place of the
    first ``#`` characters found in the string at the same index.

    Known limitation: a ``#`` that was already present in the original text is
    indistinguishable from a placeholder, so restoration can land on it.
    """

    def __init__(self):
        self.name = 'chevrons filter'
        # List of (index, [matched spans]) recorded by the last encoder() call.
        self.code = []

    def encoder(self, inputs):
        """Return a copy of ``inputs`` with every ``<...>`` span replaced by ``#``.

        Args:
            inputs: iterable of strings.

        Returns:
            A new list of strings, same length and order as ``inputs``.
            ``self.code`` is reset and then filled with ``(index, matches)``
            for every string that contained at least one span.
        """
        pattern = re.compile(r'<.*?>')
        encoded_inputs = []
        self.code = []  # forget whatever a previous batch recorded
        for i, input_str in enumerate(inputs):
            # The pattern cannot match the empty string, so a non-empty
            # findall() result is equivalent to "search() succeeded".
            matches = pattern.findall(input_str)
            if matches:
                encoded_inputs.append(pattern.sub('#', input_str))
                self.code.append((i, matches))
            else:
                encoded_inputs.append(input_str)
        return encoded_inputs

    def decoder(self, inputs):
        """Restore the spans recorded by the last ``encoder`` call.

        Args:
            inputs: list of strings, indexed like the list ``encoder`` returned.

        Returns:
            A new list in which, for every recorded index, the first ``#``
            characters are replaced in order by the recorded spans. If the
            string holds fewer ``#`` than recorded spans, the surplus spans
            are dropped; extra ``#`` characters are left untouched.
        """
        decoded_inputs = inputs.copy()
        for i, matches in self.code:
            # Split once, left to right, instead of calling replace() per
            # match: repeated replace() rescans already-restored text, so a
            # span containing '#' (e.g. <a href="#x">) would swallow the
            # placeholder meant for the next span.
            pieces = decoded_inputs[i].split('#', len(matches))
            restored = [pieces[0]]
            for match, tail in zip(matches, pieces[1:]):
                restored.append(match)
                restored.append(tail)
            decoded_inputs[i] = ''.join(restored)
        return decoded_inputs
 class SimilarFilter(Filter):
                     decoded_inputs.insert(i+j, new_str)
         return decoded_inputs
class ChineseFilter:
    """Remove capitalised pinyin words from a batch and re-insert them later.

    ``encoder`` drops every item that is a single capitalised word made up
    entirely of syllables from the pinyin library, remembering its index.
    ``decoder`` inserts the remembered words back at those indexes.
    NOTE(review): presumably this keeps romanised Chinese names away from the
    translation step - confirm against the caller.
    """

    # Longest syllable length tried during segmentation. Presumably the length
    # of the longest entry in the pinyin library (e.g. "zhuang") - confirm.
    _MAX_SYLLABLE_LEN = 6

    def __init__(self, pinyin_lib_file='./pinyin.txt'):
        """Load the syllable library from ``pinyin_lib_file`` (one per line)."""
        self.name = 'chinese filter'
        # List of (index, word) pairs recorded by the last encoder() call.
        self.code = []
        self.pinyin_lib = self.load_pinyin_lib(pinyin_lib_file)

    def load_pinyin_lib(self, file_path):
        """Return the set of lower-cased, stripped, non-blank lines of a file.

        Raises:
            OSError: if ``file_path`` cannot be opened.
        """
        with open(file_path, 'r', encoding='utf-8') as f:
            stripped_lines = (line.strip() for line in f)
            # Blank lines would only add a useless empty-string entry.
            return {entry.lower() for entry in stripped_lines if entry}

    def is_valid_chinese(self, word):
        """Return True for a single capitalised word that segments into pinyin.

        The word must contain no internal whitespace and its first character
        must be upper case; the lower-cased word is then checked by
        ``is_pinyin``. Empty or whitespace-only input returns False.
        """
        if len(word.split()) == 1 and word[0].isupper():
            return self.is_pinyin(word.lower())
        return False

    def encoder(self, inputs):
        """Return ``inputs`` without the items accepted by ``is_valid_chinese``.

        ``self.code`` is reset and then filled with ``(index, word)`` for each
        removed item, where ``index`` is the item's position in ``inputs``.
        """
        encoded_inputs = []
        self.code = []  # forget whatever a previous batch recorded
        for i, word in enumerate(inputs):
            if self.is_valid_chinese(word):
                self.code.append((i, word))
            else:
                encoded_inputs.append(word)
        return encoded_inputs

    def decoder(self, inputs):
        """Return a copy of ``inputs`` with the removed words inserted back.

        Words are inserted in the order they were recorded (ascending index),
        so each lands at its original position when ``inputs`` has the same
        length as the list ``encoder`` returned.
        """
        decoded_inputs = inputs.copy()
        for i, word in self.code:
            decoded_inputs.insert(i, word)
        return decoded_inputs

    def is_pinyin(self, string):
        """Return True when ``string`` splits completely into library syllables.

        The string is lower-cased first. Every way of cutting it into pieces
        of at most ``_MAX_SYLLABLE_LEN`` characters is considered, so a word
        is accepted whenever at least one full segmentation exists. A purely
        greedy longest-prefix scan would reject e.g. "gangu" (gan + gu) when
        "gang" is also in the library. The empty string returns True.
        """
        string = string.lower()
        length = len(string)
        # reachable[k] is True when string[:k] can be fully segmented.
        reachable = [True] + [False] * length
        for end in range(1, length + 1):
            earliest_start = max(0, end - self._MAX_SYLLABLE_LEN)
            for start in range(earliest_start, end):
                if reachable[start] and string[start:end] in self.pinyin_lib:
                    reachable[end] = True
                    break
        return reachable[length]
 # Absolute path of the directory that contains this file.
 script_dir = os.path.dirname(os.path.abspath(__file__))
 # Three directory levels above script_dir (dirname applied three times).
 parent_dir = os.path.dirname(os.path.dirname(os.path.dirname(script_dir)))