qingxu99 committed on
Commit
dd648bd
·
1 Parent(s): a2002eb

disallow special token + limit num of file < 512

Browse files
crazy_functions/Latex全文润色.py CHANGED
@@ -14,7 +14,7 @@ class PaperFileGroup():
14
  import tiktoken
15
  from toolbox import get_conf
16
  enc = tiktoken.encoding_for_model(*get_conf('LLM_MODEL'))
17
- def get_token_num(txt): return len(enc.encode(txt))
18
  self.get_token_num = get_token_num
19
 
20
  def run_file_split(self, max_token_limit=1900):
 
14
  import tiktoken
15
  from toolbox import get_conf
16
  enc = tiktoken.encoding_for_model(*get_conf('LLM_MODEL'))
17
+ def get_token_num(txt): return len(enc.encode(txt, disallowed_special=()))
18
  self.get_token_num = get_token_num
19
 
20
  def run_file_split(self, max_token_limit=1900):
crazy_functions/Latex全文翻译.py CHANGED
@@ -14,7 +14,7 @@ class PaperFileGroup():
14
  import tiktoken
15
  from toolbox import get_conf
16
  enc = tiktoken.encoding_for_model(*get_conf('LLM_MODEL'))
17
- def get_token_num(txt): return len(enc.encode(txt))
18
  self.get_token_num = get_token_num
19
 
20
  def run_file_split(self, max_token_limit=1900):
 
14
  import tiktoken
15
  from toolbox import get_conf
16
  enc = tiktoken.encoding_for_model(*get_conf('LLM_MODEL'))
17
+ def get_token_num(txt): return len(enc.encode(txt, disallowed_special=()))
18
  self.get_token_num = get_token_num
19
 
20
  def run_file_split(self, max_token_limit=1900):
crazy_functions/crazy_utils.py CHANGED
@@ -6,7 +6,7 @@ def input_clipping(inputs, history, max_token_limit):
6
  import numpy as np
7
  from toolbox import get_conf
8
  enc = tiktoken.encoding_for_model(*get_conf('LLM_MODEL'))
9
- def get_token_num(txt): return len(enc.encode(txt))
10
 
11
  mode = 'input-and-history'
12
  # 当 输入部分的token占比 小于 全文的一半时,只裁剪历史
@@ -23,7 +23,7 @@ def input_clipping(inputs, history, max_token_limit):
23
 
24
  while n_token > max_token_limit:
25
  where = np.argmax(everything_token)
26
- encoded = enc.encode(everything[where])
27
  clipped_encoded = encoded[:len(encoded)-delta]
28
  everything[where] = enc.decode(clipped_encoded)[:-1] # -1 to remove the may-be illegal char
29
  everything_token[where] = get_token_num(everything[where])
 
6
  import numpy as np
7
  from toolbox import get_conf
8
  enc = tiktoken.encoding_for_model(*get_conf('LLM_MODEL'))
9
+ def get_token_num(txt): return len(enc.encode(txt, disallowed_special=()))
10
 
11
  mode = 'input-and-history'
12
  # 当 输入部分的token占比 小于 全文的一半时,只裁剪历史
 
23
 
24
  while n_token > max_token_limit:
25
  where = np.argmax(everything_token)
26
+ encoded = enc.encode(everything[where], disallowed_special=())
27
  clipped_encoded = encoded[:len(encoded)-delta]
28
  everything[where] = enc.decode(clipped_encoded)[:-1] # -1 to remove the may-be illegal char
29
  everything_token[where] = get_token_num(everything[where])
crazy_functions/代码重写为全英文_多线程.py CHANGED
@@ -62,7 +62,7 @@ def 全项目切换英文(txt, llm_kwargs, plugin_kwargs, chatbot, history, sys_
62
  import tiktoken
63
  from toolbox import get_conf
64
  enc = tiktoken.encoding_for_model(*get_conf('LLM_MODEL'))
65
- def get_token_fn(txt): return len(enc.encode(txt))
66
 
67
 
68
  # 第6步:任务函数
 
62
  import tiktoken
63
  from toolbox import get_conf
64
  enc = tiktoken.encoding_for_model(*get_conf('LLM_MODEL'))
65
+ def get_token_fn(txt): return len(enc.encode(txt, disallowed_special=()))
66
 
67
 
68
  # 第6步:任务函数
crazy_functions/批量Markdown翻译.py CHANGED
@@ -14,7 +14,7 @@ class PaperFileGroup():
14
  import tiktoken
15
  from toolbox import get_conf
16
  enc = tiktoken.encoding_for_model(*get_conf('LLM_MODEL'))
17
- def get_token_num(txt): return len(enc.encode(txt))
18
  self.get_token_num = get_token_num
19
 
20
  def run_file_split(self, max_token_limit=1900):
 
14
  import tiktoken
15
  from toolbox import get_conf
16
  enc = tiktoken.encoding_for_model(*get_conf('LLM_MODEL'))
17
+ def get_token_num(txt): return len(enc.encode(txt, disallowed_special=()))
18
  self.get_token_num = get_token_num
19
 
20
  def run_file_split(self, max_token_limit=1900):
crazy_functions/批量翻译PDF文档_多线程.py CHANGED
@@ -70,7 +70,7 @@ def 解析PDF(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot,
70
  from .crazy_utils import breakdown_txt_to_satisfy_token_limit_for_pdf
71
  from toolbox import get_conf
72
  enc = tiktoken.encoding_for_model(*get_conf('LLM_MODEL'))
73
- def get_token_num(txt): return len(enc.encode(txt))
74
  paper_fragments = breakdown_txt_to_satisfy_token_limit_for_pdf(
75
  txt=file_content, get_token_fn=get_token_num, limit=TOKEN_LIMIT_PER_FRAGMENT)
76
  page_one_fragments = breakdown_txt_to_satisfy_token_limit_for_pdf(
 
70
  from .crazy_utils import breakdown_txt_to_satisfy_token_limit_for_pdf
71
  from toolbox import get_conf
72
  enc = tiktoken.encoding_for_model(*get_conf('LLM_MODEL'))
73
+ def get_token_num(txt): return len(enc.encode(txt, disallowed_special=()))
74
  paper_fragments = breakdown_txt_to_satisfy_token_limit_for_pdf(
75
  txt=file_content, get_token_fn=get_token_num, limit=TOKEN_LIMIT_PER_FRAGMENT)
76
  page_one_fragments = breakdown_txt_to_satisfy_token_limit_for_pdf(
crazy_functions/理解PDF文档内容.py CHANGED
@@ -19,7 +19,7 @@ def 解析PDF(file_name, llm_kwargs, plugin_kwargs, chatbot, history, system_pro
19
  from .crazy_utils import breakdown_txt_to_satisfy_token_limit_for_pdf
20
  from toolbox import get_conf
21
  enc = tiktoken.encoding_for_model(*get_conf('LLM_MODEL'))
22
- def get_token_num(txt): return len(enc.encode(txt))
23
  paper_fragments = breakdown_txt_to_satisfy_token_limit_for_pdf(
24
  txt=file_content, get_token_fn=get_token_num, limit=TOKEN_LIMIT_PER_FRAGMENT)
25
  page_one_fragments = breakdown_txt_to_satisfy_token_limit_for_pdf(
 
19
  from .crazy_utils import breakdown_txt_to_satisfy_token_limit_for_pdf
20
  from toolbox import get_conf
21
  enc = tiktoken.encoding_for_model(*get_conf('LLM_MODEL'))
22
+ def get_token_num(txt): return len(enc.encode(txt, disallowed_special=()))
23
  paper_fragments = breakdown_txt_to_satisfy_token_limit_for_pdf(
24
  txt=file_content, get_token_fn=get_token_num, limit=TOKEN_LIMIT_PER_FRAGMENT)
25
  page_one_fragments = breakdown_txt_to_satisfy_token_limit_for_pdf(
crazy_functions/解析项目源代码.py CHANGED
@@ -11,7 +11,8 @@ def 解析源代码新(file_manifest, project_folder, llm_kwargs, plugin_kwargs,
11
  history_array = []
12
  sys_prompt_array = []
13
  report_part_1 = []
14
-
 
15
  ############################## <第一步,逐个文件分析,多线程> ##################################
16
  for index, fp in enumerate(file_manifest):
17
  with open(fp, 'r', encoding='utf-8', errors='replace') as f:
 
11
  history_array = []
12
  sys_prompt_array = []
13
  report_part_1 = []
14
+
15
+ assert len(file_manifest) <= 512, "源文件太多, 请缩减输入文件的数量, 或者删除此行并拆分file_manifest以保证结果能被分批存储。"
16
  ############################## <第一步,逐个文件分析,多线程> ##################################
17
  for index, fp in enumerate(file_manifest):
18
  with open(fp, 'r', encoding='utf-8', errors='replace') as f: