Spaces:
Running
Running
改进效率
Browse files- crazy_functions/代码重写为全英文_多线程.py +11 -14
crazy_functions/代码重写为全英文_多线程.py
CHANGED
@@ -10,16 +10,13 @@ def extract_code_block_carefully(txt):
|
|
10 |
txt_out = '```'.join(splitted[1:-1])
|
11 |
return txt_out
|
12 |
|
13 |
-
def breakdown_txt_to_satisfy_token_limit(txt, limit, must_break_at_empty_line=True):
|
14 |
-
from transformers import GPT2TokenizerFast
|
15 |
-
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
|
16 |
-
get_token_cnt = lambda txt: len(tokenizer(txt)["input_ids"])
|
17 |
def cut(txt_tocut, must_break_at_empty_line): # 递归
|
18 |
-
if get_token_cnt(txt_tocut) <= limit:
|
19 |
return [txt_tocut]
|
20 |
else:
|
21 |
lines = txt_tocut.split('\n')
|
22 |
-
estimated_line_cut = limit / get_token_cnt(txt_tocut) * len(lines)
|
23 |
estimated_line_cut = int(estimated_line_cut)
|
24 |
for cnt in reversed(range(estimated_line_cut)):
|
25 |
if must_break_at_empty_line:
|
@@ -27,7 +24,7 @@ def breakdown_txt_to_satisfy_token_limit(txt, limit, must_break_at_empty_line=Tr
|
|
27 |
print(cnt)
|
28 |
prev = "\n".join(lines[:cnt])
|
29 |
post = "\n".join(lines[cnt:])
|
30 |
-
if get_token_cnt(prev) < limit: break
|
31 |
if cnt == 0:
|
32 |
print('what the f?')
|
33 |
raise RuntimeError("存在一行极长的文本!")
|
@@ -86,12 +83,12 @@ def 全项目切换英文(txt, top_p, temperature, chatbot, history, sys_prompt,
|
|
86 |
|
87 |
|
88 |
# 第5步:Token限制下的截断与处理
|
89 |
-
MAX_TOKEN =
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
|
96 |
|
97 |
# 第6步:任务函数
|
@@ -107,7 +104,7 @@ def 全项目切换英文(txt, top_p, temperature, chatbot, history, sys_prompt,
|
|
107 |
try:
|
108 |
gpt_say = ""
|
109 |
# 分解代码文件
|
110 |
-
file_content_breakdown = breakdown_txt_to_satisfy_token_limit(file_content, MAX_TOKEN)
|
111 |
for file_content_partial in file_content_breakdown:
|
112 |
i_say = i_say_template(fp, file_content_partial)
|
113 |
# # ** gpt request **
|
|
|
10 |
txt_out = '```'.join(splitted[1:-1])
|
11 |
return txt_out
|
12 |
|
13 |
+
def breakdown_txt_to_satisfy_token_limit(txt, get_token_fn, limit, must_break_at_empty_line=True):
|
|
|
|
|
|
|
14 |
def cut(txt_tocut, must_break_at_empty_line): # 递归
|
15 |
+
if get_token_fn(txt_tocut) <= limit:
|
16 |
return [txt_tocut]
|
17 |
else:
|
18 |
lines = txt_tocut.split('\n')
|
19 |
+
estimated_line_cut = limit / get_token_fn(txt_tocut) * len(lines)
|
20 |
estimated_line_cut = int(estimated_line_cut)
|
21 |
for cnt in reversed(range(estimated_line_cut)):
|
22 |
if must_break_at_empty_line:
|
|
|
24 |
print(cnt)
|
25 |
prev = "\n".join(lines[:cnt])
|
26 |
post = "\n".join(lines[cnt:])
|
27 |
+
if get_token_fn(prev) < limit: break
|
28 |
if cnt == 0:
|
29 |
print('what the f?')
|
30 |
raise RuntimeError("存在一行极长的文本!")
|
|
|
83 |
|
84 |
|
85 |
# 第5步:Token限制下的截断与处理
|
86 |
+
MAX_TOKEN = 3000
|
87 |
+
from transformers import GPT2TokenizerFast
|
88 |
+
print('加载tokenizer中')
|
89 |
+
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
|
90 |
+
get_token_fn = lambda txt: len(tokenizer(txt)["input_ids"])
|
91 |
+
print('加载tokenizer结束')
|
92 |
|
93 |
|
94 |
# 第6步:任务函数
|
|
|
104 |
try:
|
105 |
gpt_say = ""
|
106 |
# 分解代码文件
|
107 |
+
file_content_breakdown = breakdown_txt_to_satisfy_token_limit(file_content, get_token_fn, MAX_TOKEN)
|
108 |
for file_content_partial in file_content_breakdown:
|
109 |
i_say = i_say_template(fp, file_content_partial)
|
110 |
# # ** gpt request **
|