File size: 5,767 Bytes
9813f91
 
 
 
 
 
 
432eb42
c5079c2
9813f91
 
eb87ba4
9813f91
 
 
c5079c2
 
 
 
 
 
 
 
 
 
 
 
 
9813f91
c5079c2
 
 
 
eb87ba4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9813f91
 
 
 
 
 
 
 
 
eb87ba4
 
 
 
 
 
 
 
 
 
9813f91
 
 
 
 
eb87ba4
9813f91
eb87ba4
 
 
 
9813f91
 
 
 
 
 
 
 
 
 
 
 
 
 
5e8fd04
9813f91
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b5ddb7e
432eb42
478872e
 
a7ca30e
432eb42
 
 
 
478872e
 
 
432eb42
 
 
 
 
b5ddb7e
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
import os
import logging
import traceback

import openai
import gradio as gr
import ujson as json
import commentjson
import openpyxl

import modules.presets as presets
from modules.utils import get_file_hash, count_token
from modules.presets import i18n

def _cell_text(value):
    """Return a cell value as text, mapping an empty (None) cell to ""."""
    return "" if value is None else str(value)


def excel_to_jsonl(filepath, preview=False):
    """Convert the active sheet of an Excel workbook into chat training records.

    The first row is used as the header. A row is converted when it has
    non-empty "提问" (question) and "答案" (answer) cells; a non-empty "系统"
    cell, if present, becomes a leading system message. Rows whose cells are
    all empty are ignored silently, and rows missing a question or an answer
    are skipped with a warning.

    Args:
        filepath: Path of the workbook, handed to ``openpyxl.load_workbook``.
        preview: Unused; kept so existing callers keep working.

    Returns:
        A list of ``{"messages": [...]}`` dicts, one per converted row. The
        list is empty when the sheet has no rows at all.
    """
    # Open the workbook and take its active worksheet.
    workbook = openpyxl.load_workbook(filepath)
    sheet = workbook.active

    rows = sheet.iter_rows(values_only=True)
    # An empty sheet yields no rows, so there is no header to read.
    headers = next(rows, None)
    if headers is None:
        return []

    formatted_jsonl = []
    for row in rows:
        row_data = dict(zip(headers, row))
        if not any(row_data.values()):
            continue

        # Check the cell values, not just the header names: a present column
        # with an empty cell must not turn into a message with no content.
        question = _cell_text(row_data.get("提问"))
        answer = _cell_text(row_data.get("答案"))
        if not question or not answer:
            logging.warning(f"跳过一行数据,因为没有找到提问和答案: {row_data}")
            continue

        messages = []
        system = _cell_text(row_data.get("系统"))
        if system:
            messages.append({"role": "system", "content": system})
        messages.append({"role": "user", "content": question})
        messages.append({"role": "assistant", "content": answer})
        formatted_jsonl.append({"messages": messages})
    return formatted_jsonl

def jsonl_save_to_disk(jsonl, filepath):
    """Write records as a JSON Lines file under ``files/`` and return its path.

    Args:
        jsonl: Iterable of JSON-serialisable records; each becomes one line.
        filepath: Path of the source file, passed to ``get_file_hash`` to
            derive the output file name.

    Returns:
        The path of the written file, ``files/<hash>.jsonl``.
    """
    file_hash = get_file_hash(file_paths=[filepath])
    os.makedirs("files", exist_ok=True)
    save_path = f"files/{file_hash}.jsonl"
    # ensure_ascii=False emits raw non-ASCII characters, so the encoding has to
    # be pinned: the platform default is not UTF-8 everywhere.
    with open(save_path, "w", encoding="utf-8") as f:
        f.write("\n".join(json.dumps(record, ensure_ascii=False) for record in jsonl))
    return save_path

def estimate_cost(ds):
    """Build a message with the token count and estimated per-epoch cost.

    Args:
        ds: Sequence of records shaped like ``{"messages": [{"content": ...}]}``.

    Returns:
        A user-facing string with the token count reported by ``count_token``
        and a cost computed at 0.008 per 1000 tokens.
    """
    # Flatten every message body of every record into one newline-joined text.
    contents = [message["content"] for sample in ds for message in sample["messages"]]
    tokens = count_token("\n".join(contents))
    cost = tokens / 1000 * 0.008
    return f"Token 数约为 {tokens},预估每轮(epoch)费用约为 {cost} 美元。"


def handle_dataset_selection(file_src):
    """Load the selected dataset and report a preview and a cost estimate.

    Files ending in ``.jsonl`` are parsed line by line; anything else is
    handed to ``excel_to_jsonl``.

    Args:
        file_src: Object exposing the dataset path as ``.name``.

    Returns:
        A 3-tuple: the first record (or ``""`` when the dataset is empty), a
        Gradio update that is interactive only when records were found, and
        the string produced by ``estimate_cost``.
    """
    logging.info(f"Loading dataset {file_src.name}...")
    if file_src.name.endswith(".jsonl"):
        # Read as UTF-8 rather than the platform default, and skip blank
        # lines, which are not valid JSON documents.
        with open(file_src.name, "r", encoding="utf-8") as f:
            ds = [json.loads(line) for line in f if line.strip()]
    else:
        ds = excel_to_jsonl(file_src.name)

    # An empty dataset has no first record to preview.
    preview = ds[0] if ds else ""

    return preview, gr.update(interactive=bool(ds)), estimate_cost(ds)

def upload_to_openai(file_src):
    """Upload the selected dataset to OpenAI as a fine-tuning file.

    A path ending in ``.xlsx`` is first converted with ``excel_to_jsonl`` and
    written to disk with ``jsonl_save_to_disk``; the resulting file is what
    gets uploaded.

    Args:
        file_src: Object exposing the dataset path as ``.name``.

    Returns:
        ``(file_id, message)``. On failure the id is ``""`` and the message
        carries the error text.
    """
    openai.api_key = os.getenv("OPENAI_API_KEY")
    dspath = file_src.name
    logging.info(f"Uploading dataset {dspath}...")
    if dspath.endswith(".xlsx"):
        jsonl = excel_to_jsonl(dspath)
        dspath = jsonl_save_to_disk(jsonl, dspath)
    try:
        # The context manager closes the handle whether the upload succeeds or raises.
        with open(dspath, "rb") as dataset_file:
            uploaded = openai.File.create(
                file=dataset_file,
                purpose='fine-tune'
            )
        return uploaded.id, "上传成功"
    except Exception as e:
        traceback.print_exc()
        return "", f"上传失败,原因:{ e }"

def build_event_description(id, status, trained_tokens, name=i18n("暂时未知")):
    """Format one fine-tuning job as a Markdown snippet.

    Args:
        id: Job identifier.
        status: Job status text.
        trained_tokens: Number of tokens trained so far.
        name: Model name. ``None`` is treated the same as omitting the
            argument, so the "unknown" placeholder is shown.

    Returns:
        A multi-line string with a heading and the job details.
    """
    # get_training_status forwards job["fine_tuned_model"] explicitly, so the
    # parameter default never applies there; map a missing name to the same
    # placeholder instead of printing "None".
    if name is None:
        name = i18n("暂时未知")
    # convert to markdown
    return f"""
    #### 训练任务 {id}

    模型名称:{name}

    状态:{status}

    已经训练了 {trained_tokens} 个token
    """

def start_training(file_id, suffix, epochs):
    """Create a gpt-3.5-turbo fine-tuning job and describe it.

    Args:
        file_id: Id of the uploaded training file.
        suffix: Suffix passed through to the job.
        epochs: Value used for the ``n_epochs`` hyperparameter.

    Returns:
        The text from ``build_event_description`` on success, otherwise a
        user-facing error message.
    """
    openai.api_key = os.getenv("OPENAI_API_KEY")
    try:
        job = openai.FineTuningJob.create(
            training_file=file_id,
            model="gpt-3.5-turbo",
            suffix=suffix,
            hyperparameters={"n_epochs": epochs},
        )
        return build_event_description(job.id, job.status, job.trained_tokens)
    except Exception as e:
        traceback.print_exc()
        reason = str(e)
        if "is not ready" not in reason:
            return f"训练失败,原因:{ e }"
        # The uploaded file is still being processed on OpenAI's side.
        return "训练出错,因为文件还没准备好。OpenAI 需要一点时间准备文件,过几分钟再来试试。"

def get_training_status():
    """Describe up to ten fine-tuning jobs, leaving out cancelled ones.

    Returns:
        ``(text, update)``: the job descriptions joined by blank lines, and a
        Gradio update that is interactive only when at least one job is listed.
    """
    openai.api_key = os.getenv("OPENAI_API_KEY")
    descriptions = []
    for job in openai.FineTuningJob.list(limit=10)["data"]:
        if job["status"] == "cancelled":
            continue
        descriptions.append(
            build_event_description(
                job["id"],
                job["status"],
                job["trained_tokens"],
                job["fine_tuned_model"],
            )
        )
    has_jobs = len(descriptions) > 0
    return "\n\n".join(descriptions), gr.update(interactive=has_jobs)

def handle_dataset_clear():
    """Return two Gradio updates: one resetting a value to None, one disabling interaction."""
    reset_value = gr.update(value=None)
    disable_interaction = gr.update(interactive=False)
    return reset_value, disable_interaction

def add_to_models():
    """Register succeeded fine-tuned models in memory and in ``config.json``.

    Model names of all succeeded jobs are appended to ``presets.MODELS`` and
    to the ``extra_models`` list in ``config.json``, skipping names already
    present in each.

    Returns:
        ``(update, message)``: a Gradio update carrying the new model choices
        and a message reporting the number of succeeded jobs.
    """
    openai.api_key = os.getenv("OPENAI_API_KEY")
    succeeded_jobs = [job for job in openai.FineTuningJob.list()["data"] if job["status"] == "succeeded"]
    extra_models = [job["fine_tuned_model"] for job in succeeded_jobs]
    for model in extra_models:
        if model not in presets.MODELS:
            presets.MODELS.append(model)

    # Pin UTF-8 so non-ASCII text in the config does not depend on the
    # platform's default encoding.
    with open('config.json', 'r', encoding='utf-8') as f:
        data = commentjson.load(f)
    saved_models = data.setdefault('extra_models', [])
    for model in extra_models:
        if model not in saved_models:
            saved_models.append(model)
    with open('config.json', 'w', encoding='utf-8') as f:
        # ensure_ascii=False keeps existing non-ASCII values readable instead
        # of rewriting them as \u escapes.
        commentjson.dump(data, f, indent=4, ensure_ascii=False)

    return gr.update(choices=presets.MODELS), f"成功添加了 {len(succeeded_jobs)} 个模型。"

def cancel_all_jobs():
    """Cancel every fine-tuning job that has not reached a final status.

    Returns:
        A message reporting how many jobs were sent a cancel request.
    """
    openai.api_key = os.getenv("OPENAI_API_KEY")
    # "failed" is final just like "cancelled" and "succeeded"; such jobs can
    # no longer be cancelled, so they are left out of the request loop.
    finished_statuses = ("cancelled", "succeeded", "failed")
    jobs = [job for job in openai.FineTuningJob.list()["data"] if job["status"] not in finished_statuses]
    for job in jobs:
        openai.FineTuningJob.cancel(job["id"])
    return f"成功取消了 {len(jobs)} 个训练任务。"