Commit
·
e4b6cc5
1
Parent(s):
bff4352
feat: replace pipeline with individual generator
Browse files
- app.py +1 -2
- src/distilabel_dataset_generator/sft.py +33 -12
app.py
CHANGED
@@ -9,5 +9,4 @@ demo = gr.TabbedInterface(
|
|
9 |
head="⚗️ Distilabel Dataset Generator",
|
10 |
)
|
11 |
|
12 |
-
|
13 |
-
demo.launch()
|
|
|
9 |
head="⚗️ Distilabel Dataset Generator",
|
10 |
)
|
11 |
|
12 |
+
demo.launch()
|
|
src/distilabel_dataset_generator/sft.py
CHANGED
@@ -1,4 +1,5 @@
|
|
1 |
import gradio as gr
|
|
|
2 |
from distilabel.llms import InferenceEndpointsLLM
|
3 |
from distilabel.pipeline import Pipeline
|
4 |
from distilabel.steps.tasks import MagpieGenerator, TextGeneration
|
@@ -111,13 +112,17 @@ The prompt you write should follow the same style and structure as the following
|
|
111 |
User dataset description:
|
112 |
"""
|
113 |
|
114 |
-
MODEL = "meta-llama/Meta-Llama-3.1-
|
115 |
|
116 |
generate_description = TextGeneration(
|
117 |
llm=InferenceEndpointsLLM(
|
118 |
model_id=MODEL,
|
119 |
tokenizer_id=MODEL,
|
120 |
-
generation_kwargs={
|
|
|
|
|
|
|
|
|
121 |
),
|
122 |
use_system_prompt=True,
|
123 |
)
|
@@ -137,7 +142,7 @@ def _generate_system_prompt(_dataset_description):
|
|
137 |
)[0]["generation"]
|
138 |
|
139 |
|
140 |
-
def _generate_dataset(_system_prompt, _num_turns=1, _num_rows=1):
|
141 |
with Pipeline(name="sft") as pipeline:
|
142 |
magpie_step = MagpieGenerator(
|
143 |
llm=InferenceEndpointsLLM(
|
@@ -152,16 +157,28 @@ def _generate_dataset(_system_prompt, _num_turns=1, _num_rows=1):
|
|
152 |
num_rows=_num_rows,
|
153 |
system_prompt=_system_prompt,
|
154 |
)
|
155 |
-
|
156 |
-
|
157 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
158 |
|
159 |
|
160 |
with gr.Blocks(
|
161 |
-
title="⚗️ Distilabel Dataset Generator",
|
|
|
162 |
) as demo:
|
163 |
dataset_description = gr.Textbox(
|
164 |
-
label="Provide a description of the dataset",
|
|
|
165 |
)
|
166 |
|
167 |
btn_generate_system_prompt = gr.Button(
|
@@ -177,10 +194,10 @@ with gr.Blocks(
|
|
177 |
)
|
178 |
|
179 |
btn_generate_sample_dataset = gr.Button(
|
180 |
-
value="🧪 Generate Sample Dataset of
|
181 |
)
|
182 |
|
183 |
-
table = gr.Dataframe(label="Generated Dataset")
|
184 |
|
185 |
btn_generate_sample_dataset.click(
|
186 |
fn=_generate_dataset,
|
@@ -190,9 +207,13 @@ with gr.Blocks(
|
|
190 |
|
191 |
with gr.Row(variant="panel"):
|
192 |
with gr.Column():
|
193 |
-
num_turns = gr.Number(
|
|
|
|
|
194 |
with gr.Column():
|
195 |
-
num_rows = gr.Number(
|
|
|
|
|
196 |
|
197 |
dataset_name_push_to_hub = gr.Textbox(label="Dataset Name to push to Hub")
|
198 |
|
|
|
1 |
import gradio as gr
|
2 |
+
import pandas as pd
|
3 |
from distilabel.llms import InferenceEndpointsLLM
|
4 |
from distilabel.pipeline import Pipeline
|
5 |
from distilabel.steps.tasks import MagpieGenerator, TextGeneration
|
|
|
112 |
User dataset description:
|
113 |
"""
|
114 |
|
115 |
+
MODEL = "meta-llama/Meta-Llama-3.1-70B-Instruct"
|
116 |
|
117 |
generate_description = TextGeneration(
|
118 |
llm=InferenceEndpointsLLM(
|
119 |
model_id=MODEL,
|
120 |
tokenizer_id=MODEL,
|
121 |
+
generation_kwargs={
|
122 |
+
"temperature": 0.8,
|
123 |
+
"max_new_tokens": 2048,
|
124 |
+
"do_sample": True,
|
125 |
+
},
|
126 |
),
|
127 |
use_system_prompt=True,
|
128 |
)
|
|
|
142 |
)[0]["generation"]
|
143 |
|
144 |
|
145 |
+
def _generate_dataset(_system_prompt, _num_turns=1, _num_rows=5):
|
146 |
with Pipeline(name="sft") as pipeline:
|
147 |
magpie_step = MagpieGenerator(
|
148 |
llm=InferenceEndpointsLLM(
|
|
|
157 |
num_rows=_num_rows,
|
158 |
system_prompt=_system_prompt,
|
159 |
)
|
160 |
+
magpie_step.load()
|
161 |
+
if _num_turns == 1:
|
162 |
+
outputs = {"instruction": [], "response": []}
|
163 |
+
for _ in range(_num_rows):
|
164 |
+
entry = next(magpie_step.process())[0][0]
|
165 |
+
outputs["instruction"].append(entry["instruction"])
|
166 |
+
outputs["response"].append(entry["response"])
|
167 |
+
else:
|
168 |
+
outputs = {"conversation": []}
|
169 |
+
for _ in range(_num_rows):
|
170 |
+
entry = next(magpie_step.process())[0][0]
|
171 |
+
outputs["conversation"].append(entry["conversation"])
|
172 |
+
return pd.DataFrame(outputs)
|
173 |
|
174 |
|
175 |
with gr.Blocks(
|
176 |
+
title="⚗️ Distilabel Dataset Generator",
|
177 |
+
head="⚗️ Distilabel Dataset Generator",
|
178 |
) as demo:
|
179 |
dataset_description = gr.Textbox(
|
180 |
+
label="Provide a description of the dataset",
|
181 |
+
value="A chemistry dataset for an assistant that explains chemical reactions and formulas",
|
182 |
)
|
183 |
|
184 |
btn_generate_system_prompt = gr.Button(
|
|
|
194 |
)
|
195 |
|
196 |
btn_generate_sample_dataset = gr.Button(
|
197 |
+
value="🧪 Generate Sample Dataset of 5 rows and a single turn"
|
198 |
)
|
199 |
|
200 |
+
table = gr.Dataframe(label="Generated Dataset", wrap=True)
|
201 |
|
202 |
btn_generate_sample_dataset.click(
|
203 |
fn=_generate_dataset,
|
|
|
207 |
|
208 |
with gr.Row(variant="panel"):
|
209 |
with gr.Column():
|
210 |
+
num_turns = gr.Number(
|
211 |
+
value=1, label="Number of turns in the conversation", minimum=1
|
212 |
+
)
|
213 |
with gr.Column():
|
214 |
+
num_rows = gr.Number(
|
215 |
+
value=1, label="Number of rows in the dataset", minimum=1
|
216 |
+
)
|
217 |
|
218 |
dataset_name_push_to_hub = gr.Textbox(label="Dataset Name to push to Hub")
|
219 |
|