davidberenstein1957 HF staff committed on
Commit
e4b6cc5
·
1 Parent(s): bff4352

feat: replace pipeline with individual generator

Browse files
Files changed (2) hide show
  1. app.py +1 -2
  2. src/distilabel_dataset_generator/sft.py +33 -12
app.py CHANGED
@@ -9,5 +9,4 @@ demo = gr.TabbedInterface(
9
  head="⚗️ Distilabel Dataset Generator",
10
  )
11
 
12
- if __name__ == "__main__":
13
- demo.launch()
 
9
  head="⚗️ Distilabel Dataset Generator",
10
  )
11
 
12
+ demo.launch()
 
src/distilabel_dataset_generator/sft.py CHANGED
@@ -1,4 +1,5 @@
1
  import gradio as gr
 
2
  from distilabel.llms import InferenceEndpointsLLM
3
  from distilabel.pipeline import Pipeline
4
  from distilabel.steps.tasks import MagpieGenerator, TextGeneration
@@ -111,13 +112,17 @@ The prompt you write should follow the same style and structure as the following
111
  User dataset description:
112
  """
113
 
114
- MODEL = "meta-llama/Meta-Llama-3.1-8B-Instruct"
115
 
116
  generate_description = TextGeneration(
117
  llm=InferenceEndpointsLLM(
118
  model_id=MODEL,
119
  tokenizer_id=MODEL,
120
- generation_kwargs={"temperature": 0.8, "max_new_tokens": 2048},
 
 
 
 
121
  ),
122
  use_system_prompt=True,
123
  )
@@ -137,7 +142,7 @@ def _generate_system_prompt(_dataset_description):
137
  )[0]["generation"]
138
 
139
 
140
- def _generate_dataset(_system_prompt, _num_turns=1, _num_rows=1):
141
  with Pipeline(name="sft") as pipeline:
142
  magpie_step = MagpieGenerator(
143
  llm=InferenceEndpointsLLM(
@@ -152,16 +157,28 @@ def _generate_dataset(_system_prompt, _num_turns=1, _num_rows=1):
152
  num_rows=_num_rows,
153
  system_prompt=_system_prompt,
154
  )
155
- distiset = pipeline.run()
156
- print(distiset)
157
- return distiset
 
 
 
 
 
 
 
 
 
 
158
 
159
 
160
  with gr.Blocks(
161
- title="⚗️ Distilabel Dataset Generator", head="⚗️ Distilabel Dataset Generator"
 
162
  ) as demo:
163
  dataset_description = gr.Textbox(
164
- label="Provide a description of the dataset", value="I am a dataset"
 
165
  )
166
 
167
  btn_generate_system_prompt = gr.Button(
@@ -177,10 +194,10 @@ with gr.Blocks(
177
  )
178
 
179
  btn_generate_sample_dataset = gr.Button(
180
- value="🧪 Generate Sample Dataset of 10 rows and a single turn"
181
  )
182
 
183
- table = gr.Dataframe(label="Generated Dataset")
184
 
185
  btn_generate_sample_dataset.click(
186
  fn=_generate_dataset,
@@ -190,9 +207,13 @@ with gr.Blocks(
190
 
191
  with gr.Row(variant="panel"):
192
  with gr.Column():
193
- num_turns = gr.Number(value=1, label="Number of turns in the conversation")
 
 
194
  with gr.Column():
195
- num_rows = gr.Number(value=1, label="Number of rows in the dataset")
 
 
196
 
197
  dataset_name_push_to_hub = gr.Textbox(label="Dataset Name to push to Hub")
198
 
 
1
  import gradio as gr
2
+ import pandas as pd
3
  from distilabel.llms import InferenceEndpointsLLM
4
  from distilabel.pipeline import Pipeline
5
  from distilabel.steps.tasks import MagpieGenerator, TextGeneration
 
112
  User dataset description:
113
  """
114
 
115
+ MODEL = "meta-llama/Meta-Llama-3.1-70B-Instruct"
116
 
117
  generate_description = TextGeneration(
118
  llm=InferenceEndpointsLLM(
119
  model_id=MODEL,
120
  tokenizer_id=MODEL,
121
+ generation_kwargs={
122
+ "temperature": 0.8,
123
+ "max_new_tokens": 2048,
124
+ "do_sample": True,
125
+ },
126
  ),
127
  use_system_prompt=True,
128
  )
 
142
  )[0]["generation"]
143
 
144
 
145
+ def _generate_dataset(_system_prompt, _num_turns=1, _num_rows=5):
146
  with Pipeline(name="sft") as pipeline:
147
  magpie_step = MagpieGenerator(
148
  llm=InferenceEndpointsLLM(
 
157
  num_rows=_num_rows,
158
  system_prompt=_system_prompt,
159
  )
160
+ magpie_step.load()
161
+ if _num_turns == 1:
162
+ outputs = {"instruction": [], "response": []}
163
+ for _ in range(_num_rows):
164
+ entry = next(magpie_step.process())[0][0]
165
+ outputs["instruction"].append(entry["instruction"])
166
+ outputs["response"].append(entry["response"])
167
+ else:
168
+ outputs = {"conversation": []}
169
+ for _ in range(_num_rows):
170
+ entry = next(magpie_step.process())[0][0]
171
+ outputs["conversation"].append(entry["conversation"])
172
+ return pd.DataFrame(outputs)
173
 
174
 
175
  with gr.Blocks(
176
+ title="⚗️ Distilabel Dataset Generator",
177
+ head="⚗️ Distilabel Dataset Generator",
178
  ) as demo:
179
  dataset_description = gr.Textbox(
180
+ label="Provide a description of the dataset",
181
+ value="A chemistry dataset for an assistant that explains chemical reactions and formulas",
182
  )
183
 
184
  btn_generate_system_prompt = gr.Button(
 
194
  )
195
 
196
  btn_generate_sample_dataset = gr.Button(
197
+ value="🧪 Generate Sample Dataset of 5 rows and a single turn"
198
  )
199
 
200
+ table = gr.Dataframe(label="Generated Dataset", wrap=True)
201
 
202
  btn_generate_sample_dataset.click(
203
  fn=_generate_dataset,
 
207
 
208
  with gr.Row(variant="panel"):
209
  with gr.Column():
210
+ num_turns = gr.Number(
211
+ value=1, label="Number of turns in the conversation", minimum=1
212
+ )
213
  with gr.Column():
214
+ num_rows = gr.Number(
215
+ value=1, label="Number of rows in the dataset", minimum=1
216
+ )
217
 
218
  dataset_name_push_to_hub = gr.Textbox(label="Dataset Name to push to Hub")
219