sdiazlor HF staff committed on
Commit
b924828
1 Parent(s): d593617

fix: remove duplicated labels in the labels column and add a number-of-rows update listener in the raw pipeline

Browse files
src/distilabel_dataset_generator/apps/textcat.py CHANGED
@@ -279,11 +279,13 @@ def generate_dataset(
279
  else:
280
  dataframe["labels"] = dataframe["labels"].apply(
281
  lambda x: (
282
- [
283
- label.lower().strip()
284
- for label in x
285
- if label.lower().strip() in labels
286
- ]
 
 
287
  if isinstance(x, list)
288
  else None
289
  )
@@ -556,3 +558,8 @@ with app:
556
  inputs=[system_prompt, difficulty, clarity, labels, num_labels, num_rows],
557
  outputs=[pipeline_code],
558
  )
 
 
 
 
 
 
279
  else:
280
  dataframe["labels"] = dataframe["labels"].apply(
281
  lambda x: (
282
+ list(
283
+ set(
284
+ label.lower().strip()
285
+ for label in x
286
+ if label.lower().strip() in labels
287
+ )
288
+ )
289
  if isinstance(x, list)
290
  else None
291
  )
 
558
  inputs=[system_prompt, difficulty, clarity, labels, num_labels, num_rows],
559
  outputs=[pipeline_code],
560
  )
561
+ num_rows.change(
562
+ fn=generate_pipeline_code,
563
+ inputs=[system_prompt, difficulty, clarity, labels, num_labels, num_rows],
564
+ outputs=[pipeline_code],
565
+ )
src/distilabel_dataset_generator/pipelines/textcat.py CHANGED
@@ -199,7 +199,7 @@ def get_labeller_generator(system_prompt, labels, num_labels):
199
  tokenizer_id=MODEL,
200
  api_key=_get_next_api_key(),
201
  generation_kwargs={
202
- "temperature": 0.8,
203
  "max_new_tokens": 2048,
204
  },
205
  ),
 
199
  tokenizer_id=MODEL,
200
  api_key=_get_next_api_key(),
201
  generation_kwargs={
202
+ "temperature": 0.7,
203
  "max_new_tokens": 2048,
204
  },
205
  ),