fix: duplicated labels in the labels column and missing number-of-rows update listener in raw pipeline
Browse files
src/distilabel_dataset_generator/apps/textcat.py
CHANGED
@@ -279,11 +279,13 @@ def generate_dataset(
|
|
279 |
else:
|
280 |
dataframe["labels"] = dataframe["labels"].apply(
|
281 |
lambda x: (
|
282 |
-
|
283 |
-
|
284 |
-
|
285 |
-
|
286 |
-
|
|
|
|
|
287 |
if isinstance(x, list)
|
288 |
else None
|
289 |
)
|
@@ -556,3 +558,8 @@ with app:
|
|
556 |
inputs=[system_prompt, difficulty, clarity, labels, num_labels, num_rows],
|
557 |
outputs=[pipeline_code],
|
558 |
)
|
|
|
|
|
|
|
|
|
|
|
|
279 |
else:
|
280 |
dataframe["labels"] = dataframe["labels"].apply(
|
281 |
lambda x: (
|
282 |
+
list(
|
283 |
+
set(
|
284 |
+
label.lower().strip()
|
285 |
+
for label in x
|
286 |
+
if label.lower().strip() in labels
|
287 |
+
)
|
288 |
+
)
|
289 |
if isinstance(x, list)
|
290 |
else None
|
291 |
)
|
|
|
558 |
inputs=[system_prompt, difficulty, clarity, labels, num_labels, num_rows],
|
559 |
outputs=[pipeline_code],
|
560 |
)
|
561 |
+
num_rows.change(
|
562 |
+
fn=generate_pipeline_code,
|
563 |
+
inputs=[system_prompt, difficulty, clarity, labels, num_labels, num_rows],
|
564 |
+
outputs=[pipeline_code],
|
565 |
+
)
|
src/distilabel_dataset_generator/pipelines/textcat.py
CHANGED
@@ -199,7 +199,7 @@ def get_labeller_generator(system_prompt, labels, num_labels):
|
|
199 |
tokenizer_id=MODEL,
|
200 |
api_key=_get_next_api_key(),
|
201 |
generation_kwargs={
|
202 |
-
"temperature": 0.
|
203 |
"max_new_tokens": 2048,
|
204 |
},
|
205 |
),
|
|
|
199 |
tokenizer_id=MODEL,
|
200 |
api_key=_get_next_api_key(),
|
201 |
generation_kwargs={
|
202 |
+
"temperature": 0.7,
|
203 |
"max_new_tokens": 2048,
|
204 |
},
|
205 |
),
|