Commit
·
a13f86c
1
Parent(s):
291ad35
fix: stop sequences for textgen and add examples to pipeline definition
Browse files
src/distilabel_dataset_generator/apps/sft.py
CHANGED
@@ -89,14 +89,18 @@ def generate_dataset(
|
|
89 |
"You can only generate a dataset with 5000 or fewer rows. Setting to 5000."
|
90 |
)
|
91 |
|
92 |
-
if num_rows <
|
93 |
duration = 60
|
94 |
-
elif num_rows <
|
95 |
-
duration =
|
|
|
|
|
|
|
|
|
96 |
elif num_rows < 1000:
|
97 |
-
duration =
|
98 |
else:
|
99 |
-
duration =
|
100 |
|
101 |
result_queue = multiprocessing.Queue()
|
102 |
p = multiprocessing.Process(
|
@@ -127,7 +131,7 @@ def generate_dataset(
|
|
127 |
repo_id=repo_id,
|
128 |
private=private,
|
129 |
include_script=False,
|
130 |
-
token=oauth_token
|
131 |
)
|
132 |
|
133 |
# If not pushing to hub generate the dataset directly
|
|
|
89 |
"You can only generate a dataset with 5000 or fewer rows. Setting to 5000."
|
90 |
)
|
91 |
|
92 |
+
if num_rows < 10:
|
93 |
duration = 60
|
94 |
+
elif num_rows < 30:
|
95 |
+
duration = 120
|
96 |
+
elif num_rows < 100:
|
97 |
+
duration = 240
|
98 |
+
elif num_rows < 300:
|
99 |
+
duration = 600
|
100 |
elif num_rows < 1000:
|
101 |
+
duration = 1200
|
102 |
else:
|
103 |
+
duration = 2400
|
104 |
|
105 |
result_queue = multiprocessing.Queue()
|
106 |
p = multiprocessing.Process(
|
|
|
131 |
repo_id=repo_id,
|
132 |
private=private,
|
133 |
include_script=False,
|
134 |
+
token=oauth_token,
|
135 |
)
|
136 |
|
137 |
# If not pushing to hub generate the dataset directly
|
src/distilabel_dataset_generator/pipelines/sft.py
CHANGED
@@ -227,7 +227,6 @@ def get_prompt_generation_step():
|
|
227 |
"temperature": 0.8,
|
228 |
"max_new_tokens": 2048,
|
229 |
"do_sample": True,
|
230 |
-
"stop_sequences": _STOP_SEQUENCES,
|
231 |
},
|
232 |
),
|
233 |
use_system_prompt=True,
|
@@ -243,7 +242,7 @@ if __name__ == "__main__":
|
|
243 |
[
|
244 |
{
|
245 |
"system_prompt": PROMPT_CREATION_PROMPT,
|
246 |
-
"instruction":
|
247 |
}
|
248 |
]
|
249 |
)
|
|
|
227 |
"temperature": 0.8,
|
228 |
"max_new_tokens": 2048,
|
229 |
"do_sample": True,
|
|
|
230 |
},
|
231 |
),
|
232 |
use_system_prompt=True,
|
|
|
242 |
[
|
243 |
{
|
244 |
"system_prompt": PROMPT_CREATION_PROMPT,
|
245 |
+
"instruction": DEFAULT_DATASET_DESCRIPTIONS[0],
|
246 |
}
|
247 |
]
|
248 |
)
|