import io
import time
from urllib.parse import parse_qs, urlparse

import gradio as gr
import pandas as pd
import spaces

from generate import stream_jsonl_file

MAX_SIZE = 20
DEFAULT_SEED = 42
DEFAULT_SIZE = 3


def _parse_request(filename: str) -> tuple[str, str, list[str], int, int]:
    """Split a ``name.jsonl?key=value&...`` request into its components.

    Returns ``(path, prompt, columns, size, seed)``. Missing query parameters
    fall back to ``""``, ``[]``, ``DEFAULT_SIZE`` and ``DEFAULT_SEED``.

    Raises:
        ValueError: if ``size`` or ``seed`` is present but not an integer.
    """
    parsed = urlparse(filename)
    params = parse_qs(parsed.query)
    prompt = params.get("prompt", [""])[0]
    # Comma-separated column names; blank entries (e.g. "a,,b") are dropped.
    raw_columns = params.get("columns", [""])[0]
    columns = [name.strip() for name in raw_columns.split(",") if name.strip()]
    size = int(params["size"][0]) if "size" in params else DEFAULT_SIZE
    seed = int(params["seed"][0]) if "seed" in params else DEFAULT_SEED
    return parsed.path, prompt, columns, size, seed


@spaces.GPU(duration=120)
def stream_output(filename: str):
    """Stream a generated JSON Lines dataset to the UI.

    ``filename`` may carry URL-style query parameters (``prompt``, ``columns``,
    ``size``, ``seed``) after a ``?``.

    Yields:
        ``(dataframe, markdown_file_content, status_message)`` tuples, one per
        chunk received from ``stream_jsonl_file``. If ``size`` exceeds
        ``MAX_SIZE`` a single ``(None, None, error_message)`` tuple is yielded
        and nothing is generated.
    """
    filename, prompt, columns, size, seed = _parse_request(filename)
    if size > MAX_SIZE:
        yield None, None, f"Error: Maximum size is {MAX_SIZE}"
        # Stop here: without this return the oversized request would still
        # be sent to the generator after the error was displayed.
        return

    content = ""
    start_time = time.time()
    chunks = stream_jsonl_file(
        filename=filename,
        prompt=prompt,
        columns=columns,
        seed=seed,
        size=size,
    )
    for i, chunk in enumerate(chunks):
        content += chunk
        # Re-parse everything received so far so the table grows as chunks
        # arrive. NOTE(review): assumes every chunk ends on a complete JSON
        # line -- confirm against stream_jsonl_file.
        df = pd.read_json(io.StringIO(content), lines=True)
        state_msg = (
            f"✅ Done generating {size} samples in {time.time() - start_time:.2f}s"
            if i + 1 == size
            else f"⚙️ Generating... [{i + 1}/{size}]"
        )
        yield df, "```json\n" + content + "\n```", state_msg


title = "LLM DataGen"
description = "Generate and stream synthetic dataset files in JSON Lines format"
examples = [
    "movies_data.jsonl",
    "dungeon_and_dragon_characters.jsonl",
    "bad_amazon_reviews_on_defunct_products_that_people_hate.jsonl",
    "common_first_names.jsonl?columns=first_name,popularity&size=10",
]

with gr.Blocks() as demo:
    gr.Markdown(f"# {title}")
    gr.Markdown(description)
    filename_comp = gr.Textbox(examples[0], placeholder=examples[0])
    gr.Examples(examples, filename_comp)
    generate_button = gr.Button("Generate dataset")
    state_msg_comp = gr.Markdown("🔥 Ready to generate")
    with gr.Tab("Dataset"):
        dataframe_comp = gr.DataFrame()
    with gr.Tab("File content"):
        file_content_comp = gr.Markdown()

    # The three outputs match the tuple order yielded by stream_output.
    generate_button.click(
        stream_output,
        filename_comp,
        [dataframe_comp, file_content_comp, state_msg_comp],
    )


demo.launch()