fake-data-generator-jsonl / gradio_app.py
lhoestq's picture
lhoestq HF staff
cleaning
6b97460
raw
history blame
2.27 kB
import time
from urllib.parse import urlparse, parse_qs
import gradio as gr
import io
import pandas as pd
import spaces
from generate import stream_jsonl_file
MAX_SIZE = 20
DEFAULT_SEED = 42
DEFAULT_SIZE = 3
@spaces.GPU(duration=120)
def stream_output(filename: str):
parsed_filename = urlparse(filename)
filename = parsed_filename.path
params = parse_qs(parsed_filename.query)
prompt = params["prompt"][0] if "prompt" in params else ""
columns = [column.strip() for column in params["columns"][0].split(",") if column.strip()] if "columns" in params else []
size = int(params["size"][0]) if "size" in params else DEFAULT_SIZE
seed = int(params["seed"][0]) if "seed" in params else DEFAULT_SEED
if size > MAX_SIZE:
yield None, None, "Error: Maximum size is 20"
content = ""
start_time = time.time()
for i, chunk in enumerate(stream_jsonl_file(
filename=filename,
prompt=prompt,
columns=columns,
seed=seed,
size=size,
)):
content += chunk
df = pd.read_json(io.StringIO(content), lines=True)
state_msg = (
f"βœ… Done generating {size} samples in {time.time() - start_time:.2f}s"
if i + 1 == size else
f"βš™οΈ Generating... [{i + 1}/{size}]"
)
yield df, "```json\n" + content + "\n```", state_msg
title = "LLM DataGen"
description = "Generate and stream synthetic dataset files in JSON Lines format"
examples = [
"movies_data.jsonl",
"dungeon_and_dragon_characters.jsonl"
"bad_amazon_reviews_on_defunct_products_that_people_hate.jsonl",
"common_first_names.jsonl?columns=first_name,popularity&size=10",
]
with gr.Blocks() as demo:
gr.Markdown(f"# {title}")
gr.Markdown(description)
filename_comp = gr.Textbox(examples[0], placeholder=examples[0])
gr.Examples(examples, filename_comp)
generate_button = gr.Button("Generate dataset")
state_msg_comp = gr.Markdown("πŸ”₯ Ready to generate")
with gr.Tab("Dataset"):
dataframe_comp = gr.DataFrame()
with gr.Tab("File content"):
file_content_comp = gr.Markdown()
generate_button.click(stream_output, filename_comp, [dataframe_comp, file_content_comp, state_msg_comp])
demo.launch()