Spaces:

datasets-topics
/

topics-generator

Sleeping

App Files Files Community

asoria HF staff committed 9 days ago

Commit

a5eff40

•

1 Parent(s): b5ecaeb

Progress bar by task

Browse files

Files changed (1) hide show

app.py +194 -152

app.py CHANGED Viewed

@@ -178,9 +178,9 @@ def generate_topics(dataset, config, split, column, plot_type):
     topics_info, topic_plot = None, None
     full_processing = split_rows <= MAX_ROWS
     message = (
-        f"⚙️ Processing full dataset: 0 of ({split_rows} rows)"
         if full_processing
-        else f"⚙️ Processing partial dataset 0 of ({limit} rows)"
     )
     sub_title = (
         f"Data map for the entire dataset ({limit} rows) using the column '{column}'"
@@ -191,48 +191,140 @@ def generate_topics(dataset, config, split, column, plot_type):
         gr.Accordion(open=False),
         gr.DataFrame(value=[], interactive=False, visible=True),
         gr.Plot(value=None, visible=True),
-        gr.Label({message: rows_processed / limit}, visible=True),
         "",
     )
-    while offset < limit:
-        docs = get_docs_from_parquet(parquet_urls, column, offset, CHUNK_SIZE)
-        if not docs:
-            break
-        logging.info(
-            f"----> Processing chunk: {offset=} {CHUNK_SIZE=} with {len(docs)} docs"
-        )
-        embeddings = calculate_embeddings(docs)
-        new_model = fit_model(docs, embeddings, n_neighbors, n_components)
-        if base_model is None:
-            base_model = new_model
-            logging.info(
-                f"The following topics are newly found: {base_model.topic_labels_}"
             )
-        else:
-            updated_model = BERTopic.merge_models([base_model, new_model])
-            nr_new_topics = len(set(updated_model.topics_)) - len(
-                set(base_model.topics_)
             )
-            new_topics = list(updated_model.topic_labels_.values())[-nr_new_topics:]
-            logging.info(f"The following topics are newly found: {new_topics}")
-            base_model = updated_model
-        reduced_embeddings = reduce_umap_model.fit_transform(embeddings)
-        reduced_embeddings_list.append(reduced_embeddings)
-        all_docs.extend(docs)
-        reduced_embeddings_array = np.vstack(reduced_embeddings_list)
-        topics_info = base_model.get_topic_info()
         all_topics = base_model.topics_
         topic_plot = (
             base_model.visualize_document_datamap(
                 docs=all_docs,
                 topics=all_topics,
                 reduced_embeddings=reduced_embeddings_array,
                 title="",
                 sub_title=sub_title,
@@ -258,137 +350,87 @@ def generate_topics(dataset, config, split, column, plot_type):
                 title="",
             )
         )
-        rows_processed += len(docs)
-        progress = min(rows_processed / limit, 1.0)
-        logging.info(f"Progress: {progress} % - {rows_processed} of {limit}")
-        message = (
-            f"⚙️ Processing full dataset: {rows_processed} of {limit}"
-            if full_processing
-            else f"⚙️ Processing partial dataset: {rows_processed} of {limit} rows"
-        )
         yield (
             gr.Accordion(open=False),
             topics_info,
             topic_plot,
-            gr.Label({message: progress}, visible=True),
             "",
         )
-        offset += CHUNK_SIZE
-        del docs, embeddings, new_model, reduced_embeddings
-    logging.info("Finished processing all data")
-    dataset_clear_name = dataset.replace("/", "-")
-    plot_png = f"{dataset_clear_name}-{plot_type.lower()}.png"
-    if plot_type == "DataMapPlot":
-        topic_plot.savefig(plot_png, format="png", dpi=300)
-    else:
-        topic_plot.write_image(plot_png)
-    all_topics = base_model.topics_
-    topic_info = base_model.get_topic_info()
-    new_topics_by_text_generation = {}
-    for _, row in topic_info.iterrows():
-        logging.info(
-            f"Processing topic: {row['Topic']} - Representation: {row['Representation']}"
         )
-        prompt = f"{REPRESENTATION_PROMPT.replace('[KEYWORDS]', ','.join(row['Representation']))}"
-        logging.info(prompt)
-        topic_description = generator(prompt)
-        logging.info(topic_description)
-        new_topics_by_text_generation[row["Topic"]] = topic_description[0][
-            "generated_text"
-        ].replace(prompt, "")
-    base_model.set_topic_labels(new_topics_by_text_generation)
-    topics_info = base_model.get_topic_info()
-    topic_plot = (
-        base_model.visualize_document_datamap(
-            docs=all_docs,
-            topics=all_topics,
-            custom_labels=True,
-            reduced_embeddings=reduced_embeddings_array,
-            title="",
-            sub_title=sub_title,
-            width=800,
-            height=700,
-            arrowprops={
-                "arrowstyle": "wedge,tail_width=0.5",
-                "connectionstyle": "arc3,rad=0.05",
-                "linewidth": 0,
-                "fc": "#33333377",
-            },
-            dynamic_label_size=True,
-            # label_wrap_width=12,
-            label_over_points=True,
-            max_font_size=36,
-            min_font_size=4,
         )
-        if plot_type == "DataMapPlot"
-        else base_model.visualize_documents(
-            docs=all_docs,
-            reduced_embeddings=reduced_embeddings_array,
-            custom_labels=True,
-            title="",
         )
-    )
-    custom_labels = base_model.custom_labels_
-    topic_names_array = [custom_labels[doc_topic + 1] for doc_topic in all_topics]
-    interactive_plot = datamapplot.create_interactive_plot(
-        reduced_embeddings_array,
-        topic_names_array,
-        hover_text=all_docs,
-        title=dataset,
-        sub_title=sub_title.replace(
-            "dataset",
-            f"<a href='https://huggingface.co/datasets/{dataset}/viewer/{config}/{split}' target='_blank'>dataset</a>",
-        ),
-        enable_search=True,
-        # TODO: Export data to .arrow and also serve it
-        inline_data=True,
-        # offline_data_prefix=dataset_clear_name,
-        initial_zoom_fraction=0.8,
-    )
-    html_content = str(interactive_plot)
-    html_file_path = f"{dataset_clear_name}.html"
-    with open(html_file_path, "w", encoding="utf-8") as html_file:
-        html_file.write(html_content)
-    repo_id = f"{DATASETS_TOPICS_ORGANIZATION}/{dataset_clear_name}"
-    space_id = create_space_with_content(
-        api=api,
-        repo_id=repo_id,
-        dataset_id=dataset,
-        html_file_path=html_file_path,
-        plot_file_path=plot_png,
-        space_card=SPACE_REPO_CARD_CONTENT,
-        token=HF_TOKEN,
-    )
-    space_link = f"https://huggingface.co/spaces/{space_id}"
-    yield (
-        gr.Accordion(open=False),
-        topics_info,
-        topic_plot,
-        gr.Label(
-            {f"✅ Done: {rows_processed} rows have been processed": 1.0}, visible=True
-        ),
-        f"[![Go to interactive plot](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Space-blue)]({space_link})",
-    )
-    del reduce_umap_model, all_docs, reduced_embeddings_list
-    del (
-        base_model,
-        all_topics,
-        topic_info,
-        topic_names_array,
-        interactive_plot,
-    )
-    cuda.empty_cache()
 with gr.Blocks() as demo:
@@ -437,11 +479,11 @@ with gr.Blocks() as demo:
         generate_button = gr.Button("Generate Topics", variant="primary")
     gr.Markdown("## Data map")
-    full_topics_generation_label = gr.Label(visible=False, show_label=False)
     open_space_label = gr.Markdown()
     topics_plot = gr.Plot()
-    with gr.Accordion("Topics Info", open=False):
-        topics_df = gr.DataFrame(interactive=False, visible=True)
     gr.HTML(
         f"<p style='text-align: center; color:orange;'>⚠ This space processes datasets in batches of <b>{CHUNK_SIZE}</b>, with a maximum of <b>{MAX_ROWS}</b> rows. If you need further assistance, please open a new issue in the Community tab.</p>"
     )
@@ -463,7 +505,7 @@ with gr.Blocks() as demo:
             data_details_accordion,
             topics_df,
             topics_plot,
-            full_topics_generation_label,
             open_space_label,
         ],
     )

     topics_info, topic_plot = None, None
     full_processing = split_rows <= MAX_ROWS
     message = (
+        f"Processing topics for full dataset: 0 of ({split_rows} rows)"
         if full_processing
+        else f"Processing topics for partial dataset 0 of ({limit} rows)"
     )
     sub_title = (
         f"Data map for the entire dataset ({limit} rows) using the column '{column}'"
         gr.Accordion(open=False),
         gr.DataFrame(value=[], interactive=False, visible=True),
         gr.Plot(value=None, visible=True),
+        gr.Label({"⏳ " + message: 0.0}, visible=True),
         "",
     )
+    try:
+        while offset < limit:
+            docs = get_docs_from_parquet(parquet_urls, column, offset, CHUNK_SIZE)
+            if not docs:
+                break
+            logging.info(
+                f"----> Processing chunk: {offset=} {CHUNK_SIZE=} with {len(docs)} docs"
+            )
+            embeddings = calculate_embeddings(docs)
+            new_model = fit_model(docs, embeddings, n_neighbors, n_components)
+            if base_model is None:
+                base_model = new_model
+                logging.info(
+                    f"The following topics are newly found: {base_model.topic_labels_}"
+                )
+            else:
+                updated_model = BERTopic.merge_models([base_model, new_model])
+                nr_new_topics = len(set(updated_model.topics_)) - len(
+                    set(base_model.topics_)
+                )
+                new_topics = list(updated_model.topic_labels_.values())[-nr_new_topics:]
+                logging.info(f"The following topics are newly found: {new_topics}")
+                base_model = updated_model
+            reduced_embeddings = reduce_umap_model.fit_transform(embeddings)
+            reduced_embeddings_list.append(reduced_embeddings)
+            all_docs.extend(docs)
+            reduced_embeddings_array = np.vstack(reduced_embeddings_list)
+            topics_info = base_model.get_topic_info()
+            all_topics = base_model.topics_
+            topic_plot = (
+                base_model.visualize_document_datamap(
+                    docs=all_docs,
+                    topics=all_topics,
+                    reduced_embeddings=reduced_embeddings_array,
+                    title="",
+                    sub_title=sub_title,
+                    width=800,
+                    height=700,
+                    arrowprops={
+                        "arrowstyle": "wedge,tail_width=0.5",
+                        "connectionstyle": "arc3,rad=0.05",
+                        "linewidth": 0,
+                        "fc": "#33333377",
+                    },
+                    dynamic_label_size=True,
+                    # label_wrap_width=12,
+                    label_over_points=True,
+                    max_font_size=36,
+                    min_font_size=4,
+                )
+                if plot_type == "DataMapPlot"
+                else base_model.visualize_documents(
+                    docs=all_docs,
+                    reduced_embeddings=reduced_embeddings_array,
+                    custom_labels=True,
+                    title="",
+                )
             )
+            rows_processed += len(docs)
+            progress = min(rows_processed / limit, 1.0)
+            logging.info(f"Progress: {progress} % - {rows_processed} of {limit}")
+            message = (
+                f"Processing topics for full dataset: {rows_processed} of {limit}"
+                if full_processing
+                else f"Processing topics for partial dataset: {rows_processed} of {limit} rows"
             )
+            yield (
+                gr.Accordion(open=False),
+                topics_info,
+                topic_plot,
+                gr.Label({"⏳ " + message: progress}, visible=True),
+                "",
+            )
+            offset += CHUNK_SIZE
+            del docs, embeddings, new_model, reduced_embeddings
+        logging.info("Finished processing topic modeling data")
+        yield (
+            gr.Accordion(open=False),
+            topics_info,
+            topic_plot,
+            gr.Label(
+                {
+                    "✅ " + message: 1.0,
+                    f"⏳ Generating topic names with {model_id}": 0.0,
+                },
+                visible=True,
+            ),
+            "",
+        )
+        dataset_clear_name = dataset.replace("/", "-")
+        plot_png = f"{dataset_clear_name}-{plot_type.lower()}.png"
+        if plot_type == "DataMapPlot":
+            topic_plot.savefig(plot_png, format="png", dpi=300)
+        else:
+            topic_plot.write_image(plot_png)
         all_topics = base_model.topics_
+        topics_info = base_model.get_topic_info()
+        new_topics_by_text_generation = {}
+        for _, row in topics_info.iterrows():
+            logging.info(
+                f"Processing topic: {row['Topic']} - Representation: {row['Representation']}"
+            )
+            prompt = f"{REPRESENTATION_PROMPT.replace('[KEYWORDS]', ','.join(row['Representation']))}"
+            logging.info(prompt)
+            topic_description = generator(prompt)
+            logging.info(topic_description)
+            new_topics_by_text_generation[row["Topic"]] = topic_description[0][
+                "generated_text"
+            ].replace(prompt, "")
+        base_model.set_topic_labels(new_topics_by_text_generation)
+        topics_info = base_model.get_topic_info()
         topic_plot = (
             base_model.visualize_document_datamap(
                 docs=all_docs,
                 topics=all_topics,
+                custom_labels=True,
                 reduced_embeddings=reduced_embeddings_array,
                 title="",
                 sub_title=sub_title,
                 title="",
             )
         )
+        custom_labels = base_model.custom_labels_
+        topic_names_array = [custom_labels[doc_topic + 1] for doc_topic in all_topics]
         yield (
             gr.Accordion(open=False),
             topics_info,
             topic_plot,
+            gr.Label(
+                {
+                    "✅ " + message: 1.0,
+                    f"✅ Generating topic names with {model_id}": 1.0,
+                    "⏳ Creating Interactive Space": 0.0,
+                },
+                visible=True,
+            ),
             "",
         )
+        interactive_plot = datamapplot.create_interactive_plot(
+            reduced_embeddings_array,
+            topic_names_array,
+            hover_text=all_docs,
+            title=dataset,
+            sub_title=sub_title.replace(
+                "dataset",
+                f"<a href='https://huggingface.co/datasets/{dataset}/viewer/{config}/{split}' target='_blank'>dataset</a>",
+            ),
+            enable_search=True,
+            # TODO: Export data to .arrow and also serve it
+            inline_data=True,
+            # offline_data_prefix=dataset_clear_name,
+            initial_zoom_fraction=0.8,
+        )
+        html_content = str(interactive_plot)
+        html_file_path = f"{dataset_clear_name}.html"
+        with open(html_file_path, "w", encoding="utf-8") as html_file:
+            html_file.write(html_content)
+        repo_id = f"{DATASETS_TOPICS_ORGANIZATION}/{dataset_clear_name}"
+        space_id = create_space_with_content(
+            api=api,
+            repo_id=repo_id,
+            dataset_id=dataset,
+            html_file_path=html_file_path,
+            plot_file_path=plot_png,
+            space_card=SPACE_REPO_CARD_CONTENT,
+            token=HF_TOKEN,
+        )
+        space_link = f"https://huggingface.co/spaces/{space_id}"
+        yield (
+            gr.Accordion(open=False),
+            topics_info,
+            topic_plot,
+            gr.Label(
+                {
+                    "✅ " + message: 1.0,
+                    f"✅ Generating topic names with {model_id}": 1.0,
+                    "✅ Creating Interactive Space": 1.0,
+                },
+                visible=True,
+            ),
+            f"[![Go to interactive plot](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Space-blue)]({space_link})",
         )
+        del reduce_umap_model, all_docs, reduced_embeddings_list
+        del (
+            base_model,
+            all_topics,
+            topic_info,
+            topic_names_array,
+            interactive_plot,
         )
+        cuda.empty_cache()
+    except Exception as error:
+        return (
+            gr.Accordion(open=True),
+            gr.DataFrame(value=[], interactive=False, visible=True),
+            gr.Plot(value=None, visible=True),
+            gr.Label({f"❌ Error: {error}": 0.0}, visible=True),
+            "",
         )
 with gr.Blocks() as demo:
         generate_button = gr.Button("Generate Topics", variant="primary")
     gr.Markdown("## Data map")
+    progress_label = gr.Label(visible=False, show_label=False)
     open_space_label = gr.Markdown()
     topics_plot = gr.Plot()
+    # with gr.Accordion("Topics Info", open=False):
+    topics_df = gr.DataFrame(interactive=False, visible=True)
     gr.HTML(
         f"<p style='text-align: center; color:orange;'>⚠ This space processes datasets in batches of <b>{CHUNK_SIZE}</b>, with a maximum of <b>{MAX_ROWS}</b> rows. If you need further assistance, please open a new issue in the Community tab.</p>"
     )
             data_details_accordion,
             topics_df,
             topics_plot,
+            progress_label,
             open_space_label,
         ],
     )