asoria (HF staff) committed
Commit: bf92466
Parent: a5eff40

Replace model with inference client + llama3

Files changed (3):
  1. app.py +34 -49
  2. requirements.txt +0 -1
  3. src/templates.py +10 -0
app.py CHANGED
@@ -11,21 +11,13 @@ from dotenv import load_dotenv
 from gradio_huggingfacehub_search import HuggingfaceHubSearch
 from bertopic import BERTopic
 from bertopic.representation import KeyBERTInspired
-from bertopic.representation import TextGeneration
 
-from huggingface_hub import HfApi
+from huggingface_hub import HfApi, InferenceClient
 from sklearn.feature_extraction.text import CountVectorizer
 from sentence_transformers import SentenceTransformer
-from torch import cuda, bfloat16
-from transformers import (
-    BitsAndBytesConfig,
-    AutoTokenizer,
-    AutoModelForCausalLM,
-    pipeline,
-)
 
 from src.hub import create_space_with_content
-from src.templates import REPRESENTATION_PROMPT, SPACE_REPO_CARD_CONTENT
+from src.templates import LLAMA_3_8B_PROMPT, SPACE_REPO_CARD_CONTENT
 from src.viewer_api import (
     get_split_rows,
     get_parquet_urls,
@@ -60,35 +52,13 @@ logging.basicConfig(
 
 api = HfApi(token=HF_TOKEN)
 
-bnb_config = BitsAndBytesConfig(
-    load_in_4bit=True,
-    bnb_4bit_quant_type="nf4",
-    bnb_4bit_use_double_quant=True,
-    bnb_4bit_compute_dtype=bfloat16,
-)
-
-model_id = "meta-llama/Llama-2-7b-chat-hf"
-tokenizer = AutoTokenizer.from_pretrained(model_id)
-model = AutoModelForCausalLM.from_pretrained(
-    model_id,
-    trust_remote_code=True,
-    quantization_config=bnb_config,
-    device_map="auto",
-)
-model.eval()
-generator = pipeline(
-    model=model,
-    tokenizer=tokenizer,
-    task="text-generation",
-    temperature=0.1,
-    max_new_tokens=500,
-    repetition_penalty=1.1,
-)
-
+model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
 embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
 vectorizer_model = CountVectorizer(stop_words="english")
 representation_model = KeyBERTInspired()
 
+inference_client = InferenceClient(model_id)
+
 
 def calculate_embeddings(docs):
     return embedding_model.encode(docs, show_progress_bar=True, batch_size=32)
@@ -294,13 +264,6 @@ def generate_topics(dataset, config, split, column, plot_type)
         "",
     )
 
-    dataset_clear_name = dataset.replace("/", "-")
-    plot_png = f"{dataset_clear_name}-{plot_type.lower()}.png"
-    if plot_type == "DataMapPlot":
-        topic_plot.savefig(plot_png, format="png", dpi=300)
-    else:
-        topic_plot.write_image(plot_png)
-
     all_topics = base_model.topics_
     topics_info = base_model.get_topic_info()
 
@@ -309,13 +272,27 @@ def generate_topics(dataset, config, split, column, plot_type)
         logging.info(
            f"Processing topic: {row['Topic']} - Representation: {row['Representation']}"
         )
-        prompt = f"{REPRESENTATION_PROMPT.replace('[KEYWORDS]', ','.join(row['Representation']))}"
-        logging.info(prompt)
-        topic_description = generator(prompt)
-        logging.info(topic_description)
-        new_topics_by_text_generation[row["Topic"]] = topic_description[0][
-            "generated_text"
-        ].replace(prompt, "")
+        prompt = f"{LLAMA_3_8B_PROMPT.replace('[KEYWORDS]', ','.join(row['Representation']))}"
+        prompt_messages = [
+            {
+                "role": "system",
+                "content": "You are a helpful, respectful and honest assistant for labeling topics.",
+            },
+            {"role": "user", "content": prompt},
+        ]
+        output = inference_client.chat_completion(
+            messages=prompt_messages,
+            stream=False,
+            max_tokens=500,
+            top_p=0.8,
+            seed=42,
+        )
+        inference_response = output.choices[0].message.content
+        logging.info("Inference response:")
+        logging.info(inference_response)
+        new_topics_by_text_generation[row["Topic"]] = inference_response.replace(
+            "Topic=", ""
+        ).strip()
     base_model.set_topic_labels(new_topics_by_text_generation)
 
     topics_info = base_model.get_topic_info()
@@ -350,6 +327,14 @@ def generate_topics(dataset, config, split, column, plot_type)
             title="",
         )
     )
+
+    dataset_clear_name = dataset.replace("/", "-")
+    plot_png = f"{dataset_clear_name}-{plot_type.lower()}.png"
+    if plot_type == "DataMapPlot":
+        topic_plot.savefig(plot_png, format="png", dpi=300)
+    else:
+        topic_plot.write_image(plot_png)
+
     custom_labels = base_model.custom_labels_
     topic_names_array = [custom_labels[doc_topic + 1] for doc_topic in all_topics]
     yield (
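
The heart of the change is swapping the locally loaded, 4-bit-quantized Llama-2 pipeline for a serverless InferenceClient call against Meta-Llama-3-8B-Instruct. The sketch below exercises the new labeling path in isolation; it is illustrative only: the inline user prompt is an abbreviation of what the app builds from LLAMA_3_8B_PROMPT, the keyword list is invented, and it assumes an HF_TOKEN with access to the gated Llama-3 model on the serverless Inference API.

# Standalone sketch of the new generation path (assumptions noted above).
import os
from huggingface_hub import InferenceClient

client = InferenceClient(
    "meta-llama/Meta-Llama-3-8B-Instruct", token=os.environ["HF_TOKEN"]
)

keywords = "meat, beef, eat, eating, emissions, steak, food, health, processed, chicken"
messages = [
    {
        "role": "system",
        "content": "You are a helpful, respectful and honest assistant for labeling topics.",
    },
    {
        "role": "user",
        # Abbreviated stand-in for the LLAMA_3_8B_PROMPT template.
        "content": f"I have a topic described by the keywords: '{keywords}'. "
        "Return only a short label in the form Topic=<label>.",
    },
]

output = client.chat_completion(
    messages=messages, stream=False, max_tokens=500, top_p=0.8, seed=42
)
# chat_completion returns an OpenAI-style object; the app keeps only the text
# and strips the Topic= marker that the few-shot example teaches the model to emit.
label = output.choices[0].message.content.replace("Topic=", "").strip()
print(label)

Compared with the removed BitsAndBytes setup, this moves the GPU requirement off the Space entirely; the old pipeline's temperature=0.1 has no direct counterpart here, with sampling pinned instead by top_p=0.8 and seed=42.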
requirements.txt CHANGED
@@ -15,4 +15,3 @@ pandas
 numpy
 python-dotenv
 kaleido
-transformers
src/templates.py CHANGED
@@ -22,6 +22,16 @@ Based on the information about the topic above, please create a short label of t
 
 REPRESENTATION_PROMPT = f"{SYSTEM_PROMPT}{EXAMPLE_PROMPT}{MAIN_PROMPT}"
 
+LLAMA_3_8B_PROMPT = """
+Example:
+I have a topic that is described by the following keywords: 'meat, beef, eat, eating, emissions, steak, food, health, processed, chicken'.
+Based on the information about the topic above, please create a short label of this topic. Make sure to only return the label and nothing more.
+Topic=Environmental impacts of eating meat
+Instruction:
+I have a topic that is described by the following keywords: '[KEYWORDS]'.
+Based on the information about the topic above, please create a short label of this topic. Make sure to only return the label and nothing more.
+"""
+
 SPACE_REPO_CARD_CONTENT = """
 ---
 title: {dataset_id}
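
A note on the new template: [KEYWORDS] is a literal placeholder rather than an f-string field, so app.py splices the keywords in with str.replace, and the one-shot Topic= example fixes the exact answer shape the app strips off afterwards. A small sketch of that round trip, assuming it is run from the repo root so src.templates resolves; the keyword list and the model reply are invented:

# Illustrative use of the template added above.
from src.templates import LLAMA_3_8B_PROMPT

representation = ["meat", "beef", "eat", "eating", "emissions"]  # invented
# Same substitution app.py performs; because [KEYWORDS] is plain text,
# no brace escaping or str.format is needed.
prompt = LLAMA_3_8B_PROMPT.replace("[KEYWORDS]", ",".join(representation))
print(prompt)

# The one-shot example primes the model to answer exactly "Topic=<label>",
# which is why app.py strips the prefix and surrounding whitespace.
reply = "Topic=Environmental impacts of eating meat"  # invented reply
label = reply.replace("Topic=", "").strip()
assert label == "Environmental impacts of eating meat"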