import fasttext import gradio as gr import joblib import json as js import omikuji import os import re from collections import defaultdict from huggingface_hub import snapshot_download from typing import List, Tuple, Dict from install_packages import download_model download_model('https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin', 'lid.176.bin') # Download the model files from Hugging Face for repo_id in ['kapllan/omikuji-bonsai-parliament-de-spacy', 'kapllan/omikuji-bonsai-parliament-fr-spacy', 'kapllan/omikuji-bonsai-parliament-it-spacy']: if not os.path.exists(repo_id): os.makedirs(repo_id) model_dir = snapshot_download(repo_id=repo_id, local_dir=repo_id) lang_model = fasttext.load_model('lid.176.bin') with open('./id2label.json', 'r') as f: id2label = js.load(f) with open('topics_hierarchy.json', 'r') as f: topics_hierarchy = js.load(f) def map_language(language: str) -> str: language_mapping = {'de': 'German', 'it': 'Italian', 'fr': 'French'} if language in language_mapping.keys(): return language_mapping[language] else: return language def find_model(language: str): vectorizer, model = None, None if language in ['de', 'fr', 'it']: path_to_vectorizer = f'./kapllan/omikuji-bonsai-parliament-{language}-spacy/vectorizer' path_to_model = f'./kapllan/omikuji-bonsai-parliament-{language}-spacy/omikuji-model' vectorizer = joblib.load(path_to_vectorizer) model = omikuji.Model.load(path_to_model) return vectorizer, model def predict_lang(text: str) -> str: text = re.sub(r'\n', '', text) # Remove linebreaks because fasttext cannot process that otherwise predictions = lang_model.predict(text, k=1) # returns top 2 matching languages language = predictions[0][0] # returns top 2 matching languages language = re.sub(r'__label__', '', language) # returns top 2 matching languages return language def predict_topic(text: str) -> [List[str], str]: results = [] language = predict_lang(text) vectorizer, model = find_model(language) language = map_language(language) if vectorizer is not None: texts = [text] vector = vectorizer.transform(texts) for row in vector: if row.nnz == 0: # All zero vector, empty result continue feature_values = [(col, row[0, col]) for col in row.nonzero()[1]] for subj_id, score in model.predict(feature_values, top_k=1000): results.append((id2label[str(subj_id)], score)) return results, language def get_row_color(type: str): if 'main' in type.lower(): return 'background-color: darkgrey;' if 'sub' in type.lower(): return 'background-color: lightgrey;' def generate_html_table(topics: List[Tuple[str, str, float]]): html = '
Type | Topic | Score |
---|---|---|
{type} | {topic} | {score} |