import io
import os
from typing import Dict

import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px
import stanza
import streamlit as st
from datasets import Dataset, DatasetInfo, SplitInfo, get_dataset_infos, load_dataset
from wordcloud import WordCloud

st.set_page_config(
    page_title="Eskulap Dataset",
    page_icon="🩺",
    layout="wide",
    initial_sidebar_state="expanded",
)

BASE_DATASET: str = "lion-ai/pl_med_data"

read_key = os.environ.get("HF_TOKEN", None)

# Subset config: internal name -> Polish display name and description shown in
# the UI, plus the column used as the text source for the wordcloud.
datasets_map = {
    "znany_lekarz": {
        "display_name": "Porady",
        "description": "Zbiór pytań i odpowiedzi odnośnie medycyny.",
        "primary_column": "question",
    },
    "kor_epikryzy_qa": {
        "display_name": "Dokumentacja - QA",
        "description": "Zbiór pytań i odpowiedzi do zanonimizowanej dokumentacji medycznej.",
        "primary_column": "content",
    },
    "wikipedia": {
        "display_name": "Wikipedia",
        "description": "Zbiór pytań i odpowiedzi na podstawie artykułów z Wikipedii.",
        "primary_column": "question",
    },
    "ulotki_medyczne": {
        "display_name": "Pytania farmaceutyczne",
        "description": "Zbiór pytań i odpowiedzi na podstawie ulotek medycznych.",
        "primary_column": "question",
    },
}

dataset_names_map: Dict[str, str] = {k: v["display_name"] for k, v in datasets_map.items()}
reverse_dataset_names_map: Dict[str, str] = {v: k for k, v in dataset_names_map.items()}


@st.cache_resource
def load_stanza_pipeline() -> stanza.Pipeline:
    """Build the Polish Stanza pipeline once per session."""
    return stanza.Pipeline(lang="pl", processors="tokenize,mwt,pos,lemma")


@st.cache_resource
def list_datasets() -> Dict[str, DatasetInfo]:
    """
    Retrieve the dataset info of every subset of the base dataset.

    Returns:
        Dict[str, DatasetInfo]: A mapping of subset name to its dataset info.
    """
    return get_dataset_infos(BASE_DATASET, token=read_key)


def show_examples(dataset_name: str, split: str) -> None:
    """Display the first 50 rows of the selected subset as a table."""
    dataset_name = reverse_dataset_names_map.get(dataset_name, dataset_name)
    dataset: Dataset = load_dataset(
        BASE_DATASET, dataset_name, split=f"{split}[:50]", token=read_key
    )
    st.data_editor(dataset.to_pandas(), use_container_width=True, height=900)


def count_all_examples(datasets: Dict[str, SplitInfo]) -> None:
    """Show the total number of examples across all subsets as a metric."""
    count: int = sum(split_info.num_examples for split_info in datasets.values())
    st.metric(label="Total no. of instructions", value=f"{count:,}")
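
# ---------------------------------------------------------------------------
# Note: the loaders in this app rely on Hugging Face split slicing -- passing
# split="processed[:50]" to load_dataset fetches only the first 50 rows of the
# "processed" split. A minimal standalone sketch (not called by the app;
# "wikipedia" is just one of the subset names configured above):
#
#     sample = load_dataset(BASE_DATASET, "wikipedia", split="processed[:5]",
#                           token=read_key)
#     print(sample.column_names, len(sample))  # column list and row count (5)
# ---------------------------------------------------------------------------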
""" dataset_splits: Dict[str, Dataset] = {} for dataset_name, dataset_info in dataset.items(): if split in dataset_info.splits: dataset_name = dataset_names_map.get(dataset_name, dataset_name) dataset_splits[dataset_name] = dataset_info.splits[split] return dataset_splits @st.cache_data(show_spinner=False) def generate_wordcloud(dataset_name, split): dataset_name = reverse_dataset_names_map.get(dataset_name, dataset_name) dataset: Dataset = load_dataset(BASE_DATASET, dataset_name, split=f"{split}[:500]", token=read_key) primary_column = datasets_map[dataset_name]["primary_column"] text = "" progress_bar = st.progress(0, text = "Generating wordcloud...") for i, example in enumerate(dataset[primary_column]): doc = stanza_pipeline(example) nouns = [word.lemma for sent in doc.sentences for word in sent.words if word.upos == 'NOUN'] text += " ".join(nouns) + " " progress_bar.progress((i + 1) / len(dataset[primary_column]), text = f"Generating wordcloud...") wordcloud = WordCloud(width=600, height=600, background_color='#212c2a', colormap="Greens", contour_width=0, contour_color="#212c2a").generate(text) progress_bar.empty() plt.figure(figsize=(6, 6), facecolor='#212c2a') plt.imshow(wordcloud, interpolation='bilinear') plt.axis('off') plt.tight_layout(pad=0) # Save the plot to a bytes buffer buf = io.BytesIO() plt.savefig(buf, format='png', bbox_inches='tight', pad_inches=0, facecolor='#212c2a') buf.seek(0) # Display the image in Streamlit st.image(buf, use_column_width=True) _, col, _ = st.columns([1, 2, 1]) with col: split: str = "processed" datasets: Dict[str, Dataset] = list_datasets() stanza_pipeline = load_stanza_pipeline() # st.write(datasets) filtered_datasets: Dict[str, Dataset] = filter_splits(datasets, split) # st.write(filtered_datasets) image = st.image("Eskulap.png", use_column_width=True) count_all_examples(filtered_datasets) distribution = { "dataset": list(filtered_datasets.keys()), "count": [split.num_examples for split in filtered_datasets.values()], } distribution_df = pd.DataFrame(distribution) # Create a pie chart showing the number of examples per dataset fig = px.pie( distribution_df, names="dataset", values="count", hover_name="dataset", title=f"Data distribution", labels={"label": "Dataset", "value": "Number of Examples"}, color_discrete_sequence=px.colors.sequential.Blugrn, hole=0.3, ) # Update layout for better readability # fig.update_traces(textposition="inside", textinfo="value+label") fig.update_traces(textposition='none') fig.update_layout(legend_title_text="Datasets", uniformtext_minsize=12, uniformtext_mode="hide") chart = st.plotly_chart(fig, use_container_width=True) dataset_name = st.selectbox("Select a dataset", list(filtered_datasets.keys())) st.write(f"### {dataset_name}") st.write(datasets_map[reverse_dataset_names_map.get(dataset_name)]["description"]) st.markdown("***") col1, col2 = st.columns(2) with col1: st.write(f"### Sample data") show_examples(dataset_name, split) with col2: st.write(f"### Wordcloud") generate_wordcloud(dataset_name, split) _, col, _ = st.columns([1, 2, 1]) with col: st.button("Made with 鉂わ笍 by thelion.ai", use_container_width=True, disabled=True) st.write("Intersted in the project? Contact us : contact@thelion.ai")