File size: 6,503 Bytes
0315018
ea129da
 
 
 
 
7bb750a
ea129da
 
7bb750a
 
 
 
 
 
 
 
 
 
 
 
 
ea129da
 
3b7a7ae
 
7bb750a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ea129da
 
7bb750a
 
 
ea129da
 
7bb750a
 
 
ea129da
 
 
 
 
 
 
 
 
f3c17dc
ea129da
 
 
 
 
7bb750a
 
ea129da
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7bb750a
 
 
ea129da
7bb750a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ea129da
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
import os
from functools import lru_cache
from typing import Dict, List

import plotly.express as px
import streamlit as st
import pandas as pd

from datasets import Dataset, get_dataset_infos, load_dataset
import stanza

import matplotlib.pyplot as plt
from wordcloud import WordCloud
import io

# Page-level settings: browser-tab title and icon, wide layout, sidebar open.
# Kept ahead of every other Streamlit call in this file.
# The icon is the stethoscope emoji; the previous literal was the same UTF-8
# bytes mis-decoded into unrelated CJK/box-drawing characters.
st.set_page_config(
    page_title="Eskulap Dataset",
    page_icon="🩺",
    layout="wide",
    initial_sidebar_state="expanded",
)


# Hugging Face Hub repository that every dataset call in this file targets.
BASE_DATASET: str = "lion-ai/pl_med_data"
# Access token passed to the Hub calls; None when HF_TOKEN is not set.
read_key = os.environ.get('HF_TOKEN', None)

# UI metadata per dataset configuration, keyed by the configuration name:
#   display_name   - label shown to the user instead of the raw name
#   description    - short Polish description shown under the selected dataset
#   primary_column - text column the wordcloud is built from
# The descriptions are written with proper Polish diacritics (the previous
# literals were UTF-8 text that had been decoded with the wrong codec).
datasets_map: Dict[str, Dict[str, str]] = {
    "znany_lekarz":
    {
        "display_name": "Porady",
        "description": "Zbiór pytań i odpowiedzi odnośnie medycyny.",
        "primary_column": "question",
    },
    "kor_epikryzy_qa":
    {
        "display_name": "Dokumentacja - QA",
        "description": "Zbiór pytań i odpowiedzi do zanonimizowanej dokumentacji medycznej.",
        "primary_column": "content",
    },
    "wikipedia":
    {
        "display_name": "Wikipedia",
        "description": "Zbiór pytań i odpowiedzi na podstawie artykułów z Wikipedii.",
        "primary_column": "question",
    },
    "ulotki_medyczne":
    {
        "display_name": "Pytania farmaceutyczne",
        "description": "Zbiór pytań i odpowiedzi na podstawie ulotek medycznych.",
        "primary_column": "question",
    },
}


# Configuration name -> display name.
dataset_names_map: Dict[str, str] = {k: v["display_name"] for k, v in datasets_map.items()}

# Display name -> configuration name (inverse of the mapping above).
reverse_dataset_names_map: Dict[str, str] = {v: k for k, v in dataset_names_map.items()}

@st.cache_resource
def load_stanza_pipeline():
    """
    Build the Polish Stanza pipeline used for noun extraction.

    The pipeline runs the tokenize, mwt, pos and lemma processors. The
    function is wrapped in ``st.cache_resource`` so Streamlit can hand back
    the same pipeline object instead of constructing it again.

    Returns:
        The ``stanza.Pipeline`` instance.
    """
    pipeline = stanza.Pipeline(lang="pl", processors="tokenize,mwt,pos,lemma")
    return pipeline

@st.cache_resource
def list_datasets() -> Dict[str, Dataset]:
    """
    Fetch the metadata of every configuration published under BASE_DATASET.

    The call is authenticated with ``read_key`` and wrapped in
    ``st.cache_resource``.

    Returns:
        Dict[str, Dataset]: The mapping returned by ``get_dataset_infos``,
        keyed by configuration name. Callers in this file read ``.splits``
        on each value.
    """
    infos = get_dataset_infos(BASE_DATASET, token=read_key)
    return infos


def show_examples(dataset_name: str, split: str) -> None:
    """
    Render the first 50 rows of a dataset split as an editable table.

    Args:
        dataset_name (str): Display name or raw configuration name. Display
            names are translated back through ``reverse_dataset_names_map``;
            anything else is used as given.
        split (str): Name of the split to preview.
    """
    config_name = reverse_dataset_names_map.get(dataset_name, dataset_name)
    preview_slice = f"{split}[:50]"

    preview: Dataset = load_dataset(
        BASE_DATASET,
        config_name,
        split=preview_slice,
        token=read_key,
    )
    frame = preview.to_pandas()
    st.data_editor(frame, use_container_width=True, height=900)


def count_all_examples(datasets: Dict[str, Dataset]) -> None:
    """
    Show the combined number of examples as a Streamlit metric.

    Args:
        datasets (Dict[str, Dataset]): Mapping whose values expose a
            ``num_examples`` attribute; the keys are not used.
    """
    total = sum(info.num_examples for info in datasets.values())
    # ":," inserts thousands separators into the displayed number.
    st.metric(label="Total no. of instructions", value=f"{total:,}")


def filter_splits(dataset: Dict[str, Dataset], split: str) -> Dict[str, Dataset]:
    """
    Keep only the entries that provide the requested split.

    Entries whose ``splits`` mapping lacks ``split`` are dropped. Surviving
    entries are re-keyed by their display name from ``dataset_names_map``
    (falling back to the original key when no display name is registered).

    Args:
        dataset (Dict[str, Dataset]): Mapping from dataset name to an object
            exposing a ``splits`` mapping.
        split (str): Name of the split to select.

    Returns:
        Dict[str, Dataset]: Display name -> the ``splits[split]`` entry of
        each dataset that has that split.
    """
    return {
        dataset_names_map.get(name, name): info.splits[split]
        for name, info in dataset.items()
        if split in info.splits
    }

@st.cache_data(show_spinner=False)
def generate_wordcloud(dataset_name, split):
    """
    Render a wordcloud of noun lemmas for the first 500 rows of a split.

    The text of the dataset's ``primary_column`` (see ``datasets_map``) is run
    through the module-level ``stanza_pipeline``; the lemma of every word
    tagged ``NOUN`` is collected and fed to ``WordCloud``. The resulting image
    is drawn with matplotlib, written to an in-memory PNG and shown with
    ``st.image``. A progress bar is displayed while the rows are processed.

    Args:
        dataset_name: Display name or raw configuration name of the dataset.
        split: Name of the split to read from.

    Raises:
        KeyError: If the resolved configuration name is not in ``datasets_map``.
    """
    dataset_name = reverse_dataset_names_map.get(dataset_name, dataset_name)

    dataset: Dataset = load_dataset(BASE_DATASET, dataset_name, split=f"{split}[:500]", token=read_key)

    primary_column = datasets_map[dataset_name]["primary_column"]

    # Read the column once: indexing the dataset inside the loop repeated the
    # column lookup on every iteration just to compute its length.
    examples = dataset[primary_column]
    total = len(examples)

    nouns: List[str] = []
    progress_bar = st.progress(0, text="Generating wordcloud...")
    try:
        for position, example in enumerate(examples, start=1):
            doc = stanza_pipeline(example)
            nouns.extend(
                word.lemma
                for sent in doc.sentences
                for word in sent.words
                if word.upos == 'NOUN'
            )
            progress_bar.progress(position / total, text="Generating wordcloud...")

        if not nouns:
            # WordCloud.generate() raises ValueError when given no words.
            st.info("No nouns found in this dataset sample - nothing to plot.")
            return

        wordcloud = WordCloud(
            width=600,
            height=600,
            background_color='#212c2a',
            colormap="Greens",
            contour_width=0,
            contour_color="#212c2a",
        ).generate(" ".join(nouns))
    finally:
        # Remove the progress bar on success and on failure alike.
        progress_bar.empty()

    # Save the plot to a bytes buffer
    buf = io.BytesIO()
    fig = plt.figure(figsize=(6, 6), facecolor='#212c2a')
    try:
        ax = fig.gca()
        ax.imshow(wordcloud, interpolation='bilinear')
        ax.axis('off')
        fig.tight_layout(pad=0)
        fig.savefig(buf, format='png', bbox_inches='tight', pad_inches=0, facecolor='#212c2a')
    finally:
        # pyplot keeps every figure alive until it is closed explicitly.
        plt.close(fig)
    buf.seek(0)

    # Display the image in Streamlit
    st.image(buf, use_column_width=True)


# --- Page body -------------------------------------------------------------
# Centered header column: logo, total count, distribution chart, dataset picker.
_, col, _ = st.columns([1, 2, 1])

with col:
    split: str = "processed"

    datasets: Dict[str, Dataset] = list_datasets()
    # Must stay a module-level name: generate_wordcloud() reads it as a global.
    stanza_pipeline = load_stanza_pipeline()

    filtered_datasets: Dict[str, Dataset] = filter_splits(datasets, split)
    st.image("Eskulap.png", use_column_width=True)

    count_all_examples(filtered_datasets)

    distribution = {
        "dataset": list(filtered_datasets.keys()),
        "count": [split_info.num_examples for split_info in filtered_datasets.values()],
    }

    distribution_df = pd.DataFrame(distribution)

    # Create a pie chart showing the number of examples per dataset.
    # `labels` is keyed by the dataframe column names so the friendly names
    # are actually applied.
    fig = px.pie(
        distribution_df,
        names="dataset",
        values="count",
        hover_name="dataset",
        title="Data distribution",
        labels={"dataset": "Dataset", "count": "Number of Examples"},
        color_discrete_sequence=px.colors.sequential.Blugrn,
        hole=0.3,
    )

    # Hide the in-slice text; the legend and hover carry the information.
    fig.update_traces(textposition='none')
    fig.update_layout(legend_title_text="Datasets", uniformtext_minsize=12, uniformtext_mode="hide")

    st.plotly_chart(fig, use_container_width=True)

    dataset_name = st.selectbox("Select a dataset", list(filtered_datasets.keys()))
    if dataset_name is None:
        # No dataset offers the requested split, so there is nothing to show.
        st.warning(f"No dataset provides a '{split}' split.")
        st.stop()

    st.write(f"### {dataset_name}")
    # filter_splits() falls back to the raw configuration name when no display
    # name is registered, so the selected name may be absent from datasets_map.
    dataset_key = reverse_dataset_names_map.get(dataset_name, dataset_name)
    dataset_meta = datasets_map.get(dataset_key)
    if dataset_meta is not None:
        st.write(dataset_meta["description"])
st.markdown("***")
col1, col2 = st.columns(2)
with col1:
    st.write("### Sample data")
    show_examples(dataset_name, split)


with col2:
    st.write("### Wordcloud")
    if dataset_meta is not None:
        generate_wordcloud(dataset_name, split)
    else:
        # generate_wordcloud() needs the primary_column entry from datasets_map.
        st.info("No wordcloud is available for this dataset.")

# Centered footer column.
_, col, _ = st.columns([1, 2, 1])


with col:
    st.button("Made with ❤️ by thelion.ai", use_container_width=True, disabled=True)
    st.write("Interested in the project? Contact us : contact@thelion.ai")