Upload 10 files

- data_sum_extract.csv +0 -0
- data_tr_extract.csv +0 -0
- presentation.py +126 -0
- streamlit_presentation/__pycache__/analyse.cpython-39.pyc +0 -0
- streamlit_presentation/__pycache__/modele.cpython-39.pyc +0 -0
- streamlit_presentation/__pycache__/preprocessing.cpython-39.pyc +0 -0
- streamlit_presentation/__pycache__/stats.cpython-39.pyc +0 -0
- streamlit_presentation/analyse.py +66 -0
- streamlit_presentation/modele.py +53 -0
- streamlit_presentation/preprocessing.py +35 -0
data_sum_extract.csv
ADDED
The diff for this file is too large to render.

data_tr_extract.csv
ADDED
The diff for this file is too large to render.
presentation.py
ADDED
@@ -0,0 +1,126 @@
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import importlib

from st_on_hover_tabs import on_hover_tabs

import streamlit as st

import streamlit_presentation
import streamlit_presentation.analyse
importlib.reload(streamlit_presentation.analyse)
from streamlit_presentation.analyse import repartition_par_categorie
from streamlit_presentation.analyse import repartition_longueur_categorie

import streamlit_presentation.preprocessing
importlib.reload(streamlit_presentation.preprocessing)
from streamlit_presentation.preprocessing import detection_langage_et_traduction

import streamlit_presentation.modele
importlib.reload(streamlit_presentation.modele)
from streamlit_presentation.modele import presentation_modele
from sklearn.metrics import f1_score

# Global matplotlib settings
plt.rcParams['font.size'] = 12
plt.rcParams['axes.labelsize'] = 10
plt.rcParams['axes.titlesize'] = 12
plt.rcParams['xtick.labelsize'] = 8
plt.rcParams['ytick.labelsize'] = 8
plt.rcParams['legend.fontsize'] = 8
plt.rcParams['lines.linewidth'] = 1

# Load the datasets used by the app
data = pd.read_csv('data.csv')
extract_data = pd.read_csv('data_tr_extract.csv')
sum_data = pd.read_csv('data_sum_extract.csv')
test_data = pd.read_pickle('data_test.pkl')

from keras.models import load_model
import tensorflow as tf
from tensorflow.keras import backend as K
import ast


def f1_weighted(true, pred):
    # One-hot encode the integer labels over the 27 classes
    true = K.one_hot(K.cast(true, 'int32'), 27)

    # True positives, false positives and false negatives per class
    tp = K.dot(K.transpose(true), K.round(pred))
    fp = K.dot(K.transpose(1 - true), K.round(pred))
    fn = K.dot(K.transpose(true), 1 - K.round(pred))

    # Per-class precision, recall and F1 score
    p = tp / (tp + fp + K.epsilon())
    r = tp / (tp + fn + K.epsilon())
    f1 = 2 * p * r / (p + r + K.epsilon())

    # Average the F1 scores, weighted by class support
    weighted_f1 = K.sum(f1 * K.sum(true, axis=0) / K.sum(true))
    return weighted_f1


# The custom metric must be passed back in when loading the model
model = load_model("final_model_kfold.h5", custom_objects={'f1_weighted': f1_weighted})


from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
print(test_data.columns)
y_test = encoder.fit_transform(test_data["prdtypecode"])
class_labels = encoder.classes_
label_size = 27


####### Main page
st.set_page_config(layout="wide")
st.markdown('<style>' + open('./style.css').read() + '</style>', unsafe_allow_html=True)

st.title("My Application")

with st.sidebar:
    tabs = on_hover_tabs(tabName=['Introduction', "Analysis", "Preprocessing", "Model", "Exploratory avenues"],
                         iconName=['apps', 'bar_chart', "sync", "memory", "topic"], default_choice=0)

st.markdown("""
    <style>
    .rounded-border-parent {
        border-radius: 15px !important;
        border: 1px solid blue !important;
        background-color: lightgray !important;
    }
    </style>
    """, unsafe_allow_html=True)


if tabs == "Introduction":
    st.write("# Introduction")
    st.write("Here")

elif tabs == "Analysis":
    st.write("# Analysis")

    st.dataframe(data.head(30))
    st.write("")

    repartition_par_categorie(st, data)
    repartition_longueur_categorie(st, data)

elif tabs == "Preprocessing":
    detection_langage_et_traduction(st, extract_data, sum_data)

elif tabs == "Model":
    presentation_modele(st, test_data, model, class_labels, y_test)

elif tabs == "Exploratory avenues":
    st.write("# Exploratory avenues")
    st.write("Here")
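Aside: the LabelEncoder round-trip above is what ties the model's 0-26 output ids back to the original prdtypecode values, and encoder.classes_ is exactly the array later used to label the confusion-matrix axes in modele.py. A minimal standalone sketch (the code values below are made up for illustration, not taken from the dataset):

from sklearn.preprocessing import LabelEncoder
import numpy as np

codes = np.array([10, 2280, 50, 1280, 10, 2280])   # hypothetical prdtypecode values
encoder = LabelEncoder()
ids = encoder.fit_transform(codes)                 # -> [0, 3, 1, 2, 0, 3]
print(encoder.classes_)                            # sorted unique codes: [10, 50, 1280, 2280]
print(encoder.inverse_transform([3, 0]))           # -> [2280, 10]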
streamlit_presentation/__pycache__/analyse.cpython-39.pyc
ADDED
Binary file (2.56 kB).

streamlit_presentation/__pycache__/modele.cpython-39.pyc
ADDED
Binary file (1.89 kB).

streamlit_presentation/__pycache__/preprocessing.cpython-39.pyc
ADDED
Binary file (2.6 kB).

streamlit_presentation/__pycache__/stats.cpython-39.pyc
ADDED
Binary file (2.56 kB).

streamlit_presentation/analyse.py
ADDED
@@ -0,0 +1,66 @@
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

def repartition_par_categorie(st, data):
    val_code = data['prdtypecode'].unique()

    # Share of each product code, as a percentage of the corpus
    target_count = (data["prdtypecode"].value_counts(normalize=True) * 100).reset_index()
    target_count.columns = ["prdtypecode", "pourcentage"]

    plt.figure(figsize=(10, 3))
    ax = sns.barplot(x="prdtypecode", y="pourcentage", data=target_count)

    # Reference line: the share each code would have if the classes were uniform
    ax.axhline(y=100 / len(val_code), color="green", linewidth=2, alpha=0.5)

    plt.xticks(rotation=45)
    plt.xlabel('Product code')
    plt.ylabel('Percentage')
    plt.grid()
    plt.title("Distribution of the target values")

    # Display the plot with Streamlit
    col1, col2, col3 = st.columns([6, 1, 3])
    with col1:
        st.pyplot(plt)

    with col3:
        st.markdown('<div class="rounded-border"></div>', unsafe_allow_html=True)
        st.write("\n\n\n\n\n\n")
        st.write("The most frequent category accounts for 12% of the corpus.")
        st.write("If the dataset were uniformly distributed:")
        st.write(f"=> each code would account for {100/len(val_code):.2f}% of the dataset")


def repartition_longueur_categorie(st, data):
    data["designation_length"] = data["designation"].str.len()
    data["description_length"] = data["description"].str.len()

    plt.figure(figsize=(10, 4))
    ax = sns.histplot(x='designation_length', data=data, bins=50)
    # Mark the mean length (vertical line, since length is on the x axis)
    ax.axvline(data["designation_length"].mean(), color="r", linewidth=2, alpha=0.5)

    plt.xticks(rotation=45)
    plt.xlabel("Designation length in characters")
    plt.ylabel("Number of occurrences")
    plt.grid()
    plt.title("Distribution of designation lengths")

    col1, col2, col3 = st.columns([2, 6, 3])
    with col1:
        st.write(data["designation_length"].describe())

    with col2:
        st.pyplot(plt)

    with col3:
        # Vertical spacing before the comments
        for _ in range(6):
            st.text('')
        st.write(f'=> Designation length ranges from {data["designation_length"].min()} to {data["designation_length"].max()} characters')
        st.write("most of the distribution lies between 45 and 100 characters, with a spike at 250 characters")
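For reference, a toy run of the percentage computation used in repartition_par_categorie, on made-up codes (value_counts sorts by frequency, descending):

import pandas as pd

s = pd.Series([10, 10, 50, 2280])                  # hypothetical prdtypecode column
target_count = (s.value_counts(normalize=True) * 100).reset_index()
target_count.columns = ["prdtypecode", "pourcentage"]
print(target_count)
#    prdtypecode  pourcentage
# 0           10         50.0
# 1           50         25.0
# 2         2280         25.0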
streamlit_presentation/modele.py
ADDED
@@ -0,0 +1,53 @@
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix

def presentation_modele(st, data, model, class_labels, y_test):

    st.write('Our model takes the CamemBERT embeddings of the descriptions and designations (separately), the FlauBERT embeddings of the descriptions, the ViT embeddings of the images, and the lengths of the text fields.')

    # Diagram of the model architecture
    st.image("model.png", use_column_width=True)
    # TODO: show an excerpt of the embeddings

    # Button that triggers inference on the test set
    if st.button("Predict"):
        X1_test = np.stack(data["embeddings_desi"].values).astype(np.float32)
        X2_test = np.stack(data["embeddings_desc"].values).astype(np.float32)
        X3_test = np.stack(data["embedding_vit"].values).astype(np.float32)
        X4_test = data["designation_length_normalized"].values
        X5_test = data["description_length_normalized"].values
        X6_test = np.stack(data["embeddings_desi_Flaubert"].values).astype(np.float32)
        y_pred = model.predict([X1_test, X2_test, X3_test, X4_test, X5_test, X6_test])
        y_pred_ids = np.argmax(y_pred, axis=-1)

        weighted_f1_score = f1_score(y_test, y_pred_ids, average='weighted')
        st.write("weighted F1 score:", weighted_f1_score)

        conf_matrix = confusion_matrix(y_test, y_pred_ids)

        # Normalize each row (true class) so that it sums to 100%
        row_sums = conf_matrix.sum(axis=1)
        normalized_conf_matrix = conf_matrix / row_sums[:, np.newaxis] * 100

        st.title("Normalized Confusion Matrix")
        plt.figure(figsize=(10, 10))
        sns.heatmap(normalized_conf_matrix, annot=True, cmap='Blues', fmt='.0f',
                    xticklabels=class_labels,
                    yticklabels=class_labels,
                    linewidths=1.5)
        plt.xlabel('Predicted')
        plt.ylabel('Actual')
        plt.title('Confusion Matrix')
        st.pyplot(plt)

        # Sample of the test data
        st.dataframe(data.head(10))
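A quick standalone check of the row normalization used for the confusion matrix, on toy labels (not from the dataset): each row of the normalized matrix sums to 100, i.e. the percentages are relative to the true class.

import numpy as np
from sklearn.metrics import confusion_matrix

y_true = [0, 0, 1, 1, 2, 2]                        # toy labels
y_pred = [0, 1, 1, 1, 2, 0]
cm = confusion_matrix(y_true, y_pred)
norm = cm / cm.sum(axis=1)[:, np.newaxis] * 100    # normalize by true-class counts
print(norm.sum(axis=1))                            # -> [100. 100. 100.]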
streamlit_presentation/preprocessing.py
ADDED
@@ -0,0 +1,35 @@
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


def detection_langage_et_traduction(st, data, sum_data):

    # Rows whose designation or description was not detected as French
    data_lang = data[(data["desi_langue"] != "fr") | (data["desc_langue"] != "fr")]
    data_lang = data_lang[["designation", "desi_langue", "tr_designation", "description", "desc_langue", "tr_description"]]
    st.write('FastText is used to detect the language: fields "desi_langue" and "desc_langue"')
    st.dataframe(data_lang)
    st.write("We note that the language is not always detected correctly: we accept this issue, because translating the text without providing the source language gives noticeably worse results.")

    st.markdown("---")
    st.write("")
    st.subheader("Summary generation")
    st.write("")
    st.write("Some descriptions exceed the model's input token limit, so rather than truncating the text blindly, we choose to summarize the descriptions.")
    st.write("")
    st.markdown("The Barthez model [moussaKam/barthez-orangesum-abstract](https://huggingface.co/moussaKam/barthez-orangesum-abstract) summarizes French texts, reusing words and phrase fragments from the text itself. Our goal is to preserve the subject of the text and keep its main characteristics.")

    st.image("summarize.png", use_column_width=False)
    st.write("If a description is 200 words or fewer, we keep it as is; otherwise we summarize it with a target of 200 words. Our token limit is 250, so targeting 200 leaves a margin of 50.")

    data_sum = sum_data[["description", "tr_description_sum"]]
    st.dataframe(data_sum)
    st.write("")
    st.markdown("---")
    st.write("")
    st.write("We preprocess the images by detecting any padding and shrinking it as much as possible, then resize the images to 224x224 to match the input format of VGG16 and ViT.")
    st.write("")

    st.image("resize.png", use_column_width=True)
    st.image("samples.png", use_column_width=True)
    st.write("Many categories can thus avoid the loss caused by downscaling, preserving almost all the trading cards, postcards, magazines, etc. We thus combine an ideal size for the model (the one it was pretrained on) with minimal information loss.")
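A minimal sketch of the summarize-only-long-descriptions logic described above, assuming the transformers library is installed. The threshold is counted in words while the pipeline's max_length counts tokens, so the 200 target is approximate, and this is not necessarily the exact call used to build tr_description_sum:

from transformers import pipeline

summarizer = pipeline("summarization", model="moussaKam/barthez-orangesum-abstract")

def maybe_summarize(description: str, max_words: int = 200) -> str:
    # Descriptions of 200 words or fewer are returned unchanged
    if len(description.split()) <= max_words:
        return description
    # Longer ones are summarized toward ~200 (max_length is in tokens, an approximation)
    return summarizer(description, max_length=max_words)[0]["summary_text"]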