Mohamed-Maher committed on
Commit
39ccf9b
1 Parent(s): 18feff1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +52 -33
app.py CHANGED
@@ -1,48 +1,67 @@
 
1
  import re
2
- import nltk
3
  import pickle
4
  import numpy as np
5
  import pandas as pd
6
- import streamlit as st
7
- from datasets import load_dataset
8
  from sklearn.metrics.pairwise import cosine_similarity
9
 
10
- nltk.download('punkt')
 
 
 
 
 
 
 
 
 
 
 
 
 
11
 
12
- dataset = pd.read_csv("Preprocess_LK_Hadith_dataset.csv")
 
 
 
13
 
14
- labels = dataset['Arabic_Grade']
 
 
 
 
 
15
 
16
- # Helper functions
17
- def remove_tashkeel(text):
18
- tashkeel_pattern = re.compile(r'[\u0617-\u061A\u064B-\u0652]')
19
- return re.sub(tashkeel_pattern, '', text)
20
 
21
- def preprocess_arabic_text(text):
22
- text = remove_tashkeel(text)
23
- tokens = nltk.word_tokenize(text)
24
- cleaned_tokens = [token for token in tokens if token.isalnum()]
25
- lowercase_tokens = [token.lower() for token in cleaned_tokens]
26
- return " ".join(lowercase_tokens)
27
 
28
- # Function to predict label
29
- def predict_label(input_text, threshold=0.5):
30
- with open("tfidf_vectorizer.pkl", "rb") as f:
31
- vectorizer = pickle.load(f)
32
- with open("cosine_similarity_model.pkl", "rb") as f:
33
- X = pickle.load(f)
34
 
35
- input_text = preprocess_arabic_text(input_text)
36
- input_vector = vectorizer.transform([input_text])
37
- similarities = cosine_similarity(input_vector, X).flatten()
38
 
39
- max_index = np.argmax(similarities)
40
- max_similarity = similarities[max_index]
 
41
 
42
- if max_similarity >= threshold:
43
- return labels.iloc[max_index]
44
- else:
45
- return "No similar text found in dataset"
 
 
 
 
46
 
47
- x = st.slider('Enter Hadith')
48
- st.write(x, 'Hadith Classification', predict_label)
 
1
+ import os
2
  import re
 
3
  import pickle
4
  import numpy as np
5
  import pandas as pd
6
+ import nltk
7
+ import gradio as gr
8
  from sklearn.metrics.pairwise import cosine_similarity
9
 
10
class HadithClassificationApp:
    """Label a Hadith text with the grade of its most similar dataset entry.

    The constructor loads a CSV dataset, a pickled vectorizer and a pickled
    matrix. ``predict_label`` preprocesses the query text, vectorizes it,
    computes the cosine similarity against every row of the matrix and
    returns the label of the best match when it reaches the threshold.

    NOTE(review): judging by the file names, the vectorizer is presumably a
    fitted TF-IDF vectorizer and ``X`` the vectorized dataset texts, with
    rows in the same order as the CSV -- confirm against the training code.
    """

    # Arabic diacritic (tashkeel) code points stripped before tokenizing.
    # Compiled once here instead of on every call.
    _TASHKEEL_PATTERN = re.compile(r'[\u0617-\u061A\u064B-\u0652]')

    # Returned whenever no dataset entry is similar enough to the query.
    NO_MATCH_MESSAGE = "No similar text found in dataset"

    def __init__(
        self,
        dataset_path="Preprocess_LK_Hadith_dataset.csv",
        vectorizer_path="tfidf_vectorizer.pkl",
        matrix_path="cosine_similarity_model.pkl",
        label_column="Arabic_Grade",
    ):
        """Load the tokenizer data, the dataset labels and the pickled models.

        Args:
            dataset_path: CSV file that contains the label column.
            vectorizer_path: Pickle file holding the fitted vectorizer.
            matrix_path: Pickle file holding the matrix the query is
                compared against.
            label_column: Name of the CSV column that holds the labels.

        Raises:
            FileNotFoundError: If one of the files does not exist.
            KeyError: If ``label_column`` is not a column of the dataset.
        """
        # Download NLTK resources if needed.
        # NOTE(review): recent NLTK releases may require the 'punkt_tab'
        # resource for word_tokenize instead of 'punkt' -- verify against
        # the installed NLTK version.
        nltk.download('punkt')

        # Load the dataset and labels.
        self.dataset = pd.read_csv(dataset_path)
        self.labels = self.dataset[label_column]

        # Load the models.
        # NOTE(security): pickle.load can execute arbitrary code; only point
        # these paths at files produced by a trusted training run.
        with open(vectorizer_path, "rb") as f:
            self.vectorizer = pickle.load(f)
        with open(matrix_path, "rb") as f:
            self.X = pickle.load(f)

    @staticmethod
    def remove_tashkeel(text):
        """Return ``text`` with Arabic diacritics (tashkeel) removed."""
        return HadithClassificationApp._TASHKEEL_PATTERN.sub('', text)

    def preprocess_arabic_text(self, text):
        """Normalize ``text`` into a space-joined string of clean tokens.

        Diacritics are stripped, the text is tokenized with NLTK, tokens that
        are not purely alphanumeric (e.g. punctuation) are dropped, and the
        remaining tokens are lower-cased.
        """
        text = self.remove_tashkeel(text)
        tokens = nltk.word_tokenize(text)
        return " ".join(token.lower() for token in tokens if token.isalnum())

    def predict_label(self, input_text, threshold=0.5):
        """Return the label of the dataset entry most similar to ``input_text``.

        Args:
            input_text: The raw Hadith text to classify.
            threshold: Minimum cosine similarity the best match must reach.

        Returns:
            The label stored at the best match's position, or
            ``NO_MATCH_MESSAGE`` when the input is empty / not a string or
            when no entry reaches ``threshold``.
        """
        # An empty or non-text query cannot match anything; answer right away
        # instead of failing inside the regex or tokenizer.
        if not isinstance(input_text, str) or not input_text.strip():
            return self.NO_MATCH_MESSAGE

        cleaned_text = self.preprocess_arabic_text(input_text)
        # Input made only of punctuation/diacritics leaves nothing to compare.
        if not cleaned_text:
            return self.NO_MATCH_MESSAGE

        input_vector = self.vectorizer.transform([cleaned_text])
        similarities = cosine_similarity(input_vector, self.X).flatten()
        # np.argmax raises ValueError on an empty array (matrix with no rows).
        if similarities.size == 0:
            return self.NO_MATCH_MESSAGE

        max_index = int(np.argmax(similarities))
        if similarities[max_index] >= threshold:
            return self.labels.iloc[max_index]
        return self.NO_MATCH_MESSAGE

    def classify_hadith(self, input_text):
        """Single-argument entry point that applies the default threshold."""
        return self.predict_label(input_text)
 
52
 
53
if __name__ == "__main__":
    # Build the classifier once; its constructor loads the dataset and models.
    classifier = HadithClassificationApp()

    # Describe the Gradio UI: free text in, predicted label out as text.
    interface_options = dict(
        fn=classifier.classify_hadith,
        inputs="text",
        outputs="text",
        title="Hadith Classification App",
        description="Classify Hadith text based on pre-trained model.",
    )

    # Create the interface and start serving it.
    gr.Interface(**interface_options).launch()