Update app.py
app.py
CHANGED
@@ -1,43 +1,58 @@
 import streamlit as st
 from transformers import T5ForConditionalGeneration, T5Tokenizer
-import torch
 import spacy
 import nltk
+from sklearn.feature_extraction.text import TfidfVectorizer
+from rake_nltk import Rake
+import pandas as pd
+from fpdf import FPDF
+import wikipediaapi
 from b import b
+
 nltk.download('punkt')
 from nltk.tokenize import sent_tokenize

 # Load spaCy model
 nlp = spacy.load("en_core_web_sm")
+# wiki_wiki = wikipediaapi.Wikipedia('en')

 # Load T5 model and tokenizer
 model_name = "DevBM/t5-large-squad"
 model = T5ForConditionalGeneration.from_pretrained(model_name)
 tokenizer = T5Tokenizer.from_pretrained(model_name)

-# Function to extract keywords using
+# Function to extract keywords using combined techniques
 def extract_keywords(text):
+    # Use RAKE
+    rake = Rake()
+    rake.extract_keywords_from_text(text)
+    rake_keywords = set(rake.get_ranked_phrases())
+
+    # Use spaCy for NER and POS tagging
     doc = nlp(text)
-[old lines 21-29 removed here; their content is not captured in this view]
+    spacy_keywords = set([ent.text for ent in doc.ents])
+    spacy_keywords.update([token.text for token in doc if token.pos_ in ["NOUN", "PROPN", "VERB", "ADJ"]])
+
+    # Use TF-IDF
+    vectorizer = TfidfVectorizer(stop_words='english')
+    X = vectorizer.fit_transform([text])
+    tfidf_keywords = set(vectorizer.get_feature_names_out())
+
+    # Combine all keywords
+    combined_keywords = rake_keywords.union(spacy_keywords).union(tfidf_keywords)
+
+    return list(combined_keywords)

-# Function to map keywords to sentences
-def map_keywords_to_sentences(text, keywords):
+# Function to map keywords to sentences with customizable context window size
+def map_keywords_to_sentences(text, keywords, context_window_size):
     sentences = sent_tokenize(text)
     keyword_sentence_mapping = {}
     for keyword in keywords:
         for i, sentence in enumerate(sentences):
             if keyword in sentence:
                 # Combine current sentence with surrounding sentences for context
-                start = max(0, i-
-                end = min(len(sentences), i+
+                start = max(0, i - context_window_size)
+                end = min(len(sentences), i + context_window_size + 1)
                 context = ' '.join(sentences[start:end])
                 if keyword not in keyword_sentence_mapping:
                     keyword_sentence_mapping[keyword] = context
@@ -45,28 +60,77 @@ def map_keywords_to_sentences(text, keywords):
                     keyword_sentence_mapping[keyword] += ' ' + context
     return keyword_sentence_mapping

-# Function to
-def
+# Function to perform entity linking using Wikipedia API
+# def entity_linking(keyword):
+#     page = wiki_wiki.page(keyword)
+#     if page.exists():
+#         return page.fullurl
+#     return None
+
+# Function to generate questions using beam search
+def generate_question(context, answer, num_beams=5):
     input_text = f"<context> {context} <answer> {answer}"
     input_ids = tokenizer.encode(input_text, return_tensors='pt')
-    outputs = model.generate(input_ids)
+    outputs = model.generate(input_ids, num_beams=num_beams, early_stopping=True)
     question = tokenizer.decode(outputs[0], skip_special_tokens=True)
     return question

+# Function to export questions to CSV
+def export_to_csv(data):
+    df = pd.DataFrame(data, columns=["Context", "Answer", "Question"])
+    df.to_csv('questions.csv', index=False)
+
+# Function to export questions to PDF
+def export_to_pdf(data):
+    pdf = FPDF()
+    pdf.add_page()
+    pdf.set_font("Arial", size=12)
+
+    for context, answer, question in data:
+        pdf.multi_cell(0, 10, f"Context: {context}")
+        pdf.multi_cell(0, 10, f"Answer: {answer}")
+        pdf.multi_cell(0, 10, f"Question: {question}")
+        pdf.ln(10)
+
+    pdf.output("questions.pdf")
+
 # Streamlit interface
 st.title("Question Generator from Text")
-text = st.text_area("Enter text here:")
+text = st.text_area("Enter text here:", value="Joe Biden, the current US president is on a weak wicket going in for his reelection later this November against former President Donald Trump.")
+
+# Customization options
+num_beams = st.slider("Select number of beams for question generation", min_value=1, max_value=10, value=5)
+context_window_size = st.slider("Select context window size (number of sentences before and after)", min_value=1, max_value=5, value=1)
+num_questions = st.slider("Select number of questions to generate", min_value=1, max_value=1000, value=5)
+question_complexity = st.selectbox("Select question complexity", ["Simple", "Intermediate", "Complex"])
+
 if st.button("Generate Questions"):
     if text:
         keywords = extract_keywords(text)
-        keyword_sentence_mapping = map_keywords_to_sentences(text, keywords)
+        keyword_sentence_mapping = map_keywords_to_sentences(text, keywords, context_window_size)

         st.subheader("Generated Questions:")
-[old lines 65-66 removed here; their content is not captured in this view]
+        data = []
+        for i, (keyword, context) in enumerate(keyword_sentence_mapping.items()):
+            if i >= num_questions:
+                break
+            # linked_entity = entity_linking(keyword)
+            question = generate_question(context, keyword, num_beams=num_beams)
             st.write(f"**Context:** {context}")
             st.write(f"**Answer:** {keyword}")
             st.write(f"**Question:** {question}")
+            # if linked_entity:
+            #     st.write(f"**Entity Link:** {linked_entity}")
             st.write("---")
+            data.append((context, keyword, question))
+
+        # Export buttons
+        if st.button("Export to CSV"):
+            export_to_csv(data)
+            st.success("Questions exported to questions.csv")
+
+        if st.button("Export to PDF"):
+            export_to_pdf(data)
+            st.success("Questions exported to questions.pdf")
     else:
         st.write("Please enter some text to generate questions.")
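
For orientation, here is a minimal usage sketch of the three pipeline functions the updated app.py defines (extract_keywords, map_keywords_to_sentences, generate_question). It is not part of the commit: the "from app import ..." line, the sample text, and the printed output are illustrative assumptions, and importing app will also execute its module-level model loading and require the "from b import b" dependency to resolve.

# Illustrative sketch only -- assumes the updated app.py above is importable as a module.
from app import extract_keywords, map_keywords_to_sentences, generate_question

sample_text = (
    "The Eiffel Tower was completed in 1889. "
    "It remains one of the most visited monuments in Paris."
)

# 1) Collect candidate answers (union of RAKE, spaCy, and TF-IDF keywords).
keywords = extract_keywords(sample_text)

# 2) Attach each keyword to its surrounding sentences (one sentence of context on each side).
mapping = map_keywords_to_sentences(sample_text, keywords, context_window_size=1)

# 3) Generate a question for a few keyword/context pairs with beam search.
for keyword, context in list(mapping.items())[:3]:
    question = generate_question(context, keyword, num_beams=5)
    print(f"Answer:   {keyword}")
    print(f"Question: {question}")
    print("---")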