DevBM committed on
Commit 6b21734
1 Parent(s): 9c6fa4d

Update app.py
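This revision replaces the spaCy-only keyword extractor with a combined RAKE + spaCy + TF-IDF extractor, makes the context window and beam count configurable from the UI, caps the number of generated questions, and adds CSV/PDF export of the results.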

Files changed (1)
  1. app.py +86 -22
app.py CHANGED
@@ -1,43 +1,58 @@
 import streamlit as st
 from transformers import T5ForConditionalGeneration, T5Tokenizer
-import torch
 import spacy
 import nltk
+from sklearn.feature_extraction.text import TfidfVectorizer
+from rake_nltk import Rake
+import pandas as pd
+from fpdf import FPDF
+import wikipediaapi
 from b import b
+
 nltk.download('punkt')
 from nltk.tokenize import sent_tokenize
 
 # Load spaCy model
 nlp = spacy.load("en_core_web_sm")
+# wiki_wiki = wikipediaapi.Wikipedia('en')
 
 # Load T5 model and tokenizer
 model_name = "DevBM/t5-large-squad"
 model = T5ForConditionalGeneration.from_pretrained(model_name)
 tokenizer = T5Tokenizer.from_pretrained(model_name)
 
-# Function to extract keywords using spaCy
+# Function to extract keywords using combined techniques
 def extract_keywords(text):
+    # Use RAKE
+    rake = Rake()
+    rake.extract_keywords_from_text(text)
+    rake_keywords = set(rake.get_ranked_phrases())
+
+    # Use spaCy for NER and POS tagging
     doc = nlp(text)
-    keywords = set()
-    # Extract named entities
-    for entity in doc.ents:
-        keywords.add(entity.text)
-    # Extract nouns and proper nouns
-    for token in doc:
-        if token.pos_ in ["NOUN", "PROPN"]:
-            keywords.add(token.text)
-    return list(keywords)
+    spacy_keywords = set([ent.text for ent in doc.ents])
+    spacy_keywords.update([token.text for token in doc if token.pos_ in ["NOUN", "PROPN", "VERB", "ADJ"]])
+
+    # Use TF-IDF
+    vectorizer = TfidfVectorizer(stop_words='english')
+    X = vectorizer.fit_transform([text])
+    tfidf_keywords = set(vectorizer.get_feature_names_out())
+
+    # Combine all keywords
+    combined_keywords = rake_keywords.union(spacy_keywords).union(tfidf_keywords)
+
+    return list(combined_keywords)
 
-# Function to map keywords to sentences
-def map_keywords_to_sentences(text, keywords):
+# Function to map keywords to sentences with customizable context window size
+def map_keywords_to_sentences(text, keywords, context_window_size):
     sentences = sent_tokenize(text)
     keyword_sentence_mapping = {}
     for keyword in keywords:
         for i, sentence in enumerate(sentences):
             if keyword in sentence:
                 # Combine current sentence with surrounding sentences for context
-                start = max(0, i-1)
-                end = min(len(sentences), i+2)
+                start = max(0, i - context_window_size)
+                end = min(len(sentences), i + context_window_size + 1)
                 context = ' '.join(sentences[start:end])
                 if keyword not in keyword_sentence_mapping:
                     keyword_sentence_mapping[keyword] = context
@@ -45,28 +60,77 @@ def map_keywords_to_sentences(text, keywords):
                 keyword_sentence_mapping[keyword] += ' ' + context
     return keyword_sentence_mapping
 
-# Function to generate questions
-def generate_question(context, answer):
+# Function to perform entity linking using Wikipedia API
+# def entity_linking(keyword):
+#     page = wiki_wiki.page(keyword)
+#     if page.exists():
+#         return page.fullurl
+#     return None
+
+# Function to generate questions using beam search
+def generate_question(context, answer, num_beams=5):
     input_text = f"<context> {context} <answer> {answer}"
     input_ids = tokenizer.encode(input_text, return_tensors='pt')
-    outputs = model.generate(input_ids)
+    outputs = model.generate(input_ids, num_beams=num_beams, early_stopping=True)
     question = tokenizer.decode(outputs[0], skip_special_tokens=True)
     return question
 
+# Function to export questions to CSV
+def export_to_csv(data):
+    df = pd.DataFrame(data, columns=["Context", "Answer", "Question"])
+    df.to_csv('questions.csv', index=False)
+
+# Function to export questions to PDF
+def export_to_pdf(data):
+    pdf = FPDF()
+    pdf.add_page()
+    pdf.set_font("Arial", size=12)
+
+    for context, answer, question in data:
+        pdf.multi_cell(0, 10, f"Context: {context}")
+        pdf.multi_cell(0, 10, f"Answer: {answer}")
+        pdf.multi_cell(0, 10, f"Question: {question}")
+        pdf.ln(10)
+
+    pdf.output("questions.pdf")
+
 # Streamlit interface
 st.title("Question Generator from Text")
-text = st.text_area("Enter text here:")
+text = st.text_area("Enter text here:", value="Joe Biden, the current US president is on a weak wicket going in for his reelection later this November against former President Donald Trump.")
+
+# Customization options
+num_beams = st.slider("Select number of beams for question generation", min_value=1, max_value=10, value=5)
+context_window_size = st.slider("Select context window size (number of sentences before and after)", min_value=1, max_value=5, value=1)
+num_questions = st.slider("Select number of questions to generate", min_value=1, max_value=1000, value=5)
+question_complexity = st.selectbox("Select question complexity", ["Simple", "Intermediate", "Complex"])
+
 if st.button("Generate Questions"):
     if text:
         keywords = extract_keywords(text)
-        keyword_sentence_mapping = map_keywords_to_sentences(text, keywords)
+        keyword_sentence_mapping = map_keywords_to_sentences(text, keywords, context_window_size)
 
         st.subheader("Generated Questions:")
-        for keyword, context in keyword_sentence_mapping.items():
-            question = generate_question(context, keyword)
+        data = []
+        for i, (keyword, context) in enumerate(keyword_sentence_mapping.items()):
+            if i >= num_questions:
+                break
+            # linked_entity = entity_linking(keyword)
+            question = generate_question(context, keyword, num_beams=num_beams)
             st.write(f"**Context:** {context}")
             st.write(f"**Answer:** {keyword}")
             st.write(f"**Question:** {question}")
+            # if linked_entity:
+            #     st.write(f"**Entity Link:** {linked_entity}")
            st.write("---")
+            data.append((context, keyword, question))
+
+        # Export buttons
+        if st.button("Export to CSV"):
+            export_to_csv(data)
+            st.success("Questions exported to questions.csv")
+
+        if st.button("Export to PDF"):
+            export_to_pdf(data)
+            st.success("Questions exported to questions.pdf")
     else:
        st.write("Please enter some text to generate questions.")