import random
from functools import lru_cache

import nltk
import pandas as pd
import spacy
import streamlit as st
import wikipediaapi
from fpdf import FPDF
from gensim.models import KeyedVectors
from nltk.corpus import wordnet
from nltk.tokenize import sent_tokenize
from rake_nltk import Rake
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Download the NLTK resources used for tokenization, stopwords, and WordNet lookups
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('brown')
nltk.download('wordnet')

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# Initialize Wikipedia API with a user agent
user_agent = 'QGen/1.0 (channingfisher7@gmail.com)'
wiki_wiki = wikipediaapi.Wikipedia(user_agent=user_agent, language='en')

# Load pre-trained word vectors (this may take a while)
word_vectors = KeyedVectors.load_word2vec_format('vectors/GoogleNews-vectors-negative300.bin', binary=True)


def load_model():
    model_name = "DevBM/t5-large-squad"
    model = T5ForConditionalGeneration.from_pretrained(model_name)
    tokenizer = T5Tokenizer.from_pretrained(model_name)
    return model, tokenizer


# Initialize session state for the model and tokenizer so they are loaded only once
if 'model' not in st.session_state:
    st.session_state.model, st.session_state.tokenizer = load_model()

# Use the model and tokenizer from session state
model = st.session_state.model
tokenizer = st.session_state.tokenizer


# Function to extract keywords using combined techniques
def extract_keywords(text):
    # Use RAKE
    rake = Rake()
    rake.extract_keywords_from_text(text)
    rake_keywords = set(rake.get_ranked_phrases())

    # Use spaCy for NER and POS tagging
    doc = nlp(text)
    spacy_keywords = set(ent.text for ent in doc.ents)
    spacy_keywords.update(token.text for token in doc if token.pos_ in ["NOUN", "PROPN", "VERB", "ADJ"])

    # Use TF-IDF
    vectorizer = TfidfVectorizer(stop_words='english')
    vectorizer.fit_transform([text])
    tfidf_keywords = set(vectorizer.get_feature_names_out())

    # Combine all keywords
    combined_keywords = rake_keywords.union(spacy_keywords).union(tfidf_keywords)
    return list(combined_keywords)


# Function to map keywords to sentences with a customizable context window size
def map_keywords_to_sentences(text, keywords, context_window_size):
    sentences = sent_tokenize(text)
    keyword_sentence_mapping = {}
    for keyword in keywords:
        for i, sentence in enumerate(sentences):
            if keyword in sentence:
                # Combine the current sentence with surrounding sentences for context
                start = max(0, i - context_window_size)
                end = min(len(sentences), i + context_window_size + 1)
                context = ' '.join(sentences[start:end])
                if keyword not in keyword_sentence_mapping:
                    keyword_sentence_mapping[keyword] = context
                else:
                    keyword_sentence_mapping[keyword] += ' ' + context
    return keyword_sentence_mapping


def get_similar_words(word, n=3):
    # Nearest neighbours in the word2vec space; unknown words yield no suggestions
    try:
        similar_words = word_vectors.most_similar(word, topn=n)
        return [w for w, _ in similar_words]
    except KeyError:
        return []


def get_synonyms(word, n=3):
    # Collect up to n distinct WordNet synonyms for the word
    synonyms = []
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            if lemma.name() != word and lemma.name() not in synonyms:
                synonyms.append(lemma.name())
                if len(synonyms) == n:
                    return synonyms
    return synonyms


def generate_options(answer, context, n=3):
    options = [answer]

    # Try to get similar words based on word embeddings
    similar_words = get_similar_words(answer, n)
    options.extend(similar_words)

    # If we don't have enough options, try synonyms
    if len(options) < n + 1:
        synonyms = get_synonyms(answer, n - len(options) + 1)
        options.extend(synonyms)

    # If we still don't have enough options, extract other entities from the context
    if len(options) < n + 1:
        doc = nlp(context)
        entities = [ent.text for ent in doc.ents if ent.text.lower() != answer.lower()]
        options.extend(entities[:n - len(options) + 1])

    # If we still need more options, add some random words from the context
    if len(options) < n + 1:
        context_words = [token.text for token in nlp(context) if token.is_alpha and token.text.lower() != answer.lower()]
        options.extend(random.sample(context_words, min(n - len(options) + 1, len(context_words))))

    # Ensure we have the correct number of unique options (the answer stays first, then shuffle)
    options = list(dict.fromkeys(options))[:n + 1]
    random.shuffle(options)

    return options


# Function to perform entity linking using the Wikipedia API
@lru_cache(maxsize=128)
def entity_linking(keyword):
    page = wiki_wiki.page(keyword)
    if page.exists():
        return page.fullurl
    return None


# Function to generate questions using beam search
def generate_question(context, answer, num_beams):
    # Prompt format (assumed): pair the context and the target answer for the T5 QG model
    input_text = f"context: {context} answer: {answer}"
    input_ids = tokenizer.encode(input_text, return_tensors='pt')
    outputs = model.generate(input_ids, num_beams=num_beams, early_stopping=True)
    question = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return question


# Function to export questions to CSV
def export_to_csv(data):
    df = pd.DataFrame(data, columns=["Context", "Answer", "Question"])
    csv = df.to_csv(index=False, encoding='utf-8')
    return csv


# Function to export questions to PDF
def export_to_pdf(data):
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", size=12)
    for context, answer, question in data:
        pdf.multi_cell(0, 10, f"Context: {context}")
        pdf.multi_cell(0, 10, f"Answer: {answer}")
        pdf.multi_cell(0, 10, f"Question: {question}")
        pdf.ln(10)
    # Return the PDF as bytes instead of writing questions.pdf to disk
    return pdf.output(name='questions.pdf', dest='S').encode('latin1')


if 'data' not in st.session_state:
    st.session_state.data = None

# Streamlit interface
st.title(":blue[Question Generator from Text]")
text = st.text_area("Enter text here:", value="Joe Biden, the current US president, is on a weak wicket going into his reelection later this November against former President Donald Trump.")

with st.sidebar:
    st.subheader("Customization Options")
    num_beams = st.slider("Select number of beams for question generation", min_value=1, max_value=10, value=5)
    context_window_size = st.slider("Select context window size (number of sentences before and after)", min_value=1, max_value=5, value=1)
    num_questions = st.slider("Select number of questions to generate", min_value=1, max_value=1000, value=5)
    question_complexity = st.selectbox("Select question complexity", ["Simple", "Intermediate", "Complex"])

if st.button("Generate Questions"):
    if text:
        keywords = extract_keywords(text)
        keyword_sentence_mapping = map_keywords_to_sentences(text, keywords, context_window_size)
        st.subheader("Generated Questions:")
        data = []
        for i, (keyword, context) in enumerate(keyword_sentence_mapping.items()):
            if i >= num_questions:
                break
            linked_entity = entity_linking(keyword)
            question = generate_question(context, keyword, num_beams=num_beams)
            options = generate_options(keyword, context)
            st.write(f"**Context:** {context}")
            st.write(f"**Answer:** {keyword}")
            st.write(f"**Question:** {question}")
            st.write("**Options:**")
            for j, option in enumerate(options):
                st.write(f"{chr(65 + j)}. {option}")
            if linked_entity:
                st.write(f"**Entity Link:** {linked_entity}")
            st.write("---")
            data.append((context, keyword, question))
        # Add the data to session state
        st.session_state.data = data

# Export buttons
if st.session_state.data is not None:
    with st.sidebar:
        st.subheader('Download Content')
        csv_data = export_to_csv(st.session_state.data)
        st.download_button(label="CSV Format", data=csv_data, file_name='questions.csv', mime='text/csv')
        pdf_data = export_to_pdf(st.session_state.data)
        st.download_button(label="PDF Format", data=pdf_data, file_name='questions.pdf', mime='application/pdf')
    st.markdown("You can download the data from the sidebar.")
else:
    st.write("Please enter some text to generate questions.")