from __future__ import annotations

import math
import os
import pickle
import re
from abc import abstractmethod
from collections import Counter
from dataclasses import dataclass
from typing import Callable, Dict, Iterable, List, Optional, Type, TypedDict, TypeVar

import gradio as gr
import nltk
import numpy as np
import pytrec_eval
import tqdm
from scipy.sparse import csc_matrix

from nlp4web_codebase.ir.data_loaders import Split
from nlp4web_codebase.ir.data_loaders.dm import Document
from nlp4web_codebase.ir.data_loaders.sciq import load_sciq
from nlp4web_codebase.ir.models import BaseRetriever
# ----------------- PRE SETUP ----------------- #
nltk.download("stopwords", quiet=True)
from nltk.corpus import stopwords as nltk_stopwords
LANGUAGE = "english"
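# Token pattern in the style of scikit-learn's default: runs of 2+ word characters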
word_splitter = re.compile(r"(?u)\b\w\w+\b").findall
stopwords = set(nltk_stopwords.words(LANGUAGE))
best_k1 = 0.8
best_b = 0.6
index_dir = "output/csc_bm25_index"
# ----------------- SETUP CLASSES AND FUNCTIONS ----------------- #
def word_splitting(text: str) -> List[str]:
return word_splitter(text.lower())
def lemmatization(words: List[str]) -> List[str]:
return words # We ignore lemmatization here for simplicity
def simple_tokenize(text: str) -> List[str]:
words = word_splitting(text)
tokenized = list(filter(lambda w: w not in stopwords, words))
tokenized = lemmatization(tokenized)
return tokenized
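# Illustrative example (exact output depends on the NLTK stopword list):
# simple_tokenize("What is the force that holds atoms together?")
# -> ["force", "holds", "atoms", "together"]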
T = TypeVar("T", bound="InvertedIndex")
@dataclass
class PostingList:
term: str # The term
docid_postings: List[
int
] # docid_postings[i] means the docid (int) of the i-th associated posting
tweight_postings: List[
float
] # tweight_postings[i] means the term weight (float) of the i-th associated posting
@dataclass
class InvertedIndex:
    posting_lists: List[PostingList]  # tid -> posting list
vocab: Dict[str, int]
cid2docid: Dict[str, int] # collection_id -> docid
collection_ids: List[str] # docid -> collection_id
doc_texts: Optional[List[str]] = None # docid -> document text
def save(self, output_dir: str) -> None:
os.makedirs(output_dir, exist_ok=True)
with open(os.path.join(output_dir, "index.pkl"), "wb") as f:
pickle.dump(self, f)
    @classmethod
    def from_saved(cls: Type[T], saved_dir: str) -> T:
        with open(os.path.join(saved_dir, "index.pkl"), "rb") as f:
            index = pickle.load(f)
        return index
# The output of the counting function:
@dataclass
class Counting:
posting_lists: List[PostingList]
vocab: Dict[str, int]
cid2docid: Dict[str, int]
collection_ids: List[str]
dfs: List[int] # tid -> df
dls: List[int] # docid -> doc length
avgdl: float
nterms: int
doc_texts: Optional[List[str]] = None
def run_counting(
documents: Iterable[Document],
tokenize_fn: Callable[[str], List[str]] = simple_tokenize,
store_raw: bool = True, # store the document text in doc_texts
ndocs: Optional[int] = None,
show_progress_bar: bool = True,
) -> Counting:
"""Counting TFs, DFs, doc_lengths, etc."""
posting_lists: List[PostingList] = []
vocab: Dict[str, int] = {}
cid2docid: Dict[str, int] = {}
collection_ids: List[str] = []
dfs: List[int] = [] # tid -> df
dls: List[int] = [] # docid -> doc length
nterms: int = 0
doc_texts: Optional[List[str]] = []
for doc in tqdm.tqdm(
documents,
desc="Counting",
total=ndocs,
disable=not show_progress_bar,
):
if doc.collection_id in cid2docid:
continue
collection_ids.append(doc.collection_id)
docid = cid2docid.setdefault(doc.collection_id, len(cid2docid))
toks = tokenize_fn(doc.text)
tok2tf = Counter(toks)
dls.append(sum(tok2tf.values()))
for tok, tf in tok2tf.items():
nterms += tf
            tid = vocab.get(tok, None)
            if tid is None:
                tid = vocab.setdefault(tok, len(vocab))
                posting_lists.append(
                    PostingList(term=tok, docid_postings=[], tweight_postings=[])
                )
                dfs.append(0)
            posting_lists[tid].docid_postings.append(docid)
            posting_lists[tid].tweight_postings.append(tf)
            dfs[tid] += 1  # this term occurs in one more document
if store_raw:
doc_texts.append(doc.text)
else:
doc_texts = None
return Counting(
posting_lists=posting_lists,
vocab=vocab,
cid2docid=cid2docid,
collection_ids=collection_ids,
dfs=dfs,
dls=dls,
avgdl=sum(dls) / len(dls),
nterms=nterms,
doc_texts=doc_texts,
)
# sciq = load_sciq()
# counting = run_counting(documents=iter(sciq.corpus), ndocs=len(sciq.corpus))
@dataclass
class BM25Index(InvertedIndex):
@staticmethod
def tokenize(text: str) -> List[str]:
return simple_tokenize(text)
@staticmethod
def cache_term_weights(
posting_lists: List[PostingList],
total_docs: int,
avgdl: float,
dfs: List[int],
dls: List[int],
k1: float,
b: float,
) -> None:
"""Compute term weights and caching"""
N = total_docs
for tid, posting_list in enumerate(
tqdm.tqdm(posting_lists, desc="Regularizing TFs")
):
idf = BM25Index.calc_idf(df=dfs[tid], N=N)
for i in range(len(posting_list.docid_postings)):
docid = posting_list.docid_postings[i]
tf = posting_list.tweight_postings[i]
dl = dls[docid]
regularized_tf = BM25Index.calc_regularized_tf(
tf=tf, dl=dl, avgdl=avgdl, k1=k1, b=b
)
posting_list.tweight_postings[i] = regularized_tf * idf
@staticmethod
def calc_regularized_tf(
tf: int, dl: float, avgdl: float, k1: float, b: float
) -> float:
return tf / (tf + k1 * (1 - b + b * dl / avgdl))
@staticmethod
    def calc_idf(df: int, N: int) -> float:
return math.log(1 + (N - df + 0.5) / (df + 0.5))
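    # Putting the two pieces above together, each cached posting weight is the
    # full BM25 contribution of term t to document d:
    #   w(t, d) = idf(t) * tf / (tf + k1 * (1 - b + b * dl / avgdl))
    # so scoring a query reduces to summing cached weights over query terms.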
@classmethod
def build_from_documents(
cls: Type[BM25Index],
documents: Iterable[Document],
store_raw: bool = True,
output_dir: Optional[str] = None,
ndocs: Optional[int] = None,
show_progress_bar: bool = True,
k1: float = 0.9,
b: float = 0.4,
) -> BM25Index:
# Counting TFs, DFs, doc_lengths, etc.:
counting = run_counting(
documents=documents,
tokenize_fn=BM25Index.tokenize,
store_raw=store_raw,
ndocs=ndocs,
show_progress_bar=show_progress_bar,
)
# Compute term weights and caching:
posting_lists = counting.posting_lists
total_docs = len(counting.cid2docid)
BM25Index.cache_term_weights(
posting_lists=posting_lists,
total_docs=total_docs,
avgdl=counting.avgdl,
dfs=counting.dfs,
dls=counting.dls,
k1=k1,
b=b,
)
        # Assemble and (optionally) save:
        index = BM25Index(
            posting_lists=posting_lists,
            vocab=counting.vocab,
            cid2docid=counting.cid2docid,
            collection_ids=counting.collection_ids,
            doc_texts=counting.doc_texts,
        )
        if output_dir is not None:
            index.save(output_dir)
        return index
class BaseInvertedIndexRetriever(BaseRetriever):
@property
@abstractmethod
def index_class(self) -> Type[InvertedIndex]:
pass
def __init__(self, index_dir: str) -> None:
self.index = self.index_class.from_saved(index_dir)
def get_term_weights(self, query: str, cid: str) -> Dict[str, float]:
toks = self.index.tokenize(query)
target_docid = self.index.cid2docid[cid]
term_weights = {}
for tok in toks:
if tok not in self.index.vocab:
continue
tid = self.index.vocab[tok]
posting_list = self.index.posting_lists[tid]
for docid, tweight in zip(
posting_list.docid_postings, posting_list.tweight_postings
):
if docid == target_docid:
term_weights[tok] = tweight
break
return term_weights
def score(self, query: str, cid: str) -> float:
return sum(self.get_term_weights(query=query, cid=cid).values())
def retrieve(self, query: str, topk: int = 10) -> Dict[str, float]:
toks = self.index.tokenize(query)
docid2score: Dict[int, float] = {}
for tok in toks:
if tok not in self.index.vocab:
continue
tid = self.index.vocab[tok]
posting_list = self.index.posting_lists[tid]
for docid, tweight in zip(
posting_list.docid_postings, posting_list.tweight_postings
):
docid2score.setdefault(docid, 0)
docid2score[docid] += tweight
docid2score = dict(
sorted(docid2score.items(), key=lambda pair: pair[1], reverse=True)[:topk]
)
return {
self.index.collection_ids[docid]: score
for docid, score in docid2score.items()
}
class BM25Retriever(BaseInvertedIndexRetriever):
@property
def index_class(self) -> Type[BM25Index]:
return BM25Index
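# A minimal usage sketch (assumes an index was previously saved to "output/bm25_index"):
# retriever = BM25Retriever(index_dir="output/bm25_index")
# retriever.retrieve("What is the force that holds atoms together?")
# -> {"<collection_id>": <BM25 score>, ...} for the top-10 documents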
TCSC = TypeVar("TCSC", bound="CSCInvertedIndex")


@dataclass
class CSCInvertedIndex:
    posting_lists_matrix: csc_matrix  # term-weight matrix: rows are docids, columns are tids
    vocab: Dict[str, int]
    cid2docid: Dict[str, int]  # collection_id -> docid
    collection_ids: List[str]  # docid -> collection_id
    doc_texts: Optional[List[str]] = None  # docid -> document text
def save(self, output_dir: str) -> None:
os.makedirs(output_dir, exist_ok=True)
with open(os.path.join(output_dir, "index.pkl"), "wb") as f:
pickle.dump(self, f)
    @classmethod
    def from_saved(cls: Type[TCSC], saved_dir: str) -> TCSC:
        with open(os.path.join(saved_dir, "index.pkl"), "rb") as f:
            index = pickle.load(f)
        return index
@dataclass
class CSCBM25Index(CSCInvertedIndex):
@staticmethod
def tokenize(text: str) -> List[str]:
return simple_tokenize(text)
@staticmethod
def cache_term_weights(
posting_lists: List[PostingList],
total_docs: int,
avgdl: float,
dfs: List[int],
dls: List[int],
k1: float,
b: float,
) -> csc_matrix:
data: List[np.float32] = []
row_indices = []
col_indices = []
N = total_docs
for tid, posting_list in enumerate(
tqdm.tqdm(posting_lists, desc="Regularizing TFs")
):
idf = CSCBM25Index.calc_idf(df=dfs[tid], N=N)
for i in range(len(posting_list.docid_postings)):
docid = posting_list.docid_postings[i]
tf = posting_list.tweight_postings[i]
dl = dls[docid]
regularized_tf = CSCBM25Index.calc_regularized_tf(
tf=tf, dl=dl, avgdl=avgdl, k1=k1, b=b
)
weight = regularized_tf * idf
# Store values for sparse matrix construction
row_indices.append(docid)
col_indices.append(tid)
data.append(np.float32(weight))
# Create a CSC matrix from the collected data
term_weights_matrix = csc_matrix(
(data, (row_indices, col_indices)), shape=(N, len(posting_lists))
)
return term_weights_matrix
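    # Design note: the CSC matrix stores exactly the weights that the
    # list-based BM25Index caches, but column slicing (all weights for one
    # term) becomes a cheap sparse operation, which is the query-time access
    # pattern used by the retriever below.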
@staticmethod
def calc_regularized_tf(
tf: int, dl: float, avgdl: float, k1: float, b: float
) -> float:
return tf / (tf + k1 * (1 - b + b * dl / avgdl))
@staticmethod
    def calc_idf(df: int, N: int) -> float:
return math.log(1 + (N - df + 0.5) / (df + 0.5))
@classmethod
def build_from_documents(
cls: Type[CSCBM25Index],
documents: Iterable[Document],
store_raw: bool = True,
output_dir: Optional[str] = None,
ndocs: Optional[int] = None,
show_progress_bar: bool = True,
k1: float = 0.9,
b: float = 0.4,
) -> CSCBM25Index:
# Counting TFs, DFs, doc_lengths, etc.:
counting = run_counting(
documents=documents,
tokenize_fn=CSCBM25Index.tokenize,
store_raw=store_raw,
ndocs=ndocs,
show_progress_bar=show_progress_bar,
)
# Compute term weights and caching:
posting_lists = counting.posting_lists
total_docs = len(counting.cid2docid)
posting_lists_matrix = CSCBM25Index.cache_term_weights(
posting_lists=posting_lists,
total_docs=total_docs,
avgdl=counting.avgdl,
dfs=counting.dfs,
dls=counting.dls,
k1=k1,
b=b,
)
        # Assemble and (optionally) save:
        index = CSCBM25Index(
            posting_lists_matrix=posting_lists_matrix,
            vocab=counting.vocab,
            cid2docid=counting.cid2docid,
            collection_ids=counting.collection_ids,
            doc_texts=counting.doc_texts,
        )
        if output_dir is not None:
            index.save(output_dir)
        return index
class BaseCSCInvertedIndexRetriever(BaseRetriever):
@property
@abstractmethod
def index_class(self) -> Type[CSCInvertedIndex]:
pass
def __init__(self, index_dir: str) -> None:
self.index = self.index_class.from_saved(index_dir)
def get_term_weights(self, query: str, cid: str) -> Dict[str, float]:
"""Retrieve term weights for a specific query and document."""
toks = self.index.tokenize(query)
target_docid = self.index.cid2docid[cid]
term_weights = {}
for tok in toks:
if tok not in self.index.vocab:
continue
tid = self.index.vocab[tok]
# Access the term weights for the target docid and token tid
weight = self.index.posting_lists_matrix[target_docid, tid]
if weight != 0:
term_weights[tok] = weight
return term_weights
def score(self, query: str, cid: str) -> float:
return sum(self.get_term_weights(query=query, cid=cid).values())
def retrieve(self, query: str, topk: int = 10) -> Dict[str, float]:
toks = self.index.tokenize(query)
docid2score: Dict[int, float] = {}
for tok in toks:
if tok not in self.index.vocab:
continue
tid = self.index.vocab[tok]
# Get the column of the matrix corresponding to the tid
term_weights = self.index.posting_lists_matrix[
:, tid
].tocoo() # To COOrdinate Matrix for easier access to rows
for docid, tweight in zip(term_weights.row, term_weights.data):
docid2score.setdefault(docid, 0)
docid2score[docid] += tweight
# Sort and retrieve the top-k results
docid2score = dict(
sorted(docid2score.items(), key=lambda pair: pair[1], reverse=True)[:topk]
)
        return {
            self.index.collection_ids[docid]: score
            for docid, score in docid2score.items()
        }
class CSCBM25Retriever(BaseCSCInvertedIndexRetriever):
@property
def index_class(self) -> Type[CSCBM25Index]:
return CSCBM25Index
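# Same interface as BM25Retriever, but backed by the sparse matrix, e.g.:
# retriever = CSCBM25Retriever(index_dir="output/csc_bm25_index")
# retriever.retrieve("What is the force that holds atoms together?")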
# ----------------- SETUP MAIN ----------------- #
sciq = load_sciq()
# Note: build_from_documents runs its own counting pass internally,
# so no separate run_counting call is needed here.
bm25_index = BM25Index.build_from_documents(
    documents=iter(sciq.corpus),
    ndocs=len(sciq.corpus),
    show_progress_bar=True,
)
bm25_index.save("output/bm25_index")
csc_bm25_index = CSCBM25Index.build_from_documents(
    documents=iter(sciq.corpus),
    ndocs=len(sciq.corpus),
    show_progress_bar=True,
    k1=best_k1,
    b=best_b,
)
csc_bm25_index.save("output/csc_bm25_index")
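# Optional sanity check with pytrec_eval (left commented out; the qrels and
# query accessors below are assumed helpers, not confirmed nlp4web_codebase APIs):
# qrels: Dict[str, Dict[str, int]] = sciq.get_qrels_dict(Split.DEV)  # hypothetical accessor
# retriever = BM25Retriever("output/bm25_index")
# run: Dict[str, Dict[str, float]] = {
#     q.query_id: retriever.retrieve(q.text)
#     for q in sciq.get_queries(Split.DEV)  # hypothetical accessor
# }
# evaluator = pytrec_eval.RelevanceEvaluator(qrels, {"ndcg_cut_10"})
# ndcg10 = [m["ndcg_cut_10"] for m in evaluator.evaluate(run).values()]
# print("nDCG@10:", sum(ndcg10) / len(ndcg10))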
class Hit(TypedDict):
cid: str
score: float
text: str
def search(query: str, index_dir: str = index_dir) -> List[Hit]:  # , topk: int = 10
    """Base search functionality for the retrieval app."""
    retriever: BaseRetriever
    if "csc" in index_dir.lower():
        retriever = CSCBM25Retriever(index_dir)
    else:
        retriever = BM25Retriever(index_dir)
    # Retrieve the top documents for the query
    ranking = retriever.retrieve(query)  # , topk=topk

    # Look up the stored document text for a collection id (None if texts were not stored)
    def text(cid: str) -> Optional[str]:
        return (
            retriever.index.doc_texts[retriever.index.cid2docid[cid]]
            if retriever.index.doc_texts
            else None
        )

    hits = [Hit(cid=cid, score=score, text=text(cid)) for cid, score in ranking.items()]
    return hits
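# Example call (illustrative output shape):
# search("What is the smallest unit of life?")
# -> [{"cid": "...", "score": 12.3, "text": "..."}, ...]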
'''
# Function for formatted display of results
def format_hits_md(hits: List[Hit]) -> str:
if not hits:
return "No results found."
formatted = []
for idx, hit in enumerate(hits, start=1):
formatted.append(
f"## Result {idx}:\n"
f"* CID: {hit['cid']}\n"
f"* Score: {hit['score']:.2f}\n"
f"* Text: {hit['text'] or 'No text available.'}\n"
)
return "\n".join(formatted)
# to return pure JSON data as a list of JSON objects
def format_hits_json(hits: List[Hit]) -> List[dict]:
    if not hits:
        return []
formatted = []
# json format
for hit in hits:
formatted.append(
{
"cid": hit['cid'],
"score": hit['score'],
"text": hit['text'] or ''
}
)
return formatted
# Gradio wrapper
def interface_search(query: str):  # , topk: int = 10  (returns List[dict] on success, str on error)
    """Wrapper for the Gradio interface: call search and format the results."""
    try:
        hits = search(query)  # , topk=topk
        return format_hits_json(hits)  # or format_hits_md for Markdown output
    except Exception as e:
        return f"Error: {str(e)}"
'''
# app interface
demo = gr.Interface(
    fn=search,  # swap in interface_search above for Markdown/JSON formatting
    inputs=[
        gr.Textbox(label="Search Query", placeholder="Type your search query"),
        # gr.Number(label="Number of Results (Top-k)", value=10),
    ],
    # gr.Markdown() or gr.JSON() would render the hits more readably, but they
    # require adapting any code that consumes this endpoint's output.
    outputs=gr.Textbox(label="Search Results"),
    title="BM25 Retrieval on allenai/sciq",
    description="Search through the allenai/sciq corpus using a BM25-based retrieval system.",
)
demo.launch()