"""Gradio demo for doc2query expansion and scored/filtered expansion.

The script builds two interactive demos: ``predict`` runs a Doc2Query
transformer on the input frame, and ``predict_mm`` additionally scores the
generated queries and optionally filters them by a percentile threshold.
Both return the resulting frame, a markdown rendering of equivalent code,
and a textual visualisation of the generated queries.
"""
import pyterrier as pt
pt.init()
import numpy as np
import pandas as pd
import gradio as gr
from pyterrier_doc2query import Doc2Query, QueryScorer, QueryFilter
from pyterrier_dr import ElectraScorer
from pyterrier_gradio import Demo, MarkdownFile, interface, df2code, code2md, EX_D

MODEL = 'macavaney/doc2query-t5-base-msmarco'
SCORE_MODEL = 'crystina-z/monoELECTRA_LCE_nneg31'

# Score cut points at 5% steps (5th .. 95th percentile): 19 boundaries that
# split scores into 20 buckets, one per entry of COLORS.
PERCENTILES_BY_5 = np.array([
    -3.80468750e+00, -2.21679688e+00, -1.25683594e+00, -5.58105469e-01,
    -7.65323639e-04, 4.69482422e-01, 8.83300781e-01, 1.25878906e+00,
    1.61035156e+00, 1.94335938e+00, 2.26562500e+00, 2.58007812e+00,
    2.89648438e+00, 3.21484375e+00, 3.54687500e+00, 3.90039062e+00,
    4.30078125e+00, 4.77343750e+00, 5.37109375e+00,
])

# One colour per score bucket, indexed by the np.searchsorted result.
COLORS = [
    'rgb(252, 132, 100)', 'rgb(252, 148, 116)', 'rgb(252, 166, 137)',
    'rgb(252, 183, 156)', 'rgb(253, 200, 178)', 'rgb(254, 215, 198)',
    'rgb(255, 228, 216)', 'rgb(255, 237, 228)', 'rgb(256, 245, 240)',
    'rgb(256, 256, 256)', 'rgb(247, 252, 245)', 'rgb(240, 250, 237)',
    'rgb(233, 247, 228)', 'rgb(222, 242, 216)', 'rgb(209, 237, 203)',
    'rgb(195, 232, 188)', 'rgb(180, 225, 173)', 'rgb(163, 218, 157)',
    'rgb(145, 210, 142)', 'rgb(125, 201, 126)',
]

# Shared transformer instances; the callbacks below reconfigure them per call.
doc2query = Doc2Query(MODEL, append=True, num_samples=5)
# Pass the model name explicitly so the live scorer is the one that the
# generated code snippet advertises (ElectraScorer(SCORE_MODEL)).
electra = ElectraScorer(SCORE_MODEL)
query_scorer = QueryScorer(electra)
query_filter = QueryFilter(t=0, append=False)

COLAB_NAME = 'pyterrier_doc2query.ipynb'
COLAB_INSTALL = '''
!pip install -q git+https://github.com/terrier-org/pyterrier
!pip install -q git+https://github.com/terrierteam/pyterrier_doc2query
'''.strip()
COLAB_INSTALL_MM = COLAB_INSTALL + '\n!pip install -q git+https://github.com/terrierteam/pyterrier_dr faiss-cpu'


def predict(input, model, append, num_samples):
    """Run doc2query on ``input`` and return (result, code markdown, vis).

    Args:
        input: frame passed to the Doc2Query transformer (presumably documents
            with ``docno`` and ``text`` columns, which ``generate_vis`` reads
            from the result -- confirm against EX_D).
        model: model name chosen in the UI; must equal ``MODEL``.
        append: value assigned to ``doc2query.append``.
        num_samples: value assigned to ``doc2query.num_samples``.

    Returns:
        A 3-tuple of the transformer output, the markdown produced by
        ``code2md`` for an equivalent code snippet, and the visualisation
        string from ``generate_vis``.
    """
    assert model == MODEL
    doc2query.append = append
    doc2query.num_samples = num_samples
    code = f'''import pandas as pd
from pyterrier_doc2query import Doc2Query
doc2query = Doc2Query({repr(model)}, append={append}, num_samples={num_samples})
doc2query({df2code(input)})
'''
    res = doc2query(input)
    vis = generate_vis(res)
    # Return the frame that was visualised instead of running the model a
    # second time: a second call doubles the cost and, if generation is
    # sampled, may not match what ``vis`` shows.
    return (res, code2md(code, COLAB_INSTALL, COLAB_NAME), vis)


def generate_vis(df):
    """Build a text visualisation of each document and its generated queries.

    For every row, the newline-separated ``querygen`` value is split into
    individual queries. When a ``querygen_score`` column exists, each query is
    prefixed with the percentile bucket of its score.

    Args:
        df: frame whose rows expose ``docno`` and ``text`` and, optionally,
            ``querygen`` and ``querygen_score``.

    Returns:
        One string with the per-document sections joined by newlines.
    """
    result = []
    for row in df.itertuples(index=False):
        qs = []
        if hasattr(row, 'querygen_score'):
            for q, score in zip(row.querygen.split('\n'), row.querygen_score):
                # Bucket index in 0..19: number of cut points below the score.
                bucket = np.searchsorted(PERCENTILES_BY_5, score)
                # NOTE(review): color is looked up but never interpolated into
                # the snippet below; presumably markup that used it was lost
                # from this template -- confirm and restore.
                color = COLORS[bucket]
                percentile = bucket * 5
                qs.append(f'''
{percentile}th {q}
''')
        elif hasattr(row, 'querygen'):
            for q in row.querygen.split('\n'):
                qs.append(f'''
{q}
''')
        qs = '\n'.join(qs)
        if qs:
            qs = f'''
Expansion Queries:
{qs} '''
        # NOTE(review): the replacement literal spans a line break in the
        # file as received, which is not valid in a single-quoted string; it
        # presumably held a line-break tag originally -- confirm.
        text = row.text.replace('\n', '\n')
        result.append(f'''
Document: {row.docno}
{text}
{qs}
''')
    return '\n'.join(result)


def predict_mm(input, model, num_samples, score_model, filter_pct):
    """Generate, score and optionally filter expansion queries.

    Args:
        input: frame passed to the pipeline (same expectations as ``predict``).
        model: model name chosen in the UI; must equal ``MODEL``.
        num_samples: value assigned to ``doc2query.num_samples``.
        score_model: scorer name chosen in the UI; must equal ``SCORE_MODEL``.
        filter_pct: percentile (multiple of 5) used as the filter threshold;
            ``0`` disables the filter stage.

    Returns:
        A 3-tuple of the pipeline output (with ``querygen_score`` rendered as
        a bracketed string), the markdown produced by ``code2md`` for an
        equivalent snippet, and the visualisation string.
    """
    assert model == MODEL
    assert score_model == SCORE_MODEL
    doc2query.append = False
    doc2query.num_samples = num_samples
    if filter_pct > 0:
        # filter_pct of 5 maps to index 0, 95 maps to index 18.
        query_filter.t = PERCENTILES_BY_5[filter_pct//5-1]
        pipeline = doc2query >> query_scorer >> query_filter
        code = f'''import pyterrier as pt ; pt.init()
import pandas as pd
from pyterrier_doc2query import Doc2Query, QueryScorer, QueryFilter
from pyterrier_dr import ElectraScorer
doc2query = Doc2Query({repr(model)}, append=False, num_samples={num_samples})
scorer = ElectraScorer({repr(score_model)})
pipeline = doc2query >> QueryScorer(scorer) >> QueryFilter(append=False, t={query_filter.t}) # use append=True when indexing; t={query_filter.t} is the {filter_pct}th percentile for generated queries on MS MARCO
pipeline({df2code(input)})
'''
    else:
        pipeline = doc2query >> query_scorer
        code = f'''import pyterrier as pt ; pt.init()
import pandas as pd
from pyterrier_doc2query import Doc2Query, QueryScorer
from pyterrier_dr import ElectraScorer
doc2query = Doc2Query({repr(model)}, append=False, num_samples={num_samples})
scorer = ElectraScorer({repr(score_model)})
pipeline = doc2query >> QueryScorer(scorer)
pipeline({df2code(input)})
'''
    res = pipeline(input)
    # Build the visualisation first: it needs the numeric scores, which are
    # replaced by display strings on the next line.
    vis = generate_vis(res)
    res['querygen_score'] = res['querygen_score'].apply(lambda x: '[ ' + ', '.join(str(v) for v in x) + ' ]')
    return (res, code2md(code, COLAB_INSTALL_MM, COLAB_NAME), vis)


interface(
    MarkdownFile('README.md'),
    Demo(
        predict,
        EX_D,
        [
            gr.Dropdown(
                choices=[MODEL],
                value=MODEL,
                label='Model',
                interactive=False,
            ),
            gr.Checkbox(
                value=doc2query.append,
                label="Append",
            ),
            gr.Slider(
                minimum=1,
                maximum=10,
                value=doc2query.num_samples,
                step=1.,
                label='# Queries'
            ),
        ],
    ),
    MarkdownFile('mm.md'),
    Demo(
        predict_mm,
        EX_D,
        [
            gr.Dropdown(
                choices=[MODEL],
                value=MODEL,
                label='Model',
                interactive=False,
            ),
            gr.Slider(
                minimum=1,
                maximum=10,
                value=doc2query.num_samples,
                step=1.,
                label='# Queries'
            ),
            gr.Dropdown(
                choices=[SCORE_MODEL],
                value=SCORE_MODEL,
                label='Scorer',
                interactive=False,
            ),
            gr.Slider(
                minimum=0,
                maximum=95,
                value=10,
                step=5,
                label='Filter (top % of queries)'
            ),
        ],
    ),
    MarkdownFile('wrapup.md'),
).launch(share=False)