In [1]:
import os
import sys
from pathlib import Path

# Make the parent of the kernel's starting directory both the working
# directory and importable. The directory is resolved only on the first run:
# recomputing `Path.cwd().parent` after the chdir below would climb one more
# level every time this cell is re-executed.
if "workding_dir" not in globals():
    workding_dir = str(Path.cwd().parent)
os.chdir(workding_dir)
# Avoid stacking duplicate sys.path entries when the cell is re-run.
if workding_dir not in sys.path:
    sys.path.append(workding_dir)
print("workding dir:", workding_dir)

workding dir: /Users/inflaton/code/emtech/gpt/llm-qa-bench


In [2]:
from datasets import load_from_disk

# Load the Hugging Face dataset stored in this directory; the relative path
# is resolved against the current working directory.
# NOTE(review): "ms_macro" presumably refers to MS MARCO — confirm before
# renaming, since the path must match what is on disk.
new_ds = load_from_disk("./Llama-2-eval/data/datasets/ms_macro/")
# Show the dataset summary (features and row count) as the cell output.
new_ds

  from .autonotebook import tqdm as notebook_tqdm


Dataset({
    features: ['answers', 'passages', 'query', 'query_id', 'query_type', 'wellFormedAnswers'],
    num_rows: 500
})

In [3]:
# Tally rows per `query_type` and, for each type, record the row index at
# which its running count reaches `size`.
size = 100
counts = {}
indices = {}
for i in range(new_ds.num_rows):
    row = new_ds[i]
    query_type = row["query_type"]
    # First occurrence starts at 1; later ones increment the running total.
    counts[query_type] = counts.get(query_type, 0) + 1
    if counts[query_type] == size:
        indices[query_type] = i
counts, indices

({'NUMERIC': 100,
  'DESCRIPTION': 100,
  'ENTITY': 100,
  'PERSON': 100,
  'LOCATION': 100},
 {'NUMERIC': 179,
  'DESCRIPTION': 215,
  'ENTITY': 443,
  'LOCATION': 461,
  'PERSON': 499})

In [4]:
# Convert the Hugging Face dataset into a pandas DataFrame for the
# column-wise edits in the following cells.
df = new_ds.to_pandas()

In [5]:
# Preview the first rows to check the columns produced by the conversion.
df.head()

Unnamed: 0,answers,passages,query,query_id,query_type,wellFormedAnswers
0,"[2,662]","{'is_selected': [0, 0, 0, 1, 0, 0, 0, 0], 'pas...",albany mn population,15177,NUMERIC,"[The population of Albany, Minnesota is 2,662. ]"
1,[The Volcano forecast for Apr 12 is 52 degrees...,"{'is_selected': [1, 0, 1, 0, 0, 0, 0, 1, 0, 0]...","current weather in volcano, ca",114414,DESCRIPTION,[The Volcano forecast for Apr 12 is 52 degrees...
2,[Hippocrates],"{'is_selected': [0, 0, 0, 0, 0, 1, 0, 0, 0, 0]...",____________________ is considered the father ...,9083,DESCRIPTION,[Hippocrates is considered the father of moder...
3,[120 days from the date of the Note.],"{'is_selected': [0, 1, 0, 0, 0, 0, 0, 0, 0, 0]...",how many days is an appraisal good for a fanni...,281439,NUMERIC,[An appraisal is good for 120 days from the da...
4,"[From $26,000 to $39,000 a year]","{'is_selected': [0, 1, 0, 0, 0, 0, 0, 0, 0, 0]...",average pharmacy tech salary,40287,NUMERIC,[The average salary for a pharmacy technician ...


In [6]:
# Rename `query` -> `question` and `query_id` -> `id`. Rebinding `df` to the
# returned frame (instead of `inplace=True`) keeps the step chainable and
# leaves the frame object produced by the earlier cell unmodified.
df = df.rename(columns={"query": "question", "query_id": "id"})

In [7]:
# Confirm the rename: the preview should now show `question` and `id` columns.
df.head()

Unnamed: 0,answers,passages,question,id,query_type,wellFormedAnswers
0,"[2,662]","{'is_selected': [0, 0, 0, 1, 0, 0, 0, 0], 'pas...",albany mn population,15177,NUMERIC,"[The population of Albany, Minnesota is 2,662. ]"
1,[The Volcano forecast for Apr 12 is 52 degrees...,"{'is_selected': [1, 0, 1, 0, 0, 0, 0, 1, 0, 0]...","current weather in volcano, ca",114414,DESCRIPTION,[The Volcano forecast for Apr 12 is 52 degrees...
2,[Hippocrates],"{'is_selected': [0, 0, 0, 0, 0, 1, 0, 0, 0, 0]...",____________________ is considered the father ...,9083,DESCRIPTION,[Hippocrates is considered the father of moder...
3,[120 days from the date of the Note.],"{'is_selected': [0, 1, 0, 0, 0, 0, 0, 0, 0, 0]...",how many days is an appraisal good for a fanni...,281439,NUMERIC,[An appraisal is good for 120 days from the da...
4,"[From $26,000 to $39,000 a year]","{'is_selected': [0, 1, 0, 0, 0, 0, 0, 0, 0, 0]...",average pharmacy tech salary,40287,NUMERIC,[The average salary for a pharmacy technician ...


In [8]:
import numpy as np


def generate_context(row, debug=False):
    """Build one context string from the passages attached to a row.

    Args:
        row: Mapping-like record (for example a DataFrame row) with a
            "passages" entry whose "passage_text" field is an iterable of
            strings. "question" is read only when ``debug`` is true.
        debug: When true, print the question and the raw passages first.

    Returns:
        The passage texts joined with a blank line between them.
    """
    row_passages = row["passages"]

    if debug:
        print("question:", row["question"])
        print(row_passages)

    # A blank line between passages keeps each one a separate paragraph.
    return "\n\n".join(row_passages["passage_text"])

In [9]:
%%time
# Smoke-test generate_context on the first row; debug=True also prints the
# question and the raw passages before the joined context is printed.
context = generate_context(df.iloc[0], debug=True)
print(context)

question: albany mn population
{'is_selected': array([0, 0, 0, 1, 0, 0, 0, 0], dtype=int32), 'passage_text': array(['City of Albany, MN Zip Codes. City of Albany, MN Demographic Information. * Demographic data is based on information taken from the 2000 Census. City of Albany, MN covers 1 Area Code. City of Albany, MN covers 1 Zip Code. 15 Cities within 15 Miles of the City of Albany, MN.',
       'Place of birth for U.S.-born residents: 70% of the 56307 zip code residents lived in the same house 5 years ago. Out of people who lived in different houses, 71% lived in this county. Out of people who lived in different counties, 50% lived in Minnesota. 92% of the 56307 zip code residents lived in the same house 1 year ago.',
       'For the unincorporated community in southeast Minnesota named West Albany, see West Albany, Minnesota. Albany is a city in Stearns County, Minnesota, United States. The population was 2,561 at the 2010 census. It is part of the St. Cloud Metropolitan Statistica

In [10]:
%%time
from tqdm import tqdm

# Register tqdm's `progress_apply` on pandas objects, then build the
# `context` column by passing each row (axis=1) to generate_context.
tqdm.pandas()
df["context"] = df.progress_apply(generate_context, axis=1)

100%|██████████| 500/500 [00:00<00:00, 213125.20it/s]

CPU times: user 3.19 ms, sys: 1.47 ms, total: 4.67 ms
Wall time: 4.01 ms





In [11]:
# Confirm the new `context` column is present in the preview.
df.head()

Unnamed: 0,answers,passages,question,id,query_type,wellFormedAnswers,context
0,"[2,662]","{'is_selected': [0, 0, 0, 1, 0, 0, 0, 0], 'pas...",albany mn population,15177,NUMERIC,"[The population of Albany, Minnesota is 2,662. ]","City of Albany, MN Zip Codes. City of Albany, ..."
1,[The Volcano forecast for Apr 12 is 52 degrees...,"{'is_selected': [1, 0, 1, 0, 0, 0, 0, 1, 0, 0]...","current weather in volcano, ca",114414,DESCRIPTION,[The Volcano forecast for Apr 12 is 52 degrees...,Volcano 10 Day Weather. Sunday:The Volcano for...
2,[Hippocrates],"{'is_selected': [0, 0, 0, 0, 0, 1, 0, 0, 0, 0]...",____________________ is considered the father ...,9083,DESCRIPTION,[Hippocrates is considered the father of moder...,Hippocrates is widely considered to be the Fat...
3,[120 days from the date of the Note.],"{'is_selected': [0, 1, 0, 0, 0, 0, 0, 0, 0, 0]...",how many days is an appraisal good for a fanni...,281439,NUMERIC,[An appraisal is good for 120 days from the da...,New and Updated Underwriting and Eligibility P...
4,"[From $26,000 to $39,000 a year]","{'is_selected': [0, 1, 0, 0, 0, 0, 0, 0, 0, 0]...",average pharmacy tech salary,40287,NUMERIC,[The average salary for a pharmacy technician ...,If you are interested in becoming a pharmacy t...


In [14]:
# Select the output columns in their final order. This also drops the raw
# `passages` column, whose passage text was already joined into `context`.
column_order = ["id", "question", "answers", "wellFormedAnswers", "context", "query_type"]
df = df[column_order]
df.head()

Unnamed: 0,id,question,answers,wellFormedAnswers,context,query_type
0,15177,albany mn population,"[2,662]","[The population of Albany, Minnesota is 2,662. ]","City of Albany, MN Zip Codes. City of Albany, ...",NUMERIC
1,114414,"current weather in volcano, ca",[The Volcano forecast for Apr 12 is 52 degrees...,[The Volcano forecast for Apr 12 is 52 degrees...,Volcano 10 Day Weather. Sunday:The Volcano for...,DESCRIPTION
2,9083,____________________ is considered the father ...,[Hippocrates],[Hippocrates is considered the father of moder...,Hippocrates is widely considered to be the Fat...,DESCRIPTION
3,281439,how many days is an appraisal good for a fanni...,[120 days from the date of the Note.],[An appraisal is good for 120 days from the da...,New and Updated Underwriting and Eligibility P...,NUMERIC
4,40287,average pharmacy tech salary,"[From $26,000 to $39,000 a year]",[The average salary for a pharmacy technician ...,If you are interested in becoming a pharmacy t...,NUMERIC


In [15]:
# Write the frame as a JSON array of row objects (orient="records"),
# pretty-printed with a 4-space indent.
df.to_json("./data/datasets/ms_macro.json", orient="records", indent=4)