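"""Add a `scraped_text` field to each evidence entry in a veracity prediction
file, pulling the text from the per-claim knowledge-store files."""
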
import argparse
import json
import os

from tqdm import tqdm


def load_url_text_map(knowledge_store_dir, claim_id):
    """Map each URL in a claim's knowledge-store file to its scraped text.

    The knowledge store holds one JSON-lines file per claim; each line is a
    JSON object with a "url" and a "url2text" list of scraped text chunks.
    """
    url_text_map = {}
    knowledge_file = os.path.join(knowledge_store_dir, f"{claim_id}.json")
    if os.path.exists(knowledge_file):
        with open(knowledge_file, "r") as f:
            for line in f:
                data = json.loads(line)
                url = data["url"]
                url2text = data["url2text"]
                # Concatenate the text chunks into a single string per URL.
                url_text_map[url] = " ".join(url2text)
    return url_text_map
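
# Illustrative knowledge-store line (the keys match what load_url_text_map
# reads; the concrete values here are hypothetical):
#   {"url": "https://example.com/article", "url2text": ["First chunk.", "Second chunk."]}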


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Add a scraped_text field to the prediction file."
    )
    parser.add_argument(
        "-i",
        "--veracity_prediction_file",
        default="data_store/dev_veracity_prediction.json",
        help="JSON file with the veracity predictions.",
    )
    parser.add_argument(
        "-o",
        "--output_file",
        default="data_store/dev_veracity_prediction_for_submission.json",
        help="JSON file with the veracity predictions and the scraped_text.",
    )
    parser.add_argument(
        "--knowledge_store_dir",
        type=str,
        help="Directory of per-claim JSON files in the knowledge store, each containing url2text.",
    )
    args = parser.parse_args()
    with open(args.veracity_prediction_file) as f:
        predictions = json.load(f)
    for claim in tqdm(predictions, desc="Processing claims"):
        claim_id = claim["claim_id"]
        url_text_map = load_url_text_map(args.knowledge_store_dir, claim_id)
        # Process each evidence entry in the claim and attach its scraped_text.
        for evidence in claim["evidence"]:
            url = evidence["url"]
            scraped_text = url_text_map.get(url)
            if scraped_text:
                evidence["scraped_text"] = scraped_text
            else:
                print(
                    f"Warning: No scraped text found for claim_id {claim_id} and url {url}"
                )
    with open(args.output_file, "w", encoding="utf-8") as output_file:
        json.dump(predictions, output_file, ensure_ascii=False, indent=4)
    print(f"Updated JSON saved to {args.output_file}")