import argparse
import json
import os

from tqdm import tqdm


def load_url_text_map(knowledge_store_dir, claim_id):
    """Build a url -> concatenated scraped text map from the claim's knowledge store file.

    Each knowledge store file is JSONL: one JSON object per line, with a
    "url" field and a "url2text" field holding a list of text chunks.
    """
    url_text_map = {}
    knowledge_file = os.path.join(knowledge_store_dir, f"{claim_id}.json")
    if os.path.exists(knowledge_file):
        with open(knowledge_file, "r") as f:
            for line in f:
                data = json.loads(line)
                url = data["url"]
                url2text = data["url2text"]
                # Join the per-page text chunks into a single string.
                concatenated_text = " ".join(url2text)
                url_text_map[url] = concatenated_text
    return url_text_map


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Add a scraped_text field to the veracity prediction file."
    )
    parser.add_argument(
        "-i",
        "--veracity_prediction_file",
        default="data_store/dev_veracity_prediction.json",
        help="JSON file with the veracity predictions.",
    )
    parser.add_argument(
        "-o",
        "--output_file",
        default="data_store/dev_veracity_prediction_for_submission.json",
        help="JSON file with the veracity predictions and the scraped_text.",
    )
    parser.add_argument(
        "--knowledge_store_dir",
        type=str,
        help="Directory of JSONL files of the knowledge store containing url2text.",
    )
    args = parser.parse_args()

    with open(args.veracity_prediction_file) as f:
        predictions = json.load(f)

    for claim in tqdm(predictions, desc="Processing claims"):
        claim_id = claim["claim_id"]
        url_text_map = load_url_text_map(args.knowledge_store_dir, claim_id)

        # Process each piece of evidence in the claim and attach its
        # scraped text, matched by URL.
        for evidence in claim["evidence"]:
            url = evidence["url"]
            scraped_text = url_text_map.get(url)
            if scraped_text:
                evidence["scraped_text"] = scraped_text
            else:
                print(
                    f"Warning: No scraped text found for claim_id {claim_id} and url {url}"
                )

    with open(args.output_file, "w", encoding="utf-8") as output_file:
        json.dump(predictions, output_file, ensure_ascii=False, indent=4)

    print(f"Updated JSON saved to {args.output_file}")
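
# Example invocation, for reference. The script name and the knowledge store
# path are placeholders (the actual filename and directory layout depend on
# your setup); -i and -o fall back to the defaults above if omitted:
#
#   python prepare_submission.py \
#       -i data_store/dev_veracity_prediction.json \
#       -o data_store/dev_veracity_prediction_for_submission.json \
#       --knowledge_store_dir data_store/knowledge_store/dev
#
# The knowledge store directory is expected to contain one <claim_id>.json
# JSONL file per claim, each line shaped like:
#   {"url": "https://example.com/page", "url2text": ["chunk 1", "chunk 2"]}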