import os
import argparse
import json
from tqdm import tqdm


def load_url_text_map(knowledge_store_dir, claim_id):
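    """Return a mapping from URL to its concatenated scraped text for one claim.

    Each knowledge store file is named <claim_id>.json and holds one JSON
    object per line with "url" and "url2text" (a list of text chunks) fields.
    Returns an empty dict if the file does not exist.
    """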
    url_text_map = {}
    knowledge_file = os.path.join(knowledge_store_dir, f"{claim_id}.json")

    if os.path.exists(knowledge_file):
        with open(knowledge_file, "r", encoding="utf-8") as f:
            for line in f:
                data = json.loads(line)
                url = data["url"]
                url2text = data["url2text"]
                concatenated_text = " ".join(url2text)
                url_text_map[url] = concatenated_text

    return url_text_map


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Add a scraped_text field to the prediction file."
    )
    parser.add_argument(
        "-i",
        "--veracity_prediction_file",
        default="data_store/dev_veracity_prediction.json",
        help="JSON file with the veracity predictions.",
    )
    parser.add_argument(
        "-o",
        "--output_file",
        default="data_store/dev_veracity_prediction_for_submission.json",
        help="JSON file with the veracity predictions and the scraped_text.",
    )
    parser.add_argument(
        "--knowledge_store_dir",
        type=str,
        help="Directory of knowledge store JSON files (one per claim) containing url2text.",
    )
    args = parser.parse_args()

    with open(args.veracity_prediction_file, "r", encoding="utf-8") as f:
        predictions = json.load(f)

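    # Each prediction entry carries a "claim_id" and a list of "evidence"
    # dicts; every evidence dict is keyed back to its source "url".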
    for claim in tqdm(predictions, desc="Processing claims"):
        claim_id = claim["claim_id"]
        url_text_map = load_url_text_map(args.knowledge_store_dir, claim_id)

        # Add scraped_text to each piece of evidence in the claim
        for evidence in claim["evidence"]:
            url = evidence["url"]
            scraped_text = url_text_map.get(url)
            if scraped_text:
                evidence["scraped_text"] = scraped_text
            else:
                print(
                    f"Warning: No scraped text found for claim_id {claim_id} and url {url}"
                )

    with open(args.output_file, "w", encoding="utf-8") as output_file:
        json.dump(predictions, output_file, ensure_ascii=False, indent=4)

    print(f"Updated JSON saved to {args.output_file}")