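"""Scrape text from the URLs listed in a TSV file of search results.

For each row of the input TSV, the script fetches the URL, extracts its
text (via PyMuPDF for PDFs, url2lines otherwise), and appends one JSON
object per row to <json_output_dir>/<claim_id>.json, where claim_id is
the input file's base name. Existing output files are resumed unless
--overwrite_out_file is given.

Example invocation (file names are illustrative):

    python scrape_text_from_urls.py -i search_results/123.tsv -o output
"""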
import argparse
import csv
import json
import os
import time

import fitz  # PyMuPDF
import numpy as np
import pandas as pd
import requests

from src.retrieval.html2lines import url2lines, line_correction

# Raise the csv field size limit: scraped rows can contain very long fields.
csv.field_size_limit(100000000)

MAX_RETRIES = 3
TIMEOUT = 5  # per-request time limit, in seconds


def scrape_text_from_url(url, temp_name):
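    """Fetch `url` and return its extracted text as a list of lines.

    PDF responses are saved to pdf_dir/<temp_name>.pdf and parsed with
    PyMuPDF; other pages go through url2lines. Returns an empty list if
    the URL cannot be fetched or the server answers 503.
    """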
    response = None
    for attempt in range(MAX_RETRIES):
        try:
            response = requests.get(url, timeout=TIMEOUT)
            break  # success, stop retrying
        except requests.RequestException:
            if attempt < MAX_RETRIES - 1:
                time.sleep(3)  # back off before retrying

    if response is None or response.status_code == 503:
        # trafilatura does not retry on 503 responses, and servers returning
        # them often ask clients to wait up to 24 hours, so skip these URLs.
        return []

    if url.endswith(".pdf"):
        os.makedirs("pdf_dir", exist_ok=True)
        pdf_path = f"pdf_dir/{temp_name}.pdf"
        with open(pdf_path, "wb") as f:
            f.write(response.content)

        extracted_text = ""
        with fitz.open(pdf_path) as doc:
            for page in doc:  # iterate over the document pages
                extracted_text += page.get_text()

        return line_correction(extracted_text.split("\n"))

    return line_correction(url2lines(url))


if __name__ == "__main__":

    parser = argparse.ArgumentParser(description="Scraping text from URL")
    parser.add_argument(
        "-i",
        "--tsv_input_file",
        type=str,
        required=True,
        help="Path of the input .tsv file containing URLs from Google search.",
    )
    parser.add_argument(
        "-o",
        "--json_output_dir",
        type=str,
        default="output",
        help="Directory in which to save the scraped data as <claim_id>.json.",
    )
    parser.add_argument(
        "--overwrite_out_file",
        action="store_true",
        help="Overwrite an existing output file instead of resuming from it.",
    )

    args = parser.parse_args()

    assert (
        os.path.splitext(args.tsv_input_file)[-1] == ".tsv"
    ), "The input should be a tsv file."

    os.makedirs(args.json_output_dir, exist_ok=True)

    total_scraped, empty, total_failed = 0, 0, 0

    print(f"Processing file {args.tsv_input_file}")

    st = time.time()

    claim_id = os.path.splitext(os.path.basename(args.tsv_input_file))[0]
    json_output_path = os.path.join(args.json_output_dir, f"{claim_id}.json")

    # Resume support: if the output file already exists and we are not
    # overwriting it, count the rows already written so they can be skipped.
    lines_skipped = 0
    if os.path.exists(json_output_path):
        if args.overwrite_out_file:
            os.remove(json_output_path)
        else:
            with open(json_output_path, "r", encoding="utf-8") as json_file:
                existing_data = json_file.readlines()
                lines_skipped = len(existing_data)
                print(f"    Skipping {lines_skipped} lines in {json_output_path}")

    # Some tsv files fail to parse cleanly; fall back from pandas to NumPy to
    # the csv module until one of them succeeds.
    try:
        df = pd.read_csv(args.tsv_input_file, sep="\t", header=None)
        data = df.values
        print("Data loaded successfully with Pandas.")

    except Exception as e:
        print("Error loading with Pandas:", e)
        try:
            data = np.genfromtxt(
                args.tsv_input_file, delimiter="\t", dtype=None, encoding=None
            )
            print("Data loaded successfully with NumPy.")
        except Exception as e:
            print("Error loading with NumPy:", e)
            # If NumPy also fails, attempt to load with the csv module
            try:
                data = []
                with open(args.tsv_input_file, "r", newline="") as tsvfile:
                    reader = csv.reader(tsvfile, delimiter="\t")
                    for row in reader:
                        data.append(row)
                print("Data loaded successfully with csv.")
            except Exception as e:
                print("Error loading with csv:", e)
                raise SystemExit(f"Could not load {args.tsv_input_file} with any loader.")

    if len(data) == lines_skipped:
        print("    No more lines need to be processed!")
    else:
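        # Open in append mode so rows written before an earlier interruption
        # are preserved and new rows land after them.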
        with open(json_output_path, "a", encoding="utf-8") as json_file:
            for index, row in enumerate(data):
                if index < lines_skipped:
                    continue
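                # Expected TSV columns (inferred from usage below): row[1] is
                # the search type, row[2] the URL, row[3] the search query.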
                url = row[2]
                json_data = {
                    "claim_id": claim_id,
                    "type": row[1],
                    "query": row[3],
                    "url": url,
                    "url2text": [],
                }
                print(f"Scraping text for url_{index}: {url}")
                try:
                    scrape_result = scrape_text_from_url(url, claim_id)
                    json_data["url2text"] = scrape_result

                    if len(json_data["url2text"]) > 0:
                        total_scraped += 1
                    else:
                        empty += 1

                except Exception as e:
                    print(f"Error scraping {url}: {e}")
                    total_failed += 1

                json_file.write(json.dumps(json_data, ensure_ascii=False) + "\n")
                json_file.flush()

        print(f"Output for {args.tsv_input_file} saved to {json_output_path}")
        elapsed_time = time.time() - st
        elapsed_minutes = int(elapsed_time // 60)
        elapsed_seconds = int(elapsed_time % 60)
        print(f"Time elapsed: {elapsed_minutes}min {elapsed_seconds}sec")
        print(f"{total_scraped} scraped, {empty} empty, {total_failed} failed")