Upload utilities.py
utilities.py  ADDED  +180 -0
@@ -0,0 +1,180 @@
import fileinput
import io
import json
import os
import pathlib
import sys
from functools import wraps
from typing import List, Union

# import google.auth


class Logger(object):
    def __init__(self, filename="Default.log"):
        self.terminal = sys.stdout
        self.log = open(filename, "a")

    def write(self, message):
        self.terminal.write(message)
        self.log.write(message)

    def flush(self):
        pass


def log_to_file(file_name="Default.log"):
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            # Save the current stdout and stderr
            original_stdout = sys.stdout
            original_stderr = sys.stderr

            # Redirect stdout and stderr to the log file
            logger = Logger(file_name)
            sys.stdout = logger
            sys.stderr = logger

            try:
                # Call the original function
                result = func(*args, **kwargs)
                return result
            finally:
                # Reset stdout and stderr
                sys.stdout = original_stdout
                sys.stderr = original_stderr

        return wrapper

    return decorator


# doesn't work directly, need to setup Google Cloud credentials if not present
# src: https://developers.google.com/drive/api/guides/manage-downloads#download-content
# def download_file(real_file_id):
#     # dataset link: https://drive.google.com/drive/folders/1KD7v4eW2ZKQ0Re_6lXRuaaVswvS3IFIh?usp=sharing
#     """Downloads a file
#     Args:
#         real_file_id: ID of the file to download
#     Returns : IO object with location.
#
#     Load pre-authorized user credentials from the environment.
#     TODO(developer) - See https://developers.google.com/identity
#     for guides on implementing OAuth2 for the application.
#     """
#     creds, _ = google.auth.default()
#
#     try:
#         # create drive api client
#         service = build("drive", "v3", credentials=creds)
#
#         file_id = real_file_id
#
#         # pylint: disable=maybe-no-member
#         request = service.files().get_media(fileId=file_id)
#         file = io.BytesIO()
#         downloader = MediaIoBaseDownload(file, request)
#         done = False
#         while done is False:
#             status, done = downloader.next_chunk()
#             print(f"Download {int(status.progress() * 100)}.")
#
#     except HttpError as error:
#         print(f"An error occurred: {error}")
#         file = None
#
#     return file.getvalue()


def read_from_all_files(all_files_to_read: List[Union[str, pathlib.Path]], batch_size: int = 1000,
                        batch_num: int = None,
                        encoding: str = "utf-8",
                        reading_only_specific_files: List[str] = None) -> List:
    """
    Basic generator that yields batches of lines; leverages the built-in fileinput module to read all files through a single file object.
    :param all_files_to_read: list of file paths, str or Path
    :param batch_size: the maximum number of lines to yield per batch
    :param batch_num: the number of batches to yield before stopping, added later for testing
    :param encoding: text encoding used to read the files
    :param reading_only_specific_files: substrings that must all appear in a file name for it to be read
    :return: List of text lines
    """
    print("\n=========\nReading dataset\n=============")
    counter = 0
    if reading_only_specific_files:
        # keep only the files whose names contain all of the requested substrings
        all_files_to_read = [f_name for f_name in all_files_to_read
                             if all(x in str(f_name) for x in reading_only_specific_files)]

    print(f"\nCount of files to read...{len(all_files_to_read)}")
    all_files_to_read = sorted(all_files_to_read)
    with fileinput.input(files=all_files_to_read,
                         encoding=encoding) as f:  # built-in fileinput reads all files efficiently through one object

        batch = []
        for line in f:
            # print(f"file number: {f.fileno()}")
            # print(f"file-line number: {f.filelineno()}")
            # print(line)
            if line != '\n':  # skip empty lines
                batch.append(line)
            if len(batch) == batch_size:
                counter += 1
                yield batch
                batch = []  # reset before the early-stop check so the last batch is not yielded twice
                if batch_num and counter == batch_num:
                    break
        if batch:
            yield batch
    print(f"\nFinal counter value: {counter}")
    print("\n=========\nReading dataset done\n=============")


def read_chunks_from_file(file_path, chunk_size=4 * 1024 * 1024, encoding="utf-8"):
    """
    Helper function to yield chunk_size pieces of data read from the given file_path.
    """
    file_path = os.path.abspath(file_path)
    with open(file_path, 'r', encoding=encoding) as f:
        # the file is opened in text mode, so the sentinel must be an empty str, not bytes
        for chunk in iter(lambda: f.read(chunk_size), ''):
            yield chunk


def get_all_text_dataset(path: str | pathlib.Path, file_type=".txt") -> List:
    """
    Helper function to get all .txt files given a path or root directory; uses rglob to recursively find files of the given format.
    :param path: str or Path object, root directory for a dataset
    :param file_type: format of files to get
    :return: list of paths of all files of the specified format
    """
    files = []
    # first convert json data to text and then process text
    convert_json_data_to_text_and_process_text(dir_path="./web-scrapper",
                                               file_type=".json",
                                               output_file_path="./dataset/combined_from_crawler-json.txt")

    for txt_file in pathlib.Path(path).rglob('*' + file_type):
        files.append(txt_file)
    return files


# def get_data_batch(all_files, chunk_size=100 * 1024 * 1024, formats=".txt"):
#     for file in all_files:
#         yield from read_chunks_from_file(file)


def convert_json_data_to_text_and_process_text(dir_path, file_type=".json", output_file_path="crawler_data.txt"):
    """
    Helper function to convert JSON data to text and then process the text
    """
    with open(output_file_path, "w", encoding="utf-8") as f_out:
        for json_file in pathlib.Path(dir_path).rglob('*' + file_type):
            with open(json_file, "r", encoding="utf-8") as f:
                data = json.load(f)
                for item in data:
                    f_out.write(" ".join(item["text"]) + "\n")


if __name__ == "__main__":
    # download_file is currently commented out above (it requires Google Cloud credentials),
    # so the call below is disabled to keep the module importable and runnable.
    # download_file(real_file_id="1KD7v4eW2ZKQ0Re_6lXRuaaVswvS3IFIh")
    pass
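A minimal usage sketch combining the helpers above. The count_lines wrapper, the run.log file name, and the ./dataset root are illustrative assumptions, not part of utilities.py; it also assumes the ./dataset and ./web-scrapper directories referenced inside get_all_text_dataset exist locally.

from utilities import get_all_text_dataset, log_to_file, read_from_all_files

@log_to_file("run.log")  # tee stdout/stderr of this run into run.log (assumed file name)
def count_lines(dataset_root="./dataset"):  # assumed dataset location
    # collect all .txt files under the dataset root, then stream them in batches of lines
    files = get_all_text_dataset(dataset_root, file_type=".txt")
    total = 0
    for batch in read_from_all_files(files, batch_size=1000):
        total += len(batch)
    print(f"Non-empty lines read: {total}")
    return total

if __name__ == "__main__":
    count_lines()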