Manu101 committed on
Commit
258e5d5
1 Parent(s): b4804f8

Upload utilities.py

Files changed (1)
  1. utilities.py +180 -0
utilities.py ADDED
@@ -0,0 +1,180 @@
+ import fileinput
+ import io
+ import json
+ import os
+ import pathlib
+ import sys
+ from functools import wraps
+ from typing import List, Union
+
+ # import google.auth
+
+
+ class Logger(object):
+     """Tee-style logger: writes every message to the terminal and appends it to a log file."""
+
+     def __init__(self, filename="Default.log"):
+         self.terminal = sys.stdout
+         self.log = open(filename, "a")
+
+     def write(self, message):
+         self.terminal.write(message)
+         self.log.write(message)
+
+     def flush(self):
+         pass  # no-op; present so the object satisfies the file-like interface expected of sys.stdout
+
+
+ def log_to_file(file_name="Default.log"):
+     def decorator(func):
+         @wraps(func)
+         def wrapper(*args, **kwargs):
+             # Save the current stdout and stderr
+             original_stdout = sys.stdout
+             original_stderr = sys.stderr
+
+             # Redirect stdout and stderr to the log file
+             logger = Logger(file_name)
+             sys.stdout = logger
+             sys.stderr = logger
+
+             try:
+                 # Call the original function
+                 result = func(*args, **kwargs)
+                 return result
+             finally:
+                 # Reset stdout and stderr
+                 sys.stdout = original_stdout
+                 sys.stderr = original_stderr
+
+         return wrapper
+
+     return decorator
+
+
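+ # Example usage of log_to_file (illustrative sketch; the function name `train` and the
+ # file name "run.log" are hypothetical, not part of this module):
+ #
+ #   @log_to_file("run.log")
+ #   def train():
+ #       print("this line goes to the terminal and to run.log")
+ #
+ #   train()
+
+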
+ # doesn't work directly, need to set up Google Cloud credentials if not present
+ # src: https://developers.google.com/drive/api/guides/manage-downloads#download-content
+ # also requires the Google API client helpers used below, e.g.
+ #   from googleapiclient.discovery import build
+ #   from googleapiclient.errors import HttpError
+ #   from googleapiclient.http import MediaIoBaseDownload
+ # def download_file(real_file_id):
+ #     # dataset link: https://drive.google.com/drive/folders/1KD7v4eW2ZKQ0Re_6lXRuaaVswvS3IFIh?usp=sharing
+ #     """Downloads a file
+ #     Args:
+ #         real_file_id: ID of the file to download
+ #     Returns : IO object with location.
+ #
+ #     Load pre-authorized user credentials from the environment.
+ #     TODO(developer) - See https://developers.google.com/identity
+ #     for guides on implementing OAuth2 for the application.
+ #     """
+ #     creds, _ = google.auth.default()
+ #
+ #     try:
+ #         # create drive api client
+ #         service = build("drive", "v3", credentials=creds)
+ #
+ #         file_id = real_file_id
+ #
+ #         # pylint: disable=maybe-no-member
+ #         request = service.files().get_media(fileId=file_id)
+ #         file = io.BytesIO()
+ #         downloader = MediaIoBaseDownload(file, request)
+ #         done = False
+ #         while done is False:
+ #             status, done = downloader.next_chunk()
+ #             print(f"Download {int(status.progress() * 100)}.")
+ #
+ #     except HttpError as error:
+ #         print(f"An error occurred: {error}")
+ #         file = None
+ #
+ #     return file.getvalue()
+
+
+ def read_from_all_files(all_files_to_read: List[Union[str, pathlib.Path]], batch_size: int = 1000,
+                         batch_num: int = None,
+                         encoding: str = "utf-8",
+                         reading_only_specific_files: List[str] = None) -> List:
+     """
+     Basic generator that yields batches of lines; leverages the built-in fileinput module so all
+     files are read through the same file object.
+     :param all_files_to_read: list of file paths, str or Path
+     :param batch_size: maximum number of lines per yielded batch
+     :param batch_num: number of batches to yield before stopping, added later for testing
+     :param encoding: text encoding used to read the files
+     :param reading_only_specific_files: optional list of substrings; only files whose path contains all of them are read
+     :return: yields lists of text lines
+     """
+     print("\n=========\nReading dataset\n=============")
+     counter = 0
+     if reading_only_specific_files:
+         # rebuild the list instead of popping while iterating, which would skip entries
+         all_files_to_read = [f_name for f_name in all_files_to_read
+                              if all(x in str(f_name) for x in reading_only_specific_files)]
+
+     print(f"\nCount of files to read...{len(all_files_to_read)}")
+     all_files_to_read = sorted(all_files_to_read)
+     with fileinput.input(files=all_files_to_read,
+                          encoding=encoding) as f:  # built-in fileinput reads all files through one file object
+
+         batch = []
+         for line in f:
+             # print(f"file number: {f.fileno()}")
+             # print(f"file-line number: {f.filelineno()}")
+             # print(line)
+             if line != '\n':
+                 batch.append(line)
+             if len(batch) == batch_size:
+                 counter += 1
+                 yield batch
+                 batch = []  # clear before the early-exit check so the last batch is not yielded twice
+                 if batch_num and counter == batch_num:
+                     break
+         if batch:
+             yield batch
+     print(f"\nFinal counter value: {counter}")
+     print("\n=========\nReading dataset done\n=============")
+
+
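+ # Example usage (illustrative sketch; the "./dataset" directory and `process` helper are
+ # hypothetical, not defined in this module):
+ #
+ #   files = get_all_text_dataset("./dataset", file_type=".txt")
+ #   for batch in read_from_all_files(files, batch_size=1000):
+ #       process(batch)
+
+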
+ def read_chunks_from_file(file_path, chunk_size=4 * 1024 * 1024, encoding="utf-8"):
+     """
+     Helper function to yield chunk_size pieces of data read from the given file_path.
+     """
+     file_path = os.path.abspath(file_path)
+     with open(file_path, 'r', encoding=encoding) as f:
+         # the file is opened in text mode, so the sentinel must be '' (not b'')
+         for chunk in iter(lambda: f.read(chunk_size), ''):
+             yield chunk
+
+
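+ # Example usage (illustrative; "big_corpus.txt" is a hypothetical file name):
+ #
+ #   for chunk in read_chunks_from_file("big_corpus.txt"):
+ #       print(len(chunk))
+
+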
+ def get_all_text_dataset(path: str | pathlib.Path, file_type=".txt") -> List:
+     """
+     Helper function to get all files of the given format under a path or root directory;
+     uses rglob to find them recursively. Note: it first converts the crawler's JSON output
+     under ./web-scrapper into a combined text file before globbing.
+     :param path: str or Path object, root directory for a dataset
+     :param file_type: format of files to get
+     :return: list of paths of all files of the specified format
+     """
+     files = []
+     # first convert json data to text and then process text
+     convert_json_data_to_text_and_process_text(dir_path="./web-scrapper",
+                                                file_type=".json",
+                                                output_file_path="./dataset/combined_from_crawler-json.txt")
+
+     for txt_file in pathlib.Path(path).rglob('*' + file_type):
+         files.append(txt_file)
+     return files
+
+
+ # def get_data_batch(all_files, chunk_size=100 * 1024 * 1024, formats=".txt"):
+ #     for file in all_files:
+ #         yield from read_chunks_from_file(file)
+
+
+ def convert_json_data_to_text_and_process_text(dir_path, file_type=".json", output_file_path="crawler_data.txt"):
+     """
+     Helper function to convert JSON data to text: each JSON file under dir_path is expected to hold a
+     list of records whose "text" field is a list of strings; each record is joined with spaces and
+     written as one line of the output text file.
+     """
+
+     with open(output_file_path, "w", encoding="utf-8") as f_out:
+         for json_file in pathlib.Path(dir_path).rglob('*' + file_type):
+             with open(json_file, "r", encoding="utf-8") as f:
+                 data = json.load(f)
+                 for item in data:
+                     f_out.write(" ".join(item["text"]) + "\n")
+
+
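+ # Expected JSON shape (inferred from the loop above; an illustrative example, not a
+ # sample of the actual crawler output):
+ #
+ #   [
+ #       {"text": ["First sentence.", "Second sentence."]},
+ #       {"text": ["Another page of scraped sentences."]}
+ #   ]
+
+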
+ if __name__ == "__main__":
+     # NOTE: download_file is commented out above; uncomment it and install/configure the
+     # Google Drive API client before running this entry point, otherwise this raises NameError.
+     download_file(real_file_id="1KD7v4eW2ZKQ0Re_6lXRuaaVswvS3IFIh")