bluuebunny committed on
Commit
71571a9
1 Parent(s): 941aca1

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +144 -0
app.py ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import boto3
2
+ import os
3
+ import zipfile
4
+ from glob import glob
5
+ import shutil
6
+ from huggingface_hub import HfApi
7
+ import gradio as gr
8
+ from tqdm.auto import tqdm
9
+ import threading
10
+
11
+
12
+ ################################################################################
13
+
14
# Declarations: module-level settings shared by the rest of the script.
print("Declaring variables.")

# AWS side: service, region and the two source buckets.
service_name = "s3"
region_name = "us-east-1"
biorxiv_bucket_name = "biorxiv-src-monthly"
medrxiv_bucket_name = "medrxiv-src-monthly"

# Hugging Face side: name of the destination repository.
destination_repo_name = "xml-dump-monthly"
28
+
29
+ ################################################################################
30
+
31
print("Initiating clients.")

# Create the S3 client. The service name comes from the module-level
# `service_name` constant instead of a duplicated literal; the credentials
# are read from the environment.
s3_client = boto3.client(
    service_name=service_name,
    region_name=region_name,
    aws_access_key_id=os.getenv('AWS_ACCESS_KEY_ID'),
    aws_secret_access_key=os.getenv('AWS_SECRET_ACCESS_KEY'),
)

# Paginator for the `list_objects_v2` operation, used to list bucket contents.
paginator = s3_client.get_paginator('list_objects_v2')

# Create a Hugging Face API client from the HF_API_KEY environment variable.
access_token = os.getenv('HF_API_KEY')
hugging_face_api = HfApi(token=access_token)

# Create the public dataset repo; exist_ok=True so that an already existing
# repo is not treated as an error.
hugging_face_api.create_repo(
    repo_id=destination_repo_name,
    repo_type="dataset",
    private=False,
    exist_ok=True,
)

# Extract the Hugging Face username and build the fully qualified repo id
# ("<username>/<repo name>") used for uploads.
username = hugging_face_api.whoami()['name']
repo_id = f"{username}/{destination_repo_name}"
57
+
58
+ ################################################################################
59
+
60
+
61
+
62
+ ################################################################################
63
def download_medrxiv(Prefix=""):
    """Collect the XML files of a medRxiv S3 prefix and upload them as one zip.

    Lists every object under ``Prefix`` in the medRxiv bucket, downloads each
    ``.meca`` archive, copies one XML file from its ``content/`` folder into
    the local folder ``<Prefix>medrxiv-xml-dump``, zips that folder and
    uploads the zip to the Hugging Face dataset repo ``repo_id``.

    Failures on a single archive are printed and skipped so that one bad
    archive does not abort the whole run.

    Args:
        Prefix: S3 key prefix to process. It is also prepended to the local
            output folder name and therefore to the uploaded file's path.
    """
    # Function-scope import keeps this block self-contained.
    import tempfile

    print("Downloading Medrxiv files.")

    # Output folder for the collected XML files
    medrxiv_output_folder = Prefix + 'medrxiv-xml-dump'
    os.makedirs(medrxiv_output_folder, exist_ok=True)

    # Gather all objects from the Medrxiv bucket
    medrxiv_pages = paginator.paginate(
        Bucket=medrxiv_bucket_name,
        RequestPayer='requester',
        Prefix=Prefix,
    ).build_full_result()

    # 'Contents' can be missing when nothing matches the prefix; treat that
    # as "no objects" instead of failing with a KeyError.
    medrxiv_objects = medrxiv_pages.get('Contents', [])

    # Download all objects from the Medrxiv bucket
    for medrxiv_object in tqdm(medrxiv_objects, desc=Prefix):

        file = medrxiv_object['Key']

        # Only .meca (zip) archives are processed
        if not file.endswith(".meca"):
            continue

        try:
            # A fresh scratch directory per archive is removed even when
            # processing fails, so leftovers of a broken archive can never be
            # picked up while handling the next one.
            with tempfile.TemporaryDirectory(prefix='tmp_med_') as workdir:
                _copy_meca_xml(file, workdir, medrxiv_output_folder)
        except Exception as e:
            # Deliberate best effort: report and continue with the next key.
            print(f"Error processing file {file}: {e}")

    # Zip the output folder
    shutil.make_archive(medrxiv_output_folder, 'zip', medrxiv_output_folder)

    print(f"Uploading {medrxiv_output_folder}.zip to Hugging Face repo {repo_id}.")

    hugging_face_api.upload_file(
        path_or_fileobj=f'{medrxiv_output_folder}.zip',
        path_in_repo=f'{medrxiv_output_folder}.zip',
        repo_id=repo_id,
        repo_type="dataset",
    )

    print("Medrxiv Done.")


def _copy_meca_xml(key, workdir, output_folder):
    """Download the archive at *key* into *workdir* and copy its XML to *output_folder*.

    Raises:
        FileNotFoundError: if the archive has no ``content/*.xml`` file.
    """
    archive_path = os.path.join(workdir, 'archive.meca')
    extract_dir = os.path.join(workdir, 'extracted')

    # Download the file
    s3_client.download_file(
        medrxiv_bucket_name,
        key,
        archive_path,
        ExtraArgs={'RequestPayer': 'requester'},
    )

    # Unzip the meca file
    with zipfile.ZipFile(archive_path, 'r') as zip_ref:
        zip_ref.extractall(extract_dir)

    # Gather the xml file(s) and copy the first match to the output folder
    xml_files = glob(os.path.join(extract_dir, 'content', '*.xml'))
    if not xml_files:
        raise FileNotFoundError("no XML file found under content/")
    shutil.copy(xml_files[0], output_folder)
121
+
122
# Run the Medrxiv download on its own thread; it is started here, before the
# Gradio app further down in the file is launched.
second_thread = threading.Thread(
    target=download_medrxiv,
    kwargs={"Prefix": "Current_Content/September_2024/"},
)
second_thread.start()
127
+
128
+
129
+
130
+ ###############################################################################
131
+
132
+ # Dummy app
133
+
134
def greet(name, intensity):
    """Return "Hello, <name>" followed by ``int(intensity)`` exclamation marks."""
    exclamation_marks = "!" * int(intensity)
    # str.join keeps the original's strictness: a non-string name still
    # raises TypeError instead of being converted silently.
    return "".join(("Hello, ", name, exclamation_marks))
136
+
137
# Placeholder Gradio interface: a text box and a slider feed `greet`, whose
# result is shown as text.
interface_options = {
    "fn": greet,
    "inputs": ["text", "slider"],
    "outputs": ["text"],
}
demo = gr.Interface(**interface_options)

demo.launch()
144
+