m7n committed on
Commit
4d14899
·
verified ·
1 Parent(s): e43e2c8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +115 -42
app.py CHANGED
@@ -47,6 +47,8 @@ import pandas as pd
47
  from pyalex import Works, Authors, Sources, Institutions, Concepts, Publishers, Funders
48
  from itertools import chain
49
  from compress_pickle import load, dump
 
 
50
 
51
 
52
 
@@ -67,62 +69,102 @@ import umap
67
 
68
 
69
 
70
-
71
-
72
-
73
-
74
-
75
- def query_records(search_term):
76
- def invert_abstract(inv_index):
77
- if inv_index is not None:
78
- l_inv = [(w, p) for w, pos in inv_index.items() for p in pos]
79
- return " ".join(map(lambda x: x[0], sorted(l_inv, key=lambda x: x[1])))
80
- else:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81
  return ' '
82
-
83
- def get_pub(x):
84
- try:
85
- source = x['source']['display_name']
86
- if source not in ['parsed_publication','Deleted Journal']:
87
- return source
88
- else:
89
- return ' '
90
- except:
91
  return ' '
92
 
93
- # Fetch records based on the search term in the abstract!
94
- query = Works().search([search_term])
95
- query_length = Works().search([search_term]).count()
 
 
 
96
 
97
- records = []
98
- #total_pages = (query_length + 199) // 200 # Calculate total number of pages
99
- progress=gr.Progress()
100
 
101
- for i, record in progress.tqdm(enumerate(chain(*query.paginate(per_page=200)))):
102
- records.append(record)
103
 
104
- # Calculate progress from 0 to 0.1
105
- #achieved_progress = min(0.1, (i + 1) / query_length * 0.1)
106
 
107
- # Update progress bar
108
- #progress(achieved_progress, desc="Getting queried data...")
109
 
110
 
111
 
112
- records_df = pd.DataFrame(records)
113
- records_df['abstract'] = [invert_abstract(t) for t in records_df['abstract_inverted_index']]
114
 
115
- records_df['parsed_publication'] = [get_pub(x) for x in records_df['primary_location']]
116
 
117
 
118
- records_df['parsed_publication'] = records_df['parsed_publication'].fillna(' ')
119
- records_df['abstract'] = records_df['abstract'].fillna(' ')
120
- records_df['title'] = records_df['title'].fillna(' ')
121
 
122
 
123
- return records_df
124
-
125
-
126
 
127
 
128
  ################# Setting up the model for specter2 embeddings ###################
@@ -193,7 +235,38 @@ def predict(text_input, sample_size_slider, reduce_sample_checkbox, progress=gr.
193
 
194
 
195
  # get data.
196
- records_df = query_records(text_input)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
197
  if reduce_sample_checkbox:
198
  records_df = records_df.sample(sample_size_slider)
199
  print(records_df)
 
47
  from pyalex import Works, Authors, Sources, Institutions, Concepts, Publishers, Funders
48
  from itertools import chain
49
  from compress_pickle import load, dump
50
+ from urllib.parse import urlparse, parse_qs
51
+ import re
52
 
53
 
54
 
 
69
 
70
 
71
 
72
+ def openalex_url_to_pyalex_query(url):
73
+ """
74
+ Convert an OpenAlex search URL to a pyalex query.
75
+
76
+ Args:
77
+ url (str): The OpenAlex search URL.
78
+
79
+ Returns:
80
+ tuple: (Works object, dict of parameters)
81
+ """
82
+ parsed_url = urlparse(url)
83
+ query_params = parse_qs(parsed_url.query)
84
+
85
+ # Initialize the Works object
86
+ query = Works()
87
+
88
+ # Handle filters
89
+ if 'filter' in query_params:
90
+ filters = query_params['filter'][0].split(',')
91
+ for f in filters:
92
+ if ':' in f:
93
+ key, value = f.split(':', 1)
94
+ if key == 'default.search':
95
+ query = query.search(value)
96
+ else:
97
+ query = query.filter(**{key: value})
98
+
99
+ # Handle sort
100
+ if 'sort' in query_params:
101
+ sort_params = query_params['sort'][0].split(',')
102
+ for s in sort_params:
103
+ if s.startswith('-'):
104
+ query = query.sort(**{s[1:]: 'desc'})
105
+ else:
106
+ query = query.sort(**{s: 'asc'})
107
+
108
+ # Handle other parameters
109
+ params = {}
110
+ for key in ['page', 'per-page', 'sample', 'seed']:
111
+ if key in query_params:
112
+ params[key] = query_params[key][0]
113
+
114
+ return query, params
115
+
116
+
117
def invert_abstract(inv_index):
    """Rebuild a plain-text abstract from an OpenAlex inverted index.

    Returns ' ' when no inverted index is available.
    """
    if inv_index is None:
        return ' '
    # Flatten to (position, word) pairs, order by position, then re-join.
    positioned = [(p, w) for w, positions in inv_index.items() for p in positions]
    positioned.sort(key=lambda pair: pair[0])
    return " ".join(word for _, word in positioned)
123
+
124
def get_pub(x):
    """Extract a displayable source (journal/venue) name from an OpenAlex
    `primary_location` record.

    Returns ' ' when the record is missing (None/NaN), lacks the nested
    keys, or the name is a known placeholder value.
    """
    try:
        source = x['source']['display_name']
    except (TypeError, KeyError):
        # Narrowed from a bare `except:` — x may be None/NaN (TypeError on
        # subscripting) or a dict missing 'source'/'display_name' (KeyError).
        return ' '
    if source not in ['parsed_publication', 'Deleted Journal']:
        return source
    return ' '
133
 
134
+ #def query_records(search_term):
135
+
136
+
137
+ # # Fetch records based on the search term in the abstract!
138
+ # query = Works().search([search_term])
139
+ # query_length = Works().search([search_term]).count()
140
 
141
+ # records = []
142
+ # #total_pages = (query_length + 199) // 200 # Calculate total number of pages
143
+ # progress=gr.Progress()
144
 
145
+ # for i, record in progress.tqdm(enumerate(chain(*query.paginate(per_page=200)))):
146
+ # records.append(record)
147
 
148
+ # # Calculate progress from 0 to 0.1
149
+ # #achieved_progress = min(0.1, (i + 1) / query_length * 0.1)
150
 
151
+ # # Update progress bar
152
+ # #progress(achieved_progress, desc="Getting queried data...")
153
 
154
 
155
 
156
+ # records_df = pd.DataFrame(records)
157
+ # records_df['abstract'] = [invert_abstract(t) for t in records_df['abstract_inverted_index']]
158
 
159
+ # records_df['parsed_publication'] = [get_pub(x) for x in records_df['primary_location']]
160
 
161
 
162
+ # records_df['parsed_publication'] = records_df['parsed_publication'].fillna(' ')
163
+ # records_df['abstract'] = records_df['abstract'].fillna(' ')
164
+ # records_df['title'] = records_df['title'].fillna(' ')
165
 
166
 
167
+ # return records_df
 
 
168
 
169
 
170
  ################# Setting up the model for specter2 embeddings ###################
 
# get data.
query, params = openalex_url_to_pyalex_query(text_input)
query_length = query.count()

records = []
total_pages = (query_length + 199) // 200  # number of 200-record pages

for i, record in progress.tqdm(enumerate(chain(*query.paginate(per_page=200)))):
    records.append(record)

    # Progress runs from 0 to 0.1 during the download phase.
    # BUG FIX: was min(0., ...), which clamped progress to 0 forever.
    achieved_progress = min(0.1, (i + 1) / query_length * 0.1)

    # Update progress bar
    progress(achieved_progress, desc="Getting queried data...")

records_df = pd.DataFrame(records)
records_df['abstract'] = [invert_abstract(t) for t in records_df['abstract_inverted_index']]
records_df['parsed_publication'] = [get_pub(x) for x in records_df['primary_location']]

# Normalize missing values so downstream text processing never sees NaN.
records_df['parsed_publication'] = records_df['parsed_publication'].fillna(' ')
records_df['abstract'] = records_df['abstract'].fillna(' ')
records_df['title'] = records_df['title'].fillna(' ')

if reduce_sample_checkbox:
    records_df = records_df.sample(sample_size_slider)
print(records_df)