davanstrien HF staff committed on
Commit
69765f6
1 Parent(s): 409aa4c
Files changed (1) hide show
  1. app.py +205 -0
app.py ADDED
@@ -0,0 +1,205 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from huggingface_hub import hf_hub_url, list_datasets
2
+ from dotenv import load_dotenv
3
+ import os
4
+ from httpx import Client
5
+ from datetime import datetime
6
+ from datetime import timedelta
7
+ from tqdm.auto import tqdm
8
+ from tqdm.contrib.concurrent import thread_map
9
+ import pandas as pd
10
+ import gradio as gr
11
+
12
+ from huggingface_hub import hf_hub_url
13
+ import requests
14
+ from diskcache import Cache
15
+ from diskcache import Cache
16
+ from sys import platform
17
+ import gradio as gr
18
+
19
+ # check if running on macos i.e. local dev
20
+
21
+
22
# Load HF_TOKEN / USER_AGENT from a local .env file when one is present.
load_dotenv()

HF_TOKEN = os.getenv("HF_TOKEN")
USER_AGENT = os.getenv("USER_AGENT")

# Only send headers that are actually configured. An unset variable would
# otherwise produce a bogus "Bearer None" credential or a None header value.
# NOTE(review): httpx is expected to reject None header values -- confirm.
headers = {}
if HF_TOKEN:
    # The scheme is "Bearer <token>". The previous f-string contained a stray
    # "$" (JavaScript template syntax) that became part of the credential.
    headers["authorization"] = f"Bearer {HF_TOKEN}"
if USER_AGENT:
    headers["user-agent"] = USER_AGENT

# Shared HTTP client used for the README requests.
client = Client(headers=headers, timeout=60)

# macOS ("darwin") is treated as local development and uses a relative cache
# directory; every other platform uses /data/diskcache.
LOCAL = platform == "darwin"
cache_dir = "cache" if LOCAL else "/data/diskcache"
cache = Cache(cache_dir)
41
+
42
+
43
def add_created_data(dataset):
    """Return the dataset's attribute dict with a ``created`` datetime added.

    The first 8 hex characters of ``dataset._id`` are parsed as a Unix
    timestamp in seconds and converted to a naive local-time ``datetime``.
    NOTE(review): presumably ``_id`` is a MongoDB-style ObjectId whose prefix
    encodes the creation time -- confirm against the Hub API.

    Args:
        dataset: Object exposing an ``_id`` hex string and a ``__dict__``.

    Returns:
        ``dataset.__dict__`` itself (not a copy), now holding a ``created``
        key, so the dataset object is mutated as a side effect.
    """
    # Imported locally under an alias: this file binds the module-level name
    # ``datetime`` to the class near the top and rebinds it to the module
    # further down (``import datetime``), so by the time this function runs
    # a bare ``datetime.fromtimestamp`` would resolve against the module and
    # raise AttributeError.
    from datetime import datetime as _datetime

    created = _datetime.fromtimestamp(int(dataset._id[:8], 16))
    dataset_dict = vars(dataset)
    dataset_dict["created"] = created
    return dataset_dict
49
+
50
+
51
def get_three_months_ago():
    """Return the naive local datetime exactly 90 days before now."""
    # Imported locally under aliases: the module-level name ``datetime`` is
    # rebound from the class to the module by a later ``import datetime`` in
    # this file, which would make a bare ``datetime.now()`` fail at call time.
    from datetime import datetime as _datetime
    from datetime import timedelta as _timedelta

    return _datetime.now() - _timedelta(days=90)
54
+
55
+
56
def get_readme_len(dataset):
    """Attach the README character count to a dataset record.

    Requests the dataset's ``README.md`` through the shared ``client``. On an
    HTTP 200 response the length of the response text is stored under the
    ``"len"`` key of ``dataset``.

    Args:
        dataset: Dict with at least an ``"id"`` key; mutated in place.

    Returns:
        The same ``dataset`` dict on a 200 response. ``None`` for any other
        status code, or when an exception is raised (the exception is printed).
    """
    # NOTE(review): indentation was lost in the source this was recovered
    # from; this assumes the dataset is returned only on HTTP 200 -- confirm.
    try:
        readme_url = hf_hub_url(dataset["id"], "README.md", repo_type="dataset")
        response = client.get(readme_url)
        if response.status_code != 200:
            return None
        dataset["len"] = len(response.text)
        return dataset
    except Exception as e:
        # Best effort: a failed fetch drops this dataset rather than aborting.
        print(e)
        return None
66
+
67
+
68
def render_model_hub_link(hub_id):
    """Build an HTML anchor that links ``hub_id`` to its dataset page.

    Args:
        hub_id: Dataset identifier, inserted verbatim into the URL and used
            as the visible link text.

    Returns:
        An ``<a>`` element string that opens the link in a new tab.
    """
    href = f"https://huggingface.co/datasets/{hub_id}"
    anchor_style = (
        "color: var(--link-text-color);"
        " text-decoration: underline;text-decoration-style: dotted;"
    )
    return f'<a target="_blank" href="{href}" style="{anchor_style}">{hub_id}</a>'
74
+
75
+
76
@cache.memoize(expire=60 * 60 * 12)
def get_datasets():
    """Return every dataset reported by ``list_datasets`` as a list.

    The listing is requested with ``limit=None`` and ``full=True`` and is
    consumed through a ``tqdm`` progress bar. The result is memoized in the
    disk cache with ``expire=60 * 60 * 12``.
    """
    listing = list_datasets(limit=None, full=True)
    with_progress = tqdm(iter(listing))
    return list(with_progress)
79
+
80
+
81
@cache.memoize(expire=60 * 60 * 12)
def load_data():
    """Return recent datasets that have card data, each with a README length.

    Every dataset from ``get_datasets()`` is converted to a dict carrying a
    ``created`` timestamp. Records are kept only if they have a truthy
    ``cardData`` value and were created after the 90-day cutoff. The README
    length of each kept record is then fetched via ``thread_map``; records
    for which ``get_readme_len`` returned ``None`` are dropped.

    Returns:
        List of dataset dicts, each containing ``created`` and ``len`` keys.
        The result is memoized in the disk cache.
    """
    # Compute the cutoff once so every dataset is compared against the same
    # instant, instead of calling get_three_months_ago() for each record.
    cutoff = get_three_months_ago()

    recent = []
    for dataset in tqdm(get_datasets()):
        record = add_created_data(dataset)
        if record.get("cardData") and record["created"] > cutoff:
            recent.append(record)

    with_len = thread_map(get_readme_len, recent)
    return [record for record in with_len if record is not None]
91
+
92
+
93
# Authors whose datasets prep_dataframe() filters out by default.
remove_orgs = {
    "HuggingFaceM4",
    "HuggingFaceBR4",
}

# Columns prep_dataframe() removes from the DataFrame by default.
columns_to_drop = [
    "cardData",
    "gated",
    "sha",
    "paperswithcode_id",
    "tags",
    "description",
    "siblings",
    "disabled",
    "_id",
    "private",
    "author",
    "citation",
]
110
+
111
+
112
def prep_dataframe(remove_orgs_and_users=remove_orgs, columns_to_drop=columns_to_drop):
    """Build the display DataFrame from the cached dataset records.

    Args:
        remove_orgs_and_users: Collection of author names to exclude. A falsy
            value disables the author filter.
        columns_to_drop: Column names to remove from the result. A falsy
            value keeps every column.

    Returns:
        A ``pandas.DataFrame`` with one row per remaining dataset. When an
        ``id`` column is present its values are replaced by HTML links.
    """
    records = load_data()
    if remove_orgs_and_users:
        # .get() keeps records that lack an "author" key instead of raising.
        records = [
            record
            for record in records
            if record.get("author") not in remove_orgs_and_users
        ]

    df = pd.DataFrame(records)

    # With no records the frame has no columns at all, so guard the lookup.
    if "id" in df.columns:
        df["id"] = df["id"].apply(render_model_hub_link)

    if columns_to_drop:
        # errors="ignore": not every listed column is guaranteed to exist in
        # the records, and a missing one should not abort the whole table.
        df = df.drop(columns=columns_to_drop, errors="ignore")
    return df
123
+
124
+
125
+ # def filter_df(
126
+ # df,
127
+ # created_after=None,
128
+ # create_before=None,
129
+ # min_likes=None,
130
+ # max_likes=None,
131
+ # min_len=None,
132
+ # max_len=None,
133
+ # min_downloads=None,
134
+ # max_downloads=None,
135
+ # ):
136
+ # if min_likes:
137
+ # df = df[df["likes"] >= min_likes]
138
+ # if max_likes:
139
+ # df = df[df["likes"] <= max_likes]
140
+ # if min_len:
141
+ # df = df[df["len"] >= min_len]
142
+ # if max_len:
143
+ # df = df[df["len"] <= max_len]
144
+ # if min_downloads:
145
+ # df = df[df["downloads"] >= min_downloads]
146
+ # if max_downloads:
147
+ # df = df[df["downloads"] <= max_downloads]
148
+ # return df
149
+
150
+
151
+ import datetime
152
+
153
+ import datetime
154
+
155
+
156
def filter_df_by_max_age(max_age_days=None):
    """Return the prepared DataFrame limited to recently created datasets.

    Rows without a ``created`` value are always removed.

    Args:
        max_age_days: Maximum age in days. Rows whose ``created`` value is
            older than now minus this many days are removed. ``None`` skips
            the age filter.

    Returns:
        The filtered ``pandas.DataFrame``.
    """
    frame = prep_dataframe().dropna(subset=["created"])
    reference_time = datetime.datetime.now()

    if max_age_days is None:
        return frame

    # Oldest creation time that still passes the filter.
    oldest_allowed = reference_time - datetime.timedelta(days=max_age_days)
    return frame[frame["created"] >= oldest_allowed]
167
+
168
+
169
+ # def filter_df(
170
+ # min_age_days=None,
171
+ # max_age_days=None,
172
+ # min_likes=None,
173
+ # max_likes=None,
174
+ # min_len=None,
175
+ # max_len=None,
176
+ # min_downloads=None,
177
+ # max_downloads=None,
178
+ # ):
179
+ # if min_age_days is not None or max_age_days is not None:
180
+ # df = filter_df_by_date(min_age_days, max_age_days)
181
+ # else:
182
+ # df = prep_dataframe()
183
+ # if min_likes:
184
+ # df = df[df["likes"] >= min_likes]
185
+ # if max_likes:
186
+ # df = df[df["likes"] <= max_likes]
187
+ # if min_len:
188
+ # df = df[df["len"] >= min_len]
189
+ # if max_len:
190
+ # df = df[df["len"] <= max_len]
191
+ # if min_downloads:
192
+ # df = df[df["downloads"] >= min_downloads]
193
+ # if max_downloads:
194
+ # df = df[df["downloads"] <= max_downloads]
195
+ # return df
196
+
197
+
198
# UI: a slider for the maximum dataset age and a table that is re-filtered
# whenever the slider receives input.
with gr.Blocks() as demo:
    max_age_days = gr.Slider(
        minimum=0,
        maximum=90,
        step=1,
        value=7,
        label="Max Age (days)",
        interactive=True,
    )
    # NOTE(review): the initial table is the unfiltered prep_dataframe()
    # result even though the slider starts at 7 -- confirm this is intended.
    initial_table = prep_dataframe()
    output = gr.DataFrame(initial_table, datatype="markdown", min_width=160 * 2.5)
    max_age_days.input(
        filter_df_by_max_age,
        inputs=[max_age_days],
        outputs=[output],
    )

demo.launch()