nbroad HF staff committed on
Commit
1606cd0
1 Parent(s): 5b96dd0

big update

Browse files
Files changed (4) hide show
  1. app.py +35 -32
  2. constants.py +15 -0
  3. requirements.txt +5 -2
  4. update.py +198 -0
app.py CHANGED
@@ -1,18 +1,21 @@
1
- from fasthtml.common import *
2
- from datetime import datetime, timedelta
3
- import requests
4
- from datetime import datetime
5
  import json
6
- from markdown import markdown
 
7
 
 
 
 
8
  from dotenv import load_dotenv
9
 
 
 
 
10
  loaded = load_dotenv("./.env", override=True)
11
  print("Loaded .env file:", loaded)
12
 
13
- API_URL = os.getenv("API_URL")
14
- API_KEY = os.getenv("MS_SEARCH_KEY")
15
-
16
 
17
  css_content = open("styles.css").read()
18
 
@@ -82,35 +85,30 @@ def iso_to_unix_timestamp(iso_string):
82
 
83
 
84
  def unix_timestamp_to_nice_format(timestamp):
85
- dt = datetime.fromtimestamp(timestamp)
86
- return dt.strftime("%b %d, %Y")
87
 
88
 
89
  def make_query(query, start_date, end_date, page=1, limit=10):
90
- url = f"{API_URL}/indexes/comments/search"
91
- headers = {
92
- "Content-Type": "application/json",
93
- "Authorization": f"Bearer {API_KEY}",
94
- }
95
 
96
  after_timestamp = iso_to_unix_timestamp(start_date)
97
- before_timestamp = iso_to_unix_timestamp(end_date)
98
 
99
- query = {
100
- "q": query,
101
  "limit": limit,
102
  "offset": (page - 1) * limit,
103
- "filter": f"comment_updatedAt_timestamp >= {after_timestamp} AND comment_updatedAt_timestamp < {before_timestamp}",
104
- "attributesToCrop": ["comment_text"],
105
  "cropLength": 30,
106
- "attributesToHighlight": ["comment_text", "discussion_title"],
107
  "highlightPreTag": '<span class="highlight">',
108
  "highlightPostTag": "</span>",
109
  }
110
 
111
- response = requests.post(url, headers=headers, json=query)
112
 
113
- return response.json()
114
 
115
 
116
  def search_results(query, start_date, end_date, page=1):
@@ -119,9 +117,7 @@ def search_results(query, start_date, end_date, page=1):
119
  return Div(
120
  make_results_bar(raw_results),
121
  Div(*[make_card(r) for r in raw_results["hits"]]),
122
- make_pagination(
123
- query, start_date, end_date, page, raw_results["estimatedTotalHits"]
124
- ),
125
  id="search-results",
126
  )
127
 
@@ -138,13 +134,14 @@ def make_results_bar(results):
138
 
139
  def make_card(result):
140
  result = result["_formatted"]
141
- url = f"https://hf.co/{result['repo_id']}/discussions/{result['discussion_num']}"
142
- date = unix_timestamp_to_nice_format(int(result["comment_updatedAt_timestamp"]))
 
143
 
144
  return Div(
145
  Div(
146
- Strong(NotStr(result["discussion_title"])),
147
- P(NotStr(result["comment_text"]), cls="comment-text"),
148
  Div(Span(date)),
149
  A(url, href=url, target="_blank"),
150
  ),
@@ -152,7 +149,7 @@ def make_card(result):
152
  )
153
 
154
 
155
- def make_pagination(query, start_date, end_date, current_page, total_hits, limit=10):
156
  total_pages = -(-total_hits // limit) # Ceiling division
157
 
158
  children = []
@@ -218,4 +215,10 @@ def post(query: str, start_date: str, end_date: str, page: int = 1):
218
  return search_results(query, start_date, end_date, page)
219
 
220
 
221
- serve()
 
 
 
 
 
 
 
 
 
 
 
1
  import json
2
+ import os
3
+ from datetime import datetime, timezone, timedelta
4
 
5
+ import meilisearch
6
+ from fasthtml.common import *
7
+ from markdown import markdown
8
  from dotenv import load_dotenv
9
 
10
+ from constants import MeilisearchIndexFields
11
+ from update import process_webhook
12
+
13
  loaded = load_dotenv("./.env", override=True)
14
  print("Loaded .env file:", loaded)
15
 
16
+ MS_URL = os.getenv("MS_URL")
17
+ MS_SEARCH_KEY = os.getenv("MS_SEARCH_KEY")
18
+ ms_client = meilisearch.Client(MS_URL, MS_SEARCH_KEY)
19
 
20
  css_content = open("styles.css").read()
21
 
 
85
 
86
 
87
def unix_timestamp_to_nice_format(timestamp):
    """Render a Unix timestamp (seconds) as e.g. 'Sep 23, 2024 at 14:05 UTC'."""
    moment = datetime.fromtimestamp(timestamp, tz=timezone.utc)
    return f"{moment:%b %d, %Y at %H:%M UTC}"
90
 
91
 
92
def make_query(query, start_date, end_date, page=1, limit=10):
    """Search the Meilisearch comments index for `query` within a date range.

    Parameters
    ----------
    query : str
        Free-text search string.
    start_date, end_date : str
        ISO date strings; the range is inclusive of both whole days.
    page : int
        1-based page number used to compute the result offset.
    limit : int
        Number of hits per page.

    Returns
    -------
    dict
        Raw Meilisearch search response (hits, estimatedTotalHits, ...).
    """
    seconds_per_day = 24 * 60 * 60

    after_timestamp = iso_to_unix_timestamp(start_date)
    # Bug fix: the bound was previously end + 23h59m59s with a strict `<`
    # comparison, which excluded comments posted during the last second of
    # end_date. Using the start of the next day keeps the filter
    # end-inclusive for the entire final day.
    before_timestamp = iso_to_unix_timestamp(end_date) + seconds_per_day

    options = {
        "limit": limit,
        "offset": (page - 1) * limit,
        "filter": (
            f"{MeilisearchIndexFields.UPDATED_AT.value} >= {after_timestamp}"
            f" AND {MeilisearchIndexFields.UPDATED_AT.value} < {before_timestamp}"
        ),
        "attributesToCrop": [MeilisearchIndexFields.CONTENT.value],
        "cropLength": 30,
        "attributesToHighlight": [
            MeilisearchIndexFields.CONTENT.value,
            MeilisearchIndexFields.TITLE.value,
        ],
        "highlightPreTag": '<span class="highlight">',
        "highlightPostTag": "</span>",
    }

    return ms_client.index(MeilisearchIndexFields.INDEX_NAME.value).search(
        query=query, opt_params=options
    )
112
 
113
 
114
  def search_results(query, start_date, end_date, page=1):
 
117
  return Div(
118
  make_results_bar(raw_results),
119
  Div(*[make_card(r) for r in raw_results["hits"]]),
120
+ make_pagination(page, raw_results["estimatedTotalHits"]),
 
 
121
  id="search-results",
122
  )
123
 
 
134
 
135
  def make_card(result):
136
  result = result["_formatted"]
137
+
138
+ url = result[MeilisearchIndexFields.URL.value]
139
+ date = unix_timestamp_to_nice_format(int(result[MeilisearchIndexFields.UPDATED_AT.value]))
140
 
141
  return Div(
142
  Div(
143
+ Strong(NotStr(result[MeilisearchIndexFields.TITLE.value])),
144
+ P(NotStr(result[MeilisearchIndexFields.CONTENT.value]), cls="comment-text"),
145
  Div(Span(date)),
146
  A(url, href=url, target="_blank"),
147
  ),
 
149
  )
150
 
151
 
152
+ def make_pagination(current_page, total_hits, limit=10):
153
  total_pages = -(-total_hits // limit) # Ceiling division
154
 
155
  children = []
 
215
  return search_results(query, start_date, end_date, page)
216
 
217
 
218
@app.post("/webhook")
async def hf_webhook(request):
    # Webhook receiver endpoint: all validation and index updates are
    # delegated to update.process_webhook, whose (body, status) result
    # is returned as-is.
    return await process_webhook(request)


serve()
constants.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from enum import Enum
2
+
3
+
4
class MeilisearchIndexFields(Enum):
    """Names used in the Meilisearch comments index.

    Centralizes the document schema so the search app and the webhook
    updater reference the same field names.
    """

    # Name of the Meilisearch index itself (not a document field).
    INDEX_NAME = "comments"

    ID = "comment_id"  # document primary key
    CONTENT = "content"  # comment body text
    TITLE = "title"  # discussion title
    STATUS = "status"  # discussion status (e.g. "open" / "closed" — see webhook payloads)
    AUTHOR = "author"  # username, or raw user id if resolution failed
    URL = "url"  # web URL of the discussion
    REPO_ID = "repo_id"  # repo name, e.g. "org/name"
    UPDATED_AT = "updatedAt"  # unix timestamp in seconds
requirements.txt CHANGED
@@ -1,5 +1,8 @@
1
  uvicorn
2
  python-fasthtml
3
  python-dotenv
4
- fasthtml-hf==0.1.4
5
- markdown
 
 
 
 
1
  uvicorn
2
  python-fasthtml
3
  python-dotenv
4
+ fasthtml-hf
5
+ markdown
6
+ meilisearch
7
+ huggingface_hub
8
+ requests
update.py ADDED
@@ -0,0 +1,198 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ This file has functions to update the meilisearch index with new comments.
3
+
4
+
5
+ Payload from an HF webhook looks like this:
6
+ {
7
+ "event": {
8
+ "action": "update",
9
+ "scope": "discussion.comment"
10
+ },
11
+ "repo": {
12
+ "type": "dataset",
13
+ "name": "allenai/objaverse",
14
+ "id": "63977bb96bdef8095268ded0",
15
+ "private": false,
16
+ "url": {
17
+ "web": "https://huggingface.co/datasets/allenai/objaverse",
18
+ "api": "https://huggingface.co/api/datasets/allenai/objaverse"
19
+ },
20
+ "owner": {
21
+ "id": "5e70f3648ce3c604d78fe132"
22
+ }
23
+ },
24
+ "discussion": {
25
+ "id": "66f1a1092eb1ea2422555d24",
26
+ "title": "PullRequest",
27
+ "url": {
28
+ "web": "https://huggingface.co/datasets/allenai/objaverse/discussions/63",
29
+ "api": "https://huggingface.co/api/datasets/allenai/objaverse/discussions/63"
30
+ },
31
+ "status": "draft",
32
+ "author": {
33
+ "id": "6673e848436907f83a815ab0"
34
+ },
35
+ "num": 63,
36
+ "isPullRequest": true,
37
+ "changes": {
38
+ "base": "refs/heads/main"
39
+ }
40
+ },
41
+ "comment": {
42
+ "id": "66f1a1092eb1ea2422555d25",
43
+ "author": {
44
+ "id": "6673e848436907f83a815ab0"
45
+ },
46
+ "hidden": true,
47
+ "url": {
48
+ "web": "https://huggingface.co/datasets/allenai/objaverse/discussions/63#66f1a1092eb1ea2422555d25"
49
+ }
50
+ },
51
+ "webhook": {
52
+ "id": "66d7991f9b7da501cd100d95",
53
+ "version": 3
54
+ }
55
+ }
56
+ """
57
import time
import json
import os
from datetime import datetime, timezone

import requests
from dotenv import load_dotenv
from huggingface_hub import HfApi  # was imported twice; duplicate removed
from meilisearch import Client

from constants import MeilisearchIndexFields

load_dotenv(".env", override=True)

WEBHOOK_SECRET = os.getenv("WEBHOOK_SECRET")
MEILISEARCH_URL = os.getenv("MS_URL")
MEILISEARCH_KEY = os.getenv("MS_ADMIN_KEY")

# Admin key client: needs write access to add/update index documents.
ms_client = Client(MEILISEARCH_URL, MEILISEARCH_KEY)

# Raises KeyError at import time if HF_WEBHOOK_TOKEN is unset — deliberate
# fail-fast, since the webhook management below cannot work without it.
api = HfApi(token=os.environ["HF_WEBHOOK_TOKEN"])
79
+
80
async def process_webhook(request):
    """Handle a Hugging Face discussion webhook request.

    Validates the shared secret, then for model-repo discussion events
    either indexes the new/edited comment or propagates a discussion
    status change. Pull requests are skipped.

    Returns a ``(json_body, status_code)`` tuple.
    """
    # Validate the secret BEFORE reading, decoding, or printing the body,
    # so unauthenticated requests are rejected without their (untrusted)
    # payload ever being processed or logged.
    secret = request.headers.get("X-Webhook-Secret")
    if secret != WEBHOOK_SECRET:
        print("Invalid secret")
        return {"error": "Invalid secret"}, 400

    payload = await request.body()
    payload = payload.decode("utf-8")
    print(payload)

    payload = json.loads(payload)

    if payload["repo"]["type"] == "model":

        # Only discussion events are indexed; pull requests are skipped.
        if "discussion" not in payload or payload["discussion"]["isPullRequest"]:
            return {"status": "skipped"}, 200

        # An "update" event without a comment means the discussion status
        # changed (e.g. it was closed); otherwise a comment was added/edited.
        changing_status = "comment" not in payload and payload["event"]["action"] == "update"
        if changing_status:
            update_discussion_status(payload)
        else:
            add_new_comment(payload)

    return {"status": "success"}, 200
106
+
107
+
108
def user_id_to_username(user_id):
    """Resolve a Hugging Face user id to a username via the public users API.

    Best-effort: returns the raw ``user_id`` if the lookup fails for any
    reason, so indexing never breaks on an unresolvable author.
    """
    api_url = f"https://huggingface.co/api/users/{user_id}/overview"

    try:
        # Fix: requests has no default timeout, so a stalled API call would
        # hang the webhook handler indefinitely.
        response = requests.get(api_url, timeout=10)
        return response.json()["user"]
    except Exception as e:
        print(f"Couldn't get username for id {user_id}: {e}")
        return user_id
118
+
119
def add_new_comment(payload):
    """Build a Meilisearch document from a webhook payload and index it."""
    comment_info = payload["comment"]
    discussion = payload["discussion"]

    # Resolve the author id to a username (best-effort helper).
    author_name = user_id_to_username(comment_info["author"]["id"])

    document = {
        MeilisearchIndexFields.ID.value: comment_info["id"],
        MeilisearchIndexFields.TITLE.value: discussion["title"],
        MeilisearchIndexFields.STATUS.value: discussion["status"],
        MeilisearchIndexFields.AUTHOR.value: author_name,
        MeilisearchIndexFields.URL.value: discussion["url"]["web"],
        MeilisearchIndexFields.REPO_ID.value: payload["repo"]["name"],
        MeilisearchIndexFields.CONTENT.value: comment_info.get("content", ""),
        # Timestamp of indexing (payload carries no comment timestamp here).
        MeilisearchIndexFields.UPDATED_AT.value: int(datetime.now(timezone.utc).timestamp()),
    }

    ms_client.index(MeilisearchIndexFields.INDEX_NAME.value).add_documents([document])
148
+
149
def update_discussion_status(payload):
    """Propagate a discussion status change to every indexed comment.

    Finds all documents whose URL matches the discussion's web URL and
    rewrites their status field via a partial document update.
    """
    # If closing and commenting at the same time,
    # the comment comes with status = open after the webhook that says the discussion is closed.
    # Adding the sleep ensures the update comes afterwards
    time.sleep(1)

    url = payload["discussion"]["url"]["web"]
    status = payload["discussion"]["status"]

    # NOTE(review): the URL is interpolated into the filter unescaped; a
    # single quote in it would break the filter expression — confirm HF
    # discussion URLs can never contain quotes.
    existing_results = ms_client.index(MeilisearchIndexFields.INDEX_NAME.value).search(
        query="",
        opt_params={"filter": f"url = '{url}'"}
    )

    if len(existing_results["hits"]) > 0:
        # Partial updates: only the primary key plus the changed status field.
        docs2update = [
            {MeilisearchIndexFields.ID.value: d[MeilisearchIndexFields.ID.value], MeilisearchIndexFields.STATUS.value: status}
            for d in existing_results["hits"]
        ]

        update_request = ms_client.index(MeilisearchIndexFields.INDEX_NAME.value).update_documents(docs2update)
        print("Update request:", update_request)
172
+
173
+
174
+
175
def update_webhooks():
    """
    Replace the watched-repo list of this app's HF webhook.

    NOTE(review): appears unfinished — `to_add` is built but never
    populated or used, and `id2update` is selected but never updated.
    """

    existing_webhooks = api.list_webhooks()

    webhook_url = os.environ["HF_WEBHOOK_URL"]

    # Webhooks whose target URL matches this app's endpoint.
    id2update = [x for x in existing_webhooks if x.url == webhook_url]

    if len(id2update) > 1:
        print("More than one webhook found")
        print(id2update)
        print("updating the first one")

    # NOTE(review): raises IndexError when no matching webhook exists —
    # confirm a webhook is always registered before this runs.
    id2update = id2update[0]

    # get trending models

    trending_models = api.list_models(sort="likes7d", direction=-1, limit=100)

    to_add = []