shimizukawa committed
Commit • 23687d1
1 Parent(s): 341f67a
restore github issue loader, refactoring
Files changed:
- README.md +35 -0
- app.py +18 -15
- loaders/__init__.py +11 -0
- loaders/github_issue.py +63 -0
- doc_loader.py → loaders/wikipage.py +13 -11
- model.py +0 -11
- models.py +23 -0
- store.py +39 -18
README.md
CHANGED
@@ -11,3 +11,38 @@ license: mit
 ---
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+
+# import GitHub issues
+
+## export from github
+first, generate token on: https://github.com/settings/tokens
+
+```
+$ git clone https://github.com/kazamori/github-api-tools
+$ pip install -e ./github-api-tools
+$ export GITHUB_API_TOKEN="********"
+$ gh-cli-issues --repository <org/repo>
+$ ls <repo>-issues.json
+```
+
+## import from json
+
+```
+$ python store.py -l github_issue <index> ../<repo>-issues.json
+```
+
+# import Wiki Pages
+
+## export from somewhere
+
+create `pages.json` like:
+```json
+{"id": <page_id>, "title": <page title>, "content": <page body>, "ctime": ..., "user": <name>, "url": "https:..."}
+{"title": ...}
+```
+
+## import from json
+
+```
+$ python store.py -l wikipage <index> ../pages.json
+```
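The ndjson layout above can be produced with any tool; as a rough illustration (field names from the README, values invented), a small Python script could emit it like this:

```python
# Illustrative only: writes pages.json in the one-JSON-object-per-line layout
# described above. The ids, titles, users, and URLs are made-up sample values.
import json

pages = [
    {
        "id": 1,
        "title": "Getting Started",
        "content": "How to set up the project ...",
        "ctime": "2023-07-01T12:00:00+09:00",  # the wikipage loader also accepts an int timestamp
        "user": "alice",
        "url": "https://example.com/wiki/1",
    },
]

with open("pages.json", "w") as f:
    for page in pages:
        f.write(json.dumps(page, ensure_ascii=False) + "\n")
```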
app.py
CHANGED
@@ -1,6 +1,7 @@
 from time import time
 from datetime import datetime
 from typing import Iterable
+
 import streamlit as st
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
@@ -12,8 +13,9 @@ from qdrant_client.http.models import Filter, FieldCondition, MatchValue, Range
 from langchain.chains import RetrievalQA
 from openai.error import InvalidRequestError
 from langchain.chat_models import ChatOpenAI
+
 from config import DB_CONFIG
-from model import Doc
+from models import BaseModel


 @st.cache_resource
@@ -150,9 +152,9 @@ def _get_related_url(metadata) -> Iterable[str]:

 def _get_query_str_filter(
     query: str,
-    project_name: str,
+    index: str,
 ) -> tuple[str, Filter]:
-    options = [{"key": "metadata.project_name", "value": project_name}]
+    options = [{"key": "metadata.index", "value": index}]
     filter = make_filter_obj(options=options)
     return query, filter

@@ -160,10 +162,10 @@ def _get_query_str_filter(
 def run_qa(
     llm,
     query: str,
-    project_name: str,
+    index: str,
 ) -> tuple[str, str]:
     now = time()
-    query_str, filter = _get_query_str_filter(query, project_name)
+    query_str, filter = _get_query_str_filter(query, index)
     qa = get_retrieval_qa(filter, llm)
     try:
         result = qa(query_str)
@@ -178,29 +180,30 @@ def run_qa(

 def run_search(
     query: str,
-    project_name: str,
-) -> Iterable[tuple[Doc, float, str]]:
-    query_str, filter = _get_query_str_filter(query, project_name)
+    index: str,
+) -> Iterable[tuple[BaseModel, float, str]]:
+    query_str, filter = _get_query_str_filter(query, index)
     qdocs = get_similay(query_str, filter)
     for qdoc, score in qdocs:
         text = qdoc.page_content
         metadata = qdoc.metadata
         # print(metadata)
-        doc = Doc(
-            project_name=project_name,
+        data = BaseModel(
+            index=index,
             id=metadata.get("id"),
             title=metadata.get("title"),
             ctime=metadata.get("ctime"),
             user=metadata.get("user"),
             url=metadata.get("url"),
+            type=metadata.get("type"),
         )
-        yield doc, score, text
+        yield data, score, text


 with st.form("my_form"):
     st.title("Document Search")
     query = st.text_input(label="query")
-    project_name = st.text_input(label="project_name")
+    index = st.text_input(label="index")

     submit_col1, submit_col2 = st.columns(2)
     searched = submit_col1.form_submit_button("Search")
@@ -209,7 +212,7 @@ with st.form("my_form"):
         st.header("Search Results")
         st.divider()
         with st.spinner("Searching..."):
-            results = run_search(query, project_name)
+            results = run_search(query, index)
             for doc, score, text in results:
                 title = doc.title
                 url = doc.url
@@ -232,7 +235,7 @@ with st.form("my_form"):
             results = run_qa(
                 LLM,
                 query,
-                project_name,
+                index,
             )
             answer, html = results
             with st.container():
@@ -249,7 +252,7 @@ with st.form("my_form"):
             results = run_qa(
                 VICUNA_LLM,
                 query,
-                project_name,
+                index,
            )
             answer, html = results
             with st.container():
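make_filter_obj itself is outside this diff, so the exact mapping is not shown; presumably the new `[{"key": "metadata.index", "value": index}]` option becomes a Qdrant payload filter along these lines (a sketch using the qdrant_client types app.py already imports, not the actual implementation):

```python
# Sketch only: one plausible shape for the filter built from
# [{"key": "metadata.index", "value": index}]. make_filter_obj is not part of this commit.
from qdrant_client.http.models import FieldCondition, Filter, MatchValue


def make_index_filter(index: str) -> Filter:
    # Restrict the vector search to documents whose metadata.index payload equals `index`.
    return Filter(
        must=[
            FieldCondition(key="metadata.index", match=MatchValue(value=index)),
        ]
    )
```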
loaders/__init__.py
ADDED
@@ -0,0 +1,11 @@
from .wikipage import WikiPageLoader
from .github_issue import GithubIssueLoader

LOADERS = {
    "wikipage": WikiPageLoader,
    "github_issue": GithubIssueLoader
}
LOADER_NAMES = tuple(LOADERS.keys())

def get_loader(loader_name, **kwargs):
    return LOADERS.get(loader_name)(**kwargs)
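store.py resolves a loader through this registry; condensed, the call in its main() amounts to the following (the index name and file path are placeholders, not values from the repo):

```python
# Placeholder values: "myrepo" and the input path are examples only.
from pathlib import Path

from loaders import LOADER_NAMES, get_loader

print(LOADER_NAMES)  # ("wikipage", "github_issue") - used as argparse choices in store.py

loader = get_loader("github_issue", index="myrepo", inputfile=Path("myrepo-issues.json"))
docs = loader.load()  # langchain Documents with the issue fields as metadata
```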
loaders/github_issue.py
ADDED
@@ -0,0 +1,63 @@
import json
from dataclasses import asdict
from pathlib import Path
from typing import Iterator

from dateutil.parser import parse
from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader

from models import GithubIssue


def date_to_int(dt_str: str) -> int:
    dt = parse(dt_str)
    return int(dt.timestamp())


def get_contents(index: str, inputfile: Path) -> Iterator[tuple[GithubIssue, str]]:
    with inputfile.open("r") as f:
        obj = [json.loads(line) for line in f]
    for data in obj:
        title = data["title"]
        body = data["body"]
        issue = GithubIssue(
            index=index,
            id=data["number"],
            title=title,
            ctime=date_to_int(data["created_at"]),
            user=data["user.login"],
            url=data["html_url"],
            labels=data["labels_"],
        )
        text = title
        if body:
            text += "\n\n" + body
        yield issue, text
        comments = data["comments_"]
        for comment in comments:
            issue = GithubIssue(
                index=index,
                id=comment["id"],
                title=data["title"],
                ctime=date_to_int(comment["created_at"]),
                user=comment["user.login"],
                url=comment["html_url"],
                labels=data["labels_"],
                type="issue_comment",
            )
            yield issue, comment["body"]


class GithubIssueLoader(BaseLoader):
    def __init__(self, index: str, inputfile: Path):
        self.index = index
        self.inputfile = inputfile

    def lazy_load(self) -> Iterator[Document]:
        for issue, text in get_contents(self.index, self.inputfile):
            metadata = asdict(issue)
            yield Document(page_content=text, metadata=metadata)

    def load(self) -> list[Document]:
        return list(self.lazy_load())
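get_contents reads a fixed set of keys from each ndjson line of the gh-cli-issues export, so a minimal record looks roughly like this (the shape is inferred from the keys accessed above; every value is invented for illustration):

```python
# Shape inferred from the keys get_contents() accesses; values are made up.
issue_record = {
    "number": 42,
    "title": "Fix flaky test",
    "body": "The test fails on CI when ...",
    "created_at": "2023-07-01T12:00:00Z",
    "user.login": "alice",
    "html_url": "https://github.com/example/repo/issues/42",
    "labels_": ["bug"],
    "comments_": [
        {
            "id": 1001,
            "body": "Reproduced locally.",
            "created_at": "2023-07-02T08:30:00Z",
            "user.login": "bob",
            "html_url": "https://github.com/example/repo/issues/42#issuecomment-1001",
        }
    ],
}
```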
doc_loader.py → loaders/wikipage.py
RENAMED
@@ -1,11 +1,13 @@
-from dataclasses import asdict
 import json
+from dataclasses import asdict
+from pathlib import Path
 from typing import Iterator
+
 from dateutil.parser import parse
 from langchain.docstore.document import Document
 from langchain.document_loaders.base import BaseLoader

-from model import Doc
+from models import WikiPage


 def date_to_int(dt_str: str) -> int:
@@ -13,20 +15,20 @@ def date_to_int(dt_str: str) -> int:
     return int(dt.timestamp())


-def get_contents(project_name: str, filename: str) -> Iterator[tuple[Doc, str]]:
+def get_contents(index: str, inputfile: Path) -> Iterator[tuple[WikiPage, str]]:
     """filename for file with ndjson

     {"id": <page_id>, "title": <page title>, "content": <page body>, "ctime": ..., "user": <name>, "url": "https:..."}
     {"title": ...}
     """
-    with open(filename, "r") as f:
+    with inputfile.open("r") as f:
         obj = [json.loads(line) for line in f]
     for data in obj:
         title = data["title"]
         body = data["content"]
         ctime = date_to_int(data["ctime"]) if isinstance(data["ctime"], str) else data["ctime"]
-        doc = Doc(
-            project_name=project_name,
+        doc = WikiPage(
+            index=index,
             id=data["id"],
             title=title,
             ctime=ctime,
@@ -39,13 +41,13 @@ def get_contents(project_name: str, filename: str) -> Iterator[tuple[Doc, str]]:
         yield doc, text


-class …(BaseLoader):
-    def __init__(self, project_name: str, filename: str):
-        self.project_name = project_name
-        self.filename = filename
+class WikiPageLoader(BaseLoader):
+    def __init__(self, index: str, inputfile: Path):
+        self.index = index
+        self.inputfile = inputfile

     def lazy_load(self) -> Iterator[Document]:
-        for doc, text in get_contents(self.project_name, self.filename):
+        for doc, text in get_contents(self.index, self.inputfile):
             metadata = asdict(doc)
             yield Document(page_content=text, metadata=metadata)

model.py
DELETED
@@ -1,11 +0,0 @@
from dataclasses import dataclass


@dataclass(frozen=True)
class Doc:
    project_name: str
    id: int
    title: str
    ctime: int
    user: str
    url: str
models.py
ADDED
@@ -0,0 +1,23 @@
import dataclasses


@dataclasses.dataclass()
class BaseModel:
    index: str
    id: int
    title: str
    ctime: int
    user: str
    url: str
    type: str


@dataclasses.dataclass(frozen=True)
class GithubIssue(BaseModel):
    labels: list[str]
    type: str = "issue"


@dataclasses.dataclass(frozen=True)
class WikiPage:
    type: str = "wiki"
store.py
CHANGED
@@ -1,10 +1,14 @@
+import argparse
+from itertools import islice
+from pathlib import Path
+
 from tqdm import tqdm
 import torch
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain.embeddings import HuggingFaceEmbeddings
 from langchain.vectorstores import Qdrant

-from …
+from loaders import get_loader, LOADER_NAMES
 from config import DB_CONFIG


@@ -19,6 +23,16 @@ def get_text_chunk(docs):
     return texts


+def batched(iterable, *, size=100):
+    "Batch data into tuples of length n. The last batch may be shorter."
+    # batched('ABCDEFG', 3) --> ABC DEF G
+    if size < 1:
+        raise ValueError('n must be at least one')
+    it = iter(iterable)
+    while batch := tuple(islice(it, size)):
+        yield batch
+
+
 def store(texts):
     model_name = "intfloat/multilingual-e5-large"
     model_kwargs = {"device": "cuda:0" if torch.cuda.is_available() else "cpu"}
@@ -29,9 +43,9 @@ def store(texts):
         encode_kwargs=encode_kwargs,
     )
     db_url, db_api_key, db_collection_name = DB_CONFIG
-    for …
+    for batch in tqdm(batched(texts, size=100)):
         _ = Qdrant.from_documents(
-            …
+            batch,
             embeddings,
             url=db_url,
             api_key=db_api_key,
@@ -39,24 +53,31 @@ def store(texts):
     )


-def main(project_name, path):
-    …
+def get_parser():
+    p = argparse.ArgumentParser()
+    p.add_argument("index", type=str)
+    p.add_argument("inputfile", metavar="INPUTFILE", type=argparse.FileType("rt"))
+    p.add_argument("-l", "--loader", type=str, choices=LOADER_NAMES, required=True)
+    return p
+
+
+def main():
+    """
+    $ python store.py --loader wikipage "index" "FILE_PATH"
+    $ python store.py -l wikipage wiki data/wiki.json
+    """
+    p = get_parser()
+    args = p.parse_args()
+    loader = get_loader(
+        args.loader,
+        index=args.index,
+        inputfile=Path(args.inputfile.name),
+    )
+
     docs = loader.load()
     texts = get_text_chunk(docs)
     store(texts)


 if __name__ == "__main__":
-    """
-    $ python store.py "PROJECT_NAME" "FILE_PATH"
-    $ python store.py hoge data/hoge-docs.json
-    """
-    import sys
-
-    args = sys.argv
-    if len(args) != 3:
-        print("No args, you need two args for project_name, json_file_path")
-    else:
-        project_name = args[1]
-        path = args[2]
-        main(project_name, path)
+    main()
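batched() is the usual itertools batching recipe; store() uses it to push documents to Qdrant in chunks of 100 instead of one oversized call. A standalone check of the chunking (the helper is repeated here so the snippet runs without store.py's heavy imports):

```python
from itertools import islice


def batched(iterable, *, size=100):
    # Yield consecutive tuples of at most `size` items; the last one may be shorter.
    if size < 1:
        raise ValueError("size must be at least one")
    it = iter(iterable)
    while batch := tuple(islice(it, size)):
        yield batch


print(list(batched("ABCDEFG", size=3)))
# [('A', 'B', 'C'), ('D', 'E', 'F'), ('G',)]
```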