shimizukawa committed
Commit 23687d1
Parent: 341f67a

restore github issue loader, refactoring

Files changed (8):
  1. README.md +35 -0
  2. app.py +18 -15
  3. loaders/__init__.py +11 -0
  4. loaders/github_issue.py +63 -0
  5. doc_loader.py → loaders/wikipage.py +13 -11
  6. model.py +0 -11
  7. models.py +23 -0
  8. store.py +39 -18
README.md CHANGED
@@ -11,3 +11,38 @@ license: mit
 ---
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+
+# import GitHub issues
+
+## export from github
+first, generate token on: https://github.com/settings/tokens
+
+```
+$ git clone https://github.com/kazamori/github-api-tools
+$ pip install -e ./github-api-tools
+$ export GITHUB_API_TOKEN="********"
+$ gh-cli-issues --repository <org/repo>
+$ ls <repo>-issues.json
+```
+
+## import from json
+
+```
+$ python store.py -l github_issue <index> ../<repo>-issues.json
+```
+
+# import Wiki Pages
+
+## export from somewhere
+
+create `pages.json` like:
+```json
+{"id": <page_id>, "title": <page title>, "content": <page body>, "ctime": ..., "user": <name>, "url": "https:..."}
+{"title": ...}
+```
+
+## import from json
+
+```
+$ python store.py -l wikipage <index> ../pages.json
+```
app.py CHANGED
@@ -1,6 +1,7 @@
 from time import time
 from datetime import datetime
 from typing import Iterable
+
 import streamlit as st
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
@@ -12,8 +13,9 @@ from qdrant_client.http.models import Filter, FieldCondition, MatchValue, Range
 from langchain.chains import RetrievalQA
 from openai.error import InvalidRequestError
 from langchain.chat_models import ChatOpenAI
+
 from config import DB_CONFIG
-from model import Doc
+from models import BaseModel
 
 
 @st.cache_resource
@@ -150,9 +152,9 @@ def _get_related_url(metadata) -> Iterable[str]:
 
 def _get_query_str_filter(
     query: str,
-    project_name: str,
+    index: str,
 ) -> tuple[str, Filter]:
-    options = [{"key": "metadata.project_name", "value": project_name}]
+    options = [{"key": "metadata.index", "value": index}]
     filter = make_filter_obj(options=options)
     return query, filter
 
@@ -160,10 +162,10 @@ def _get_query_str_filter(
 def run_qa(
     llm,
     query: str,
-    project_name: str,
+    index: str,
 ) -> tuple[str, str]:
     now = time()
-    query_str, filter = _get_query_str_filter(query, project_name)
+    query_str, filter = _get_query_str_filter(query, index)
     qa = get_retrieval_qa(filter, llm)
     try:
         result = qa(query_str)
@@ -178,29 +180,30 @@ def run_qa(
 
 def run_search(
     query: str,
-    project_name: str,
-) -> Iterable[tuple[Doc, float, str]]:
-    query_str, filter = _get_query_str_filter(query, project_name)
+    index: str,
+) -> Iterable[tuple[BaseModel, float, str]]:
+    query_str, filter = _get_query_str_filter(query, index)
     qdocs = get_similay(query_str, filter)
     for qdoc, score in qdocs:
         text = qdoc.page_content
         metadata = qdoc.metadata
         # print(metadata)
-        doc = Doc(
-            project_name=project_name,
+        data = BaseModel(
+            index=index,
            id=metadata.get("id"),
            title=metadata.get("title"),
            ctime=metadata.get("ctime"),
            user=metadata.get("user"),
            url=metadata.get("url"),
+            type=metadata.get("type"),
         )
-        yield doc, score, text
+        yield data, score, text
 
 
 with st.form("my_form"):
     st.title("Document Search")
     query = st.text_input(label="query")
-    project_name = st.text_input(label="project")
+    index = st.text_input(label="index")
 
     submit_col1, submit_col2 = st.columns(2)
     searched = submit_col1.form_submit_button("Search")
@@ -209,7 +212,7 @@ with st.form("my_form"):
     st.header("Search Results")
     st.divider()
     with st.spinner("Searching..."):
-        results = run_search(query, project_name)
+        results = run_search(query, index)
         for doc, score, text in results:
            title = doc.title
            url = doc.url
@@ -232,7 +235,7 @@ with st.form("my_form"):
         results = run_qa(
            LLM,
            query,
-            project_name,
+            index,
         )
         answer, html = results
         with st.container():
@@ -249,7 +252,7 @@ with st.form("my_form"):
         results = run_qa(
            VICUNA_LLM,
            query,
-            project_name,
+            index,
         )
         answer, html = results
         with st.container():
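Note on the app.py change: search and QA results are now filtered on `metadata.index` rather than `metadata.project_name`. `make_filter_obj` is defined elsewhere in app.py and is not part of this diff; judging by the `Filter`/`FieldCondition`/`MatchValue` imports, it presumably builds a Qdrant payload filter roughly like this (a hypothetical sketch, not the actual helper):

```python
from qdrant_client.http.models import FieldCondition, Filter, MatchValue


def make_filter_obj(options: list[dict]) -> Filter:
    # Hypothetical reconstruction: one "must" condition per option,
    # e.g. options=[{"key": "metadata.index", "value": "wiki"}].
    return Filter(
        must=[
            FieldCondition(key=o["key"], match=MatchValue(value=o["value"]))
            for o in options
        ]
    )
```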
loaders/__init__.py ADDED
@@ -0,0 +1,11 @@
+from .wikipage import WikiPageLoader
+from .github_issue import GithubIssueLoader
+
+LOADERS = {
+    "wikipage": WikiPageLoader,
+    "github_issue": GithubIssueLoader
+}
+LOADER_NAMES = tuple(LOADERS.keys())
+
+def get_loader(loader_name, **kwargs):
+    return LOADERS.get(loader_name)(**kwargs)
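A minimal usage sketch of the new loader registry (the index name and file path are illustrative):

```python
from pathlib import Path

from loaders import LOADER_NAMES, get_loader

print(LOADER_NAMES)  # ("wikipage", "github_issue")

# Look up a loader class by name and instantiate it with its keyword arguments.
loader = get_loader("github_issue", index="myrepo", inputfile=Path("myrepo-issues.json"))
docs = loader.load()  # list of langchain Documents
```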
loaders/github_issue.py ADDED
@@ -0,0 +1,63 @@
+import json
+from dataclasses import asdict
+from pathlib import Path
+from typing import Iterator
+
+from dateutil.parser import parse
+from langchain.docstore.document import Document
+from langchain.document_loaders.base import BaseLoader
+
+from models import GithubIssue
+
+
+def date_to_int(dt_str: str) -> int:
+    dt = parse(dt_str)
+    return int(dt.timestamp())
+
+
+def get_contents(index: str, inputfile: Path) -> Iterator[tuple[GithubIssue, str]]:
+    with inputfile.open("r") as f:
+        obj = [json.loads(line) for line in f]
+        for data in obj:
+            title = data["title"]
+            body = data["body"]
+            issue = GithubIssue(
+                index=index,
+                id=data["number"],
+                title=title,
+                ctime=date_to_int(data["created_at"]),
+                user=data["user.login"],
+                url=data["html_url"],
+                labels=data["labels_"],
+            )
+            text = title
+            if body:
+                text += "\n\n" + body
+            yield issue, text
+            comments = data["comments_"]
+            for comment in comments:
+                issue = GithubIssue(
+                    index=index,
+                    id=comment["id"],
+                    title=data["title"],
+                    ctime=date_to_int(comment["created_at"]),
+                    user=comment["user.login"],
+                    url=comment["html_url"],
+                    labels=data["labels_"],
+                    type="issue_comment",
+                )
+                yield issue, comment["body"]
+
+
+class GithubIssueLoader(BaseLoader):
+    def __init__(self, index: str, inputfile: Path):
+        self.index = index
+        self.inputfile = inputfile
+
+    def lazy_load(self) -> Iterator[Document]:
+        for issue, text in get_contents(self.index, self.inputfile):
+            metadata = asdict(issue)
+            yield Document(page_content=text, metadata=metadata)
+
+    def load(self) -> list[Document]:
+        return list(self.lazy_load())
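`get_contents()` expects ndjson with the flattened keys read above (`number`, `user.login`, `html_url`, `labels_`, `comments_`). A record shaped roughly like this would satisfy the loader; the values are illustrative and the exact gh-cli-issues output format is not shown in this commit (wrapped here for readability, but each record must sit on a single line):

```json
{"number": 42, "title": "Crash on startup", "body": "Steps to reproduce ...",
 "created_at": "2023-07-01T12:34:56Z", "user.login": "octocat",
 "html_url": "https://github.com/org/repo/issues/42", "labels_": ["bug"],
 "comments_": [{"id": 100001, "body": "Cannot reproduce on main.",
                "created_at": "2023-07-02T08:00:00Z", "user.login": "reviewer",
                "html_url": "https://github.com/org/repo/issues/42#issuecomment-100001"}]}
```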
doc_loader.py → loaders/wikipage.py RENAMED
@@ -1,11 +1,13 @@
-from dataclasses import asdict
 import json
+from dataclasses import asdict
+from pathlib import Path
 from typing import Iterator
+
 from dateutil.parser import parse
 from langchain.docstore.document import Document
 from langchain.document_loaders.base import BaseLoader
 
-from model import Doc
+from models import WikiPage
 
 
 def date_to_int(dt_str: str) -> int:
@@ -13,20 +15,20 @@ def date_to_int(dt_str: str) -> int:
     return int(dt.timestamp())
 
 
-def get_contents(project_name: str, filename: str) -> Iterator[tuple[Doc, str]]:
+def get_contents(index: str, inputfile: Path) -> Iterator[tuple[WikiPage, str]]:
     """filename for file with ndjson
 
     {"id": <page_id>, "title": <page title>, "content": <page body>, "ctime": ..., "user": <name>, "url": "https:..."}
     {"title": ...}
     """
-    with open(filename, "r") as f:
+    with inputfile.open("r") as f:
         obj = [json.loads(line) for line in f]
         for data in obj:
            title = data["title"]
            body = data["content"]
            ctime = date_to_int(data["ctime"]) if isinstance(data["ctime"], str) else data["ctime"]
-            doc = Doc(
-                project_name=project_name,
+            doc = WikiPage(
+                index=index,
                id=data["id"],
                title=title,
                ctime=ctime,
@@ -39,13 +41,13 @@ def get_contents(project_name: str, filename: str) -> Iterator[tuple[Doc, str]]:
             yield doc, text
 
 
-class DocLoader(BaseLoader):
-    def __init__(self, project_name: str, filename: str):
-        self.project_name = project_name
-        self.filename = filename
+class WikiPageLoader(BaseLoader):
+    def __init__(self, index: str, inputfile: Path):
+        self.index = index
+        self.inputfile = inputfile
 
     def lazy_load(self) -> Iterator[Document]:
-        for doc, text in get_contents(self.project_name, self.filename):
+        for doc, text in get_contents(self.index, self.inputfile):
             metadata = asdict(doc)
             yield Document(page_content=text, metadata=metadata)
 
model.py DELETED
@@ -1,11 +0,0 @@
-from dataclasses import dataclass
-
-
-@dataclass(frozen=True)
-class Doc:
-    project_name: str
-    id: int
-    title: str
-    ctime: int
-    user: str
-    url: str
models.py ADDED
@@ -0,0 +1,23 @@
+import dataclasses
+
+
+@dataclasses.dataclass(frozen=True)
+class BaseModel:
+    index: str
+    id: int
+    title: str
+    ctime: int
+    user: str
+    url: str
+    type: str
+
+
+@dataclasses.dataclass(frozen=True)
+class GithubIssue(BaseModel):
+    labels: list[str] = dataclasses.field(default_factory=list)
+    type: str = "issue"
+
+
+@dataclasses.dataclass(frozen=True)
+class WikiPage(BaseModel):
+    type: str = "wiki"
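The loaders store `dataclasses.asdict()` of these models as `Document` metadata, which app.py later reads back with `metadata.get("id")`, `metadata.get("type")`, and so on. A small sketch (field values are illustrative):

```python
from dataclasses import asdict

from models import GithubIssue

issue = GithubIssue(
    index="myrepo",
    id=42,
    title="Crash on startup",
    ctime=1688208896,
    user="octocat",
    url="https://github.com/org/repo/issues/42",
    labels=["bug"],
)
print(asdict(issue)["type"])  # "issue"; issue comments are yielded with type="issue_comment"
```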
store.py CHANGED
@@ -1,10 +1,14 @@
+import argparse
+from itertools import islice
+from pathlib import Path
+
 from tqdm import tqdm
 import torch
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain.embeddings import HuggingFaceEmbeddings
 from langchain.vectorstores import Qdrant
 
-from doc_loader import DocLoader
+from loaders import get_loader, LOADER_NAMES
 from config import DB_CONFIG
 
 
@@ -19,6 +23,16 @@ def get_text_chunk(docs):
     return texts
 
 
+def batched(iterable, *, size=100):
+    "Batch data into tuples of length n. The last batch may be shorter."
+    # batched('ABCDEFG', 3) --> ABC DEF G
+    if size < 1:
+        raise ValueError('n must be at least one')
+    it = iter(iterable)
+    while batch := tuple(islice(it, size)):
+        yield batch
+
+
 def store(texts):
     model_name = "intfloat/multilingual-e5-large"
     model_kwargs = {"device": "cuda:0" if torch.cuda.is_available() else "cpu"}
@@ -29,9 +43,9 @@
         encode_kwargs=encode_kwargs,
     )
     db_url, db_api_key, db_collection_name = DB_CONFIG
-    for text in tqdm(texts):
+    for batch in tqdm(batched(texts, size=100)):
         _ = Qdrant.from_documents(
-            [text],
+            batch,
            embeddings,
            url=db_url,
            api_key=db_api_key,
@@ -39,24 +53,31 @@
     )
 
 
-def main(project_name: str, path: str) -> None:
-    loader = DocLoader(project_name, path)
+def get_parser():
+    p = argparse.ArgumentParser()
+    p.add_argument("index", type=str)
+    p.add_argument("inputfile", metavar="INPUTFILE", type=argparse.FileType("rt"))
+    p.add_argument("-l", "--loader", type=str, choices=LOADER_NAMES, required=True)
+    return p
+
+
+def main():
+    """
+    $ python store.py --loader wikipage "index" "FILE_PATH"
+    $ python store.py -l wikipage wiki data/wiki.json
+    """
+    p = get_parser()
+    args = p.parse_args()
+    loader = get_loader(
+        args.loader,
+        index=args.index,
+        inputfile=Path(args.inputfile.name),
+    )
+
     docs = loader.load()
     texts = get_text_chunk(docs)
     store(texts)
 
 
 if __name__ == "__main__":
-    """
-    $ python store.py "PROJECT_NAME" "FILE_PATH"
-    $ python store.py hoge data/hoge-docs.json
-    """
-    import sys
-
-    args = sys.argv
-    if len(args) != 3:
-        print("No args, you need two args for project_name, json_file_path")
-    else:
-        project_name = args[1]
-        path = args[2]
-        main(project_name, path)
+    main()
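The practical effect of the store() change is that chunks are now upserted to Qdrant 100 at a time instead of one call per chunk. The helper mirrors the itertools `batched` recipe; a standalone sketch of its behavior (re-declared here so it runs without store.py's torch/langchain dependencies):

```python
from itertools import islice


def batched(iterable, *, size=100):
    """Yield tuples of at most `size` items; the last batch may be shorter."""
    if size < 1:
        raise ValueError("size must be at least one")
    it = iter(iterable)
    while batch := tuple(islice(it, size)):
        yield batch


chunks = [f"chunk-{i}" for i in range(250)]
print([len(b) for b in batched(chunks, size=100)])  # [100, 100, 50]
```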