shimizukawa committed on
Commit 8d5b271
1 Parent(s): c1dc2ee

refactoring: move index annotation

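In short: the loaders no longer take an `index` argument, and `store.py` stamps the index onto each loaded document instead. A rough sketch of the resulting flow in `main()`, using only names that appear in this diff (`get_loader`, `get_text_chunk`, `store`, `index_annotated_docs`); argument handling is simplified here:

    loader = get_loader(args.loader, inputfile=Path(args.inputfile))  # index= is no longer passed
    docs = loader.lazy_load()                                         # lazy iteration instead of load()
    texts = get_text_chunk(index_annotated_docs(docs, args.index))    # index added to metadata here
    store(texts)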
loaders/github_issue.py CHANGED
@@ -15,14 +15,13 @@ def date_to_int(dt_str: str) -> int:
     return int(dt.timestamp())
 
 
-def get_contents(index: str, inputfile: Path) -> Iterator[tuple[GithubIssue, str]]:
+def get_contents(inputfile: Path) -> Iterator[tuple[GithubIssue, str]]:
     with inputfile.open("r") as f:
         obj = [json.loads(line) for line in f]
     for data in obj:
         title = data["title"]
         body = data["body"]
         issue = GithubIssue(
-            index=index,
             id=data["number"],
             title=title,
             ctime=date_to_int(data["created_at"]),
@@ -37,7 +36,6 @@ def get_contents(index: str, inputfile: Path) -> Iterator[tuple[GithubIssue, str
         comments = data["comments_"]
         for comment in comments:
             issue = GithubIssue(
-                index=index,
                 id=comment["id"],
                 title=data["title"],
                 ctime=date_to_int(comment["created_at"]),
@@ -50,12 +48,11 @@ def get_contents(index: str, inputfile: Path) -> Iterator[tuple[GithubIssue, str
 
 
 class GithubIssueLoader(BaseLoader):
-    def __init__(self, index: str, inputfile: Path):
-        self.index = index
+    def __init__(self, inputfile: Path):
         self.inputfile = inputfile
 
     def lazy_load(self) -> Iterator[Document]:
-        for issue, text in get_contents(self.index, self.inputfile):
+        for issue, text in get_contents(self.inputfile):
             metadata = asdict(issue)
             yield Document(page_content=text, metadata=metadata)
 
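With the `index` parameter gone, using this loader takes only the input file. A usage sketch; the import path and file name below are assumptions inferred from the repo layout:

    from pathlib import Path
    from loaders.github_issue import GithubIssueLoader  # import path assumed

    loader = GithubIssueLoader(inputfile=Path("issues.ndjson"))  # hypothetical ndjson export
    for doc in loader.lazy_load():
        print(doc.metadata["id"], doc.metadata["title"])  # metadata comes from asdict(issue)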
loaders/rtdhtmlpage.py CHANGED
@@ -12,8 +12,7 @@ class RTDHtmlPageLoader(ReadTheDocsLoader):
     $ wget -r -np -A.html https://docs.djangoproject.com/en/4.2/
     $ python store.py -l rtdhtmlpage django ./docs.djangoproject.com/
     """
-    def __init__(self, index: str, inputfile: Path, *args, **kwargs):
-        self.index = index
+    def __init__(self, inputfile: Path, *args, **kwargs):
         kwargs["custom_html_tag"] = ("div", {"id": "docs-content"})
         super().__init__(inputfile, *args, **kwargs)
 
@@ -66,7 +65,6 @@ class RTDHtmlPageLoader(ReadTheDocsLoader):
             "user": "rtd",
             "type": "rtd",
             "url": f"https://{str(p)}",
-            "index": self.index,
             "id": str(p),
         }
         # print(metadata)
loaders/wikipage.py CHANGED
@@ -15,7 +15,7 @@ def date_to_int(dt_str: str) -> int:
     return int(dt.timestamp())
 
 
-def get_contents(index: str, inputfile: Path) -> Iterator[tuple[WikiPage, str]]:
+def get_contents(inputfile: Path) -> Iterator[tuple[WikiPage, str]]:
     """filename for file with ndjson
 
     {"id": <page_id>, "title": <page title>, "content": <page body>, "ctime": ..., "user": <name>, "url": "https:..."}
@@ -28,7 +28,6 @@ def get_contents(index: str, inputfile: Path) -> Iterator[tuple[WikiPage, str]]:
         body = data["content"]
         ctime = date_to_int(data["ctime"]) if isinstance(data["ctime"], str) else data["ctime"]
         doc = WikiPage(
-            index=index,
             id=data["id"],
             title=title,
             ctime=ctime,
@@ -42,12 +41,11 @@ def get_contents(index: str, inputfile: Path) -> Iterator[tuple[WikiPage, str]]:
 
 
 class WikiPageLoader(BaseLoader):
-    def __init__(self, index: str, inputfile: Path):
-        self.index = index
+    def __init__(self, inputfile: Path):
         self.inputfile = inputfile
 
     def lazy_load(self) -> Iterator[Document]:
-        for doc, text in get_contents(self.index, self.inputfile):
+        for doc, text in get_contents(self.inputfile):
             metadata = asdict(doc)
             yield Document(page_content=text, metadata=metadata)
 
models.py CHANGED
@@ -3,13 +3,13 @@ import dataclasses
 
 @dataclasses.dataclass(frozen=True)
 class BaseModel:
-    index: str
     id: int
     title: str
     ctime: int
     user: str
    url: str
     type: str
+    index: str = ""
 
 
 @dataclasses.dataclass(frozen=True)
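Why the field moves instead of just getting a default in place: dataclass fields without defaults may not follow fields with defaults, so `index: str = ""` has to come after the required fields. With the default, models can be built without an index and `asdict()` still emits the key. A small sketch using the fields from this diff; the example values are made up:

    import dataclasses

    @dataclasses.dataclass(frozen=True)
    class BaseModel:
        id: int
        title: str
        ctime: int
        user: str
        url: str
        type: str
        index: str = ""  # optional now; store.py fills it in later

    m = BaseModel(id=1, title="t", ctime=0, user="u", url="https://example.com", type="issue")
    print(dataclasses.asdict(m)["index"])  # -> "" until index_annotated_docs() overwrites it in the Document metadata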
store.py CHANGED
@@ -61,6 +61,12 @@ def get_parser():
     return p
 
 
+def index_annotated_docs(docs, index):
+    for doc in docs:
+        doc.metadata["index"] = index
+        yield doc
+
+
 def main():
     """
     $ python store.py --loader wikipage "index" "FILE_PATH"
@@ -71,12 +77,11 @@ def main():
     args = p.parse_args()
     loader = get_loader(
         args.loader,
-        index=args.index,
         inputfile=Path(args.inputfile),
     )
 
-    docs = loader.load()
-    texts = get_text_chunk(docs)
+    docs = loader.lazy_load()
+    texts = get_text_chunk(index_annotated_docs(docs, args.index))
     store(texts)
 
 
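For reference, `index_annotated_docs` is a plain generator, so documents stream through `get_text_chunk` one at a time rather than being materialized first. A minimal standalone sketch of the same idea; the `langchain_core.documents` import path and the sample values are assumptions:

    from langchain_core.documents import Document  # assumed import path for langchain's Document

    def index_annotated_docs(docs, index):
        # same generator as in store.py: tag each document's metadata lazily
        for doc in docs:
            doc.metadata["index"] = index
            yield doc

    docs = (Document(page_content="body", metadata={"id": n}) for n in range(2))
    for doc in index_annotated_docs(docs, "django"):
        print(doc.metadata)  # {'id': 0, 'index': 'django'}, then {'id': 1, 'index': 'django'}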