python-no-senpai / loaders /rtdhtmlpage.py
shimizukawa's picture
provide djangoproject special rule
4cf5bcf
raw
history blame contribute delete
No virus
2.7 kB
from datetime import datetime
from pathlib import Path
from typing import Iterator
from langchain.docstore.document import Document
from langchain.document_loaders import ReadTheDocsLoader
class RTDHtmlPageLoader(ReadTheDocsLoader):
"""directory path for readthedocs documents
$ wget -r -np -A.html https://docs.djangoproject.com/en/4.2/
$ python store.py -l rtdhtmlpage django ./docs.djangoproject.com/
"""
def __init__(self, inputfile: Path, *args, **kwargs):
kwargs["custom_html_tag"] = ("div", {"id": "docs-content"})
super().__init__(inputfile, *args, **kwargs)
def _my_clean_data(self, data: str) -> str:
from bs4 import BeautifulSoup
soup = BeautifulSoup(data, **self.bs_kwargs)
# default tags
html_tags = [
("div", {"role": "main"}),
("main", {"id": "main-content"}),
]
if self.custom_html_tag is not None:
html_tags.append(self.custom_html_tag)
text = None
# reversed order. check the custom one first
for tag, attrs in html_tags[::-1]:
text = soup.find(tag, attrs)
# if found, break
if text is not None:
break
if text is not None:
title = "".join(t.text for t in text.find("h1") if t.name!="a")
text = text.get_text()
else:
text = ""
title = ""
# trim empty lines
text = "\n".join([t for t in text.split("\n") if t])
return text, title
def lazy_load(self) -> Iterator[Document]:
"""Load documents."""
for p in self.file_path.rglob("*"):
if p.is_dir():
continue
# FIXME: utf-8を指定したい
# with open(p, encoding='utf-8', errors='ignore') as f:
with open(p, encoding=self.encoding, errors=self.errors) as f:
text, title = self._my_clean_data(f.read())
if "docs.djangoproject.com" in p.parts and p.name == "index.html":
# Djangoドキュメントではindex.htmlにアクセスすると404になる
p = p.parent
url = f"https://{str(p)}/"
else:
url = f"https://{str(p)}"
metadata = {
"title": title,
"ctime": int(datetime.now().timestamp()),
"user": "rtd",
"type": "rtd",
"url": url,
"id": str(p),
}
# print(metadata)
yield Document(page_content=text, metadata=metadata)
def load(self) -> list[Document]:
return list(self.lazy_load())