Spaces:

shimizukawa
/

python-no-senpai

Running

App Files Files Community

python-no-senpai / loaders /rtdhtmlpage.py

shimizukawa

provide djangoproject special rule

4cf5bcf 11 months ago

raw

history blame contribute delete

No virus

2.7 kB

	from datetime import datetime
	from pathlib import Path
	from typing import Iterator

	from langchain.docstore.document import Document
	from langchain.document_loaders import ReadTheDocsLoader


	class RTDHtmlPageLoader(ReadTheDocsLoader):
	    """Load a directory tree of mirrored documentation HTML pages.

	    Every file below the given directory is parsed with BeautifulSoup, the
	    main content element is extracted, and one ``Document`` per file is
	    produced. Its metadata carries the page title and a URL rebuilt from
	    the file path.

	    Example::

	        $ wget -r -np -A.html https://docs.djangoproject.com/en/4.2/
	        $ python store.py -l rtdhtmlpage django ./docs.djangoproject.com/
	    """

	    def __init__(self, inputfile: Path, *args, **kwargs):
	        """Create the loader with ``<div id="docs-content">`` as content tag.

	        Args:
	            inputfile: Root directory of the mirrored documentation.
	            *args: Extra positional arguments forwarded to the parent
	                constructor.
	            **kwargs: Extra keyword arguments forwarded to the parent
	                constructor. A caller-supplied ``custom_html_tag`` is
	                overwritten.
	        """
	        kwargs["custom_html_tag"] = ("div", {"id": "docs-content"})
	        super().__init__(inputfile, *args, **kwargs)

	    def _my_clean_data(self, data: str) -> tuple[str, str]:
	        """Extract the main text and the title from one HTML page.

	        Args:
	            data: Raw HTML source of the page.

	        Returns:
	            A ``(text, title)`` pair. ``text`` is the text of the content
	            element with empty lines removed. ``title`` is the text of the
	            first ``<h1>`` inside that element, skipping its direct ``<a>``
	            children. Both are empty strings when no content element is
	            found; ``title`` is empty when the element has no ``<h1>``.
	        """
	        from bs4 import BeautifulSoup

	        soup = BeautifulSoup(data, **self.bs_kwargs)

	        # Default content selectors; the custom one is appended last so that
	        # the reversed scan below tries it first.
	        candidates = [
	            ("div", {"role": "main"}),
	            ("main", {"id": "main-content"}),
	        ]
	        if self.custom_html_tag is not None:
	            candidates.append(self.custom_html_tag)

	        content = None
	        for tag, attrs in reversed(candidates):
	            content = soup.find(tag, attrs)
	            if content is not None:
	                break

	        if content is None:
	            return "", ""

	        heading = content.find("h1")
	        if heading is None:
	            # Pages without a heading get an empty title instead of failing
	            # when iterating over ``None``.
	            title = ""
	        else:
	            # Skip direct <a> children of the heading when building the title.
	            title = "".join(
	                node.text for node in heading if node.name != "a"
	            )

	        # Trim empty lines.
	        lines = content.get_text().split("\n")
	        text = "\n".join(line for line in lines if line)

	        return text, title

	    def lazy_load(self) -> Iterator[Document]:
	        """Yield one ``Document`` for every file below ``self.file_path``.

	        The URL in the metadata is ``https://`` followed by the file path.
	        NOTE(review): this assumes the path is relative and starts with the
	        host name directory (as ``wget -r`` creates it) - confirm for other
	        layouts.

	        Yields:
	            Documents whose ``page_content`` is the cleaned page text and
	            whose metadata holds ``title``, ``ctime``, ``user``, ``type``,
	            ``url`` and ``id``.
	        """
	        for p in self.file_path.rglob("*"):
	            if p.is_dir():
	                continue
	            # FIXME: would prefer to pin encoding='utf-8', errors='ignore'
	            # here instead of relying on the loader's configured values.
	            with open(p, encoding=self.encoding, errors=self.errors) as f:
	                text, title = self._my_clean_data(f.read())

	            if "docs.djangoproject.com" in p.parts and p.name == "index.html":
	                # In the Django documentation, requesting index.html itself
	                # returns 404, so link to the containing directory instead.
	                p = p.parent
	                url = f"https://{p.as_posix()}/"
	            else:
	                # as_posix() keeps forward slashes in the URL on every OS.
	                url = f"https://{p.as_posix()}"

	            metadata = {
	                "title": title,
	                "ctime": int(datetime.now().timestamp()),
	                "user": "rtd",
	                "type": "rtd",
	                "url": url,
	                "id": str(p),
	            }
	            yield Document(page_content=text, metadata=metadata)

	    def load(self) -> list[Document]:
	        """Return all documents from ``lazy_load`` as a list."""
	        return list(self.lazy_load())