Spaces:

Ahmadzei
/

RAG

Runtime error

RAG / prep_scripts /markdown_to_text.py

refactoring

fde33b3 10 months ago

1.76 kB

	import argparse
	import os
	import re

	from tqdm import tqdm
	from bs4 import BeautifulSoup
	from markdown import markdown
	from pathlib import Path


	def markdown_to_text(markdown_string):
	    """Convert a markdown string to plain text.

	    The markdown is rendered to HTML, the text nodes are extracted with
	    BeautifulSoup, and a few clean-up passes remove leftovers (code-fence
	    markers, ``- ...`` fragments, literal ``...`` and surplus blank lines).

	    Args:
	        markdown_string: Markdown source to convert.

	    Returns:
	        The extracted plain text, with every run of two or more newlines
	        collapsed to exactly one blank line.
	    """
	    # md -> html -> text since BeautifulSoup can extract text cleanly
	    html = markdown(markdown_string)

	    # Remove HTML comments. The match is non-greedy and uses DOTALL so each
	    # (possibly multi-line) comment is removed individually; a greedy match
	    # would also swallow all content between the first and the last comment.
	    # Note the alternation/any-char must not be written as `\|`, which only
	    # matches a literal pipe character.
	    html = re.sub(r'<!--.*?-->', '', html, flags=re.DOTALL)
	    # Drop the "bash" language tag that ends up at the start of code spans.
	    html = html.replace('<code>bash', '<code>')

	    # extract text
	    soup = BeautifulSoup(html, "html.parser")
	    text = ''.join(soup.find_all(string=True))

	    # Strip opening code fences together with their language tag. "python"
	    # is listed before "py" so the longer tag wins and no "thon" is left over.
	    text = re.sub(r'```(?:python|py|diff)', '', text)
	    # Strip remaining bare fences that are followed by a newline.
	    text = re.sub(r'```\n', '\n', text)
	    # Remove everything from a "- " marker to the end of its line.
	    text = re.sub(r'- .*', '', text)
	    text = text.replace('...', '')
	    # Collapse runs of blank lines into a single blank line.
	    text = re.sub(r'\n{2,}', '\n\n', text)

	    return text


	def main():
	    """Convert every file under ``--input-dir`` to plain text files.

	    Command-line arguments:
	        --input-dir: directory that is searched recursively for input files
	            (default ``transformers/docs/source/en/``).
	        --output-dir: directory that receives the ``.txt`` files; it is
	            created if missing (default ``docs``).

	    Each input file is passed through ``markdown_to_text`` and written to
	    ``<output-dir>/<parent>_<stem>.txt``, where ``<parent>`` is the name of the
	    file's parent directory, or an empty string when that name equals the
	    input directory's name.

	    Exits with an argparse usage error if the input directory does not exist.
	    """
	    parser = argparse.ArgumentParser()
	    parser.add_argument("--input-dir", help="input directory with markdown", type=str,
	                        default="transformers/docs/source/en/")
	    parser.add_argument("--output-dir", help="output directory to store raw texts", type=str,
	                        default="docs")

	    args = parser.parse_args()
	    input_dir = Path(args.input_dir)
	    output_dir = Path(args.output_dir)

	    # Explicit check instead of `assert`, which is stripped when Python runs
	    # with -O and would let a bad path through silently.
	    if not input_dir.is_dir():
	        parser.error("Input directory doesn't exist")

	    # Collect the inputs up front: tqdm can then show a total, and files
	    # written during this run can never be picked up by the directory walk.
	    files = [path for path in input_dir.rglob("*") if path.is_file()]
	    output_dir.mkdir(parents=True, exist_ok=True)

	    for file in tqdm(files):
	        parent = file.parent.stem if file.parent.stem != input_dir.stem else ""

	        # Explicit UTF-8 so the result does not depend on the platform locale
	        # (assumes the input docs are UTF-8 encoded -- TODO confirm).
	        md = file.read_text(encoding="utf-8")
	        text = markdown_to_text(md)

	        target = output_dir / f"{parent}_{file.stem}.txt"
	        target.write_text(text, encoding="utf-8")


	# Script entry point: run the conversion only when this file is executed
	# directly, not when it is imported as a module.
	if __name__ == "__main__":
	    main()