import asyncio
import re
import urllib.parse

from bs4 import BeautifulSoup
from crawl4ai import AsyncWebCrawler


def process_url(url, sub_url):
    """Resolve a relative sub-URL against its base URL."""
    return urllib.parse.urljoin(url, sub_url)


def clean_markdown(res):
    """Strip markdown links, bare URLs, empty bullets, and repeated blank lines."""
    pattern = r'\[.*?\]\(.*?\)'
    try:
        # Use re.sub() to replace matched markdown links with an empty string
        result = re.sub(pattern, '', res)
        url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
        result = re.sub(url_pattern, '', result)
        result = result.replace("* \n", "")
        result = re.sub(r"\n\n+", "\n", result)
        return result
    except Exception:
        return res


async def get_info(url, screenshot=True) -> tuple:
    """Crawl a page and return its HTML, cleaned markdown, and (optionally) a screenshot."""
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url, screenshot=screenshot)
        if screenshot:
            return result.html, clean_markdown(result.markdown), result.screenshot
        return result.html, clean_markdown(result.markdown)


if __name__ == "__main__":
    asyncio.run(get_info("https://2024.aclweb.org/"))