import os
import re
import requests
from typing import Iterable

# LangChain's Document class; depending on the installed version this may instead
# need to be imported from langchain_core.documents
from langchain.schema import Document

# Define the GitHub API endpoint for the organization's repositories
org_name = "oceanhackweek"
url = f"https://api.github.com/orgs/{org_name}/repos"

# Set your personal access token here if needed
access_token = os.getenv('git_token')
headers = {"Accept": "application/vnd.github.v3+json"}
if access_token:
    headers["Authorization"] = f"token {access_token}"  # Only sent when a token is set

# Create a directory to store the README files
if not os.path.exists('readmes_proj'):
    os.makedirs('readmes_proj')

# Dictionary to store the mapping of filename to repository link
repo_links = {}


def download_readme(repo_name, repo_html_url):
    # Construct the URL for the README file in the repository (assumes the default branch is 'main')
    readme_url = f"https://raw.githubusercontent.com/{org_name}/{repo_name}/main/README.md"
    try:
        response = requests.get(readme_url)
        response.raise_for_status()  # Raise an error for bad responses
        file_name = f"{repo_name}_README.md"
        file_path = os.path.join('readmes_proj', file_name)
        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(response.text)
        # Save the repo link in the dictionary
        repo_links[file_name] = repo_html_url
        print(f"Downloaded: {repo_name}")
    except requests.exceptions.HTTPError as e:
        print(f"Failed to download {repo_name}: {e}")


def get_repositories(url):
    repos = []
    while url:
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        repos.extend(response.json())
        # Check if there is a next page
        url = response.links.get('next', {}).get('url')
    return repos


repos = get_repositories(url)
for repo in repos:
    repo_name = repo["name"]
    repo_html_url = repo["html_url"]
    if "proj" in repo_name:
        download_readme(repo_name, repo_html_url)


def load_md_to_langchain_document(readme_dict, filename):
    # Load the markdown content from a file
    with open(f'./readmes_proj/{filename}', 'r', encoding='utf-8') as file:
        markdown_content = file.read()
    # Insert a space after backslashes that appear inside markdown links
    corrected_content = re.sub(r'(\[.*?\]\(.*?\\)', r'\1 ', markdown_content)
    # Create a LangChain Document, keeping the repository URL as the source metadata
    langchain_document = Document(
        page_content=corrected_content,
        metadata={"source": readme_dict[filename]}
    )
    return langchain_document


# Example usage
documents = []
for filename in repo_links:
    langchain_doc = load_md_to_langchain_document(repo_links, filename)
    documents.append(langchain_doc)


def save_docs_to_jsonl(array: Iterable[Document], file_path: str) -> None:
    '''Save LangChain documents to a JSON Lines file, one document per line.'''
    with open(file_path, 'w') as jsonl_file:
        for doc in array:
            jsonl_file.write(doc.json() + '\n')


save_docs_to_jsonl(documents, 'project_readmes.json')
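

# Optional round-trip check (a minimal sketch, not part of the original workflow):
# load_docs_from_jsonl is a hypothetical helper that reads the JSON Lines file
# written above back into LangChain Document objects. It assumes each line
# contains the fields produced by doc.json(), i.e. page_content and metadata.
import json


def load_docs_from_jsonl(file_path: str) -> list:
    docs = []
    with open(file_path, 'r', encoding='utf-8') as jsonl_file:
        for line in jsonl_file:
            data = json.loads(line)
            docs.append(Document(
                page_content=data["page_content"],
                metadata=data.get("metadata", {})
            ))
    return docs


# Example: reload the saved documents and confirm nothing was lost
# reloaded = load_docs_from_jsonl('project_readmes.json')
# assert len(reloaded) == len(documents)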