boryasbora committed on
Commit
ee03791
1 Parent(s): 9514ca1

Create scrape_github.py

Browse files
Files changed (1) hide show
  1. scrape_github.py +86 -0
scrape_github.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
from __future__ import annotations

import os
import re
from typing import Iterable

import requests

# NOTE(review): `Document` is used below but never imported — it needs e.g.
# `from langchain.schema import Document`; not added here to avoid guessing
# the exact langchain module path.
# --- GitHub API configuration -------------------------------------------
# Organization whose repositories we scrape.
org_name = "oceanhackweek"
url = f"https://api.github.com/orgs/oceanhackweek/repos"

# Optional personal access token, read from the `git_token` env var.
access_token = os.getenv('git_token')
headers = {
    "Accept": "application/vnd.github.v3+json",
}
# Only send an Authorization header when a token is actually configured.
# The original unconditionally sent "token None" when the env var was
# unset, which GitHub rejects with 401 instead of treating as anonymous.
if access_token:
    headers["Authorization"] = f"token {access_token}"

# Directory where downloaded README files are stored.
os.makedirs('readmes_proj', exist_ok=True)

# Maps saved README filename -> repository HTML URL
# (filled in by download_readme, consumed when building documents).
repo_links = {}
def download_readme(repo_name, repo_html_url):
    """Download one repository's README.md into ./readmes_proj.

    Tries the `main` branch first and falls back to `master`, since the
    original hard-coded `main` and therefore always failed for older
    repositories whose default branch is still `master`.

    On success the file is saved as `<repo_name>_README.md` and the repo's
    HTML URL is recorded in the module-level `repo_links` dict.

    Args:
        repo_name: repository name within the organization.
        repo_html_url: the repository's web URL (stored as metadata source).
    """
    last_error = None
    for branch in ("main", "master"):
        readme_url = f"https://raw.githubusercontent.com/{org_name}/{repo_name}/{branch}/README.md"
        try:
            response = requests.get(readme_url)
            response.raise_for_status()  # Raise an error for bad responses
        except requests.exceptions.HTTPError as e:
            last_error = e
            continue  # try the next candidate branch

        file_name = f"{repo_name}_README.md"
        file_path = os.path.join('readmes_proj', file_name)
        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(response.text)

        # Save the repo link so documents can link back to their source.
        repo_links[file_name] = repo_html_url
        print(f"Downloaded: {repo_name}")
        return

    print(f"Failed to download {repo_name}: {last_error}")
def get_repositories(url):
    """Return every repository record for the org, following pagination.

    GitHub paginates /orgs/<org>/repos; `requests` exposes the `Link`
    response header via `response.links`, which points at the next page
    until exhausted.
    """
    collected = []
    next_url = url
    while next_url:
        page = requests.get(next_url, headers=headers)
        page.raise_for_status()
        collected.extend(page.json())
        # Follow the pagination chain; None terminates the loop.
        next_url = page.links.get('next', {}).get('url')
    return collected
# Fetch every repo in the organization, then download READMEs from the
# project repositories (those whose name contains "proj").
repos = get_repositories(url)
for repo in repos:
    repo_name, repo_html_url = repo["name"], repo["html_url"]
    if "proj" not in repo_name:
        continue
    download_readme(repo_name, repo_html_url)
def load_md_to_langchain_document(readme_dict, filename):
    """Load one saved README and wrap it in a LangChain Document.

    Args:
        readme_dict: mapping of README filename -> repository HTML URL
            (the module-level `repo_links`).
        filename: key into `readme_dict`; also the file's name inside
            the ./readmes_proj directory.

    Returns:
        A Document whose page_content is the (lightly corrected) markdown
        and whose metadata["source"] is the repository's HTML URL.
    """
    # Bug fix: the path previously ignored `filename` entirely, so every
    # call tried to open the same non-existent file.
    with open(os.path.join('readmes_proj', filename), 'r', encoding='utf-8') as file:
        markdown_content = file.read()
    # NOTE(review): this pattern appears intended to repair broken markdown
    # links by inserting a space — confirm the trailing `\\` (a literal
    # backslash) is intentional and not a typo for `\)`.
    corrected_content = re.sub(r'(\[.*?\]\(.*?\\)', r'\1 ', markdown_content)

    return Document(
        page_content=corrected_content,
        metadata={"source": readme_dict[filename]},
    )
# Example usage: build a LangChain Document for every README saved above.
documents = [
    load_md_to_langchain_document(repo_links, filename)
    for filename in repo_links
]
78
+
79
+ def save_docs_to_jsonl(array:Iterable[Document], file_path:str)->None:
80
+ '''
81
+ save langchain documents as json file
82
+ '''
83
+ with open(file_path, 'w') as jsonl_file:
84
+ for doc in array:
85
+ jsonl_file.write(doc.json() + '\n')
86
+ save_docs_to_jsonl(documents,'project_readmes.json')