boryasbora committed on
Commit
ee03791
1 Parent(s): 9514ca1

Create scrape_github.py

Browse files
Files changed (1) hide show
  1. scrape_github.py +86 -0
scrape_github.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
from __future__ import annotations

import os
import re
from typing import Iterable

import requests

# NOTE(review): `Document` is used below but never imported — it needs e.g.
# `from langchain.schema import Document`; not added here to avoid guessing
# the exact langchain module path.
# --- GitHub API configuration -------------------------------------------
# Organization whose repositories we scrape.
org_name = "oceanhackweek"
url = f"https://api.github.com/orgs/oceanhackweek/repos"

# Optional personal access token, read from the `git_token` env var.
access_token = os.getenv('git_token')
headers = {
    "Accept": "application/vnd.github.v3+json",
}
# Only send an Authorization header when a token is actually configured.
# The original unconditionally sent "token None" when the env var was
# unset, which GitHub rejects with 401 instead of treating as anonymous.
if access_token:
    headers["Authorization"] = f"token {access_token}"

# Directory where downloaded README files are stored.
os.makedirs('readmes_proj', exist_ok=True)

# Maps saved README filename -> repository HTML URL
# (filled in by download_readme, consumed when building documents).
repo_links = {}
def download_readme(repo_name, repo_html_url):
    """Download one repository's README.md into ./readmes_proj.

    Tries the `main` branch first and falls back to `master`, since the
    original hard-coded `main` and therefore always failed for older
    repositories whose default branch is still `master`.

    On success the file is saved as `<repo_name>_README.md` and the repo's
    HTML URL is recorded in the module-level `repo_links` dict.

    Args:
        repo_name: repository name within the organization.
        repo_html_url: the repository's web URL (stored as metadata source).
    """
    last_error = None
    for branch in ("main", "master"):
        readme_url = f"https://raw.githubusercontent.com/{org_name}/{repo_name}/{branch}/README.md"
        try:
            response = requests.get(readme_url)
            response.raise_for_status()  # Raise an error for bad responses
        except requests.exceptions.HTTPError as e:
            last_error = e
            continue  # try the next candidate branch

        file_name = f"{repo_name}_README.md"
        file_path = os.path.join('readmes_proj', file_name)
        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(response.text)

        # Save the repo link so documents can link back to their source.
        repo_links[file_name] = repo_html_url
        print(f"Downloaded: {repo_name}")
        return

    print(f"Failed to download {repo_name}: {last_error}")
def get_repositories(url):
    """Return every repository record for the org, following pagination.

    GitHub paginates /orgs/<org>/repos; `requests` exposes the `Link`
    response header via `response.links`, which points at the next page
    until exhausted.
    """
    collected = []
    next_url = url
    while next_url:
        page = requests.get(next_url, headers=headers)
        page.raise_for_status()
        collected.extend(page.json())
        # Follow the pagination chain; None terminates the loop.
        next_url = page.links.get('next', {}).get('url')
    return collected
# Fetch every repo in the organization, then download READMEs from the
# project repositories (those whose name contains "proj").
repos = get_repositories(url)
for repo in repos:
    repo_name, repo_html_url = repo["name"], repo["html_url"]
    if "proj" not in repo_name:
        continue
    download_readme(repo_name, repo_html_url)
def load_md_to_langchain_document(readme_dict, filename):
    """Load one saved README and wrap it in a LangChain Document.

    Args:
        readme_dict: mapping of README filename -> repository HTML URL
            (the module-level `repo_links`).
        filename: key into `readme_dict`; also the file's name inside
            the ./readmes_proj directory.

    Returns:
        A Document whose page_content is the (lightly corrected) markdown
        and whose metadata["source"] is the repository's HTML URL.
    """
    # Bug fix: the path previously ignored `filename` entirely, so every
    # call tried to open the same non-existent file.
    with open(os.path.join('readmes_proj', filename), 'r', encoding='utf-8') as file:
        markdown_content = file.read()
    # NOTE(review): this pattern appears intended to repair broken markdown
    # links by inserting a space — confirm the trailing `\\` (a literal
    # backslash) is intentional and not a typo for `\)`.
    corrected_content = re.sub(r'(\[.*?\]\(.*?\\)', r'\1 ', markdown_content)

    return Document(
        page_content=corrected_content,
        metadata={"source": readme_dict[filename]},
    )
# Example usage: build a LangChain Document for every README saved above.
documents = [
    load_md_to_langchain_document(repo_links, filename)
    for filename in repo_links
]
78
+
79
+ def save_docs_to_jsonl(array:Iterable[Document], file_path:str)->None:
80
+ '''
81
+ save langchain documents as json file
82
+ '''
83
+ with open(file_path, 'w') as jsonl_file:
84
+ for doc in array:
85
+ jsonl_file.write(doc.json() + '\n')
86
+ save_docs_to_jsonl(documents,'project_readmes.json')