import os
import re
import requests
from typing import Iterable

# LangChain's Document class; depending on the installed version this may instead
# need to be imported from langchain_core.documents
from langchain.schema import Document

# Define the GitHub API endpoint for the organization's repositories
org_name = "oceanhackweek"
url = f"https://api.github.com/orgs/{org_name}/repos"

# Set your personal access token here if needed
access_token = os.getenv('git_token')
headers = {"Accept": "application/vnd.github.v3+json"}
if access_token:
    headers["Authorization"] = f"token {access_token}"  # Only sent when a token is set

# Create a directory to store the README files
if not os.path.exists('readmes_proj'):
    os.makedirs('readmes_proj')

# Dictionary to store the mapping of filename to repository link
repo_links = {}


def download_readme(repo_name, repo_html_url):
    # Construct the URL for the README file in the repository (assumes the default branch is 'main')
    readme_url = f"https://raw.githubusercontent.com/{org_name}/{repo_name}/main/README.md"
    try:
        response = requests.get(readme_url)
        response.raise_for_status()  # Raise an error for bad responses
        file_name = f"{repo_name}_README.md"
        file_path = os.path.join('readmes_proj', file_name)
        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(response.text)
        # Save the repo link in the dictionary
        repo_links[file_name] = repo_html_url
        print(f"Downloaded: {repo_name}")
    except requests.exceptions.HTTPError as e:
        print(f"Failed to download {repo_name}: {e}")


def get_repositories(url):
    repos = []
    while url:
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        repos.extend(response.json())
        # Check if there is a next page
        url = response.links.get('next', {}).get('url')
    return repos


repos = get_repositories(url)
for repo in repos:
    repo_name = repo["name"]
    repo_html_url = repo["html_url"]
    if "proj" in repo_name:
        download_readme(repo_name, repo_html_url)


def load_md_to_langchain_document(readme_dict, filename):
    # Load the markdown content from a file
    with open(f'./readmes_proj/{filename}', 'r', encoding='utf-8') as file:
        markdown_content = file.read()
    # Insert a space after backslashes that appear inside markdown links
    corrected_content = re.sub(r'(\[.*?\]\(.*?\\)', r'\1 ', markdown_content)
    # Create a LangChain Document, keeping the repository URL as the source metadata
    langchain_document = Document(
        page_content=corrected_content,
        metadata={"source": readme_dict[filename]}
    )
    return langchain_document


# Example usage
documents = []
for filename in repo_links:
    langchain_doc = load_md_to_langchain_document(repo_links, filename)
    documents.append(langchain_doc)


def save_docs_to_jsonl(array: Iterable[Document], file_path: str) -> None:
    '''Save LangChain documents to a JSON Lines file, one document per line.'''
    with open(file_path, 'w') as jsonl_file:
        for doc in array:
            jsonl_file.write(doc.json() + '\n')


save_docs_to_jsonl(documents, 'project_readmes.json')
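

# Optional round-trip check (a minimal sketch, not part of the original workflow):
# load_docs_from_jsonl is a hypothetical helper that reads the JSON Lines file
# written above back into LangChain Document objects. It assumes each line
# contains the fields produced by doc.json(), i.e. page_content and metadata.
import json


def load_docs_from_jsonl(file_path: str) -> list:
    docs = []
    with open(file_path, 'r', encoding='utf-8') as jsonl_file:
        for line in jsonl_file:
            data = json.loads(line)
            docs.append(Document(
                page_content=data["page_content"],
                metadata=data.get("metadata", {})
            ))
    return docs


# Example: reload the saved documents and confirm nothing was lost
# reloaded = load_docs_from_jsonl('project_readmes.json')
# assert len(reloaded) == len(documents)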