import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from gensim.models import LdaModel
from gensim.corpora import Dictionary
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import networkx as nx
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import linalg
import plotly.graph_objects as go
from collections import Counter
import warnings
import transformers
import gradio as gr
import streamlit as st

warnings.filterwarnings("ignore")

# Set up logging
import logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Download the NLTK resources needed for tokenization, stopword removal, and lemmatization
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)

# Function to fetch HTML content from GitHub issue pages
def fetch_issue_data(username, repository, start_page, end_page):
    issues_data = []
    for page in range(start_page, end_page + 1):
        url = f"https://github.com/{username}/{repository}/issues?page={page}"
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        issue_elements = soup.find_all('div', class_='flex-shrink-0')
        for issue_element in issue_elements:
            issue_link = issue_element.find('a', class_='Link--primary')['href']
            issue_url = f"https://github.com{issue_link}"
            issue_data = fetch_issue_details(issue_url)
            issues_data.append(issue_data)
    return issues_data

# Function to fetch details of a specific issue
def fetch_issue_details(issue_url):
    response = requests.get(issue_url)
    soup = BeautifulSoup(response.content, 'html.parser')
    issue_title = soup.find('h1', class_='gh-header-title').text.strip()
    issue_body = soup.find('div', class_='markdown-body').text.strip()
    issue_created_at = soup.find('relative-time')['datetime']
    issue_closed_at = soup.find('relative-time', class_='no-wrap')
    issue_closed_at = issue_closed_at['datetime'] if issue_closed_at else None
    issue_author = soup.find('a', class_='author').text.strip()
    issue_assignee = soup.find('a', class_='Link--muted')
    issue_assignee = issue_assignee.text.strip() if issue_assignee else None
    return {
        'title': issue_title,
        'body': issue_body,
        'created_at': issue_created_at,
        'closed_at': issue_closed_at,
        'author': issue_author,
        'assignee': issue_assignee
    }

# Function to clean and structure the data
def clean_and_structure_data(issues_data):
    df = pd.DataFrame(issues_data)
    if 'created_at' in df.columns:
        df['created_at'] = pd.to_datetime(df['created_at'])
    else:
        logging.error("The 'created_at' column is missing from the dataframe.")
        df['created_at'] = pd.NaT
    if 'closed_at' in df.columns:
        df['closed_at'] = pd.to_datetime(df['closed_at'])
    else:
        df['closed_at'] = pd.NaT
    # Resolution time in days; issues that are still open get -1 as a sentinel value
    df['resolution_time'] = (df['closed_at'] - df['created_at']).dt.days
    df['resolution_time'] = df['resolution_time'].fillna(-1)
    df['is_closed'] = df['closed_at'].notna().astype(int)
    return df
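# Added sketch (not part of the original pipeline): scraping GitHub's HTML depends on CSS class
# names that GitHub may change at any time. Assuming the public GitHub REST API is acceptable for
# this repository, the issues can be fetched as JSON instead. The function name below is
# illustrative; field names follow the documented REST API response, and the returned dicts match
# the structure produced by fetch_issue_details(). Unauthenticated requests are rate-limited.
def fetch_issue_data_via_api(username, repository, start_page, end_page):
    issues_data = []
    for page in range(start_page, end_page + 1):
        url = f"https://api.github.com/repos/{username}/{repository}/issues"
        params = {'state': 'all', 'page': page, 'per_page': 100}
        response = requests.get(url, params=params)
        response.raise_for_status()
        for issue in response.json():
            # Pull requests also appear in the issues endpoint; skip them
            if 'pull_request' in issue:
                continue
            issues_data.append({
                'title': issue['title'],
                'body': issue.get('body') or '',
                'created_at': issue['created_at'],
                'closed_at': issue.get('closed_at'),
                'author': issue['user']['login'],
                'assignee': issue['assignee']['login'] if issue.get('assignee') else None
            })
    return issues_data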
# Function for exploratory data analysis (EDA)
def perform_eda(df):
    # Descriptive statistics
    st.write(df.describe())

    # Distribution of resolution times
    fig, ax = plt.subplots()
    sns.histplot(df['resolution_time'], kde=True, ax=ax)
    st.pyplot(fig)

    # Resolution time by month of creation
    fig, ax = plt.subplots()
    sns.lineplot(x=df['created_at'].dt.month, y=df['resolution_time'], ax=ax)
    st.pyplot(fig)

    # Most active authors and assignees
    top_authors = df['author'].value_counts().nlargest(10)
    st.write("\nTop 10 Authors:")
    st.write(top_authors)
    top_assignees = df['assignee'].value_counts().nlargest(10)
    st.write("\nTop 10 Assignees:")
    st.write(top_assignees)

# Function for text analysis using NLP
def analyze_text_content(df):
    # Text preprocessing: remove stopwords and lemmatize
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    df['processed_body'] = df['body'].apply(
        lambda text: ' '.join(
            lemmatizer.lemmatize(word) for word in word_tokenize(text)
            if word.lower() not in stop_words
        )
    )

    # Topic modeling with LDA
    tokenized_docs = [word_tokenize(text) for text in df['processed_body']]
    dictionary = Dictionary(tokenized_docs)
    corpus = [dictionary.doc2bow(tokens) for tokens in tokenized_docs]
    lda_model = LdaModel(corpus, num_topics=5, id2word=dictionary)
    st.write("Top 5 Topics:")
    for topic in lda_model.print_topics(num_words=5):
        st.write(topic)

    # Sentiment analysis with VADER
    analyzer = SentimentIntensityAnalyzer()
    df['sentiment'] = df['body'].apply(lambda text: analyzer.polarity_scores(text)['compound'])
    st.write("Sentiment Analysis:")
    st.write(df['sentiment'].describe())

    # Word cloud of the most common words
    from wordcloud import WordCloud
    all_words = ' '.join(df['processed_body'])
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(all_words)
    fig = plt.figure(figsize=(10, 6), facecolor=None)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.tight_layout(pad=0)
    st.pyplot(fig)
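# Added sketch (not part of the original pipeline): TfidfVectorizer is imported above but unused.
# Assuming the 'processed_body' column produced by analyze_text_content(), a quick TF-IDF pass can
# complement the LDA topics by surfacing the highest-weighted terms across all issue bodies.
# The function name and top_n parameter are illustrative.
def show_tfidf_keywords(df, top_n=15):
    vectorizer = TfidfVectorizer(max_features=1000)
    tfidf = vectorizer.fit_transform(df['processed_body'])
    # Average TF-IDF weight of each term over all issue bodies
    mean_weights = np.asarray(tfidf.mean(axis=0)).ravel()
    terms = np.array(vectorizer.get_feature_names_out())
    top_terms = terms[mean_weights.argsort()[::-1][:top_n]]
    st.write("Top TF-IDF terms:", ', '.join(top_terms))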
# Function to create a network graph of issues, authors, and assignees
def create_network_graph(df):
    graph = nx.Graph()
    for index, row in df.iterrows():
        graph.add_node(row['title'], type='issue')
        graph.add_node(row['author'], type='author')
        graph.add_edge(row['title'], row['author'])
        if pd.notna(row['assignee']):
            graph.add_node(row['assignee'], type='assignee')
            graph.add_edge(row['title'], row['assignee'])

    # Interactive network graph with Plotly
    pos = nx.spring_layout(graph, k=0.5)

    # Edge coordinates: each edge contributes its two endpoints plus None to break the line
    edge_x = []
    edge_y = []
    for edge in graph.edges():
        x0, y0 = pos[edge[0]]
        x1, y1 = pos[edge[1]]
        edge_x.extend([x0, x1, None])
        edge_y.extend([y0, y1, None])
    edge_trace = go.Scatter(
        x=edge_x, y=edge_y,
        line=dict(width=0.5, color='#888'),
        hoverinfo='none',
        mode='lines'
    )

    # Node coordinates, colors (by node type), and hover labels
    node_x, node_y, node_colors, node_labels = [], [], [], []
    for node in graph.nodes():
        x, y = pos[node]
        node_x.append(x)
        node_y.append(y)
        node_type = graph.nodes[node]['type']
        if node_type == 'issue':
            node_colors.append('red')
        elif node_type == 'author':
            node_colors.append('blue')
        else:
            node_colors.append('green')
        node_labels.append(node)
    node_trace = go.Scatter(
        x=node_x, y=node_y,
        mode='markers',
        marker=dict(color=node_colors, size=10, line=dict(width=2, color='black')),
        text=node_labels,
        hoverinfo='text'
    )

    # Create the figure
    fig = go.Figure(
        data=[edge_trace, node_trace],
        layout=go.Layout(
            title="GitHub Issue Network Graph",
            showlegend=False,
            hovermode='closest',
            margin=dict(b=20, l=5, r=5, t=40),
            xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
            yaxis=dict(showgrid=False, zeroline=False, showticklabels=False)
        )
    )

    # Display the figure in the Streamlit app
    st.plotly_chart(fig)

# Function to build a predictive model of issue resolution
def build_predictive_model(df):
    # Feature engineering from creation time, participants, and sentiment
    df['created_at_day'] = df['created_at'].dt.day
    df['created_at_weekday'] = df['created_at'].dt.weekday
    df['created_at_hour'] = df['created_at'].dt.hour
    df['author_encoded'] = df['author'].astype('category').cat.codes
    df['assignee_encoded'] = df['assignee'].astype('category').cat.codes

    # Select features and target variable: whether the issue has been closed (binary),
    # so the classification pipeline and metrics below are appropriate
    features = ['created_at_day', 'created_at_weekday', 'created_at_hour',
                'author_encoded', 'assignee_encoded', 'sentiment']
    target = 'is_closed'

    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        df[features], df[target], test_size=0.2, random_state=42)

    # Pipeline for feature scaling and model training
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('model', LogisticRegression())
    ])

    # Train the model
    pipeline.fit(X_train, y_train)

    # Evaluate the model
    y_pred = pipeline.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    st.write("Accuracy:", accuracy)
    st.write(classification_report(y_test, y_pred))

# Main entry point
if __name__ == "__main__":
    # Replace with your GitHub username and repository name
    username = "Ig0tU"
    repository = "miagiii"

    # Fetch issue data from GitHub
    issues_data = fetch_issue_data(username, repository, 1, 10)

    # Clean and structure the data
    df = clean_and_structure_data(issues_data)

    # Perform exploratory data analysis (EDA)
    perform_eda(df)

    # Analyze text content using NLP
    analyze_text_content(df)

    # Create a network graph of issues, authors, and assignees
    create_network_graph(df)

    # Build a predictive model for issue resolution
    build_predictive_model(df)
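# Usage note (added): the functions above render their output through Streamlit, so the script is
# meant to be launched with the Streamlit CLI rather than plain Python, for example (assuming the
# file is saved as github_issue_analysis.py -- the filename is illustrative):
#
#   streamlit run github_issue_analysis.py
#
# Fetching and NLP can take a while for larger repositories; caching the cleaned dataframe locally
# (e.g. df.to_csv('issues.csv', index=False)) avoids re-fetching on every rerun.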