acecalisto3 committed on
Commit
5538e6a
1 Parent(s): 5931852

Update app.py

Files changed (1)
  1. app.py +281 -65
app.py CHANGED
@@ -1,66 +1,282 @@
- # Import necessary libraries
  import gradio as gr
- import torch
- import torchvision.transforms as T
- from torchvision.models.detection import maskrcnn_resnet50_fpn
- from transformers import RagTokenizer, RagRetriever, RagSequenceForGeneration
- from google_drive_downloader import GoogleDriveDownloader as gdd
-
- # Function to download the RAG model and tokenizer
- def download_models(file_id_model, file_id_tokenizer, dest_path_model, dest_path_tokenizer):
-     gdd.download_file_from_google_drive(file_id_model, dest_path_model)
-     gdd.download_file_from_google_drive(file_id_tokenizer, dest_path_tokenizer)
-
- # Download the RAG model and tokenizer
- download_models(
-     file_id_model='your_model_file_id',
-     file_id_tokenizer='your_tokenizer_file_id',
-     dest_path_model='./model.pt',
-     dest_path_tokenizer='./tokenizer'
- )
-
- # Load the RAG model and tokenizer
- tokenizer = RagTokenizer.from_pretrained('./tokenizer')
- retriever = RagRetriever.from_pretrained('./model.pt')
- model = RagSequenceForGeneration.from_pretrained('./model.pt')
-
- # Load the Mask R-CNN model
- model_rcnn = maskrcnn_resnet50_fpn(pretrained=True)
- model_rcnn.eval()
-
- # Define the class labels for the COCO dataset
- class_labels = [
-     # ...
- ]
-
- # Define the image-to-text object segmentation function
- def image_to_text_segmentation(image):
-     # Preprocess the image and run it through the Mask R-CNN model
-     # ...
-
-     # Generate the segmented text for each object
-     segmented_text = []
-     # ...
-
-     return segmented_text
-
- # Define the Gradio interface for text generation
- text_generation_iface = gr.Interface(
-     fn=generate_text,
-     inputs=input_text,
-     outputs=output_text,
-     title=title,
-     description=description,
-     examples=[
-         # ...
-     ]
- ).launch()
-
- # Define the Gradio interface for image-to-text segmentation
- segmentation_iface = gr.Interface(
-     fn=image_to_text_segmentation,
-     inputs=input_image,
-     outputs=output_text,
-     title="Image-to-Text Object Segmentation",
-     description="Segment objects in the image and generate corresponding text."
- ).launch()
+ import requests
+ from bs4 import BeautifulSoup
+ import pandas as pd
+ import numpy as np
+ import matplotlib.pyplot as plt
+ import seaborn as sns
+ import datetime
+ import nltk
+ from nltk.corpus import stopwords
+ from nltk.stem import WordNetLemmatizer
+ from nltk.tokenize import word_tokenize
+ from gensim.models import LdaModel
+ from gensim.corpora import Dictionary
+ from textblob import TextBlob
+ from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
+ import networkx as nx
+ from sklearn.model_selection import train_test_split
+ from sklearn.linear_model import LogisticRegression
+ from sklearn.ensemble import RandomForestClassifier
+ from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
+ from sklearn.preprocessing import StandardScaler
+ from sklearn.pipeline import Pipeline
+ from sklearn.feature_extraction.text import TfidfVectorizer
+ from scipy import linalg
+ import plotly.graph_objects as go
+ from collections import Counter
+ import warnings
+ import transformers
  import gradio as gr
+ import streamlit as st
+
+ warnings.filterwarnings("ignore")
+
+ # Set up logging
+ import logging
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+
+ # Function to fetch HTML content from GitHub issue pages
+ def fetch_issue_data(username, repository, start_page, end_page):
+     issues_data = []
+     for page in range(start_page, end_page + 1):
+         url = f"https://github.com/{username}/{repository}/issues?page={page}"
+         response = requests.get(url)
+         soup = BeautifulSoup(response.content, 'html.parser')
+         issue_elements = soup.find_all('div', class_='flex-shrink-0')
+         for issue_element in issue_elements:
+             # Skip elements that do not contain an issue link
+             issue_link = issue_element.find('a', class_='Link--primary')
+             if issue_link is None:
+                 continue
+             issue_url = f"https://github.com{issue_link['href']}"
+             issue_data = fetch_issue_details(issue_url)
+             issues_data.append(issue_data)
+     return issues_data
+
+ # Function to fetch details of a specific issue
+ def fetch_issue_details(issue_url):
+     response = requests.get(issue_url)
+     soup = BeautifulSoup(response.content, 'html.parser')
+     issue_title = soup.find('h1', class_='gh-header-title').text.strip()
+     issue_body = soup.find('div', class_='markdown-body').text.strip()
+     issue_created_at = soup.find('relative-time')['datetime']
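+     # A closed issue exposes a second <relative-time> element (class 'no-wrap') with the closing timestamp; open issues keep None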
+     issue_closed_at = soup.find('relative-time', class_='no-wrap')
+     if issue_closed_at:
+         issue_closed_at = issue_closed_at['datetime']
+     else:
+         issue_closed_at = None
+     issue_author = soup.find('a', class_='author').text.strip()
+     issue_assignee = soup.find('a', class_='Link--muted')
+     if issue_assignee:
+         issue_assignee = issue_assignee.text.strip()
+     else:
+         issue_assignee = None
+     return {
+         'title': issue_title,
+         'body': issue_body,
+         'created_at': issue_created_at,
+         'closed_at': issue_closed_at,
+         'author': issue_author,
+         'assignee': issue_assignee
+     }
+
+ # Function to clean and structure the data
+ def clean_and_structure_data(issues_data):
+     df = pd.DataFrame(issues_data)
+     if 'created_at' in df.columns:
+         df['created_at'] = pd.to_datetime(df['created_at'])
+     else:
+         logging.error("The 'created_at' column is missing from the dataframe.")
+         df['created_at'] = pd.NaT
+     if 'closed_at' in df.columns:
+         df['closed_at'] = pd.to_datetime(df['closed_at'])
+     else:
+         df['closed_at'] = pd.NaT
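+     # Resolution time in days; issues that are still open get a sentinel value of -1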
+     df['resolution_time'] = (df['closed_at'] - df['created_at']).dt.days
+     df['resolution_time'] = df['resolution_time'].fillna(-1)
+     df['is_closed'] = (df['closed_at'].notna()).astype(int)
+     return df
+
+ # Function for exploratory data analysis (EDA)
+ def perform_eda(df):
+     # Descriptive statistics
+     st.write(df.describe())
+
+     # Visualizations (use a fresh figure for each plot so the charts do not overlap)
+     fig = plt.figure()
+     sns.histplot(df['resolution_time'], kde=True)
+     st.pyplot(fig)
+     fig = plt.figure()
+     sns.lineplot(x=df['created_at'].dt.month, y='resolution_time', data=df)
+     st.pyplot(fig)
+     top_authors = df['author'].value_counts().nlargest(10)
+     st.write("\nTop 10 Authors:")
+     st.write(top_authors)
+     top_assignees = df['assignee'].value_counts().nlargest(10)
+     st.write("\nTop 10 Assignees:")
+     st.write(top_assignees)
+
+ # Function for text analysis using NLP
+ def analyze_text_content(df):
+     # Make sure the required NLTK resources are available
+     nltk.download('punkt', quiet=True)
+     nltk.download('stopwords', quiet=True)
+     nltk.download('wordnet', quiet=True)
+
+     # Text preprocessing
+     stop_words = set(stopwords.words('english'))
+     lemmatizer = WordNetLemmatizer()
+     df['processed_body'] = df['body'].apply(lambda text: ' '.join([lemmatizer.lemmatize(word) for word in word_tokenize(text) if word.lower() not in stop_words]))
+
+     # Topic modeling
+     dictionary = Dictionary([word_tokenize(text) for text in df['processed_body']])
+     corpus = [dictionary.doc2bow(word_tokenize(text)) for text in df['processed_body']]
+     lda_model = LdaModel(corpus, num_topics=5, id2word=dictionary)
+     st.write("Top 5 Topics:")
+     for topic in lda_model.print_topics(num_words=5):
+         st.write(topic)
+
+     # Sentiment analysis
+     analyzer = SentimentIntensityAnalyzer()
+     df['sentiment'] = df['body'].apply(lambda text: analyzer.polarity_scores(text)['compound'])
+     st.write("Sentiment Analysis:")
+     st.write(df['sentiment'].describe())
+
+     # Word Cloud for Common Words
+     from wordcloud import WordCloud
+     all_words = ' '.join([text for text in df['processed_body']])
+     wordcloud = WordCloud(width=800, height=400, background_color='white').generate(all_words)
+     fig = plt.figure(figsize=(10, 6), facecolor=None)
+     plt.imshow(wordcloud)
+     plt.axis("off")
+     plt.tight_layout(pad=0)
+     st.pyplot(fig)
+
+ # Function to create a network graph of issues, authors, and assignees
+ def create_network_graph(df):
+     graph = nx.Graph()
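+     # Add a node for each issue title, author, and assignee, and connect each issue to the people involved with it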
+     for index, row in df.iterrows():
+         graph.add_node(row['title'], type='issue')
+         graph.add_node(row['author'], type='author')
+         if row['assignee']:
+             graph.add_node(row['assignee'], type='assignee')
+         graph.add_edge(row['title'], row['author'])
+         if row['assignee']:
+             graph.add_edge(row['title'], row['assignee'])
+
+     ...
+     # Interactive Network Graph with Plotly
+     pos = nx.spring_layout(graph, k=0.5)
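+     # Build flat coordinate lists for the edges; a None entry between edges tells Plotly to break the line there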
+     edge_x = []
+     edge_y = []
+     for edge in graph.edges():
+         x0, y0 = pos[edge[0]]
+         x1, y1 = pos[edge[1]]
+         edge_x.extend([x0, x1, None])
+         edge_y.extend([y0, y1, None])
+
+     edge_trace = go.Scatter(
+         x=edge_x,
+         y=edge_y,
+         line=dict(width=0.5, color='#888'),
+         hoverinfo='none',
+         mode='lines'
+     )
+
+     node_x = []
+     node_y = []
+     for node in graph.nodes():
+         x, y = pos[node]
+         node_x.append(x)
+         node_y.append(y)
+
+     node_trace = go.Scatter(
+         x=node_x,
+         y=node_y,
+         mode='markers',
+         marker=dict(
+             color=[],
+             size=10,
+             line=dict(width=2, color='black')
+         ),
+         text=[],
+         hoverinfo='text'
+     )
+
+     # Set node colors based on type
+     node_colors = []
+     for node in graph.nodes():
+         if graph.nodes[node]['type'] == 'issue':
+             node_colors.append('red')
+         elif graph.nodes[node]['type'] == 'author':
+             node_colors.append('blue')
+         else:
+             node_colors.append('green')
+
+     # Set node labels
+     node_labels = []
+     for node in graph.nodes():
+         node_labels.append(node)
+
+     node_trace.marker.color = node_colors
+     node_trace.text = node_labels
+
+     # Create the figure
+     fig = go.Figure(data=[edge_trace, node_trace],
+                     layout=go.Layout(
+                         title="GitHub Issue Network Graph",
+                         showlegend=False,
+                         hovermode='closest',
+                         margin=dict(b=20, l=5, r=5, t=40),
+                         xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
+                         yaxis=dict(showgrid=False, zeroline=False, showticklabels=False)
+                     )
+                     )
+
+     # Display the figure in a Streamlit app
+     st.plotly_chart(fig)
+
+ # Function to build a predictive model for issue resolution time
+ def build_predictive_model(df):
+     # Feature engineering
+     df['created_at_day'] = df['created_at'].dt.day
+     df['created_at_weekday'] = df['created_at'].dt.weekday
+     df['created_at_hour'] = df['created_at'].dt.hour
+     df['author_encoded'] = df['author'].astype('category').cat.codes
+     df['assignee_encoded'] = df['assignee'].astype('category').cat.codes
+
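+     # The 'sentiment' feature is added by analyze_text_content, so that step has to run before this one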
+     # Select features and target variable
+     features = ['created_at_day', 'created_at_weekday', 'created_at_hour', 'author_encoded', 'assignee_encoded', 'sentiment']
+     target = 'resolution_time'
+
+     # Split data into training and testing sets
+     X_train, X_test, y_train, y_test = train_test_split(df[features], df[target], test_size=0.2, random_state=42)
+
+     # Create a pipeline for feature scaling and model training
+     pipeline = Pipeline([
+         ('scaler', StandardScaler()),
+         ('model', LogisticRegression())
+     ])
+
+     # Train the model
+     pipeline.fit(X_train, y_train)
+
+     # Evaluate the model
+     y_pred = pipeline.predict(X_test)
+     accuracy = accuracy_score(y_test, y_pred)
+     st.write("Accuracy:", accuracy)
+     st.write(classification_report(y_test, y_pred))
+
+ # Main entry point
+ if __name__ == "__main__":
+     # Replace with your GitHub username and repository name
+     username = "Ig0tU"
+     repository = "miagiii"
+
+     # Fetch issue data from GitHub
+     issues_data = fetch_issue_data(username, repository, 1, 10)
+
+     # Clean and structure the data
+     df = clean_and_structure_data(issues_data)
+
+     # Perform exploratory data analysis (EDA)
+     perform_eda(df)
+
+     # Analyze text content using NLP
+     analyze_text_content(df)
+
+     # Create a network graph of issues, authors, and assignees
+     create_network_graph(df)
+
+     # Build a predictive model for issue resolution time
+     build_predictive_model(df)