Tryfonas committed on
Commit
b680b9a
·
verified ·
1 Parent(s): e1629ce

Upload folder using huggingface_hub

Browse files
Files changed (2) hide show
  1. app.py +209 -14
  2. requirements.txt +2 -1
app.py CHANGED
@@ -5,6 +5,11 @@ import altair as alt
5
  import matplotlib.pyplot as plt
6
  import seaborn as sns
7
  from scipy.stats import zscore
 
 
 
 
 
8
 
9
  # Function to load and clean data
10
  @st.cache_data
@@ -32,7 +37,7 @@ st.title('BDS24_Weekly_Assignment_Week 2 | Tryfonas Karmiris')
32
 
33
  # Sidebar for navigation
34
  st.sidebar.title("Navigation")
35
- page = st.sidebar.radio("Select a page:", ["Introduction", "Data Overview", "Top Values by Selected Variable", "Repayment Interval by Selected Variable", "Country Comparison Deepdive", "Sector Comparison Deepdive"])
36
 
37
  # Introduction Page
38
  if page == "Introduction":
@@ -94,7 +99,7 @@ elif page == "Top Values by Selected Variable":
94
  top_values = df_kiva_loans_cleaned.groupby('sector')['funded_amount'].agg(['sum', 'count']).nlargest(num_columns, 'sum').reset_index()
95
  x_column = 'sector'
96
  count_column = 'count'
97
- description = f"This chart illustrates the top {num_columns} sectors by total funded amount. The blue bars represent the total funded amount, while the red line indicates the count of loans. Most loans are funded to the Aggriculture Sector with Food and Retail completing the first three. Looks like that if the sector of the business is close to Primary production or its Basic Necessities(food) "
98
 
99
  # Display description
100
  st.write(description)
@@ -151,10 +156,7 @@ elif page == "Top Values by Selected Variable":
151
  plt.xticks(rotation=90)
152
  st.pyplot(fig)
153
 
154
- # Display description for boxplot
155
-
156
-
157
- # Page 4: Other Plots
158
  elif page == "Repayment Interval by Selected Variable":
159
  st.subheader('Repayment Interval by Selected Variable')
160
 
@@ -178,7 +180,7 @@ elif page == "Repayment Interval by Selected Variable":
178
  elif plot_var == 'country':
179
  top_values_plot = df_kiva_loans_cleaned.groupby('country')['funded_amount'].agg('count').nlargest(num_top_values).index
180
  filtered_df_plot = df_kiva_loans_cleaned[df_kiva_loans_cleaned['country'].isin(top_values_plot)]
181
- description = f"This countplot illustrates the distribution of repayment intervals for the top {num_top_values} countries based on the number of loans. In terms of countries the Phillipines had a great number of Irregular loans."
182
 
183
  # Display description
184
  st.write(description)
@@ -205,15 +207,12 @@ elif page == "Repayment Interval by Selected Variable":
205
  plt.legend(title=plot_var.replace("_", " ").title(), bbox_to_anchor=(1.05, 1), loc='upper left')
206
  st.pyplot(fig)
207
 
208
-
209
-
210
-
211
- # Page 5: Country Comparison
212
  elif page == "Country Comparison Deepdive":
213
  st.subheader("Country Comparison Deepdive")
214
 
215
  # Multi-select for countries
216
- selected_countries = st.multiselect("Select Countries to Compare(Please select one or more)", options=df_kiva_loans_cleaned['country'].unique())
217
 
218
  # Option to choose between count or sum of funded amounts
219
  aggregation_option = st.radio("Select Aggregation Type:", ("Count of Loans", "Summary of Funded Amount"))
@@ -266,7 +265,7 @@ elif page == "Country Comparison Deepdive":
266
  else:
267
  st.write("Please select one or more countries to compare from the dropdown above.")
268
 
269
- # Page 6: Sector Comparison
270
  elif page == "Sector Comparison Deepdive":
271
  st.subheader("Sector Comparison Deepdive")
272
 
@@ -274,7 +273,7 @@ elif page == "Sector Comparison Deepdive":
274
  selected_sectors = st.multiselect("Select Sectors to Compare (Please select one or more)", options=df_kiva_loans_cleaned['sector'].unique())
275
 
276
  # Option to choose between count or sum of funded amounts
277
- aggregation_option = st.radio("Select Aggregation Type:", ("Count of Loans", "Summmary of Funded Amount"))
278
 
279
  if selected_sectors:
280
  # Filter the data based on selected sectors
@@ -326,3 +325,199 @@ elif page == "Sector Comparison Deepdive":
326
  else:
327
  st.write("Please select one or more countries to compare from the dropdown above.")
328
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  import matplotlib.pyplot as plt
6
  import seaborn as sns
7
  from scipy.stats import zscore
8
+ from sklearn.preprocessing import LabelEncoder, StandardScaler
9
+ from sklearn.decomposition import PCA
10
+ from sklearn.cluster import KMeans, AgglomerativeClustering
11
+ from scipy.spatial.distance import cdist
12
+ from scipy.cluster.hierarchy import dendrogram, linkage
13
 
14
  # Function to load and clean data
15
  @st.cache_data
 
37
 
38
  # Sidebar for navigation
39
  st.sidebar.title("Navigation")
40
+ page = st.sidebar.radio("Select a page:", ["Introduction", "Data Overview", "Top Values by Selected Variable", "Repayment Interval by Selected Variable", "Country Comparison Deepdive", "Sector Comparison Deepdive", "KMeans Clustering & Recommendations","Hierarchical Clustering & Dendrogram"])
41
 
42
  # Introduction Page
43
  if page == "Introduction":
 
99
  top_values = df_kiva_loans_cleaned.groupby('sector')['funded_amount'].agg(['sum', 'count']).nlargest(num_columns, 'sum').reset_index()
100
  x_column = 'sector'
101
  count_column = 'count'
102
+ description = f"This chart illustrates the top {num_columns} sectors by total funded amount. The blue bars represent the total funded amount, while the red line indicates the count of loans. Most loans are funded to the Agriculture Sector with Food and Retail completing the first three. Looks like that if the sector of the business is close to Primary production or its Basic Necessities(food) "
103
 
104
  # Display description
105
  st.write(description)
 
156
  plt.xticks(rotation=90)
157
  st.pyplot(fig)
158
 
159
+ # Remaining pages (Repayment Interval by Selected Variable, Country Comparison Deepdive, Sector Comparison Deepdive)
 
 
 
160
  elif page == "Repayment Interval by Selected Variable":
161
  st.subheader('Repayment Interval by Selected Variable')
162
 
 
180
  elif plot_var == 'country':
181
  top_values_plot = df_kiva_loans_cleaned.groupby('country')['funded_amount'].agg('count').nlargest(num_top_values).index
182
  filtered_df_plot = df_kiva_loans_cleaned[df_kiva_loans_cleaned['country'].isin(top_values_plot)]
183
+ description = f"This countplot illustrates the distribution of repayment intervals for the top {num_top_values} countries based on the number of loans. In terms of countries the Philippines had a great number of Irregular loans."
184
 
185
  # Display description
186
  st.write(description)
 
207
  plt.legend(title=plot_var.replace("_", " ").title(), bbox_to_anchor=(1.05, 1), loc='upper left')
208
  st.pyplot(fig)
209
 
210
+ # Page 5: Country Comparison Deepdive
 
 
 
211
  elif page == "Country Comparison Deepdive":
212
  st.subheader("Country Comparison Deepdive")
213
 
214
  # Multi-select for countries
215
+ selected_countries = st.multiselect("Select Countries to Compare (Please select one or more)", options=df_kiva_loans_cleaned['country'].unique())
216
 
217
  # Option to choose between count or sum of funded amounts
218
  aggregation_option = st.radio("Select Aggregation Type:", ("Count of Loans", "Summary of Funded Amount"))
 
265
  else:
266
  st.write("Please select one or more countries to compare from the dropdown above.")
267
 
268
+ # Page 6: Sector Comparison Deepdive
269
  elif page == "Sector Comparison Deepdive":
270
  st.subheader("Sector Comparison Deepdive")
271
 
 
273
  selected_sectors = st.multiselect("Select Sectors to Compare (Please select one or more)", options=df_kiva_loans_cleaned['sector'].unique())
274
 
275
  # Option to choose between count or sum of funded amounts
276
+ aggregation_option = st.radio("Select Aggregation Type:", ("Count of Loans", "Summary of Funded Amount"))
277
 
278
  if selected_sectors:
279
  # Filter the data based on selected sectors
 
325
  else:
326
  st.write("Please select one or more countries to compare from the dropdown above.")
327
 
328
+ # Page 7: KMeans Clustering & Recommendations
329
+ elif page == "KMeans Clustering & Recommendations":
330
+ st.subheader("KMeans Clustering & Recommendations")
331
+
332
+ # User input to choose the number of sample rows
333
+ sample_size = st.slider("Select the number of sample rows for clustering:", min_value=1000, max_value=100000, value=20000, step=1000)
334
+
335
+ # Sample the selected number of rows from the DataFrame
336
+ df_sample = df_kiva_loans_cleaned.sample(n=sample_size, random_state=42).copy()
337
+
338
+ # Keeping only the relevant columns and storing original indices
339
+ df_original = df_sample[['country','funded_amount', 'sector','repayment_interval']].copy()
340
+ df_original['original_index'] = df_sample.index # Keep track of original indices
341
+
342
+ # Label Encoding for categorical variables and adding encoded columns with "_id" suffix
343
+ label_encoders = {}
344
+ for column in df_original.select_dtypes(include=['object']).columns:
345
+ le = LabelEncoder()
346
+ df_original[column + '_id'] = le.fit_transform(df_original[column])
347
+ label_encoders[column] = le
348
+
349
+ # Standardizing the data using the encoded columns
350
+ encoded_columns = [col + '_id' for col in df_original.select_dtypes(include=['object']).columns]
351
+ scaler = StandardScaler()
352
+ df_scaled = scaler.fit_transform(df_original[encoded_columns + ['funded_amount']])
353
+
354
+ # Applying PCA
355
+ pca = PCA(n_components=2) # Reduce to 2 dimensions for visualization
356
+ df_pca = pca.fit_transform(df_scaled)
357
+
358
+ # Elbow Method to find the optimal number of clusters
359
+ inertia = []
360
+ for n in range(1, 11):
361
+ kmeans = KMeans(n_clusters=n, random_state=42)
362
+ kmeans.fit(df_pca)
363
+ inertia.append(kmeans.inertia_)
364
+
365
+ # Plotting the Elbow Method
366
+ plt.figure(figsize=(8, 6))
367
+ plt.plot(range(1, 11), inertia, marker='o', linestyle='--')
368
+ plt.title('Elbow Method for Optimal Number of Clusters')
369
+ plt.xlabel('Number of Clusters')
370
+ plt.ylabel('Inertia')
371
+ st.pyplot(plt.gcf())
372
+
373
+ # User input to choose the optimal number of clusters
374
+ optimal_clusters = st.slider("Select the number of optimal clusters:", min_value=1, max_value=10, value=4, step=1)
375
+
376
+ # Apply KMeans with optimal clusters
377
+ kmeans = KMeans(n_clusters=optimal_clusters, random_state=42)
378
+ df_original['cluster'] = kmeans.fit_predict(df_pca)
379
+
380
+ # Visualize the clustering results at different iterations
381
+ max_iters = [1, 2, 5, 6, 8, 10] # max_iter caps to visualize; KMeans is re-fit from scratch for each value
382
+
383
+ # Increase the figure size for better visibility
384
+ plt.figure(figsize=(15, 55)) # Adjusted the figsize to make plots larger
385
+ for i, max_iter in enumerate(max_iters, start=1):
386
+ kmeans = KMeans(n_clusters=optimal_clusters, random_state=42, max_iter=max_iter)
387
+ df_original['cluster'] = kmeans.fit_predict(df_pca)
388
+
389
+ # Plotting the clusters
390
+ plt.subplot(6, 1, i) # 6 rows x 1 column: one full-width subplot per max_iter value
391
+ sns.scatterplot(x=df_pca[:, 0], y=df_pca[:, 1], hue=df_original['cluster'], palette='viridis', s=100)
392
+
393
+ # Plotting the centroids
394
+ centroids = kmeans.cluster_centers_
395
+ plt.scatter(centroids[:, 0], centroids[:, 1], c='red', s=300, marker='X', label='Centroids') # Increased centroid size
396
+
397
+ plt.title(f'K-means Clustering - Iteration {max_iter}', fontsize=16)
398
+ plt.xlabel('Principal Component 1', fontsize=14)
399
+ plt.ylabel('Principal Component 2', fontsize=14)
400
+ plt.xticks(fontsize=12)
401
+ plt.yticks(fontsize=12)
402
+ if i == 1:
403
+ plt.legend()
404
+
405
+ plt.tight_layout()
406
+ st.pyplot(plt.gcf())
407
+
408
+ # Dynamic input for the new data point
409
+ st.subheader("Input New Data Point for Recommendations")
410
+
411
+ # Allow the user to select the country, sector, and repayment interval
412
+ country = st.selectbox("Select Country", options=df_kiva_loans_cleaned['country'].unique())
413
+ sector = st.selectbox("Select Sector", options=df_kiva_loans_cleaned['sector'].unique())
414
+ repayment_interval = st.selectbox("Select Repayment Interval", options=df_kiva_loans_cleaned['repayment_interval'].unique())
415
+
416
+ # Allow the user to select the funded amount using a slider
417
+ funded_amount = st.slider("Select Funded Amount", min_value=int(df_kiva_loans_cleaned['funded_amount'].min()), max_value=int(df_kiva_loans_cleaned['funded_amount'].max()), value=1500)
418
+
419
+ new_data = {
420
+ 'country': country,
421
+ 'funded_amount': funded_amount,
422
+ 'sector': sector,
423
+ 'repayment_interval': repayment_interval
424
+ }
425
+
426
+ # Convert new data to DataFrame
427
+ new_data_df = pd.DataFrame([new_data])
428
+
429
+ # Encode the new data point and add encoded columns with "_id" suffix
430
+ for column in new_data_df.select_dtypes(include=['object']).columns:
431
+ new_data_df[column + '_id'] = label_encoders[column].transform(new_data_df[column])
432
+
433
+ # Standardize the new data using the encoded columns
434
+ new_data_scaled = scaler.transform(new_data_df[[col + '_id' for col in new_data_df.select_dtypes(include=['object']).columns] + ['funded_amount']])
435
+
436
+ # Apply PCA to the new data
437
+ new_data_pca = pca.transform(new_data_scaled)
438
+
439
+ # Predict the cluster for the new data point
440
+ new_cluster = kmeans.predict(new_data_pca)[0]
441
+
442
+ st.subheader("Top 5 Similar Items to the Input")
443
+ st.write(f"The new data point belongs to cluster: {new_cluster}")
444
+ # Get all data points in the same cluster
445
+ cluster_data = df_original[df_original['cluster'] == new_cluster]
446
+
447
+ # Apply the same PCA transformation to the scaled data of the entire cluster
448
+ cluster_data_pca = pca.transform(scaler.transform(cluster_data[encoded_columns + ['funded_amount']]))
449
+
450
+ # Calculate the Euclidean distance between the new data point and all points in the same cluster
451
+ distances = cdist(new_data_pca, cluster_data_pca, 'euclidean')[0]
452
+
453
+ # Add distances to the cluster data DataFrame
454
+ cluster_data = cluster_data.copy()
455
+ cluster_data['distance'] = distances
456
+
457
+ # Sort by distance and select the top 5 closest items
458
+ top_5_recommendations = cluster_data.sort_values('distance').head(5)
459
+
460
+ # Retrieve the original rows from the original DataFrame before encoding
461
+ recommended_indices = top_5_recommendations['original_index']
462
+ recommendations = df_kiva_loans_cleaned.loc[recommended_indices]
463
+
464
+ # Display the original rows as the top 5 recommendations
465
+
466
+ st.write(recommendations)
467
+
468
+
469
+ # Page 8: Hierarchical Clustering & Dendrogram
470
+ elif page == "Hierarchical Clustering & Dendrogram":
471
+ st.subheader("Hierarchical Clustering & Dendrogram")
472
+
473
+ # User input to choose the number of sample rows
474
+ sample_size = st.slider("Select the number of sample rows for clustering:", min_value=1000, max_value=5000, value=5000, step=50)
475
+
476
+ # User input to choose the number of clusters
477
+ n_clusters = st.slider("Select the number of clusters:", min_value=2, max_value=10, value=4, step=1)
478
+
479
+ # Sample the selected number of rows from the DataFrame
480
+ df_sample = df_kiva_loans_cleaned.sample(n=sample_size, random_state=42).copy()
481
+
482
+ # Keeping only the relevant columns and storing original indices
483
+ df_original = df_sample[['country','funded_amount', 'sector','repayment_interval']].copy()
484
+ df_original['original_index'] = df_sample.index # Keep track of original indices
485
+
486
+ # Label Encoding for categorical variables and adding encoded columns with "_id" suffix
487
+ label_encoders = {}
488
+ for column in df_original.select_dtypes(include=['object']).columns:
489
+ le = LabelEncoder()
490
+ df_original[column + '_id'] = le.fit_transform(df_original[column])
491
+ label_encoders[column] = le
492
+
493
+ # Standardizing the data using the encoded columns
494
+ encoded_columns = [col + '_id' for col in df_original.select_dtypes(include=['object']).columns]
495
+ scaler = StandardScaler()
496
+ df_scaled = scaler.fit_transform(df_original[encoded_columns + ['funded_amount']])
497
+
498
+ # Applying PCA
499
+ pca = PCA(n_components=2) # Reduce to 2 dimensions for visualization
500
+ df_pca = pca.fit_transform(df_scaled)
501
+
502
+ # Perform Agglomerative Clustering with dynamic n_clusters
503
+ agg_clustering = AgglomerativeClustering(n_clusters=n_clusters, linkage='ward')
504
+ df_original['cluster'] = agg_clustering.fit_predict(df_pca)
505
+
506
+ # Plot the resulting clusters
507
+ plt.figure(figsize=(10, 7))
508
+ sns.scatterplot(x=df_pca[:, 0], y=df_pca[:, 1], hue=df_original['cluster'], palette='viridis', s=50)
509
+ plt.title(f'Agglomerative Clustering (Hierarchical) Results - {n_clusters} Clusters')
510
+ plt.xlabel('Principal Component 1')
511
+ plt.ylabel('Principal Component 2')
512
+ st.pyplot(plt.gcf())
513
+
514
+ # Dendrogram Visualization
515
+ linked = linkage(df_pca, method='ward')
516
+
517
+ plt.figure(figsize=(10, 7))
518
+ dendrogram(linked,
519
+ orientation='top',
520
+ distance_sort='descending',
521
+ show_leaf_counts=True)
522
+ plt.title('Hierarchical Clustering Dendrogram')
523
+ st.pyplot(plt.gcf())
requirements.txt CHANGED
@@ -3,4 +3,5 @@ seaborn
3
  pandas
4
  matplotlib
5
  altair
6
- scipy
 
 
3
  pandas
4
  matplotlib
5
  altair
6
+ scipy
7
+ scikit-learn