Upload folder using huggingface_hub
Browse files- +209 -14
- requirements.txt +2 -1
@@ -5,6 +5,11 @@ import altair as alt
5 |
import matplotlib.pyplot as plt
6 |
import seaborn as sns
7 |
from scipy.stats import zscore
8 |
9 |
# Function to load and clean data
10 |
@@ -32,7 +37,7 @@ st.title('BDS24_Weekly_Assignment_Week 2 | Tryfonas Karmiris')
32 |
33 |
# Sidebar for navigation
34 |
35 |
page ="Select a page:", ["Introduction", "Data Overview", "Top Values by Selected Variable", "Repayment Interval by Selected Variable", "Country Comparison Deepdive", "Sector Comparison Deepdive"])
36 |
37 |
# Introduction Page
38 |
if page == "Introduction":
@@ -94,7 +99,7 @@ elif page == "Top Values by Selected Variable":
94 |
top_values = df_kiva_loans_cleaned.groupby('sector')['funded_amount'].agg(['sum', 'count']).nlargest(num_columns, 'sum').reset_index()
95 |
x_column = 'sector'
96 |
count_column = 'count'
97 |
description = f"This chart illustrates the top {num_columns} sectors by total funded amount. The blue bars represent the total funded amount, while the red line indicates the count of loans. Most loans are funded to the
98 |
99 |
# Display description
100 |
@@ -151,10 +156,7 @@ elif page == "Top Values by Selected Variable":
151 |
152 |
153 |
154 |
155 |
156 |
157 |
# Page 4: Other Plots
158 |
elif page == "Repayment Interval by Selected Variable":
159 |
st.subheader('Repayment Interval by Selected Variable')
160 |
@@ -178,7 +180,7 @@ elif page == "Repayment Interval by Selected Variable":
178 |
elif plot_var == 'country':
179 |
top_values_plot = df_kiva_loans_cleaned.groupby('country')['funded_amount'].agg('count').nlargest(num_top_values).index
180 |
filtered_df_plot = df_kiva_loans_cleaned[df_kiva_loans_cleaned['country'].isin(top_values_plot)]
181 |
description = f"This countplot illustrates the distribution of repayment intervals for the top {num_top_values} countries based on the number of loans. In terms of countries the
182 |
183 |
# Display description
184 |
@@ -205,15 +207,12 @@ elif page == "Repayment Interval by Selected Variable":
205 |
plt.legend(title=plot_var.replace("_", " ").title(), bbox_to_anchor=(1.05, 1), loc='upper left')
206 |
207 |
208 |
209 |
210 |
211 |
# Page 5: Country Comparison
212 |
elif page == "Country Comparison Deepdive":
213 |
st.subheader("Country Comparison Deepdive")
214 |
215 |
# Multi-select for countries
216 |
selected_countries = st.multiselect("Select Countries to Compare(Please select one or more)", options=df_kiva_loans_cleaned['country'].unique())
217 |
218 |
# Option to choose between count or sum of funded amounts
219 |
aggregation_option ="Select Aggregation Type:", ("Count of Loans", "Summary of Funded Amount"))
@@ -266,7 +265,7 @@ elif page == "Country Comparison Deepdive":
266 |
267 |
st.write("Please select one or more countries to compare from the dropdown above.")
268 |
269 |
# Page 6: Sector Comparison
270 |
elif page == "Sector Comparison Deepdive":
271 |
st.subheader("Sector Comparison Deepdive")
272 |
@@ -274,7 +273,7 @@ elif page == "Sector Comparison Deepdive":
274 |
selected_sectors = st.multiselect("Select Sectors to Compare (Please select one or more)", options=df_kiva_loans_cleaned['sector'].unique())
275 |
276 |
# Option to choose between count or sum of funded amounts
277 |
aggregation_option ="Select Aggregation Type:", ("Count of Loans", "
278 |
279 |
if selected_sectors:
280 |
# Filter the data based on selected sectors
@@ -326,3 +325,199 @@ elif page == "Sector Comparison Deepdive":
326 |
327 |
# Fallback shown when nothing is selected on the Sector Comparison page, so the prompt must ask for sectors (the multiselect above lists sectors, not countries).
st.write("Please select one or more sectors to compare from the dropdown above.")
328 |
5 |
import matplotlib.pyplot as plt
6 |
import seaborn as sns
7 |
from scipy.stats import zscore
8 |
from sklearn.preprocessing import LabelEncoder, StandardScaler
9 |
from sklearn.decomposition import PCA
10 |
from sklearn.cluster import KMeans, AgglomerativeClustering
11 |
from scipy.spatial.distance import cdist
12 |
from scipy.cluster.hierarchy import dendrogram, linkage
13 |
14 |
# Function to load and clean data
15 |
37 |
38 |
# Sidebar for navigation
39 |
40 |
page ="Select a page:", ["Introduction", "Data Overview", "Top Values by Selected Variable", "Repayment Interval by Selected Variable", "Country Comparison Deepdive", "Sector Comparison Deepdive", "KMeans Clustering & Recommendations","Hierarchical Clustering & Dendrogram"])
41 |
42 |
# Introduction Page
43 |
if page == "Introduction":
99 |
top_values = df_kiva_loans_cleaned.groupby('sector')['funded_amount'].agg(['sum', 'count']).nlargest(num_columns, 'sum').reset_index()
100 |
x_column = 'sector'
101 |
count_column = 'count'
102 |
description = f"This chart illustrates the top {num_columns} sectors by total funded amount. The blue bars represent the total funded amount, while the red line indicates the count of loans. Most loans are funded to the Agriculture Sector with Food and Retail completing the first three. Looks like that if the sector of the business is close to Primary production or its Basic Necessities(food) "
103 |
104 |
# Display description
105 |
156 |
157 |
158 |
159 |
# Page 4: Repayment Interval by Selected Variable
160 |
elif page == "Repayment Interval by Selected Variable":
161 |
st.subheader('Repayment Interval by Selected Variable')
162 |
180 |
elif plot_var == 'country':
181 |
top_values_plot = df_kiva_loans_cleaned.groupby('country')['funded_amount'].agg('count').nlargest(num_top_values).index
182 |
filtered_df_plot = df_kiva_loans_cleaned[df_kiva_loans_cleaned['country'].isin(top_values_plot)]
183 |
description = f"This countplot illustrates the distribution of repayment intervals for the top {num_top_values} countries based on the number of loans. In terms of countries the Philippines had a great number of Irregular loans."
184 |
185 |
# Display description
186 |
207 |
plt.legend(title=plot_var.replace("_", " ").title(), bbox_to_anchor=(1.05, 1), loc='upper left')
208 |
209 |
210 |
# Page 5: Country Comparison Deepdive
211 |
elif page == "Country Comparison Deepdive":
212 |
st.subheader("Country Comparison Deepdive")
213 |
214 |
# Multi-select for countries
215 |
selected_countries = st.multiselect("Select Countries to Compare (Please select one or more)", options=df_kiva_loans_cleaned['country'].unique())
216 |
217 |
# Option to choose between count or sum of funded amounts
218 |
aggregation_option ="Select Aggregation Type:", ("Count of Loans", "Summary of Funded Amount"))
265 |
266 |
st.write("Please select one or more countries to compare from the dropdown above.")
267 |
268 |
# Page 6: Sector Comparison Deepdive
269 |
elif page == "Sector Comparison Deepdive":
270 |
st.subheader("Sector Comparison Deepdive")
271 |
273 |
selected_sectors = st.multiselect("Select Sectors to Compare (Please select one or more)", options=df_kiva_loans_cleaned['sector'].unique())
274 |
275 |
# Option to choose between count or sum of funded amounts
276 |
aggregation_option ="Select Aggregation Type:", ("Count of Loans", "Summary of Funded Amount"))
277 |
278 |
if selected_sectors:
279 |
# Filter the data based on selected sectors
325 |
326 |
# Fallback shown when nothing is selected on the Sector Comparison page, so the prompt must ask for sectors (the multiselect above lists sectors, not countries).
st.write("Please select one or more sectors to compare from the dropdown above.")
327 |
328 |
# Page 7: KMeans Clustering & Recommendations
329 |
elif page == "KMeans Clustering & Recommendations":
330 |
st.subheader("KMeans Clustering & Recommendations")
331 |
332 |
# User input to choose the number of sample rows
333 |
sample_size = st.slider("Select the number of sample rows for clustering:", min_value=1000, max_value=100000, value=20000, step=1000)
334 |
335 |
# Sample the selected number of rows from the DataFrame
336 |
df_sample = df_kiva_loans_cleaned.sample(n=sample_size, random_state=42).copy()
337 |
338 |
# Keeping only the relevant columns and storing original indices
339 |
df_original = df_sample[['country','funded_amount', 'sector','repayment_interval']].copy()
340 |
df_original['original_index'] = df_sample.index # Keep track of original indices
341 |
342 |
# Label Encoding for categorical variables and adding encoded columns with "_id" suffix
343 |
label_encoders = {}
344 |
for column in df_original.select_dtypes(include=['object']).columns:
345 |
le = LabelEncoder()
346 |
df_original[column + '_id'] = le.fit_transform(df_original[column])
347 |
label_encoders[column] = le
348 |
349 |
# Standardizing the data using the encoded columns
350 |
encoded_columns = [col + '_id' for col in df_original.select_dtypes(include=['object']).columns]
351 |
scaler = StandardScaler()
352 |
df_scaled = scaler.fit_transform(df_original[encoded_columns + ['funded_amount']])
353 |
354 |
# Applying PCA
355 |
pca = PCA(n_components=2) # Reduce to 2 dimensions for visualization
356 |
df_pca = pca.fit_transform(df_scaled)
357 |
358 |
# Elbow Method to find the optimal number of clusters
359 |
inertia = []
360 |
for n in range(1, 11):
361 |
kmeans = KMeans(n_clusters=n, random_state=42)
362 |
363 |
364 |
365 |
# Plotting the Elbow Method
366 |
plt.figure(figsize=(8, 6))
367 |
plt.plot(range(1, 11), inertia, marker='o', linestyle='--')
368 |
plt.title('Elbow Method for Optimal Number of Clusters')
369 |
plt.xlabel('Number of Clusters')
370 |
371 |
372 |
373 |
# User input to choose the optimal number of clusters
374 |
optimal_clusters = st.slider("Select the number of optimal clusters:", min_value=1, max_value=10, value=4, step=1)
375 |
376 |
# Apply KMeans with optimal clusters
377 |
kmeans = KMeans(n_clusters=optimal_clusters, random_state=42)
378 |
df_original['cluster'] = kmeans.fit_predict(df_pca)
379 |
380 |
# Visualize the clustering results at different iterations
381 |
max_iters = [1, 2, 5, 6, 8, 10] # Different iterations you want to visualize
382 |
383 |
# Increase the figure size for better visibility
384 |
plt.figure(figsize=(15, 55)) # Adjusted the figsize to make plots larger
385 |
for i, max_iter in enumerate(max_iters, start=1):
386 |
kmeans = KMeans(n_clusters=optimal_clusters, random_state=42, max_iter=max_iter)
387 |
df_original['cluster'] = kmeans.fit_predict(df_pca)
388 |
389 |
# Plotting the clusters
390 |
plt.subplot(6, 1, i)  # 6 rows x 1 column: one full-width subplot per max_iter value (i runs 1..6)
391 |
sns.scatterplot(x=df_pca[:, 0], y=df_pca[:, 1], hue=df_original['cluster'], palette='viridis', s=100)
392 |
393 |
# Plotting the centroids
394 |
centroids = kmeans.cluster_centers_
395 |
plt.scatter(centroids[:, 0], centroids[:, 1], c='red', s=300, marker='X', label='Centroids') # Increased centroid size
396 |
397 |
plt.title(f'K-means Clustering - Iteration {max_iter}', fontsize=16)
398 |
plt.xlabel('Principal Component 1', fontsize=14)
399 |
plt.ylabel('Principal Component 2', fontsize=14)
400 |
401 |
402 |
if i == 1:
403 |
404 |
405 |
406 |
407 |
408 |
# Dynamic input for the new data point
409 |
st.subheader("Input New Data Point for Recommendations")
410 |
411 |
# Allow the user to select the country, sector, and repayment interval
412 |
country = st.selectbox("Select Country", options=df_kiva_loans_cleaned['country'].unique())
413 |
sector = st.selectbox("Select Sector", options=df_kiva_loans_cleaned['sector'].unique())
414 |
repayment_interval = st.selectbox("Select Repayment Interval", options=df_kiva_loans_cleaned['repayment_interval'].unique())
415 |
416 |
# Allow the user to select the funded amount using a slider
417 |
funded_amount = st.slider("Select Funded Amount", min_value=int(df_kiva_loans_cleaned['funded_amount'].min()), max_value=int(df_kiva_loans_cleaned['funded_amount'].max()), value=1500)
418 |
419 |
new_data = {
420 |
'country': country,
421 |
'funded_amount': funded_amount,
422 |
'sector': sector,
423 |
'repayment_interval': repayment_interval
424 |
425 |
426 |
# Convert new data to DataFrame
427 |
new_data_df = pd.DataFrame([new_data])
428 |
429 |
# Encode the new data point and add encoded columns with "_id" suffix
430 |
for column in new_data_df.select_dtypes(include=['object']).columns:
431 |
new_data_df[column + '_id'] = label_encoders[column].transform(new_data_df[column])
432 |
433 |
# Standardize the new data using the encoded columns
434 |
new_data_scaled = scaler.transform(new_data_df[[col + '_id' for col in new_data_df.select_dtypes(include=['object']).columns] + ['funded_amount']])
435 |
436 |
# Apply PCA to the new data
437 |
new_data_pca = pca.transform(new_data_scaled)
438 |
439 |
# Predict the cluster for the new data point
440 |
new_cluster = kmeans.predict(new_data_pca)[0]
441 |
442 |
st.subheader("Top 5 Similar Items to the Input")
443 |
st.write(f"The new data point belongs to cluster: {new_cluster}")
444 |
# Get all data points in the same cluster
445 |
cluster_data = df_original[df_original['cluster'] == new_cluster]
446 |
447 |
# Apply the same PCA transformation to the scaled data of the entire cluster
448 |
cluster_data_pca = pca.transform(scaler.transform(cluster_data[encoded_columns + ['funded_amount']]))
449 |
450 |
# Calculate the Euclidean distance between the new data point and all points in the same cluster
451 |
distances = cdist(new_data_pca, cluster_data_pca, 'euclidean')[0]
452 |
453 |
# Add distances to the cluster data DataFrame
454 |
cluster_data = cluster_data.copy()
455 |
cluster_data['distance'] = distances
456 |
457 |
# Sort by distance and select the top 5 closest items
458 |
top_5_recommendations = cluster_data.sort_values('distance').head(5)
459 |
460 |
# Retrieve the original rows from the original DataFrame before encoding
461 |
recommended_indices = top_5_recommendations['original_index']
462 |
recommendations = df_kiva_loans_cleaned.loc[recommended_indices]
463 |
464 |
# Display the original rows as the top 5 recommendations
465 |
466 |
467 |
468 |
469 |
# Page 8: Hierarchical Clustering & Dendrogram
470 |
elif page == "Hierarchical Clustering & Dendrogram":
471 |
st.subheader("Hierarchical Clustering & Dendrogram")
472 |
473 |
# User input to choose the number of sample rows
474 |
sample_size = st.slider("Select the number of sample rows for clustering:", min_value=1000, max_value=5000, value=5000, step=50)
475 |
476 |
# User input to choose the number of clusters
477 |
n_clusters = st.slider("Select the number of clusters:", min_value=2, max_value=10, value=4, step=1)
478 |
479 |
# Sample the selected number of rows from the DataFrame
480 |
df_sample = df_kiva_loans_cleaned.sample(n=sample_size, random_state=42).copy()
481 |
482 |
# Keeping only the relevant columns and storing original indices
483 |
df_original = df_sample[['country','funded_amount', 'sector','repayment_interval']].copy()
484 |
df_original['original_index'] = df_sample.index # Keep track of original indices
485 |
486 |
# Label Encoding for categorical variables and adding encoded columns with "_id" suffix
487 |
label_encoders = {}
488 |
for column in df_original.select_dtypes(include=['object']).columns:
489 |
le = LabelEncoder()
490 |
df_original[column + '_id'] = le.fit_transform(df_original[column])
491 |
label_encoders[column] = le
492 |
493 |
# Standardizing the data using the encoded columns
494 |
encoded_columns = [col + '_id' for col in df_original.select_dtypes(include=['object']).columns]
495 |
scaler = StandardScaler()
496 |
df_scaled = scaler.fit_transform(df_original[encoded_columns + ['funded_amount']])
497 |
498 |
# Applying PCA
499 |
pca = PCA(n_components=2) # Reduce to 2 dimensions for visualization
500 |
df_pca = pca.fit_transform(df_scaled)
501 |
502 |
# Perform Agglomerative Clustering with dynamic n_clusters
503 |
agg_clustering = AgglomerativeClustering(n_clusters=n_clusters, linkage='ward')
504 |
df_original['cluster'] = agg_clustering.fit_predict(df_pca)
505 |
506 |
# Plot the resulting clusters
507 |
plt.figure(figsize=(10, 7))
508 |
sns.scatterplot(x=df_pca[:, 0], y=df_pca[:, 1], hue=df_original['cluster'], palette='viridis', s=50)
509 |
plt.title(f'Agglomerative Clustering (Hierarchical) Results - {n_clusters} Clusters')
510 |
plt.xlabel('Principal Component 1')
511 |
plt.ylabel('Principal Component 2')
512 |
513 |
514 |
# Dendrogram Visualization
515 |
linked = linkage(df_pca, method='ward')
516 |
517 |
plt.figure(figsize=(10, 7))
518 |
519 |
520 |
521 |
522 |
plt.title('Hierarchical Clustering Dendrogram')
523 |
@@ -3,4 +3,5 @@ seaborn
3 |
4 |
5 |
6 |
3 |
4 |
5 |
6 |
7 |