Spaces:
Sleeping
Sleeping
import matplotlib.pyplot as plt | |
import numpy as np | |
import pandas as pd | |
import seaborn as sns | |
from lifelines import KaplanMeierFitter | |
from matplotlib.colors import LinearSegmentedColormap | |
from utils import save_plot | |
def load_and_prepare_data(file_path): | |
""" | |
Load CSV data, convert 'month_year' to datetime, and prepare cohort and duration calculations. | |
Filter data to include only entries from 2021 onwards and adjust the cohort calculation based on the first active month. | |
Additionally, eliminate all months with a negative 'Order' so we only get the months after the cohort of the individual. | |
""" | |
df = pd.read_csv(file_path) | |
df["month_year"] = pd.to_datetime(df["month_year"], format="%B_%Y") | |
df = df[df["month_year"] >= "2021-09-01"] | |
df["Active"] = df["total_commits"] > 0 | |
df.sort_values(by=["developer", "month_year"], inplace=True) | |
first_active_month = ( | |
df[df["Active"]].groupby("developer")["month_year"].min().reset_index() | |
) | |
first_active_month.rename(columns={"month_year": "FirstActiveMonth"}, inplace=True) | |
df = df.merge(first_active_month, on="developer", how="left") | |
df["Cohort"] = df["FirstActiveMonth"].dt.to_period("M") | |
def calculate_order(row): | |
if pd.isnull(row["Cohort"]): | |
return None | |
return (row["month_year"].to_period("M") - row["Cohort"]).n | |
df["Order"] = df.apply(calculate_order, axis=1) | |
df = df[df["Order"] >= 0] | |
df["Inactive_Month"] = df.groupby("developer")["Active"].transform( | |
lambda x: x.rolling(window=2, min_periods=2).sum() == 0 | |
) | |
df["inactive_for_two_months"] = ( | |
df.groupby("developer")["Inactive_Month"].transform("max").astype(int) | |
) | |
df["duration"] = df.groupby("developer")["month_year"].transform("nunique") | |
df.to_csv("debug.csv", index=False) | |
return df | |
def visualize_developer_retention(df): | |
cohort_counts = ( | |
df[~df["Inactive_Month"]] | |
.groupby(["Cohort", "Order"]) | |
.developer.nunique() | |
.unstack(0) | |
) | |
cohort_sizes = cohort_counts.iloc[0] | |
retention = cohort_counts.divide(cohort_sizes, axis=1) | |
colors = [(0, "#FF0000"), (0.15, "#FFA500"), (0.2, "#FFFF00"), (1, "#008000")] | |
cmap = LinearSegmentedColormap.from_list("custom_cmap", colors, N=256) | |
plt.figure(figsize=(12, 8)) # Adjusted figure size for better visibility | |
sns.heatmap(retention.T, annot=False, cmap=cmap) | |
plt.title("Journey Through Code: Tracking Developer Engagement Over Time", pad=20) | |
plt.subplots_adjust(bottom=0.3) | |
description_text = ( | |
"This heatmap visualizes the engagement journey of developers, tracked monthly across cohorts." | |
" Each cohort represents developers who began contributing in the same month." | |
" The color gradient from red to green signifies the evolution of active engagement over time," | |
" with red indicating lower engagement levels and green denoting higher activity." | |
" Cohorts are plotted on the y-axis, and the actual months since the start of the cohort on the x-axis." | |
" This visualization offers insights into how developer activity trends evolve," | |
" highlighting periods of increased or decreased engagement and aiding in understanding" | |
" the effectiveness of retention strategies over time." | |
" Parameters:" | |
"(a) A developer is considered inactive if they have at least 2 continuous inactive months." | |
"(b) With one commit in a month, the developer is considered active." | |
"(c) The data is filtered to include only entries from September 2021 onwards." | |
) | |
plt.figtext(0.5, -0.0001, description_text, ha="center", fontsize=9, wrap=True) | |
save_plot(plt, "developer_engagement_journey") | |
def survival_curve_analysis_and_plot(df): | |
""" | |
Perform analysis on the DataFrame to calculate durations and generate visualizations, with annotations explaining the analysis. | |
Adjust the event definition and perform Log-Rank Test. | |
""" | |
summary_df = ( | |
df.groupby("developer") | |
.agg({"duration": "first", "inactive_for_two_months": "last"}) | |
.reset_index() | |
) | |
kmf = KaplanMeierFitter() | |
kmf.fit( | |
durations=summary_df["duration"], | |
event_observed=summary_df["inactive_for_two_months"], | |
label="Developer Survival Probability", | |
) | |
plt.figure(figsize=(10, 6)) | |
ax = plt.subplot(111) | |
kmf.plot_survival_function(ax=ax) | |
plt.title("Developer Survival Curve: Probability of Active Contribution Over Time") | |
plt.grid(True, which="both", linestyle="--", linewidth=0.5) | |
median_survival_time = kmf.median_survival_time_ | |
ax.axhline(y=0.5, color="red", linestyle="--") | |
ax.text( | |
median_survival_time, | |
0.48, | |
"Median Survival Time", | |
verticalalignment="center", | |
color="red", | |
fontsize=8, | |
) | |
ax.axvline(x=3, color="green", linestyle="--") | |
ax.text( | |
3, | |
0.95, | |
"Inactive Month + 1", | |
verticalalignment="top", | |
horizontalalignment="center", | |
color="green", | |
fontsize=8, | |
) | |
ax.axvline(x=median_survival_time, color="green", linestyle="--") | |
ax.text( | |
len(df["duration"].unique()), | |
0.9, | |
f"After month {int(median_survival_time)} the probability of developers staying is lower than 50 percent", | |
verticalalignment="top", | |
horizontalalignment="right", | |
color="green", | |
fontsize=8, | |
) | |
ax.set_yticks(np.arange(0, 1.1, 0.1)) | |
# Setting the x-axis and y-axis labels as per the request | |
plt.xlabel("Months since the developer started committing code") | |
plt.ylabel("Probability of a developer staying in the ecosystem") | |
description_text = ( | |
"The Kaplan-Meier survival curve shows the probability of developers continuing to contribute over time." | |
"Parameters:" | |
"(a) A developer is consider as inactive if they have at least 2 continuous inactive months." | |
"(b) With one commit in a month, the developer is considered active." | |
"(c) The data is filtered to include only entries from September 2021 onwards." | |
"The Kaplan-Meier estimator is a non-parametric statistic used to estimate the survival function from lifetime data." | |
"It requires to know the duration each subject was observed for, and whether the event of interest" | |
"(in this case, becoming inactive for two months) was observed." | |
"The 'Median Survival Time' shows when the chance of further contributions drops below 50%. " | |
"This analysis helps in understanding the retention of developers and predicting future contribution patterns." | |
) | |
plt.figtext(0.1, -0.1, description_text, ha="left", fontsize=8, wrap=True) | |
save_plot(plt, "developer_survival_curve") | |
if __name__ == "__main__": | |
csv_path = "data/source/all_networks_developer_classification.csv" | |
df = load_and_prepare_data(csv_path) | |
visualize_developer_retention(df) | |
survival_curve_analysis_and_plot(df) | |