# Starknet_Dev_Metrics / github_metrics / developer_survival_plot.py
# Uploaded by espejelomar using huggingface_hub (commit 151eb1b, verified); 7.06 kB.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from lifelines import KaplanMeierFitter
from matplotlib.colors import LinearSegmentedColormap
from utils import save_plot
def load_and_prepare_data(file_path, debug_csv_path="debug.csv"):
    """
    Load the developer activity CSV and derive cohort and survival columns.

    The input must provide the columns ``developer``, ``month_year``
    (formatted like ``September_2021``) and ``total_commits``. Only rows from
    September 2021 onwards are kept. A developer's cohort is the first month
    (within that window) with at least one commit; rows before the cohort
    month are dropped, as are developers who never commit in the window.

    Args:
        file_path: Path or file-like object passed to ``pandas.read_csv``.
        debug_csv_path: Destination of a CSV dump of the prepared frame.
            Defaults to ``"debug.csv"`` (the historical behaviour); pass
            ``None`` to skip writing it.

    Returns:
        The prepared DataFrame, sorted by developer and month, with the added
        columns ``Active``, ``FirstActiveMonth``, ``Cohort``, ``Order``,
        ``Inactive_Month``, ``inactive_for_two_months`` and ``duration``.
    """
    df = pd.read_csv(file_path)
    df["month_year"] = pd.to_datetime(df["month_year"], format="%B_%Y")

    # Copy the filtered slice so the column assignments below operate on an
    # independent frame instead of a view (avoids SettingWithCopy issues).
    df = df[df["month_year"] >= "2021-09-01"].copy()
    df["Active"] = df["total_commits"] > 0
    df = df.sort_values(by=["developer", "month_year"])

    first_active_month = (
        df[df["Active"]]
        .groupby("developer")["month_year"]
        .min()
        .rename("FirstActiveMonth")
        .reset_index()
    )
    # Left merge: developers without any active month get NaT here.
    df = df.merge(first_active_month, on="developer", how="left")
    df["Cohort"] = df["FirstActiveMonth"].dt.to_period("M")

    # Months elapsed since the cohort month, computed column-wise instead of
    # with a per-row apply. NaT cohorts yield NaN, which fails the ">= 0"
    # comparison and is therefore dropped together with pre-cohort months.
    month_index = df["month_year"].dt.year * 12 + df["month_year"].dt.month
    cohort_index = (
        df["FirstActiveMonth"].dt.year * 12 + df["FirstActiveMonth"].dt.month
    )
    df["Order"] = month_index - cohort_index
    df = df[df["Order"] >= 0].copy()
    df["Order"] = df["Order"].astype(int)

    # True on a row when that row and the previous row of the same developer
    # are both inactive. The window runs over consecutive rows, so this
    # assumes one row per developer per month -- TODO confirm against the CSV.
    df["Inactive_Month"] = df.groupby("developer")["Active"].transform(
        lambda active: active.astype(int).rolling(window=2, min_periods=2).sum()
        == 0
    )
    # Per-developer event flag: 1 if any two-row inactive streak was seen.
    df["inactive_for_two_months"] = (
        df.groupby("developer")["Inactive_Month"].transform("max").astype(int)
    )
    # NOTE(review): duration counts every distinct month kept for the
    # developer, not the months until the first inactive streak -- confirm
    # this is the intended time-to-event definition for the survival fit.
    df["duration"] = df.groupby("developer")["month_year"].transform("nunique")

    if debug_csv_path is not None:
        df.to_csv(debug_csv_path, index=False)
    return df
def visualize_developer_retention(df):
    """
    Draw a cohort retention heatmap from the prepared developer frame.

    For every (Cohort, Order) pair the number of distinct developers whose
    row is not flagged ``Inactive_Month`` is counted and divided by the
    cohort's count in its first ``Order`` row, giving a retention fraction.
    Cohorts are drawn on the y-axis and ``Order`` on the x-axis. The pyplot
    module is then passed to ``save_plot`` with the name
    ``developer_engagement_journey``.

    Args:
        df: DataFrame with the columns ``Inactive_Month`` (bool), ``Cohort``,
            ``Order`` and ``developer``, as built by ``load_and_prepare_data``.
    """
    engaged = df[~df["Inactive_Month"]]
    cohort_counts = (
        engaged.groupby(["Cohort", "Order"])["developer"].nunique().unstack(0)
    )
    # The first row is the lowest Order present; each cohort's count there is
    # used as its size (the denominator of the retention fraction).
    cohort_sizes = cohort_counts.iloc[0]
    retention = cohort_counts.divide(cohort_sizes, axis=1)

    colors = [(0, "#FF0000"), (0.15, "#FFA500"), (0.2, "#FFFF00"), (1, "#008000")]
    cmap = LinearSegmentedColormap.from_list("custom_cmap", colors, N=256)

    plt.figure(figsize=(12, 8))  # Adjusted figure size for better visibility
    # Pin the colour scale to [0, 1] so the colormap stops above line up with
    # retention fractions instead of being rescaled to the observed min/max.
    sns.heatmap(retention.T, annot=False, cmap=cmap, vmin=0, vmax=1)
    plt.title("Journey Through Code: Tracking Developer Engagement Over Time", pad=20)
    plt.subplots_adjust(bottom=0.3)

    description_text = (
        "This heatmap visualizes the engagement journey of developers, tracked monthly across cohorts."
        " Each cohort represents developers who began contributing in the same month."
        " The color gradient from red to green signifies the evolution of active engagement over time,"
        " with red indicating lower engagement levels and green denoting higher activity."
        " Cohorts are plotted on the y-axis, and the actual months since the start of the cohort on the x-axis."
        " This visualization offers insights into how developer activity trends evolve,"
        " highlighting periods of increased or decreased engagement and aiding in understanding"
        " the effectiveness of retention strategies over time."
        " Parameters:"
        " (a) A developer is considered inactive if they have at least 2 continuous inactive months."
        " (b) With one commit in a month, the developer is considered active."
        " (c) The data is filtered to include only entries from September 2021 onwards."
    )
    plt.figtext(0.5, -0.0001, description_text, ha="center", fontsize=9, wrap=True)
    save_plot(plt, "developer_engagement_journey")
def survival_curve_analysis_and_plot(df):
    """
    Fit a Kaplan-Meier estimator per developer and plot the survival curve.

    The frame is collapsed to one row per developer (first ``duration``, last
    ``inactive_for_two_months``), a ``KaplanMeierFitter`` is fitted with
    ``duration`` as the time and ``inactive_for_two_months`` as the event
    flag, and the survival function is plotted with reference lines and
    explanatory text. The pyplot module is then passed to ``save_plot`` with
    the name ``developer_survival_curve``.

    Args:
        df: DataFrame with the columns ``developer``, ``duration`` and
            ``inactive_for_two_months``, as built by ``load_and_prepare_data``.
    """
    summary_df = (
        df.groupby("developer")
        .agg({"duration": "first", "inactive_for_two_months": "last"})
        .reset_index()
    )

    kmf = KaplanMeierFitter()
    kmf.fit(
        durations=summary_df["duration"],
        event_observed=summary_df["inactive_for_two_months"],
        label="Developer Survival Probability",
    )

    plt.figure(figsize=(10, 6))
    ax = plt.subplot(111)
    kmf.plot_survival_function(ax=ax)
    plt.title("Developer Survival Curve: Probability of Active Contribution Over Time")
    plt.grid(True, which="both", linestyle="--", linewidth=0.5)

    ax.axhline(y=0.5, color="red", linestyle="--")

    ax.axvline(x=3, color="green", linestyle="--")
    ax.text(
        3,
        0.95,
        "Inactive Month + 1",
        verticalalignment="top",
        horizontalalignment="center",
        color="green",
        fontsize=8,
    )

    median_survival_time = kmf.median_survival_time_
    # Anchor the right-aligned note at the largest observed duration (an
    # x-axis coordinate) rather than at the count of distinct durations.
    right_edge = summary_df["duration"].max()

    if np.isfinite(median_survival_time):
        ax.text(
            median_survival_time,
            0.48,
            "Median Survival Time",
            verticalalignment="center",
            color="red",
            fontsize=8,
        )
        ax.axvline(x=median_survival_time, color="green", linestyle="--")
        ax.text(
            right_edge,
            0.9,
            f"After month {int(median_survival_time)} the probability of developers staying is lower than 50 percent",
            verticalalignment="top",
            horizontalalignment="right",
            color="green",
            fontsize=8,
        )
    else:
        # The median is infinite when the curve never falls to 0.5; int() of
        # it would raise and a vertical line at infinity cannot be drawn.
        ax.text(
            right_edge,
            0.9,
            "The median survival time was not reached within the observed period",
            verticalalignment="top",
            horizontalalignment="right",
            color="green",
            fontsize=8,
        )

    ax.set_yticks(np.arange(0, 1.1, 0.1))

    plt.xlabel("Months since the developer started committing code")
    plt.ylabel("Probability of a developer staying in the ecosystem")

    description_text = (
        "The Kaplan-Meier survival curve shows the probability of developers continuing to contribute over time."
        " Parameters:"
        " (a) A developer is considered inactive if they have at least 2 continuous inactive months."
        " (b) With one commit in a month, the developer is considered active."
        " (c) The data is filtered to include only entries from September 2021 onwards."
        " The Kaplan-Meier estimator is a non-parametric statistic used to estimate the survival function from lifetime data."
        " It requires to know the duration each subject was observed for, and whether the event of interest"
        " (in this case, becoming inactive for two months) was observed."
        " The 'Median Survival Time' shows when the chance of further contributions drops below 50%."
        " This analysis helps in understanding the retention of developers and predicting future contribution patterns."
    )
    plt.figtext(0.1, -0.1, description_text, ha="left", fontsize=8, wrap=True)
    save_plot(plt, "developer_survival_curve")
if __name__ == "__main__":
    # Script entry point: prepare the frame once, then feed it to both plots.
    prepared = load_and_prepare_data(
        "data/source/all_networks_developer_classification.csv"
    )
    visualize_developer_retention(prepared)
    survival_curve_analysis_and_plot(prepared)