Skip to Content
Documentation › Kaplan-Meier Curves

Kaplan-Meier Survival Curves

The Kaplan-Meier estimator is the gold standard for visualizing survival data. It provides a non-parametric estimate of the survival function from observed survival times.

Theory

The Kaplan-Meier survival estimate at time t is:

$$\hat{S}(t) = \prod_{i:\; t_i \le t} \left(1 - \frac{d_i}{n_i}\right)$$

Where:

  • $d_i$ = number of events at time $t_i$
  • $n_i$ = number of subjects still at risk (no event and not yet censored) just before time $t_i$

Basic Implementation

# Fit a Kaplan-Meier estimator to simulated data and plot the resulting curve.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from lifelines import KaplanMeierFitter

# Simulated cohort: exponential durations, event indicator drawn with p=0.6
# (rows with event == 0 are treated as censored by the fitter).
np.random.seed(42)
n = 100
data = pd.DataFrame({
    'time': np.random.exponential(scale=10, size=n),
    'event': np.random.binomial(1, 0.6, size=n),
})

# fit() returns the fitter itself, so construction and fitting can be chained.
kmf = KaplanMeierFitter().fit(
    durations=data['time'],
    event_observed=data['event'],
)

# Draw the estimated survival function on a fresh figure.
plt.figure(figsize=(10, 6))
kmf.plot_survival_function()
plt.title('Kaplan-Meier Survival Curve', fontsize=14, fontweight='bold')
plt.ylabel('Survival Probability', fontsize=12)
plt.xlabel('Time', fontsize=12)
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()

Customizing Plots

Adding Confidence Intervals

# Plot the fitted curve together with its confidence band.
fig, ax = plt.subplots(figsize=(10, 6))

kmf.plot_survival_function(
    ax=ax,
    ci_show=True,     # draw the confidence band
    # Band transparency is controlled by `ci_alpha`; a plain `alpha` is
    # forwarded to the curve itself and would fade the survival line.
    ci_alpha=0.2,
    color='#2E86AB',
    linewidth=2.5,
)

# Axis cosmetics
ax.set_title('Kaplan-Meier Curve with 95% Confidence Intervals',
             fontsize=14, fontweight='bold')
ax.set_ylabel('Survival Probability', fontsize=12)
ax.set_xlabel('Time (months)', fontsize=12)
ax.grid(alpha=0.3)
ax.set_ylim([0, 1.05])  # small headroom above S(t) = 1

plt.tight_layout()
plt.show()

Showing Number at Risk

# Survival curve with a number-at-risk table rendered beneath the x-axis.
from lifelines.plotting import add_at_risk_counts

fig, ax = plt.subplots(figsize=(12, 7))

# Curve first, then the at-risk table attached to the same axes.
kmf.plot_survival_function(ax=ax, ci_show=True)
add_at_risk_counts(kmf, ax=ax)

ax.set_title('Survival Curve with At-Risk Table', fontsize=14)
ax.set_ylabel('Survival Probability')
ax.set_xlabel('Time (months)')

plt.tight_layout()
plt.show()

Comparing Multiple Groups

Two Groups

# Simulate two treatment arms and overlay their Kaplan-Meier curves.
np.random.seed(42)

group_a = pd.DataFrame({
    'time': np.random.exponential(scale=15, size=50),
    'event': np.random.binomial(1, 0.6, size=50),
    'group': 'Treatment A',
})
group_b = pd.DataFrame({
    'time': np.random.exponential(scale=10, size=50),
    'event': np.random.binomial(1, 0.7, size=50),
    'group': 'Treatment B',
})
data = pd.concat([group_a, group_b], ignore_index=True)

fig, ax = plt.subplots(figsize=(10, 6))
colors = ['#2E86AB', '#A23B72']

# One fitter per arm; pairing each group with its colour avoids index lookups.
for color, (group_name, group_data) in zip(colors, data.groupby('group')):
    kmf = KaplanMeierFitter()
    kmf.fit(
        durations=group_data['time'],
        event_observed=group_data['event'],
        label=group_name,
    )
    kmf.plot_survival_function(ax=ax, ci_show=True, color=color)

ax.set_title('Survival Comparison: Treatment A vs B', fontsize=14)
ax.set_ylabel('Survival Probability')
ax.set_xlabel('Time (months)')
ax.legend(loc='best')
ax.grid(alpha=0.3)

plt.tight_layout()
plt.show()

Multiple Groups with At-Risk Table

# Three simulated stages plotted together, with a shared at-risk table.
from lifelines.plotting import add_at_risk_counts

np.random.seed(42)
stage_scales = [('Stage I', 20), ('Stage II', 12), ('Stage III', 8)]

# One frame per stage, built in listed order so the random draws are reproducible.
groups_data = [
    pd.DataFrame({
        'time': np.random.exponential(scale=scale, size=40),
        'event': np.random.binomial(1, 0.65, size=40),
        'group': group_name,
    })
    for group_name, scale in stage_scales
]
data = pd.concat(groups_data, ignore_index=True)

fig, ax = plt.subplots(figsize=(12, 8))
colors = ['#06D6A0', '#FFD166', '#EF476F']
kmf_list = []  # fitted estimators, kept for the at-risk table

for color, (group_name, group_data) in zip(colors, data.groupby('group')):
    kmf = KaplanMeierFitter()
    kmf.fit(
        durations=group_data['time'],
        event_observed=group_data['event'],
        label=group_name,
    )
    kmf.plot_survival_function(ax=ax, ci_show=False, color=color, linewidth=2.5)
    kmf_list.append(kmf)

# At-risk counts for every fitted group, drawn under the same axes.
add_at_risk_counts(*kmf_list, ax=ax)

ax.set_title('Survival by Cancer Stage', fontsize=14, fontweight='bold')
ax.set_ylabel('Survival Probability', fontsize=12)
ax.set_xlabel('Time (months)', fontsize=12)
ax.legend(loc='best', fontsize=10)
ax.grid(alpha=0.3)

plt.tight_layout()
plt.show()

Statistical Testing

Log-Rank Test

Test if survival curves differ significantly:

# Two-sample log-rank test comparing Treatment A with Treatment B.
from lifelines.statistics import logrank_test

# Use the per-arm frames directly. The name `data` is reused by the
# three-stage example on this page, so filtering `data` on
# 'Treatment A' / 'Treatment B' at this point would select zero rows.
group_a_data = group_a
group_b_data = group_b

results = logrank_test(
    durations_A=group_a_data['time'],
    durations_B=group_b_data['time'],
    event_observed_A=group_a_data['event'],
    event_observed_B=group_b_data['event'],
)

print("Log-Rank Test Results")
print("=" * 40)
print(f"Test statistic: {results.test_statistic:.4f}")
print(f"P-value: {results.p_value:.4f}")
print(f"Degrees of freedom: {results.degrees_of_freedom}")

# Conventional 5% significance threshold.
if results.p_value < 0.05:
    print("\n✓ Significant difference (p < 0.05)")
else:
    print("\n✗ No significant difference (p ≥ 0.05)")

# Full summary table
print(f"\n{results.summary}")

Multivariate Log-Rank Test

For more than two groups:

# Log-rank test across every level of the `group` column at once.
from lifelines.statistics import multivariate_logrank_test

result = multivariate_logrank_test(
    event_durations=data['time'],
    groups=data['group'],
    event_observed=data['event'],
)

print("Multivariate Log-Rank Test")
print("=" * 40)
print(f"Test statistic: {result.test_statistic:.4f}")
print(f"P-value: {result.p_value:.4f}")
print(f"\n{result.summary}")

Advanced Visualizations

Survival with Events Marked

# Overlay a marker on the curve at every observed event time.
fig, ax = plt.subplots(figsize=(12, 6))

kmf = KaplanMeierFitter()
kmf.fit(data['time'], data['event'])
kmf.plot_survival_function(ax=ax, ci_show=True, color='#2E86AB')

# Rows with event == 1 are observed events; look up the estimated survival
# at each of their times so the markers sit on the curve.
is_event = data['event'] == 1
event_times = data.loc[is_event, 'time']
event_survival = [kmf.predict(when) for when in event_times]

ax.scatter(
    event_times,
    event_survival,
    color='red',
    s=30,
    alpha=0.6,
    label='Events',
    zorder=5,  # draw above the curve and band
)

ax.set_title('Survival Curve with Event Markers', fontsize=14)
ax.set_ylabel('Survival Probability')
ax.set_xlabel('Time (months)')
ax.legend()
ax.grid(alpha=0.3)

plt.tight_layout()
plt.show()

Median Survival Visualization

# Mark the median survival time (where the curve crosses S(t) = 0.5).
fig, ax = plt.subplots(figsize=(10, 6))

kmf = KaplanMeierFitter()
kmf.fit(data['time'], data['event'])
kmf.plot_survival_function(ax=ax, ci_show=True)

median_time = kmf.median_survival_time_

# Horizontal guide at 50% survival is always meaningful.
ax.axhline(y=0.5, color='red', linestyle='--', alpha=0.7,
           label='Median Survival')

note_box = dict(boxstyle='round', facecolor='wheat', alpha=0.5)
if np.isfinite(median_time):
    # Vertical guide and label at the median time.
    ax.axvline(x=median_time, color='red', linestyle='--', alpha=0.7)
    ax.annotate(f'Median: {median_time:.1f} months',
                xy=(median_time, 0.5),
                xytext=(median_time + 2, 0.45),
                fontsize=10,
                bbox=note_box)
else:
    # The median is infinite when the curve never drops to 0.5; an
    # infinite coordinate cannot be drawn, so say so instead.
    ax.text(0.98, 0.55, 'Median not reached',
            transform=ax.transAxes,
            horizontalalignment='right',
            fontsize=10,
            bbox=note_box)

ax.set_title('Survival Curve with Median Survival Time', fontsize=14)
ax.set_ylabel('Survival Probability')
ax.set_xlabel('Time (months)')
ax.legend()
ax.grid(alpha=0.3)

plt.tight_layout()
plt.show()

Extracting Survival Estimates

At Specific Time Points

# Report the survival estimate and its confidence bounds at chosen times.
time_points = [6, 12, 24, 36, 48, 60]
ci_table = kmf.confidence_interval_survival_function_

print("Survival Estimates")
print("=" * 50)

for t in time_points:
    survival_prob = kmf.predict(t)
    # Take the last confidence-interval row whose time index is <= t.
    ci = ci_table.loc[ci_table.index <= t].iloc[-1]
    lower, upper = ci.iloc[0], ci.iloc[1]
    print(f"At {t:2d} months: {survival_prob:.2%} "
          f"(95% CI: {lower:.2%} - {upper:.2%})")

Survival Table

# Build a survival table (estimate + confidence bounds) and print key rows.
survival_table = kmf.survival_function_
confidence_intervals = kmf.confidence_interval_survival_function_

# Both frames share the same time index, so they can be joined column-wise.
table = pd.concat([survival_table, confidence_intervals], axis=1)
table.columns = ['Survival', 'CI_lower', 'CI_upper']

key_times = [0, 10, 20, 30, 40, 50]
last_followup = table.index.max()

print("\nSurvival Table (Key Timepoints)")
print("=" * 60)
for t in key_times:
    if t > last_followup:
        # No data supports an estimate beyond the last observed time.
        break
    # The index holds the observed (float) times, so an exact match such as
    # `t in table.index` almost never succeeds. The Kaplan-Meier curve is a
    # step function: the value at t is the last row with index <= t.
    row = table.loc[:t].iloc[-1]
    print(f"{t:3d} months: {row['Survival']:.4f} "
          f"[{row['CI_lower']:.4f}, {row['CI_upper']:.4f}]")

Publication-Quality Plots

# Publication-style figure: two treatment arms, styled axes, log-rank p-value.
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
from lifelines.statistics import logrank_test

# Rebuild the two-arm cohort from the per-arm frames. The name `data` is
# reused for the three-stage cohort elsewhere on this page, which would put
# three curves (two sharing a colour) under a "Treatment Group" title.
treatment_data = pd.concat([group_a, group_b], ignore_index=True)

# Compute the p-value from exactly the data being plotted so the annotation
# cannot drift out of sync with the curves.
results = logrank_test(
    durations_A=group_a['time'],
    durations_B=group_b['time'],
    event_observed_A=group_a['event'],
    event_observed_B=group_b['event'],
)

fig, ax = plt.subplots(figsize=(10, 7))

arm_colors = ['#2E86AB', '#A23B72']
for color, (group_name, group_data) in zip(arm_colors,
                                           treatment_data.groupby('group')):
    kmf = KaplanMeierFitter()
    kmf.fit(group_data['time'], group_data['event'], label=group_name)
    kmf.plot_survival_function(
        ax=ax,
        ci_show=True,
        color=color,
        linewidth=2.5,
        alpha=0.8,  # slight transparency on the curve itself
    )

# Styling
ax.set_title('Overall Survival by Treatment Group',
             fontsize=16, fontweight='bold', pad=20)
ax.set_ylabel('Survival Probability', fontsize=13, fontweight='bold')
ax.set_xlabel('Time (months)', fontsize=13, fontweight='bold')
ax.set_ylim([0, 1.05])
ax.grid(True, alpha=0.3, linestyle='--')
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)

# Legend
ax.legend(loc='lower left', fontsize=11, frameon=True, shadow=True,
          fancybox=True)

# p-value annotation in the top-right corner (axes-fraction coordinates)
ax.text(0.98, 0.98, f'Log-rank p = {results.p_value:.4f}',
        transform=ax.transAxes,
        fontsize=11,
        verticalalignment='top',
        horizontalalignment='right',
        bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))

plt.tight_layout()
plt.savefig('km_curve_publication.png', dpi=300, bbox_inches='tight')
plt.show()

Best Practices:

  • Always show confidence intervals for uncertainty
  • Include number at risk table for transparency
  • Report median survival times with CIs
  • Use log-rank test for statistical comparison
  • Consider stratification for confounding variables

Common Pitfalls

Small Sample Sizes

# Summarise total, event and censored counts for every group.
print("Sample Sizes:")
for group_name, group_data in data.groupby('group'):
    n_total = len(group_data)
    n_events = group_data['event'].sum()
    n_censored = n_total - n_events  # rows without an observed event

    lines = (
        f"{group_name}:",
        f" Total: {n_total}",
        f" Events: {n_events}",
        f" Censored: {n_censored} ({n_censored/n_total:.1%})",
    )
    for line in lines:
        print(line)

Groups with fewer than 10 events may produce unreliable estimates. High censoring rates (>50%) can also affect interpretation.

Crossing Curves

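When curves cross, the hazards are not proportional and the standard log-rank test can lose power, because early and late differences cancel out. In that case it may not be appropriate: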

# Check for crossing: inspect the plotted curves visually, then, if they
# cross, prefer a weighted log-rank test over the standard one.
from lifelines.statistics import pairwise_logrank_test

# Wilcoxon weighting puts more weight on early differences. Other accepted
# values include 'tarone-ware' and 'peto'.
weighted_results = pairwise_logrank_test(
    data['time'],
    data['group'],
    data['event'],
    weightings='wilcoxon',
)
print(weighted_results.summary)

# If curves cross, consider:
# 1. Different time periods
# 2. Weighted tests (Wilcoxon, Tarone-Ware)
# 3. Restricted mean survival time

Next Steps

Last updated on