Kaplan-Meier Survival Curves

The Kaplan-Meier estimator is the gold standard for visualizing survival data. It provides a non-parametric estimate of the survival function from observed survival times.

Theory

The Kaplan-Meier survival estimate at time t is:

insert eqn here

Where:

$d_i$ = number of events at time $t_i$
$n_i$ = number at risk at time $t_i$

Basic Implementation


from lifelines import KaplanMeierFitter
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
 
# Sample data
np.random.seed(42)
n = 100
 
data = pd.DataFrame({
    'time': np.random.exponential(scale=10, size=n),
    'event': np.random.binomial(1, 0.6, size=n)
})
 
# Initialize and fit
kmf = KaplanMeierFitter()
kmf.fit(durations=data['time'], event_observed=data['event'])
 
# Plot
plt.figure(figsize=(10, 6))
kmf.plot_survival_function()
plt.title('Kaplan-Meier Survival Curve', fontsize=14, fontweight='bold')
plt.ylabel('Survival Probability', fontsize=12)
plt.xlabel('Time', fontsize=12)
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()

Customizing Plots

Adding Confidence Intervals


fig, ax = plt.subplots(figsize=(10, 6))
 
# Plot with confidence intervals
kmf.plot_survival_function(
    ax=ax,
    ci_show=True,  # Show confidence intervals
    alpha=0.2,     # CI transparency
    color='#2E86AB',
    linewidth=2.5
)
 
# Customize
ax.set_title('Kaplan-Meier Curve with 95% Confidence Intervals', 
             fontsize=14, fontweight='bold')
ax.set_ylabel('Survival Probability', fontsize=12)
ax.set_xlabel('Time (months)', fontsize=12)
ax.grid(alpha=0.3)
ax.set_ylim([0, 1.05])
 
plt.tight_layout()
plt.show()

Showing Number at Risk


from lifelines.plotting import add_at_risk_counts
 
fig, ax = plt.subplots(figsize=(12, 7))
 
# Plot survival curve
kmf.plot_survival_function(ax=ax, ci_show=True)
 
# Add at-risk table below
add_at_risk_counts(kmf, ax=ax)
 
ax.set_title('Survival Curve with At-Risk Table', fontsize=14)
ax.set_ylabel('Survival Probability')
ax.set_xlabel('Time (months)')
 
plt.tight_layout()
plt.show()

Comparing Multiple Groups

Two Groups


# Simulate data with two groups
np.random.seed(42)
 
group_a = pd.DataFrame({
    'time': np.random.exponential(scale=15, size=50),
    'event': np.random.binomial(1, 0.6, size=50),
    'group': 'Treatment A'
})
 
group_b = pd.DataFrame({
    'time': np.random.exponential(scale=10, size=50),
    'event': np.random.binomial(1, 0.7, size=50),
    'group': 'Treatment B'
})
 
data = pd.concat([group_a, group_b], ignore_index=True)
 
# Plot both groups
fig, ax = plt.subplots(figsize=(10, 6))
 
colors = ['#2E86AB', '#A23B72']
for idx, (group_name, group_data) in enumerate(data.groupby('group')):
    kmf = KaplanMeierFitter()
    kmf.fit(
        durations=group_data['time'],
        event_observed=group_data['event'],
        label=group_name
    )
    kmf.plot_survival_function(ax=ax, ci_show=True, color=colors[idx])
 
ax.set_title('Survival Comparison: Treatment A vs B', fontsize=14)
ax.set_ylabel('Survival Probability')
ax.set_xlabel('Time (months)')
ax.legend(loc='best')
ax.grid(alpha=0.3)
 
plt.tight_layout()
plt.show()

Multiple Groups with At-Risk Table


from lifelines.plotting import add_at_risk_counts
 
# Create data with 3 groups
np.random.seed(42)
groups_data = []
 
for group_name, scale in [('Stage I', 20), ('Stage II', 12), ('Stage III', 8)]:
    df = pd.DataFrame({
        'time': np.random.exponential(scale=scale, size=40),
        'event': np.random.binomial(1, 0.65, size=40),
        'group': group_name
    })
    groups_data.append(df)
 
data = pd.concat(groups_data, ignore_index=True)
 
# Plot
fig, ax = plt.subplots(figsize=(12, 8))
 
kmf_list = []
colors = ['#06D6A0', '#FFD166', '#EF476F']
 
for idx, (group_name, group_data) in enumerate(data.groupby('group')):
    kmf = KaplanMeierFitter()
    kmf.fit(
        durations=group_data['time'],
        event_observed=group_data['event'],
        label=group_name
    )
    kmf.plot_survival_function(ax=ax, ci_show=False, color=colors[idx], linewidth=2.5)
    kmf_list.append(kmf)
 
# Add at-risk counts
add_at_risk_counts(*kmf_list, ax=ax)
 
ax.set_title('Survival by Cancer Stage', fontsize=14, fontweight='bold')
ax.set_ylabel('Survival Probability', fontsize=12)
ax.set_xlabel('Time (months)', fontsize=12)
ax.legend(loc='best', fontsize=10)
ax.grid(alpha=0.3)
 
plt.tight_layout()
plt.show()

Statistical Testing

Log-Rank Test

Test if survival curves differ significantly:


from lifelines.statistics import logrank_test
 
# Split by group
group_a_data = data[data['group'] == 'Treatment A']
group_b_data = data[data['group'] == 'Treatment B']
 
# Perform log-rank test
results = logrank_test(
    durations_A=group_a_data['time'],
    durations_B=group_b_data['time'],
    event_observed_A=group_a_data['event'],
    event_observed_B=group_b_data['event']
)
 
print("Log-Rank Test Results")
print("=" * 40)
print(f"Test statistic: {results.test_statistic:.4f}")
print(f"P-value: {results.p_value:.4f}")
print(f"Degrees of freedom: {results.degrees_of_freedom}")
 
if results.p_value < 0.05:
    print("\n✓ Significant difference (p < 0.05)")
else:
    print("\n✗ No significant difference (p ≥ 0.05)")
 
# Print summary
print(f"\n{results.summary}")

Multivariate Log-Rank Test

For more than two groups:


from lifelines.statistics import multivariate_logrank_test
 
# Test across all three stages
result = multivariate_logrank_test(
    data['time'],
    data['group'],
    data['event']
)
 
print("Multivariate Log-Rank Test")
print("=" * 40)
print(f"Test statistic: {result.test_statistic:.4f}")
print(f"P-value: {result.p_value:.4f}")
print(f"\n{result.summary}")

Advanced Visualizations

Survival with Events Marked


fig, ax = plt.subplots(figsize=(12, 6))
 
# Plot survival curve
kmf = KaplanMeierFitter()
kmf.fit(data['time'], data['event'])
kmf.plot_survival_function(ax=ax, ci_show=True, color='#2E86AB')
 
# Mark events
event_times = data[data['event'] == 1]['time']
event_survival = [kmf.predict(t) for t in event_times]
 
ax.scatter(event_times, event_survival, color='red', s=30, 
           alpha=0.6, label='Events', zorder=5)
 
ax.set_title('Survival Curve with Event Markers', fontsize=14)
ax.set_ylabel('Survival Probability')
ax.set_xlabel('Time (months)')
ax.legend()
ax.grid(alpha=0.3)
 
plt.tight_layout()
plt.show()

Median Survival Visualization


fig, ax = plt.subplots(figsize=(10, 6))
 
kmf = KaplanMeierFitter()
kmf.fit(data['time'], data['event'])
kmf.plot_survival_function(ax=ax, ci_show=True)
 
# Add median survival line
median_time = kmf.median_survival_time_
 
ax.axhline(y=0.5, color='red', linestyle='--', alpha=0.7, 
           label=f'Median Survival')
ax.axvline(x=median_time, color='red', linestyle='--', alpha=0.7)
 
# Annotate
ax.annotate(f'Median: {median_time:.1f} months', 
            xy=(median_time, 0.5),
            xytext=(median_time + 2, 0.45),
            fontsize=10,
            bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))
 
ax.set_title('Survival Curve with Median Survival Time', fontsize=14)
ax.set_ylabel('Survival Probability')
ax.set_xlabel('Time (months)')
ax.legend()
ax.grid(alpha=0.3)
 
plt.tight_layout()
plt.show()

Extracting Survival Estimates

At Specific Time Points


# Get survival estimates at specific times
time_points = [6, 12, 24, 36, 48, 60]
 
print("Survival Estimates")
print("=" * 50)
 
for t in time_points:
    survival_prob = kmf.predict(t)
    ci = kmf.confidence_interval_survival_function_.loc[
        kmf.confidence_interval_survival_function_.index <= t
    ].iloc[-1]
    
    print(f"At {t:2d} months: {survival_prob:.2%} "
          f"(95% CI: {ci.iloc[0]:.2%} - {ci.iloc[1]:.2%})")

Survival Table


# Create survival table
survival_table = kmf.survival_function_
confidence_intervals = kmf.confidence_interval_survival_function_
 
# Combine into single dataframe
table = pd.concat([survival_table, confidence_intervals], axis=1)
table.columns = ['Survival', 'CI_lower', 'CI_upper']
 
# Show key timepoints
key_times = [0, 10, 20, 30, 40, 50]
print("\nSurvival Table (Key Timepoints)")
print("=" * 60)
for t in key_times:
    if t in table.index:
        row = table.loc[t]
        print(f"{t:3d} months: {row['Survival']:.4f} "
              f"[{row['CI_lower']:.4f}, {row['CI_upper']:.4f}]")

Publication-Quality Plots


import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
 
fig, ax = plt.subplots(figsize=(10, 7))
 
# Plot survival curves for two groups
for idx, (group_name, group_data) in enumerate(data.groupby('group')):
    kmf = KaplanMeierFitter()
    kmf.fit(group_data['time'], group_data['event'], label=group_name)
    
    color = '#2E86AB' if idx == 0 else '#A23B72'
    kmf.plot_survival_function(
        ax=ax, 
        ci_show=True, 
        color=color,
        linewidth=2.5,
        alpha=0.8
    )
 
# Styling
ax.set_title('Overall Survival by Treatment Group', 
             fontsize=16, fontweight='bold', pad=20)
ax.set_ylabel('Survival Probability', fontsize=13, fontweight='bold')
ax.set_xlabel('Time (months)', fontsize=13, fontweight='bold')
ax.set_ylim([0, 1.05])
ax.grid(True, alpha=0.3, linestyle='--')
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
 
# Legend
ax.legend(loc='lower left', fontsize=11, frameon=True, 
          shadow=True, fancybox=True)
 
# Add p-value annotation
ax.text(0.98, 0.98, f'Log-rank p = {results.p_value:.4f}',
        transform=ax.transAxes,
        fontsize=11,
        verticalalignment='top',
        horizontalalignment='right',
        bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))
 
plt.tight_layout()
plt.savefig('km_curve_publication.png', dpi=300, bbox_inches='tight')
plt.show()

Best Practices:

Always show confidence intervals for uncertainty
Include number at risk table for transparency
Report median survival times with CIs
Use log-rank test for statistical comparison
Consider stratification for confounding variables

Common Pitfalls

Small Sample Sizes


# Check sample size in each group
print("Sample Sizes:")
for group_name, group_data in data.groupby('group'):
    n_total = len(group_data)
    n_events = group_data['event'].sum()
    n_censored = n_total - n_events
    
    print(f"{group_name}:")
    print(f"  Total: {n_total}")
    print(f"  Events: {n_events}")
    print(f"  Censored: {n_censored} ({n_censored/n_total:.1%})")

Groups with fewer than 10 events may produce unreliable estimates. High censoring rates (>50%) can also affect interpretation.

Crossing Curves

When curves cross, the log-rank test may not be appropriate:


# Check for crossing
# Visual inspection or use weighted log-rank tests
from lifelines.statistics import pairwise_logrank_test
 
# If curves cross, consider:
# 1. Different time periods
# 2. Weighted tests (Wilcoxon, Tarone-Ware)
# 3. Restricted mean survival time

Next Steps

Cox Regression - Adjust for covariates
Advanced Survival - Competing risks, time-varying covariates