Kaplan-Meier Survival Curves
The Kaplan-Meier estimator is the gold standard for visualizing survival data. It provides a non-parametric estimate of the survival function from observed survival times.
Theory
The Kaplan-Meier survival estimate at time t is:
insert eqn here
Where:
- $d_i$ = number of events at time $t_i$
- $n_i$ = number at risk at time $t_i$
Basic Implementation
from lifelines import KaplanMeierFitter
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Sample data
np.random.seed(42)
n = 100
data = pd.DataFrame({
'time': np.random.exponential(scale=10, size=n),
'event': np.random.binomial(1, 0.6, size=n)
})
# Initialize and fit
kmf = KaplanMeierFitter()
kmf.fit(durations=data['time'], event_observed=data['event'])
# Plot
plt.figure(figsize=(10, 6))
kmf.plot_survival_function()
plt.title('Kaplan-Meier Survival Curve', fontsize=14, fontweight='bold')
plt.ylabel('Survival Probability', fontsize=12)
plt.xlabel('Time', fontsize=12)
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()Customizing Plots
Adding Confidence Intervals
fig, ax = plt.subplots(figsize=(10, 6))
# Plot with confidence intervals
kmf.plot_survival_function(
ax=ax,
ci_show=True, # Show confidence intervals
alpha=0.2, # CI transparency
color='#2E86AB',
linewidth=2.5
)
# Customize
ax.set_title('Kaplan-Meier Curve with 95% Confidence Intervals',
fontsize=14, fontweight='bold')
ax.set_ylabel('Survival Probability', fontsize=12)
ax.set_xlabel('Time (months)', fontsize=12)
ax.grid(alpha=0.3)
ax.set_ylim([0, 1.05])
plt.tight_layout()
plt.show()Showing Number at Risk
from lifelines.plotting import add_at_risk_counts
fig, ax = plt.subplots(figsize=(12, 7))
# Plot survival curve
kmf.plot_survival_function(ax=ax, ci_show=True)
# Add at-risk table below
add_at_risk_counts(kmf, ax=ax)
ax.set_title('Survival Curve with At-Risk Table', fontsize=14)
ax.set_ylabel('Survival Probability')
ax.set_xlabel('Time (months)')
plt.tight_layout()
plt.show()Comparing Multiple Groups
Two Groups
# Simulate data with two groups
np.random.seed(42)
group_a = pd.DataFrame({
'time': np.random.exponential(scale=15, size=50),
'event': np.random.binomial(1, 0.6, size=50),
'group': 'Treatment A'
})
group_b = pd.DataFrame({
'time': np.random.exponential(scale=10, size=50),
'event': np.random.binomial(1, 0.7, size=50),
'group': 'Treatment B'
})
data = pd.concat([group_a, group_b], ignore_index=True)
# Plot both groups
fig, ax = plt.subplots(figsize=(10, 6))
colors = ['#2E86AB', '#A23B72']
for idx, (group_name, group_data) in enumerate(data.groupby('group')):
kmf = KaplanMeierFitter()
kmf.fit(
durations=group_data['time'],
event_observed=group_data['event'],
label=group_name
)
kmf.plot_survival_function(ax=ax, ci_show=True, color=colors[idx])
ax.set_title('Survival Comparison: Treatment A vs B', fontsize=14)
ax.set_ylabel('Survival Probability')
ax.set_xlabel('Time (months)')
ax.legend(loc='best')
ax.grid(alpha=0.3)
plt.tight_layout()
plt.show()Multiple Groups with At-Risk Table
from lifelines.plotting import add_at_risk_counts
# Create data with 3 groups
np.random.seed(42)
groups_data = []
for group_name, scale in [('Stage I', 20), ('Stage II', 12), ('Stage III', 8)]:
df = pd.DataFrame({
'time': np.random.exponential(scale=scale, size=40),
'event': np.random.binomial(1, 0.65, size=40),
'group': group_name
})
groups_data.append(df)
data = pd.concat(groups_data, ignore_index=True)
# Plot
fig, ax = plt.subplots(figsize=(12, 8))
kmf_list = []
colors = ['#06D6A0', '#FFD166', '#EF476F']
for idx, (group_name, group_data) in enumerate(data.groupby('group')):
kmf = KaplanMeierFitter()
kmf.fit(
durations=group_data['time'],
event_observed=group_data['event'],
label=group_name
)
kmf.plot_survival_function(ax=ax, ci_show=False, color=colors[idx], linewidth=2.5)
kmf_list.append(kmf)
# Add at-risk counts
add_at_risk_counts(*kmf_list, ax=ax)
ax.set_title('Survival by Cancer Stage', fontsize=14, fontweight='bold')
ax.set_ylabel('Survival Probability', fontsize=12)
ax.set_xlabel('Time (months)', fontsize=12)
ax.legend(loc='best', fontsize=10)
ax.grid(alpha=0.3)
plt.tight_layout()
plt.show()Statistical Testing
Log-Rank Test
Test if survival curves differ significantly:
from lifelines.statistics import logrank_test
# Split by group
group_a_data = data[data['group'] == 'Treatment A']
group_b_data = data[data['group'] == 'Treatment B']
# Perform log-rank test
results = logrank_test(
durations_A=group_a_data['time'],
durations_B=group_b_data['time'],
event_observed_A=group_a_data['event'],
event_observed_B=group_b_data['event']
)
print("Log-Rank Test Results")
print("=" * 40)
print(f"Test statistic: {results.test_statistic:.4f}")
print(f"P-value: {results.p_value:.4f}")
print(f"Degrees of freedom: {results.degrees_of_freedom}")
if results.p_value < 0.05:
print("\n✓ Significant difference (p < 0.05)")
else:
print("\n✗ No significant difference (p ≥ 0.05)")
# Print summary
print(f"\n{results.summary}")Multivariate Log-Rank Test
For more than two groups:
from lifelines.statistics import multivariate_logrank_test
# Test across all three stages
result = multivariate_logrank_test(
data['time'],
data['group'],
data['event']
)
print("Multivariate Log-Rank Test")
print("=" * 40)
print(f"Test statistic: {result.test_statistic:.4f}")
print(f"P-value: {result.p_value:.4f}")
print(f"\n{result.summary}")Advanced Visualizations
Survival with Events Marked
fig, ax = plt.subplots(figsize=(12, 6))
# Plot survival curve
kmf = KaplanMeierFitter()
kmf.fit(data['time'], data['event'])
kmf.plot_survival_function(ax=ax, ci_show=True, color='#2E86AB')
# Mark events
event_times = data[data['event'] == 1]['time']
event_survival = [kmf.predict(t) for t in event_times]
ax.scatter(event_times, event_survival, color='red', s=30,
alpha=0.6, label='Events', zorder=5)
ax.set_title('Survival Curve with Event Markers', fontsize=14)
ax.set_ylabel('Survival Probability')
ax.set_xlabel('Time (months)')
ax.legend()
ax.grid(alpha=0.3)
plt.tight_layout()
plt.show()Median Survival Visualization
fig, ax = plt.subplots(figsize=(10, 6))
kmf = KaplanMeierFitter()
kmf.fit(data['time'], data['event'])
kmf.plot_survival_function(ax=ax, ci_show=True)
# Add median survival line
median_time = kmf.median_survival_time_
ax.axhline(y=0.5, color='red', linestyle='--', alpha=0.7,
label=f'Median Survival')
ax.axvline(x=median_time, color='red', linestyle='--', alpha=0.7)
# Annotate
ax.annotate(f'Median: {median_time:.1f} months',
xy=(median_time, 0.5),
xytext=(median_time + 2, 0.45),
fontsize=10,
bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))
ax.set_title('Survival Curve with Median Survival Time', fontsize=14)
ax.set_ylabel('Survival Probability')
ax.set_xlabel('Time (months)')
ax.legend()
ax.grid(alpha=0.3)
plt.tight_layout()
plt.show()Extracting Survival Estimates
At Specific Time Points
# Get survival estimates at specific times
time_points = [6, 12, 24, 36, 48, 60]
print("Survival Estimates")
print("=" * 50)
for t in time_points:
survival_prob = kmf.predict(t)
ci = kmf.confidence_interval_survival_function_.loc[
kmf.confidence_interval_survival_function_.index <= t
].iloc[-1]
print(f"At {t:2d} months: {survival_prob:.2%} "
f"(95% CI: {ci.iloc[0]:.2%} - {ci.iloc[1]:.2%})")Survival Table
# Create survival table
survival_table = kmf.survival_function_
confidence_intervals = kmf.confidence_interval_survival_function_
# Combine into single dataframe
table = pd.concat([survival_table, confidence_intervals], axis=1)
table.columns = ['Survival', 'CI_lower', 'CI_upper']
# Show key timepoints
key_times = [0, 10, 20, 30, 40, 50]
print("\nSurvival Table (Key Timepoints)")
print("=" * 60)
for t in key_times:
if t in table.index:
row = table.loc[t]
print(f"{t:3d} months: {row['Survival']:.4f} "
f"[{row['CI_lower']:.4f}, {row['CI_upper']:.4f}]")Publication-Quality Plots
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
fig, ax = plt.subplots(figsize=(10, 7))
# Plot survival curves for two groups
for idx, (group_name, group_data) in enumerate(data.groupby('group')):
kmf = KaplanMeierFitter()
kmf.fit(group_data['time'], group_data['event'], label=group_name)
color = '#2E86AB' if idx == 0 else '#A23B72'
kmf.plot_survival_function(
ax=ax,
ci_show=True,
color=color,
linewidth=2.5,
alpha=0.8
)
# Styling
ax.set_title('Overall Survival by Treatment Group',
fontsize=16, fontweight='bold', pad=20)
ax.set_ylabel('Survival Probability', fontsize=13, fontweight='bold')
ax.set_xlabel('Time (months)', fontsize=13, fontweight='bold')
ax.set_ylim([0, 1.05])
ax.grid(True, alpha=0.3, linestyle='--')
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
# Legend
ax.legend(loc='lower left', fontsize=11, frameon=True,
shadow=True, fancybox=True)
# Add p-value annotation
ax.text(0.98, 0.98, f'Log-rank p = {results.p_value:.4f}',
transform=ax.transAxes,
fontsize=11,
verticalalignment='top',
horizontalalignment='right',
bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))
plt.tight_layout()
plt.savefig('km_curve_publication.png', dpi=300, bbox_inches='tight')
plt.show()Best Practices:
- Always show confidence intervals for uncertainty
- Include number at risk table for transparency
- Report median survival times with CIs
- Use log-rank test for statistical comparison
- Consider stratification for confounding variables
Common Pitfalls
Small Sample Sizes
# Check sample size in each group
print("Sample Sizes:")
for group_name, group_data in data.groupby('group'):
n_total = len(group_data)
n_events = group_data['event'].sum()
n_censored = n_total - n_events
print(f"{group_name}:")
print(f" Total: {n_total}")
print(f" Events: {n_events}")
print(f" Censored: {n_censored} ({n_censored/n_total:.1%})")Groups with fewer than 10 events may produce unreliable estimates. High censoring rates (>50%) can also affect interpretation.
Crossing Curves
When curves cross, the log-rank test may not be appropriate:
# Check for crossing
# Visual inspection or use weighted log-rank tests
from lifelines.statistics import pairwise_logrank_test
# If curves cross, consider:
# 1. Different time periods
# 2. Weighted tests (Wilcoxon, Tarone-Ware)
# 3. Restricted mean survival timeNext Steps
- Cox Regression - Adjust for covariates
- Advanced Survival - Competing risks, time-varying covariates
Last updated on