Gene Set Enrichment Analysis (GSEA)
Gene Set Enrichment Analysis (GSEA) determines whether a predefined set of genes shows statistically significant, concordant differences between two biological states. Unlike individual gene analysis, GSEA considers entire pathways and functional groups.
Why GSEA?
Traditional differential expression analysis identifies individual genes, but:
- May miss subtle but coordinated changes
- Difficult to interpret long gene lists
- Ignores biological relationships between genes
GSEA addresses these by:
- Testing entire gene sets (pathways, GO terms)
- Detecting modest changes across related genes
- Providing biological context
Conceptual Overview
GSEA works by:
- Ranking all genes by differential expression
- Checking if pathway genes cluster at top/bottom of ranking
- Calculating enrichment score (ES)
- Assessing statistical significance
GSEA asks: “Are genes in this pathway randomly distributed in my ranked list, or do they cluster at the top (upregulated) or bottom (downregulated)?”
Setup
Install GSEApy
pip install gseapy pandas matplotlib seabornImport Libraries
import gseapy as gp
from gseapy import barplot, dotplot
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as snsRunning GSEA
Method 1: Preranked GSEA
Use when you have a ranked gene list (e.g., from differential expression):
# Load differential expression results
# Assume we have: gene, log2FoldChange, pvalue, padj
dge_results = pd.read_csv('dge_results.csv')
# Create ranked gene list
# Rank by: sign(log2FC) * -log10(pvalue)
dge_results['rank'] = (
np.sign(dge_results['log2FoldChange']) *
-np.log10(dge_results['pvalue'])
)
# Sort by rank
dge_results = dge_results.sort_values('rank', ascending=False)
# Create ranking dictionary
gene_rank = dict(zip(dge_results['gene'], dge_results['rank']))
# Alternative: Simple log2FC ranking
# gene_rank = dict(zip(dge_results['gene'],
# dge_results['log2FoldChange']))
print(f"Total genes: {len(gene_rank)}")
print("\nTop 10 upregulated:")
print(list(gene_rank.items())[:10])
print("\nTop 10 downregulated:")
print(list(gene_rank.items())[-10:])Run preranked GSEA:
# Run GSEA preranked
pre_res = gp.prerank(
rnk=gene_rank, # Ranked gene list
gene_sets='KEGG_2021_Human', # Gene set database
threads=4,
min_size=15, # Minimum genes in set
max_size=500, # Maximum genes in set
permutation_num=1000, # Permutations for p-value
outdir='gsea_prerank_results', # Output directory
seed=42,
verbose=True
)
# View results
results_df = pre_res.res2d
print(f"\nTotal pathways tested: {len(results_df)}")
print(f"Significant pathways (FDR < 0.25): {(results_df['FDR q-val'] < 0.25).sum()}")
# Top enriched pathways
print("\nTop 10 Enriched Pathways:")
print(results_df.nsmallest(10, 'NOM p-val')[
['Term', 'ES', 'NES', 'NOM p-val', 'FDR q-val']
])Method 2: GSEA with Expression Matrix
Use when you have raw expression data:
# Load expression data
# Rows = genes, Columns = samples
expr_data = pd.read_csv('expression_matrix.csv', index_col=0)
# Create phenotype labels
phenotype = ['Control'] * 3 + ['Treatment'] * 3
# Run GSEA
gs_res = gp.gsea(
data=expr_data, # Expression dataframe
gene_sets='KEGG_2021_Human', # Gene set database
cls=phenotype, # Phenotype labels
outdir='gsea_results',
permutation_num=1000,
min_size=15,
max_size=500,
threads=4,
seed=42
)
# Results
results_df = gs_res.res2d
print(results_df.head(10))Gene Set Databases
GSEApy supports many databases:
MSigDB
# Molecular Signatures Database (MSigDB)
# Hallmark gene sets
gp.prerank(rnk=gene_rank, gene_sets='MSigDB_Hallmark_2020')
# C2: Curated gene sets
gp.prerank(rnk=gene_rank, gene_sets='KEGG_2021_Human')
gp.prerank(rnk=gene_rank, gene_sets='Reactome_2022')
gp.prerank(rnk=gene_rank, gene_sets='WikiPathway_2021_Human')
# C5: Gene Ontology
gp.prerank(rnk=gene_rank, gene_sets='GO_Biological_Process_2021')
gp.prerank(rnk=gene_rank, gene_sets='GO_Molecular_Function_2021')
gp.prerank(rnk=gene_rank, gene_sets='GO_Cellular_Component_2021')Visualizing Results
Enrichment Plot
# Plot enrichment for specific pathway
from gseapy import gseaplot
# Get top pathway
top_pathway = results_df.iloc[0]['Term']
# Create enrichment plot
gseaplot(
rank_metric=gene_rank,
term=top_pathway,
**pre_res.results[top_pathway],
ofname='enrichment_plot.png'
)Bar Plot
# Bar plot of top pathways
from gseapy import barplot
# Select significant pathways
sig_results = results_df[results_df['FDR q-val'] < 0.25]
# Create bar plot
ax = barplot(
sig_results,
column="FDR q-val",
group='group', # 'activated' or 'suppressed'
size=5,
title="Top Enriched Pathways",
figsize=(8, 10),
color=['#CD5C5C', '#5C9BD5']
)
plt.tight_layout()
plt.savefig('gsea_barplot.png', dpi=300, bbox_inches='tight')
plt.show()Dot Plot
# Dot plot showing significance and gene ratio
from gseapy import dotplot
ax = dotplot(
sig_results,
column="FDR q-val",
title='GSEA Results',
size=10,
figsize=(10, 12),
cmap='viridis_r',
ofname='gsea_dotplot.png'
)
plt.tight_layout()
plt.show()Custom Heatmap of Enrichment
# Create custom visualization
import seaborn as sns
# Get top 20 pathways
top_20 = results_df.nsmallest(20, 'FDR q-val')
# Prepare data for plotting
plot_data = top_20[['Term', 'NES', 'FDR q-val']].copy()
plot_data['NES'] = plot_data['NES'].astype(float)
plot_data['-log10(FDR)'] = -np.log10(plot_data['FDR q-val'])
# Truncate pathway names
plot_data['Term'] = plot_data['Term'].str[:50]
# Create figure
fig, ax = plt.subplots(figsize=(10, 8))
# Create bar plot colored by NES
colors = ['#CD5C5C' if x > 0 else '#5C9BD5' for x in plot_data['NES']]
bars = ax.barh(plot_data['Term'], plot_data['-log10(FDR)'], color=colors, alpha=0.7)
# Add significance threshold line
ax.axvline(x=-np.log10(0.25), color='red', linestyle='--',
alpha=0.5, label='FDR = 0.25')
ax.set_xlabel('-log10(FDR q-value)', fontsize=12)
ax.set_title('Top Enriched Pathways', fontsize=14, fontweight='bold')
ax.legend()
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
plt.tight_layout()
plt.savefig('gsea_custom_plot.png', dpi=300, bbox_inches='tight')
plt.show()Interpreting Results
Key Metrics
# Understanding the output
for idx, row in results_df.head(5).iterrows():
print(f"\nPathway: {row['Term']}")
print(f" ES (Enrichment Score): {row['ES']:.3f}")
print(f" NES (Normalized ES): {row['NES']:.3f}")
print(f" NOM p-val: {row['NOM p-val']:.4f}")
print(f" FDR q-val: {row['FDR q-val']:.4f}")
print(f" Leading edge: {row['Lead_genes'][:5]}...")Metric Definitions:
-
ES (Enrichment Score): Degree of enrichment at extremes of ranking
- Positive = enriched in upregulated genes
- Negative = enriched in downregulated genes
-
NES (Normalized ES): ES normalized for gene set size
- Allows comparison between different gene sets
-
NOM p-val: Nominal p-value from permutation test
- Significance of enrichment
-
FDR q-val: False Discovery Rate
- Adjusted for multiple testing
- Threshold: FDR < 0.25 commonly used
-
Leading Edge: Genes contributing most to enrichment
Biological Interpretation
# Extract leading edge genes from top pathway
top_pathway = results_df.iloc[0]
leading_genes = top_pathway['Lead_genes'].split(';')
print(f"Pathway: {top_pathway['Term']}")
print(f"Direction: {'Activated' if top_pathway['NES'] > 0 else 'Suppressed'}")
print(f"Significance: FDR = {top_pathway['FDR q-val']:.4f}")
print(f"\nLeading edge genes ({len(leading_genes)}):")
for gene in leading_genes[:10]:
print(f" - {gene}")Enrichment Analysis (Alternative)
Over-Representation Analysis (ORA)
Simpler alternative when you have a list of significant genes:
# Get significant genes
sig_genes = dge_results[
(dge_results['padj'] < 0.05) &
(abs(dge_results['log2FoldChange']) > 1)
]['gene'].tolist()
print(f"Significant genes: {len(sig_genes)}")
# Run enrichment
enr_res = gp.enrichr(
gene_list=sig_genes,
gene_sets=['KEGG_2021_Human',
'GO_Biological_Process_2021',
'MSigDB_Hallmark_2020'],
organism='Human',
outdir='enrichr_results',
cutoff=0.05
)
# View results
print(enr_res.results.head(10))Visualization
# Bar plot for ORA results
from gseapy.plot import barplot
barplot(
enr_res.results,
title='Enrichr Analysis',
figsize=(8, 10)
)
plt.tight_layout()
plt.show()Comparing Multiple Conditions
# Compare GSEA results across conditions
conditions = ['Condition_A', 'Condition_B', 'Condition_C']
all_results = {}
for condition in conditions:
# Load ranking for this condition
gene_rank = load_ranking(condition) # Your function
# Run GSEA
res = gp.prerank(
rnk=gene_rank,
gene_sets='KEGG_2021_Human',
outdir=f'gsea_{condition}',
permutation_num=1000
)
all_results[condition] = res.res2d
# Compare top pathways across conditions
# Create comparison heatmap
pathway_comparison = pd.DataFrame()
for condition, results in all_results.items():
# Get NES for top pathways
top_pathways = results.nsmallest(20, 'FDR q-val')
nes_series = top_pathways.set_index('Term')['NES']
pathway_comparison[condition] = nes_series
# Plot
plt.figure(figsize=(12, 10))
sns.heatmap(
pathway_comparison,
cmap='RdBu_r',
center=0,
vmin=-3,
vmax=3,
cbar_kws={'label': 'NES'}
)
plt.title('Pathway Enrichment Across Conditions',
fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()Best Practices
GSEA Guidelines:
- Use all genes, not just significant ones (for preranked)
- Ensure adequate sample size (typically >6 per group)
- Use FDR < 0.25 as threshold (not 0.05)
- Consider multiple gene set databases
- Validate key findings with literature
Quality Control
# Check ranking distribution
plt.figure(figsize=(10, 5))
plt.hist(list(gene_rank.values()), bins=50, edgecolor='black')
plt.xlabel('Rank Metric')
plt.ylabel('Number of Genes')
plt.title('Distribution of Gene Rankings')
plt.axvline(x=0, color='red', linestyle='--')
plt.tight_layout()
plt.show()
# Check for appropriate spread
print(f"Rank range: {min(gene_rank.values()):.2f} to {max(gene_rank.values()):.2f}")
print(f"Median: {np.median(list(gene_rank.values())):.2f}")Export Results
# Save results
sig_pathways = results_df[results_df['FDR q-val'] < 0.25]
# Full results
results_df.to_csv('gsea_results_all.csv', index=False)
# Significant only
sig_pathways.to_csv('gsea_results_significant.csv', index=False)
# Leading edge genes for top pathway
top_genes = sig_pathways.iloc[0]['Lead_genes'].split(';')
pd.DataFrame({'gene': top_genes}).to_csv(
'leading_edge_genes.csv',
index=False
)Common Pitfalls:
- Using only significant genes (preranked needs all genes)
- Inappropriate ranking metric
- Too stringent FDR cutoff (0.05 vs 0.25)
- Ignoring direction of enrichment (NES sign)
- Not validating with independent data
Next Steps
- Pathway Analysis - Detailed pathway exploration
- GO Enrichment - Gene Ontology analysis
- Network Visualization - Visualize pathway networks