Gene Set Enrichment Analysis (GSEA)

Gene Set Enrichment Analysis (GSEA) determines whether a predefined set of genes shows statistically significant, concordant differences between two biological states. Unlike individual gene analysis, GSEA considers entire pathways and functional groups.

Why GSEA?

Traditional differential expression analysis identifies individual genes, but:

May miss subtle but coordinated changes
Difficult to interpret long gene lists
Ignores biological relationships between genes

GSEA addresses these by:

Testing entire gene sets (pathways, GO terms)
Detecting modest changes across related genes
Providing biological context

Conceptual Overview

GSEA works by:

Ranking all genes by differential expression
Checking if pathway genes cluster at top/bottom of ranking
Calculating enrichment score (ES)
Assessing statistical significance

GSEA asks: “Are genes in this pathway randomly distributed in my ranked list, or do they cluster at the top (upregulated) or bottom (downregulated)?”

Setup

Install GSEApy


pip install gseapy pandas matplotlib seaborn

Import Libraries


import gseapy as gp
from gseapy import barplot, dotplot
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

Running GSEA

Method 1: Preranked GSEA

Use when you have a ranked gene list (e.g., from differential expression):


# Load differential expression results
# Assume we have: gene, log2FoldChange, pvalue, padj
dge_results = pd.read_csv('dge_results.csv')
 
# Create ranked gene list
# Rank by: sign(log2FC) * -log10(pvalue)
dge_results['rank'] = (
    np.sign(dge_results['log2FoldChange']) * 
    -np.log10(dge_results['pvalue'])
)
 
# Sort by rank
dge_results = dge_results.sort_values('rank', ascending=False)
 
# Create ranking dictionary
gene_rank = dict(zip(dge_results['gene'], dge_results['rank']))
 
# Alternative: Simple log2FC ranking
# gene_rank = dict(zip(dge_results['gene'], 
#                      dge_results['log2FoldChange']))
 
print(f"Total genes: {len(gene_rank)}")
print("\nTop 10 upregulated:")
print(list(gene_rank.items())[:10])
print("\nTop 10 downregulated:")
print(list(gene_rank.items())[-10:])

Run preranked GSEA:


# Run GSEA preranked
pre_res = gp.prerank(
    rnk=gene_rank,                    # Ranked gene list
    gene_sets='KEGG_2021_Human',      # Gene set database
    threads=4,
    min_size=15,                       # Minimum genes in set
    max_size=500,                      # Maximum genes in set
    permutation_num=1000,              # Permutations for p-value
    outdir='gsea_prerank_results',    # Output directory
    seed=42,
    verbose=True
)
 
# View results
results_df = pre_res.res2d
print(f"\nTotal pathways tested: {len(results_df)}")
print(f"Significant pathways (FDR < 0.25): {(results_df['FDR q-val'] < 0.25).sum()}")
 
# Top enriched pathways
print("\nTop 10 Enriched Pathways:")
print(results_df.nsmallest(10, 'NOM p-val')[
    ['Term', 'ES', 'NES', 'NOM p-val', 'FDR q-val']
])

Method 2: GSEA with Expression Matrix

Use when you have raw expression data:


# Load expression data
# Rows = genes, Columns = samples
expr_data = pd.read_csv('expression_matrix.csv', index_col=0)
 
# Create phenotype labels
phenotype = ['Control'] * 3 + ['Treatment'] * 3
 
# Run GSEA
gs_res = gp.gsea(
    data=expr_data,                   # Expression dataframe
    gene_sets='KEGG_2021_Human',      # Gene set database
    cls=phenotype,                     # Phenotype labels
    outdir='gsea_results',
    permutation_num=1000,
    min_size=15,
    max_size=500,
    threads=4,
    seed=42
)
 
# Results
results_df = gs_res.res2d
print(results_df.head(10))

Gene Set Databases

GSEApy supports many databases:

MSigDB


# Molecular Signatures Database (MSigDB)
# Hallmark gene sets
gp.prerank(rnk=gene_rank, gene_sets='MSigDB_Hallmark_2020')
 
# C2: Curated gene sets
gp.prerank(rnk=gene_rank, gene_sets='KEGG_2021_Human')
gp.prerank(rnk=gene_rank, gene_sets='Reactome_2022')
gp.prerank(rnk=gene_rank, gene_sets='WikiPathway_2021_Human')
 
# C5: Gene Ontology
gp.prerank(rnk=gene_rank, gene_sets='GO_Biological_Process_2021')
gp.prerank(rnk=gene_rank, gene_sets='GO_Molecular_Function_2021')
gp.prerank(rnk=gene_rank, gene_sets='GO_Cellular_Component_2021')

GO


# Gene Ontology databases
gene_sets = [
    'GO_Biological_Process_2021',
    'GO_Molecular_Function_2021',
    'GO_Cellular_Component_2021'
]
 
for gs in gene_sets:
    print(f"Running GSEA with {gs}...")
    res = gp.prerank(
        rnk=gene_rank,
        gene_sets=gs,
        outdir=f'gsea_{gs}',
        permutation_num=1000
    )

KEGG


# KEGG pathways
kegg_dbs = [
    'KEGG_2021_Human',
    'KEGG_2019_Human',
]
 
res = gp.prerank(
    rnk=gene_rank,
    gene_sets='KEGG_2021_Human',
    outdir='gsea_kegg'
)

Custom


# Use custom gene sets
# Format: dict or GMT file
 
custom_gene_sets = {
    'My_Pathway_1': ['GENE1', 'GENE2', 'GENE3', 'GENE4'],
    'My_Pathway_2': ['GENE5', 'GENE6', 'GENE7'],
    'Custom_Signature': ['GENEX', 'GENEY', 'GENEZ']
}
 
res = gp.prerank(
    rnk=gene_rank,
    gene_sets=custom_gene_sets,
    outdir='gsea_custom'
)
 
# Or load from GMT file
# gene_sets='path/to/custom.gmt'

Visualizing Results

Enrichment Plot


# Plot enrichment for specific pathway
from gseapy import gseaplot
 
# Get top pathway
top_pathway = results_df.iloc[0]['Term']
 
# Create enrichment plot
gseaplot(
    rank_metric=gene_rank,
    term=top_pathway,
    **pre_res.results[top_pathway],
    ofname='enrichment_plot.png'
)

Bar Plot


# Bar plot of top pathways
from gseapy import barplot
 
# Select significant pathways
sig_results = results_df[results_df['FDR q-val'] < 0.25]
 
# Create bar plot
ax = barplot(
    sig_results,
    column="FDR q-val",
    group='group',  # 'activated' or 'suppressed'
    size=5,
    title="Top Enriched Pathways",
    figsize=(8, 10),
    color=['#CD5C5C', '#5C9BD5']
)
plt.tight_layout()
plt.savefig('gsea_barplot.png', dpi=300, bbox_inches='tight')
plt.show()

Dot Plot


# Dot plot showing significance and gene ratio
from gseapy import dotplot
 
ax = dotplot(
    sig_results,
    column="FDR q-val",
    title='GSEA Results',
    size=10,
    figsize=(10, 12),
    cmap='viridis_r',
    ofname='gsea_dotplot.png'
)
plt.tight_layout()
plt.show()

Custom Heatmap of Enrichment


# Create custom visualization
import seaborn as sns
 
# Get top 20 pathways
top_20 = results_df.nsmallest(20, 'FDR q-val')
 
# Prepare data for plotting
plot_data = top_20[['Term', 'NES', 'FDR q-val']].copy()
plot_data['NES'] = plot_data['NES'].astype(float)
plot_data['-log10(FDR)'] = -np.log10(plot_data['FDR q-val'])
 
# Truncate pathway names
plot_data['Term'] = plot_data['Term'].str[:50]
 
# Create figure
fig, ax = plt.subplots(figsize=(10, 8))
 
# Create bar plot colored by NES
colors = ['#CD5C5C' if x > 0 else '#5C9BD5' for x in plot_data['NES']]
bars = ax.barh(plot_data['Term'], plot_data['-log10(FDR)'], color=colors, alpha=0.7)
 
# Add significance threshold line
ax.axvline(x=-np.log10(0.25), color='red', linestyle='--', 
           alpha=0.5, label='FDR = 0.25')
 
ax.set_xlabel('-log10(FDR q-value)', fontsize=12)
ax.set_title('Top Enriched Pathways', fontsize=14, fontweight='bold')
ax.legend()
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
 
plt.tight_layout()
plt.savefig('gsea_custom_plot.png', dpi=300, bbox_inches='tight')
plt.show()

Interpreting Results

Key Metrics


# Understanding the output
for idx, row in results_df.head(5).iterrows():
    print(f"\nPathway: {row['Term']}")
    print(f"  ES (Enrichment Score): {row['ES']:.3f}")
    print(f"  NES (Normalized ES): {row['NES']:.3f}")
    print(f"  NOM p-val: {row['NOM p-val']:.4f}")
    print(f"  FDR q-val: {row['FDR q-val']:.4f}")
    print(f"  Leading edge: {row['Lead_genes'][:5]}...")

Metric Definitions:

ES (Enrichment Score): Degree of enrichment at extremes of ranking
- Positive = enriched in upregulated genes
- Negative = enriched in downregulated genes
NES (Normalized ES): ES normalized for gene set size
- Allows comparison between different gene sets
NOM p-val: Nominal p-value from permutation test
- Significance of enrichment
FDR q-val: False Discovery Rate
- Adjusted for multiple testing
- Threshold: FDR < 0.25 commonly used
Leading Edge: Genes contributing most to enrichment

Biological Interpretation


# Extract leading edge genes from top pathway
top_pathway = results_df.iloc[0]
leading_genes = top_pathway['Lead_genes'].split(';')
 
print(f"Pathway: {top_pathway['Term']}")
print(f"Direction: {'Activated' if top_pathway['NES'] > 0 else 'Suppressed'}")
print(f"Significance: FDR = {top_pathway['FDR q-val']:.4f}")
print(f"\nLeading edge genes ({len(leading_genes)}):")
for gene in leading_genes[:10]:
    print(f"  - {gene}")

Enrichment Analysis (Alternative)

Over-Representation Analysis (ORA)

Simpler alternative when you have a list of significant genes:


# Get significant genes
sig_genes = dge_results[
    (dge_results['padj'] < 0.05) & 
    (abs(dge_results['log2FoldChange']) > 1)
]['gene'].tolist()
 
print(f"Significant genes: {len(sig_genes)}")
 
# Run enrichment
enr_res = gp.enrichr(
    gene_list=sig_genes,
    gene_sets=['KEGG_2021_Human', 
               'GO_Biological_Process_2021',
               'MSigDB_Hallmark_2020'],
    organism='Human',
    outdir='enrichr_results',
    cutoff=0.05
)
 
# View results
print(enr_res.results.head(10))

Visualization


# Bar plot for ORA results
from gseapy.plot import barplot
 
barplot(
    enr_res.results,
    title='Enrichr Analysis',
    figsize=(8, 10)
)
plt.tight_layout()
plt.show()

Comparing Multiple Conditions


# Compare GSEA results across conditions
conditions = ['Condition_A', 'Condition_B', 'Condition_C']
all_results = {}
 
for condition in conditions:
    # Load ranking for this condition
    gene_rank = load_ranking(condition)  # Your function
    
    # Run GSEA
    res = gp.prerank(
        rnk=gene_rank,
        gene_sets='KEGG_2021_Human',
        outdir=f'gsea_{condition}',
        permutation_num=1000
    )
    
    all_results[condition] = res.res2d
 
# Compare top pathways across conditions
# Create comparison heatmap
pathway_comparison = pd.DataFrame()
 
for condition, results in all_results.items():
    # Get NES for top pathways
    top_pathways = results.nsmallest(20, 'FDR q-val')
    nes_series = top_pathways.set_index('Term')['NES']
    pathway_comparison[condition] = nes_series
 
# Plot
plt.figure(figsize=(12, 10))
sns.heatmap(
    pathway_comparison,
    cmap='RdBu_r',
    center=0,
    vmin=-3,
    vmax=3,
    cbar_kws={'label': 'NES'}
)
plt.title('Pathway Enrichment Across Conditions', 
          fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

Best Practices

GSEA Guidelines:

Use all genes, not just significant ones (for preranked)
Ensure adequate sample size (typically >6 per group)
Use FDR < 0.25 as threshold (not 0.05)
Consider multiple gene set databases
Validate key findings with literature

Quality Control


# Check ranking distribution
plt.figure(figsize=(10, 5))
plt.hist(list(gene_rank.values()), bins=50, edgecolor='black')
plt.xlabel('Rank Metric')
plt.ylabel('Number of Genes')
plt.title('Distribution of Gene Rankings')
plt.axvline(x=0, color='red', linestyle='--')
plt.tight_layout()
plt.show()
 
# Check for appropriate spread
print(f"Rank range: {min(gene_rank.values()):.2f} to {max(gene_rank.values()):.2f}")
print(f"Median: {np.median(list(gene_rank.values())):.2f}")

Export Results


# Save results
sig_pathways = results_df[results_df['FDR q-val'] < 0.25]
 
# Full results
results_df.to_csv('gsea_results_all.csv', index=False)
 
# Significant only
sig_pathways.to_csv('gsea_results_significant.csv', index=False)
 
# Leading edge genes for top pathway
top_genes = sig_pathways.iloc[0]['Lead_genes'].split(';')
pd.DataFrame({'gene': top_genes}).to_csv(
    'leading_edge_genes.csv', 
    index=False
)

Common Pitfalls:

Using only significant genes (preranked needs all genes)
Inappropriate ranking metric
Too stringent FDR cutoff (0.05 vs 0.25)
Ignoring direction of enrichment (NES sign)
Not validating with independent data

Next Steps

Pathway Analysis - Detailed pathway exploration
GO Enrichment - Gene Ontology analysis
Network Visualization - Visualize pathway networks