Skip to Content
DocumentationGene Set Enrichment Analysis

Gene Set Enrichment Analysis (GSEA)

Gene Set Enrichment Analysis (GSEA) determines whether a predefined set of genes shows statistically significant, concordant differences between two biological states. Unlike individual gene analysis, GSEA considers entire pathways and functional groups.

Why GSEA?

Traditional differential expression analysis identifies individual genes, but:

  • May miss subtle but coordinated changes
  • Difficult to interpret long gene lists
  • Ignores biological relationships between genes

GSEA addresses these by:

  • Testing entire gene sets (pathways, GO terms)
  • Detecting modest changes across related genes
  • Providing biological context

Conceptual Overview

GSEA works by:

  1. Ranking all genes by differential expression
  2. Checking if pathway genes cluster at top/bottom of ranking
  3. Calculating enrichment score (ES)
  4. Assessing statistical significance

GSEA asks: “Are genes in this pathway randomly distributed in my ranked list, or do they cluster at the top (upregulated) or bottom (downregulated)?”

Setup

Install GSEApy

pip install gseapy pandas matplotlib seaborn

Import Libraries

import gseapy as gp from gseapy import barplot, dotplot import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns

Running GSEA

Method 1: Preranked GSEA

Use when you have a ranked gene list (e.g., from differential expression):

# Load differential expression results # Assume we have: gene, log2FoldChange, pvalue, padj dge_results = pd.read_csv('dge_results.csv') # Create ranked gene list # Rank by: sign(log2FC) * -log10(pvalue) dge_results['rank'] = ( np.sign(dge_results['log2FoldChange']) * -np.log10(dge_results['pvalue']) ) # Sort by rank dge_results = dge_results.sort_values('rank', ascending=False) # Create ranking dictionary gene_rank = dict(zip(dge_results['gene'], dge_results['rank'])) # Alternative: Simple log2FC ranking # gene_rank = dict(zip(dge_results['gene'], # dge_results['log2FoldChange'])) print(f"Total genes: {len(gene_rank)}") print("\nTop 10 upregulated:") print(list(gene_rank.items())[:10]) print("\nTop 10 downregulated:") print(list(gene_rank.items())[-10:])

Run preranked GSEA:

# Run GSEA preranked pre_res = gp.prerank( rnk=gene_rank, # Ranked gene list gene_sets='KEGG_2021_Human', # Gene set database threads=4, min_size=15, # Minimum genes in set max_size=500, # Maximum genes in set permutation_num=1000, # Permutations for p-value outdir='gsea_prerank_results', # Output directory seed=42, verbose=True ) # View results results_df = pre_res.res2d print(f"\nTotal pathways tested: {len(results_df)}") print(f"Significant pathways (FDR < 0.25): {(results_df['FDR q-val'] < 0.25).sum()}") # Top enriched pathways print("\nTop 10 Enriched Pathways:") print(results_df.nsmallest(10, 'NOM p-val')[ ['Term', 'ES', 'NES', 'NOM p-val', 'FDR q-val'] ])

Method 2: GSEA with Expression Matrix

Use when you have raw expression data:

# Load expression data # Rows = genes, Columns = samples expr_data = pd.read_csv('expression_matrix.csv', index_col=0) # Create phenotype labels phenotype = ['Control'] * 3 + ['Treatment'] * 3 # Run GSEA gs_res = gp.gsea( data=expr_data, # Expression dataframe gene_sets='KEGG_2021_Human', # Gene set database cls=phenotype, # Phenotype labels outdir='gsea_results', permutation_num=1000, min_size=15, max_size=500, threads=4, seed=42 ) # Results results_df = gs_res.res2d print(results_df.head(10))

Gene Set Databases

GSEApy supports many databases:

# Molecular Signatures Database (MSigDB) # Hallmark gene sets gp.prerank(rnk=gene_rank, gene_sets='MSigDB_Hallmark_2020') # C2: Curated gene sets gp.prerank(rnk=gene_rank, gene_sets='KEGG_2021_Human') gp.prerank(rnk=gene_rank, gene_sets='Reactome_2022') gp.prerank(rnk=gene_rank, gene_sets='WikiPathway_2021_Human') # C5: Gene Ontology gp.prerank(rnk=gene_rank, gene_sets='GO_Biological_Process_2021') gp.prerank(rnk=gene_rank, gene_sets='GO_Molecular_Function_2021') gp.prerank(rnk=gene_rank, gene_sets='GO_Cellular_Component_2021')

Visualizing Results

Enrichment Plot

# Plot enrichment for specific pathway from gseapy import gseaplot # Get top pathway top_pathway = results_df.iloc[0]['Term'] # Create enrichment plot gseaplot( rank_metric=gene_rank, term=top_pathway, **pre_res.results[top_pathway], ofname='enrichment_plot.png' )

Bar Plot

# Bar plot of top pathways from gseapy import barplot # Select significant pathways sig_results = results_df[results_df['FDR q-val'] < 0.25] # Create bar plot ax = barplot( sig_results, column="FDR q-val", group='group', # 'activated' or 'suppressed' size=5, title="Top Enriched Pathways", figsize=(8, 10), color=['#CD5C5C', '#5C9BD5'] ) plt.tight_layout() plt.savefig('gsea_barplot.png', dpi=300, bbox_inches='tight') plt.show()

Dot Plot

# Dot plot showing significance and gene ratio from gseapy import dotplot ax = dotplot( sig_results, column="FDR q-val", title='GSEA Results', size=10, figsize=(10, 12), cmap='viridis_r', ofname='gsea_dotplot.png' ) plt.tight_layout() plt.show()

Custom Heatmap of Enrichment

# Create custom visualization import seaborn as sns # Get top 20 pathways top_20 = results_df.nsmallest(20, 'FDR q-val') # Prepare data for plotting plot_data = top_20[['Term', 'NES', 'FDR q-val']].copy() plot_data['NES'] = plot_data['NES'].astype(float) plot_data['-log10(FDR)'] = -np.log10(plot_data['FDR q-val']) # Truncate pathway names plot_data['Term'] = plot_data['Term'].str[:50] # Create figure fig, ax = plt.subplots(figsize=(10, 8)) # Create bar plot colored by NES colors = ['#CD5C5C' if x > 0 else '#5C9BD5' for x in plot_data['NES']] bars = ax.barh(plot_data['Term'], plot_data['-log10(FDR)'], color=colors, alpha=0.7) # Add significance threshold line ax.axvline(x=-np.log10(0.25), color='red', linestyle='--', alpha=0.5, label='FDR = 0.25') ax.set_xlabel('-log10(FDR q-value)', fontsize=12) ax.set_title('Top Enriched Pathways', fontsize=14, fontweight='bold') ax.legend() ax.spines['top'].set_visible(False) ax.spines['right'].set_visible(False) plt.tight_layout() plt.savefig('gsea_custom_plot.png', dpi=300, bbox_inches='tight') plt.show()

Interpreting Results

Key Metrics

# Understanding the output for idx, row in results_df.head(5).iterrows(): print(f"\nPathway: {row['Term']}") print(f" ES (Enrichment Score): {row['ES']:.3f}") print(f" NES (Normalized ES): {row['NES']:.3f}") print(f" NOM p-val: {row['NOM p-val']:.4f}") print(f" FDR q-val: {row['FDR q-val']:.4f}") print(f" Leading edge: {row['Lead_genes'][:5]}...")

Metric Definitions:

  • ES (Enrichment Score): Degree of enrichment at extremes of ranking

    • Positive = enriched in upregulated genes
    • Negative = enriched in downregulated genes
  • NES (Normalized ES): ES normalized for gene set size

    • Allows comparison between different gene sets
  • NOM p-val: Nominal p-value from permutation test

    • Significance of enrichment
  • FDR q-val: False Discovery Rate

    • Adjusted for multiple testing
    • Threshold: FDR < 0.25 commonly used
  • Leading Edge: Genes contributing most to enrichment

Biological Interpretation

# Extract leading edge genes from top pathway top_pathway = results_df.iloc[0] leading_genes = top_pathway['Lead_genes'].split(';') print(f"Pathway: {top_pathway['Term']}") print(f"Direction: {'Activated' if top_pathway['NES'] > 0 else 'Suppressed'}") print(f"Significance: FDR = {top_pathway['FDR q-val']:.4f}") print(f"\nLeading edge genes ({len(leading_genes)}):") for gene in leading_genes[:10]: print(f" - {gene}")

Enrichment Analysis (Alternative)

Over-Representation Analysis (ORA)

Simpler alternative when you have a list of significant genes:

# Get significant genes sig_genes = dge_results[ (dge_results['padj'] < 0.05) & (abs(dge_results['log2FoldChange']) > 1) ]['gene'].tolist() print(f"Significant genes: {len(sig_genes)}") # Run enrichment enr_res = gp.enrichr( gene_list=sig_genes, gene_sets=['KEGG_2021_Human', 'GO_Biological_Process_2021', 'MSigDB_Hallmark_2020'], organism='Human', outdir='enrichr_results', cutoff=0.05 ) # View results print(enr_res.results.head(10))

Visualization

# Bar plot for ORA results from gseapy.plot import barplot barplot( enr_res.results, title='Enrichr Analysis', figsize=(8, 10) ) plt.tight_layout() plt.show()

Comparing Multiple Conditions

# Compare GSEA results across conditions conditions = ['Condition_A', 'Condition_B', 'Condition_C'] all_results = {} for condition in conditions: # Load ranking for this condition gene_rank = load_ranking(condition) # Your function # Run GSEA res = gp.prerank( rnk=gene_rank, gene_sets='KEGG_2021_Human', outdir=f'gsea_{condition}', permutation_num=1000 ) all_results[condition] = res.res2d # Compare top pathways across conditions # Create comparison heatmap pathway_comparison = pd.DataFrame() for condition, results in all_results.items(): # Get NES for top pathways top_pathways = results.nsmallest(20, 'FDR q-val') nes_series = top_pathways.set_index('Term')['NES'] pathway_comparison[condition] = nes_series # Plot plt.figure(figsize=(12, 10)) sns.heatmap( pathway_comparison, cmap='RdBu_r', center=0, vmin=-3, vmax=3, cbar_kws={'label': 'NES'} ) plt.title('Pathway Enrichment Across Conditions', fontsize=14, fontweight='bold') plt.tight_layout() plt.show()

Best Practices

GSEA Guidelines:

  1. Use all genes, not just significant ones (for preranked)
  2. Ensure adequate sample size (typically >6 per group)
  3. Use FDR < 0.25 as threshold (not 0.05)
  4. Consider multiple gene set databases
  5. Validate key findings with literature

Quality Control

# Check ranking distribution plt.figure(figsize=(10, 5)) plt.hist(list(gene_rank.values()), bins=50, edgecolor='black') plt.xlabel('Rank Metric') plt.ylabel('Number of Genes') plt.title('Distribution of Gene Rankings') plt.axvline(x=0, color='red', linestyle='--') plt.tight_layout() plt.show() # Check for appropriate spread print(f"Rank range: {min(gene_rank.values()):.2f} to {max(gene_rank.values()):.2f}") print(f"Median: {np.median(list(gene_rank.values())):.2f}")

Export Results

# Save results sig_pathways = results_df[results_df['FDR q-val'] < 0.25] # Full results results_df.to_csv('gsea_results_all.csv', index=False) # Significant only sig_pathways.to_csv('gsea_results_significant.csv', index=False) # Leading edge genes for top pathway top_genes = sig_pathways.iloc[0]['Lead_genes'].split(';') pd.DataFrame({'gene': top_genes}).to_csv( 'leading_edge_genes.csv', index=False )

Common Pitfalls:

  • Using only significant genes (preranked needs all genes)
  • Inappropriate ranking metric
  • Too stringent FDR cutoff (0.05 vs 0.25)
  • Ignoring direction of enrichment (NES sign)
  • Not validating with independent data

Next Steps

Last updated on