Skip to Content
DocumentationDGE Analysis

Differential Gene Expression Analysis

Differential Gene Expression (DGE) analysis identifies genes with significantly different expression levels between experimental conditions. This is fundamental for understanding biological processes, disease mechanisms, and treatment responses.

Overview

DGE analysis answers questions like:

  • Which genes are upregulated or downregulated in disease vs. healthy tissue?
  • How does gene expression change with drug treatment?
  • What genes distinguish different cell types or developmental stages?

Workflow

Load and Prepare Data

Gene expression data typically comes as a count matrix or normalized expression values.

import pandas as pd import numpy as np from scipy import stats import matplotlib.pyplot as plt import seaborn as sns # Load gene expression data # Rows = genes, Columns = samples expr_data = pd.read_csv('gene_counts.csv', index_col=0) # Sample metadata metadata = pd.DataFrame({ 'sample': expr_data.columns, 'condition': ['control', 'control', 'control', 'treatment', 'treatment', 'treatment'], 'batch': [1, 1, 2, 1, 2, 2] }) print(f"Expression matrix shape: {expr_data.shape}") print(f"Genes: {expr_data.shape[0]}, Samples: {expr_data.shape[1]}")

Normalization

Normalize data to account for library size differences:

def normalize_counts(counts, method='tpm'): """ Normalize count data Parameters: ----------- counts : DataFrame Raw count matrix method : str 'tpm' or 'cpm' or 'log2cpm' """ if method == 'cpm': # Counts per million total_counts = counts.sum(axis=0) normalized = counts.div(total_counts) * 1e6 elif method == 'log2cpm': # Log2 transformed CPM total_counts = counts.sum(axis=0) cpm = counts.div(total_counts) * 1e6 normalized = np.log2(cpm + 1) elif method == 'tpm': # Transcripts per million (requires gene lengths) # Simplified version total_counts = counts.sum(axis=0) normalized = counts.div(total_counts) * 1e6 return normalized # Apply normalization expr_normalized = normalize_counts(expr_data, method='log2cpm')

Filter Low-Expressed Genes

Remove genes with insufficient expression:

def filter_low_expression(expr_data, min_count=10, min_samples=3): """ Filter genes with low expression Parameters: ----------- expr_data : DataFrame Expression matrix min_count : int Minimum count threshold min_samples : int Minimum number of samples above threshold """ # Count samples above threshold for each gene pass_filter = (expr_data >= min_count).sum(axis=1) >= min_samples filtered_data = expr_data[pass_filter] print(f"Genes before filtering: {expr_data.shape[0]}") print(f"Genes after filtering: {filtered_data.shape[0]}") print(f"Removed: {expr_data.shape[0] - filtered_data.shape[0]} genes") return filtered_data # Filter expr_filtered = filter_low_expression(expr_data) expr_normalized = normalize_counts(expr_filtered, method='log2cpm')

Perform Statistical Testing

Test for differential expression using t-tests or more sophisticated methods:

def perform_dge_ttest(expr_data, metadata, group_col='condition', control='control', treatment='treatment'): """ Perform differential expression using t-tests Parameters: ----------- expr_data : DataFrame Normalized expression matrix metadata : DataFrame Sample metadata group_col : str Column name for grouping control : str Control group label treatment : str Treatment group label """ # Get sample indices for each group control_samples = metadata[metadata[group_col] == control]['sample'] treatment_samples = metadata[metadata[group_col] == treatment]['sample'] results = [] for gene in expr_data.index: control_expr = expr_data.loc[gene, control_samples] treatment_expr = expr_data.loc[gene, treatment_samples] # Calculate fold change control_mean = control_expr.mean() treatment_mean = treatment_expr.mean() log2fc = treatment_mean - control_mean # Already log2 # Perform t-test t_stat, p_value = stats.ttest_ind(treatment_expr, control_expr) results.append({ 'gene': gene, 'log2FoldChange': log2fc, 'baseMean': expr_data.loc[gene].mean(), 'control_mean': control_mean, 'treatment_mean': treatment_mean, 'pvalue': p_value, 'stat': t_stat }) results_df = pd.DataFrame(results) # Calculate adjusted p-values (FDR correction) from statsmodels.stats.multitest import multipletests _, results_df['padj'], _, _ = multipletests( results_df['pvalue'], method='fdr_bh' ) # Sort by p-value results_df = results_df.sort_values('pvalue') return results_df # Perform DGE dge_results = perform_dge_ttest(expr_normalized, metadata) # Display top genes print("\nTop 10 Differentially Expressed Genes:") print(dge_results.head(10)[['gene', 'log2FoldChange', 'pvalue', 'padj']])

Multiple Testing Correction

Account for multiple hypothesis testing:

# Identify significant genes alpha = 0.05 significant_genes = dge_results[dge_results['padj'] < alpha] print(f"\nSignificant genes (padj < {alpha}): {len(significant_genes)}") print(f"Upregulated: {(significant_genes['log2FoldChange'] > 0).sum()}") print(f"Downregulated: {(significant_genes['log2FoldChange'] < 0).sum()}")

Advanced Methods

Using limma-voom approach (via rpy2)

For more sophisticated analysis, integrate with R’s limma package:

import rpy2.robjects as ro from rpy2.robjects import pandas2ri from rpy2.robjects.packages import importr # Enable pandas conversion pandas2ri.activate() # Import R libraries limma = importr('limma') edgeR = importr('edgeR') # Convert to R r_counts = pandas2ri.py2rpy(expr_data) r_groups = ro.StrVector(metadata['condition']) # Create DGEList dge = edgeR.DGEList(counts=r_counts, group=r_groups) # Normalize dge = edgeR.calcNormFactors(dge) # Design matrix design = ro.r['model.matrix'](ro.Formula('~ 0 + condition'), data=metadata) # Voom transformation v = limma.voom(dge, design) # Fit model fit = limma.lmFit(v, design) # Make contrasts contrast_matrix = limma.makeContrasts( 'treatment-control', levels=design ) fit2 = limma.contrasts_fit(fit, contrast_matrix) fit2 = limma.eBayes(fit2) # Get results results = limma.topTable(fit2, number=float('inf'))

For RNA-seq count data, specialized tools like DESeq2 or edgeR (in R) provide more accurate results by modeling the negative binomial distribution of count data.

Interpreting Results

Understanding Metrics

  • log2FoldChange: Log2 ratio of expression between conditions

    • Positive = upregulated in treatment
    • Negative = downregulated in treatment
    • |log2FC| > 1 typically considered biologically significant (2-fold change)
  • pvalue: Statistical significance of the difference

    • Lower = more confident the difference isn’t due to chance
  • padj (adjusted p-value): Corrected for multiple testing

    • Use this instead of raw p-value
    • Common threshold: padj < 0.05
  • baseMean: Average expression across all samples

    • Higher = more reliably detected

Significance Thresholds

# Common thresholds FC_THRESHOLD = 1.0 # log2 fold change PVAL_THRESHOLD = 0.05 # adjusted p-value # Classify genes dge_results['significant'] = ( (abs(dge_results['log2FoldChange']) > FC_THRESHOLD) & (dge_results['padj'] < PVAL_THRESHOLD) ) dge_results['regulation'] = 'Not Significant' dge_results.loc[ (dge_results['log2FoldChange'] > FC_THRESHOLD) & (dge_results['padj'] < PVAL_THRESHOLD), 'regulation' ] = 'Upregulated' dge_results.loc[ (dge_results['log2FoldChange'] < -FC_THRESHOLD) & (dge_results['padj'] < PVAL_THRESHOLD), 'regulation' ] = 'Downregulated' # Summary print(dge_results['regulation'].value_counts())

Quality Control

Sample Correlation

Check sample similarity:

# Calculate correlation matrix correlation_matrix = expr_normalized.corr() # Plot plt.figure(figsize=(10, 8)) sns.heatmap(correlation_matrix, annot=True, fmt='.2f', cmap='coolwarm', center=0.8) plt.title('Sample Correlation Matrix') plt.tight_layout() plt.show()

PCA Plot

Visualize sample relationships:

from sklearn.decomposition import PCA # Perform PCA pca = PCA(n_components=2) pca_coords = pca.fit_transform(expr_normalized.T) # Create dataframe pca_df = pd.DataFrame( pca_coords, columns=['PC1', 'PC2'], index=expr_normalized.columns ) pca_df['condition'] = metadata.set_index('sample')['condition'] # Plot plt.figure(figsize=(8, 6)) for condition in pca_df['condition'].unique(): subset = pca_df[pca_df['condition'] == condition] plt.scatter(subset['PC1'], subset['PC2'], label=condition, s=100, alpha=0.7) plt.xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.1%})') plt.ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.1%})') plt.title('PCA of Samples') plt.legend() plt.grid(alpha=0.3) plt.tight_layout() plt.show()

Exporting Results

# Save complete results dge_results.to_csv('dge_results_all.csv', index=False) # Save only significant genes significant_genes.to_csv('dge_results_significant.csv', index=False) # Save for downstream analysis upregulated = significant_genes[significant_genes['log2FoldChange'] > 0] downregulated = significant_genes[significant_genes['log2FoldChange'] < 0] upregulated['gene'].to_csv('upregulated_genes.txt', index=False, header=False) downregulated['gene'].to_csv('downregulated_genes.txt', index=False, header=False)

Always validate key findings with:

  • Independent validation cohort
  • qPCR for select genes
  • Biological literature review
  • Pathway analysis to check if results make biological sense

Next Steps

Last updated on