Differential Gene Expression Analysis
Differential Gene Expression (DGE) analysis identifies genes with significantly different expression levels between experimental conditions. This is fundamental for understanding biological processes, disease mechanisms, and treatment responses.
Overview
DGE analysis answers questions like:
- Which genes are upregulated or downregulated in disease vs. healthy tissue?
- How does gene expression change with drug treatment?
- What genes distinguish different cell types or developmental stages?
Workflow
Load and Prepare Data
Gene expression data typically comes as a count matrix or normalized expression values.
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
# Load gene expression data
# Rows = genes, Columns = samples
expr_data = pd.read_csv('gene_counts.csv', index_col=0)
# Sample metadata
metadata = pd.DataFrame({
'sample': expr_data.columns,
'condition': ['control', 'control', 'control',
'treatment', 'treatment', 'treatment'],
'batch': [1, 1, 2, 1, 2, 2]
})
print(f"Expression matrix shape: {expr_data.shape}")
print(f"Genes: {expr_data.shape[0]}, Samples: {expr_data.shape[1]}")Normalization
Normalize data to account for library size differences:
def normalize_counts(counts, method='tpm'):
"""
Normalize count data
Parameters:
-----------
counts : DataFrame
Raw count matrix
method : str
'tpm' or 'cpm' or 'log2cpm'
"""
if method == 'cpm':
# Counts per million
total_counts = counts.sum(axis=0)
normalized = counts.div(total_counts) * 1e6
elif method == 'log2cpm':
# Log2 transformed CPM
total_counts = counts.sum(axis=0)
cpm = counts.div(total_counts) * 1e6
normalized = np.log2(cpm + 1)
elif method == 'tpm':
# Transcripts per million (requires gene lengths)
# Simplified version
total_counts = counts.sum(axis=0)
normalized = counts.div(total_counts) * 1e6
return normalized
# Apply normalization
expr_normalized = normalize_counts(expr_data, method='log2cpm')Filter Low-Expressed Genes
Remove genes with insufficient expression:
def filter_low_expression(expr_data, min_count=10, min_samples=3):
"""
Filter genes with low expression
Parameters:
-----------
expr_data : DataFrame
Expression matrix
min_count : int
Minimum count threshold
min_samples : int
Minimum number of samples above threshold
"""
# Count samples above threshold for each gene
pass_filter = (expr_data >= min_count).sum(axis=1) >= min_samples
filtered_data = expr_data[pass_filter]
print(f"Genes before filtering: {expr_data.shape[0]}")
print(f"Genes after filtering: {filtered_data.shape[0]}")
print(f"Removed: {expr_data.shape[0] - filtered_data.shape[0]} genes")
return filtered_data
# Filter
expr_filtered = filter_low_expression(expr_data)
expr_normalized = normalize_counts(expr_filtered, method='log2cpm')Perform Statistical Testing
Test for differential expression using t-tests or more sophisticated methods:
def perform_dge_ttest(expr_data, metadata, group_col='condition',
control='control', treatment='treatment'):
"""
Perform differential expression using t-tests
Parameters:
-----------
expr_data : DataFrame
Normalized expression matrix
metadata : DataFrame
Sample metadata
group_col : str
Column name for grouping
control : str
Control group label
treatment : str
Treatment group label
"""
# Get sample indices for each group
control_samples = metadata[metadata[group_col] == control]['sample']
treatment_samples = metadata[metadata[group_col] == treatment]['sample']
results = []
for gene in expr_data.index:
control_expr = expr_data.loc[gene, control_samples]
treatment_expr = expr_data.loc[gene, treatment_samples]
# Calculate fold change
control_mean = control_expr.mean()
treatment_mean = treatment_expr.mean()
log2fc = treatment_mean - control_mean # Already log2
# Perform t-test
t_stat, p_value = stats.ttest_ind(treatment_expr, control_expr)
results.append({
'gene': gene,
'log2FoldChange': log2fc,
'baseMean': expr_data.loc[gene].mean(),
'control_mean': control_mean,
'treatment_mean': treatment_mean,
'pvalue': p_value,
'stat': t_stat
})
results_df = pd.DataFrame(results)
# Calculate adjusted p-values (FDR correction)
from statsmodels.stats.multitest import multipletests
_, results_df['padj'], _, _ = multipletests(
results_df['pvalue'],
method='fdr_bh'
)
# Sort by p-value
results_df = results_df.sort_values('pvalue')
return results_df
# Perform DGE
dge_results = perform_dge_ttest(expr_normalized, metadata)
# Display top genes
print("\nTop 10 Differentially Expressed Genes:")
print(dge_results.head(10)[['gene', 'log2FoldChange', 'pvalue', 'padj']])Multiple Testing Correction
Account for multiple hypothesis testing:
# Identify significant genes
alpha = 0.05
significant_genes = dge_results[dge_results['padj'] < alpha]
print(f"\nSignificant genes (padj < {alpha}): {len(significant_genes)}")
print(f"Upregulated: {(significant_genes['log2FoldChange'] > 0).sum()}")
print(f"Downregulated: {(significant_genes['log2FoldChange'] < 0).sum()}")Advanced Methods
Using limma-voom approach (via rpy2)
For more sophisticated analysis, integrate with R’s limma package:
import rpy2.robjects as ro
from rpy2.robjects import pandas2ri
from rpy2.robjects.packages import importr
# Enable pandas conversion
pandas2ri.activate()
# Import R libraries
limma = importr('limma')
edgeR = importr('edgeR')
# Convert to R
r_counts = pandas2ri.py2rpy(expr_data)
r_groups = ro.StrVector(metadata['condition'])
# Create DGEList
dge = edgeR.DGEList(counts=r_counts, group=r_groups)
# Normalize
dge = edgeR.calcNormFactors(dge)
# Design matrix
design = ro.r['model.matrix'](ro.Formula('~ 0 + condition'),
data=metadata)
# Voom transformation
v = limma.voom(dge, design)
# Fit model
fit = limma.lmFit(v, design)
# Make contrasts
contrast_matrix = limma.makeContrasts(
'treatment-control',
levels=design
)
fit2 = limma.contrasts_fit(fit, contrast_matrix)
fit2 = limma.eBayes(fit2)
# Get results
results = limma.topTable(fit2, number=float('inf'))For RNA-seq count data, specialized tools like DESeq2 or edgeR (in R) provide more accurate results by modeling the negative binomial distribution of count data.
Interpreting Results
Understanding Metrics
-
log2FoldChange: Log2 ratio of expression between conditions
- Positive = upregulated in treatment
- Negative = downregulated in treatment
- |log2FC| > 1 typically considered biologically significant (2-fold change)
-
pvalue: Statistical significance of the difference
- Lower = more confident the difference isn’t due to chance
-
padj (adjusted p-value): Corrected for multiple testing
- Use this instead of raw p-value
- Common threshold: padj < 0.05
-
baseMean: Average expression across all samples
- Higher = more reliably detected
Significance Thresholds
# Common thresholds
FC_THRESHOLD = 1.0 # log2 fold change
PVAL_THRESHOLD = 0.05 # adjusted p-value
# Classify genes
dge_results['significant'] = (
(abs(dge_results['log2FoldChange']) > FC_THRESHOLD) &
(dge_results['padj'] < PVAL_THRESHOLD)
)
dge_results['regulation'] = 'Not Significant'
dge_results.loc[
(dge_results['log2FoldChange'] > FC_THRESHOLD) &
(dge_results['padj'] < PVAL_THRESHOLD),
'regulation'
] = 'Upregulated'
dge_results.loc[
(dge_results['log2FoldChange'] < -FC_THRESHOLD) &
(dge_results['padj'] < PVAL_THRESHOLD),
'regulation'
] = 'Downregulated'
# Summary
print(dge_results['regulation'].value_counts())Quality Control
Sample Correlation
Check sample similarity:
# Calculate correlation matrix
correlation_matrix = expr_normalized.corr()
# Plot
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, fmt='.2f',
cmap='coolwarm', center=0.8)
plt.title('Sample Correlation Matrix')
plt.tight_layout()
plt.show()PCA Plot
Visualize sample relationships:
from sklearn.decomposition import PCA
# Perform PCA
pca = PCA(n_components=2)
pca_coords = pca.fit_transform(expr_normalized.T)
# Create dataframe
pca_df = pd.DataFrame(
pca_coords,
columns=['PC1', 'PC2'],
index=expr_normalized.columns
)
pca_df['condition'] = metadata.set_index('sample')['condition']
# Plot
plt.figure(figsize=(8, 6))
for condition in pca_df['condition'].unique():
subset = pca_df[pca_df['condition'] == condition]
plt.scatter(subset['PC1'], subset['PC2'],
label=condition, s=100, alpha=0.7)
plt.xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.1%})')
plt.ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.1%})')
plt.title('PCA of Samples')
plt.legend()
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()Exporting Results
# Save complete results
dge_results.to_csv('dge_results_all.csv', index=False)
# Save only significant genes
significant_genes.to_csv('dge_results_significant.csv', index=False)
# Save for downstream analysis
upregulated = significant_genes[significant_genes['log2FoldChange'] > 0]
downregulated = significant_genes[significant_genes['log2FoldChange'] < 0]
upregulated['gene'].to_csv('upregulated_genes.txt',
index=False, header=False)
downregulated['gene'].to_csv('downregulated_genes.txt',
index=False, header=False)Always validate key findings with:
- Independent validation cohort
- qPCR for select genes
- Biological literature review
- Pathway analysis to check if results make biological sense
Next Steps
- Visualization - Create volcano plots and heatmaps
- GSEA - Perform gene set enrichment analysis
- Pathway Analysis - Understand biological context