Principal Component Analysis (PCA)
PCA is a dimensionality reduction technique that transforms high-dimensional data into a lower-dimensional space while preserving as much variance as possible. Essential for visualizing and understanding complex gene expression data.
Why PCA in Bioinformatics?
PCA helps with:
- Visualization: Reduce thousands of genes to 2-3 dimensions
- Quality Control: Identify outliers and batch effects
- Noise Reduction: Focus on major patterns
- Feature Extraction: Create summary variables
- Pattern Discovery: Reveal sample groupings
Theory Overview
PCA finds orthogonal directions (principal components) in data that capture maximum variance:
- PC1: Direction of highest variance
- PC2: Direction of second highest variance (orthogonal to PC1)
- PC3: Third highest (orthogonal to PC1 and PC2)
- And so on…
Each sample gets a score on each PC, allowing high-dimensional data to be plotted in 2D or 3D.
Basic PCA Implementation
Prepare Data
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
# Generate sample gene expression data
np.random.seed(42)
n_genes = 1000
n_samples = 30
# Create three groups with different expression patterns
group1 = np.random.normal(5, 1, (n_genes, 10))
group2 = np.random.normal(7, 1, (n_genes, 10))
group3 = np.random.normal(6, 1.5, (n_genes, 10))
expr_data = np.concatenate([group1, group2, group3], axis=1)
# Create DataFrame
genes = [f'Gene_{i}' for i in range(n_genes)]
samples = [f'Sample_{i}' for i in range(n_samples)]
df = pd.DataFrame(expr_data, index=genes, columns=samples)
# Create metadata
metadata = pd.DataFrame({
'Sample': samples,
'Group': ['A']*10 + ['B']*10 + ['C']*10,
'Batch': ['1', '2']*15
})
print(f"Expression data shape: {df.shape}")
print(f"Genes: {n_genes}, Samples: {n_samples}")Standardize Data
# Standardize (important for PCA)
scaler = StandardScaler()
# Transpose: samples should be rows, genes as features
expr_transposed = df.T
expr_scaled = scaler.fit_transform(expr_transposed)
print(f"Scaled data shape: {expr_scaled.shape}")
print(f"Mean: {expr_scaled.mean():.2e}, Std: {expr_scaled.std():.2f}")Perform PCA
# Perform PCA
n_components = 10
pca = PCA(n_components=n_components)
pca_coords = pca.fit_transform(expr_scaled)
# Create results DataFrame
pca_df = pd.DataFrame(
pca_coords,
columns=[f'PC{i+1}' for i in range(n_components)],
index=df.columns
)
# Add metadata
pca_df = pca_df.merge(metadata, left_index=True, right_on='Sample')
print("PCA coordinates:")
print(pca_df.head())Variance Explained
Scree Plot
# Calculate variance explained
variance_explained = pca.explained_variance_ratio_ * 100
cumulative_variance = np.cumsum(variance_explained)
# Create scree plot
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))
# Individual variance
ax1.bar(range(1, n_components + 1), variance_explained,
alpha=0.7, color='steelblue', edgecolor='black')
ax1.set_xlabel('Principal Component', fontsize=12)
ax1.set_ylabel('Variance Explained (%)', fontsize=12)
ax1.set_title('Scree Plot', fontsize=14, fontweight='bold')
ax1.grid(alpha=0.3, axis='y')
# Cumulative variance
ax2.plot(range(1, n_components + 1), cumulative_variance,
'o-', linewidth=2, markersize=8, color='darkred')
ax2.axhline(y=80, color='green', linestyle='--',
label='80% threshold', alpha=0.7)
ax2.set_xlabel('Principal Component', fontsize=12)
ax2.set_ylabel('Cumulative Variance Explained (%)', fontsize=12)
ax2.set_title('Cumulative Variance', fontsize=14, fontweight='bold')
ax2.legend()
ax2.grid(alpha=0.3)
plt.tight_layout()
plt.show()
# Print variance summary
print("Variance Explained:")
for i in range(min(5, n_components)):
print(f" PC{i+1}: {variance_explained[i]:.2f}% "
f"(Cumulative: {cumulative_variance[i]:.2f}%)")Visualization
2D PCA Plot
# Basic 2D PCA plot
plt.figure(figsize=(10, 7))
# Plot by group
colors = {'A': '#FF6B6B', 'B': '#4ECDC4', 'C': '#45B7D1'}
for group in pca_df['Group'].unique():
mask = pca_df['Group'] == group
plt.scatter(
pca_df.loc[mask, 'PC1'],
pca_df.loc[mask, 'PC2'],
label=f'Group {group}',
color=colors[group],
s=100,
alpha=0.7,
edgecolors='black'
)
plt.xlabel(f'PC1 ({variance_explained[0]:.2f}%)', fontsize=12)
plt.ylabel(f'PC2 ({variance_explained[1]:.2f}%)', fontsize=12)
plt.title('PCA Plot - Gene Expression', fontsize=14, fontweight='bold')
plt.legend(loc='best', fontsize=10)
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()Annotated PCA Plot
# PCA plot with sample labels
fig, ax = plt.subplots(figsize=(12, 8))
for group in pca_df['Group'].unique():
mask = pca_df['Group'] == group
ax.scatter(
pca_df.loc[mask, 'PC1'],
pca_df.loc[mask, 'PC2'],
label=f'Group {group}',
color=colors[group],
s=150,
alpha=0.6,
edgecolors='black',
linewidths=1.5
)
# Add sample labels
for idx, row in pca_df.iterrows():
ax.annotate(
row['Sample'],
(row['PC1'], row['PC2']),
fontsize=8,
alpha=0.7,
xytext=(5, 5),
textcoords='offset points'
)
ax.set_xlabel(f'PC1 ({variance_explained[0]:.2f}%)', fontsize=12)
ax.set_ylabel(f'PC2 ({variance_explained[1]:.2f}%)', fontsize=12)
ax.set_title('Annotated PCA Plot', fontsize=14, fontweight='bold')
ax.legend(loc='best', fontsize=10)
ax.grid(alpha=0.3)
ax.axhline(y=0, color='gray', linestyle='--', alpha=0.3)
ax.axvline(x=0, color='gray', linestyle='--', alpha=0.3)
plt.tight_layout()
plt.show()Multiple PC Pairs
# Plot multiple PC combinations
fig, axes = plt.subplots(2, 2, figsize=(14, 12))
pc_pairs = [(1, 2), (1, 3), (2, 3), (3, 4)]
for idx, (pc_x, pc_y) in enumerate(pc_pairs):
ax = axes[idx // 2, idx % 2]
for group in pca_df['Group'].unique():
mask = pca_df['Group'] == group
ax.scatter(
pca_df.loc[mask, f'PC{pc_x}'],
pca_df.loc[mask, f'PC{pc_y}'],
label=f'Group {group}',
color=colors[group],
s=100,
alpha=0.6,
edgecolors='black'
)
ax.set_xlabel(f'PC{pc_x} ({variance_explained[pc_x-1]:.2f}%)', fontsize=11)
ax.set_ylabel(f'PC{pc_y} ({variance_explained[pc_y-1]:.2f}%)', fontsize=11)
ax.set_title(f'PC{pc_x} vs PC{pc_y}', fontsize=12, fontweight='bold')
ax.legend(fontsize=9)
ax.grid(alpha=0.3)
plt.tight_layout()
plt.show()3D PCA Plot
from mpl_toolkits.mplot3d import Axes3D
# 3D PCA visualization
fig = plt.figure(figsize=(12, 9))
ax = fig.add_subplot(111, projection='3d')
for group in pca_df['Group'].unique():
mask = pca_df['Group'] == group
ax.scatter(
pca_df.loc[mask, 'PC1'],
pca_df.loc[mask, 'PC2'],
pca_df.loc[mask, 'PC3'],
label=f'Group {group}',
color=colors[group],
s=100,
alpha=0.6,
edgecolors='black'
)
ax.set_xlabel(f'PC1 ({variance_explained[0]:.2f}%)', fontsize=11)
ax.set_ylabel(f'PC2 ({variance_explained[1]:.2f}%)', fontsize=11)
ax.set_zlabel(f'PC3 ({variance_explained[2]:.2f}%)', fontsize=11)
ax.set_title('3D PCA Plot', fontsize=14, fontweight='bold')
ax.legend()
plt.tight_layout()
plt.show()Loading Analysis
Understanding Loadings
Loadings show which genes contribute most to each PC:
# Extract loadings
loadings = pd.DataFrame(
pca.components_.T,
columns=[f'PC{i+1}' for i in range(n_components)],
index=df.index
)
print("Loadings shape:", loadings.shape)
print("\nTop 5 genes for PC1:")
print(loadings['PC1'].abs().nlargest(5))Loading Plot
# Plot loadings for PC1 and PC2
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))
# PC1 loadings
top_pc1 = loadings['PC1'].abs().nlargest(20)
colors_pc1 = ['red' if x > 0 else 'blue'
for x in loadings.loc[top_pc1.index, 'PC1']]
ax1.barh(range(len(top_pc1)),
loadings.loc[top_pc1.index, 'PC1'],
color=colors_pc1, alpha=0.7)
ax1.set_yticks(range(len(top_pc1)))
ax1.set_yticklabels(top_pc1.index, fontsize=8)
ax1.set_xlabel('Loading', fontsize=11)
ax1.set_title('Top 20 Genes - PC1 Loadings', fontsize=12, fontweight='bold')
ax1.axvline(x=0, color='black', linestyle='-', linewidth=0.5)
ax1.grid(alpha=0.3, axis='x')
# PC2 loadings
top_pc2 = loadings['PC2'].abs().nlargest(20)
colors_pc2 = ['red' if x > 0 else 'blue'
for x in loadings.loc[top_pc2.index, 'PC2']]
ax2.barh(range(len(top_pc2)),
loadings.loc[top_pc2.index, 'PC2'],
color=colors_pc2, alpha=0.7)
ax2.set_yticks(range(len(top_pc2)))
ax2.set_yticklabels(top_pc2.index, fontsize=8)
ax2.set_xlabel('Loading', fontsize=11)
ax2.set_title('Top 20 Genes - PC2 Loadings', fontsize=12, fontweight='bold')
ax2.axvline(x=0, color='black', linestyle='-', linewidth=0.5)
ax2.grid(alpha=0.3, axis='x')
plt.tight_tight()
plt.show()Biplot
# Biplot: samples (scores) + genes (loadings)
def biplot(scores, loadings, labels, n_arrows=10):
"""Create PCA biplot"""
fig, ax = plt.subplots(figsize=(12, 10))
# Plot samples
for group in pca_df['Group'].unique():
mask = pca_df['Group'] == group
ax.scatter(
scores[mask, 0],
scores[mask, 1],
label=f'Group {group}',
color=colors[group],
s=100,
alpha=0.6,
edgecolors='black'
)
# Plot loading vectors for top genes
top_genes_idx = np.argsort(np.abs(loadings[:, 0]) +
np.abs(loadings[:, 1]))[-n_arrows:]
scale = max(np.abs(scores).max(axis=0))
for idx in top_genes_idx:
ax.arrow(0, 0,
loadings[idx, 0] * scale * 0.8,
loadings[idx, 1] * scale * 0.8,
color='red', alpha=0.5, head_width=0.1,
head_length=0.1, linewidth=1.5)
ax.text(loadings[idx, 0] * scale * 0.85,
loadings[idx, 1] * scale * 0.85,
labels[idx], fontsize=9, color='darkred')
ax.set_xlabel(f'PC1 ({variance_explained[0]:.2f}%)', fontsize=12)
ax.set_ylabel(f'PC2 ({variance_explained[1]:.2f}%)', fontsize=12)
ax.set_title('PCA Biplot', fontsize=14, fontweight='bold')
ax.legend()
ax.grid(alpha=0.3)
ax.axhline(y=0, color='gray', linestyle='--', alpha=0.3)
ax.axvline(x=0, color='gray', linestyle='--', alpha=0.3)
plt.tight_layout()
plt.show()
biplot(pca_coords, pca.components_.T, df.index, n_arrows=15)Identifying Outliers
# Calculate Hotelling's T-squared statistic
from scipy.stats import chi2
def hotellings_t2(pca_scores, n_components=2):
"""Calculate Hotelling's T-squared for outlier detection"""
scores = pca_scores[:, :n_components]
mean = scores.mean(axis=0)
cov = np.cov(scores.T)
inv_cov = np.linalg.inv(cov)
t2 = []
for score in scores:
diff = score - mean
t2.append(diff @ inv_cov @ diff.T)
return np.array(t2)
# Calculate T-squared
t2_stats = hotellings_t2(pca_coords, n_components=2)
# Chi-squared critical value (95% confidence)
critical_value = chi2.ppf(0.95, df=2)
# Identify outliers
outliers = t2_stats > critical_value
# Plot with outliers highlighted
plt.figure(figsize=(10, 7))
# Normal samples
mask_normal = ~outliers
plt.scatter(
pca_df.loc[mask_normal, 'PC1'],
pca_df.loc[mask_normal, 'PC2'],
c='steelblue', s=100, alpha=0.6,
label='Normal', edgecolors='black'
)
# Outliers
mask_outlier = outliers
plt.scatter(
pca_df.loc[mask_outlier, 'PC1'],
pca_df.loc[mask_outlier, 'PC2'],
c='red', s=150, alpha=0.8,
label='Outliers', edgecolors='black', marker='X'
)
plt.xlabel(f'PC1 ({variance_explained[0]:.2f}%)', fontsize=12)
plt.ylabel(f'PC2 ({variance_explained[1]:.2f}%)', fontsize=12)
plt.title('PCA with Outlier Detection', fontsize=14, fontweight='bold')
plt.legend()
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()
print(f"Outliers detected: {outliers.sum()}")
print(f"Outlier samples: {pca_df.loc[outliers, 'Sample'].tolist()}")Batch Effect Detection
# Color by batch instead of group
plt.figure(figsize=(10, 7))
batch_colors = {'1': '#E63946', '2': '#457B9D'}
for batch in pca_df['Batch'].unique():
mask = pca_df['Batch'] == batch
plt.scatter(
pca_df.loc[mask, 'PC1'],
pca_df.loc[mask, 'PC2'],
label=f'Batch {batch}',
color=batch_colors[batch],
s=100,
alpha=0.6,
edgecolors='black'
)
plt.xlabel(f'PC1 ({variance_explained[0]:.2f}%)', fontsize=12)
plt.ylabel(f'PC2 ({variance_explained[1]:.2f}%)', fontsize=12)
plt.title('PCA - Checking for Batch Effects', fontsize=14, fontweight='bold')
plt.legend()
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()
# Statistical test for batch effect
from scipy.stats import f_oneway
pc1_batch1 = pca_df[pca_df['Batch'] == '1']['PC1']
pc1_batch2 = pca_df[pca_df['Batch'] == '2']['PC1']
f_stat, p_value = f_oneway(pc1_batch1, pc1_batch2)
print(f"PC1 batch effect test:")
print(f" F-statistic: {f_stat:.4f}")
print(f" P-value: {p_value:.4f}")
if p_value < 0.05:
print(" ⚠️ Significant batch effect detected!")Exporting Results
# Save PCA coordinates
pca_df.to_csv('pca_coordinates.csv', index=False)
# Save loadings
loadings.to_csv('pca_loadings.csv')
# Save variance explained
variance_df = pd.DataFrame({
'PC': [f'PC{i+1}' for i in range(n_components)],
'Variance_Explained': variance_explained,
'Cumulative_Variance': cumulative_variance
})
variance_df.to_csv('pca_variance.csv', index=False)
print("PCA results saved!")PCA Best Practices:
- Always standardize data before PCA
- Check variance explained - aim for 70-80% with first few PCs
- Use PCA for visualization, not as final analysis
- Investigate outliers - may be biological or technical
- Consider batch effects
- Validate patterns with known biology
Common Mistakes:
- Not scaling data (biases toward high-variance genes)
- Over-interpreting minor PCs
- Assuming PC1 = biology (could be technical)
- Ignoring percentage of variance explained
- Using too few samples (needs n_samples > n_PCs)
Next Steps
- t-SNE and UMAP - Non-linear dimensionality reduction
- Clustering - Group samples using PCA coordinates
- Visualization - Combine PCA with heatmaps
Last updated on