Principal Component Analysis (PCA)

PCA is a dimensionality reduction technique that transforms high-dimensional data into a lower-dimensional space while preserving as much variance as possible. Essential for visualizing and understanding complex gene expression data.

Why PCA in Bioinformatics?

PCA helps with:

Visualization: Reduce thousands of genes to 2-3 dimensions
Quality Control: Identify outliers and batch effects
Noise Reduction: Focus on major patterns
Feature Extraction: Create summary variables
Pattern Discovery: Reveal sample groupings

Theory Overview

PCA finds orthogonal directions (principal components) in data that capture maximum variance:

PC1: Direction of highest variance
PC2: Direction of second highest variance (orthogonal to PC1)
PC3: Third highest (orthogonal to PC1 and PC2)
And so on…

Each sample gets a score on each PC, allowing high-dimensional data to be plotted in 2D or 3D.

Basic PCA Implementation

Prepare Data


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
 
# Generate sample gene expression data
np.random.seed(42)
n_genes = 1000
n_samples = 30
 
# Create three groups with different expression patterns
group1 = np.random.normal(5, 1, (n_genes, 10))
group2 = np.random.normal(7, 1, (n_genes, 10))
group3 = np.random.normal(6, 1.5, (n_genes, 10))
 
expr_data = np.concatenate([group1, group2, group3], axis=1)
 
# Create DataFrame
genes = [f'Gene_{i}' for i in range(n_genes)]
samples = [f'Sample_{i}' for i in range(n_samples)]
df = pd.DataFrame(expr_data, index=genes, columns=samples)
 
# Create metadata
metadata = pd.DataFrame({
    'Sample': samples,
    'Group': ['A']*10 + ['B']*10 + ['C']*10,
    'Batch': ['1', '2']*15
})
 
print(f"Expression data shape: {df.shape}")
print(f"Genes: {n_genes}, Samples: {n_samples}")

Standardize Data


# Standardize (important for PCA)
scaler = StandardScaler()
 
# Transpose: samples should be rows, genes as features
expr_transposed = df.T
expr_scaled = scaler.fit_transform(expr_transposed)
 
print(f"Scaled data shape: {expr_scaled.shape}")
print(f"Mean: {expr_scaled.mean():.2e}, Std: {expr_scaled.std():.2f}")

Perform PCA


# Perform PCA
n_components = 10
pca = PCA(n_components=n_components)
pca_coords = pca.fit_transform(expr_scaled)
 
# Create results DataFrame
pca_df = pd.DataFrame(
    pca_coords,
    columns=[f'PC{i+1}' for i in range(n_components)],
    index=df.columns
)
 
# Add metadata
pca_df = pca_df.merge(metadata, left_index=True, right_on='Sample')
 
print("PCA coordinates:")
print(pca_df.head())

Variance Explained

Scree Plot


# Calculate variance explained
variance_explained = pca.explained_variance_ratio_ * 100
cumulative_variance = np.cumsum(variance_explained)
 
# Create scree plot
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))
 
# Individual variance
ax1.bar(range(1, n_components + 1), variance_explained, 
        alpha=0.7, color='steelblue', edgecolor='black')
ax1.set_xlabel('Principal Component', fontsize=12)
ax1.set_ylabel('Variance Explained (%)', fontsize=12)
ax1.set_title('Scree Plot', fontsize=14, fontweight='bold')
ax1.grid(alpha=0.3, axis='y')
 
# Cumulative variance
ax2.plot(range(1, n_components + 1), cumulative_variance, 
         'o-', linewidth=2, markersize=8, color='darkred')
ax2.axhline(y=80, color='green', linestyle='--', 
            label='80% threshold', alpha=0.7)
ax2.set_xlabel('Principal Component', fontsize=12)
ax2.set_ylabel('Cumulative Variance Explained (%)', fontsize=12)
ax2.set_title('Cumulative Variance', fontsize=14, fontweight='bold')
ax2.legend()
ax2.grid(alpha=0.3)
 
plt.tight_layout()
plt.show()
 
# Print variance summary
print("Variance Explained:")
for i in range(min(5, n_components)):
    print(f"  PC{i+1}: {variance_explained[i]:.2f}% "
          f"(Cumulative: {cumulative_variance[i]:.2f}%)")

Visualization

2D PCA Plot


# Basic 2D PCA plot
plt.figure(figsize=(10, 7))
 
# Plot by group
colors = {'A': '#FF6B6B', 'B': '#4ECDC4', 'C': '#45B7D1'}
for group in pca_df['Group'].unique():
    mask = pca_df['Group'] == group
    plt.scatter(
        pca_df.loc[mask, 'PC1'],
        pca_df.loc[mask, 'PC2'],
        label=f'Group {group}',
        color=colors[group],
        s=100,
        alpha=0.7,
        edgecolors='black'
    )
 
plt.xlabel(f'PC1 ({variance_explained[0]:.2f}%)', fontsize=12)
plt.ylabel(f'PC2 ({variance_explained[1]:.2f}%)', fontsize=12)
plt.title('PCA Plot - Gene Expression', fontsize=14, fontweight='bold')
plt.legend(loc='best', fontsize=10)
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()

Annotated PCA Plot


# PCA plot with sample labels
fig, ax = plt.subplots(figsize=(12, 8))
 
for group in pca_df['Group'].unique():
    mask = pca_df['Group'] == group
    ax.scatter(
        pca_df.loc[mask, 'PC1'],
        pca_df.loc[mask, 'PC2'],
        label=f'Group {group}',
        color=colors[group],
        s=150,
        alpha=0.6,
        edgecolors='black',
        linewidths=1.5
    )
 
# Add sample labels
for idx, row in pca_df.iterrows():
    ax.annotate(
        row['Sample'],
        (row['PC1'], row['PC2']),
        fontsize=8,
        alpha=0.7,
        xytext=(5, 5),
        textcoords='offset points'
    )
 
ax.set_xlabel(f'PC1 ({variance_explained[0]:.2f}%)', fontsize=12)
ax.set_ylabel(f'PC2 ({variance_explained[1]:.2f}%)', fontsize=12)
ax.set_title('Annotated PCA Plot', fontsize=14, fontweight='bold')
ax.legend(loc='best', fontsize=10)
ax.grid(alpha=0.3)
ax.axhline(y=0, color='gray', linestyle='--', alpha=0.3)
ax.axvline(x=0, color='gray', linestyle='--', alpha=0.3)
 
plt.tight_layout()
plt.show()

Multiple PC Pairs


# Plot multiple PC combinations
fig, axes = plt.subplots(2, 2, figsize=(14, 12))
pc_pairs = [(1, 2), (1, 3), (2, 3), (3, 4)]
 
for idx, (pc_x, pc_y) in enumerate(pc_pairs):
    ax = axes[idx // 2, idx % 2]
    
    for group in pca_df['Group'].unique():
        mask = pca_df['Group'] == group
        ax.scatter(
            pca_df.loc[mask, f'PC{pc_x}'],
            pca_df.loc[mask, f'PC{pc_y}'],
            label=f'Group {group}',
            color=colors[group],
            s=100,
            alpha=0.6,
            edgecolors='black'
        )
    
    ax.set_xlabel(f'PC{pc_x} ({variance_explained[pc_x-1]:.2f}%)', fontsize=11)
    ax.set_ylabel(f'PC{pc_y} ({variance_explained[pc_y-1]:.2f}%)', fontsize=11)
    ax.set_title(f'PC{pc_x} vs PC{pc_y}', fontsize=12, fontweight='bold')
    ax.legend(fontsize=9)
    ax.grid(alpha=0.3)
 
plt.tight_layout()
plt.show()

3D PCA Plot


from mpl_toolkits.mplot3d import Axes3D
 
# 3D PCA visualization
fig = plt.figure(figsize=(12, 9))
ax = fig.add_subplot(111, projection='3d')
 
for group in pca_df['Group'].unique():
    mask = pca_df['Group'] == group
    ax.scatter(
        pca_df.loc[mask, 'PC1'],
        pca_df.loc[mask, 'PC2'],
        pca_df.loc[mask, 'PC3'],
        label=f'Group {group}',
        color=colors[group],
        s=100,
        alpha=0.6,
        edgecolors='black'
    )
 
ax.set_xlabel(f'PC1 ({variance_explained[0]:.2f}%)', fontsize=11)
ax.set_ylabel(f'PC2 ({variance_explained[1]:.2f}%)', fontsize=11)
ax.set_zlabel(f'PC3 ({variance_explained[2]:.2f}%)', fontsize=11)
ax.set_title('3D PCA Plot', fontsize=14, fontweight='bold')
ax.legend()
 
plt.tight_layout()
plt.show()

Loading Analysis

Understanding Loadings

Loadings show which genes contribute most to each PC:


# Extract loadings
loadings = pd.DataFrame(
    pca.components_.T,
    columns=[f'PC{i+1}' for i in range(n_components)],
    index=df.index
)
 
print("Loadings shape:", loadings.shape)
print("\nTop 5 genes for PC1:")
print(loadings['PC1'].abs().nlargest(5))

Loading Plot


# Plot loadings for PC1 and PC2
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))
 
# PC1 loadings
top_pc1 = loadings['PC1'].abs().nlargest(20)
colors_pc1 = ['red' if x > 0 else 'blue' 
              for x in loadings.loc[top_pc1.index, 'PC1']]
ax1.barh(range(len(top_pc1)), 
         loadings.loc[top_pc1.index, 'PC1'],
         color=colors_pc1, alpha=0.7)
ax1.set_yticks(range(len(top_pc1)))
ax1.set_yticklabels(top_pc1.index, fontsize=8)
ax1.set_xlabel('Loading', fontsize=11)
ax1.set_title('Top 20 Genes - PC1 Loadings', fontsize=12, fontweight='bold')
ax1.axvline(x=0, color='black', linestyle='-', linewidth=0.5)
ax1.grid(alpha=0.3, axis='x')
 
# PC2 loadings
top_pc2 = loadings['PC2'].abs().nlargest(20)
colors_pc2 = ['red' if x > 0 else 'blue' 
              for x in loadings.loc[top_pc2.index, 'PC2']]
ax2.barh(range(len(top_pc2)), 
         loadings.loc[top_pc2.index, 'PC2'],
         color=colors_pc2, alpha=0.7)
ax2.set_yticks(range(len(top_pc2)))
ax2.set_yticklabels(top_pc2.index, fontsize=8)
ax2.set_xlabel('Loading', fontsize=11)
ax2.set_title('Top 20 Genes - PC2 Loadings', fontsize=12, fontweight='bold')
ax2.axvline(x=0, color='black', linestyle='-', linewidth=0.5)
ax2.grid(alpha=0.3, axis='x')
 
plt.tight_tight()
plt.show()

Biplot


# Biplot: samples (scores) + genes (loadings)
def biplot(scores, loadings, labels, n_arrows=10):
    """Create PCA biplot"""
    
    fig, ax = plt.subplots(figsize=(12, 10))
    
    # Plot samples
    for group in pca_df['Group'].unique():
        mask = pca_df['Group'] == group
        ax.scatter(
            scores[mask, 0],
            scores[mask, 1],
            label=f'Group {group}',
            color=colors[group],
            s=100,
            alpha=0.6,
            edgecolors='black'
        )
    
    # Plot loading vectors for top genes
    top_genes_idx = np.argsort(np.abs(loadings[:, 0]) + 
                                np.abs(loadings[:, 1]))[-n_arrows:]
    
    scale = max(np.abs(scores).max(axis=0))
    for idx in top_genes_idx:
        ax.arrow(0, 0, 
                loadings[idx, 0] * scale * 0.8,
                loadings[idx, 1] * scale * 0.8,
                color='red', alpha=0.5, head_width=0.1,
                head_length=0.1, linewidth=1.5)
        ax.text(loadings[idx, 0] * scale * 0.85,
               loadings[idx, 1] * scale * 0.85,
               labels[idx], fontsize=9, color='darkred')
    
    ax.set_xlabel(f'PC1 ({variance_explained[0]:.2f}%)', fontsize=12)
    ax.set_ylabel(f'PC2 ({variance_explained[1]:.2f}%)', fontsize=12)
    ax.set_title('PCA Biplot', fontsize=14, fontweight='bold')
    ax.legend()
    ax.grid(alpha=0.3)
    ax.axhline(y=0, color='gray', linestyle='--', alpha=0.3)
    ax.axvline(x=0, color='gray', linestyle='--', alpha=0.3)
    
    plt.tight_layout()
    plt.show()
 
biplot(pca_coords, pca.components_.T, df.index, n_arrows=15)

Identifying Outliers


# Calculate Hotelling's T-squared statistic
from scipy.stats import chi2
 
def hotellings_t2(pca_scores, n_components=2):
    """Calculate Hotelling's T-squared for outlier detection"""
    scores = pca_scores[:, :n_components]
    mean = scores.mean(axis=0)
    cov = np.cov(scores.T)
    inv_cov = np.linalg.inv(cov)
    
    t2 = []
    for score in scores:
        diff = score - mean
        t2.append(diff @ inv_cov @ diff.T)
    
    return np.array(t2)
 
# Calculate T-squared
t2_stats = hotellings_t2(pca_coords, n_components=2)
 
# Chi-squared critical value (95% confidence)
critical_value = chi2.ppf(0.95, df=2)
 
# Identify outliers
outliers = t2_stats > critical_value
 
# Plot with outliers highlighted
plt.figure(figsize=(10, 7))
 
# Normal samples
mask_normal = ~outliers
plt.scatter(
    pca_df.loc[mask_normal, 'PC1'],
    pca_df.loc[mask_normal, 'PC2'],
    c='steelblue', s=100, alpha=0.6,
    label='Normal', edgecolors='black'
)
 
# Outliers
mask_outlier = outliers
plt.scatter(
    pca_df.loc[mask_outlier, 'PC1'],
    pca_df.loc[mask_outlier, 'PC2'],
    c='red', s=150, alpha=0.8,
    label='Outliers', edgecolors='black', marker='X'
)
 
plt.xlabel(f'PC1 ({variance_explained[0]:.2f}%)', fontsize=12)
plt.ylabel(f'PC2 ({variance_explained[1]:.2f}%)', fontsize=12)
plt.title('PCA with Outlier Detection', fontsize=14, fontweight='bold')
plt.legend()
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()
 
print(f"Outliers detected: {outliers.sum()}")
print(f"Outlier samples: {pca_df.loc[outliers, 'Sample'].tolist()}")

Batch Effect Detection


# Color by batch instead of group
plt.figure(figsize=(10, 7))
 
batch_colors = {'1': '#E63946', '2': '#457B9D'}
for batch in pca_df['Batch'].unique():
    mask = pca_df['Batch'] == batch
    plt.scatter(
        pca_df.loc[mask, 'PC1'],
        pca_df.loc[mask, 'PC2'],
        label=f'Batch {batch}',
        color=batch_colors[batch],
        s=100,
        alpha=0.6,
        edgecolors='black'
    )
 
plt.xlabel(f'PC1 ({variance_explained[0]:.2f}%)', fontsize=12)
plt.ylabel(f'PC2 ({variance_explained[1]:.2f}%)', fontsize=12)
plt.title('PCA - Checking for Batch Effects', fontsize=14, fontweight='bold')
plt.legend()
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()
 
# Statistical test for batch effect
from scipy.stats import f_oneway
 
pc1_batch1 = pca_df[pca_df['Batch'] == '1']['PC1']
pc1_batch2 = pca_df[pca_df['Batch'] == '2']['PC1']
 
f_stat, p_value = f_oneway(pc1_batch1, pc1_batch2)
print(f"PC1 batch effect test:")
print(f"  F-statistic: {f_stat:.4f}")
print(f"  P-value: {p_value:.4f}")
if p_value < 0.05:
    print("  ⚠️  Significant batch effect detected!")

Exporting Results


# Save PCA coordinates
pca_df.to_csv('pca_coordinates.csv', index=False)
 
# Save loadings
loadings.to_csv('pca_loadings.csv')
 
# Save variance explained
variance_df = pd.DataFrame({
    'PC': [f'PC{i+1}' for i in range(n_components)],
    'Variance_Explained': variance_explained,
    'Cumulative_Variance': cumulative_variance
})
variance_df.to_csv('pca_variance.csv', index=False)
 
print("PCA results saved!")

PCA Best Practices:

Always standardize data before PCA
Check variance explained - aim for 70-80% with first few PCs
Use PCA for visualization, not as final analysis
Investigate outliers - may be biological or technical
Consider batch effects
Validate patterns with known biology

Common Mistakes:

Not scaling data (biases toward high-variance genes)
Over-interpreting minor PCs
Assuming PC1 = biology (could be technical)
Ignoring percentage of variance explained
Using too few samples (needs n_samples > n_PCs)

Next Steps

t-SNE and UMAP - Non-linear dimensionality reduction
Clustering - Group samples using PCA coordinates
Visualization - Combine PCA with heatmaps