
Principal Component Analysis (PCA)

PCA is a dimensionality reduction technique that transforms high-dimensional data into a lower-dimensional space while preserving as much variance as possible. It is essential for visualizing and understanding complex gene expression data.

Why PCA in Bioinformatics?

PCA helps with:

  • Visualization: Reduce thousands of genes to 2-3 dimensions
  • Quality Control: Identify outliers and batch effects
  • Noise Reduction: Focus on major patterns
  • Feature Extraction: Create summary variables
  • Pattern Discovery: Reveal sample groupings

Theory Overview

PCA finds orthogonal directions (principal components) in data that capture maximum variance:

  • PC1: Direction of highest variance
  • PC2: Direction of second highest variance (orthogonal to PC1)
  • PC3: Third highest (orthogonal to PC1 and PC2)
  • And so on…

Each sample gets a score on each PC, allowing high-dimensional data to be plotted in 2D or 3D.
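
Concretely, the principal components are the eigenvectors of the covariance matrix of the centered data, and a sample's scores are its projections onto those directions. The following is a minimal NumPy sketch of that linear algebra on a toy matrix, separate from the gene expression workflow below:

import numpy as np

# Toy data: 6 samples (rows) x 4 features (columns)
rng = np.random.default_rng(0)
X = rng.normal(size=(6, 4))

# Center each feature, then take the covariance matrix of the features
X_centered = X - X.mean(axis=0)
cov = np.cov(X_centered, rowvar=False)

# Principal component directions are the eigenvectors of the covariance
# matrix, ordered by decreasing eigenvalue (variance captured)
eigvals, eigvecs = np.linalg.eigh(cov)
order = np.argsort(eigvals)[::-1]
eigvals, eigvecs = eigvals[order], eigvecs[:, order]

# Each sample's PC scores are its projections onto these orthogonal directions
scores = X_centered @ eigvecs

print("Fraction of variance per PC:", np.round(eigvals / eigvals.sum(), 3))
print("PC1 and PC2 scores of the first sample:", np.round(scores[0, :2], 3))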

Basic PCA Implementation

Prepare Data

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Generate sample gene expression data
np.random.seed(42)
n_genes = 1000
n_samples = 30

# Create three groups with different expression patterns
group1 = np.random.normal(5, 1, (n_genes, 10))
group2 = np.random.normal(7, 1, (n_genes, 10))
group3 = np.random.normal(6, 1.5, (n_genes, 10))

expr_data = np.concatenate([group1, group2, group3], axis=1)

# Create DataFrame
genes = [f'Gene_{i}' for i in range(n_genes)]
samples = [f'Sample_{i}' for i in range(n_samples)]
df = pd.DataFrame(expr_data, index=genes, columns=samples)

# Create metadata
metadata = pd.DataFrame({
    'Sample': samples,
    'Group': ['A']*10 + ['B']*10 + ['C']*10,
    'Batch': ['1', '2']*15
})

print(f"Expression data shape: {df.shape}")
print(f"Genes: {n_genes}, Samples: {n_samples}")

Standardize Data

# Standardize (important for PCA)
scaler = StandardScaler()

# Transpose: samples should be rows, genes as features
expr_transposed = df.T
expr_scaled = scaler.fit_transform(expr_transposed)

print(f"Scaled data shape: {expr_scaled.shape}")
print(f"Mean: {expr_scaled.mean():.2e}, Std: {expr_scaled.std():.2f}")

Perform PCA

# Perform PCA
n_components = 10
pca = PCA(n_components=n_components)
pca_coords = pca.fit_transform(expr_scaled)

# Create results DataFrame
pca_df = pd.DataFrame(
    pca_coords,
    columns=[f'PC{i+1}' for i in range(n_components)],
    index=df.columns
)

# Add metadata
pca_df = pca_df.merge(metadata, left_index=True, right_on='Sample')

print("PCA coordinates:")
print(pca_df.head())

Variance Explained

Scree Plot

# Calculate variance explained
variance_explained = pca.explained_variance_ratio_ * 100
cumulative_variance = np.cumsum(variance_explained)

# Create scree plot
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# Individual variance
ax1.bar(range(1, n_components + 1), variance_explained,
        alpha=0.7, color='steelblue', edgecolor='black')
ax1.set_xlabel('Principal Component', fontsize=12)
ax1.set_ylabel('Variance Explained (%)', fontsize=12)
ax1.set_title('Scree Plot', fontsize=14, fontweight='bold')
ax1.grid(alpha=0.3, axis='y')

# Cumulative variance
ax2.plot(range(1, n_components + 1), cumulative_variance,
         'o-', linewidth=2, markersize=8, color='darkred')
ax2.axhline(y=80, color='green', linestyle='--', label='80% threshold', alpha=0.7)
ax2.set_xlabel('Principal Component', fontsize=12)
ax2.set_ylabel('Cumulative Variance Explained (%)', fontsize=12)
ax2.set_title('Cumulative Variance', fontsize=14, fontweight='bold')
ax2.legend()
ax2.grid(alpha=0.3)

plt.tight_layout()
plt.show()

# Print variance summary
print("Variance Explained:")
for i in range(min(5, n_components)):
    print(f"  PC{i+1}: {variance_explained[i]:.2f}% "
          f"(Cumulative: {cumulative_variance[i]:.2f}%)")

Visualization

2D PCA Plot

# Basic 2D PCA plot
plt.figure(figsize=(10, 7))

# Plot by group
colors = {'A': '#FF6B6B', 'B': '#4ECDC4', 'C': '#45B7D1'}

for group in pca_df['Group'].unique():
    mask = pca_df['Group'] == group
    plt.scatter(
        pca_df.loc[mask, 'PC1'],
        pca_df.loc[mask, 'PC2'],
        label=f'Group {group}',
        color=colors[group],
        s=100, alpha=0.7, edgecolors='black'
    )

plt.xlabel(f'PC1 ({variance_explained[0]:.2f}%)', fontsize=12)
plt.ylabel(f'PC2 ({variance_explained[1]:.2f}%)', fontsize=12)
plt.title('PCA Plot - Gene Expression', fontsize=14, fontweight='bold')
plt.legend(loc='best', fontsize=10)
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()

Annotated PCA Plot

# PCA plot with sample labels
fig, ax = plt.subplots(figsize=(12, 8))

for group in pca_df['Group'].unique():
    mask = pca_df['Group'] == group
    ax.scatter(
        pca_df.loc[mask, 'PC1'],
        pca_df.loc[mask, 'PC2'],
        label=f'Group {group}',
        color=colors[group],
        s=150, alpha=0.6, edgecolors='black', linewidths=1.5
    )

# Add sample labels
for idx, row in pca_df.iterrows():
    ax.annotate(
        row['Sample'],
        (row['PC1'], row['PC2']),
        fontsize=8, alpha=0.7,
        xytext=(5, 5), textcoords='offset points'
    )

ax.set_xlabel(f'PC1 ({variance_explained[0]:.2f}%)', fontsize=12)
ax.set_ylabel(f'PC2 ({variance_explained[1]:.2f}%)', fontsize=12)
ax.set_title('Annotated PCA Plot', fontsize=14, fontweight='bold')
ax.legend(loc='best', fontsize=10)
ax.grid(alpha=0.3)
ax.axhline(y=0, color='gray', linestyle='--', alpha=0.3)
ax.axvline(x=0, color='gray', linestyle='--', alpha=0.3)
plt.tight_layout()
plt.show()

Multiple PC Pairs

# Plot multiple PC combinations
fig, axes = plt.subplots(2, 2, figsize=(14, 12))

pc_pairs = [(1, 2), (1, 3), (2, 3), (3, 4)]

for idx, (pc_x, pc_y) in enumerate(pc_pairs):
    ax = axes[idx // 2, idx % 2]

    for group in pca_df['Group'].unique():
        mask = pca_df['Group'] == group
        ax.scatter(
            pca_df.loc[mask, f'PC{pc_x}'],
            pca_df.loc[mask, f'PC{pc_y}'],
            label=f'Group {group}',
            color=colors[group],
            s=100, alpha=0.6, edgecolors='black'
        )

    ax.set_xlabel(f'PC{pc_x} ({variance_explained[pc_x-1]:.2f}%)', fontsize=11)
    ax.set_ylabel(f'PC{pc_y} ({variance_explained[pc_y-1]:.2f}%)', fontsize=11)
    ax.set_title(f'PC{pc_x} vs PC{pc_y}', fontsize=12, fontweight='bold')
    ax.legend(fontsize=9)
    ax.grid(alpha=0.3)

plt.tight_layout()
plt.show()

3D PCA Plot

from mpl_toolkits.mplot3d import Axes3D

# 3D PCA visualization
fig = plt.figure(figsize=(12, 9))
ax = fig.add_subplot(111, projection='3d')

for group in pca_df['Group'].unique():
    mask = pca_df['Group'] == group
    ax.scatter(
        pca_df.loc[mask, 'PC1'],
        pca_df.loc[mask, 'PC2'],
        pca_df.loc[mask, 'PC3'],
        label=f'Group {group}',
        color=colors[group],
        s=100, alpha=0.6, edgecolors='black'
    )

ax.set_xlabel(f'PC1 ({variance_explained[0]:.2f}%)', fontsize=11)
ax.set_ylabel(f'PC2 ({variance_explained[1]:.2f}%)', fontsize=11)
ax.set_zlabel(f'PC3 ({variance_explained[2]:.2f}%)', fontsize=11)
ax.set_title('3D PCA Plot', fontsize=14, fontweight='bold')
ax.legend()
plt.tight_layout()
plt.show()

Loading Analysis

Understanding Loadings

Loadings show which genes contribute most to each PC:

# Extract loadings
loadings = pd.DataFrame(
    pca.components_.T,
    columns=[f'PC{i+1}' for i in range(n_components)],
    index=df.index
)

print("Loadings shape:", loadings.shape)
print("\nTop 5 genes for PC1:")
print(loadings['PC1'].abs().nlargest(5))

Loading Plot

# Plot loadings for PC1 and PC2
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# PC1 loadings
top_pc1 = loadings['PC1'].abs().nlargest(20)
colors_pc1 = ['red' if x > 0 else 'blue' for x in loadings.loc[top_pc1.index, 'PC1']]

ax1.barh(range(len(top_pc1)), loadings.loc[top_pc1.index, 'PC1'],
         color=colors_pc1, alpha=0.7)
ax1.set_yticks(range(len(top_pc1)))
ax1.set_yticklabels(top_pc1.index, fontsize=8)
ax1.set_xlabel('Loading', fontsize=11)
ax1.set_title('Top 20 Genes - PC1 Loadings', fontsize=12, fontweight='bold')
ax1.axvline(x=0, color='black', linestyle='-', linewidth=0.5)
ax1.grid(alpha=0.3, axis='x')

# PC2 loadings
top_pc2 = loadings['PC2'].abs().nlargest(20)
colors_pc2 = ['red' if x > 0 else 'blue' for x in loadings.loc[top_pc2.index, 'PC2']]

ax2.barh(range(len(top_pc2)), loadings.loc[top_pc2.index, 'PC2'],
         color=colors_pc2, alpha=0.7)
ax2.set_yticks(range(len(top_pc2)))
ax2.set_yticklabels(top_pc2.index, fontsize=8)
ax2.set_xlabel('Loading', fontsize=11)
ax2.set_title('Top 20 Genes - PC2 Loadings', fontsize=12, fontweight='bold')
ax2.axvline(x=0, color='black', linestyle='-', linewidth=0.5)
ax2.grid(alpha=0.3, axis='x')

plt.tight_layout()
plt.show()

Biplot

# Biplot: samples (scores) + genes (loadings)
def biplot(scores, loadings, labels, n_arrows=10):
    """Create PCA biplot"""
    fig, ax = plt.subplots(figsize=(12, 10))

    # Plot samples
    for group in pca_df['Group'].unique():
        mask = pca_df['Group'] == group
        ax.scatter(
            scores[mask, 0], scores[mask, 1],
            label=f'Group {group}',
            color=colors[group],
            s=100, alpha=0.6, edgecolors='black'
        )

    # Plot loading vectors for top genes
    top_genes_idx = np.argsort(np.abs(loadings[:, 0]) + np.abs(loadings[:, 1]))[-n_arrows:]

    scale = max(np.abs(scores).max(axis=0))

    for idx in top_genes_idx:
        ax.arrow(0, 0,
                 loadings[idx, 0] * scale * 0.8,
                 loadings[idx, 1] * scale * 0.8,
                 color='red', alpha=0.5,
                 head_width=0.1, head_length=0.1, linewidth=1.5)
        ax.text(loadings[idx, 0] * scale * 0.85,
                loadings[idx, 1] * scale * 0.85,
                labels[idx], fontsize=9, color='darkred')

    ax.set_xlabel(f'PC1 ({variance_explained[0]:.2f}%)', fontsize=12)
    ax.set_ylabel(f'PC2 ({variance_explained[1]:.2f}%)', fontsize=12)
    ax.set_title('PCA Biplot', fontsize=14, fontweight='bold')
    ax.legend()
    ax.grid(alpha=0.3)
    ax.axhline(y=0, color='gray', linestyle='--', alpha=0.3)
    ax.axvline(x=0, color='gray', linestyle='--', alpha=0.3)
    plt.tight_layout()
    plt.show()

biplot(pca_coords, pca.components_.T, df.index, n_arrows=15)

Identifying Outliers

# Calculate Hotelling's T-squared statistic
from scipy.stats import chi2

def hotellings_t2(pca_scores, n_components=2):
    """Calculate Hotelling's T-squared for outlier detection"""
    scores = pca_scores[:, :n_components]
    mean = scores.mean(axis=0)
    cov = np.cov(scores.T)
    inv_cov = np.linalg.inv(cov)

    t2 = []
    for score in scores:
        diff = score - mean
        t2.append(diff @ inv_cov @ diff.T)

    return np.array(t2)

# Calculate T-squared
t2_stats = hotellings_t2(pca_coords, n_components=2)

# Chi-squared critical value (95% confidence)
critical_value = chi2.ppf(0.95, df=2)

# Identify outliers
outliers = t2_stats > critical_value

# Plot with outliers highlighted
plt.figure(figsize=(10, 7))

# Normal samples
mask_normal = ~outliers
plt.scatter(
    pca_df.loc[mask_normal, 'PC1'],
    pca_df.loc[mask_normal, 'PC2'],
    c='steelblue', s=100, alpha=0.6,
    label='Normal', edgecolors='black'
)

# Outliers
mask_outlier = outliers
plt.scatter(
    pca_df.loc[mask_outlier, 'PC1'],
    pca_df.loc[mask_outlier, 'PC2'],
    c='red', s=150, alpha=0.8,
    label='Outliers', edgecolors='black', marker='X'
)

plt.xlabel(f'PC1 ({variance_explained[0]:.2f}%)', fontsize=12)
plt.ylabel(f'PC2 ({variance_explained[1]:.2f}%)', fontsize=12)
plt.title('PCA with Outlier Detection', fontsize=14, fontweight='bold')
plt.legend()
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()

print(f"Outliers detected: {outliers.sum()}")
print(f"Outlier samples: {pca_df.loc[outliers, 'Sample'].tolist()}")

Batch Effect Detection

# Color by batch instead of group
plt.figure(figsize=(10, 7))

batch_colors = {'1': '#E63946', '2': '#457B9D'}

for batch in pca_df['Batch'].unique():
    mask = pca_df['Batch'] == batch
    plt.scatter(
        pca_df.loc[mask, 'PC1'],
        pca_df.loc[mask, 'PC2'],
        label=f'Batch {batch}',
        color=batch_colors[batch],
        s=100, alpha=0.6, edgecolors='black'
    )

plt.xlabel(f'PC1 ({variance_explained[0]:.2f}%)', fontsize=12)
plt.ylabel(f'PC2 ({variance_explained[1]:.2f}%)', fontsize=12)
plt.title('PCA - Checking for Batch Effects', fontsize=14, fontweight='bold')
plt.legend()
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()

# Statistical test for batch effect
from scipy.stats import f_oneway

pc1_batch1 = pca_df[pca_df['Batch'] == '1']['PC1']
pc1_batch2 = pca_df[pca_df['Batch'] == '2']['PC1']

f_stat, p_value = f_oneway(pc1_batch1, pc1_batch2)

print("PC1 batch effect test:")
print(f"  F-statistic: {f_stat:.4f}")
print(f"  P-value: {p_value:.4f}")

if p_value < 0.05:
    print("  ⚠️ Significant batch effect detected!")

Exporting Results

# Save PCA coordinates
pca_df.to_csv('pca_coordinates.csv', index=False)

# Save loadings
loadings.to_csv('pca_loadings.csv')

# Save variance explained
variance_df = pd.DataFrame({
    'PC': [f'PC{i+1}' for i in range(n_components)],
    'Variance_Explained': variance_explained,
    'Cumulative_Variance': cumulative_variance
})
variance_df.to_csv('pca_variance.csv', index=False)

print("PCA results saved!")

PCA Best Practices:

  • Always standardize the data before PCA
  • Check the variance explained - aim for 70-80% with the first few PCs (see the sketch after this list)
  • Use PCA for visualization and exploration, not as the final analysis
  • Investigate outliers - they may be biological or technical
  • Consider batch effects
  • Validate patterns against known biology
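
As a quick way to apply the variance-explained guideline above, the short sketch below (assuming the cumulative_variance array computed in the Variance Explained step; the 80% threshold is an arbitrary choice for illustration) reports how many PCs are needed to reach it:

import numpy as np

threshold = 80  # percent; arbitrary choice for this example

if cumulative_variance[-1] < threshold:
    print(f"All {len(cumulative_variance)} computed PCs explain only "
          f"{cumulative_variance[-1]:.1f}% of the variance")
else:
    # First PC at which the cumulative variance crosses the threshold
    n_pcs_needed = int(np.argmax(cumulative_variance >= threshold)) + 1
    print(f"{n_pcs_needed} PCs explain at least {threshold}% of the variance")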

Common Mistakes:

  • Not scaling the data (biases PCA toward high-variance genes; see the sketch after this list)
  • Over-interpreting minor PCs
  • Assuming PC1 reflects biology (it may reflect technical variation)
  • Ignoring the percentage of variance explained
  • Using too few samples (needs n_samples > n_PCs)
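
To illustrate the scaling mistake above, here is a small self-contained sketch on toy data (hypothetical, not part of the workflow above) comparing PC1's variance share with and without standardization when one feature sits on a much larger scale than the rest:

import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Toy matrix: 50 samples x 5 features; one feature is on a much larger scale
rng = np.random.default_rng(1)
X = rng.normal(size=(50, 5))
X[:, 0] *= 100  # this single high-variance feature dominates unscaled PCA

pc1_raw = PCA(n_components=2).fit(X).explained_variance_ratio_[0]
pc1_scaled = PCA(n_components=2).fit(
    StandardScaler().fit_transform(X)
).explained_variance_ratio_[0]

print(f"PC1 variance share without scaling: {pc1_raw:.2f}")    # close to 1.00
print(f"PC1 variance share with scaling:    {pc1_scaled:.2f}")  # spread more evenly (~0.2-0.3)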
