
Clustering Methods

Clustering groups similar samples or genes based on expression patterns. This unsupervised learning approach reveals natural groupings in data without predefined labels.

Why Clustering in Bioinformatics?

Clustering helps:

  • Identify sample subtypes or patient groups
  • Find co-expressed genes
  • Detect batch effects
  • Validate experimental design
  • Discover biomarker signatures

Hierarchical Clustering

Hierarchical clustering builds a tree (dendrogram) showing relationships between samples.

Basic Implementation

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
from scipy.spatial.distance import pdist, squareform

# Generate sample gene expression data
np.random.seed(42)
n_genes = 100
n_samples = 20

# Create three distinct groups
group1 = np.random.normal(5, 1, (n_genes, 7))
group2 = np.random.normal(7, 1, (n_genes, 7))
group3 = np.random.normal(6, 1.5, (n_genes, 6))

expr_data = np.concatenate([group1, group2, group3], axis=1)

# Create DataFrame
genes = [f'Gene_{i}' for i in range(n_genes)]
samples = [f'Sample_{i}' for i in range(n_samples)]
df = pd.DataFrame(expr_data, index=genes, columns=samples)

print(f"Expression matrix: {df.shape}")
print(df.head())

Sample Clustering

from scipy.cluster.hierarchy import dendrogram, linkage

# Compute linkage matrix
# Transpose so samples are rows
linkage_matrix = linkage(df.T, method='average', metric='correlation')

# Plot dendrogram
plt.figure(figsize=(12, 6))
dendrogram(
    linkage_matrix,
    labels=df.columns,
    leaf_font_size=10,
    color_threshold=0.7 * max(linkage_matrix[:, 2])
)
plt.title('Hierarchical Clustering of Samples', fontsize=14, fontweight='bold')
plt.xlabel('Sample', fontsize=12)
plt.ylabel('Distance', fontsize=12)
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

Different Linkage Methods

# Average linkage (UPGMA)
linkage_avg = linkage(df.T, method='average', metric='euclidean')

fig, ax = plt.subplots(figsize=(10, 5))
dendrogram(linkage_avg, labels=df.columns, ax=ax)
ax.set_title('Average Linkage')
plt.tight_layout()
plt.show()
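
The block above uses only average linkage; other rules such as complete and Ward linkage often produce tighter, more balanced clusters. A minimal sketch comparing the three on the same data (note that Ward linkage requires Euclidean distances):

# Compare common linkage rules side by side on the sample data
fig, axes = plt.subplots(1, 3, figsize=(16, 5))
for ax, method in zip(axes, ['average', 'complete', 'ward']):
    lm = linkage(df.T, method=method, metric='euclidean')
    dendrogram(lm, labels=df.columns.tolist(), ax=ax)
    ax.set_title(f'{method.capitalize()} linkage')
plt.tight_layout()
plt.show()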

Cutting Dendrogram into Clusters

from scipy.cluster.hierarchy import fcluster

# Cut dendrogram at a specific height
clusters = fcluster(linkage_matrix, t=1.5, criterion='distance')

# Or specify the number of clusters
n_clusters = 3
clusters = fcluster(linkage_matrix, t=n_clusters, criterion='maxclust')

# Create cluster assignment DataFrame
cluster_df = pd.DataFrame({
    'Sample': df.columns,
    'Cluster': clusters
})

print("Cluster assignments:")
print(cluster_df)

# Count samples per cluster
print("\nSamples per cluster:")
print(cluster_df['Cluster'].value_counts().sort_index())

Visualizing Clusters

# Plot dendrogram with cluster colors
from scipy.cluster.hierarchy import dendrogram, set_link_color_palette

# Set color palette
set_link_color_palette(['#FF6B6B', '#4ECDC4', '#45B7D1'])

plt.figure(figsize=(12, 6))
dendro = dendrogram(
    linkage_matrix,
    labels=df.columns,
    color_threshold=1.5,
    above_threshold_color='gray'
)
plt.title('Hierarchical Clustering with Cluster Coloring', fontsize=14, fontweight='bold')
plt.xlabel('Sample')
plt.ylabel('Distance')
plt.axhline(y=1.5, color='red', linestyle='--', label='Cut threshold')
plt.legend()
plt.tight_layout()
plt.show()

Gene Clustering

# Cluster genes instead of samples
gene_linkage = linkage(df, method='average', metric='correlation')

plt.figure(figsize=(10, 12))
dendrogram(
    gene_linkage,
    labels=df.index,
    orientation='left',
    leaf_font_size=6
)
plt.title('Hierarchical Clustering of Genes', fontsize=14)
plt.xlabel('Distance')
plt.ylabel('Gene')
plt.tight_layout()
plt.show()

# Identify gene clusters
gene_clusters = fcluster(gene_linkage, t=5, criterion='maxclust')
gene_cluster_df = pd.DataFrame({
    'Gene': df.index,
    'Cluster': gene_clusters
})
print("Genes per cluster:")
print(gene_cluster_df['Cluster'].value_counts())

K-means Clustering

K-means partitions data into K clusters by minimizing within-cluster variance.

Basic K-means

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Standardize data (important for K-means)
scaler = StandardScaler()
expr_scaled = scaler.fit_transform(df.T)

# Perform K-means
n_clusters = 3
kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
clusters = kmeans.fit_predict(expr_scaled)

# Create results DataFrame
results = pd.DataFrame({
    'Sample': df.columns,
    'Cluster': clusters
})

print("K-means cluster assignments:")
print(results)
print("\nSamples per cluster:")
print(results['Cluster'].value_counts().sort_index())
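
To tie the code back to the objective, a small sanity check (assuming expr_scaled, kmeans, clusters, and n_clusters from the block above): the within-cluster sum of squares computed by hand should match kmeans.inertia_.

import numpy as np

# Sum of squared distances of each sample to its assigned centroid
wcss = sum(
    np.sum((expr_scaled[clusters == k] - kmeans.cluster_centers_[k]) ** 2)
    for k in range(n_clusters)
)
print(f"Manual WCSS: {wcss:.2f}")
print(f"kmeans.inertia_: {kmeans.inertia_:.2f}")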

Determining Optimal K

Elbow Method

from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Test different numbers of clusters
inertias = []
K_range = range(2, 11)

for k in K_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(expr_scaled)
    inertias.append(kmeans.inertia_)

# Plot elbow curve
plt.figure(figsize=(10, 5))
plt.plot(K_range, inertias, 'bo-', linewidth=2, markersize=8)
plt.xlabel('Number of Clusters (K)', fontsize=12)
plt.ylabel('Within-Cluster Sum of Squares (Inertia)', fontsize=12)
plt.title('Elbow Method for Optimal K', fontsize=14, fontweight='bold')
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()

Silhouette Analysis

from sklearn.metrics import silhouette_score, silhouette_samples
import matplotlib.pyplot as plt
import numpy as np

# Calculate silhouette scores
silhouette_scores = []
K_range = range(2, 11)

for k in K_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    clusters = kmeans.fit_predict(expr_scaled)
    score = silhouette_score(expr_scaled, clusters)
    silhouette_scores.append(score)

# Plot silhouette scores
plt.figure(figsize=(10, 5))
plt.plot(K_range, silhouette_scores, 'ro-', linewidth=2, markersize=8)
plt.xlabel('Number of Clusters (K)', fontsize=12)
plt.ylabel('Silhouette Score', fontsize=12)
plt.title('Silhouette Analysis for Optimal K', fontsize=14, fontweight='bold')
plt.grid(alpha=0.3)
plt.axhline(y=0.5, color='green', linestyle='--', alpha=0.5, label='Good separation')
plt.legend()
plt.tight_layout()
plt.show()

print(f"Best K (highest silhouette score): {K_range[np.argmax(silhouette_scores)]}")

Silhouette Plot

from sklearn.metrics import silhouette_samples
import matplotlib.cm as cm

# Perform clustering with chosen K
n_clusters = 3
kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
cluster_labels = kmeans.fit_predict(expr_scaled)

# Calculate silhouette values
silhouette_vals = silhouette_samples(expr_scaled, cluster_labels)
silhouette_avg = silhouette_score(expr_scaled, cluster_labels)

# Create silhouette plot
fig, ax = plt.subplots(figsize=(10, 7))
y_lower = 10

for i in range(n_clusters):
    # Get silhouette values for this cluster
    cluster_silhouette_vals = silhouette_vals[cluster_labels == i]
    cluster_silhouette_vals.sort()

    size_cluster_i = cluster_silhouette_vals.shape[0]
    y_upper = y_lower + size_cluster_i

    color = cm.nipy_spectral(float(i) / n_clusters)
    ax.fill_betweenx(np.arange(y_lower, y_upper), 0, cluster_silhouette_vals,
                     facecolor=color, edgecolor=color, alpha=0.7)

    # Label clusters
    ax.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
    y_lower = y_upper + 10

ax.set_title('Silhouette Plot', fontsize=14, fontweight='bold')
ax.set_xlabel('Silhouette Coefficient', fontsize=12)
ax.set_ylabel('Cluster', fontsize=12)
ax.axvline(x=silhouette_avg, color="red", linestyle="--",
           label=f'Average: {silhouette_avg:.3f}')
ax.set_xlim([-0.1, 1])
ax.legend()
plt.tight_layout()
plt.show()

K-means Visualization

from sklearn.decomposition import PCA

# Reduce to 2D for visualization
pca = PCA(n_components=2)
pca_coords = pca.fit_transform(expr_scaled)

# Plot clusters
plt.figure(figsize=(10, 7))
scatter = plt.scatter(pca_coords[:, 0], pca_coords[:, 1],
                      c=cluster_labels, cmap='viridis',
                      s=100, alpha=0.6, edgecolors='black')

# Plot centroids
centroids_pca = pca.transform(kmeans.cluster_centers_)
plt.scatter(centroids_pca[:, 0], centroids_pca[:, 1],
            c='red', marker='X', s=300, edgecolors='black',
            linewidths=2, label='Centroids')

# Annotate samples
for i, txt in enumerate(df.columns):
    plt.annotate(txt, (pca_coords[i, 0], pca_coords[i, 1]),
                 fontsize=8, alpha=0.7)

plt.xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.1%})', fontsize=12)
plt.ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.1%})', fontsize=12)
plt.title('K-means Clustering Visualization (PCA)', fontsize=14, fontweight='bold')
plt.colorbar(scatter, label='Cluster')
plt.legend()
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()

Heatmap with Clustering

Combine heatmaps with hierarchical clustering:

import seaborn as sns

# Select top variable genes
gene_vars = df.var(axis=1)
top_genes = gene_vars.nlargest(50).index
df_subset = df.loc[top_genes]

# Create clustermap
g = sns.clustermap(
    df_subset,
    method='average',
    metric='correlation',
    cmap='RdBu_r',
    center=0,
    z_score=0,  # Standardize genes (rows)
    figsize=(12, 10),
    cbar_kws={'label': 'Z-score'},
    dendrogram_ratio=0.15,
    row_cluster=True,
    col_cluster=True
)
g.ax_heatmap.set_xlabel('Samples', fontsize=12)
g.ax_heatmap.set_ylabel('Genes', fontsize=12)
plt.suptitle('Gene Expression Heatmap with Hierarchical Clustering',
             y=0.98, fontsize=14, fontweight='bold')
plt.savefig('clustermap.png', dpi=300, bbox_inches='tight')
plt.show()

Adding Annotations

# Create sample annotations
sample_groups = pd.Series(
    ['Group1'] * 7 + ['Group2'] * 7 + ['Group3'] * 6,
    index=df.columns,
    name='Group'
)

# Color mapping
colors = {'Group1': '#FF6B6B', 'Group2': '#4ECDC4', 'Group3': '#45B7D1'}
col_colors = sample_groups.map(colors)

# Clustermap with annotations
g = sns.clustermap(
    df_subset,
    method='average',
    metric='correlation',
    cmap='viridis',
    figsize=(12, 10),
    col_colors=col_colors,
    cbar_kws={'label': 'Expression'},
    dendrogram_ratio=0.15
)
plt.suptitle('Annotated Clustermap', y=0.98, fontsize=14, fontweight='bold')
plt.show()

Consensus Clustering

Consensus clustering assesses how stable cluster assignments are: samples are repeatedly subsampled and re-clustered, and pairs that consistently land in the same cluster receive a consensus value close to 1.

from sklearn.cluster import KMeans
import numpy as np

def consensus_clustering(data, n_clusters, n_iterations=100):
    """
    Perform consensus clustering.

    Parameters
    ----------
    data : array-like
        Data matrix (samples x features)
    n_clusters : int
        Number of clusters
    n_iterations : int
        Number of clustering iterations

    Returns
    -------
    consensus_matrix : array
        Consensus matrix showing co-clustering frequency
    """
    n_samples = data.shape[0]
    consensus_matrix = np.zeros((n_samples, n_samples))

    for i in range(n_iterations):
        # Subsample data (80%)
        indices = np.random.choice(n_samples, size=int(0.8 * n_samples), replace=False)
        subset = data[indices]

        # Cluster the subsample
        kmeans = KMeans(n_clusters=n_clusters, random_state=i, n_init=10)
        labels = kmeans.fit_predict(subset)

        # Update consensus matrix for every pair clustered together
        for j in range(len(indices)):
            for k in range(j + 1, len(indices)):
                if labels[j] == labels[k]:
                    consensus_matrix[indices[j], indices[k]] += 1
                    consensus_matrix[indices[k], indices[j]] += 1

    # Normalize by the total number of iterations
    # (a simple approximation; strict consensus clustering divides each pair
    # by the number of iterations in which both samples were subsampled)
    consensus_matrix /= n_iterations

    return consensus_matrix

# Perform consensus clustering
consensus_mat = consensus_clustering(expr_scaled, n_clusters=3, n_iterations=50)

# Visualize consensus matrix
plt.figure(figsize=(10, 8))
sns.heatmap(consensus_mat, cmap='YlOrRd',
            xticklabels=df.columns, yticklabels=df.columns,
            cbar_kws={'label': 'Co-clustering Frequency'})
plt.title('Consensus Clustering Matrix', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

Comparing Clustering Methods

from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score

# Perform different clustering methods
# Hierarchical
h_clusters = fcluster(linkage_matrix, t=3, criterion='maxclust')

# K-means
k_clusters = KMeans(n_clusters=3, random_state=42, n_init=10).fit_predict(expr_scaled)

# Compare results
ari = adjusted_rand_score(h_clusters, k_clusters)
nmi = normalized_mutual_info_score(h_clusters, k_clusters)

print("Agreement between methods:")
print(f"  Adjusted Rand Index: {ari:.3f}")
print(f"  Normalized Mutual Information: {nmi:.3f}")

# Visualize comparison
comparison_df = pd.DataFrame({
    'Sample': df.columns,
    'Hierarchical': h_clusters,
    'K-means': k_clusters
})
print("\nCluster assignments:")
print(comparison_df)
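
To see where the two partitions overlap sample by sample, a contingency table is often easier to read than the raw assignments. A short sketch using h_clusters and k_clusters from above (the numeric labels differ between methods; only the groupings matter):

# Cross-tabulate hierarchical vs. K-means assignments
agreement_table = pd.crosstab(
    pd.Series(h_clusters, name='Hierarchical'),
    pd.Series(k_clusters, name='K-means')
)
print(agreement_table)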

Gene Co-expression Modules

from scipy.cluster.hierarchy import fcluster

# Cluster genes
gene_linkage = linkage(df, method='average', metric='correlation')
gene_modules = fcluster(gene_linkage, t=5, criterion='maxclust')

# Assign genes to modules
gene_module_df = pd.DataFrame({
    'Gene': df.index,
    'Module': gene_modules
})

# Calculate module eigengenes (first principal component)
from sklearn.decomposition import PCA

module_eigengenes = {}
for module in range(1, gene_modules.max() + 1):
    module_genes = gene_module_df[gene_module_df['Module'] == module]['Gene']
    module_expr = df.loc[module_genes]

    # Get first PC as eigengene
    pca = PCA(n_components=1)
    eigengene = pca.fit_transform(module_expr.T).flatten()
    module_eigengenes[f'Module_{module}'] = eigengene

# Create eigengene DataFrame
eigengene_df = pd.DataFrame(module_eigengenes, index=df.columns)

# Plot eigengenes
fig, axes = plt.subplots(len(module_eigengenes), 1,
                         figsize=(12, 3 * len(module_eigengenes)))
for i, (module, eigengene) in enumerate(module_eigengenes.items()):
    axes[i].plot(eigengene, 'o-', linewidth=2)
    axes[i].set_ylabel('Eigengene', fontsize=10)
    axes[i].set_title(module, fontsize=12)
    axes[i].grid(alpha=0.3)
    axes[i].set_xticks(range(len(df.columns)))
    axes[i].set_xticklabels(df.columns, rotation=45, ha='right')

plt.tight_layout()
plt.show()

Clustering Best Practices:

  • Standardize features before clustering
  • Try multiple methods and compare
  • Assess cluster stability
  • Consider biological interpretation
  • Validate with external data when possible

Common Pitfalls:

  • Not scaling data (especially for K-means)
  • Choosing K arbitrarily
  • Over-interpreting small clusters
  • Ignoring batch effects
  • Clustering without filtering low-variance genes (see the preprocessing sketch after this list)
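
A minimal preprocessing sketch covering the scaling and variance-filtering points above; the 25% variance cutoff and choice of StandardScaler are illustrative assumptions, not fixed recommendations:

from sklearn.preprocessing import StandardScaler

# Drop the least variable genes before clustering (illustrative 25% cutoff)
gene_vars = df.var(axis=1)
df_filtered = df.loc[gene_vars > gene_vars.quantile(0.25)]

# Standardize so genes on large absolute scales do not dominate distances
scaler = StandardScaler()
expr_for_clustering = scaler.fit_transform(df_filtered.T)  # samples x genes
print(f"Genes kept: {df_filtered.shape[0]} of {df.shape[0]}")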

Next Steps

  • PCA - Dimensionality reduction for clustering
  • t-SNE and UMAP - Non-linear dimensionality reduction
  • Heatmaps - Advanced heatmap visualization