Clustering Methods
Clustering groups similar samples or genes based on expression patterns. This unsupervised learning approach reveals natural groupings in data without predefined labels.
Why Clustering in Bioinformatics?
Clustering helps:
- Identify sample subtypes or patient groups
- Find co-expressed genes
- Detect batch effects
- Validate experimental design
- Discover biomarker signatures
Hierarchical Clustering
Hierarchical clustering builds a tree (dendrogram) showing relationships between samples.
Basic Implementation
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
from scipy.spatial.distance import pdist, squareform
# Generate sample gene expression data
np.random.seed(42)
n_genes = 100
n_samples = 20
# Create three distinct groups
group1 = np.random.normal(5, 1, (n_genes, 7))
group2 = np.random.normal(7, 1, (n_genes, 7))
group3 = np.random.normal(6, 1.5, (n_genes, 6))
expr_data = np.concatenate([group1, group2, group3], axis=1)
# Create DataFrame
genes = [f'Gene_{i}' for i in range(n_genes)]
samples = [f'Sample_{i}' for i in range(n_samples)]
df = pd.DataFrame(expr_data, index=genes, columns=samples)
print(f"Expression matrix: {df.shape}")
print(df.head())

Sample Clustering
from scipy.cluster.hierarchy import dendrogram, linkage
# Compute linkage matrix
# Transpose so samples are rows
linkage_matrix = linkage(df.T, method='average', metric='correlation')
# Plot dendrogram
plt.figure(figsize=(12, 6))
dendrogram(
    linkage_matrix,
    labels=df.columns,
    leaf_font_size=10,
    color_threshold=0.7 * max(linkage_matrix[:, 2])
)
plt.title('Hierarchical Clustering of Samples', fontsize=14, fontweight='bold')
plt.xlabel('Sample', fontsize=12)
plt.ylabel('Distance', fontsize=12)
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()
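Internally, linkage reduces the data to pairwise distances before building the tree. You can compute and inspect these distances directly with pdist and squareform (imported earlier); a small sketch:

# Condensed pairwise distances between samples (one value per pair),
# expanded to a square matrix for easier inspection
dist_condensed = pdist(df.T, metric='correlation')
dist_matrix = squareform(dist_condensed)
dist_df = pd.DataFrame(dist_matrix, index=df.columns, columns=df.columns)
print(f"Distance matrix: {dist_df.shape}")
print(dist_df.iloc[:5, :5].round(3))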
Different Linkage Methods
The linkage method defines how the distance between two clusters is computed: single uses the nearest pair of points, complete the farthest pair, average (UPGMA) the mean pairwise distance, and Ward the merge that minimizes the increase in within-cluster variance. Average linkage is shown here; a side-by-side comparison of the others follows the example.
Average
# Average linkage (UPGMA)
linkage_avg = linkage(df.T, method='average', metric='euclidean')
fig, ax = plt.subplots(figsize=(10, 5))
dendrogram(linkage_avg, labels=df.columns, ax=ax)
ax.set_title('Average Linkage')
plt.tight_layout()
plt.show()
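Because the choice of linkage can change the tree considerably, it helps to view several methods side by side. A minimal sketch (Ward requires Euclidean distances, so that metric is used throughout):

# Compare common linkage criteria on the same data
fig, axes = plt.subplots(2, 2, figsize=(14, 8))
for ax, method in zip(axes.ravel(), ['single', 'complete', 'average', 'ward']):
    lm = linkage(df.T, method=method, metric='euclidean')
    dendrogram(lm, labels=df.columns.tolist(), ax=ax, leaf_font_size=8)
    ax.set_title(f'{method.capitalize()} linkage')
plt.tight_layout()
plt.show()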
Cutting Dendrogram into Clusters
from scipy.cluster.hierarchy import fcluster
# Cut dendrogram at specific height
clusters = fcluster(linkage_matrix, t=1.5, criterion='distance')
# Or specify number of clusters
n_clusters = 3
clusters = fcluster(linkage_matrix, t=n_clusters, criterion='maxclust')
# Create cluster assignment DataFrame
cluster_df = pd.DataFrame({
    'Sample': df.columns,
    'Cluster': clusters
})
print("Cluster assignments:")
print(cluster_df)
# Count samples per cluster
print("\nSamples per cluster:")
print(cluster_df['Cluster'].value_counts().sort_index())

Visualizing Clusters
# Plot dendrogram with cluster colors
from scipy.cluster.hierarchy import dendrogram, set_link_color_palette
# Set color palette
set_link_color_palette(['#FF6B6B', '#4ECDC4', '#45B7D1'])
plt.figure(figsize=(12, 6))
dendro = dendrogram(
    linkage_matrix,
    labels=df.columns,
    color_threshold=1.5,
    above_threshold_color='gray'
)
plt.title('Hierarchical Clustering with Cluster Coloring',
          fontsize=14, fontweight='bold')
plt.xlabel('Sample')
plt.ylabel('Distance')
plt.axhline(y=1.5, color='red', linestyle='--', label='Cut threshold')
plt.legend()
plt.tight_layout()
plt.show()

Gene Clustering
# Cluster genes instead of samples
gene_linkage = linkage(df, method='average', metric='correlation')
plt.figure(figsize=(10, 12))
dendrogram(
    gene_linkage,
    labels=df.index,
    orientation='left',
    leaf_font_size=6
)
plt.title('Hierarchical Clustering of Genes', fontsize=14)
plt.xlabel('Distance')
plt.ylabel('Gene')
plt.tight_layout()
plt.show()
# Identify gene clusters
gene_clusters = fcluster(gene_linkage, t=5, criterion='maxclust')
gene_cluster_df = pd.DataFrame({
    'Gene': df.index,
    'Cluster': gene_clusters
})
print(f"Genes per cluster:")
print(gene_cluster_df['Cluster'].value_counts())K-means Clustering
K-means partitions data into K clusters by minimizing within-cluster variance.
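Formally, for clusters $C_1, \dots, C_K$ with centroids $\mu_k$, K-means minimizes the inertia

$$\sum_{k=1}^{K} \sum_{x_i \in C_k} \lVert x_i - \mu_k \rVert^2,$$

the total squared distance from each point to its assigned cluster centroid.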
Basic K-means
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
# Standardize data (important for K-means)
scaler = StandardScaler()
expr_scaled = scaler.fit_transform(df.T)
# Perform K-means
n_clusters = 3
kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
clusters = kmeans.fit_predict(expr_scaled)
# Create results DataFrame
results = pd.DataFrame({
    'Sample': df.columns,
    'Cluster': clusters
})
print("K-means cluster assignments:")
print(results)
print(f"\nSamples per cluster:")
print(results['Cluster'].value_counts().sort_index())Determining Optimal K
Elbow Method
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
# Test different numbers of clusters
inertias = []
K_range = range(2, 11)
for k in K_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(expr_scaled)
    inertias.append(kmeans.inertia_)
# Plot elbow curve
plt.figure(figsize=(10, 5))
plt.plot(K_range, inertias, 'bo-', linewidth=2, markersize=8)
plt.xlabel('Number of Clusters (K)', fontsize=12)
plt.ylabel('Within-Cluster Sum of Squares (Inertia)', fontsize=12)
plt.title('Elbow Method for Optimal K', fontsize=14, fontweight='bold')
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()

Silhouette Analysis
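For each sample $i$, the silhouette coefficient compares the mean distance to other members of its own cluster, $a(i)$, with the mean distance to the nearest other cluster, $b(i)$:

$$s(i) = \frac{b(i) - a(i)}{\max\{a(i),\, b(i)\}}$$

Values near 1 indicate a well-separated sample, values near 0 indicate overlapping clusters, and negative values suggest possible misassignment.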
from sklearn.metrics import silhouette_score, silhouette_samples
import matplotlib.pyplot as plt
import numpy as np
# Calculate silhouette scores
silhouette_scores = []
K_range = range(2, 11)
for k in K_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    clusters = kmeans.fit_predict(expr_scaled)
    score = silhouette_score(expr_scaled, clusters)
    silhouette_scores.append(score)
# Plot silhouette scores
plt.figure(figsize=(10, 5))
plt.plot(K_range, silhouette_scores, 'ro-', linewidth=2, markersize=8)
plt.xlabel('Number of Clusters (K)', fontsize=12)
plt.ylabel('Silhouette Score', fontsize=12)
plt.title('Silhouette Analysis for Optimal K', fontsize=14, fontweight='bold')
plt.grid(alpha=0.3)
plt.axhline(y=0.5, color='green', linestyle='--', alpha=0.5, label='Good separation')
plt.legend()
plt.tight_layout()
plt.show()
print(f"Best K (highest silhouette score): {K_range[np.argmax(silhouette_scores)]}")Silhouette Plot
from sklearn.metrics import silhouette_samples
import matplotlib.cm as cm
# Perform clustering with chosen K
n_clusters = 3
kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
cluster_labels = kmeans.fit_predict(expr_scaled)
# Calculate silhouette values
silhouette_vals = silhouette_samples(expr_scaled, cluster_labels)
silhouette_avg = silhouette_score(expr_scaled, cluster_labels)
# Create silhouette plot
fig, ax = plt.subplots(figsize=(10, 7))
y_lower = 10
for i in range(n_clusters):
    # Get silhouette values for this cluster
    cluster_silhouette_vals = silhouette_vals[cluster_labels == i]
    cluster_silhouette_vals.sort()
    size_cluster_i = cluster_silhouette_vals.shape[0]
    y_upper = y_lower + size_cluster_i
    color = cm.nipy_spectral(float(i) / n_clusters)
    ax.fill_betweenx(np.arange(y_lower, y_upper),
                     0, cluster_silhouette_vals,
                     facecolor=color, edgecolor=color, alpha=0.7)
    # Label clusters
    ax.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
    y_lower = y_upper + 10
ax.set_title('Silhouette Plot', fontsize=14, fontweight='bold')
ax.set_xlabel('Silhouette Coefficient', fontsize=12)
ax.set_ylabel('Cluster', fontsize=12)
ax.axvline(x=silhouette_avg, color="red", linestyle="--",
           label=f'Average: {silhouette_avg:.3f}')
ax.set_xlim([-0.1, 1])
ax.legend()
plt.tight_layout()
plt.show()

K-means Visualization
from sklearn.decomposition import PCA
# Reduce to 2D for visualization
pca = PCA(n_components=2)
pca_coords = pca.fit_transform(expr_scaled)
# Plot clusters
plt.figure(figsize=(10, 7))
scatter = plt.scatter(pca_coords[:, 0], pca_coords[:, 1],
                      c=cluster_labels, cmap='viridis',
                      s=100, alpha=0.6, edgecolors='black')
# Plot centroids
centroids_pca = pca.transform(kmeans.cluster_centers_)
plt.scatter(centroids_pca[:, 0], centroids_pca[:, 1],
            c='red', marker='X', s=300, edgecolors='black',
            linewidths=2, label='Centroids')
# Annotate samples
for i, txt in enumerate(df.columns):
    plt.annotate(txt, (pca_coords[i, 0], pca_coords[i, 1]),
                 fontsize=8, alpha=0.7)
plt.xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.1%})', fontsize=12)
plt.ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.1%})', fontsize=12)
plt.title('K-means Clustering Visualization (PCA)', fontsize=14, fontweight='bold')
plt.colorbar(scatter, label='Cluster')
plt.legend()
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()

Heatmap with Clustering
Combine heatmaps with hierarchical clustering:
import seaborn as sns
# Select top variable genes
gene_vars = df.var(axis=1)
top_genes = gene_vars.nlargest(50).index
df_subset = df.loc[top_genes]
# Create clustermap
g = sns.clustermap(
    df_subset,
    method='average',
    metric='correlation',
    cmap='RdBu_r',
    center=0,
    z_score=0,  # Standardize genes (rows)
    figsize=(12, 10),
    cbar_kws={'label': 'Z-score'},
    dendrogram_ratio=0.15,
    row_cluster=True,
    col_cluster=True
)
g.ax_heatmap.set_xlabel('Samples', fontsize=12)
g.ax_heatmap.set_ylabel('Genes', fontsize=12)
plt.suptitle('Gene Expression Heatmap with Hierarchical Clustering',
             y=0.98, fontsize=14, fontweight='bold')
plt.tight_layout()
plt.savefig('clustermap.png', dpi=300, bbox_inches='tight')
plt.show()
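If you need the ordering shown in the heatmap (for example, to export genes in dendrogram order), the ClusterGrid object returned by clustermap exposes the reordered row and column indices:

# Row/column order after hierarchical reordering
row_order = g.dendrogram_row.reordered_ind
col_order = g.dendrogram_col.reordered_ind
print("First 5 genes in heatmap order:", df_subset.index[row_order][:5].tolist())
print("Samples in heatmap order:", df_subset.columns[col_order].tolist())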
Adding Annotations
# Create sample annotations
sample_groups = pd.Series(
    ['Group1'] * 7 + ['Group2'] * 7 + ['Group3'] * 6,
    index=df.columns,
    name='Group'
)
# Color mapping
colors = {'Group1': '#FF6B6B', 'Group2': '#4ECDC4', 'Group3': '#45B7D1'}
col_colors = sample_groups.map(colors)
# Clustermap with annotations
g = sns.clustermap(
    df_subset,
    method='average',
    metric='correlation',
    cmap='viridis',
    figsize=(12, 10),
    col_colors=col_colors,
    cbar_kws={'label': 'Expression'},
    dendrogram_ratio=0.15
)
plt.suptitle('Annotated Clustermap', y=0.98, fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

Consensus Clustering
Assess clustering stability:
from sklearn.cluster import KMeans
import numpy as np
def consensus_clustering(data, n_clusters, n_iterations=100):
    """
    Perform consensus clustering.

    Parameters
    ----------
    data : array-like
        Data matrix (samples x features)
    n_clusters : int
        Number of clusters
    n_iterations : int
        Number of clustering iterations

    Returns
    -------
    consensus_matrix : array
        Consensus matrix showing co-clustering frequency
    """
    n_samples = data.shape[0]
    consensus_matrix = np.zeros((n_samples, n_samples))
    # Track how often each pair was drawn together, so frequencies are
    # normalized per pair rather than per iteration (pairs are not present
    # in every subsample)
    cosample_counts = np.zeros((n_samples, n_samples))
    for i in range(n_iterations):
        # Subsample data (80%)
        indices = np.random.choice(n_samples,
                                   size=int(0.8 * n_samples),
                                   replace=False)
        subset = data[indices]
        # Cluster the subsample
        kmeans = KMeans(n_clusters=n_clusters, random_state=i, n_init=10)
        labels = kmeans.fit_predict(subset)
        # Update consensus and co-sampling counts
        for j in range(len(indices)):
            for k in range(j + 1, len(indices)):
                cosample_counts[indices[j], indices[k]] += 1
                cosample_counts[indices[k], indices[j]] += 1
                if labels[j] == labels[k]:
                    consensus_matrix[indices[j], indices[k]] += 1
                    consensus_matrix[indices[k], indices[j]] += 1
    # Normalize by how often each pair was co-sampled (guard against
    # division by zero for pairs never drawn together)
    consensus_matrix = np.divide(consensus_matrix, cosample_counts,
                                 out=np.zeros_like(consensus_matrix),
                                 where=cosample_counts > 0)
    return consensus_matrix
# Perform consensus clustering
consensus_mat = consensus_clustering(expr_scaled, n_clusters=3, n_iterations=50)
# Visualize consensus matrix
plt.figure(figsize=(10, 8))
sns.heatmap(consensus_mat, cmap='YlOrRd',
            xticklabels=df.columns, yticklabels=df.columns,
            cbar_kws={'label': 'Co-clustering Frequency'})
plt.title('Consensus Clustering Matrix', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()
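One way to summarize the matrix is the fraction of sample pairs with ambiguous consensus values; stable, well-separated clusters leave few pairs between the extremes. A rough sketch (the 0.1/0.9 thresholds are a common convention, not a fixed rule):

# Fraction of sample pairs that are neither clearly together (~1)
# nor clearly apart (~0); lower values indicate more stable clustering
iu = np.triu_indices_from(consensus_mat, k=1)
ambiguous = np.mean((consensus_mat[iu] > 0.1) & (consensus_mat[iu] < 0.9))
print(f"Proportion of ambiguous pairs: {ambiguous:.3f}")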
Comparing Clustering Methods
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score
# Perform different clustering methods
# Hierarchical
h_clusters = fcluster(linkage_matrix, t=3, criterion='maxclust')
# K-means
k_clusters = KMeans(n_clusters=3, random_state=42, n_init=10).fit_predict(expr_scaled)
# Compare results
ari = adjusted_rand_score(h_clusters, k_clusters)
nmi = normalized_mutual_info_score(h_clusters, k_clusters)
print(f"Agreement between methods:")
print(f" Adjusted Rand Index: {ari:.3f}")
print(f" Normalized Mutual Information: {nmi:.3f}")
# Visualize comparison
comparison_df = pd.DataFrame({
    'Sample': df.columns,
    'Hierarchical': h_clusters,
    'K-means': k_clusters
})
print("\nCluster assignments:")
print(comparison_df)

Both metrics equal 1 for identical partitions, and the Adjusted Rand Index is near 0 for chance-level agreement, so values well above 0 indicate that the two methods recover similar structure.

Gene Co-expression Modules
from scipy.cluster.hierarchy import fcluster
# Cluster genes
gene_linkage = linkage(df, method='average', metric='correlation')
gene_modules = fcluster(gene_linkage, t=5, criterion='maxclust')
# Assign genes to modules
gene_module_df = pd.DataFrame({
    'Gene': df.index,
    'Module': gene_modules
})
# Calculate module eigengenes (first principal component)
from sklearn.decomposition import PCA
module_eigengenes = {}
for module in range(1, gene_modules.max() + 1):
    module_genes = gene_module_df[gene_module_df['Module'] == module]['Gene']
    module_expr = df.loc[module_genes]
    # Get first PC as eigengene
    pca = PCA(n_components=1)
    eigengene = pca.fit_transform(module_expr.T).flatten()
    module_eigengenes[f'Module_{module}'] = eigengene
# Create eigengene DataFrame
eigengene_df = pd.DataFrame(module_eigengenes, index=df.columns)
# Plot eigengenes
fig, axes = plt.subplots(len(module_eigengenes), 1,
                         figsize=(12, 3 * len(module_eigengenes)))
for i, (module, eigengene) in enumerate(module_eigengenes.items()):
    axes[i].plot(eigengene, 'o-', linewidth=2)
    axes[i].set_ylabel('Eigengene', fontsize=10)
    axes[i].set_title(module, fontsize=12)
    axes[i].grid(alpha=0.3)
    axes[i].set_xticks(range(len(df.columns)))
    axes[i].set_xticklabels(df.columns, rotation=45, ha='right')
plt.tight_layout()
plt.show()
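A common follow-up is to relate eigengenes to sample traits. A minimal sketch, assuming the three simulated groups are coded as a hypothetical ordinal trait (0/1/2); note that the sign of an eigengene is arbitrary, so the magnitude of the correlation matters more than its direction:

# Hypothetical trait coding for the three simulated sample groups
trait = pd.Series([0] * 7 + [1] * 7 + [2] * 6, index=df.columns, name='group')
# Spearman correlation between each module eigengene and the trait
module_trait_corr = eigengene_df.corrwith(trait, method='spearman')
print("Module-trait correlations:")
print(module_trait_corr.round(2))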
Clustering Best Practices:
- Standardize features before clustering
- Try multiple methods and compare
- Assess cluster stability (e.g., consensus clustering above, or the quick resampling check after this list)
- Consider biological interpretation
- Validate with external data when possible
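As a quick complement to consensus clustering, you can check how much the sample partition changes when the genes (features) are bootstrap-resampled. A minimal sketch comparing each resampled run against a reference run with ARI:

from sklearn.metrics import adjusted_rand_score

# Reference partition on the full feature set
base_labels = KMeans(n_clusters=3, random_state=0, n_init=10).fit_predict(expr_scaled)
rng = np.random.default_rng(0)
aris = []
for _ in range(20):
    # Resample genes (columns of the samples-x-genes matrix) with replacement
    cols = rng.choice(expr_scaled.shape[1], size=expr_scaled.shape[1], replace=True)
    labels = KMeans(n_clusters=3, random_state=0, n_init=10).fit_predict(expr_scaled[:, cols])
    aris.append(adjusted_rand_score(base_labels, labels))
print(f"Mean ARI across feature resamples: {np.mean(aris):.3f}")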
Common Pitfalls:
- Not scaling data (especially for K-means)
- Choosing K arbitrarily
- Over-interpreting small clusters
- Ignoring batch effects
- Clustering without filtering low-variance genes (see the filtering sketch below)
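A minimal sketch of the variance filter mentioned above, reusing df from earlier (the 25% cutoff is purely illustrative):

# Drop the least variable genes before clustering; low-variance genes add
# noise to distance calculations without contributing structure
gene_vars = df.var(axis=1)
keep = gene_vars > gene_vars.quantile(0.25)
df_filtered = df.loc[keep]
print(f"Retained {keep.sum()} of {len(keep)} genes after variance filtering")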
Next Steps
- PCA - Dimensionality reduction for clustering
- t-SNE and UMAP - Non-linear dimensionality reduction
- Heatmaps - Advanced heatmap visualization