|
| 1 | +#!/usr/bin/python3 |
| 2 | + |
| 3 | +########################################## |
| 4 | +# HeatCluster-0.4.4 # |
| 5 | +# written by Stephen Beckstrom-Sternberg # |
| 6 | +# Creates SNP heat/cluster maps # |
| 7 | +# from SNP matrices # |
| 8 | +########################################### |
| 9 | + |
| 10 | +import re |
| 11 | +import pandas as pd |
| 12 | +import numpy as np |
| 13 | +import seaborn as sns |
| 14 | +import matplotlib.pyplot as plt |
| 15 | +import fastcluster as sch |
| 16 | +from sklearn.cluster import KMeans |
| 17 | +from sklearn.metrics import silhouette_score |
| 18 | +from io import StringIO |
| 19 | +from pathlib import Path |
| 20 | + |
| 21 | +# Read the SNP matrix file and determine the delimiter |
| 22 | +path = Path('./snp-dists.txt') |
| 23 | +try: |
| 24 | + path.resolve(strict=True) |
| 25 | +except FileNotFoundError: |
| 26 | + path = Path('./snp_matrix.txt') |
| 27 | + |
| 28 | +print("Using file path:", path) |
| 29 | + |
| 30 | +# Read the SNP matrix file into a DataFrame |
| 31 | +with open(path, "r") as infile: |
| 32 | + lines = infile.readlines() # Do NOT Skip the header line |
| 33 | + |
| 34 | +numSamples = len(lines) - 1 # counts data lines |
| 35 | + |
| 36 | +# Combine the cleaned lines into a single string instead of a file |
| 37 | +snp_matrix_string = "\n".join(line for line in lines) |
| 38 | + |
| 39 | +print("\n\n", snp_matrix_string, "\n\n") |
| 40 | + |
| 41 | +# Read the tab-delimited string into a DataFrame |
| 42 | +df = pd.read_csv(StringIO(snp_matrix_string), sep=',') |
| 43 | + |
| 44 | + |
| 45 | +# Remove 'snp-dists 0.8.2,' from the first column name |
| 46 | +# Define the regular expression patterns for replacements |
| 47 | +consensus_pattern = r'snp-dists 0\.8\.2,|\.consensus_threshold_0\.6_quality_20|Consensus_' |
| 48 | + |
| 49 | +# Replace the header row using the pattern |
| 50 | +df.columns = df.columns.str.replace(consensus_pattern, '', regex=True) |
| 51 | + |
| 52 | +# Replace the data rows using the same pattern |
| 53 | +df = df.replace(consensus_pattern, '', regex=True) |
| 54 | + |
| 55 | +df = df.set_index(df.columns[0]) |
| 56 | + |
| 57 | +print("/n/n", df, "\n\n") |
| 58 | +# Define colormap for heatmap |
| 59 | +cmap = 'Reds_r' |
| 60 | +print("Colormap set to '", cmap, "'") |
| 61 | + |
| 62 | +# Vectorized operation to add 'Total_SNPs' column |
| 63 | +df['Total_SNPs'] = df.sum(axis=1) |
| 64 | + |
| 65 | +# Vectorized operation to sort based on 'Total_SNPs' |
| 66 | +df = df.sort_values(by='Total_SNPs', kind='mergesort', axis=0) |
| 67 | + |
| 68 | +# Reorder the columns to mirror the row order |
| 69 | +sorted_cluster_matrix = df.reindex(columns=df.index) |
| 70 | + |
| 71 | +# Check for Missing Values |
| 72 | +if sorted_cluster_matrix.isnull().sum().sum() > 0: |
| 73 | + sorted_cluster_matrix = sorted_cluster_matrix.dropna() |
| 74 | + |
| 75 | +# Check for Non-Finite Values |
| 76 | +if ~np.isfinite(sorted_cluster_matrix).all().all(): |
| 77 | + sorted_cluster_matrix = sorted_cluster_matrix.replace([np.inf, -np.inf], np.nan).dropna() |
| 78 | + |
| 79 | +# Change output figure size tuple based on number of samples |
| 80 | +if numSamples <= 20: |
| 81 | + figureSize = (10, 8) |
| 82 | +elif numSamples <= 40: |
| 83 | + figureSize = (20, 16) |
| 84 | +elif numSamples <= 60: |
| 85 | + figureSize = (30, 24) |
| 86 | +else: |
| 87 | + figureSize = (40, 32) |
| 88 | +print("Number of samples:", numSamples, "\nFigure size:", figureSize) |
| 89 | + |
| 90 | +# Compute clusters |
| 91 | +# Compute clusters using fastcluster |
| 92 | +#clusters = fastcluster.linkage(sorted_cluster_matrix.values, method='complete', metric='euclidean') |
| 93 | +clusters = sch.linkage(sorted_cluster_matrix.values, method='complete', metric='euclidean') |
| 94 | + |
| 95 | +# Create clustermap to get the order of rows and columns based on clustering |
| 96 | +clustergrid = sns.clustermap( |
| 97 | + sorted_cluster_matrix, |
| 98 | + xticklabels=True, |
| 99 | + vmin=0, |
| 100 | + vmax=50, |
| 101 | + center=20, |
| 102 | + annot=True, |
| 103 | + annot_kws={'size': 6}, |
| 104 | + cbar_kws={"pad": 0.5}, |
| 105 | + cmap=cmap, |
| 106 | + linecolor="white", |
| 107 | + linewidths=.2, |
| 108 | + fmt='d', |
| 109 | + dendrogram_ratio=0.1, |
| 110 | + col_cluster=True, |
| 111 | + row_cluster=True, |
| 112 | + figsize=figureSize |
| 113 | +) |
| 114 | +plt.setp(clustergrid.ax_heatmap.get_xticklabels(), rotation=45, ha='right') |
| 115 | + |
| 116 | +# Suppress printing of dendrogram along the y-axis |
| 117 | +clustergrid.ax_row_dendrogram.set_visible(False) |
| 118 | +clustergrid.ax_col_dendrogram.set_visible(True) |
| 119 | + |
| 120 | +row_order = clustergrid.dendrogram_row.reordered_ind |
| 121 | +col_order = row_order |
| 122 | + |
| 123 | +# Sort the DataFrame based on the cluster order |
| 124 | +sorted_df = sorted_cluster_matrix.iloc[row_order, col_order] |
| 125 | + |
| 126 | +# Vectorized operation to calculate 'within_cluster_snps' |
| 127 | +within_cluster_snps = (df[df < 500]).sum(axis=1) |
| 128 | + |
| 129 | +# Computing the number of SNPs within the cluster per row |
| 130 | +within_cluster_snps = sorted_df.apply(lambda row: row[row < 500].sum(), axis=1) |
| 131 | + |
| 132 | +# Add 'Within_Cluster_SNPs' column to the sorted DataFrame |
| 133 | +sorted_df['Within_Cluster_SNPs'] = within_cluster_snps.values |
| 134 | + |
| 135 | +# Calculate silhouette scores for different numbers of clusters |
| 136 | +silhouette_scores = [] |
| 137 | +upper_range = min(numSamples, 11) |
| 138 | +for n_clusters in range(2, upper_range): |
| 139 | + kmeans = KMeans(n_clusters=n_clusters, n_init=10) |
| 140 | + cluster_labels = kmeans.fit_predict(sorted_df.values) |
| 141 | + silhouette_avg = silhouette_score(sorted_df.values, cluster_labels) |
| 142 | + silhouette_scores.append(silhouette_avg) |
| 143 | + |
| 144 | +# Find the optimal number of clusters with the highest silhouette score |
| 145 | +optimal_num_clusters = silhouette_scores.index(max(silhouette_scores)) + 2 |
| 146 | + |
| 147 | +# Use the optimal number of clusters to assign cluster labels and sort the DataFrame |
| 148 | +kmeans = KMeans(n_clusters=optimal_num_clusters, n_init=10) |
| 149 | +cluster_labels = kmeans.fit_predict(sorted_df.values) |
| 150 | +sorted_df['Cluster'] = cluster_labels |
| 151 | + |
| 152 | +# Sort the DataFrame first by 'Cluster' and then by 'Within_Cluster_SNPs' |
| 153 | +sorted_df = sorted_df.sort_values(by=['Cluster', 'Within_Cluster_SNPs'], ascending=[True, True], kind="mergesort") |
| 154 | + |
| 155 | +# Drop 'Cluster' and 'Within_Cluster_SNPs' columns |
| 156 | +sorted_df = sorted_df.drop(['Cluster', 'Within_Cluster_SNPs'], axis='columns') |
| 157 | +sorted_df = sorted_df.reindex(columns=sorted_df.index) |
| 158 | + |
| 159 | +# Save the finally sorted tab-delimited SNP matrix |
| 160 | +sorted_df.to_csv('Final_snp_matrix.tsv', sep='\t', header=True, index=True) |
| 161 | + |
| 162 | +# Create the reordered heatmap with correct values |
| 163 | +heatmap = sns.clustermap( |
| 164 | + sorted_df, xticklabels=True, yticklabels=True, vmin=0, vmax=50, center=20, |
| 165 | + annot=True, annot_kws={'size': 6}, cbar_kws={"orientation": "vertical", "pad": 0.5}, |
| 166 | + cmap=cmap, linecolor="white", linewidths=.1, fmt='d', dendrogram_ratio=0.1, |
| 167 | + col_cluster=True, row_cluster=True, figsize=figureSize |
| 168 | +) |
| 169 | +plt.setp(heatmap.ax_heatmap.get_xticklabels(), rotation=45, ha='right') |
| 170 | + |
| 171 | +heatmap.ax_row_dendrogram.set_visible(False) |
| 172 | + |
| 173 | +# Save the reordered heatmap as PDF and PNG |
| 174 | +heatmap.savefig('SNP_matrix.pdf') |
| 175 | +heatmap.savefig('SNP_matrix.png') |
| 176 | + |
| 177 | +plt.show() |
| 178 | +plt.close() |
| 179 | +print("Done") |
0 commit comments