Skip to content

Commit f03eba2

Browse files
authored
Add files via upload
Latest version of HeatCluster using fastcluster. Automatically sets output figure size based on number of samples, and saves heatcluster in both pdf and png format.
1 parent 9923dc1 commit f03eba2

File tree

1 file changed

+179
-0
lines changed

1 file changed

+179
-0
lines changed

HeatCluster-0.4.4.py

Lines changed: 179 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,179 @@
1+
#!/usr/bin/python3
2+
3+
##########################################
4+
# HeatCluster-0.4.4 #
5+
# written by Stephen Beckstrom-Sternberg #
6+
# Creates SNP heat/cluster maps #
7+
# from SNP matrices #
8+
###########################################
9+
10+
import re
11+
import pandas as pd
12+
import numpy as np
13+
import seaborn as sns
14+
import matplotlib.pyplot as plt
15+
import fastcluster as sch
16+
from sklearn.cluster import KMeans
17+
from sklearn.metrics import silhouette_score
18+
from io import StringIO
19+
from pathlib import Path
20+
21+
# Read the SNP matrix file and determine the delimiter
22+
path = Path('./snp-dists.txt')
23+
try:
24+
path.resolve(strict=True)
25+
except FileNotFoundError:
26+
path = Path('./snp_matrix.txt')
27+
28+
print("Using file path:", path)
29+
30+
# Read the SNP matrix file into a DataFrame
31+
with open(path, "r") as infile:
32+
lines = infile.readlines() # Do NOT Skip the header line
33+
34+
numSamples = len(lines) - 1 # counts data lines
35+
36+
# Combine the cleaned lines into a single string instead of a file
37+
snp_matrix_string = "\n".join(line for line in lines)
38+
39+
print("\n\n", snp_matrix_string, "\n\n")
40+
41+
# Read the tab-delimited string into a DataFrame
42+
df = pd.read_csv(StringIO(snp_matrix_string), sep=',')
43+
44+
45+
# Remove 'snp-dists 0.8.2,' from the first column name
46+
# Define the regular expression patterns for replacements
47+
consensus_pattern = r'snp-dists 0\.8\.2,|\.consensus_threshold_0\.6_quality_20|Consensus_'
48+
49+
# Replace the header row using the pattern
50+
df.columns = df.columns.str.replace(consensus_pattern, '', regex=True)
51+
52+
# Replace the data rows using the same pattern
53+
df = df.replace(consensus_pattern, '', regex=True)
54+
55+
df = df.set_index(df.columns[0])
56+
57+
print("/n/n", df, "\n\n")
58+
# Define colormap for heatmap
59+
cmap = 'Reds_r'
60+
print("Colormap set to '", cmap, "'")
61+
62+
# Vectorized operation to add 'Total_SNPs' column
63+
df['Total_SNPs'] = df.sum(axis=1)
64+
65+
# Vectorized operation to sort based on 'Total_SNPs'
66+
df = df.sort_values(by='Total_SNPs', kind='mergesort', axis=0)
67+
68+
# Reorder the columns to mirror the row order
69+
sorted_cluster_matrix = df.reindex(columns=df.index)
70+
71+
# Check for Missing Values
72+
if sorted_cluster_matrix.isnull().sum().sum() > 0:
73+
sorted_cluster_matrix = sorted_cluster_matrix.dropna()
74+
75+
# Check for Non-Finite Values
76+
if ~np.isfinite(sorted_cluster_matrix).all().all():
77+
sorted_cluster_matrix = sorted_cluster_matrix.replace([np.inf, -np.inf], np.nan).dropna()
78+
79+
# Change output figure size tuple based on number of samples
80+
if numSamples <= 20:
81+
figureSize = (10, 8)
82+
elif numSamples <= 40:
83+
figureSize = (20, 16)
84+
elif numSamples <= 60:
85+
figureSize = (30, 24)
86+
else:
87+
figureSize = (40, 32)
88+
print("Number of samples:", numSamples, "\nFigure size:", figureSize)
89+
90+
# Compute clusters
91+
# Compute clusters using fastcluster
92+
#clusters = fastcluster.linkage(sorted_cluster_matrix.values, method='complete', metric='euclidean')
93+
clusters = sch.linkage(sorted_cluster_matrix.values, method='complete', metric='euclidean')
94+
95+
# Create clustermap to get the order of rows and columns based on clustering
96+
clustergrid = sns.clustermap(
97+
sorted_cluster_matrix,
98+
xticklabels=True,
99+
vmin=0,
100+
vmax=50,
101+
center=20,
102+
annot=True,
103+
annot_kws={'size': 6},
104+
cbar_kws={"pad": 0.5},
105+
cmap=cmap,
106+
linecolor="white",
107+
linewidths=.2,
108+
fmt='d',
109+
dendrogram_ratio=0.1,
110+
col_cluster=True,
111+
row_cluster=True,
112+
figsize=figureSize
113+
)
114+
plt.setp(clustergrid.ax_heatmap.get_xticklabels(), rotation=45, ha='right')
115+
116+
# Suppress printing of dendrogram along the y-axis
117+
clustergrid.ax_row_dendrogram.set_visible(False)
118+
clustergrid.ax_col_dendrogram.set_visible(True)
119+
120+
row_order = clustergrid.dendrogram_row.reordered_ind
121+
col_order = row_order
122+
123+
# Sort the DataFrame based on the cluster order
124+
sorted_df = sorted_cluster_matrix.iloc[row_order, col_order]
125+
126+
# Vectorized operation to calculate 'within_cluster_snps'
127+
within_cluster_snps = (df[df < 500]).sum(axis=1)
128+
129+
# Computing the number of SNPs within the cluster per row
130+
within_cluster_snps = sorted_df.apply(lambda row: row[row < 500].sum(), axis=1)
131+
132+
# Add 'Within_Cluster_SNPs' column to the sorted DataFrame
133+
sorted_df['Within_Cluster_SNPs'] = within_cluster_snps.values
134+
135+
# Calculate silhouette scores for different numbers of clusters
136+
silhouette_scores = []
137+
upper_range = min(numSamples, 11)
138+
for n_clusters in range(2, upper_range):
139+
kmeans = KMeans(n_clusters=n_clusters, n_init=10)
140+
cluster_labels = kmeans.fit_predict(sorted_df.values)
141+
silhouette_avg = silhouette_score(sorted_df.values, cluster_labels)
142+
silhouette_scores.append(silhouette_avg)
143+
144+
# Find the optimal number of clusters with the highest silhouette score
145+
optimal_num_clusters = silhouette_scores.index(max(silhouette_scores)) + 2
146+
147+
# Use the optimal number of clusters to assign cluster labels and sort the DataFrame
148+
kmeans = KMeans(n_clusters=optimal_num_clusters, n_init=10)
149+
cluster_labels = kmeans.fit_predict(sorted_df.values)
150+
sorted_df['Cluster'] = cluster_labels
151+
152+
# Sort the DataFrame first by 'Cluster' and then by 'Within_Cluster_SNPs'
153+
sorted_df = sorted_df.sort_values(by=['Cluster', 'Within_Cluster_SNPs'], ascending=[True, True], kind="mergesort")
154+
155+
# Drop 'Cluster' and 'Within_Cluster_SNPs' columns
156+
sorted_df = sorted_df.drop(['Cluster', 'Within_Cluster_SNPs'], axis='columns')
157+
sorted_df = sorted_df.reindex(columns=sorted_df.index)
158+
159+
# Save the finally sorted tab-delimited SNP matrix
160+
sorted_df.to_csv('Final_snp_matrix.tsv', sep='\t', header=True, index=True)
161+
162+
# Create the reordered heatmap with correct values
163+
heatmap = sns.clustermap(
164+
sorted_df, xticklabels=True, yticklabels=True, vmin=0, vmax=50, center=20,
165+
annot=True, annot_kws={'size': 6}, cbar_kws={"orientation": "vertical", "pad": 0.5},
166+
cmap=cmap, linecolor="white", linewidths=.1, fmt='d', dendrogram_ratio=0.1,
167+
col_cluster=True, row_cluster=True, figsize=figureSize
168+
)
169+
plt.setp(heatmap.ax_heatmap.get_xticklabels(), rotation=45, ha='right')
170+
171+
heatmap.ax_row_dendrogram.set_visible(False)
172+
173+
# Save the reordered heatmap as PDF and PNG
174+
heatmap.savefig('SNP_matrix.pdf')
175+
heatmap.savefig('SNP_matrix.png')
176+
177+
plt.show()
178+
plt.close()
179+
print("Done")

0 commit comments

Comments
 (0)