Skip to content

Commit e1fa9c3

Browse files
authored
Merge pull request #8 from SMorrison42/main
update file permissions
2 parents 32ac945 + 6701147 commit e1fa9c3

File tree

1 file changed

+199
-0
lines changed

1 file changed

+199
-0
lines changed

bin/18S_toolv1.2.py

Lines changed: 199 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,199 @@
1+
#!/usr/bin/env python
2+
3+
import argparse
4+
import os
5+
import subprocess
6+
import pandas as pd
7+
import shutil
8+
import numpy as np
9+
10+
11+
#Tool details
12+
'''
13+
This script takes the raw blast results to process and filter the best hits for 18s RNA classification of Cryptosporidium
14+
usage: python <script> --localdir <rawblastresultsfolder> --resultsdir <finalresultfolder>
15+
'''
16+
algoname = "Crypto_18S_RNA_typing"
17+
version=1.1
18+
19+
#Set colors for cmdline messages/warnings
20+
CRED = '\033[91m' + '\nError:'
21+
CYEL = '\033[93m' + '\nWarning:'
22+
CGRE = '\033[92m' + '\nStatus:'
23+
CEND = '\033[0m'
24+
25+
class myargumentparser(argparse.ArgumentParser):
26+
# override to split space based cmdline args
27+
def convert_arg_line_to_args(self, arg_line):
28+
return arg_line.split()
29+
30+
def cmdline_args():
31+
parser = myargumentparser(fromfile_prefix_chars='@')
32+
#parser.add_argument('--reference_folder',default='',help="Blast Reference Database",type=str) # where reference database is located
33+
#parser.add_argument('--query',default='', help='filename for the assembled query genome', type=str) # where fasta assemblies are located
34+
parser.add_argument('--inputdir',default='',help='blast files directory',type=str) #takes in the blast input files
35+
parser.add_argument('--resultsdir',default='',help='results directory',type=str) #this is where intermediate files and results are saved
36+
parser.add_argument('--localdir', help='scratch location for intermediate files', type=str) # just a place holder
37+
args = parser.parse_args()
38+
return args
39+
40+
def dirs(args):
41+
folder= (args.resultsdir)
42+
for f in folder:
43+
os.path.join(os.getcwd(),f)
44+
os.makedirs(f, exist_ok=True)
45+
46+
def subdirs(args):
47+
mode=0o777
48+
tempsub = args.localdir+"/sorted_blastresults"
49+
os.makedirs(tempsub,mode,exist_ok=True)
50+
# os.makedirs(logsub,mode,exist_ok=True)
51+
52+
def blast_output(args):
53+
fpath=os.path.join(args.localdir, "sorted_blastresults")
54+
for blast in os.listdir(args.inputdir):
55+
if blast.endswith(".blast"):
56+
genename=os.path.basename(blast)
57+
genomename=genename.split(".")[0]
58+
blastresult_path=os.path.join(args.inputdir, blast)
59+
gene={}
60+
blastresultsDF = pd.read_csv(blastresult_path,sep="\t", header=None)
61+
blastresultsDF.columns=['qseqid','sseqid','pident','length','qcovs','mismatch','gapopen','qstart','qend','sstart','send','qlen','slen','bitscore','qseq']
62+
# print(blastresultsDF)
63+
with open(blastresult_path) as blastresult:
64+
fname=os.path.join(fpath,"{}.blast".format(genomename))
65+
for line in blastresult:
66+
# print(line)
67+
try:
68+
line = line.split( )
69+
qseqid=line[0]
70+
sseqid=line[1]
71+
pident=float(line[2])
72+
length=(line[3])
73+
qcovs=float(line[4])
74+
mismatch=(line[5])
75+
gapopen=(line[6])
76+
qstart=int(line[7])
77+
qend=int(line[8])
78+
sstart=int(line[9])
79+
send=int(line[10])
80+
qlen=int(line[11])
81+
slen=int(line[12])
82+
# qseq=(line[13])
83+
bitscore=(line[13])
84+
qseq=(line[14])
85+
# sec_qlen=len(qseq)
86+
# print(qseqid)
87+
# print(sec_qlen)
88+
if qstart < qend:
89+
aligned_query_length = max(qend - qstart + 1, 0) # Length of aligned region on query
90+
else:
91+
aligned_query_length = max(abs(qstart - qend)+ 1, 0) #indicates the alignment is in reverse direction
92+
aligned_subject_length = max(send - sstart + 1, 0)
93+
if aligned_subject_length != 0:
94+
# print(aligned_subject_length)
95+
query_coverage = (aligned_query_length / aligned_subject_length) * 100
96+
else:
97+
if aligned_query_length == 0:
98+
query_coverage = 0 # If both are zero, coverage is zero
99+
else:
100+
query_coverage = 100 # Assume full coverage if aligned_subject_length is zero but aligned_query_length is not
101+
if (pident> 98) & (query_coverage > 75) :
102+
if qseqid in gene:
103+
gene[qseqid].append(sseqid)
104+
else:
105+
gene[qseqid]=[sseqid]
106+
#print(gene)
107+
except IOError as err:
108+
print(err)
109+
with open(fname,"a") as ofile:
110+
for key in gene:
111+
if len(gene[key]) > 1:
112+
for i in range(0,len(gene[key])):
113+
# print(i)
114+
tempResult = blastresultsDF.loc[blastresultsDF['sseqid']==gene[key][i]].values[0]
115+
finalResults = tempResult.tolist()
116+
str1 = '\t'.join(str(e) for e in finalResults)
117+
# print(str1)
118+
ofile.write(str1+"\n")
119+
else:
120+
#print(gene[key][0])
121+
tempResult = blastresultsDF.loc[blastresultsDF['sseqid']==gene[key][0]].values[0]
122+
str1 = '\t'.join(str(e) for e in tempResult)
123+
ofile.write(str1+"\n")
124+
125+
ofile.close
126+
127+
def write_csvs(args):
128+
blastpath=os.path.join(args.localdir, "sorted_blastresults")
129+
csv_dir = os.path.join(blastpath, "blast_csv")
130+
os.makedirs(csv_dir, exist_ok=True)
131+
for file in os.listdir(blastpath):
132+
if file.endswith(".blast"):
133+
filename=file.split(".")[0]
134+
blast_file_path = os.path.join(blastpath, file)
135+
if os.path.getsize(blast_file_path) == 0:
136+
print(f"Skipping empty file: {file}")
137+
continue
138+
try:
139+
blast_data = pd.read_csv(blast_file_path, sep="\t", header=None)
140+
except pd.errors.EmptyDataError:
141+
print(f"File is empty or not found: {file}")
142+
continue
143+
try:
144+
blast_data.columns=["query_genome","db_bestmatch","pident","alignment_length","coverage","mismatch","gapopen","query_start","query_end","subject_start","subject_end","query_length","subject_length","bitscore","qseq"]
145+
data_filtered=blast_data[blast_data.bitscore == blast_data.bitscore.max()]
146+
filtered_uniq = data_filtered.drop_duplicates()
147+
blast_csvfmt=os.path.join(csv_dir, f"{filename}.csv")
148+
filtered_uniq.to_csv(blast_csvfmt)
149+
except Exception as e:
150+
print(f"Error processing file {file}: {e}")
151+
152+
def filter_besthit(args):
153+
i = 0
154+
csvpath=os.path.join(args.localdir,"sorted_blastresults","blast_csv")
155+
csvfiles=[file for file in os.listdir(csvpath) if file.endswith(".csv")]
156+
if not csvfiles:
157+
df_empty = pd.DataFrame({"NCE": ["Sample did not meet the BLAST parameters, check input sequence or BLAST results"]})
158+
df_empty.to_csv(os.path.join(args.resultsdir, "NO_18SResults.csv"), index=False)
159+
else:
160+
for file in csvfiles:
161+
filename=os.path.basename(file)
162+
fname=filename.split(".")[0]
163+
data=pd.read_csv(os.path.join(csvpath, file), sep=",", index_col=False)
164+
data = data.drop(["Unnamed: 0","mismatch","gapopen","query_length","subject_length"],axis=1)
165+
data["rank"]=data[["query_genome","pident","coverage","bitscore"]].apply(tuple,axis=1).rank(method='dense',ascending=False).astype(int)
166+
#print("HERE")
167+
#print(data["db_bestmatch"][0])
168+
#species=data["db_bestmatch"][0].split('_',2, expand=True)
169+
species=data["db_bestmatch"][0].split('_',2)
170+
data["species"] = species[0]+"."+species[1]
171+
data["sample_name"]=fname
172+
data_uniq = data.drop_duplicates()
173+
NCE_1 = ["Alignment length is <700bp - do manual check for Identity and coverage" if x < 700 else "NA" for x in data_uniq["alignment_length"]]
174+
NCE_2 = ["Sample has multiple hits - Review manually" if len(data_uniq) > 1 else "NA"] * len(data_uniq)
175+
NCE_combined = [f"{n1};{n2}" for n1, n2 in zip(NCE_1, NCE_2)]
176+
data_uniq["NCE"]= NCE_combined
177+
finaldata = data_uniq.drop(columns=["rank"])
178+
# print(finaldata)
179+
# Reorder the columns
180+
column_order = ["sample_name","query_genome", "db_bestmatch", "species", "pident", "alignment_length", "coverage", "query_start", "query_end", "subject_start", "subject_end", "bitscore", "NCE"]
181+
finaldata = finaldata[column_order]
182+
output_file = os.path.join(args.resultsdir, f"{fname}.18SResults.csv")
183+
if i == 0:
184+
finaldata.to_csv(output_file,index=False)
185+
else:
186+
finaldata.to_csv(output_file,mode='a',index=False, header = False)
187+
i += 1
188+
189+
def main():
190+
args = cmdline_args()
191+
dirs(args)
192+
subdirs(args)
193+
blast_output(args)
194+
write_csvs(args)
195+
filter_besthit(args)
196+
#print(CGRE + "Job has completed successfully" + CEND)
197+
198+
if __name__=='__main__':
199+
main()

0 commit comments

Comments
 (0)