|
| 1 | +#!/usr/bin/env python |
| 2 | + |
| 3 | +import argparse |
| 4 | +import os |
| 5 | +import subprocess |
| 6 | +import pandas as pd |
| 7 | +import shutil |
| 8 | +import numpy as np |
| 9 | + |
| 10 | + |
#Tool details
'''
This script takes the raw blast results to process and filter the best hits for 18s RNA classification of Cryptosporidium
usage: python <script> --localdir <rawblastresultsfolder> --resultsdir <finalresultfolder>
'''
# Tool identity used for provenance/reporting.
algoname = "Crypto_18S_RNA_typing"
version=1.1

#Set colors for cmdline messages/warnings
# ANSI escape prefixes for colored terminal output; CEND resets the color.
# (Only referenced by the commented-out status print in main() at present.)
CRED = '\033[91m' + '\nError:'
CYEL = '\033[93m' + '\nWarning:'
CGRE = '\033[92m' + '\nStatus:'
CEND = '\033[0m'
| 24 | + |
class myargumentparser(argparse.ArgumentParser):
    """ArgumentParser whose @-argument-files may hold several space-separated options per line."""

    def convert_arg_line_to_args(self, arg_line):
        """Split one line of an argument file on whitespace instead of treating it as a single token."""
        tokens = arg_line.split()
        return tokens
| 29 | + |
def cmdline_args():
    """Parse the command line (options may also come from an @file, one per line).

    Returns the populated argparse.Namespace with inputdir, resultsdir
    and localdir attributes.
    """
    parser = myargumentparser(fromfile_prefix_chars='@')
    # Raw BLAST result files to process.
    parser.add_argument('--inputdir', default='', type=str,
                        help='blast files directory')
    # Final results are written here.
    parser.add_argument('--resultsdir', default='', type=str,
                        help='results directory')
    # Scratch space for intermediate files.
    parser.add_argument('--localdir', type=str,
                        help='scratch location for intermediate files')
    return parser.parse_args()
| 39 | + |
def dirs(args):
    """Create the results directory (including parents) if it does not exist.

    Bug fix: the original iterated over the *characters* of the
    ``args.resultsdir`` string (``for f in folder``), creating one
    single-letter directory per character in the current working
    directory; the ``os.path.join`` result was also discarded.
    An empty ``resultsdir`` (the argparse default) is now a no-op
    instead of raising from ``os.makedirs('')``.
    """
    folder = args.resultsdir
    if folder:
        os.makedirs(folder, exist_ok=True)
| 45 | + |
def subdirs(args):
    """Create the scratch subdirectory that holds the sorted blast results.

    Uses ``os.path.join`` instead of manual ``+ "/"`` concatenation so the
    path is built portably.
    """
    mode = 0o777  # NOTE: effective permissions are still reduced by the process umask
    tempsub = os.path.join(args.localdir, "sorted_blastresults")
    os.makedirs(tempsub, mode, exist_ok=True)
| 50 | + # os.makedirs(logsub,mode,exist_ok=True) |
| 51 | + |
def blast_output(args):
    """Filter raw BLAST hits and append the retained rows per genome.

    For every ``*.blast`` file in ``args.inputdir`` (tab-separated, 15
    columns: qseqid, sseqid, pident, length, qcovs, mismatch, gapopen,
    qstart, qend, sstart, send, qlen, slen, bitscore, qseq) keep subject
    hits with identity > 98% and a recomputed query coverage > 75%, then
    append the matching full rows (looked up in the pandas frame) to
    ``<localdir>/sorted_blastresults/<genome>.blast``.

    Fixes vs. original:
    - the dangling no-op ``ofile.close`` (attribute access, never called,
      and redundant inside ``with``) is removed;
    - ``except IOError`` around the field parsing could never fire there;
      malformed/short rows raised ValueError/IndexError and aborted the
      whole run — they are now reported and skipped;
    - boolean ``and`` replaces the bitwise ``&``;
    - empty input files are skipped (consistent with write_csvs) instead
      of crashing pandas with EmptyDataError.
    """
    fpath = os.path.join(args.localdir, "sorted_blastresults")
    for blast in os.listdir(args.inputdir):
        if not blast.endswith(".blast"):
            continue
        blastresult_path = os.path.join(args.inputdir, blast)
        if os.path.getsize(blastresult_path) == 0:
            print(f"Skipping empty file: {blast}")
            continue
        genomename = os.path.basename(blast).split(".")[0]
        gene = {}  # qseqid -> [sseqid, ...] that pass the identity/coverage filters
        blastresultsDF = pd.read_csv(blastresult_path, sep="\t", header=None)
        blastresultsDF.columns = ['qseqid', 'sseqid', 'pident', 'length', 'qcovs',
                                  'mismatch', 'gapopen', 'qstart', 'qend', 'sstart',
                                  'send', 'qlen', 'slen', 'bitscore', 'qseq']
        fname = os.path.join(fpath, "{}.blast".format(genomename))
        with open(blastresult_path) as blastresult:
            for line in blastresult:
                try:
                    fields = line.split()
                    qseqid = fields[0]
                    sseqid = fields[1]
                    pident = float(fields[2])
                    qstart = int(fields[7])
                    qend = int(fields[8])
                    sstart = int(fields[9])
                    send = int(fields[10])
                except (ValueError, IndexError) as err:
                    # Malformed row: report and skip instead of aborting the run.
                    print(err)
                    continue
                # Aligned span on the query; qstart > qend just means the
                # alignment is reported in reverse orientation, so the
                # absolute difference covers both cases.
                aligned_query_length = max(abs(qend - qstart) + 1, 0)
                aligned_subject_length = max(send - sstart + 1, 0)
                if aligned_subject_length != 0:
                    query_coverage = (aligned_query_length / aligned_subject_length) * 100
                elif aligned_query_length == 0:
                    query_coverage = 0  # both spans empty -> no coverage
                else:
                    # Subject span is zero but query span is not: assume full coverage.
                    query_coverage = 100
                if pident > 98 and query_coverage > 75:
                    gene.setdefault(qseqid, []).append(sseqid)
        with open(fname, "a") as ofile:
            for key in gene:
                for sid in gene[key]:
                    # First DataFrame row for this subject id; written as one
                    # tab-separated line.  NOTE(review): if a subject id occurs
                    # several times, only its first row is emitted — confirm
                    # that is the intended behavior.
                    row = blastresultsDF.loc[blastresultsDF['sseqid'] == sid].values[0]
                    ofile.write('\t'.join(str(e) for e in row) + "\n")
| 126 | + |
def write_csvs(args):
    """Convert each filtered ``*.blast`` file into a best-bitscore CSV.

    Reads ``<localdir>/sorted_blastresults/*.blast``, keeps only the rows
    whose bitscore equals the file's maximum, de-duplicates them, and
    writes ``<localdir>/sorted_blastresults/blast_csv/<sample>.csv``.
    The pandas index IS written on purpose: ``filter_besthit`` later drops
    the resulting "Unnamed: 0" column.

    Bug fix: the output name was the literal ``"(unknown).csv"`` (the
    computed ``filename`` was never used), so every sample overwrote the
    same file.
    """
    blastpath = os.path.join(args.localdir, "sorted_blastresults")
    csv_dir = os.path.join(blastpath, "blast_csv")
    os.makedirs(csv_dir, exist_ok=True)
    for file in os.listdir(blastpath):
        if not file.endswith(".blast"):
            continue
        filename = file.split(".")[0]
        blast_file_path = os.path.join(blastpath, file)
        if os.path.getsize(blast_file_path) == 0:
            print(f"Skipping empty file: {file}")
            continue
        try:
            blast_data = pd.read_csv(blast_file_path, sep="\t", header=None)
        except pd.errors.EmptyDataError:
            print(f"File is empty or not found: {file}")
            continue
        try:
            blast_data.columns = ["query_genome", "db_bestmatch", "pident",
                                  "alignment_length", "coverage", "mismatch",
                                  "gapopen", "query_start", "query_end",
                                  "subject_start", "subject_end", "query_length",
                                  "subject_length", "bitscore", "qseq"]
            # Keep only the top-scoring hit(s), then drop exact duplicates.
            data_filtered = blast_data[blast_data.bitscore == blast_data.bitscore.max()]
            filtered_uniq = data_filtered.drop_duplicates()
            blast_csvfmt = os.path.join(csv_dir, f"{filename}.csv")
            filtered_uniq.to_csv(blast_csvfmt)
        except Exception as e:
            print(f"Error processing file {file}: {e}")
| 151 | + |
def filter_besthit(args):
    """Build the final per-sample 18S results CSVs in ``args.resultsdir``.

    For every CSV produced by ``write_csvs``, derive the species from the
    best-match name (``<Genus>_<species>_<rest>`` -> ``Genus.species``),
    attach review flags in an NCE column, and write
    ``<sample>.18SResults.csv``.  If no input CSV exists at all, a
    ``NO_18SResults.csv`` placeholder is written instead.

    Fixes vs. original:
    - the header was written only for the first file processed, even
      though every sample goes to its own output file, so later samples
      got headerless CSVs — each file now always gets a header;
    - ``.copy()`` after ``drop_duplicates`` avoids pandas'
      SettingWithCopyWarning when the NCE column is assigned;
    - the dead "rank" column (computed, never used, then dropped) is gone.
    """
    csvpath = os.path.join(args.localdir, "sorted_blastresults", "blast_csv")
    csvfiles = [f for f in os.listdir(csvpath) if f.endswith(".csv")]
    if not csvfiles:
        df_empty = pd.DataFrame({"NCE": ["Sample did not meet the BLAST parameters, check input sequence or BLAST results"]})
        df_empty.to_csv(os.path.join(args.resultsdir, "NO_18SResults.csv"), index=False)
        return
    for file in csvfiles:
        fname = os.path.basename(file).split(".")[0]
        data = pd.read_csv(os.path.join(csvpath, file), sep=",", index_col=False)
        # "Unnamed: 0" is the pandas index written by write_csvs.
        data = data.drop(["Unnamed: 0", "mismatch", "gapopen", "query_length", "subject_length"], axis=1)
        species = data["db_bestmatch"][0].split('_', 2)
        data["species"] = species[0] + "." + species[1]
        data["sample_name"] = fname
        data_uniq = data.drop_duplicates().copy()
        nce_len = ["Alignment length is <700bp - do manual check for Identity and coverage" if x < 700 else "NA"
                   for x in data_uniq["alignment_length"]]
        nce_multi = ["Sample has multiple hits - Review manually" if len(data_uniq) > 1 else "NA"] * len(data_uniq)
        data_uniq["NCE"] = [f"{n1};{n2}" for n1, n2 in zip(nce_len, nce_multi)]
        # Final column order for the report.
        column_order = ["sample_name", "query_genome", "db_bestmatch", "species", "pident",
                        "alignment_length", "coverage", "query_start", "query_end",
                        "subject_start", "subject_end", "bitscore", "NCE"]
        finaldata = data_uniq[column_order]
        output_file = os.path.join(args.resultsdir, f"{fname}.18SResults.csv")
        finaldata.to_csv(output_file, index=False)
| 188 | + |
def main():
    """Drive the pipeline: parse the command line, prepare the output
    directories, then run the blast filtering/formatting stages in order."""
    args = cmdline_args()
    for stage in (dirs, subdirs, blast_output, write_csvs, filter_besthit):
        stage(args)
    # print(CGRE + "Job has completed successfully" + CEND)
| 197 | + |
# Script entry point.
if __name__ == '__main__':
    main()
0 commit comments