-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmatch_TFBS_promoter.sh
More file actions
176 lines (152 loc) · 7.43 KB
/
match_TFBS_promoter.sh
File metadata and controls
176 lines (152 loc) · 7.43 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
#!/bin/bash
export PATH=$PATH:~/meme/bin
export PATH=$PATH:~/data/scripts
usage()
{
cat << EOF
usage $0 <options>
A TF-Target Validator based on TFBS (transcription factor binding sites) being present on the promoter region sequences of their putative targets
OPTION
-h show this Help message
-n path to file containing the Co-expression network (delimiter=",")
-p path to file containing the promoter sequences of all genes
-t string with the TF ID (ex: LOC1234) from Quercus suber; a list of one TF per line is also accepted
-v minimum pvalue to establish a valid match in FIMO (default = 1.0E-4)
-o output directory name (default = TF_Targets_match)
DETAILS
This script receives as input: 1)The co-expression network (3 mandatory columns: LOC1 | LOC2 | edge_score/irp_score); 2)The table with the gene promoter sequences (2 mandatory columns: Gene | Sequence); 3)A list with TF IDs (ex:LOC123 OR AT123).
It creates a output directory with severall output files, one of which is a table with 2 columns: TF | Validated Target
The TF putative targets are the genes that the TF is linked to by an edge in the network, and the validation conducted in this script is a check using FIMO (MEME Suite) of a motif match between the TFBS (transcription factor binding site) and the promoter sequences of its putative targets.
I conducted my work using this script with the following input files: --network Cork_network.txt --promoter_sequences Promoter_sequences_df.csv --TF [relevant TFs from the previous TF_analysis step] --TFBS_matrix TFBS_TF_[relevant TF LOCs]_fasta.fa
I stored my output tables into ...
EOF
}
printf "\n\n -------------------- HELLO AND WELCOME TO THE TF-TARGET VALIDATOR --------------------- \n\n"
#get options
while getopts "h:n:p:t:v:o:" OPTION
do
case $OPTION in
h) usage; exit 1;;
n) network=$OPTARG;;
p) promoters=$OPTARG;;
t) tf=$OPTARG;;
v) pvalue=$OPTARG;;
o) outdir=$OPTARG;;
?) usage; exit;;
esac
done
#Check that all required arguments were passed
if [[ -z $network ]] || [[ -z $promoters ]] || [[ -z $tf ]]
then
printf "\n-----------------\n -- ERROR: options -n ; -p and -t are required! -- \n------------------\n\n"
usage
exit 1
fi
if [[ -z $pvalue ]];
then
printf " | Using the FIMO default pvalue threshold of 1.0E-4 |\n"
pvalue="1.0E-4"
else
pvalue = $pvalue
printf " | Using the user defined pvalue treshold = $pvalue |\n"
fi
if [[ -z $outdir ]];
then
printf " | Using the default output directory = TF_Target_Validator_out |\n"
outdir="TF_Target_Validator_out"
else
outdir=$outdir
printf " | Outputting the files to the Out-directory = $outdir |\n"
fi
# --------------------------------- MAIN CODE -------------------------------- #
#Create an output directory
mkdir -p $outdir
#Get the correspondent AT IDs from the LOC IDs
while read f
do
grep $f arabidopsis_concise.txt > tf_loc.txt
sed 's/.*\t.*\tAT/AT/' tf_loc.txt > at_ID.txt
at_tf=$(<at_ID.txt)
at_tf="$(echo -e "${at_tf}" | tr -d '[:space:]')"
rm tf_loc.txt
rm at_ID.txt
#This script receives the TF ID and returns a file with the correspondent TFBS matrix
printf "\n | The TFBS of ${f} ($at_tf) is being searched within the JASPAR database using your input TF ID query | \n"i
#The following subscript stopped working properly
#get_TFBS.sh $at_tf
curl -X GET "https://jaspar.genereg.net/api/v1/matrix/?search=$at_tf" -H "accept: application/json" -H "X-CSRFToken: NzxtdFE3eih6q6L4ALnKx9Jf00L50eQO17X3J1cyFlS44E8svSYy7CBl90H1bgyZ"
cat ./.*meme > ./matrixID1.txt
rm ./.*meme
grep -o 'base_id.*version' ./matrixID1.txt > ./matrixID2.txt
sed 's/base_id":"//' ./matrixID2.txt > ./matrixID3.txt
sed 's/","version//' ./matrixID3.txt > ./matrixID.txt
matrixID=$(<matrixID.txt)
jaspar_out=$(<matrixID1.txt)
#Downloading the TFBS matrix by the matrix ID
wget https://jaspar.genereg.net/api/v1/matrix/$matrixID.meme
mv ./$matrixID.meme ./$at_tf.meme
rm matrixID1.txt
rm ./matrixID2.txt
rm ./matrixID3.txt
rm ./matrixID.txt
#Second Queried Database - PlantTFDB
#If the TFBS does not exist within the JASPAR database, the script tries another search in PlantTFDB
if [ $jaspar_out=='{"count":0,"next":null,"previous":null,"results":[]}' ];
then
printf "\n | TF ${f} ($at_tf) NOT FOUND ON JASPAR DATABASE, SEARCHING IN PLANTTFDB INSTEAD | \n"
rm ./${at_tf}.meme
wget http://planttfdb.gao-lab.org/motif/Ath/${at_tf}.meme
fi
#Worst case possible where the script doesnt find the TFBS in all of the queried databases
FILE=./${at_tf}.meme
if test -f "$FILE"; then
mv ./$at_tf.meme ./$outdir/$at_tf.meme
#This script receives all the previous inputs plus the TFBS matrix to prepare a fasta file ready to enter into FIMO
match_TFBS_promoter.py --network $network --promoter_sequences $promoters --TF $f
mv ./TFBS_TF_${f}_fasta.fa ./$outdir/TFBS_TF_${f}_fasta.fa
grep '>' ./$outdir/TFBS_TF_${f}_fasta.fa > ./$outdir/TF_${f}_target_list1.txt
sed 's/>//g' ./$outdir/TF_${f}_target_list1.txt > ./$outdir/TF_${f}_target_list2.txt
sed 's/ .*//g' ./$outdir/TF_${f}_target_list2.txt > ./$outdir/TF_${f}_target_list.txt
rm ./$outdir/TF_${f}_target_list1.txt
rm ./$outdir/TF_${f}_target_list2.txt
#This line runs the FIMO tool from MEME Suite and performs the motif match between the TFBS and the promoter region sequences of its putative targets
printf " | Running FIMO on TF ${f} for motif matching | \n"
fimo --oc ./$outdir/fimo_${f} --verbosity 1 --thresh $pvalue ./$outdir/$at_tf.meme ./$outdir/TFBS_TF_${f}_fasta.fa
printf "\n | Printing all matching targets of the TF ${f} | \n"
cat ./$outdir/fimo_${f}/fimo.gff | awk '{print $1,$6}' > ./$outdir/motif_match_${f}.txt
#cat ./$outdir/fimo_${f}/fimo.gff | awk '{print $1}' > ./$outdir/motif_match_${f}.txt
awk -F' ' '!a[$1]++' ./$outdir/motif_match_${f}.txt > ./$outdir/best_motif_match_with_score_${f}.txt
rm ./$outdir/motif_match_${f}.txt
sed 's/ .*//g' ./$outdir/best_motif_match_with_score_${f}.txt > ./$outdir/best_motif_match_without_score_${f}.txt
else
printf "${f} NO_TFBS\n" >> ./$outdir/TFs_with_NO_TFBS.txt
#printf "${f} NO_TFBS\n" >> ./$outdir/ALL_TF_Matches_Output.txt
fi
#Starts building the Output File Table based on the FIMO results. YES - match | NO - no match | NO_TFBS - no TFBS found
FILE=./$outdir/TF_${f}_target_list.txt
if [ -f "$FILE" ]; then
while read target
do
if grep -Fxq "$target" ./$outdir/best_motif_match_without_score_${f}.txt
then
#printf "${f} ${target} YES\n" >> ./$outdir/TF_${f}_matches.txt
grep "$target" ./$outdir/best_motif_match_with_score_${f}.txt > ./$outdir/fimo_score_prov.txt
sed 's/.* //g' ./$outdir/fimo_score_prov.txt > ./$outdir/fimo_score.txt
fimo_score=$(<./$outdir/fimo_score.txt)
rm ./$outdir/fimo_score_prov.txt
rm ./$outdir/fimo_score.txt
printf "${f} ${target} YES_${fimo_score}\n" >> ./$outdir/ALL_TF_Matches_Output.txt
else
#printf "${f} ${target} NO\n" >> ./$outdir/TF_${f}_matches.txt
printf "${f} ${target} NO\n" >> ./$outdir/ALL_TF_Matches_Output.txt
fi
done < ./$outdir/TF_${f}_target_list.txt
#Removing leftover temporary files; comment to keep the files inside output directory
rm ./$outdir/TF_${f}_target_list.txt
rm ./$outdir/best_motif_match_with_score_${f}.txt
rm ./$outdir/best_motif_match_without_score_${f}.txt
rm ./$outdir/$at_tf.meme
rm ./$outdir/TFBS_TF_${f}_fasta.fa
rm -r ./$outdir/fimo_${f}
fi
done < $tf