public_health_viral_genomics/workflows/wf_sarscov2_nextstrain_modified.wdl at main · theiagen/public_health_viral_genomics · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
version 1.0

import "../tasks/tasks_nextstrain.wdl" as nextstrain
import "../tasks/tasks_reports.wdl" as reports
import "../tasks/tasks_intrahost.wdl" as intrahost
import "../tasks/tasks_utils.wdl" as utils

# Workflow: sarscov2_nextstrain
#
# Data flow, as wired below:
#   1. Concatenate the input assemblies, deduplicate them, drop sequences with
#      fewer than `min_unambig_genome` non-N bases, and align them against a
#      reference with mafft.
#   2. Join the metadata TSVs (only when more than one is given) and run
#      derived_cols on the result.
#   3. When `builds_yaml` is provided, subsample the alignment for `build_name`.
#   4. Mask the (possibly subsampled) alignment, build a draft tree, refine it,
#      and compute node data (ancestral traits when requested, ancestral
#      sequences, translations, clades, tip frequencies).
#   5. Export the auspice JSON.
#
# Optional file inputs (ref_fasta, clades_tsv, lat_longs_tsv, auspice_config)
# fall back to the files emitted by nextstrain.nextstrain_ncov_defaults.
workflow sarscov2_nextstrain {
  meta {
    description: "Modified version of the Broad's sars_cov2_nextstrain WDL Workflow to align assemblies, build trees, and convert to json representation suitable for Nextstrain visualization. See https://nextstrain.org/docs/getting-started/ and https://nextstrain-augur.readthedocs.io/en/stable/"
    author: "Kevin Libuit"
    email:  "kevin.libuit@theiagen.com"
  }
  input {
    Array[File]+ assembly_fastas
    Array[File]+ sample_metadata_tsvs
    # Passed as `root` to refine_augur_tree.
    String tree_root_seq_id = "Wuhan-Hu-1/2019"
    # Used for the subsample call and for the "auspice-<build_name>" output basename.
    String build_name
    # Subsampling only runs when this file is supplied.
    File? builds_yaml
    Array[String]? ancestral_traits_to_infer
    File? auspice_config
    File? ref_fasta
    File? clades_tsv
    File? lat_longs_tsv
    Float? clock_rate
    Float? clock_std_dev
    # NOTE(review): mafft_cpu and mafft_mem_size are accepted here but are not
    # forwarded to any call in this workflow, so changing them has no effect.
    # They are kept so existing input JSONs stay valid. Wiring them through
    # requires the resource input names of nextstrain.mafft_one_chr_chunked,
    # which are not visible from this file -- TODO confirm and connect.
    Int mafft_cpu=64
    Int mafft_mem_size=500
    Int min_unambig_genome = 27000
  }
  parameter_meta {
    assembly_fastas: {
      description: "Set of assembled genomes to align and build trees. These must represent a single chromosome/segment of a genome only. Fastas may be one-sequence-per-individual or a concatenated multi-fasta (unaligned) or a mixture of the two. They may be compressed (gz, bz2, zst, lz4), uncompressed, or a mixture.",
      patterns: ["*.fasta", "*.fa", "*.fasta.gz", "*.fasta.zst"]
    }
    sample_metadata_tsvs: {
      description: "Tab-separated metadata file that contain binning variables and values. Must contain all samples: output will be filtered to the IDs present in this file.",
      patterns: ["*.txt", "*.tsv"]
    }
    ref_fasta: {
      description: "A reference assembly (not included in assembly_fastas) to align assembly_fastas against. Typically from NCBI RefSeq or similar.",
      patterns: ["*.fasta", "*.fa"]
    }
    min_unambig_genome: {
      description: "Minimum number of called bases in genome to pass prefilter."
    }
    ancestral_traits_to_infer: {
      description: "A list of metadata traits to use for ancestral node inference (see https://nextstrain-augur.readthedocs.io/en/stable/usage/cli/traits.html). Multiple traits may be specified; must correspond exactly to column headers in metadata file. Omitting these values will skip ancestral trait inference, and ancestral nodes will not have estimated values for metadata."
    }
    clades_tsv: {
      description: "A TSV file containing clade mutation positions in four columns: [clade  gene    site    alt]; see: https://nextstrain.org/docs/tutorials/defining-clades",
      patterns: ["*.tsv", "*.txt"]
    }
  }

  # Source of the default reference, clades, lat/long and auspice config files.
  call nextstrain.nextstrain_ncov_defaults

  # A caller-supplied reference wins; otherwise use the default one. Resolved
  # once so the aligner and the clade assignment always see the same file.
  File resolved_ref_fasta = select_first([ref_fasta, nextstrain_ncov_defaults.reference_fasta])

  # Shared basename for the tip-frequency and auspice export outputs.
  String auspice_basename = "auspice-~{build_name}"

  #### mafft_and_snp
  call utils.zcat {
    input:
      infiles = assembly_fastas,
      output_name = "all_samples_combined_assembly.fasta"
  }
  call nextstrain.nextstrain_deduplicate_sequences as dedup_seqs {
    input:
      sequences_fasta = zcat.combined
  }
  call utils.filter_sequences_by_length {
    input:
      sequences_fasta = dedup_seqs.sequences_deduplicated_fasta,
      min_non_N = min_unambig_genome
  }
  call nextstrain.mafft_one_chr_chunked as mafft {
    input:
      sequences = filter_sequences_by_length.filtered_fasta,
      ref_fasta = resolved_ref_fasta,
      basename = "all_samples_aligned.fasta"
  }

  #### merge metadata, compute derived cols
  # tsv_join only runs for two or more metadata files; with a single file the
  # select_first below skips the undefined join output and picks that file.
  if(length(sample_metadata_tsvs)>1) {
    call utils.tsv_join {
      input:
        input_tsvs = sample_metadata_tsvs,
        id_col = 'strain',
        out_basename = "metadata-merged"
    }
  }
  call nextstrain.derived_cols {
    input:
      metadata_tsv = select_first(flatten([[tsv_join.out_tsv], sample_metadata_tsvs]))
  }

  ## Subsample if builds.yaml file provided
  if(defined(builds_yaml)) {
    call nextstrain.nextstrain_build_subsample as subsample {
      input:
        alignment_msa_fasta = mafft.aligned_sequences,
        sample_metadata_tsv = derived_cols.derived_metadata,
        build_name = build_name,
        builds_yaml = builds_yaml
    }
  }

  # The alignment every downstream step starts from: the subsampled MSA when
  # subsampling ran, the full mafft alignment otherwise.
  File tree_input_msa = select_first([subsample.subsampled_msa, mafft.aligned_sequences])

  call utils.fasta_to_ids {
    input:
      sequences_fasta = tree_input_msa
  }
  call nextstrain.snp_sites {
    input:
      msa_fasta = tree_input_msa
  }

  #### augur_from_msa
  call nextstrain.augur_mask_sites {
    input:
      sequences = tree_input_msa
  }
  call nextstrain.draft_augur_tree {
    input:
      msa_or_vcf = augur_mask_sites.masked_sequences
  }

  # FIX: refine_augur_tree and ancestral_tree previously received
  # select_first([subsample.subsampled_msa, augur_mask_sites.masked_sequences]).
  # Because augur_mask_sites already consumes the subsampled MSA, that expression
  # handed these steps the UNMASKED alignment whenever subsampling ran, while the
  # draft tree was built from the masked one. They now always use the masked
  # alignment, matching the draft tree and the non-subsampled code path.
  call nextstrain.refine_augur_tree {
    input:
      raw_tree = draft_augur_tree.aligned_tree,
      msa_or_vcf = augur_mask_sites.masked_sequences,
      metadata = derived_cols.derived_metadata,
      clock_rate = clock_rate,
      clock_std_dev = clock_std_dev,
      root = tree_root_seq_id
  }

  # Ancestral trait inference is skipped when no traits were requested
  # (input omitted or given as an empty list).
  Array[String] requested_traits = select_first([ancestral_traits_to_infer,[]])
  if(length(requested_traits)>0) {
    call nextstrain.ancestral_traits {
      input:
        tree = refine_augur_tree.tree_refined,
        metadata = derived_cols.derived_metadata,
        columns = requested_traits
    }
  }
  call nextstrain.tip_frequencies {
    input:
      tree = refine_augur_tree.tree_refined,
      metadata = derived_cols.derived_metadata,
      min_date = 2020.0,
      pivot_interval = 1,
      pivot_interval_units = "weeks",
      narrow_bandwidth = 0.05,
      proportion_wide = 0.0,
      out_basename = auspice_basename
  }
  call nextstrain.ancestral_tree {
    input:
      tree = refine_augur_tree.tree_refined,
      msa_or_vcf = augur_mask_sites.masked_sequences
  }
  call nextstrain.translate_augur_tree {
    input:
      tree = refine_augur_tree.tree_refined,
      nt_muts = ancestral_tree.nt_muts_json,
      genbank_gb = nextstrain_ncov_defaults.reference_gb
  }
  call nextstrain.assign_clades_to_nodes {
    input:
      tree_nwk = refine_augur_tree.tree_refined,
      nt_muts_json = ancestral_tree.nt_muts_json,
      aa_muts_json = translate_augur_tree.aa_muts_json,
      ref_fasta = resolved_ref_fasta,
      clades_tsv = select_first([clades_tsv, nextstrain_ncov_defaults.clades_tsv])
  }

  # Node data handed to the auspice export and also exposed as a workflow
  # output. select_all drops the ancestral_traits entry when that call was
  # skipped. Built once so the export input and the output cannot diverge.
  Array[File] collected_node_data = select_all([
                                      refine_augur_tree.branch_lengths,
                                      ancestral_traits.node_data_json,
                                      ancestral_tree.nt_muts_json,
                                      translate_augur_tree.aa_muts_json,
                                      assign_clades_to_nodes.node_clade_data_json])

  call nextstrain.export_auspice_json {
    input:
      tree = refine_augur_tree.tree_refined,
      sample_metadata = derived_cols.derived_metadata,
      lat_longs_tsv = select_first([lat_longs_tsv, nextstrain_ncov_defaults.lat_longs_tsv]),
      node_data_jsons = collected_node_data,
      auspice_config = select_first([auspice_config, nextstrain_ncov_defaults.auspice_config]),
      out_basename = auspice_basename
  }
  output {
    File combined_assemblies = filter_sequences_by_length.filtered_fasta
    File multiple_alignment = mafft.aligned_sequences
    File unmasked_snps = snp_sites.snps_vcf
    File masked_alignment = augur_mask_sites.masked_sequences
    File metadata_merged = derived_cols.derived_metadata
    File keep_list = fasta_to_ids.ids_txt
    File mafft_alignment = tree_input_msa
    File ml_tree = draft_augur_tree.aligned_tree
    File time_tree = refine_augur_tree.tree_refined
    Array[File] node_data_jsons = collected_node_data
    File tip_frequencies_json = tip_frequencies.node_data_json
    File root_sequence_json = export_auspice_json.root_sequence_json
    File auspice_input_json = export_auspice_json.virus_json
  }
}