public_health_viral_genomics/workflows/wf_theiacov_augur_distance_tree.wdl at main · theiagen/public_health_viral_genomics · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
version 1.0

import "../tasks/tasks_nextstrain.wdl" as nextstrain
import "../tasks/tasks_utils.wdl" as utils
import "../tasks/task_phylo.wdl" as phylo
import "../tasks/task_versioning.wdl" as versioning

# TheiaCoV_Augur_DistanceTree
#
# Combines the supplied assemblies, deduplicates and length-filters them,
# aligns them against a reference, optionally subsamples the alignment
# (only when a builds.yaml is supplied), then produces a draft tree from the
# masked alignment and a SNP matrix from the unmasked alignment.
workflow theiacov_distance_tree {
  meta {
    description: "Workflow for SC2 cluster investigations. TheiaCoV_Augur_DistanceTree will generate a ML distance tree using select tasks incorporated in the TheiaCoV_Augur_Run workflow; output from the modified sarscov2_nextstrain workflow will also be used to infer SNP distances. The ML distance tree output can be visualized using the Auspice web application https://auspice.us/"
    author: "Kevin G Libuit"
    email: "kevin.libuit@theiagen.com"
  }
  input {
    Array[File]+ assembly_fastas
    Array[File]+ sample_metadata_tsvs
    String build_name
    File? builds_yaml
    File? ref_fasta
    Int min_unambig_genome = 27000
  }
  # Every key below corresponds to a declared workflow input, as the WDL 1.0
  # specification requires for parameter_meta.
  parameter_meta {
    assembly_fastas: {
      description: "Set of assembled genomes to align and build trees. These must represent a single chromosome/segment of a genome only. Fastas may be one-sequence-per-individual or a concatenated multi-fasta (unaligned) or a mixture of the two. They may be compressed (gz, bz2, zst, lz4), uncompressed, or a mixture.",
      patterns: ["*.fasta", "*.fa", "*.fasta.gz", "*.fasta.zst"]
    }
    sample_metadata_tsvs: {
      description: "Tab-separated metadata files that contain binning variables and values. Must contain all samples: output will be filtered to the IDs present in these files. When more than one file is supplied they are joined on the 'strain' column.",
      patterns: ["*.txt", "*.tsv"]
    }
    build_name: {
      description: "Name of the build; passed to the subsampling task as the build name and to the SNP-distance task as the cluster name."
    }
    builds_yaml: {
      description: "Optional builds.yaml file. When provided, the alignment is subsampled before tree building and SNP-distance calculation; when omitted, the full alignment is used."
    }
    ref_fasta: {
      description: "A reference assembly (not included in assembly_fastas) to align assembly_fastas against. Typically from NCBI RefSeq or similar. When omitted, the reference from nextstrain_ncov_defaults is used.",
      patterns: ["*.fasta", "*.fa"]
    }
    min_unambig_genome: {
      description: "Minimum number of called bases in genome to pass prefilter."
    }
  }

  # Supplies the fallback reference FASTA used when ref_fasta is not given.
  call nextstrain.nextstrain_ncov_defaults

  #### mafft_and_snp
  call utils.zcat {
    input:
      infiles = assembly_fastas,
      output_name = "all_samples_combined_assembly.fasta"
  }
  call nextstrain.nextstrain_deduplicate_sequences as dedup_seqs {
    input:
      sequences_fasta = zcat.combined
  }
  call utils.filter_sequences_by_length {
    input:
      sequences_fasta = dedup_seqs.sequences_deduplicated_fasta,
      min_non_N = min_unambig_genome
  }
  call nextstrain.mafft_one_chr_chunked as mafft {
    input:
      sequences = filter_sequences_by_length.filtered_fasta,
      # A user-supplied reference takes precedence over the packaged default.
      ref_fasta = select_first([ref_fasta, nextstrain_ncov_defaults.reference_fasta]),
      basename = "all_samples_aligned.fasta"
  }

  #### merge metadata, compute derived cols
  # Joining is only needed when several metadata files were supplied.
  if (length(sample_metadata_tsvs) > 1) {
    call utils.tsv_join {
      input:
        input_tsvs = sample_metadata_tsvs,
        id_col = 'strain',
        out_basename = "metadata-merged"
    }
  }
  call nextstrain.derived_cols {
    input:
      # Uses the merged TSV when the join ran; otherwise falls through to the
      # first (and then only) supplied metadata file.
      metadata_tsv = select_first(flatten([[tsv_join.out_tsv], sample_metadata_tsvs]))
  }

  ## Subsample if builds.yaml file provided
  if (defined(builds_yaml)) {
    call nextstrain.nextstrain_build_subsample as subsample {
      input:
        alignment_msa_fasta = mafft.aligned_sequences,
        sample_metadata_tsv = derived_cols.derived_metadata,
        build_name = build_name,
        builds_yaml = builds_yaml
    }
  }

  # Single source of truth for the alignment consumed downstream: the
  # subsampled MSA when subsampling ran, otherwise the full MAFFT alignment.
  File selected_msa = select_first([subsample.subsampled_msa, mafft.aligned_sequences])

  call utils.fasta_to_ids {
    input:
      sequences_fasta = selected_msa
  }
  call nextstrain.snp_sites {
    input:
      msa_fasta = selected_msa
  }

  #### augur_from_msa
  call nextstrain.augur_mask_sites {
    input:
      sequences = selected_msa
  }
  # The tree is built from the masked alignment ...
  call nextstrain.draft_augur_tree {
    input:
      msa_or_vcf = augur_mask_sites.masked_sequences
  }
  # ... whereas SNP distances are computed on the unmasked alignment.
  call phylo.snp_dists {
    input:
      cluster_name = build_name,
      alignment = selected_msa
  }

  # No inputs are overridden, so the call needs no body.
  call versioning.version_capture

  output {
    # Version Capture
    String TheiaCoV_Augur_DistanceTree_version = version_capture.phvg_version
    String TheiaCoV_Augur_DistanceTree_analysis_date = version_capture.date
    # Tree, Intermediates, and Metadata
    File combined_assemblies = filter_sequences_by_length.filtered_fasta
    File multiple_alignment = mafft.aligned_sequences
    File unmasked_snps = snp_sites.snps_vcf
    File masked_alignment = augur_mask_sites.masked_sequences
    File metadata_merged = derived_cols.derived_metadata
    File keep_list = fasta_to_ids.ids_txt
    File mafft_alignment = selected_msa
    File distance_tree = draft_augur_tree.aligned_tree
    # SNP Matrix
    File snp_matrix = snp_dists.snp_matrix
  }
}