Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions preprocessing/nextclade/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,8 @@ However, the `preprocessing` field can be customized to take an arbitrary number
4. `parse_date_into_range`: Takes an incomplete (or complete) date (Just `%Y`, just `%Y-%m` or a full date) and turns it into two date fields: an upper and a lower date for the date range. Can optionally take another date field (the release date) into account, as an upper bound for the date range. For example, a sample collected in "2025-03" and released "2025-03-23" will mean the lower bound for the collection date is 2025-03-01 and the upper bound is the release date, 2025-03-23. To use this function fully, define three metadata fields: one for the plain string, one for the upper bound, one for the lower bound. See example below.
5. `concatenate`: Take multiple metadata fields (including the accessionVersion) and concatenate them in the order specified by the `args.order` parameter; fields will first be processed based on their `args.type` (the order of the types should correspond to the order of the fields specified by the order argument).
6. `process_options`: Only accept input that is in `args.options`, this check is case-insensitive. If input value is not in options raise an error, or return null if the submitter is in the "insdc_ingest" group.
7. `check_regex`: Validate that the input field matches the pattern in `args.pattern`.
8. `extract_regex`: Extracts a substring from the input field using the provided regex `args.pattern` with a named `args.capture_group`. For example, the pattern `^(?P<segment>[^-]+)-(?P<subtype>[^-]+)$` with capture group `subtype` would extract `HA` from the field `seg1-HA`. Returns an error if the pattern does not match (and an internal error if the capture group does not exist in the pattern). If `args.uppercase` is set, the extracted string will be converted to uppercase.

Using these functions in your `values.yaml` will look like:

Expand Down
4 changes: 4 additions & 0 deletions preprocessing/nextclade/src/loculus_preprocessing/prepro.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@
from .processing_functions import (
ProcessingFunctions,
process_frameshifts,
process_phenotype_values,
process_stop_codons,
)
from .sequence_checks import errors_if_non_iupac
Expand Down Expand Up @@ -176,6 +177,9 @@ def add_nextclade_metadata(
case "qc.stopCodons.stopCodons":
result = None if raw is None else str(raw)
return process_stop_codons(result)
case "phenotypeValues":
result = None if raw is None else str(raw)
return process_phenotype_values(result, spec.args)
case _:
return InputData(datum=str(raw))

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
This makes it easy to test and reason about the code
"""

import ast
import calendar
import json
import logging
Expand Down Expand Up @@ -168,6 +169,16 @@ def format_authors(authors: str) -> str:
return "; ".join(loculus_authors).strip()


def regex_error(
    function_name: str, function_arg: str, input_data: InputMetadata, args: FunctionArgs
) -> str:
    """Build the standard internal-error message reported when a processing
    function received a missing or non-string regex-related argument."""
    template = (
        "Internal Error: Function {fn} did not receive valid "
        "regex {arg}, with input {data} and args {fn_args}, "
        "please contact the administrator."
    )
    return template.format(
        fn=function_name, arg=function_arg, data=input_data, fn_args=args
    )


class ProcessingFunctions:
@classmethod
def call_function(
Expand Down Expand Up @@ -842,6 +853,84 @@ def check_authors(
warnings=warnings,
)

@staticmethod
def extract_regex(
    input_data: InputMetadata,
    output_field: str,
    input_fields: list[str],
    args: FunctionArgs,
) -> ProcessingResult:
    """
    Extracts a substring from the `regex_field` input using the provided regex
    `args.pattern` with a named `args.capture_group`; if `args.uppercase` is
    truthy the extracted value is converted to uppercase.
    e.g. ^(?P<segment>[^-]+)-(?P<subtype>[^-]+)$ where segment or subtype could be used
    as a capture_group to extract their respective value from the regex_field.

    Returns datum=None with an error annotation when the input does not match,
    or when pattern/capture_group are misconfigured (internal error).
    """
    regex_field = input_data["regex_field"]

    warnings: list[ProcessingAnnotation] = []
    errors: list[ProcessingAnnotation] = []

    pattern = args.get("pattern")
    capture_group = args.get("capture_group")
    uppercase = args.get("uppercase", False)

    # Missing/empty input is not an error: simply emit no value.
    if not regex_field:
        return ProcessingResult(datum=None, warnings=warnings, errors=errors)
    if not isinstance(pattern, str):
        errors.append(
            ProcessingAnnotation.from_fields(
                input_fields,
                [output_field],
                AnnotationSourceType.METADATA,
                message=regex_error("extract_regex", "pattern", input_data, args),
            )
        )
        return ProcessingResult(datum=None, warnings=warnings, errors=errors)
    if not isinstance(capture_group, str):
        errors.append(
            ProcessingAnnotation.from_fields(
                input_fields,
                [output_field],
                AnnotationSourceType.METADATA,
                message=regex_error("extract_regex", "capture_group", input_data, args),
            )
        )
        return ProcessingResult(datum=None, warnings=warnings, errors=errors)
    try:
        match = re.match(pattern, regex_field.strip())
    except re.error:
        # An uncompilable pattern is a configuration problem, not a submitter
        # error: report it as an internal error instead of crashing the pipeline.
        errors.append(
            ProcessingAnnotation.from_fields(
                input_fields,
                [output_field],
                AnnotationSourceType.METADATA,
                message=regex_error("extract_regex", "pattern", input_data, args),
            )
        )
        return ProcessingResult(datum=None, warnings=warnings, errors=errors)
    if match:
        try:
            result = match.group(capture_group)
            if uppercase:
                result = result.upper()
            return ProcessingResult(datum=result, warnings=warnings, errors=errors)
        except IndexError:
            # re raises IndexError when the named group is absent from the pattern.
            errors.append(
                ProcessingAnnotation.from_fields(
                    input_fields,
                    [output_field],
                    AnnotationSourceType.METADATA,
                    message=(
                        f"The pattern '{pattern}' does not contain a capture group: "
                        f"'{capture_group}' - this is an internal error,"
                        " please contact your local administrator."
                    ),
                )
            )
    else:
        errors.append(
            ProcessingAnnotation.from_fields(
                input_fields,
                [output_field],
                AnnotationSourceType.METADATA,
                message=(
                    f"The value '{regex_field}' does not match the expected regex "
                    f"pattern: '{pattern}'."
                ),
            )
        )
    return ProcessingResult(datum=None, warnings=warnings, errors=errors)

@staticmethod
def check_regex(
input_data: InputMetadata,
Expand All @@ -868,11 +957,7 @@ def check_regex(
input_fields,
[output_field],
AnnotationSourceType.METADATA,
message=(
f"Internal Error: Function check_regex did not receive valid "
f"regex pattern, with input {input_data} and args {args}, "
"please contact the administrator."
),
message=regex_error("check_regex", "pattern", input_data, args),
)
)
return ProcessingResult(datum=None, warnings=warnings, errors=errors)
Expand Down Expand Up @@ -1169,6 +1254,32 @@ def format_stop_codon(result: str | None) -> str | None:
return ",".join(stop_codon_strings)


def process_phenotype_values(input: str | None, args: FunctionArgs | None) -> InputData:
    """Look up the phenotype entry whose name equals ``args["name"]`` in a
    stringified list of phenotype dicts and return its value as InputData."""
    if input is None:
        return InputData(datum=None)
    wanted_name = args.get("name", "") if args else ""
    try:
        # literal_eval (not json.loads): the value arrives as str() of an
        # already-parsed object, i.e. Python repr — TODO confirm upstream format.
        entries = ast.literal_eval(input)
        matching = next(
            (entry for entry in entries if entry.get("name") == wanted_name), None
        )
    except Exception as e:
        msg = (
            "Was unable to process phenotype values - this is likely an internal error. "
            "Please contact the administrator."
        )
        logger.error(msg + f" Error: {e}")
        return InputData(
            datum=None,
            errors=single_metadata_annotation(
                "phenotypeValues",
                msg,
            ),
        )
    if matching is not None:
        return InputData(datum=str(matching.get("value")))
    return InputData(datum=None)


def trim_ns(sequence: str) -> str:
"""
Trims 'N' and 'n' characters from the start and end of a nucleotide sequence.
Expand Down
7 changes: 7 additions & 0 deletions preprocessing/nextclade/tests/no_alignment_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -82,5 +82,12 @@ processing_spec:
function: check_regex
args:
pattern: "^EPI_ISL_[0-9]+$"
inputs:
regex_field: regex_field
extracted_regex_field:
function: extract_regex
args:
pattern: "^EPI_ISL_(?P<id>[0-9]+)$"
capture_group: id
inputs:
regex_field: regex_field
Original file line number Diff line number Diff line change
Expand Up @@ -219,6 +219,7 @@
"required_collection_date": "2022-11-01",
"concatenated_string": "LOC_6.1/2022-11-01",
"regex_field": "EPI_ISL_123456",
"extracted_regex_field": "123456",
},
expected_errors=[],
expected_warnings=[],
Expand All @@ -239,6 +240,7 @@
"required_collection_date": "2022-11-01",
"concatenated_string": "LOC_6.1/2022-11-01",
"regex_field": None,
"extracted_regex_field": None,
},
expected_errors=build_processing_annotations(
[
Expand All @@ -250,6 +252,14 @@
"'^EPI_ISL_[0-9]+$'."
),
),
ProcessingAnnotationHelper(
["regex_field"],
["extracted_regex_field"],
(
"The value 'EPIISL_123456' does not match the expected regex pattern: "
"'^EPI_ISL_(?P<id>[0-9]+)$'."
),
),
]
),
expected_warnings=[],
Expand Down Expand Up @@ -598,6 +608,7 @@
"concatenated_string": "LOC_16.1/2022-11-01",
"authors": "Smith, John II; Doe, A. B. C.",
"regex_field": "EPI_ISL_123456",
"extracted_regex_field": "123456",
},
expected_errors=[],
expected_warnings=[],
Expand Down
16 changes: 16 additions & 0 deletions preprocessing/nextclade/tests/test_nextclade_preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
from loculus_preprocessing.processing_functions import (
format_frameshift,
format_stop_codon,
process_phenotype_values,
)

# Config file used for testing
Expand Down Expand Up @@ -1246,6 +1247,21 @@ def test_format_stop_codon():
assert format_stop_codon(input_zero) == expected_zero


def test_process_phenotype_values():
    """Cover: empty list, named lookup, missing name arg, malformed input."""
    assert process_phenotype_values("[]", {"name": "NAI"}).datum is None

    two_entries = (
        '[{"name": "NAI","cds": "NA","value": 0.0}, {"name": "Other","cds": "NA","value": 1.0}]'
    )
    assert process_phenotype_values(two_entries, {"name": "NAI"}).datum == "0.0"

    missing_name = process_phenotype_values(
        '[{"name": "NAI","cds": "NA","value": 0.0}]', {}
    )
    assert missing_name.datum is None

    invalid = process_phenotype_values("Malformed JSON", {"name": "NAI"})
    assert invalid.datum is None
    assert "Was unable to process phenotype values" in invalid.errors[0].message


def test_reformat_authors_from_loculus_to_embl_style():
authors = "Xi,L.;Smith, Anna Maria; Perez Gonzalez, Anthony J.;Doe,;von Doe, John"
result = reformat_authors_from_loculus_to_embl_style(authors)
Expand Down
Loading