Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions preprocessing/nextclade/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,8 @@ However, the `preprocessing` field can be customized to take an arbitrary number
4. `parse_date_into_range`: Takes an incomplete (or complete) date (Just `%Y`, just `%Y-%m` or a full date) and turns it into two date fields: an upper and a lower date for the date range. Can optionally take another date field (the release date) into account, as an upper bound for the date range. For example, a sample collected in "2025-03" and released "2025-03-23" will mean the lower bound for the collection date is 2025-03-01 and the upper bound is the release date, 2025-03-23. To use this function fully, define three metadata fields: one for the plain string, one for the upper bound, one for the lower bound. See example below.
5. `concatenate`: Take multiple metadata fields (including the accessionVersion) and concatenate them in the order specified by the `args.order` parameter; fields will first be processed based on their `args.type` (the order of the types should correspond to the order of the fields specified by the order argument).
6. `process_options`: Only accept input that is in `args.options`, this check is case-insensitive. If input value is not in options raise an error, or return null if the submitter is in the "insdc_ingest" group.
7. `check_regex`: Validate that the input field matches the pattern in `args.pattern`.
8. `extract_regex`: Extracts a substring from the input field using the provided regex `args.pattern` with a named `args.capture_group`. For example, the pattern `^(?P<segment>[^-]+)-(?P<subtype>[^-]+)$` with capture group `subtype` would extract `HA` from the field `seg1-HA`. Returns an error if the pattern does not match (and an internal error if the capture group does not exist in the pattern). If `args.uppercase` is set, the extracted string will be converted to uppercase.

Using these functions in your `values.yaml` will look like:

Expand Down
4 changes: 4 additions & 0 deletions preprocessing/nextclade/src/loculus_preprocessing/prepro.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@
from .processing_functions import (
ProcessingFunctions,
process_frameshifts,
process_phenotype_values,
process_stop_codons,
)
from .sequence_checks import errors_if_non_iupac
Expand Down Expand Up @@ -176,6 +177,9 @@ def add_nextclade_metadata(
case "qc.stopCodons.stopCodons":
result = None if raw is None else str(raw)
return process_stop_codons(result)
case "phenotypeValues":
result = None if raw is None else str(raw)
return process_phenotype_values(result, spec.args)
case _:
return InputData(datum=str(raw))

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
This makes it easy to test and reason about the code
"""

import ast
import calendar
import json
import logging
Expand Down Expand Up @@ -168,6 +169,16 @@ def format_authors(authors: str) -> str:
return "; ".join(loculus_authors).strip()


def regex_error(
    function_name: str, function_arg: str, input_data: InputMetadata, args: FunctionArgs
) -> str:
    """Build the standard internal-error message reported when a processing
    function received a missing or non-string regex-related argument."""
    template = (
        "Internal Error: Function {fn} did not receive valid "
        "regex {arg}, with input {data} and args {fn_args}, "
        "please contact the administrator."
    )
    return template.format(
        fn=function_name, arg=function_arg, data=input_data, fn_args=args
    )


class ProcessingFunctions:
@classmethod
def call_function(
Expand Down Expand Up @@ -842,6 +853,84 @@ def check_authors(
warnings=warnings,
)

@staticmethod
def extract_regex(
    input_data: InputMetadata,
    output_field: str,
    input_fields: list[str],
    args: FunctionArgs,
) -> ProcessingResult:
    """
    Extracts a substring from the `regex_field` input using the provided regex
    `args.pattern` with a named `args.capture_group`; if `args.uppercase` is
    truthy the extracted value is converted to uppercase.
    e.g. ^(?P<segment>[^-]+)-(?P<subtype>[^-]+)$ where segment or subtype could be used
    as a capture_group to extract their respective value from the regex_field.

    Returns datum=None with an error annotation when the input does not match,
    or when pattern/capture_group are misconfigured (internal error).
    """
    regex_field = input_data["regex_field"]

    warnings: list[ProcessingAnnotation] = []
    errors: list[ProcessingAnnotation] = []

    pattern = args.get("pattern")
    capture_group = args.get("capture_group")
    uppercase = args.get("uppercase", False)

    # Missing/empty input is not an error: simply emit no value.
    if not regex_field:
        return ProcessingResult(datum=None, warnings=warnings, errors=errors)
    if not isinstance(pattern, str):
        errors.append(
            ProcessingAnnotation.from_fields(
                input_fields,
                [output_field],
                AnnotationSourceType.METADATA,
                message=regex_error("extract_regex", "pattern", input_data, args),
            )
        )
        return ProcessingResult(datum=None, warnings=warnings, errors=errors)
    if not isinstance(capture_group, str):
        errors.append(
            ProcessingAnnotation.from_fields(
                input_fields,
                [output_field],
                AnnotationSourceType.METADATA,
                message=regex_error("extract_regex", "capture_group", input_data, args),
            )
        )
        return ProcessingResult(datum=None, warnings=warnings, errors=errors)
    try:
        match = re.match(pattern, regex_field.strip())
    except re.error:
        # An uncompilable pattern is a configuration problem, not a submitter
        # error: report it as an internal error instead of crashing the pipeline.
        errors.append(
            ProcessingAnnotation.from_fields(
                input_fields,
                [output_field],
                AnnotationSourceType.METADATA,
                message=regex_error("extract_regex", "pattern", input_data, args),
            )
        )
        return ProcessingResult(datum=None, warnings=warnings, errors=errors)
    if match:
        try:
            result = match.group(capture_group)
            if uppercase:
                result = result.upper()
            return ProcessingResult(datum=result, warnings=warnings, errors=errors)
        except IndexError:
            # re raises IndexError when the named group is absent from the pattern.
            errors.append(
                ProcessingAnnotation.from_fields(
                    input_fields,
                    [output_field],
                    AnnotationSourceType.METADATA,
                    message=(
                        f"The pattern '{pattern}' does not contain a capture group: "
                        f"'{capture_group}' - this is an internal error,"
                        " please contact your local administrator."
                    ),
                )
            )
    else:
        errors.append(
            ProcessingAnnotation.from_fields(
                input_fields,
                [output_field],
                AnnotationSourceType.METADATA,
                message=(
                    f"The value '{regex_field}' does not match the expected regex "
                    f"pattern: '{pattern}'."
                ),
            )
        )
    return ProcessingResult(datum=None, warnings=warnings, errors=errors)

@staticmethod
def check_regex(
input_data: InputMetadata,
Expand All @@ -868,11 +957,7 @@ def check_regex(
input_fields,
[output_field],
AnnotationSourceType.METADATA,
message=(
f"Internal Error: Function check_regex did not receive valid "
f"regex pattern, with input {input_data} and args {args}, "
"please contact the administrator."
),
message=regex_error("check_regex", "pattern", input_data, args),
)
)
return ProcessingResult(datum=None, warnings=warnings, errors=errors)
Expand Down Expand Up @@ -1169,6 +1254,32 @@ def format_stop_codon(result: str | None) -> str | None:
return ",".join(stop_codon_strings)


def process_phenotype_values(input: str | None, args: FunctionArgs | None) -> InputData:
    """Look up the phenotype entry whose name equals ``args["name"]`` in a
    stringified list of phenotype dicts and return its value as InputData."""
    if input is None:
        return InputData(datum=None)
    wanted_name = args.get("name", "") if args else ""
    try:
        # literal_eval (not json.loads): the value arrives as str() of an
        # already-parsed object, i.e. Python repr — TODO confirm upstream format.
        entries = ast.literal_eval(input)
        matching = next(
            (entry for entry in entries if entry.get("name") == wanted_name), None
        )
    except Exception as e:
        msg = (
            "Was unable to process phenotype values - this is likely an internal error. "
            "Please contact the administrator."
        )
        logger.error(msg + f" Error: {e}")
        return InputData(
            datum=None,
            errors=single_metadata_annotation(
                "phenotypeValues",
                msg,
            ),
        )
    if matching is not None:
        return InputData(datum=str(matching.get("value")))
    return InputData(datum=None)


def trim_ns(sequence: str) -> str:
"""
Trims 'N' and 'n' characters from the start and end of a nucleotide sequence.
Expand Down
7 changes: 7 additions & 0 deletions preprocessing/nextclade/tests/no_alignment_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -82,5 +82,12 @@ processing_spec:
function: check_regex
args:
pattern: "^EPI_ISL_[0-9]+$"
inputs:
regex_field: regex_field
extracted_regex_field:
function: extract_regex
args:
pattern: "^EPI_ISL_(?P<id>[0-9]+)$"
capture_group: id
inputs:
regex_field: regex_field
Original file line number Diff line number Diff line change
Expand Up @@ -219,6 +219,7 @@
"required_collection_date": "2022-11-01",
"concatenated_string": "LOC_6.1/2022-11-01",
"regex_field": "EPI_ISL_123456",
"extracted_regex_field": "123456",
},
expected_errors=[],
expected_warnings=[],
Expand All @@ -239,6 +240,7 @@
"required_collection_date": "2022-11-01",
"concatenated_string": "LOC_6.1/2022-11-01",
"regex_field": None,
"extracted_regex_field": None,
},
expected_errors=build_processing_annotations(
[
Expand All @@ -250,6 +252,14 @@
"'^EPI_ISL_[0-9]+$'."
),
),
ProcessingAnnotationHelper(
["regex_field"],
["extracted_regex_field"],
(
"The value 'EPIISL_123456' does not match the expected regex pattern: "
"'^EPI_ISL_(?P<id>[0-9]+)$'."
),
),
]
),
expected_warnings=[],
Expand Down Expand Up @@ -598,6 +608,7 @@
"concatenated_string": "LOC_16.1/2022-11-01",
"authors": "Smith, John II; Doe, A. B. C.",
"regex_field": "EPI_ISL_123456",
"extracted_regex_field": "123456",
},
expected_errors=[],
expected_warnings=[],
Expand Down
16 changes: 16 additions & 0 deletions preprocessing/nextclade/tests/test_nextclade_preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
from loculus_preprocessing.processing_functions import (
format_frameshift,
format_stop_codon,
process_phenotype_values,
)

# Config file used for testing
Expand Down Expand Up @@ -1246,6 +1247,21 @@ def test_format_stop_codon():
assert format_stop_codon(input_zero) == expected_zero


def test_process_phenotype_values():
    """Cover: empty list, named lookup, missing name arg, malformed input."""
    assert process_phenotype_values("[]", {"name": "NAI"}).datum is None

    two_entries = (
        '[{"name": "NAI","cds": "NA","value": 0.0}, {"name": "Other","cds": "NA","value": 1.0}]'
    )
    assert process_phenotype_values(two_entries, {"name": "NAI"}).datum == "0.0"

    missing_name = process_phenotype_values(
        '[{"name": "NAI","cds": "NA","value": 0.0}]', {}
    )
    assert missing_name.datum is None

    invalid = process_phenotype_values("Malformed JSON", {"name": "NAI"})
    assert invalid.datum is None
    assert "Was unable to process phenotype values" in invalid.errors[0].message


def test_reformat_authors_from_loculus_to_embl_style():
authors = "Xi,L.;Smith, Anna Maria; Perez Gonzalez, Anthony J.;Doe,;von Doe, John"
result = reformat_authors_from_loculus_to_embl_style(authors)
Expand Down
Loading