Merge branch 'main' of github.com:Adamtaranto/tSplit

Adamtaranto · Adamtaranto · commit 8cedee96f01a · 2026-01-19T11:24:25.000+11:00
diff --git a/.github/workflows/ruff.yml b/.github/workflows/ruff.yml
@@ -9,7 +9,7 @@ jobs:
       contents: write # Allows reading and writing repository contents (e.g., commits)
       pull-requests: write # Allows reading and writing pull requests
     steps:
-      - uses: actions/checkout@v5
+      - uses: actions/checkout@v6
         with:
           ref: ${{ github.sha }}
           token: ${{ secrets.GITHUB_TOKEN }}
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -20,13 +20,13 @@ repos:
         args: [--pytest-test-first]
       - id: trailing-whitespace
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.14.3
+    rev: v0.14.11
     hooks:
       - id: ruff-check
         args: [--fix, --exit-non-zero-on-fix, --show-fixes]
       - id: ruff-format
   - repo: https://github.com/numpy/numpydoc
-    rev: v1.9.0
+    rev: v1.10.0
     hooks:
       - id: numpydoc-validation
         exclude: (tests/|docs/|scripts/).*
@@ -38,7 +38,7 @@ repos:
     rev: v0.20.0
     hooks:
       - id: yamlfmt
-  - repo: https://github.com/pre-commit/mirrors-prettier
-    rev: v4.0.0-alpha.8
-    hooks:
-      - id: prettier
+#  - repo: https://github.com/pre-commit/mirrors-prettier
+#    rev: v4.0.0-alpha.8
+#    hooks:
+#      - id: prettier
diff --git a/LICENSE b/LICENSE
diff --git a/README.md b/README.md
@@ -80,13 +80,13 @@ Split segments will be written to _TIR_split_tsplit_output.fasta_ with suffix "\
 TIRs must be at least 10bp in length and share 80%
 identity and occur within 10bp of each end of the input element.
 
-Additionally, synthetic MITEs will be constructed by concatenation of left and right TIRs, with internal segments excised.
 
 ```bash
-tsplit TIR -i tests/data/TIR_element.fa -p TIR_split --makemites --keeptemp
+tsplit TIR -i tests/data/TIR_element.fa -p TIR_split
 
 # Equivalent to defaults
-tsplit TIR -i tests/data/TIR_element.fa -p TIR_split --maxdist 10 --minid 80.0 --minterm 10 --method blastn --splitmode split --makemites --keeptemp
+tsplit TIR -i tests/data/TIR_element.fa -p TIR_split --maxdist 10 --minid 80.0 --minterm 10 --method blastn --splitmode split
+# Use '--both' if you want to report both left and right TIRs
 ```
 
 Output: `TIR_split_tsplit_output.fasta`
diff --git a/environment.yml b/environment.yml
@@ -10,6 +10,4 @@ dependencies:
   - pip:
       - "biopython>=1.70"
       - "pymummer>=0.12.0"
-      - pytest
-      - hatch
       - rich
diff --git a/pyproject.toml b/pyproject.toml
@@ -9,7 +9,7 @@ name = "tsplit"
 description = "Extract terminal repeats from retrotransposons (LTRs) or DNA transposons (TIRs). Compose synthetic MITES from complete DNA transposons."
 readme = "README.md"
 requires-python = ">=3.8"
-license = { text = "MIT" }
+license = { text = "GPL-3.0-or-later" }
 authors = [{ name = "Adam Taranto" }]
 
 # Classifiers for project categorization
@@ -35,9 +35,11 @@ dev = [
     "mkdocs",
     "mkdocstrings-python",
     "mkdocstrings",
+    "mypy",
     "notebook",
     "numpydoc-validation",
     "pre-commit",
+    "pydocstyle",
     "pymdown-extensions",
     "pytest-cov",
     "pytest",
@@ -101,6 +103,9 @@ ignore = [
     "B905", # `zip()` without an explicit `strict=` parameter
 ]
 
+# Don't auto-fix docstring issues - they're too fragile
+unfixable = ["D"]
+
 [tool.ruff.lint.per-file-ignores]
 "tests/*" = ["D"] # Ignore all pydocstyle rules in tests
 
@@ -137,6 +142,11 @@ override_SS05 = [ # allow docstrings to start with these words
     '^Access ',
 ]
 
+# Don't process filepaths that match these regex patterns
+exclude_files = [
+    '^_version\\.py$',
+]
+
 [tool.mypy]
 python_version = "3.10"
 warn_return_any = true
@@ -145,5 +155,5 @@ disallow_untyped_defs = true
 
 [tool.pydocstyle]
 convention = "numpy"
-# Ignore files in directories named "tests"
-match-dir = "((?!tests).)*"
+match-dir = "[^\\.].*"     # matches all dirs that don't start with a dot
+match = "(?!test_).*\\.py" # matches files that don't start with 'test_' but end with '.py'
diff --git a/src/tsplit/app_tsplit.py b/src/tsplit/app_tsplit.py
@@ -130,6 +130,12 @@ def parse_args() -> Namespace:
         choices=['blastn', 'nucmer'],
         help='Select alignment method: "blastn" or "nucmer".(Default: blastn)',
     )
+    tir_parser.add_argument(
+        '--both',
+        action='store_true',
+        default=False,
+        help='Report both left and right terminal repeats when splitmode is one of {all, split, external}. Suffixes will be "_L_TIR" and "_R_TIR". Right TIR will be reverse complemented for alignment with left TIR.',
+    )
 
     # Set up parser for LTR subcommand
     ltr_parser = subparsers.add_parser(
@@ -212,6 +218,12 @@ def parse_args() -> Namespace:
         choices=['blastn', 'nucmer'],
         help='Select alignment method: "blastn" or "nucmer".(Default: blastn)',
     )
+    ltr_parser.add_argument(
+        '--both',
+        action='store_true',
+        default=False,
+        help='Report both left and right terminal repeats when splitmode is one of {all, split, external}. Suffixes will be "_L_LTR" and "_R_LTR". Right LTR will be in the same orientation as left LTR.',
+    )
 
     # Parse and return the command-line arguments
     return parser.parse_args()
diff --git a/src/tsplit/cmd_LTR.py b/src/tsplit/cmd_LTR.py
@@ -79,6 +79,7 @@ def main(args: Optional[Namespace] = None) -> None:
         temp=args.outdir,  # Directory for temporary files
         alignTool=args.method,  # Alignment tool (blastn or nucmer)
         keeptemp=args.keeptemp,  # Whether to keep temporary files
+        both=args.both,  # Whether to report both terminal repeats
     )
 
     # Write the identified segments to output file
diff --git a/src/tsplit/cmd_TIR.py b/src/tsplit/cmd_TIR.py
@@ -83,6 +83,7 @@ def main(args: Optional[Namespace] = None) -> None:
         alignTool=args.method,  # Alignment tool (blastn or nucmer)
         temp=args.outdir,  # Directory for temporary files
         keeptemp=args.keeptemp,  # Whether to keep temporary files
+        both=args.both,  # Whether to report both terminal repeats
     )
 
     # Write the identified segments to output file
diff --git a/src/tsplit/parseAlign.py b/src/tsplit/parseAlign.py
@@ -37,6 +37,7 @@ def getTIRs(
     keeptemp: bool = False,
     alignTool: str = 'nucmer',
     verbose: bool = True,
+    both: bool = False,
 ) -> Generator[SeqRecord, None, None]:
     """
     Align elements to self and attempt to identify TIRs.
@@ -71,6 +72,8 @@ def getTIRs(
         Alignment tool to use ('nucmer' or 'blastn'), by default 'nucmer'.
     verbose : bool, optional
         Whether to print verbose output, by default True.
+    both : bool, optional
+        Whether to report both left and right terminal repeats, by default False.
 
     Yields
     ------
@@ -85,6 +88,10 @@ def getTIRs(
     -----
     When mites=True, the function will also yield synthetic MITEs constructed by
     joining the identified TIRs.
+
+    When both=True and report is in ['split', 'external', 'all'], both left and right
+    TIRs will be yielded with suffixes '_L_TIR' and '_R_TIR'. The right TIR will be
+    reverse complemented so it can be aligned with the left TIR.
     """
     # Set temp directory to cwd if none is provided
     if not temp:
@@ -212,15 +219,42 @@ def getTIRs(
                         print(pairwise_alignments[0])
 
                     if report in ['split', 'external', 'all']:
-                        # yield TIR slice - append "_TIR"
-                        extSeg = rec[ref_start : ref_end + 1]  # +1 to include end base
-                        extSeg.id = f'{extSeg.id}_TIR'
-                        extSeg.name = extSeg.id
-                        extSeg.description = f'[{rec.id} TIR segment]'
-                        logging.info(
-                            f'Yielding TIR segment: {extSeg.id}, len: {len(extSeg)}bp'
-                        )
-                        yield extSeg
+                        if both:
+                            # Yield both left and right TIRs
+                            # Left TIR
+                            leftSeg = rec[ref_start : ref_end + 1]
+                            leftSeg.id = f'{leftSeg.id}_L_TIR'
+                            leftSeg.name = leftSeg.id
+                            leftSeg.description = f'[{rec.id} left TIR segment]'
+                            logging.info(
+                                f'Yielding left TIR segment: {leftSeg.id}, len: {len(leftSeg)}bp'
+                            )
+                            yield leftSeg
+
+                            # Right TIR - reverse complement so it aligns with left TIR
+                            rightSeg = rec[qry_start : qry_end + 1]
+                            # Reverse complement the right TIR
+                            rightSeg = rightSeg.reverse_complement(
+                                id=f'{rec.id}_R_TIR',
+                                name=f'{rec.id}_R_TIR',
+                                description=f'[{rec.id} right TIR segment, reverse complemented]',
+                            )
+                            logging.info(
+                                f'Yielding right TIR segment (reverse complemented): {rightSeg.id}, len: {len(rightSeg)}bp'
+                            )
+                            yield rightSeg
+                        else:
+                            # yield TIR slice - append "_TIR"
+                            extSeg = rec[
+                                ref_start : ref_end + 1
+                            ]  # +1 to include end base
+                            extSeg.id = f'{extSeg.id}_TIR'
+                            extSeg.name = extSeg.id
+                            extSeg.description = f'[{rec.id} TIR segment]'
+                            logging.info(
+                                f'Yielding TIR segment: {extSeg.id}, len: {len(extSeg)}bp'
+                            )
+                            yield extSeg
 
                     if report in ['split', 'internal', 'all']:
                         # yield internal slice - append "_I"
@@ -371,6 +405,7 @@ def getLTRs(
     keeptemp: bool = False,
     alignTool: str = 'nucmer',
     verbose: bool = True,
+    both: bool = False,
 ) -> Generator[SeqRecord, None, None]:
     """
     Align elements to self and attempt to identify LTRs.
@@ -402,6 +437,8 @@ def getLTRs(
         Alignment tool to use ('nucmer' or 'blastn'), by default 'nucmer'.
     verbose : bool, optional
         Whether to print verbose output, by default True.
+    both : bool, optional
+        Whether to report both left and right terminal repeats, by default False.
 
     Yields
     ------
@@ -411,6 +448,13 @@ def getLTRs(
         - 'external': Only LTRs
         - 'internal': Only internal regions
         - 'all': Original sequences plus all segments
+
+    Notes
+    -----
+    When both=True and report is in ['split', 'external', 'all'], both left and right
+    LTRs will be yielded with suffixes '_L_LTR' and '_R_LTR'. Unlike TIRs, the right
+    LTR will NOT be reverse complemented as it is already in the same orientation as
+    the left LTR.
     """
     # Set temp directory to cwd if none is provided
     if not temp:
@@ -520,15 +564,39 @@ def getLTRs(
                         print(pairwise_alignments[0])
 
                     if report in ['split', 'external', 'all']:
-                        # yield LTR slice - append "_LTR"
-                        extSeg = rec[ref_start : ref_end + 1]  # +1 to include end base
-                        extSeg.id = f'{extSeg.id}_LTR'
-                        extSeg.name = extSeg.id
-                        extSeg.description = f'[{rec.id} LTR segment]'
-                        logging.info(
-                            f'Yielding LTR segment: {extSeg.id}, len: {len(extSeg)}bp'
-                        )
-                        yield extSeg
+                        if both:
+                            # Yield both left and right LTRs
+                            # Left LTR
+                            leftSeg = rec[ref_start : ref_end + 1]
+                            leftSeg.id = f'{leftSeg.id}_L_LTR'
+                            leftSeg.name = leftSeg.id
+                            leftSeg.description = f'[{rec.id} left LTR segment]'
+                            logging.info(
+                                f'Yielding left LTR segment: {leftSeg.id}, len: {len(leftSeg)}bp'
+                            )
+                            yield leftSeg
+
+                            # Right LTR - keep in same orientation (do NOT reverse complement)
+                            rightSeg = rec[qry_start : qry_end + 1]
+                            rightSeg.id = f'{rightSeg.id}_R_LTR'
+                            rightSeg.name = rightSeg.id
+                            rightSeg.description = f'[{rec.id} right LTR segment]'
+                            logging.info(
+                                f'Yielding right LTR segment: {rightSeg.id}, len: {len(rightSeg)}bp'
+                            )
+                            yield rightSeg
+                        else:
+                            # yield LTR slice - append "_LTR"
+                            extSeg = rec[
+                                ref_start : ref_end + 1
+                            ]  # +1 to include end base
+                            extSeg.id = f'{extSeg.id}_LTR'
+                            extSeg.name = extSeg.id
+                            extSeg.description = f'[{rec.id} LTR segment]'
+                            logging.info(
+                                f'Yielding LTR segment: {extSeg.id}, len: {len(extSeg)}bp'
+                            )
+                            yield extSeg
 
                     if report in ['split', 'internal', 'all']:
                         # yield internal slice - append "_I"
diff --git a/tests/test_both_option.py b/tests/test_both_option.py
diff --git a/tests/test_both_option_unit.py b/tests/test_both_option_unit.py

Original file line number	Diff line number	Diff line change
`@@ -79,6 +79,7 @@ def main(args: Optional[Namespace] = None) -> None:`
`79`	`79`	`temp=args.outdir, # Directory for temporary files`
`80`	`80`	`alignTool=args.method, # Alignment tool (blastn or nucmer)`
`81`	`81`	`keeptemp=args.keeptemp, # Whether to keep temporary files`
	`82`	`+ both=args.both, # Whether to report both terminal repeats`
`82`	`83`	`)`
`83`	`84`
`84`	`85`	`# Write the identified segments to output file`
Original file line number	Diff line number	Diff line change
`@@ -83,6 +83,7 @@ def main(args: Optional[Namespace] = None) -> None:`
`83`	`83`	`alignTool=args.method, # Alignment tool (blastn or nucmer)`
`84`	`84`	`temp=args.outdir, # Directory for temporary files`
`85`	`85`	`keeptemp=args.keeptemp, # Whether to keep temporary files`
	`86`	`+ both=args.both, # Whether to report both terminal repeats`
`86`	`87`	`)`
`87`	`88`
`88`	`89`	`# Write the identified segments to output file`