
Commit 2f8e693

fix: improve robustness and formatting in snakefmt for complex cases (#259)
A big thank you to @Hocnonsense for this PR. This commit addresses multiple edge cases and improves the reliability and output of snakefmt, resolving several long-standing issues:

- Safer handling of multi-piece strings and f-strings, resulting in more robust formatting.
- More compact and stable formatting of inline parameters.
- Improved consistency of indentation and alignment for complex string and parameter scenarios.
- Enhanced parsing error messages for clearer diagnostics.
- Updated supported Python targets to 3.11–3.13 and adjusted the CI matrix accordingly.
- Switched configuration parsing to Python's standard-library TOML reader (tomllib), reducing dependencies.
- CLI tests now assert stdout, and additional regression tests were added for complex parameter handling.
- The "one-line" format now gracefully falls back to the normal style where appropriate.
- Delegated line merging and comment handling to Black for improved output.

Fixes #190, #208, #240, #242; closes #255.
1 parent 059e4ae commit 2f8e693

File tree

10 files changed: +255 -143 lines changed


.github/workflows/ci.yaml

Lines changed: 1 addition & 1 deletion
@@ -14,7 +14,7 @@ jobs:
     runs-on: ${{ matrix.os }}
     strategy:
       matrix:
-        python-version: [ 3.9, "3.10", "3.11", "3.12" ]
+        python-version: [ "3.11", "3.12", "3.13" ]
         os: [ ubuntu-latest, macos-latest ]

     steps:

pyproject.toml

Lines changed: 0 additions & 2 deletions
@@ -17,8 +17,6 @@ snakefmt = 'snakefmt.snakefmt:main'
 python = "^3.11"
 click = "^8.0.0"
 black = "^24.3.0"
-toml = "^0.10.2"
-importlib_metadata = {version = ">=1.7.0,<5.0", python = "<3.8"}

 [tool.poetry.dev-dependencies]
 pytest = "^7.4.4"

snakefmt/__init__.py

Lines changed: 3 additions & 8 deletions
@@ -6,12 +6,9 @@
 (in dist-info or egg-info dirs).
 From Python 3.8, importlib_metadata is in standard library as importlib.metadata.
 """
-from black import TargetVersion
+from importlib import metadata

-if sys.version_info >= (3, 8):
-    from importlib import metadata
-else:
-    import importlib_metadata as metadata
+from black.mode import TargetVersion

 __version__ = metadata.version("snakefmt")

@@ -20,9 +17,7 @@

 DEFAULT_LINE_LENGTH = 88
 DEFAULT_TARGET_VERSIONS = {
-    TargetVersion.PY38,
-    TargetVersion.PY39,
-    TargetVersion.PY310,
     TargetVersion.PY311,
     TargetVersion.PY312,
+    TargetVersion.PY313,
 }
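
Reviewer note: the version lookup now reduces to a single standard-library call. A minimal sketch is below; the PackageNotFoundError fallback is illustrative only and is not part of this commit.

from importlib import metadata  # standard library since Python 3.8

try:
    __version__ = metadata.version("snakefmt")
except metadata.PackageNotFoundError:
    # Illustrative fallback: the package is not installed, e.g. a plain source checkout.
    __version__ = "unknown"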

snakefmt/config.py

Lines changed: 7 additions & 5 deletions
@@ -2,13 +2,13 @@
 Code for searching for and parsing snakefmt configuration files
 """

+import tomllib
 from dataclasses import fields
 from functools import lru_cache
 from pathlib import Path
 from typing import Dict, Optional, Sequence, Tuple, Union

 import click
-import toml
 from black import Mode

 from snakefmt import DEFAULT_LINE_LENGTH, DEFAULT_TARGET_VERSIONS

@@ -79,11 +79,12 @@ def read_snakefmt_config(path: Optional[str]) -> Dict[str, str]:
     if path is None:
         return dict()
     try:
-        config_toml = toml.load(path)
+        with open(path, "rb") as f:
+            config_toml = tomllib.load(f)
         config = config_toml.get("tool", {}).get("snakefmt", {})
         config = {k.replace("--", "").replace("-", "_"): v for k, v in config.items()}
         return config
-    except (toml.TomlDecodeError, OSError) as error:
+    except (tomllib.TOMLDecodeError, OSError) as error:
         raise click.FileError(
             filename=path, hint=f"Error reading configuration file: {error}"
         )

@@ -118,9 +119,10 @@ def read_black_config(path: Optional[PathLike]) -> Mode:
         raise FileNotFoundError(f"{path} is not a file.")

     try:
-        pyproject_toml = toml.load(path)
+        with open(path, "rb") as f:
+            pyproject_toml = tomllib.load(f)
         config = pyproject_toml.get("tool", {}).get("black", {})
-    except toml.TomlDecodeError as error:
+    except tomllib.TOMLDecodeError as error:
         raise MalformattedToml(error)

     valid_black_filemode_params = sorted([field.name for field in fields(Mode)])
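
Reviewer note: tomllib has shipped with the standard library since Python 3.11 and, unlike the third-party toml package, only accepts binary file objects, which is why both call sites gain an open(path, "rb"). A minimal usage sketch (the file name is assumed for illustration):

import tomllib  # standard library since Python 3.11

with open("pyproject.toml", "rb") as f:  # tomllib requires a binary-mode file object
    data = tomllib.load(f)

snakefmt_config = data.get("tool", {}).get("snakefmt", {})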

snakefmt/formatter.py

Lines changed: 45 additions & 76 deletions
@@ -4,12 +4,12 @@
 from copy import copy
 from typing import Optional

-import black
+import black.parsing

 from snakefmt.config import PathLike, read_black_config
 from snakefmt.exceptions import InvalidParameterSyntax, InvalidPython
 from snakefmt.logging import Warnings
-from snakefmt.parser.parser import Parser, comment_start
+from snakefmt.parser.parser import Parser, Snakefile, comment_start
 from snakefmt.parser.syntax import (
     COMMENT_SPACING,
     InlineSingleParam,

@@ -18,14 +18,11 @@
     ParamList,
     SingleParam,
     Syntax,
+    split_code_string,
 )
-from snakefmt.types import TAB, TokenIterator
+from snakefmt.types import TAB

 TAB_SIZE = len(TAB)
-# This regex matches any number of consecutive strings; each can span multiple lines.
-full_string_matcher = re.compile(
-    r"^\s*(\w?([\"']{3}.*?[\"']{3})|([\"']{1}.*?[\"']{1}))$", re.DOTALL | re.MULTILINE
-)
 # this regex matches any docstring; can span multiple lines
 docstring_matcher = re.compile(
     r"\s*([rR]?[\"']{3}.*?[\"']{3})", re.DOTALL | re.MULTILINE

@@ -59,7 +56,7 @@ def index_of_first_docstring(s: str) -> Optional[int]:
 class Formatter(Parser):
     def __init__(
         self,
-        snakefile: TokenIterator,
+        snakefile: Snakefile,
         line_length: Optional[int] = None,
         black_config_file: Optional[PathLike] = None,
     ):

@@ -193,7 +190,7 @@ def run_black_format_str(
         )
         try:
             fmted = black.format_str(string, mode=black_mode)
-        except black.InvalidInput as e:
+        except black.parsing.InvalidInput as e:
             err_msg = ""
             # Not clear whether all Black errors start with 'Cannot parse' - it seems to
             # in the tests I ran

@@ -228,61 +225,25 @@ def align_strings(self, string: str, target_indent: int) -> str:
         """
         Takes an ensemble of strings and indents/reindents it
         """
-        pos = 0
         used_indent = TAB * target_indent
-        indented = ""
-        for match in re.finditer(full_string_matcher, string):
-            indented += textwrap.indent(string[pos : match.start(1)], used_indent)
-            lagging_spaces = len(indented) - len(indented.rstrip(" "))
-            lagging_indent_lvl = lagging_spaces // TAB_SIZE
-            match_slice = string[match.start(1) : match.end(1)].replace("\t", TAB)
-            all_lines = match_slice.splitlines(keepends=True)
-            first = textwrap.indent(textwrap.dedent(all_lines[0]), used_indent)
-            indented += first
-
-            is_multiline_string = re.match(
-                r"[bfru]?\"\"\"|'''", first.lstrip(), flags=re.IGNORECASE
-            )
-            if not is_multiline_string:
-                # this check if string is a single-quoted multiline string
-                # e.g. https://github.com/snakemake/snakefmt/issues/121
-                is_multiline_string = "\\\n" in first
-
-            if len(all_lines) > 2:
-                if is_multiline_string:
-                    middle = "".join(all_lines[1:-1])
-                else:
-                    mid = "".join(all_lines[1:-1])
-                    dedent_mid = textwrap.dedent(mid)
-
-                    if lagging_indent_lvl == 0:
-                        required_indent_lvl = target_indent
-                    else:
-                        current_indent_lvl = (len(mid) - len(mid.lstrip())) // TAB_SIZE
-                        required_indent_lvl = current_indent_lvl + target_indent
-
-                    required_indent = TAB * required_indent_lvl
-                    middle = textwrap.indent(
-                        dedent_mid,
-                        required_indent,
-                    )
-                indented += middle
-
-            if len(all_lines) > 1:
-                if is_multiline_string:
-                    last = all_lines[-1]
-                else:
-                    leading_spaces = len(all_lines[-1]) - len(
-                        textwrap.dedent(all_lines[-1])
-                    )
-                    leading_indent = leading_spaces // TAB_SIZE * TAB
-                    last = textwrap.indent(
-                        textwrap.dedent(all_lines[-1]), used_indent + leading_indent
-                    )
-                indented += last
-            pos = match.end()
-        indented += textwrap.indent(string[pos:], used_indent)
-
+        split_string = split_code_string(string)
+        if len(split_string) == 1:
+            return textwrap.indent(split_string[0], used_indent)
+        # First, masks all multi-line strings
+        mask_string = "`~!@#$%^&*|?"
+        while mask_string in string:
+            mask_string += mask_string
+        mask_string = f'"""{mask_string}"""'
+        fakewrap = textwrap.indent(
+            "".join(mask_string if i % 2 else s for i, s in enumerate(split_string)),
+            used_indent,
+        )
+        split_code = fakewrap.split(mask_string)
+        # After indenting, we put those strings back
+        indented = "".join(
+            s.replace("\t", TAB) if i % 2 else split_code[i // 2]
+            for i, s in enumerate(split_string)
+        )
         return indented

     def format_param(

@@ -304,12 +265,10 @@ def format_param(
             raise InvalidParameterSyntax(f"{parameter.line_nb}{val}") from None

         if inline_formatting or param_list:
-            val = " ".join(
-                val.rstrip().split("\n")
-            )  # collapse strings on multiple lines
+            val = val.rstrip()
         extra_spacing = 0
         if param_list:
-            val = f"f({val})"
+            val = f"f({val}\n)"
             extra_spacing = 3

         # get the index of the last character of the first docstring, if any

@@ -367,26 +326,36 @@ def format_params(self, parameters: ParameterSyntax) -> str:

         p_class = parameters.__class__
         param_list = issubclass(p_class, ParamList)
-        inline_fmting = False
-        if p_class is InlineSingleParam:
-            inline_fmting = True
+        inline_fmting = p_class is InlineSingleParam

         result = f"{used_indent}{parameters.keyword_line}:"
         if inline_fmting:
-            result += " "
+            # here, check if the value is too large to put in one line
+            params_iter = iter(parameters.all_params)
+            try:
+                param = next(params_iter)
+            except StopIteration:
+                # No params; render just the keyword line and its comment.
+                return f"{result}{parameters.comment}\n"
+            param_result = self.format_param(
+                param, target_indent, inline_fmting, param_list
+            )
+            inline_fmting = param_result.count("\n") == 1
+        if inline_fmting:
             prepended_comments = ""
             if parameters.comment != "":
                 prepended_comments += f"{used_indent}{parameters.comment.lstrip()}\n"
-            param = next(iter(parameters.all_params))
             for comment in param.pre_comments:
                 prepended_comments += f"{used_indent}{comment}\n"
             if prepended_comments != "":
                 Warnings.comment_relocation(parameters.keyword_name, param.line_nb)
-            result = f"{prepended_comments}{result}"
+            result = f"{prepended_comments}{result} {param_result}"
         else:
-            result += f"{parameters.comment}\n"
-        for param in parameters.all_params:
-            result += self.format_param(param, target_indent, inline_fmting, param_list)
+            result = f"{result}{parameters.comment}\n"
+            for param in parameters.all_params:
+                result += self.format_param(
+                    param, target_indent, inline_fmting, param_list
+                )
             num_c = len(param.post_comments)
             if num_c > 1 or (not param._has_inline_comment and num_c == 1):
                 Warnings.block_comment_below(parameters.keyword_name, param.line_nb)
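
Reviewer note: the rewritten align_strings delegates string detection to split_code_string and then applies a mask-and-restore trick: multi-line strings are swapped for a placeholder, the surrounding code is indented in one pass, and the original strings are spliced back in untouched. A minimal, self-contained sketch of that idea (the segment list and placeholder below are illustrative, not the commit's exact values):

import textwrap

# Alternating [code, string, code, ...] segments, as split_code_string would return.
segments = ['a = 1\nb = ', 'f"""\nnot\nindented\n"""', '\nc = 2\n']

# Choose a placeholder that cannot occur in the text being formatted.
mask = '"""@@MASK@@"""'
while any(mask in seg for seg in segments):
    mask += mask

# Mask the string segments (odd indices), indent the code as one block,
# then splice the untouched strings back in.
masked = "".join(mask if i % 2 else seg for i, seg in enumerate(segments))
code_pieces = textwrap.indent(masked, "    ").split(mask)
result = "".join(
    seg if i % 2 else code_pieces[i // 2] for i, seg in enumerate(segments)
)
print(result)  # code lines are indented; the triple-quoted string is untouched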

snakefmt/parser/syntax.py

Lines changed: 53 additions & 1 deletion
@@ -59,6 +59,56 @@
 }


+def split_code_string(string: str) -> list[str]:
+    """Splits a code string into individual lines, preserving leading whitespace.
+    >>> string = '''a = 1\nb = f\"\"\"\n{a}\n1\n2\n\"\"\"\nc=2'''
+    >>> split_code_string(string)
+    ['a = 1\nb = ', 'f\"\"\"\n{a}\n1\n2\n\"\"\"', '\nc=2']
+    """
+    lines = string.splitlines(keepends=True)
+    lineiter = iter(lines)
+    tokens = list(tokenize.generate_tokens(lambda: next(lineiter)))
+    string_areas = []
+    tokeniter = iter(tokens)
+    for token in tokeniter:
+        if token.type == tokenize.STRING:
+            if token.start[0] != token.end[0]:
+                string_areas.append((token.start, token.end))
+        if fstring_tokeniser_in_use and token.type == tokenize.FSTRING_START:
+            isin_fstring = 1
+            for t1 in tokeniter:
+                if t1.type == tokenize.FSTRING_START:
+                    isin_fstring += 1
+                elif t1.type == tokenize.FSTRING_END:
+                    isin_fstring -= 1
+                    if isin_fstring == 0:
+                        break
+            if token.start[0] != t1.end[0]:
+                string_areas.append((token.start, t1.end))
+    code_str = [""]
+    last_area = (1, 0), (1, 0)
+    for area in string_areas:
+        code_str[-1] += _extract_line_mid(lines, last_area[-1], area[0])
+        code_str.append(_extract_line_mid(lines, area[0], area[1]))
+        code_str.append("")
+        last_area = area
+    code_str[-1] += _extract_line_mid(
+        lines, last_area[-1], (len(lines), len(lines[-1]))
+    )
+    return code_str
+
+
+def _extract_line_mid(
+    lines: list[str], start: tuple[int, int], end: tuple[int, int]
+) -> str:
+    s = "".join(lines[i] for i in range(start[0] - 1, end[0]))
+    t = s[start[1] :]
+    end_trim = end[1] - len(lines[end[0] - 1])
+    if end_trim != 0:
+        t = t[:end_trim]
+    return t
+
+
 def re_add_curly_bracket_if_needed(token: Token) -> str:
     result = ""
     if (

@@ -107,7 +157,9 @@ def operator_skip_spacing(prev_token: Token, token: Token) -> bool:
     return False


-def add_token_space(prev_token: Token, token: Token, in_fstring: bool = False) -> bool:
+def add_token_space(
+    prev_token: Optional[Token], token: Token, in_fstring: bool = False
+) -> bool:
     result = False
     if prev_token is not None:
         if not operator_skip_spacing(prev_token, token):
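
Reviewer note: split_code_string relies on the tokenize module to locate string literals that span multiple lines; those spans are exactly what align_strings later shields from re-indentation. On Python 3.12+ f-strings are emitted as FSTRING_START/FSTRING_END token pairs rather than a single STRING token, hence the extra branch guarded by fstring_tokeniser_in_use. A short, self-contained illustration of the underlying mechanism (not part of the commit):

import io
import tokenize

code = 'a = 1\nb = """\nmulti\nline\n"""\nc = 2\n'

# Report every string token that starts and ends on different lines,
# i.e. the regions split_code_string treats as opaque.
for tok in tokenize.generate_tokens(io.StringIO(code).readline):
    if tok.type == tokenize.STRING and tok.start[0] != tok.end[0]:
        print("multi-line string spans", tok.start, "to", tok.end)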

tests/conftest.py

Lines changed: 1 addition & 1 deletion
@@ -4,7 +4,7 @@

 @pytest.fixture
 def cli_runner() -> CliRunner:
-    return CliRunner(mix_stderr=False)
+    return CliRunner()


 pytest_plugins = "pytester"

0 commit comments
