From 62d33dad961c90418bd63b872c5d9472e5fbdaed Mon Sep 17 00:00:00 2001 From: Clemens Kaposi Date: Sat, 12 Nov 2022 22:18:43 +0100 Subject: [PATCH 1/3] Improve parsing of author information MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Instead of relying on regular expressions, this patch leverages Python’s builtin `email.utils.parseaddr()` functionality to parse an RFC-822-compliant email address string into its name and address parts. This should also resolve issues with special characters in the name part; see for example Poetry issues #370 and #798. https://github.com/sdispater/poetry/issues/370 https://github.com/sdispater/poetry/issues/798 --- src/poetry/core/masonry/builders/builder.py | 12 ++--- src/poetry/core/packages/package.py | 17 +++--- src/poetry/core/utils/helpers.py | 24 +++++++++ tests/utils/test_helpers.py | 58 +++++++++++++++++++++ 4 files changed, 93 insertions(+), 18 deletions(-) diff --git a/src/poetry/core/masonry/builders/builder.py b/src/poetry/core/masonry/builders/builder.py index 9f569299e..84fb75b40 100644 --- a/src/poetry/core/masonry/builders/builder.py +++ b/src/poetry/core/masonry/builders/builder.py @@ -1,7 +1,6 @@ from __future__ import annotations import logging -import re import sys import warnings @@ -14,8 +13,6 @@ from poetry.core.poetry import Poetry -AUTHOR_REGEX = re.compile(r"(?u)^(?P[- .,\w\d'’\"()]+) <(?P.+?)>$") - METADATA_BASE = """\ Metadata-Version: 2.1 Name: {name} @@ -344,12 +341,11 @@ def convert_script_files(self) -> list[Path]: @classmethod def convert_author(cls, author: str) -> dict[str, str]: - m = AUTHOR_REGEX.match(author) - if m is None: - raise RuntimeError(f"{author} does not match regex") + from poetry.core.utils.helpers import parse_author - name = m.group("name") - email = m.group("email") + name, email = parse_author(author) + if not name or not email: + raise RuntimeError(f"{author} does not match regex") return {"name": name, "email": email} diff --git a/src/poetry/core/packages/package.py b/src/poetry/core/packages/package.py index 1651ca6b6..7ff61b06c 100644 --- a/src/poetry/core/packages/package.py +++ b/src/poetry/core/packages/package.py @@ -16,6 +16,7 @@ from poetry.core.packages.dependency_group import MAIN_GROUP from poetry.core.packages.specification import PackageSpecification from poetry.core.packages.utils.utils import create_nested_marker +from poetry.core.utils.helpers import parse_author from poetry.core.version.exceptions import InvalidVersion from poetry.core.version.markers import parse_marker @@ -32,6 +33,8 @@ T = TypeVar("T", bound="Package") +# TODO: once poetry.console.commands.init.InitCommand._validate_author +# uses poetry.core.utils.helpers.parse_author, this can be removed. AUTHOR_REGEX = re.compile(r"(?u)^(?P[- .,\w\d'’\"():&]+)(?: <(?P.+?)>)?$") @@ -231,34 +234,28 @@ def _get_author(self) -> dict[str, str | None]: if not self._authors: return {"name": None, "email": None} - m = AUTHOR_REGEX.match(self._authors[0]) + name, email = parse_author(self._authors[0]) - if m is None: + if not name or not email: raise ValueError( "Invalid author string. Must be in the format: " "John Smith " ) - name = m.group("name") - email = m.group("email") - return {"name": name, "email": email} def _get_maintainer(self) -> dict[str, str | None]: if not self._maintainers: return {"name": None, "email": None} - m = AUTHOR_REGEX.match(self._maintainers[0]) + name, email = parse_author(self._maintainers[0]) - if m is None: + if not name or not email: raise ValueError( "Invalid maintainer string. Must be in the format: " "John Smith " ) - name = m.group("name") - email = m.group("email") - return {"name": name, "email": email} @property diff --git a/src/poetry/core/utils/helpers.py b/src/poetry/core/utils/helpers.py index dd41b2459..5d953a3b1 100644 --- a/src/poetry/core/utils/helpers.py +++ b/src/poetry/core/utils/helpers.py @@ -8,6 +8,7 @@ import warnings from contextlib import contextmanager +from email.utils import parseaddr from pathlib import Path from typing import Any from typing import Iterator @@ -105,3 +106,26 @@ def readme_content_type(path: str | Path) -> str: return "text/markdown" else: return "text/plain" + + +def parse_author(address: str) -> tuple[str | None, str | None]: + """Parse name and address parts from an email address string. + + >>> parse_author("John Doe ") + ('John Doe', 'john.doe@example.com') + + .. note:: + + If the input string does not contain an ``@`` character, it is + assumed that it represents only a name without an email address. + + :param address: the email address string to parse. + :return: a 2-tuple with the parsed name and email address. If a + part is missing, ``None`` will be returned in its place. + """ + if "@" not in address: + return address, None + name, email = parseaddr(address) + if not name and "@" not in email: + return email, None + return name or None, email or None diff --git a/tests/utils/test_helpers.py b/tests/utils/test_helpers.py index f8fe4393c..ace202c18 100644 --- a/tests/utils/test_helpers.py +++ b/tests/utils/test_helpers.py @@ -8,6 +8,7 @@ import pytest from poetry.core.utils.helpers import combine_unicode +from poetry.core.utils.helpers import parse_author from poetry.core.utils.helpers import parse_requires from poetry.core.utils.helpers import readme_content_type from poetry.core.utils.helpers import temporary_directory @@ -118,3 +119,60 @@ def test_utils_helpers_readme_content_type( readme: str | Path, content_type: str ) -> None: assert readme_content_type(readme) == content_type + + +def test_utils_helpers_parse_author(): + """Test the :func:`parse_author` function.""" + + # Verify the (probable) default use case + name, email = parse_author("John Doe ") + assert name == "John Doe" + assert email == "john.doe@example.com" + + # Name only + name, email = parse_author("John Doe") + assert name == "John Doe" + assert email is None + + # Name with a “special” character + email address + name, email = parse_author("R&D ") + assert name == "R&D" + assert email == "researchanddevelopment@example.com" + + # Name with a “special” character only + name, email = parse_author("R&D") + assert name == "R&D" + assert email is None + + # Name with fancy unicode character + email address + name, email = parse_author("my·fancy corp ") + assert name == "my·fancy corp" + assert email == "my-fancy-corp@example.com" + + # Name with fancy unicode character only + name, email = parse_author("my·fancy corp") + assert name == "my·fancy corp" + assert email is None + + # Email address only, wrapped in angular brackets + name, email = parse_author("") + assert name is None + assert email == "john.doe@example.com" + + # Email address only + name, email = parse_author("john.doe@example.com") + assert name is None + assert email == "john.doe@example.com" + + # Non-RFC-conform cases with unquoted commas + name, email = parse_author("asf,dfu@t.b") + assert name == "asf" + assert email is None + + name, email = parse_author("asf,") + assert name == "asf" + assert email is None + + name, email = parse_author("asf, dfu@t.b") + assert name == "asf" + assert email is None From 8286618408cd2ba7250251ae48873146d4de3f31 Mon Sep 17 00:00:00 2001 From: Clemens Kaposi Date: Mon, 14 Nov 2022 11:45:01 +0100 Subject: [PATCH 2/3] Parametrize tests for `parse_author` helper --- tests/utils/test_helpers.py | 91 +++++++++++++++---------------------- 1 file changed, 37 insertions(+), 54 deletions(-) diff --git a/tests/utils/test_helpers.py b/tests/utils/test_helpers.py index ace202c18..ca5411ab1 100644 --- a/tests/utils/test_helpers.py +++ b/tests/utils/test_helpers.py @@ -121,58 +121,41 @@ def test_utils_helpers_readme_content_type( assert readme_content_type(readme) == content_type -def test_utils_helpers_parse_author(): +@pytest.mark.parametrize( + "author, name, email", + [ + # Verify the (probable) default use case + ("John Doe ", "John Doe", "john.doe@example.com"), + # Name only + ("John Doe", "John Doe", None), + # Name with a “special” character + email address + ( + "R&D ", + "R&D", + "researchanddevelopment@example.com", + ), + # Name with a “special” character only + ("R&D", "R&D", None), + # Name with fancy unicode character + email address + ( + "my·fancy corp ", + "my·fancy corp", + "my-fancy-corp@example.com", + ), + # Name with fancy unicode character only + ("my·fancy corp", "my·fancy corp", None), + # Email address only, wrapped in angular brackets + ("", None, "john.doe@example.com"), + # Email address only + ("john.doe@example.com", None, "john.doe@example.com"), + # Non-RFC-conform cases with unquoted commas + ("asf,dfu@t.b", "asf", None), + ("asf,", "asf", None), + ("asf, dfu@t.b", "asf", None), + ], +) +def test_utils_helpers_parse_author( + author: str, name: str | None, email: str | None +) -> None: """Test the :func:`parse_author` function.""" - - # Verify the (probable) default use case - name, email = parse_author("John Doe ") - assert name == "John Doe" - assert email == "john.doe@example.com" - - # Name only - name, email = parse_author("John Doe") - assert name == "John Doe" - assert email is None - - # Name with a “special” character + email address - name, email = parse_author("R&D ") - assert name == "R&D" - assert email == "researchanddevelopment@example.com" - - # Name with a “special” character only - name, email = parse_author("R&D") - assert name == "R&D" - assert email is None - - # Name with fancy unicode character + email address - name, email = parse_author("my·fancy corp ") - assert name == "my·fancy corp" - assert email == "my-fancy-corp@example.com" - - # Name with fancy unicode character only - name, email = parse_author("my·fancy corp") - assert name == "my·fancy corp" - assert email is None - - # Email address only, wrapped in angular brackets - name, email = parse_author("") - assert name is None - assert email == "john.doe@example.com" - - # Email address only - name, email = parse_author("john.doe@example.com") - assert name is None - assert email == "john.doe@example.com" - - # Non-RFC-conform cases with unquoted commas - name, email = parse_author("asf,dfu@t.b") - assert name == "asf" - assert email is None - - name, email = parse_author("asf,") - assert name == "asf" - assert email is None - - name, email = parse_author("asf, dfu@t.b") - assert name == "asf" - assert email is None + assert parse_author(author) == (name, email) From ad7e11575029f04c9bd5a853a6b56ae1aadc3f40 Mon Sep 17 00:00:00 2001 From: Clemens Kaposi Date: Wed, 23 Nov 2022 21:57:58 +0100 Subject: [PATCH 3/3] Make author parsing PyPA-compatible, with mandatory name See https://github.com/python-poetry/poetry-core/pull/521#issuecomment-1324692069 --- src/poetry/core/masonry/builders/builder.py | 5 +--- src/poetry/core/packages/package.py | 14 ---------- src/poetry/core/utils/helpers.py | 19 ++++++-------- tests/packages/test_package.py | 21 +++++++-------- tests/utils/test_helpers.py | 29 ++++++++++++++------- 5 files changed, 37 insertions(+), 51 deletions(-) diff --git a/src/poetry/core/masonry/builders/builder.py b/src/poetry/core/masonry/builders/builder.py index 84fb75b40..ec961634c 100644 --- a/src/poetry/core/masonry/builders/builder.py +++ b/src/poetry/core/masonry/builders/builder.py @@ -340,13 +340,10 @@ def convert_script_files(self) -> list[Path]: return script_files @classmethod - def convert_author(cls, author: str) -> dict[str, str]: + def convert_author(cls, author: str) -> dict[str, str | None]: from poetry.core.utils.helpers import parse_author name, email = parse_author(author) - if not name or not email: - raise RuntimeError(f"{author} does not match regex") - return {"name": name, "email": email} diff --git a/src/poetry/core/packages/package.py b/src/poetry/core/packages/package.py index 7ff61b06c..151fc1642 100644 --- a/src/poetry/core/packages/package.py +++ b/src/poetry/core/packages/package.py @@ -235,13 +235,6 @@ def _get_author(self) -> dict[str, str | None]: return {"name": None, "email": None} name, email = parse_author(self._authors[0]) - - if not name or not email: - raise ValueError( - "Invalid author string. Must be in the format: " - "John Smith " - ) - return {"name": name, "email": email} def _get_maintainer(self) -> dict[str, str | None]: @@ -249,13 +242,6 @@ def _get_maintainer(self) -> dict[str, str | None]: return {"name": None, "email": None} name, email = parse_author(self._maintainers[0]) - - if not name or not email: - raise ValueError( - "Invalid maintainer string. Must be in the format: " - "John Smith " - ) - return {"name": name, "email": email} @property diff --git a/src/poetry/core/utils/helpers.py b/src/poetry/core/utils/helpers.py index 5d953a3b1..6ae14d51b 100644 --- a/src/poetry/core/utils/helpers.py +++ b/src/poetry/core/utils/helpers.py @@ -108,24 +108,21 @@ def readme_content_type(path: str | Path) -> str: return "text/plain" -def parse_author(address: str) -> tuple[str | None, str | None]: +def parse_author(address: str) -> tuple[str, str | None]: """Parse name and address parts from an email address string. >>> parse_author("John Doe ") ('John Doe', 'john.doe@example.com') - .. note:: - - If the input string does not contain an ``@`` character, it is - assumed that it represents only a name without an email address. - :param address: the email address string to parse. - :return: a 2-tuple with the parsed name and email address. If a - part is missing, ``None`` will be returned in its place. + :return: a 2-tuple with the parsed name and optional email address. + :raises ValueError: if the parsed string does not contain a name. """ if "@" not in address: return address, None name, email = parseaddr(address) - if not name and "@" not in email: - return email, None - return name or None, email or None + if not name or ( + email and address not in [f"{name} <{email}>", f'"{name}" <{email}>'] + ): + raise ValueError(f"Invalid author string: {address!r}") + return name, email or None diff --git a/tests/packages/test_package.py b/tests/packages/test_package.py index 998b25e22..1ed790193 100644 --- a/tests/packages/test_package.py +++ b/tests/packages/test_package.py @@ -55,14 +55,11 @@ def test_package_authors() -> None: def test_package_authors_invalid() -> None: package = Package("foo", "0.1.0") - package.authors.insert(0, "" - ) + assert str(e.value) == "Invalid author string: 'john.doe@example.com'" @pytest.mark.parametrize( @@ -78,11 +75,14 @@ def test_package_authors_invalid() -> None: ("Doe, John", None), ("(Doe, John)", None), ("John Doe", "john@john.doe"), - ("Doe, John", "dj@john.doe"), ("MyCompanyName R&D", "rnd@MyCompanyName.MyTLD"), ("John-Paul: Doe", None), - ("John-Paul: Doe", "jp@nomail.none"), ("John Doe the 3rd", "3rd@jd.net"), + (" None: @@ -102,11 +102,8 @@ def test_package_authors_valid(name: str, email: str | None) -> None: [ ("",), ("john@john.doe",), - ("",), + ("John-Paul: Doe ",), ], ) def test_package_author_names_invalid(name: str) -> None: diff --git a/tests/utils/test_helpers.py b/tests/utils/test_helpers.py index ca5411ab1..cd4fd7e3b 100644 --- a/tests/utils/test_helpers.py +++ b/tests/utils/test_helpers.py @@ -144,18 +144,27 @@ def test_utils_helpers_readme_content_type( ), # Name with fancy unicode character only ("my·fancy corp", "my·fancy corp", None), + ], +) +def test_utils_helpers_parse_author(author: str, name: str, email: str | None) -> None: + """Test valid inputs for the :func:`parse_author` function.""" + assert parse_author(author) == (name, email) + + +@pytest.mark.parametrize( + "author", + [ # Email address only, wrapped in angular brackets - ("", None, "john.doe@example.com"), + "", # Email address only - ("john.doe@example.com", None, "john.doe@example.com"), + "john.doe@example.com", # Non-RFC-conform cases with unquoted commas - ("asf,dfu@t.b", "asf", None), - ("asf,", "asf", None), - ("asf, dfu@t.b", "asf", None), + "asf,dfu@t.b", + "asf,", + "asf, dfu@t.b", ], ) -def test_utils_helpers_parse_author( - author: str, name: str | None, email: str | None -) -> None: - """Test the :func:`parse_author` function.""" - assert parse_author(author) == (name, email) +def test_utils_helpers_parse_author_invalid(author: str) -> None: + """Test invalid inputs for the :func:`parse_author` function.""" + with pytest.raises(ValueError): + parse_author(author)