Review notes (text before the first "diff --git" header is ignored by patch tools):
- Line structure and indentation were reconstructed from a whitespace-collapsed
  copy of this patch; hunk line counts were used as a cross-check, but the
  indentation of context lines is assumed -- verify against the target tree.
- bin/backfill-core-metadata: the packages.json response is now closed via a
  `with` block and both functions have docstrings, so its hunk is +1,98 and
  the blob id on its "index" line no longer matches the content.
- NOTE(review): the expected-HTML f-string in test_main_core_metadata looks
  like it lost its markup (wheel_sha / metadata_sha are unused) -- confirm
  against the original change.

diff --git a/.github/workflows/backfill_metadata.yml b/.github/workflows/backfill_metadata.yml
new file mode 100644
index 00000000..a5c2d62d
--- /dev/null
+++ b/.github/workflows/backfill_metadata.yml
@@ -0,0 +1,21 @@
+name: backfill metadata
+
+on:
+  workflow_dispatch:
+
+jobs:
+  backfill-metadata:
+    runs-on: ubuntu-latest
+    permissions:
+      id-token: write
+      contents: read
+    steps:
+    - uses: actions/checkout@v3
+    - uses: actions/setup-python@v4
+      with:
+        python-version: '3.11'
+    - uses: google-github-actions/auth@ba79af03959ebeac9769e648f473a284504d9193 # v2.1.10
+      with:
+        workload_identity_provider: projects/868781662168/locations/global/workloadIdentityPools/prod-github/providers/github-oidc-pool
+        service_account: gha-pypi@sac-prod-sa.iam.gserviceaccount.com
+    - run: python3 -uS bin/backfill-core-metadata --pypi-url https://pypi.devinfra.sentry.io
diff --git a/bin/backfill-core-metadata b/bin/backfill-core-metadata
new file mode 100755
index 00000000..9d6406c0
--- /dev/null
+++ b/bin/backfill-core-metadata
@@ -0,0 +1,98 @@
+#!/usr/bin/env python3
+from __future__ import annotations
+
+import argparse
+import hashlib
+import json
+import os.path
+import shutil
+import subprocess
+import tempfile
+import urllib.parse
+import urllib.request
+import zipfile
+from collections.abc import Sequence
+
+
+def _get_metadata_bytes(filename: str) -> bytes:
+    """Return the raw bytes of the wheel's top-level ``*.dist-info/METADATA``."""
+    with zipfile.ZipFile(filename) as zipf:
+        (metadata,) = (
+            name
+            for name in zipf.namelist()
+            if name.endswith(".dist-info/METADATA") and name.count("/") == 1
+        )
+        with zipf.open(metadata) as f:
+            return f.read()
+
+
+def main(argv: Sequence[str] | None = None) -> int:
+    """Backfill core metadata for the wheels listed in packages.json."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--pypi-url", required=True)
+    args = parser.parse_args(argv)
+
+    url = urllib.parse.urljoin(args.pypi_url, "packages.json")
+    # close the response explicitly, as is already done for the wheel downloads
+    with urllib.request.urlopen(url) as resp:
+        packages = [json.loads(line) for line in resp]
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+        os.makedirs(f"{tmpdir}/metadata")
+        for package in packages:
+            basename = os.path.basename(package["filename"])
+
+            if package.get("core_metadata"):
+                print(f"skipping: core metadata already present for {basename}")
+                continue
+
+            url = f"{args.pypi_url}/wheels/{basename}"
+            fn = f"{tmpdir}/{basename}"
+
+            with urllib.request.urlopen(url) as resp, open(fn, "wb") as f:
+                shutil.copyfileobj(resp, f)
+
+            metadata_bytes = _get_metadata_bytes(fn)
+            metadata_sha256 = hashlib.sha256(metadata_bytes).hexdigest()
+
+            with open(f"{tmpdir}/metadata/{basename}.metadata", "wb") as f:
+                f.write(metadata_bytes)
+
+            package["core_metadata"] = f"sha256={metadata_sha256}"
+            print(f"core metadata fetched for {basename}")
+
+        packages_json = os.path.join(tmpdir, "packages.json")
+        with open(packages_json, "w") as f:
+            for package in packages:
+                f.write(f"{json.dumps(package)}\n")
+
+        subprocess.check_call(
+            (
+                "gcloud",
+                "storage",
+                "cp",
+                "-n",  # no-clobber
+                "--cache-control",
+                "public, max-age=3600",
+                f"{tmpdir}/metadata/*",
+                "gs://pypi.devinfra.sentry.io/wheels/",
+            )
+        )
+        subprocess.check_call(
+            (
+                "gcloud",
+                "storage",
+                "cp",
+                # the packages.json file must be consistently read so no caching
+                "--cache-control",
+                "no-store",
+                packages_json,
+                "gs://pypi.devinfra.sentry.io",
+            )
+        )
+
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/docker/Dockerfile b/docker/Dockerfile
index 75ca0456..5e7c2875 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -1,4 +1,5 @@
 FROM python:3.11.4-slim-bullseye
+
 RUN : \
     && apt-get update \
     && DEBIAN_FRONTEND=noninteractive apt-get install \
diff --git a/docker/requirements.in b/docker/requirements.in
index 569fe2fb..da1dd59e 100644
--- a/docker/requirements.in
+++ b/docker/requirements.in
@@ -1,6 +1,6 @@
 auditwheel>=5.1.2
 delocate>=0.10.4
-dumb-pypi>=1.13.0
+dumb-pypi>=1.15.0
 packaging>=21.3
 patchelf>=0.14.5;sys_platform=="linux"
 pip>=22.1.2
diff --git a/docker/requirements.txt b/docker/requirements.txt
index 5a2bddc2..2e87ede6 100644
--- a/docker/requirements.txt
+++ b/docker/requirements.txt
@@ -7,7 +7,7 @@
 auditwheel==5.1.2
 delocate==0.10.4
 distlib==0.3.8
-dumb-pypi==1.13.0
+dumb-pypi==1.15.0
 filelock==3.13.1
 jinja2==3.1.6
 markupsafe==2.1.1
diff --git a/make_index.py b/make_index.py
index 8f32f17f..61c71d62 100644
--- a/make_index.py
+++ b/make_index.py
@@ -25,12 +25,7 @@ def _commit_info() -> tuple[str, int]:
     return h, int(t)
 
 
-def _make_info(filename: str) -> dict[str, Any]:
-    h, t = _commit_info()
-
-    with open(filename, "rb") as f:
-        sha256 = hashlib.sha256(f.read()).hexdigest()
-
+def _get_metadata_bytes(filename: str) -> bytes:
     with zipfile.ZipFile(filename) as zipf:
         (metadata,) = (
             name
@@ -38,16 +33,29 @@ def _make_info(filename: str) -> dict[str, Any]:
             if name.endswith(".dist-info/METADATA") and name.count("/") == 1
         )
         with zipf.open(metadata) as f:
-            info = email.message_from_binary_file(f)
+            return f.read()
+
+
+def _make_info(filename: str) -> dict[str, Any]:
+    h, t = _commit_info()
+
+    with open(filename, "rb") as f:
+        sha256 = hashlib.sha256(f.read()).hexdigest()
+
+    metadata_bytes = _get_metadata_bytes(filename)
+    metadata_sha256 = hashlib.sha256(metadata_bytes).hexdigest()
+    info = email.message_from_bytes(metadata_bytes)
 
     dist_info = {
         "requires_dist": info.get_all("requires-dist"),
         "requires_python": info.get("requires-python"),
     }
 
+    # this is intended to be exactly the structure dumb-pypi generates
     return {
         "filename": os.path.basename(filename),
         "hash": f"sha256={sha256}",
+        "core_metadata": f"sha256={metadata_sha256}",
         "upload_timestamp": t,
         "uploaded_by": f"git@{h}",
         **{k: v for k, v in dist_info.items() if v},
@@ -92,6 +100,9 @@ def main(argv: Sequence[str] | None = None) -> int:
             new_packages.append(_make_info(filename))
             shutil.copy(filename, wheels_dir)
 
+            with open(f"{wheels_dir}/{basename}.metadata", "wb") as f:
+                f.write(_get_metadata_bytes(filename))
+
     with tempfile.TemporaryDirectory() as tmpdir:
         prev_json = os.path.join(tmpdir, "previous.json")
         with open(prev_json, "w") as f:
diff --git a/packages.ini b/packages.ini
index fab7afb3..1ba8621b 100644
--- a/packages.ini
+++ b/packages.ini
@@ -519,6 +519,7 @@ validate_incorrect_missing_deps = psycopg2-binary
 [drf-spectacular==0.27.2]
 
 [dumb-pypi==1.13.0]
+[dumb-pypi==1.15.0]
 
 [ecdsa==0.18.0]
 
diff --git a/tests/make_index_test.py b/tests/make_index_test.py
index 82254f66..48a9570f 100644
--- a/tests/make_index_test.py
+++ b/tests/make_index_test.py
@@ -3,6 +3,7 @@
 import io
 import json
 import os.path
+import re
 import urllib.request
 import zipfile
 from unittest import mock
@@ -30,6 +31,7 @@ def test_make_info_empty_wheel_metadata(tmp_path):
     assert ret == {
         "filename": "a-1-py3-none-any.whl",
         "hash": "sha256=64f7f4664408d711c17ad28c1d3ba7dd155501e67c8632fafc8a525ba3ebc527",
+        "core_metadata": "sha256=d4528dc2d072c0e6d65addae8b5700fd29253b9eb9a9214aba539447d6f29fae",
         "upload_timestamp": mock.ANY,
         "uploaded_by": re_assert.Matches(r"^git@[a-f0-9]{7}"),
     }
@@ -56,6 +58,7 @@ def test_make_info_full_wheel_metadata(tmp_path):
             "jsonschema",
             "packaging (==21.3) ; extra = 'p'",
         ],
+        "core_metadata": "sha256=a015186125a83e6667547b156f8c6813e72fbab48c4ae635ac3c3a5f1d86aa9f",
         "requires_python": ">= 3.7, != 3.7.0",
         "upload_timestamp": mock.ANY,
         "uploaded_by": re_assert.Matches(r"^git@[a-f0-9]{7}"),
@@ -81,7 +84,36 @@ def test_main_new_package(tmp_path):
     # just some smoke tests about the output
     assert dest.joinpath("packages.json").exists()
     assert dest.joinpath("wheels/a-1-py3-none-any.whl").exists()
-    assert dest.joinpath("simple/a/index.html").exists()
+
+
+def test_main_core_metadata(tmp_path):
+    dist = tmp_path.joinpath("dist")
+    dist.mkdir()
+    make_wheel(dist.joinpath("a-1-py3-none-any.whl"), ())
+    dest = tmp_path.joinpath("dest")
+
+    bio = io.BytesIO(b"")
+    with mock.patch.object(urllib.request, "urlopen", return_value=bio):
+        assert not make_index.main(
+            (
+                f"--dist={dist}",
+                f"--dest={dest}",
+                "--pypi-url=http://example.com",
+            )
+        )
+
+    wheel_sha = "64f7f4664408d711c17ad28c1d3ba7dd155501e67c8632fafc8a525ba3ebc527"
+    metadata_sha = "d4528dc2d072c0e6d65addae8b5700fd29253b9eb9a9214aba539447d6f29fae"
+
+    with open(dest.joinpath("simple/a/index.html")) as f:
+        index_html = re.sub(r"\s+", " ", f.read())
+        assert (
+            f'a-1-py3-none-any.whl'
+            in index_html
+        )
+
+    with open(dest.joinpath("wheels/a-1-py3-none-any.whl.metadata")) as f:
+        assert f.read() == "Name: a\nVersion: 1\n"
 
 
 def test_main_multiple_provide_same_package_first_wins(tmp_path):
diff --git a/tox.ini b/tox.ini
index b292649b..f5f9c3a2 100644
--- a/tox.ini
+++ b/tox.ini
@@ -1,5 +1,5 @@
 [tox]
-envlist = py38
+envlist = py
 
 [testenv]
 skip_install = true