
Commit c8ea55e

Update/annual patches (#115)
* Update README.md with a better description of the Python capabilities
* Update pyproject.toml and Dependencies
* Revert last pyproject.toml command name
* Add non-privileged user to Docker
* Fix failing test cases
* Add supply chain attestations to GitHub Action
* Update for failing Docker builds
1 parent 30df0bd commit c8ea55e

10 files changed (+410 / -57 lines)

.github/workflows/docker-publish.yml

Lines changed: 45 additions & 1 deletion

```diff
@@ -3,6 +3,12 @@ name: Manual Docker Build and Push
 on:
   workflow_dispatch: # Allows manual triggering
 
+# Add permissions for pushing packages and OIDC token
+permissions:
+  contents: read
+  packages: write # Needed to push container images
+  id-token: write # Needed for signing/attestations
+
 jobs:
   build-and-push:
     runs-on: ubuntu-latest
@@ -16,12 +22,50 @@ jobs:
           username: ${{ secrets.DOCKERHUB_USERNAME }}
           password: ${{ secrets.DOCKERHUB_TOKEN }}
 
+      # Install the cosign tool
+      # https://github.com/sigstore/cosign-installer
+      - name: Install cosign
+        uses: sigstore/cosign-installer@v3.5.0
+        with:
+          cosign-release: 'v2.2.4'
+
+      # Setup Docker buildx
+      # https://github.com/docker/build-push-action/issues/461
+      - name: Setup Docker buildx
+        uses: docker/setup-buildx-action@v3
+
+      # Extract metadata (tags, labels) for Docker
+      # https://github.com/docker/metadata-action
+      - name: Extract Docker metadata
+        id: meta
+        uses: docker/metadata-action@v5
+        with:
+          images: sethblack/python-seo-analyzer # Your Docker Hub image
+
+      # Build and push Docker image with attestation
+      # https://github.com/docker/build-push-action
       - name: Build and push Docker image
+        id: build-and-push # Add id to reference outputs
         uses: docker/build-push-action@v5
         with:
           context: .
           push: true
-          tags: sethblack/python-seo-analyzer:latest # Use your Docker Hub username
+          tags: ${{ steps.meta.outputs.tags }} # Use tags from metadata
+          labels: ${{ steps.meta.outputs.labels }} # Use labels from metadata
+          # Attestations for provenance and SBOM
+          attests: |
+            provenance:builder-id=${{ github.workflow }}/${{ github.job_id }}
+            sbom:scan-mode=local,scan-args=--exclude=./tests
+
+      # Sign the resulting Docker image digest.
+      # https://github.com/sigstore/cosign
+      - name: Sign the published Docker image
+        env:
+          # https://docs.github.com/en/actions/security-guides/security-hardening-for-github-actions#using-an-intermediate-repository-for-the-build
+          COSIGN_EXPERIMENTAL: "true"
+        # This step uses the identity token to provision an ephemeral certificate
+        # against the sigstore community Fulcio instance.
+        run: echo "${{ steps.meta.outputs.tags }}" | xargs -I {} cosign sign --yes {}@${{ steps.build-and-push.outputs.digest }}
 
   publish-to-pypi:
     needs: build-and-push # Optional: Make this job depend on the Docker build
```
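
The signing step uses GitHub's OIDC token for keyless signing, so the published image can be checked with `cosign verify`. A minimal sketch, assuming the workflow signed against the public Fulcio/Rekor instances; the identity regexp is an assumption about this repo's layout, not something recorded in the commit:

```bash
# Verify the keyless signature on the pushed image (sketch; identity values are assumptions)
cosign verify sethblack/python-seo-analyzer:latest \
  --certificate-oidc-issuer "https://token.actions.githubusercontent.com" \
  --certificate-identity-regexp "^https://github.com/sethblack/python-seo-analyzer/"
```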

Dockerfile

Lines changed: 16 additions & 2 deletions

```diff
@@ -1,4 +1,4 @@
-FROM python:3.12-bullseye
+FROM python:3.13.2-bookworm
 
 RUN apt-get update -y && apt-get upgrade -y
 
@@ -12,6 +12,20 @@ RUN uv cache clean --verbose
 
 COPY . /python-seo-analyzer
 
+# Create a non-root user
+RUN groupadd -r appgroup && useradd --no-log-init -r -g appgroup appuser
+
+# Set ownership of the app directory
+RUN chown -R appuser:appgroup /python-seo-analyzer
+
+# Switch back to root to install the package system-wide
+USER root
 RUN python3 -m pip install /python-seo-analyzer
 
-ENTRYPOINT ["/usr/local/bin/seoanalyze"]
+# Switch back to the non-root user
+USER appuser
+
+WORKDIR /app
+
+ENTRYPOINT ["python-seo-analyzer"]
+CMD ["--version"]
```

README.md

Lines changed: 69 additions & 4 deletions

````diff
@@ -1,10 +1,18 @@
 Python SEO and GEO Analyzer
-===================
+===========================
 
-A modern SEO and GEO analysis tool that combines technical optimization and authentic human value. Beyond traditional site crawling and structure analysis, it uses AI to evaluate content's expertise signals, conversational engagement, and cross-platform presence. It helps you maintain strong technical foundations while ensuring your site demonstrates genuine authority and value to real users.
+[![PyPI version](https://badge.fury.io/py/pyseoanalyzer.svg)](https://badge.fury.io/py/pyseoanalyzer)
+[![Docker Pulls](https://img.shields.io/docker/pulls/sethblack/python-seo-analyzer.svg)](https://hub.docker.com/r/sethblack/python-seo-analyzer)
+
+A modern SEO and GEO (Generative AI Engine Optimization or, better, AI Search Optimization) analysis tool that combines technical optimization and authentic human value. Beyond traditional site crawling and structure analysis, it uses AI to evaluate content's expertise signals, conversational engagement, and cross-platform presence. It helps you maintain strong technical foundations while ensuring your site demonstrates genuine authority and value to real users.
 
 The AI features were heavily influenced by the clickbait-titled SEL article [A 13-point roadmap for thriving in the age of AI search](https://searchengineland.com/seo-roadmap-ai-search-449199).
 
+Note About Python
+-----------------
+
+I've written quite a bit about the speed of Python and how there are very specific use cases where it isn't the best choice. I feel like crawling websites is definitely one of those cases. I wrote this tool in Python around 2010 to solve the very specific need of crawling some small HTML-only websites for startups I was working at. I'm excited to see how much it has grown and how many people are using it. I feel like Python SEO Analyzer is acceptable for most smaller use cases, but if you are looking for something faster, I've built a much faster and more comprehensive tool, [Black SEO Analyzer](https://github.com/sethblack/black-seo-analyzer).
+
 Installation
 ------------
 
@@ -16,10 +24,67 @@ pip install pyseoanalyzer
 
 ### Docker
 
-The docker image is available on [Docker Hub](https://hub.docker.com/r/sethblack/python-seo-analyzer) and can be run with the same command-line arguments as the script.
+#### Using the Pre-built Image from Docker Hub
+
+The easiest way to use the Docker image is to pull it directly from [Docker Hub](https://hub.docker.com/r/sethblack/python-seo-analyzer).
+
+```bash
+# Pull the latest image
+docker pull sethblack/python-seo-analyzer:latest
+
+# Run the analyzer (replace example.com with the target URL)
+# The --rm flag automatically removes the container when it exits
+docker run --rm sethblack/python-seo-analyzer http://example.com/
+
+# Run with specific arguments (e.g., sitemap and HTML output)
+# Note: If the sitemap is local, you'll need to mount it (see mounting example below)
+docker run --rm sethblack/python-seo-analyzer http://example.com/ --sitemap /path/inside/container/sitemap.xml --output-format html
+
+# Run with AI analysis (requires ANTHROPIC_API_KEY)
+# Replace "your_api_key_here" with your actual Anthropic API key
+docker run --rm -e ANTHROPIC_API_KEY="your_api_key_here" sethblack/python-seo-analyzer http://example.com/ --run-llm-analysis
+
+# Save HTML output to your local machine
+# This mounts the current directory (.) into /app/output inside the container.
+# The output file 'results.html' will be saved in your current directory.
+# The tool outputs JSON by default to stdout, so we redirect it for HTML.
+# Since the ENTRYPOINT handles the command, we redirect the container's stdout.
+# We need a shell inside the container to handle the redirection.
+docker run --rm -v "$(pwd):/app/output" sethblack/python-seo-analyzer /bin/sh -c "seoanalyze http://example.com/ --output-format html > /app/output/results.html"
+# Note for Windows CMD users: Use %cd% instead of $(pwd)
+# docker run --rm -v "%cd%:/app/output" sethblack/python-seo-analyzer /bin/sh -c "seoanalyze http://example.com/ --output-format html > /app/output/results.html"
+# Note for Windows PowerShell users: Use ${pwd} instead of $(pwd)
+# docker run --rm -v "${pwd}:/app/output" sethblack/python-seo-analyzer /bin/sh -c "seoanalyze http://example.com/ --output-format html > /app/output/results.html"
+
+
+# Mount a local sitemap file
+# This mounts 'local-sitemap.xml' from the current directory to '/app/sitemap.xml' inside the container
+docker run --rm -v "$(pwd)/local-sitemap.xml:/app/sitemap.xml" sethblack/python-seo-analyzer http://example.com/ --sitemap /app/sitemap.xml
+# Adjust paths and Windows commands as needed (see volume mounting example above)
 
 ```
-docker run sethblack/python-seo-analyzer [ARGS ...]
+
+#### Building the Image Locally
+
+You can also build the Docker image yourself from the source code. Make sure you have Docker installed and running.
+
+```bash
+# Clone the repository (if you haven't already)
+# git clone https://github.com/sethblack/python-seo-analyzer.git
+# cd python-seo-analyzer
+
+# Build the Docker image (tag it as 'my-seo-analyzer' for easy reference)
+docker build -t my-seo-analyzer .
+
+# Run the locally built image
+docker run --rm my-seo-analyzer http://example.com/
+
+# Run with AI analysis using the locally built image
+docker run --rm -e ANTHROPIC_API_KEY="your_api_key_here" my-seo-analyzer http://example.com/ --run-llm-analysis
+
+# Run with HTML output saved locally using the built image
+docker run --rm -v "$(pwd):/app/output" my-seo-analyzer /bin/sh -c "seoanalyze http://example.com/ --output-format html > /app/output/results.html"
+# Adjust Windows commands as needed (see volume mounting example above)
 ```
 
 Command-line Usage
````

pyproject.toml

Lines changed: 13 additions & 10 deletions

```diff
@@ -4,18 +4,21 @@ build-backend = "hatchling.build"
 
 [project]
 name = "pyseoanalyzer"
-version = "2024.12.12"
+version = "2025.4.3"
 authors = [
     {name = "Seth Black", email = "sblack@sethserver.com"},
 ]
 dependencies = [
-    "beautifulsoup4>=4.12.3",
-    "certifi>=2024.8.30",
-    "Jinja2>=3.1.4",
-    "lxml>=5.3.0",
-    "MarkupSafe>=3.0.2",
-    "trafilatura>=2.0.0",
-    "urllib3>=2.2.3",
+    "beautifulsoup4==4.13.3",
+    "certifi==2025.1.31",
+    "Jinja2==3.1.6",
+    "langchain==0.3.22",
+    "langchain-anthropic==0.3.10",
+    "lxml==5.3.1",
+    "MarkupSafe==3.0.2",
+    "python-dotenv==1.1.0",
+    "trafilatura==2.0.0",
+    "urllib3==2.3.0",
 ]
 requires-python = ">= 3.8"
 description = "An SEO tool that analyzes the structure of a site, crawls the site, count words in the body of the site and warns of any technical SEO issues."
@@ -47,9 +50,9 @@ classifiers = [
 ]
 
 [project.scripts]
-seoanalyze = "pyseoanalyzer.__main__:main"
+python-seo-analyzer = "pyseoanalyzer.__main__:main"
 
 [project.urls]
 Homepage = "https://github.com/sethblack/python-seo-analyzer"
 Repository = "https://github.com/sethblack/python-seo-analyzer.git"
-Issues = "https://github.com/sethblack/python-seo-analyzer/issues"
+Issues = "https://github.com/sethblack/python-seo-analyzer/issues"
```

pyseoanalyzer/__init__.py

Lines changed: 22 additions & 0 deletions

```diff
@@ -1,3 +1,25 @@
 #!/usr/bin/env python3
 
+import sys
+
+# Use importlib.metadata (available in Python 3.8+) to get the version
+# defined in pyproject.toml. This avoids duplicating the version string.
+if sys.version_info >= (3, 8):
+    from importlib import metadata
+else:
+    # Fallback for Python < 3.8 (requires importlib-metadata backport)
+    # Consider adding 'importlib-metadata; python_version < "3.8"' to dependencies
+    # if you need to support older Python versions.
+    import importlib_metadata as metadata
+
+try:
+    # __package__ refers to the package name ('pyseoanalyzer')
+    __version__ = metadata.version(__package__)
+except metadata.PackageNotFoundError:
+    # Fallback if the package is not installed (e.g., when running from source)
+    # You might want to handle this differently, e.g., raise an error
+    # or read from a VERSION file. For now, setting it to unknown.
+    __version__ = "0.0.0-unknown"
+
+
 from .analyzer import analyze
```
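
With the version now resolved from installed package metadata, a one-liner confirms the single-sourcing works. A sketch; the fallback string only appears when the package isn't installed (e.g., running straight from a source checkout):

```bash
python3 -c "import pyseoanalyzer; print(pyseoanalyzer.__version__)"
# installed package: 2025.4.3
# uninstalled source tree: 0.0.0-unknown (the PackageNotFoundError fallback)
```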

pyseoanalyzer/__main__.py

Lines changed: 8 additions & 3 deletions

```diff
@@ -4,15 +4,20 @@
 import inspect
 import json
 import os
+import sys
 
 from .analyzer import analyze
+from . import __version__
 
 
 def main():
     module_path = os.path.dirname(inspect.getfile(analyze))
-
-    arg_parser = argparse.ArgumentParser()
-
+    arg_parser = argparse.ArgumentParser(
+        description="Analyze SEO aspects of a website."
+    )
+    arg_parser.add_argument(
+        "--version", action="version", version=f"%(prog)s {__version__}"
+    )
     arg_parser.add_argument("site", help="URL of the site you are wanting to analyze.")
     arg_parser.add_argument(
         "-s", "--sitemap", help="URL of the sitemap to seed the crawler with."
```

pyseoanalyzer/page.py

Lines changed: 19 additions & 13 deletions

```diff
@@ -81,10 +81,10 @@ def __init__(
         self.analyze_extra_tags = analyze_extra_tags
         self.encoding = encoding
         self.run_llm_analysis = run_llm_analysis
-        self.title: str
-        self.author: str
-        self.description: str
-        self.hostname: str
+        self.title: str = ""
+        self.author: str = ""
+        self.description: str = ""
+        self.hostname: str = ""
         self.sitename: str
         self.date: str
         self.keywords = {}
@@ -224,15 +224,21 @@ def analyze(self, raw_html=None):
         )
 
         # I want to grab values from this even if they don't exist
-        metadata = metadata.as_dict() if metadata else {}
-
-        self.title = metadata.get("title", "")
-        self.author = metadata.get("author", "")
-        self.description = metadata.get("description", "")
-        self.hostname = metadata.get("hostname", "")
-        self.sitename = metadata.get("sitename", "")
-        self.date = metadata.get("date", "")
-        metadata_keywords = metadata.get("keywords", "")
+        metadata_dict = metadata.as_dict() if metadata else {}
+
+        # Helper function to get value or default to "" if None or 'None'
+        def get_meta_value(key):
+            value = metadata_dict.get(key)
+            return "" if value is None or value == "None" else value
+
+        # Ensure fields are strings, defaulting to "" if None or 'None'
+        self.title = get_meta_value("title")
+        self.author = get_meta_value("author")
+        self.description = get_meta_value("description")
+        self.hostname = get_meta_value("hostname")
+        self.sitename = get_meta_value("sitename")
+        self.date = get_meta_value("date")
+        metadata_keywords = get_meta_value("keywords")
 
         if len(metadata_keywords) > 0:
             self.warn(
```
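
The helper exists because the extracted metadata can hold either a real `None` or the literal string `"None"`; both now collapse to an empty string. A standalone sketch of that behavior, using a hypothetical sample dict rather than real trafilatura output:

```bash
python3 - <<'PY'
# standalone copy of the commit's helper; metadata_dict is a made-up sample
metadata_dict = {"title": "Example Domain", "author": None, "description": "None"}

def get_meta_value(key):
    value = metadata_dict.get(key)
    return "" if value is None or value == "None" else value

print(repr(get_meta_value("title")))        # 'Example Domain'
print(repr(get_meta_value("author")))       # ''  (real None)
print(repr(get_meta_value("description")))  # ''  (literal string 'None')
PY
```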

pyseoanalyzer/website.py

Lines changed: 14 additions & 8 deletions

```diff
@@ -78,18 +78,24 @@ def crawl(self):
                 if page.parsed_url.netloc != page.base_domain.netloc:
                     continue
 
-                page.analyze()
+                # Analyze the page and check if successful
+                analysis_successful = page.analyze()
 
-                self.content_hashes[page.content_hash].add(page.url)
-                self.wordcount.update(page.wordcount)
-                self.bigrams.update(page.bigrams)
-                self.trigrams.update(page.trigrams)
+                # Only process and add the page if analysis completed
+                if analysis_successful:
+                    self.content_hashes[page.content_hash].add(page.url)
+                    self.wordcount.update(page.wordcount)
+                    self.bigrams.update(page.bigrams)
+                    self.trigrams.update(page.trigrams)
 
-                self.page_queue.extend(page.links)
+                    # Only add links if following is enabled and analysis was successful
+                    if self.follow_links:
+                        self.page_queue.extend(page.links)
 
-                self.crawled_pages.append(page)
-                self.crawled_urls.add(page.url)
+                    self.crawled_pages.append(page)
+                    self.crawled_urls.add(page.url)
 
+                # Stop after the first page if not following links, regardless of analysis success
                 if not self.follow_links:
                     break
             except Exception as e:
```

requirements.txt

Lines changed: 9 additions & 7 deletions

```diff
@@ -1,10 +1,12 @@
-beautifulsoup4==4.12.3
-certifi==2024.8.30
+beautifulsoup4==4.13.3
+certifi==2025.1.31
 Jinja2==3.1.6
-langchain==0.3.11
-langchain-anthropic==0.3.0
-lxml==5.3.0
+langchain==0.3.22
+langchain-anthropic==0.3.10
+lxml==5.3.1
 MarkupSafe==3.0.2
-python-dotenv==1.0.1
+pytest==8.3.2 # Added for testing
+pytest-mock==3.14.0 # Added for testing
+python-dotenv==1.1.0
 trafilatura==2.0.0
-urllib3==2.2.3
+urllib3==2.3.0
```
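
With pytest and pytest-mock now pinned alongside the runtime dependencies, the test fixes mentioned in the commit message can be re-run locally. A minimal sketch, assuming the tests sit on pytest's default discovery path:

```bash
pip install -r requirements.txt

# run the suite this commit fixes
python -m pytest
```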
