Skip to content
Merged
Show file tree
Hide file tree
Changes from 31 commits
Commits
Show all changes
41 commits
Select commit Hold shift + click to select a range
05c2be5
init
max-ostapenko Jan 12, 2026
73efcee
update chart formatting script
max-ostapenko Jan 13, 2026
40149d5
text
max-ostapenko Jan 14, 2026
79bfd6c
Merge branch 'main' into privacy-markdown-2025
max-ostapenko Jan 14, 2026
e7314a9
chapter + tools
max-ostapenko Jan 14, 2026
941752b
Optimised images with calibre/image-actions
github-actions[bot] Jan 14, 2026
44bf6f3
lint
max-ostapenko Jan 14, 2026
3f6c1e1
Merge branch 'privacy-markdown-2025' of https://github.com/HTTPArchiv…
max-ostapenko Jan 14, 2026
585fe72
lint
max-ostapenko Jan 14, 2026
8bb1809
lint
max-ostapenko Jan 14, 2026
92c584b
Merge branch 'main' into privacy-markdown-2025
max-ostapenko Jan 14, 2026
b6f4b85
readme for chart tools
max-ostapenko Jan 15, 2026
0f6ee30
revert changes
max-ostapenko Jan 15, 2026
eabf2c9
lint
max-ostapenko Jan 15, 2026
635cc18
fix
max-ostapenko Jan 15, 2026
aee4df9
CodeQL fix
max-ostapenko Jan 15, 2026
cfc3dbb
Update src/content/en/2025/privacy.md
max-ostapenko Jan 15, 2026
6a57098
Merge branch 'privacy-markdown-2025' of https://github.com/HTTPArchiv…
max-ostapenko Jan 15, 2026
c944424
jannis's suggestion
max-ostapenko Jan 15, 2026
59e77b8
Apply suggestion from @JannisBush
max-ostapenko Jan 15, 2026
ea1c907
nrllh as 3rd author
max-ostapenko Jan 15, 2026
160abc0
copilot review
max-ostapenko Jan 15, 2026
f06f529
Merge branch 'main' into privacy-markdown-2025
max-ostapenko Jan 15, 2026
b64a48a
Update src/content/en/2025/privacy.md
max-ostapenko Jan 15, 2026
abd35c4
Update src/content/en/2025/privacy.md
max-ostapenko Jan 15, 2026
1a35552
Metadata cleanup
tunetheweb Jan 15, 2026
8e8e86a
Interationalise links with no translations
tunetheweb Jan 15, 2026
6fa352f
Code formatting and smart quotes
tunetheweb Jan 15, 2026
23b356c
Headings
tunetheweb Jan 15, 2026
4cbc9b1
Big number
tunetheweb Jan 15, 2026
6d40b07
Misc edits
tunetheweb Jan 15, 2026
1aaec86
Chart title for Clients Hints
tunetheweb Jan 15, 2026
08e3bfb
Merge branch 'main' into privacy-markdown-2025
max-ostapenko Jan 15, 2026
011f26b
Update src/content/en/2025/privacy.md
max-ostapenko Jan 15, 2026
c696dba
featured stats
max-ostapenko Jan 15, 2026
a2154e1
new images
max-ostapenko Jan 15, 2026
abdc018
Optimised images with calibre/image-actions
github-actions[bot] Jan 15, 2026
721ba3f
capitalized client names for charts
max-ostapenko Jan 15, 2026
c76e91d
lint
max-ostapenko Jan 15, 2026
1b74a27
mention the UA reduction upgrade
max-ostapenko Jan 15, 2026
98281db
fix
max-ostapenko Jan 15, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,39 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"# Authenticate the user\n",
"import sys\n",
"\n",
"if 'google.colab' in sys.modules:\n",
" from google.colab import auth\n",
"\n",
" auth.authenticate_user()\n",
" credentials = auth.get_user_credentials()\n",
"else:\n",
" import google.auth\n",
"\n",
" SCOPES = [\n",
" 'https://www.googleapis.com/auth/spreadsheets'\n",
" ]\n",
" credentials, project = google.auth.default(scopes=SCOPES)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"id": "cOcbpC6qRou_"
},
"outputs": [],
"source": [
"from google.colab import auth\n",
"from googleapiclient.discovery import build\n",
"\n",
"sheets_service = build('sheets', 'v4', cache_discovery=False, credentials=credentials)\n",
"\n",
"def update_chart_size(spreadsheet_id, is_dry_run=False, target_width=600, target_height=371):\n",
" response = sheets_service.spreadsheets().get(spreadsheetId=spreadsheet_id, includeGridData=False).execute()\n",
" sheets = response.get('sheets', [])\n",
Expand Down Expand Up @@ -54,26 +78,22 @@
" print(f\"\"\"sheet: {sheet['properties']['title']},\n",
"chart: {chart['spec']['title']},\n",
"dimensions: {chart['position']['overlayPosition']['widthPixels']} x {chart['position']['overlayPosition'].get('heightPixels', 'N/A')}\n",
" \"\"\")\n",
"\n",
"# Authenticate the user\n",
"auth.authenticate_user()\n",
"sheets_service = build('sheets', 'v4', cache_discovery=False)"
" \"\"\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 3,
"metadata": {
"id": "vp1izUBSLxp9"
},
"outputs": [],
"source": [
"# Replace this with the ID of your Google Sheets file\n",
"SPREADSHEET_ID = '18r8cT6x9lPdM-rXvXjsqx84W7ZDdTDYGD59xr0UGOwg'\n",
"SPREADSHEET_ID = '1Svyw40Th7VbigX6lpR1lb1WXwTUVKZWrK7O2YELrml4'\n",
"\n",
"# Call the function to update the chart width\n",
"update_chart_size(SPREADSHEET_ID, target_height=None, is_dry_run=True)"
"update_chart_size(SPREADSHEET_ID, target_height=None, is_dry_run=False)"
]
}
],
Expand All @@ -85,11 +105,21 @@
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"name": "python"
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.14.2"
}
},
"nbformat": 4,
Expand Down
80 changes: 80 additions & 0 deletions sql/util/generate_figure_markup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
import re
import os
from googleapiclient.discovery import build # pylint: disable=import-error
import google.auth # pylint: disable=import-error

# Configuration
SPREADSHEET_ID = '1Svyw40Th7VbigX6lpR1lb1WXwTUVKZWrK7O2YELrml4'
PUBCHART_ID = '2PACX-1vRC5wrzy5NEsWNHn9w38RLsMURRScnP4jgjO1mDiVhsfFCY55tujlTUZhUaEWzmPtJza0QA7w8S4uK5'
SQL_DIR = '../2025/privacy' # Relative to this script's location

SCOPES = ['https://www.googleapis.com/auth/spreadsheets']


def get_sql_to_sheet_map(sql_dir):
    """Map expected Google Sheets tab names to the SQL files in *sql_dir*.

    Each ``*.sql`` filename is converted to a tab name by replacing the
    ``.sql`` suffix and every run of non-alphanumeric characters with a
    space, stripping the result and title-casing it. For example,
    ``most_common_trackers.sql`` becomes ``Most Common Trackers``.

    Args:
        sql_dir: Path of the directory to scan (not recursive).

    Returns:
        A dict of ``{sheet_name: filename}``. It is empty when *sql_dir* does
        not exist or is not a directory. If several filenames normalise to
        the same sheet name, the alphabetically last filename wins.
    """
    # isdir (rather than exists) so that a path pointing at a regular file is
    # reported like a missing directory instead of letting os.listdir raise
    # NotADirectoryError.
    if not os.path.isdir(sql_dir):
        print(f"Directory not found: {sql_dir}")
        return {}

    mapping = {}
    # os.listdir returns entries in arbitrary order; sorting keeps the result
    # (and the winner of any name collision) reproducible across runs.
    for filename in sorted(os.listdir(sql_dir)):
        if not filename.endswith(".sql"):
            continue
        sheet_name = re.sub(r'(\.sql|[^a-zA-Z0-9]+)', ' ', filename).strip().title()
        mapping[sheet_name] = filename
    return mapping


def generate_figure_markup(spreadsheet_id, sql_dir):
    """Print a ``figure_markup`` shortcode for every chart in a spreadsheet.

    The spreadsheet is read through the Google Sheets v4 API using
    application default credentials. For each chart on each sheet one
    shortcode is printed to stdout, followed by a blank line:

    * ``image`` is the lower-cased chart title with every run of characters
      outside ``a-z0-9`` collapsed to ``-``, plus ``.png``.
    * ``caption`` is the chart title (``Untitled Chart`` when the chart spec
      has none); ``description`` is left empty.
    * ``chart_url`` is built from ``PUBCHART_ID`` and the chart's ``chartId``.
    * ``sheets_gid`` is the sheet's ``sheetId``.
    * ``sql_file`` is the SQL filename whose derived sheet name equals the
      sheet title, compared exactly first and then ignoring letter case;
      ``TODO.sql`` is used when neither lookup succeeds.

    Args:
        spreadsheet_id: ID of the Google Sheets file to inspect.
        sql_dir: Directory containing the ``.sql`` files, passed to
            ``get_sql_to_sheet_map``.

    Returns:
        None. If obtaining credentials or building the API client fails, a
        message is printed and the function returns without further output.
    """
    try:
        credentials, _ = google.auth.default(scopes=SCOPES)
        sheets_service = build('sheets', 'v4', cache_discovery=False, credentials=credentials)
    except Exception as e:
        # Broad on purpose: whatever goes wrong here is reported with the same
        # setup hint and the script stops producing output.
        print(f"Authentication failed: {e}")
        print("Please ensure you have application default credentials set up.")
        return

    sql_map = get_sql_to_sheet_map(sql_dir)
    # Secondary index used when a sheet title differs from the derived name
    # only by letter case (e.g. "Most common trackers").
    sql_map_folded = {name.casefold(): filename for name, filename in sql_map.items()}

    response = sheets_service.spreadsheets().get(spreadsheetId=spreadsheet_id, includeGridData=False).execute()

    for sheet in response.get('sheets', []):
        sheet_name = sheet['properties']['title']
        sheet_id = sheet['properties']['sheetId']

        # Exact match first, then case-insensitive; otherwise leave a visible
        # placeholder for the author to fill in by hand.
        sql_file = (
            sql_map.get(sheet_name)
            or sql_map_folded.get(sheet_name.casefold())
            or "TODO.sql"
        )

        for chart in sheet.get('charts', []):
            title = chart['spec'].get('title', 'Untitled Chart')
            chart_id = chart['chartId']

            # Slugify for image name
            image_name = re.sub(r'[^a-z0-9]+', '-', title.lower()).strip('-') + ".png"

            # Doubled braces render as the literal "{{" / "}}" of the shortcode.
            markup = f"""{{{{ figure_markup(
image="{image_name}",
caption="{title}",
description="",
chart_url="https://docs.google.com/spreadsheets/d/e/{PUBCHART_ID}/pubchart?oid={chart_id}&format=interactive",
sheets_gid="{sheet_id}",
sql_file="{sql_file}"
)
}}}}"""
            print(markup)
            print()


if __name__ == "__main__":
    # SQL_DIR is expressed relative to this file, so anchor it to the
    # script's own directory rather than the current working directory.
    here = os.path.dirname(os.path.abspath(__file__))
    sql_path = os.path.normpath(os.path.join(here, SQL_DIR))

    print(f"Processing Spreadsheet: {SPREADSHEET_ID}")
    print(f"SQL Directory: {sql_path}\n")

    generate_figure_markup(SPREADSHEET_ID, sql_path)
2 changes: 2 additions & 0 deletions sql/util/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
pandas==2.3.3
google-cloud-bigquery==3.40.0
google-api-python-client==2.188.0
google-auth==2.47.0
requests==2.32.5
tabulate==0.9.0
gspread==6.2.1
Expand Down
1 change: 1 addition & 0 deletions src/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,4 @@ templates/sitemap.xml
static/html/
static/js/web-vitals.js
.coverage
eng.traineddata
96 changes: 54 additions & 42 deletions src/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -117,58 +117,70 @@ There is also a file watcher, which monitors the `content` directory and automat
npm run watch
```

## Generating chapter images

We can automate the generation of chapter images from the command line to save this onerous task.
## Figures preparation

This requires the figure markup to exist in the chapter's markdown file, including the `image` and `chart_url` attributes:
The following tools help automate figure positioning and metadata:

```py
{{ figure_markup(
image="pwa-timeseries-of-service-worker-installations.png",
...
chart_url="https://docs.google.com/spreadsheets/d/e/2PACX-1vRRpTSA4fsHwUap-ByQ08j95uo7Zm1kY6lTSvA-DZT54g2QZ0guV7db3QyQwQgMPzsKsJ43gbzqfJst/pubchart?oid=1883263914&format=interactive",
...
)
}}
```
1. **Running queries and exporting to sheets**: [bq_to_sheets.ipynb](../sql/util/bq_to_sheets.ipynb) runs queries and exports the results to Google Sheets.
2. **Normalization**: [chart-adjustments.ipynb](../sql/util/chart-adjustments.ipynb) normalizes chart sizes in Google Sheets to ensure consistent dimensions for image generation.
3. **Figures markup generation**: [generate_figure_markup.py](../sql/util/generate_figure_markup.py) scans Google Sheets for charts and generates the corresponding `figure_markup` shortcodes with mapped SQL files and sheet IDs.
4. **Generating chapter images**: We can automate the generation of chapter images from the command line to save this onerous task.

It can be run like below, by passing a chapter markdown (with or without the `.md` extension):
This requires the figure markup to exist in the chapter's markdown file, including the `image` and `chart_url` attributes:

```bash
npm run figure-images en/2021/pwa
```
```py
{{ figure_markup(
image="pwa-timeseries-of-service-worker-installations.png",
...
chart_url="https://docs.google.com/spreadsheets/d/e/2PACX-1vRRpTSA4fsHwUap-ByQ08j95uo7Zm1kY6lTSvA-DZT54g2QZ0guV7db3QyQwQgMPzsKsJ43gbzqfJst/pubchart?oid=1883263914&format=interactive",
...
)
}}
```

Which will then generate any missing figures based on the chapter markup, skipping images that already exist:
It can be run like below, by passing a chapter markdown (with or without the `.md` extension):

```log
> almanac.httparchive.org@0.0.1 figure-images
> node ./tools/generate/generate_figure_images "en/2021/pwa"

Generating for chapter: pwa for year 2021
Skipping: pwa-service-worker-controlled-pages-by-rank.png as image already exists
Skipping: pwa-most-used-service-worker-events.png as image already exists
Skipping: pwa-service-worker-and-manifest-usage.png as image already exists
Skipping: pwa-top-pwa-manifest-properties.png as image already exists
Skipping: pwa-top-pwa-manifest-icon-sizes.png as image already exists
Skipping: pwa-manifest-display-values.png as image already exists
Skipping: pwa-manifests-preferring-native-app.png as image already exists
Skipping: pwa-industry-categories.png as image already exists
Skipping: pwa-lighthouse-pwa-audits.png as image already exists
Skipping: pwa-lighthouse-pwa-scores.png as image already exists
Skipping: pwa-libraries-and-scripts.png as image already exists
Skipping: pwa-top-workbox-versions.png as image already exists
Skipping: pwa-top-workbox-packages.png as image already exists
Generating image pwa-workbox-runtime-caching-strategies.png...
Generating image pwa-notification-acceptance-rates.png...
Generating image pwa-install-events.png...
```
```bash
npm run figure-images en/2021/pwa
```

Which will then generate any missing figures based on the chapter markup, skipping images that already exist:

```log
> almanac.httparchive.org@0.0.1 figure-images
> node ./tools/generate/generate_figure_images "en/2021/pwa"

Generating for chapter: pwa for year 2021
Skipping: pwa-service-worker-controlled-pages-by-rank.png as image already exists
Skipping: pwa-most-used-service-worker-events.png as image already exists
Skipping: pwa-service-worker-and-manifest-usage.png as image already exists
Skipping: pwa-top-pwa-manifest-properties.png as image already exists
Skipping: pwa-top-pwa-manifest-icon-sizes.png as image already exists
Skipping: pwa-manifest-display-values.png as image already exists
Skipping: pwa-manifests-preferring-native-app.png as image already exists
Skipping: pwa-industry-categories.png as image already exists
Skipping: pwa-lighthouse-pwa-audits.png as image already exists
Skipping: pwa-lighthouse-pwa-scores.png as image already exists
Skipping: pwa-libraries-and-scripts.png as image already exists
Skipping: pwa-top-workbox-versions.png as image already exists
Skipping: pwa-top-workbox-packages.png as image already exists
Generating image pwa-workbox-runtime-caching-strategies.png...
Generating image pwa-notification-acceptance-rates.png...
Generating image pwa-install-events.png...
```

Authors can delete images and rerun if they want to, to regenerate images.

Authors can delete images and rerun if they want to, to regenerate images.
Images will automatically be compressed by our Calibre GitHub Action when uploaded to GitHub, but you can get a lot more compression (about 44% more!) by running them through <https://tinypng.com> instead (at which point the Calibre Action will usually not find any further compression gains). It's quite simple to drag them in and download the results, so we would encourage analysts/authors to take this step.

Images will automatically be compressed by our Calibre GitHub Action when uploaded to GitHub, but you can get a lot more compression (about 44% more!) by running them through <https://tinypng.com> instead (at which point the Calibre Action will usually not find any further compression gains). It's quite simple to drag them in and download the results, so we would encourage analysts/authors to take this step.
Running them through <https://tinypng.com> also has the added advantage of the compression being repeatable each time. So if you are not sure which images you have changed, you can delete them all, regenerate them all, run them through TinyPNG, and then a `git diff` will only show differences on the images that have changed. This will not be the case if you use the Calibre GitHub Action and it will look like all images have changed.

Running them through <https://tinypng.com> also has the added advantage of the compression being repeatable each time. So if you are not sure which images you have changed, you can delete them all, regenerate them all, run them through TinyPNG, and then a `git diff` will only show differences on the images that have changed. This will not be the case if you use the Calibre GitHub Action and it will look like all images have changed.
5. **Automated descriptions**: [generate_figure_descriptions.js](tools/generate/generate_figure_descriptions.js) uses OCR (via Tesseract.js) to automatically populate `description=""` fields in markdown chapters based on the generated images.

```bash
node ./tools/generate/generate_figure_descriptions en/2025/privacy
```

## Linting files

Expand Down
3 changes: 1 addition & 2 deletions src/config/2025.json
Original file line number Diff line number Diff line change
Expand Up @@ -67,8 +67,7 @@
"chapter_number": "8",
"title": "Privacy",
"slug": "privacy",
"hero_dir": "2020",
"todo": true
"hero_dir": "2020"
},
{
"part": "II",
Expand Down
19 changes: 19 additions & 0 deletions src/config/contributors.json
Original file line number Diff line number Diff line change
Expand Up @@ -3229,6 +3229,10 @@
"analysts",
"developers",
"authors"
],
"2025": [
"analysts",
"editors"
]
},
"website": "https://maxostapenko.com"
Expand Down Expand Up @@ -3772,6 +3776,7 @@
"2025": [
"committee",
"leads",
"authors",
"reviewers"
]
},
Expand Down Expand Up @@ -5154,6 +5159,7 @@
"name": "Vinod Tiwari",
"teams": {
"2025": [
"authors",
"reviewers"
]
},
Expand Down Expand Up @@ -5421,5 +5427,18 @@
]
},
"twitter": "_cybai"
},
"RumaisaHabib": {
"avatar_url": "66083065",
"github": "RumaisaHabib",
"name": "Rumaisa Habib",
"teams": {
"2025": [
"authors"
]
},
"website": "https://rumaisahabib.com/",
"linkedin": "rumaisahabib"
}

}
Loading
Loading