Skip to content

Commit 508f7e5

Browse files
feat: [#45] workflow for a transformation report
1 parent f14a4fa commit 508f7e5

File tree

8 files changed

+484
-0
lines changed

8 files changed

+484
-0
lines changed

analytics/__init__.py

Whitespace-only changes.

analytics/create_report.py

Lines changed: 142 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,142 @@
1+
import os
2+
import psycopg
3+
from datetime import datetime, UTC
4+
from jinja2 import Template
5+
from dotenv import load_dotenv
6+
7+
load_dotenv()

# Database connection settings — overridable via the environment or a .env file.
USER = os.getenv("POSTGRES_USER", "postgres")
PW = os.getenv("POSTGRES_PASSWORD", "postgres")
ADDRESS = os.getenv("POSTGRES_ADDRESS", "127.0.0.1")
PORT = int(os.getenv("POSTGRES_PORT", "5432"))
DB_NAME = os.getenv("POSTGRES_DB", "postgres")

# Directory layout, anchored at this module's own location.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
SQL_DIR = os.path.join(BASE_DIR, "sql")
TPL_DIR = os.path.join(BASE_DIR, "templates")
REPORTS_DIR = os.path.join(BASE_DIR, "reports")

# Make sure the output directory exists before any report is written.
os.makedirs(REPORTS_DIR, exist_ok=True)
23+
24+
25+
def load_sql_files() -> dict[str, str]:
26+
"""
27+
Load all SQL files from /sql directory.
28+
29+
Returns:
30+
dict[str, str]: SQL query name -> SQL query content mapping
31+
"""
32+
queries = {}
33+
for filename in os.listdir(SQL_DIR):
34+
if filename.endswith(".sql"):
35+
key = filename.replace(".sql", "")
36+
with open(os.path.join(SQL_DIR, filename), "r") as f:
37+
queries[key] = f.read()
38+
return queries
39+
40+
41+
def run_queries(queries: dict[str, str]) -> dict[str, list[dict]]:
    """
    Execute SQL queries using psycopg3 with clean defaults.

    Args:
        queries: dict[str, str] - SQL query name -> SQL query content mapping

    Returns:
        dict[str, list[dict]]: Query name -> list of result rows (as dicts) mapping
    """
    out: dict[str, list[dict]] = {}

    # One connection and one cursor are reused for every query; the context
    # managers guarantee both are closed even if a query fails.
    with psycopg.connect(
        dbname=DB_NAME,
        user=USER,
        password=PW,
        host=ADDRESS,
        port=PORT,
    ) as conn, conn.cursor() as cur:
        for query_name, statement in queries.items():
            cur.execute(statement)
            records = cur.fetchall()
            # cur.description yields one entry per result column; entry [0]
            # is the column name, used here as the dict key for each row.
            headers = [entry[0] for entry in cur.description]
            out[query_name] = [dict(zip(headers, record)) for record in records]

    return out
68+
69+
70+
def aggregate_statistics(data: dict[str, list[dict]]) -> dict[str, int]:
    """
    Calculate summary statistics from query results.

    Args:
        data: dict[str, list[dict]] - Query results mapping

    Returns:
        dict[str, int]: Statistics key -> value mapping with
        ``total_records`` (sum over the per-endpoint record counts) and
        ``total_events`` (sum over the harvest-event totals). Missing
        query results or missing row fields count as zero.
    """
    per_endpoint = data.get("count_records_per_endpoint", [])
    harvest_summary = data.get("summary", [])

    return {
        "total_records": sum(entry.get("record_count", 0) for entry in per_endpoint),
        "total_events": sum(entry.get("total_harvest_events", 0) for entry in harvest_summary),
    }
91+
92+
93+
def render_markdown(data: dict[str, list[dict]]) -> str:
    """
    Render Markdown report using Jinja2 template.

    Args:
        data: dict[str, list[dict]] - Query results mapping

    Returns:
        str: Rendered Markdown report content
    """
    # Template file lives next to this module under templates/.
    with open(os.path.join(TPL_DIR, "report.md.jinja")) as tpl_file:
        report_template = Template(tpl_file.read())

    # Derived totals are exposed to the template alongside the raw results.
    summary_stats = aggregate_statistics(data)

    # Each query result and each statistic becomes a top-level template
    # variable, plus a UTC timestamp and the database name.
    return report_template.render(
        generated_at=datetime.now(UTC).isoformat(),
        database_name=DB_NAME,
        **data,
        **summary_stats,
    )
117+
118+
119+
def save_report(content: str) -> None:
120+
"""
121+
Save the Markdown report to a timestamped file in reports/ directory.
122+
123+
Args:
124+
content: str - Rendered Markdown report content
125+
126+
Returns:
127+
None
128+
"""
129+
filename = f"report_{datetime.now(UTC).strftime('%Y-%m-%d_%H-%M-%S')}.md"
130+
output_path = os.path.join(REPORTS_DIR, filename)
131+
132+
with open(output_path, "w") as f:
133+
f.write(content)
134+
135+
print(f"✓ Report saved to {output_path}")
136+
137+
138+
if __name__ == "__main__":
    # Pipeline: read queries from disk, run them, render, persist.
    loaded_queries = load_sql_files()
    fetched_results = run_queries(loaded_queries)
    report_markdown = render_markdown(fetched_results)
    save_report(report_markdown)
Lines changed: 229 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,229 @@
1+
# Harvest Report
2+
3+
Generated at: **2025-12-04T18:08:43.394581+00:00**
4+
5+
---
6+
7+
## Summary Statistics
8+
9+
- **Total Records Harvested:** 52260
10+
- **Total Endpoints:** 8
11+
- **Harvest Events:** 52267
12+
- **Errors:** 6
13+
- **Duplicates:** 12
14+
15+
---
16+
17+
## Records per Endpoint
18+
19+
| Endpoint | Records | % of Total |
20+
|----------|---------|-----------|
21+
| Archaeology Data Station | 32394 | 62.0% |
22+
| Generalist | 8553 | 16.4% |
23+
| Social Sciences Data Station | 7968 | 15.2% |
24+
| SwissUbase | 1175 | 2.2% |
25+
| Life Sciences | 1020 | 2.0% |
26+
| Physical and Technical Sciences | 861 | 1.6% |
27+
| DABAR | 287 | 0.5% |
28+
| Onedata | 2 | 0.0% |
29+
30+
31+
---
32+
33+
## Harvest Summary
34+
35+
| Endpoint | Total Events | Successful | Failed | Success Rate |
36+
|----------|--------------|-----------|--------|--------------|
37+
| Archaeology Data Station | 32401 | 32395 | 6 | 99.98% |
38+
| Generalist | 8553 | 8553 | 0 | 100.0% |
39+
| Social Sciences Data Station | 7968 | 7968 | 0 | 100.0% |
40+
| SwissUbase | 1175 | 1175 | 0 | 100.0% |
41+
| Life Sciences | 1020 | 1020 | 0 | 100.0% |
42+
| Physical and Technical Sciences | 861 | 861 | 0 | 100.0% |
43+
| DABAR | 287 | 287 | 0 | 100.0% |
44+
| Onedata | 2 | 2 | 0 | 100.0% |
45+
46+
47+
---
48+
49+
## Data Quality Issues
50+
51+
### Errors (6)
52+
53+
54+
The following validation errors were detected during harvest:
55+
56+
57+
**Archaeology Data Station** (ID: `ebcc8eaa-5da7-4fee-b354-557f3ee7f3d0` | Identifier: `doi:10.17026/AR/ZWGO7D`)
58+
59+
```
60+
'202-05-29' does not match '^[0-9]{4}-[0-9]{2}-[0-9]{2}$'
61+
62+
Failed validating 'pattern' in schema['properties']['dates']['items']['properties']['date']:
63+
{'type': 'string', 'pattern': '^[0-9]{4}-[0-9]{2}-[0-9]{2}$'}
64+
65+
On instance['dates'][0]['date']:
66+
'202-05-29'
67+
```
68+
69+
**Archaeology Data Station** (ID: `ebcc8eaa-5da7-4fee-b354-557f3ee7f3d0` | Identifier: `doi:10.17026/AR/8AQPUU`)
70+
71+
```
72+
'22-04-24' does not match '^[0-9]{4}-[0-9]{2}-[0-9]{2}$'
73+
74+
Failed validating 'pattern' in schema['properties']['dates']['items']['properties']['date']:
75+
{'type': 'string', 'pattern': '^[0-9]{4}-[0-9]{2}-[0-9]{2}$'}
76+
77+
On instance['dates'][0]['date']:
78+
'22-04-24'
79+
```
80+
81+
**Archaeology Data Station** (ID: `ebcc8eaa-5da7-4fee-b354-557f3ee7f3d0` | Identifier: `doi:10.17026/AR/CATOE2`)
82+
83+
```
84+
'21-04-23' does not match '^[0-9]{4}-[0-9]{2}-[0-9]{2}$'
85+
86+
Failed validating 'pattern' in schema['properties']['dates']['items']['properties']['date']:
87+
{'type': 'string', 'pattern': '^[0-9]{4}-[0-9]{2}-[0-9]{2}$'}
88+
89+
On instance['dates'][0]['date']:
90+
'21-04-23'
91+
```
92+
93+
**Archaeology Data Station** (ID: `ebcc8eaa-5da7-4fee-b354-557f3ee7f3d0` | Identifier: `doi:10.17026/AR/QDHTLZ`)
94+
95+
```
96+
'12-11-24' does not match '^[0-9]{4}-[0-9]{2}-[0-9]{2}$'
97+
98+
Failed validating 'pattern' in schema['properties']['dates']['items']['properties']['date']:
99+
{'type': 'string', 'pattern': '^[0-9]{4}-[0-9]{2}-[0-9]{2}$'}
100+
101+
On instance['dates'][0]['date']:
102+
'12-11-24'
103+
```
104+
105+
**Archaeology Data Station** (ID: `ebcc8eaa-5da7-4fee-b354-557f3ee7f3d0` | Identifier: `doi:10.17026/AR/DDDINZ`)
106+
107+
```
108+
'202-02-09' does not match '^[0-9]{4}-[0-9]{2}-[0-9]{2}$'
109+
110+
Failed validating 'pattern' in schema['properties']['dates']['items']['properties']['date']:
111+
{'type': 'string', 'pattern': '^[0-9]{4}-[0-9]{2}-[0-9]{2}$'}
112+
113+
On instance['dates'][0]['date']:
114+
'202-02-09'
115+
```
116+
117+
**Archaeology Data Station** (ID: `ebcc8eaa-5da7-4fee-b354-557f3ee7f3d0` | Identifier: `doi:10.17026/AR/9Z3TD3`)
118+
119+
```
120+
'21-04-22' does not match '^[0-9]{4}-[0-9]{2}-[0-9]{2}$'
121+
122+
Failed validating 'pattern' in schema['properties']['dates']['items']['properties']['date']:
123+
{'type': 'string', 'pattern': '^[0-9]{4}-[0-9]{2}-[0-9]{2}$'}
124+
125+
On instance['dates'][0]['date']:
126+
'21-04-22'
127+
```
128+
129+
130+
131+
### Duplicates (12)
132+
133+
134+
The following record identifiers appear in multiple endpoints:
135+
136+
137+
138+
139+
**doi:10.17026/DANS-23V-7HCG**
140+
141+
142+
- `Life Sciences` (ID: `1ed6f781-ab7d-43fa-a4de-6834299f2f4e::doi:10.17026/DANS-23V-7HCG`)
143+
144+
145+
146+
**doi:10.17026/DANS-23V-7HCG**
147+
148+
149+
- `Social Sciences Data Station` (ID: `d15f10e3-7e13-44d1-8349-382a849cb92e::doi:10.17026/DANS-23V-7HCG`)
150+
151+
152+
153+
**doi:10.17026/DANS-28X-UT8Q**
154+
155+
156+
- `Life Sciences` (ID: `1ed6f781-ab7d-43fa-a4de-6834299f2f4e::doi:10.17026/DANS-28X-UT8Q`)
157+
158+
159+
160+
**doi:10.17026/DANS-28X-UT8Q**
161+
162+
163+
- `Social Sciences Data Station` (ID: `d15f10e3-7e13-44d1-8349-382a849cb92e::doi:10.17026/DANS-28X-UT8Q`)
164+
165+
166+
167+
**doi:10.17026/DANS-XCK-9ZHP**
168+
169+
170+
- `Life Sciences` (ID: `1ed6f781-ab7d-43fa-a4de-6834299f2f4e::doi:10.17026/DANS-XCK-9ZHP`)
171+
172+
173+
174+
**doi:10.17026/DANS-XCK-9ZHP**
175+
176+
177+
- `Social Sciences Data Station` (ID: `d15f10e3-7e13-44d1-8349-382a849cb92e::doi:10.17026/DANS-XCK-9ZHP`)
178+
179+
180+
181+
**doi:10.17026/DANS-XFT-EPRJ**
182+
183+
184+
- `Physical and Technical Sciences` (ID: `6d1f99b3-6357-4986-8947-f57af06ae191::doi:10.17026/DANS-XFT-EPRJ`)
185+
186+
187+
188+
**doi:10.17026/DANS-XFT-EPRJ**
189+
190+
191+
- `Social Sciences Data Station` (ID: `d15f10e3-7e13-44d1-8349-382a849cb92e::doi:10.17026/DANS-XFT-EPRJ`)
192+
193+
194+
195+
**doi:10.17026/DANS-XS8-RAUM**
196+
197+
198+
- `Life Sciences` (ID: `1ed6f781-ab7d-43fa-a4de-6834299f2f4e::doi:10.17026/DANS-XS8-RAUM`)
199+
200+
201+
202+
**doi:10.17026/DANS-XS8-RAUM**
203+
204+
205+
- `Social Sciences Data Station` (ID: `d15f10e3-7e13-44d1-8349-382a849cb92e::doi:10.17026/DANS-XS8-RAUM`)
206+
207+
208+
209+
**doi:10.17026/DANS-ZGN-BEEZ**
210+
211+
212+
- `Life Sciences` (ID: `1ed6f781-ab7d-43fa-a4de-6834299f2f4e::doi:10.17026/DANS-ZGN-BEEZ`)
213+
214+
215+
216+
**doi:10.17026/DANS-ZGN-BEEZ**
217+
218+
219+
- `Social Sciences Data Station` (ID: `d15f10e3-7e13-44d1-8349-382a849cb92e::doi:10.17026/DANS-ZGN-BEEZ`)
220+
221+
222+
223+
---
224+
225+
## Report Metadata
226+
227+
- **Generated:** 2025-12-04T18:08:43.394581+00:00
228+
- **Database:** postgres
229+
- **Report Period:** Latest harvest run
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
-- Record count per harvesting endpoint, largest endpoints first.
-- Output columns (endpoint_name, record_count) are read by create_report.py.
SELECT
    ep.name AS endpoint_name,
    COUNT(rec.id) AS record_count
FROM records AS rec
INNER JOIN endpoints AS ep
    ON rec.endpoint_id = ep.id
GROUP BY ep.name
ORDER BY record_count DESC;

analytics/sql/duplicates.sql

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
-- Every record whose identifier occurs more than once (i.e. the same
-- identifier harvested into multiple endpoints), with a stable ordering.
SELECT
    rec.record_identifier,
    rec.id AS record_id,
    ep.name AS endpoint_name
FROM records AS rec
INNER JOIN endpoints AS ep
    ON rec.endpoint_id = ep.id
WHERE rec.record_identifier IN (
    -- Identifiers appearing on more than one row across all endpoints.
    SELECT record_identifier
    FROM records
    GROUP BY record_identifier
    HAVING COUNT(*) > 1
)
ORDER BY rec.record_identifier, ep.name, rec.id;

0 commit comments

Comments
 (0)