Skip to content

Commit 508f7e5

Browse files
feat: [#45] workflow for a transformation report
1 parent f14a4fa commit 508f7e5

File tree

8 files changed

+484
-0
lines changed

8 files changed

+484
-0
lines changed

analytics/__init__.py

Whitespace-only changes.

analytics/create_report.py

Lines changed: 142 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,142 @@
1+
import os
2+
import psycopg
3+
from datetime import datetime, UTC
4+
from jinja2 import Template
5+
from dotenv import load_dotenv
6+
7+
load_dotenv()

# Database connection settings — overridable via the environment or a .env file.
USER = os.getenv("POSTGRES_USER", "postgres")
PW = os.getenv("POSTGRES_PASSWORD", "postgres")
ADDRESS = os.getenv("POSTGRES_ADDRESS", "127.0.0.1")
PORT = int(os.getenv("POSTGRES_PORT", "5432"))
DB_NAME = os.getenv("POSTGRES_DB", "postgres")

# Directory layout, anchored at this module's own location.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
SQL_DIR = os.path.join(BASE_DIR, "sql")
TPL_DIR = os.path.join(BASE_DIR, "templates")
REPORTS_DIR = os.path.join(BASE_DIR, "reports")

# Make sure the output directory exists before any report is written.
os.makedirs(REPORTS_DIR, exist_ok=True)
23+
24+
25+
def load_sql_files() -> dict[str, str]:
26+
"""
27+
Load all SQL files from /sql directory.
28+
29+
Returns:
30+
dict[str, str]: SQL query name -> SQL query content mapping
31+
"""
32+
queries = {}
33+
for filename in os.listdir(SQL_DIR):
34+
if filename.endswith(".sql"):
35+
key = filename.replace(".sql", "")
36+
with open(os.path.join(SQL_DIR, filename), "r") as f:
37+
queries[key] = f.read()
38+
return queries
39+
40+
41+
def run_queries(queries: dict[str, str]) -> dict[str, list[dict]]:
    """
    Execute SQL queries using psycopg3 with clean defaults.

    Args:
        queries: dict[str, str] - SQL query name -> SQL query content mapping

    Returns:
        dict[str, list[dict]]: Query name -> list of result rows (as dicts) mapping
    """
    out: dict[str, list[dict]] = {}

    # One connection and one cursor are reused for every query; the context
    # managers guarantee both are closed even if a query fails.
    with psycopg.connect(
        dbname=DB_NAME,
        user=USER,
        password=PW,
        host=ADDRESS,
        port=PORT,
    ) as conn, conn.cursor() as cur:
        for query_name, statement in queries.items():
            cur.execute(statement)
            records = cur.fetchall()
            # cur.description yields one entry per result column; entry [0]
            # is the column name, used here as the dict key for each row.
            headers = [entry[0] for entry in cur.description]
            out[query_name] = [dict(zip(headers, record)) for record in records]

    return out
68+
69+
70+
def aggregate_statistics(data: dict[str, list[dict]]) -> dict[str, int]:
    """
    Calculate summary statistics from query results.

    Args:
        data: dict[str, list[dict]] - Query results mapping

    Returns:
        dict[str, int]: Statistics key -> value mapping with
        ``total_records`` (sum over the per-endpoint record counts) and
        ``total_events`` (sum over the harvest-event totals). Missing
        query results or missing row fields count as zero.
    """
    per_endpoint = data.get("count_records_per_endpoint", [])
    harvest_summary = data.get("summary", [])

    return {
        "total_records": sum(entry.get("record_count", 0) for entry in per_endpoint),
        "total_events": sum(entry.get("total_harvest_events", 0) for entry in harvest_summary),
    }
91+
92+
93+
def render_markdown(data: dict[str, list[dict]]) -> str:
    """
    Render Markdown report using Jinja2 template.

    Args:
        data: dict[str, list[dict]] - Query results mapping

    Returns:
        str: Rendered Markdown report content
    """
    # Template file lives next to this module under templates/.
    with open(os.path.join(TPL_DIR, "report.md.jinja")) as tpl_file:
        report_template = Template(tpl_file.read())

    # Derived totals are exposed to the template alongside the raw results.
    summary_stats = aggregate_statistics(data)

    # Each query result and each statistic becomes a top-level template
    # variable, plus a UTC timestamp and the database name.
    return report_template.render(
        generated_at=datetime.now(UTC).isoformat(),
        database_name=DB_NAME,
        **data,
        **summary_stats,
    )
117+
118+
119+
def save_report(content: str) -> None:
120+
"""
121+
Save the Markdown report to a timestamped file in reports/ directory.
122+
123+
Args:
124+
content: str - Rendered Markdown report content
125+
126+
Returns:
127+
None
128+
"""
129+
filename = f"report_{datetime.now(UTC).strftime('%Y-%m-%d_%H-%M-%S')}.md"
130+
output_path = os.path.join(REPORTS_DIR, filename)
131+
132+
with open(output_path, "w") as f:
133+
f.write(content)
134+
135+
print(f"✓ Report saved to {output_path}")
136+
137+
138+
if __name__ == "__main__":
    # Pipeline: read queries from disk, run them, render, persist.
    loaded_queries = load_sql_files()
    fetched_results = run_queries(loaded_queries)
    report_markdown = render_markdown(fetched_results)
    save_report(report_markdown)
Lines changed: 229 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,229 @@
1+
# Harvest Report
2+
3+
Generated at: **2025-12-04T18:08:43.394581+00:00**
4+
5+
---
6+
7+
## Summary Statistics
8+
9+
- **Total Records Harvested:** 52260
10+
- **Total Endpoints:** 8
11+
- **Harvest Events:** 52267
12+
- **Errors:** 6
13+
- **Duplicates:** 12
14+
15+
---
16+
17+
## Records per Endpoint
18+
19+
| Endpoint | Records | % of Total |
20+
|----------|---------|-----------|
21+
| Archaeology Data Station | 32394 | 62.0% |
22+
| Generalist | 8553 | 16.4% |
23+
| Social Sciences Data Station | 7968 | 15.2% |
24+
| SwissUbase | 1175 | 2.2% |
25+
| Life Sciences | 1020 | 2.0% |
26+
| Physical and Technical Sciences | 861 | 1.6% |
27+
| DABAR | 287 | 0.5% |
28+
| Onedata | 2 | 0.0% |
29+
30+
31+
---
32+
33+
## Harvest Summary
34+
35+
| Endpoint | Total Events | Successful | Failed | Success Rate |
36+
|----------|--------------|-----------|--------|--------------|
37+
| Archaeology Data Station | 32401 | 32395 | 6 | 99.98% |
38+
| Generalist | 8553 | 8553 | 0 | 100.0% |
39+
| Social Sciences Data Station | 7968 | 7968 | 0 | 100.0% |
40+
| SwissUbase | 1175 | 1175 | 0 | 100.0% |
41+
| Life Sciences | 1020 | 1020 | 0 | 100.0% |
42+
| Physical and Technical Sciences | 861 | 861 | 0 | 100.0% |
43+
| DABAR | 287 | 287 | 0 | 100.0% |
44+
| Onedata | 2 | 2 | 0 | 100.0% |
45+
46+
47+
---
48+
49+
## Data Quality Issues
50+
51+
### Errors (6)
52+
53+
54+
The following validation errors were detected during harvest:
55+
56+
57+
**Archaeology Data Station** (ID: `ebcc8eaa-5da7-4fee-b354-557f3ee7f3d0` | Identifier: `doi:10.17026/AR/ZWGO7D`)
58+
59+
```
60+
'202-05-29' does not match '^[0-9]{4}-[0-9]{2}-[0-9]{2}$'
61+
62+
Failed validating 'pattern' in schema['properties']['dates']['items']['properties']['date']:
63+
{'type': 'string', 'pattern': '^[0-9]{4}-[0-9]{2}-[0-9]{2}$'}
64+
65+
On instance['dates'][0]['date']:
66+
'202-05-29'
67+
```
68+
69+
**Archaeology Data Station** (ID: `ebcc8eaa-5da7-4fee-b354-557f3ee7f3d0` | Identifier: `doi:10.17026/AR/8AQPUU`)
70+
71+
```
72+
'22-04-24' does not match '^[0-9]{4}-[0-9]{2}-[0-9]{2}$'
73+
74+
Failed validating 'pattern' in schema['properties']['dates']['items']['properties']['date']:
75+
{'type': 'string', 'pattern': '^[0-9]{4}-[0-9]{2}-[0-9]{2}$'}
76+
77+
On instance['dates'][0]['date']:
78+
'22-04-24'
79+
```
80+
81+
**Archaeology Data Station** (ID: `ebcc8eaa-5da7-4fee-b354-557f3ee7f3d0` | Identifier: `doi:10.17026/AR/CATOE2`)
82+
83+
```
84+
'21-04-23' does not match '^[0-9]{4}-[0-9]{2}-[0-9]{2}$'
85+
86+
Failed validating 'pattern' in schema['properties']['dates']['items']['properties']['date']:
87+
{'type': 'string', 'pattern': '^[0-9]{4}-[0-9]{2}-[0-9]{2}$'}
88+
89+
On instance['dates'][0]['date']:
90+
'21-04-23'
91+
```
92+
93+
**Archaeology Data Station** (ID: `ebcc8eaa-5da7-4fee-b354-557f3ee7f3d0` | Identifier: `doi:10.17026/AR/QDHTLZ`)
94+
95+
```
96+
'12-11-24' does not match '^[0-9]{4}-[0-9]{2}-[0-9]{2}$'
97+
98+
Failed validating 'pattern' in schema['properties']['dates']['items']['properties']['date']:
99+
{'type': 'string', 'pattern': '^[0-9]{4}-[0-9]{2}-[0-9]{2}$'}
100+
101+
On instance['dates'][0]['date']:
102+
'12-11-24'
103+
```
104+
105+
**Archaeology Data Station** (ID: `ebcc8eaa-5da7-4fee-b354-557f3ee7f3d0` | Identifier: `doi:10.17026/AR/DDDINZ`)
106+
107+
```
108+
'202-02-09' does not match '^[0-9]{4}-[0-9]{2}-[0-9]{2}$'
109+
110+
Failed validating 'pattern' in schema['properties']['dates']['items']['properties']['date']:
111+
{'type': 'string', 'pattern': '^[0-9]{4}-[0-9]{2}-[0-9]{2}$'}
112+
113+
On instance['dates'][0]['date']:
114+
'202-02-09'
115+
```
116+
117+
**Archaeology Data Station** (ID: `ebcc8eaa-5da7-4fee-b354-557f3ee7f3d0` | Identifier: `doi:10.17026/AR/9Z3TD3`)
118+
119+
```
120+
'21-04-22' does not match '^[0-9]{4}-[0-9]{2}-[0-9]{2}$'
121+
122+
Failed validating 'pattern' in schema['properties']['dates']['items']['properties']['date']:
123+
{'type': 'string', 'pattern': '^[0-9]{4}-[0-9]{2}-[0-9]{2}$'}
124+
125+
On instance['dates'][0]['date']:
126+
'21-04-22'
127+
```
128+
129+
130+
131+
### Duplicates (12)
132+
133+
134+
The following record identifiers appear in multiple endpoints:
135+
136+
137+
138+
139+
**doi:10.17026/DANS-23V-7HCG**
140+
141+
142+
- `Life Sciences` (ID: `1ed6f781-ab7d-43fa-a4de-6834299f2f4e::doi:10.17026/DANS-23V-7HCG`)
143+
144+
145+
146+
**doi:10.17026/DANS-23V-7HCG**
147+
148+
149+
- `Social Sciences Data Station` (ID: `d15f10e3-7e13-44d1-8349-382a849cb92e::doi:10.17026/DANS-23V-7HCG`)
150+
151+
152+
153+
**doi:10.17026/DANS-28X-UT8Q**
154+
155+
156+
- `Life Sciences` (ID: `1ed6f781-ab7d-43fa-a4de-6834299f2f4e::doi:10.17026/DANS-28X-UT8Q`)
157+
158+
159+
160+
**doi:10.17026/DANS-28X-UT8Q**
161+
162+
163+
- `Social Sciences Data Station` (ID: `d15f10e3-7e13-44d1-8349-382a849cb92e::doi:10.17026/DANS-28X-UT8Q`)
164+
165+
166+
167+
**doi:10.17026/DANS-XCK-9ZHP**
168+
169+
170+
- `Life Sciences` (ID: `1ed6f781-ab7d-43fa-a4de-6834299f2f4e::doi:10.17026/DANS-XCK-9ZHP`)
171+
172+
173+
174+
**doi:10.17026/DANS-XCK-9ZHP**
175+
176+
177+
- `Social Sciences Data Station` (ID: `d15f10e3-7e13-44d1-8349-382a849cb92e::doi:10.17026/DANS-XCK-9ZHP`)
178+
179+
180+
181+
**doi:10.17026/DANS-XFT-EPRJ**
182+
183+
184+
- `Physical and Technical Sciences` (ID: `6d1f99b3-6357-4986-8947-f57af06ae191::doi:10.17026/DANS-XFT-EPRJ`)
185+
186+
187+
188+
**doi:10.17026/DANS-XFT-EPRJ**
189+
190+
191+
- `Social Sciences Data Station` (ID: `d15f10e3-7e13-44d1-8349-382a849cb92e::doi:10.17026/DANS-XFT-EPRJ`)
192+
193+
194+
195+
**doi:10.17026/DANS-XS8-RAUM**
196+
197+
198+
- `Life Sciences` (ID: `1ed6f781-ab7d-43fa-a4de-6834299f2f4e::doi:10.17026/DANS-XS8-RAUM`)
199+
200+
201+
202+
**doi:10.17026/DANS-XS8-RAUM**
203+
204+
205+
- `Social Sciences Data Station` (ID: `d15f10e3-7e13-44d1-8349-382a849cb92e::doi:10.17026/DANS-XS8-RAUM`)
206+
207+
208+
209+
**doi:10.17026/DANS-ZGN-BEEZ**
210+
211+
212+
- `Life Sciences` (ID: `1ed6f781-ab7d-43fa-a4de-6834299f2f4e::doi:10.17026/DANS-ZGN-BEEZ`)
213+
214+
215+
216+
**doi:10.17026/DANS-ZGN-BEEZ**
217+
218+
219+
- `Social Sciences Data Station` (ID: `d15f10e3-7e13-44d1-8349-382a849cb92e::doi:10.17026/DANS-ZGN-BEEZ`)
220+
221+
222+
223+
---
224+
225+
## Report Metadata
226+
227+
- **Generated:** 2025-12-04T18:08:43.394581+00:00
228+
- **Database:** postgres
229+
- **Report Period:** Latest harvest run
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
-- Record count per harvesting endpoint, largest endpoints first.
-- Output columns (endpoint_name, record_count) are read by create_report.py.
SELECT
    ep.name AS endpoint_name,
    COUNT(rec.id) AS record_count
FROM records AS rec
INNER JOIN endpoints AS ep
    ON rec.endpoint_id = ep.id
GROUP BY ep.name
ORDER BY record_count DESC;

analytics/sql/duplicates.sql

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
-- Every record whose identifier occurs more than once (i.e. the same
-- identifier harvested into multiple endpoints), with a stable ordering.
SELECT
    rec.record_identifier,
    rec.id AS record_id,
    ep.name AS endpoint_name
FROM records AS rec
INNER JOIN endpoints AS ep
    ON rec.endpoint_id = ep.id
WHERE rec.record_identifier IN (
    -- Identifiers appearing on more than one row across all endpoints.
    SELECT record_identifier
    FROM records
    GROUP BY record_identifier
    HAVING COUNT(*) > 1
)
ORDER BY rec.record_identifier, ep.name, rec.id;

0 commit comments

Comments
 (0)