diff --git a/back/config.yaml b/back/config.yaml
index 2e87bcce3..b1d7dfb65 100644
--- a/back/config.yaml
+++ b/back/config.yaml
@@ -123,6 +123,9 @@ logging:
   formatters:
     simple:
       format: '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+    jsonl:
+      # Custom class: a %-style format fails on records lacking the extra fields
+      (): back.scripts.utils.logger_manager.JsonlFormatter
   handlers:
     console:
       class: logging.StreamHandler
@@ -134,7 +137,12 @@ logging:
       level: DEBUG
       formatter: simple
       filename: back/data/logs/log.txt
+    jsonl:
+      class: logging.FileHandler
+      level: ERROR
+      formatter: jsonl
+      filename: back/data/logs/errors.jsonl
   loggers:
     back:
       level: INFO
-      handlers: [console, file]
\ No newline at end of file
+      handlers: [console, file, jsonl]
diff --git a/back/scripts/datasets/dataset_aggregator.py b/back/scripts/datasets/dataset_aggregator.py
index 8f19c07c4..6dda65a4a 100644
--- a/back/scripts/datasets/dataset_aggregator.py
+++ b/back/scripts/datasets/dataset_aggregator.py
@@ -1,8 +1,8 @@
+import datetime
 import hashlib
 import json
 import logging
 import urllib
-from collections import defaultdict
 from pathlib import Path
 from urllib.error import HTTPError
 
@@ -15,6 +15,7 @@ from back.scripts.utils.decorators import tracker
 
 
 LOGGER = logging.getLogger(__name__)
+ERROR_LOGGER = logging.getLogger("errors_logger")  # Structured JSONL error logger
 
 
 def _sha256(s):
@@ -57,14 +58,35 @@ def get_output_path(cls, main_config: dict) -> Path:
 
     def __init__(self, files: pd.DataFrame, main_config: dict):
         self._config = main_config[self.get_config_key()]
-
         self.files_in_scope = files.pipe(self._ensure_url_hash)
 
         self.data_folder = get_project_base_path() / self._config["data_folder"]
         self.data_folder.mkdir(parents=True, exist_ok=True)
         self.output_filename = self.get_output_path(main_config)
         self.output_filename.parent.mkdir(parents=True, exist_ok=True)
-        self.errors = defaultdict(list)
+
+        self.errors = []
+
+    def _log_error(self, error_code, message, file_url, dataset, step, details=None):
+        """Record a structured error in ``self.errors`` and in the JSONL error log.
+
+        ``details`` falls back to an empty dict when omitted or falsy.
+        """
+        now = datetime.datetime.now(datetime.timezone.utc)
+        error = {
+            "timestamp": now.isoformat().replace("+00:00", "Z"),
+            "error_code": error_code,
+            "message": message,
+            "file_url": file_url,
+            "dataset": dataset,
+            "step": step,
+            "details": details or {},
+        }
+        self.errors.append(error)  # Kept for the errors.json export
+        # "message" is reserved on LogRecord: passing it in ``extra`` makes
+        # logging raise KeyError, so it is only sent as the log message.
+        extra = {key: value for key, value in error.items() if key != "message"}
+        ERROR_LOGGER.error(message, extra=extra)
 
     def _ensure_url_hash(self, frame: pd.DataFrame) -> pd.DataFrame:
         hashes = frame["url"].apply(_sha256)
@@ -78,27 +100,47 @@ def run(self) -> None:
             return
         self._process_files()
         self._concatenate_files()
-        with open(self.data_folder / "errors.json", "w") as f:
-            json.dump(self.errors, f)
+        # Export errors.json (legacy output file)
+        with open(self.data_folder / "errors.json", "w", encoding="utf-8") as f:
+            json.dump(self.errors, f, indent=2, ensure_ascii=False)
 
     def _process_files(self):
         for file_infos in tqdm(self._remaining_to_normalize()):
             if file_infos.format not in LOADER_CLASSES:
                 LOGGER.warning(f"Format {file_infos.format} not supported")
+                self._log_error(
+                    error_code="FORMAT_NOT_SUPPORTED",
+                    message=f"Format {file_infos.format} not supported",
+                    file_url=getattr(file_infos, "url", None),
+                    dataset=self.get_config_key(),
+                    step="process_files",
+                    details={"title": getattr(file_infos, "title", None)},
+                )
                 continue
 
             if file_infos.url is None or pd.isna(file_infos.url):
                 LOGGER.warning(f"URL not specified for file {file_infos.title}")
+                self._log_error(
+                    error_code="URL_NOT_SPECIFIED",
+                    message=f"URL not specified for file {file_infos.title}",
+                    file_url=None,
+                    dataset=self.get_config_key(),
+                    step="process_files",
+                    details={"title": getattr(file_infos, "title", None)},
+                )
                 continue
 
             try:
                 self._process_file(file_infos)
             except Exception as e:
                 LOGGER.warning(f"Failed to process file {file_infos.url}: {e}")
-                self.errors[str(e)].append(file_infos.url)
-
-        with open(self.data_folder / "errors.json", "w") as f:
-            json.dump(self.errors, f)
+                self._log_error(
+                    error_code="PROCESS_FILE_FAILED",
+                    message=str(e),
+                    file_url=getattr(file_infos, "url", None),
+                    dataset=self.get_config_key(),
+                    step="process_files",
+                )
         self._post_process()
         self._concatenate_files()
 
@@ -128,10 +170,23 @@ def _download_file(self, file_metadata: tuple):
             msg = (
                 f"HTTP error {error.code} while expecting {file_metadata.resource_status} code"
             )
-            self.errors[msg].append(file_metadata.url)
+            self._log_error(
+                error_code="HTTP_ERROR",
+                message=msg,
+                file_url=file_metadata.url,
+                dataset=self.get_config_key(),
+                step="download_file",
+                details={"exception": str(error)},
+            )
         except Exception as e:
             LOGGER.warning(f"Failed to download file {file_metadata.url}: {e}")
-            self.errors[str(e)].append(file_metadata.url)
+            self._log_error(
+                error_code="DOWNLOAD_FAILED",
+                message=str(e),
+                file_url=file_metadata.url,
+                dataset=self.get_config_key(),
+                step="download_file",
+            )
         LOGGER.debug(f"Downloaded file {file_metadata.url}")
 
     def _dataset_filename(self, file_metadata: tuple, step: str):
@@ -168,7 +223,13 @@ def _read_parse_file(self, file_metadata: tuple, raw_filename: Path) -> pd.DataF
                 raise RuntimeError("Unable to load file into a DataFrame")
             return df.pipe(self._normalize_frame, file_metadata)
         except Exception as e:
-            self.errors[str(e)].append(raw_filename.parent.name)
+            self._log_error(
+                error_code="LOAD_FAILED",
+                message=str(e),
+                file_url=file_metadata.url,
+                dataset=self.get_config_key(),
+                step="read_parse_file",
+            )
 
     def _normalize_frame(self, df: pd.DataFrame, file_metadata: tuple):
         raise NotImplementedError()
diff --git a/back/scripts/utils/logger_manager.py b/back/scripts/utils/logger_manager.py
index 31a3d647f..c83d8393c 100644
--- a/back/scripts/utils/logger_manager.py
+++ b/back/scripts/utils/logger_manager.py
@@ -1,11 +1,63 @@
+import datetime
+import json
 import logging
 import logging.config
 import os
+from pathlib import Path
+
+from back.scripts.utils.config import get_project_base_path
+
+
+class JsonlFormatter(logging.Formatter):
+    """Render a log record as one JSON object per line (JSONL)."""
+
+    def format(self, record):
+        """Return the JSON line for ``record``.
+
+        Structured fields are read from attributes set through ``extra``;
+        records without them fall back to default values.
+        """
+        now = datetime.datetime.now(datetime.timezone.utc)
+        log = {
+            "timestamp": getattr(
+                record, "timestamp", now.isoformat().replace("+00:00", "Z")
+            ),
+            "error_code": getattr(record, "error_code", "UNKNOWN"),
+            "message": record.getMessage(),
+            "file_url": getattr(record, "file_url", None),
+            "dataset": getattr(record, "dataset", None),
+            "step": getattr(record, "step", None),
+            "details": getattr(record, "details", {}),
+        }
+        # default=str keeps a non-serializable value from dropping the record.
+        return json.dumps(log, ensure_ascii=False, default=str)
 
 
 class LoggerManager:
     @staticmethod
     def configure_logger(config):
+        """Apply ``config["logging"]`` and set up the JSONL error logger."""
+        # Main loggers are configured through dictConfig
         log_directory = os.path.dirname(config["logging"]["handlers"]["file"]["filename"])
         os.makedirs(log_directory, exist_ok=True)
         logging.config.dictConfig(config["logging"])
+
+        # Structured JSONL logger for errors
+        error_log_path = Path(
+            config.get("logging", {}).get(
+                "errors_filename", get_project_base_path() / "errors.jsonl"
+            )
+        )
+        error_log_path.parent.mkdir(parents=True, exist_ok=True)
+
+        error_logger = logging.getLogger("errors_logger")
+        error_logger.setLevel(logging.ERROR)
+        error_logger.propagate = False
+        # dictConfig disables loggers created before it runs unless the config
+        # sets disable_existing_loggers to false; re-enable this one explicitly.
+        error_logger.disabled = False
+        # Attach the file handler only once so repeated calls do not duplicate lines.
+        if not error_logger.handlers:
+            handler = logging.FileHandler(error_log_path, mode="a", encoding="utf-8")
+            handler.setFormatter(JsonlFormatter())
+            error_logger.addHandler(handler)