From d1eafebfbd6336fbd94545aa0f12ce6de7c75169 Mon Sep 17 00:00:00 2001
From: Seph75010
Date: Wed, 28 May 2025 17:25:09 +0200
Subject: [PATCH 1/3] [BACK] New structured error records exported as JSONL

---
 back/scripts/datasets/dataset_aggregator.py | 75 +++++++++++++++++----
 1 file changed, 62 insertions(+), 13 deletions(-)

diff --git a/back/scripts/datasets/dataset_aggregator.py b/back/scripts/datasets/dataset_aggregator.py
index 8f19c07c4..fc46cc83a 100644
--- a/back/scripts/datasets/dataset_aggregator.py
+++ b/back/scripts/datasets/dataset_aggregator.py
@@ -2,7 +2,7 @@
 import json
 import logging
 import urllib
-from collections import defaultdict
+import datetime
 from pathlib import Path
 from urllib.error import HTTPError
 
@@ -64,7 +64,19 @@ def __init__(self, files: pd.DataFrame, main_config: dict):
         self.data_folder.mkdir(parents=True, exist_ok=True)
         self.output_filename = self.get_output_path(main_config)
         self.output_filename.parent.mkdir(parents=True, exist_ok=True)
-        self.errors = defaultdict(list)
+        self.errors = []
+
+    def _log_error(self, error_code, message, file_url, dataset, step, details=None):
+        error = {
+            "timestamp": datetime.datetime.utcnow().isoformat() + "Z",
+            "error_code": error_code,
+            "message": message,
+            "file_url": file_url,
+            "dataset": dataset,
+            "step": step,
+            "details": details or {}
+        }
+        self.errors.append(error)
 
     def _ensure_url_hash(self, frame: pd.DataFrame) -> pd.DataFrame:
         hashes = frame["url"].apply(_sha256)
@@ -78,27 +90,47 @@ def run(self) -> None:
             return
         self._process_files()
         self._concatenate_files()
-        with open(self.data_folder / "errors.json", "w") as f:
-            json.dump(self.errors, f)
+        with open(self.data_folder / "errors.jsonl", "w") as f:
+            for error in self.errors:
+                f.write(json.dumps(error, ensure_ascii=False) + "\n")
 
     def _process_files(self):
         for file_infos in tqdm(self._remaining_to_normalize()):
             if file_infos.format not in LOADER_CLASSES:
                 LOGGER.warning(f"Format {file_infos.format} not supported")
+                self._log_error(
+                    error_code="FORMAT_NOT_SUPPORTED",
+                    message=f"Format {file_infos.format} not supported",
+                    file_url=getattr(file_infos, "url", None),
+                    dataset=self.get_config_key(),
+                    step="process_files",
+                    details={"title": getattr(file_infos, "title", None)}
+                )
                 continue
 
             if file_infos.url is None or pd.isna(file_infos.url):
                 LOGGER.warning(f"URL not specified for file {file_infos.title}")
+                self._log_error(
+                    error_code="URL_NOT_SPECIFIED",
+                    message=f"URL not specified for file {file_infos.title}",
+                    file_url=None,
+                    dataset=self.get_config_key(),
+                    step="process_files",
+                    details={"title": getattr(file_infos, "title", None)}
+                )
                 continue
 
             try:
                 self._process_file(file_infos)
             except Exception as e:
                 LOGGER.warning(f"Failed to process file {file_infos.url}: {e}")
-                self.errors[str(e)].append(file_infos.url)
-
-        with open(self.data_folder / "errors.json", "w") as f:
-            json.dump(self.errors, f)
+                self._log_error(
+                    error_code="PROCESS_FILE_FAILED",
+                    message=str(e),
+                    file_url=getattr(file_infos, "url", None),
+                    dataset=self.get_config_key(),
+                    step="process_files"
+                )
         self._post_process()
         self._concatenate_files()
 
@@ -125,13 +157,24 @@ def _download_file(self, file_metadata: tuple):
             urllib.request.urlretrieve(file_metadata.url, output_filename)
         except HTTPError as error:
             LOGGER.warning(f"Failed to download file {file_metadata.url}: {error}")
-            msg = (
-                f"HTTP error {error.code} while expecting {file_metadata.resource_status} code"
+            msg = f"HTTP error {error.code} while expecting {file_metadata.resource_status} code"
+            self._log_error(
+                error_code="HTTP_ERROR",
+                message=msg,
+                file_url=file_metadata.url,
+                dataset=self.get_config_key(),
+                step="download_file",
+                details={"exception": str(error)}
             )
-            self.errors[msg].append(file_metadata.url)
         except Exception as e:
             LOGGER.warning(f"Failed to download file {file_metadata.url}: {e}")
-            self.errors[str(e)].append(file_metadata.url)
+            self._log_error(
+                error_code="DOWNLOAD_FAILED",
+                message=str(e),
+                file_url=file_metadata.url,
+                dataset=self.get_config_key(),
+                step="download_file"
+            )
         LOGGER.debug(f"Downloaded file {file_metadata.url}")
 
     def _dataset_filename(self, file_metadata: tuple, step: str):
@@ -168,7 +211,13 @@ def _read_parse_file(self, file_metadata: tuple, raw_filename: Path) -> pd.DataF
                 raise RuntimeError("Unable to load file into a DataFrame")
             return df.pipe(self._normalize_frame, file_metadata)
         except Exception as e:
-            self.errors[str(e)].append(raw_filename.parent.name)
+            self._log_error(
+                error_code="LOAD_FAILED",
+                message=str(e),
+                file_url=file_metadata.url,
+                dataset=self.get_config_key(),
+                step="read_parse_file"
+            )
 
     def _normalize_frame(self, df: pd.DataFrame, file_metadata: tuple):
         raise NotImplementedError()
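
For context on the format this patch introduces: each record written by the new JSONL export is one self-contained JSON object per line, so the file can be streamed without loading it whole. A minimal sketch of reading it back, assuming an illustrative path (in practice it is `self.data_folder / "errors.jsonl"` for the dataset at hand):

    import json
    from pathlib import Path

    # Illustrative location; the real path depends on the configured data_folder.
    errors_path = Path("back/data/some_dataset/errors.jsonl")

    with errors_path.open(encoding="utf-8") as f:
        for line in f:
            record = json.loads(line)
            # Keys written by _log_error: timestamp, error_code, message,
            # file_url, dataset, step, details.
            print(record["error_code"], record["file_url"])
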
error_code="HTTP_ERROR", + message=msg, + file_url=file_metadata.url, + dataset=self.get_config_key(), + step="download_file", + details={"exception": str(error)} ) - self.errors[msg].append(file_metadata.url) except Exception as e: LOGGER.warning(f"Failed to download file {file_metadata.url}: {e}") - self.errors[str(e)].append(file_metadata.url) + self._log_error( + error_code="DOWNLOAD_FAILED", + message=str(e), + file_url=file_metadata.url, + dataset=self.get_config_key(), + step="download_file" + ) LOGGER.debug(f"Downloaded file {file_metadata.url}") def _dataset_filename(self, file_metadata: tuple, step: str): @@ -168,7 +211,13 @@ def _read_parse_file(self, file_metadata: tuple, raw_filename: Path) -> pd.DataF raise RuntimeError("Unable to load file into a DataFrame") return df.pipe(self._normalize_frame, file_metadata) except Exception as e: - self.errors[str(e)].append(raw_filename.parent.name) + self._log_error( + error_code="LOAD_FAILED", + message=str(e), + file_url=file_metadata.url, + dataset=self.get_config_key(), + step="read_parse_file" + ) def _normalize_frame(self, df: pd.DataFrame, file_metadata: tuple): raise NotImplementedError() From f823fec497e0a485a9560bb00d0b82cc0483a76b Mon Sep 17 00:00:00 2001 From: Seph75010 Date: Wed, 11 Jun 2025 11:08:58 +0200 Subject: [PATCH 2/3] old error json and new jsonl with logger --- back/scripts/datasets/dataset_aggregator.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/back/scripts/datasets/dataset_aggregator.py b/back/scripts/datasets/dataset_aggregator.py index fc46cc83a..44c97deda 100644 --- a/back/scripts/datasets/dataset_aggregator.py +++ b/back/scripts/datasets/dataset_aggregator.py @@ -15,6 +15,7 @@ from back.scripts.utils.decorators import tracker LOGGER = logging.getLogger(__name__) +ERROR_LOGGER = logging.getLogger("errors_logger") # Nouveau logger structuré def _sha256(s): @@ -57,13 +58,13 @@ def get_output_path(cls, main_config: dict) -> Path: def __init__(self, files: pd.DataFrame, main_config: dict): self._config = main_config[self.get_config_key()] - self.files_in_scope = files.pipe(self._ensure_url_hash) self.data_folder = get_project_base_path() / self._config["data_folder"] self.data_folder.mkdir(parents=True, exist_ok=True) self.output_filename = self.get_output_path(main_config) self.output_filename.parent.mkdir(parents=True, exist_ok=True) + self.errors = [] def _log_error(self, error_code, message, file_url, dataset, step, details=None): @@ -76,7 +77,8 @@ def _log_error(self, error_code, message, file_url, dataset, step, details=None) "step": step, "details": details or {} } - self.errors.append(error) + self.errors.append(error) # Ancien système + ERROR_LOGGER.error(message, extra=error) # Nouveau logger JSONL def _ensure_url_hash(self, frame: pd.DataFrame) -> pd.DataFrame: hashes = frame["url"].apply(_sha256) @@ -90,9 +92,9 @@ def run(self) -> None: return self._process_files() self._concatenate_files() - with open(self.data_folder / "errors.jsonl", "w") as f: - for error in self.errors: - f.write(json.dumps(error, ensure_ascii=False) + "\n") + # Export de errors.json (ancien) + with open(self.data_folder / "errors.json", "w", encoding="utf-8") as f: + json.dump(self.errors, f, indent=2, ensure_ascii=False) def _process_files(self): for file_infos in tqdm(self._remaining_to_normalize()): @@ -266,9 +268,6 @@ def _concatenate_files(self): @property def aggregated_dataset(self): - """ - Property to return the aggregated dataset. 
- """ if not self.output_filename.exists(): raise RuntimeError("Combined file does not exists. You must run .load() first.") return pd.read_parquet(self.output_filename) From 673bd7abe33573945a5bb982583894232b815e9a Mon Sep 17 00:00:00 2001 From: Seph75010 Date: Wed, 11 Jun 2025 11:27:21 +0200 Subject: [PATCH 3/3] config and structured logger manager --- back/config.yaml | 10 +++++- back/scripts/datasets/dataset_aggregator.py | 23 ++++++++----- back/scripts/utils/logger_manager.py | 38 +++++++++++++++++++++ 3 files changed, 61 insertions(+), 10 deletions(-) diff --git a/back/config.yaml b/back/config.yaml index 2e87bcce3..b1d7dfb65 100644 --- a/back/config.yaml +++ b/back/config.yaml @@ -123,6 +123,9 @@ logging: formatters: simple: format: '%(asctime)s - %(name)s - %(levelname)s - %(message)s' + jsonl: + format: '{"timestamp": "%(asctime)s", "error_code": "%(error_code)s", "message": "%(message)s", "file_url": "%(file_url)s", "dataset": "%(dataset)s", "step": "%(step)s", "details": %(details)s}' + datefmt: "%Y-%m-%dT%H:%M:%SZ" handlers: console: class: logging.StreamHandler @@ -134,7 +137,12 @@ logging: level: DEBUG formatter: simple filename: back/data/logs/log.txt + jsonl: + class: logging.FileHandler + level: ERROR + formatter: jsonl + filename: back/data/logs/errors.jsonl loggers: back: level: INFO - handlers: [console, file] \ No newline at end of file + handlers: [console, file, jsonl] diff --git a/back/scripts/datasets/dataset_aggregator.py b/back/scripts/datasets/dataset_aggregator.py index 44c97deda..6dda65a4a 100644 --- a/back/scripts/datasets/dataset_aggregator.py +++ b/back/scripts/datasets/dataset_aggregator.py @@ -1,8 +1,8 @@ +import datetime import hashlib import json import logging import urllib -import datetime from pathlib import Path from urllib.error import HTTPError @@ -75,7 +75,7 @@ def _log_error(self, error_code, message, file_url, dataset, step, details=None) "file_url": file_url, "dataset": dataset, "step": step, - "details": details or {} + "details": details or {}, } self.errors.append(error) # Ancien système ERROR_LOGGER.error(message, extra=error) # Nouveau logger JSONL @@ -106,7 +106,7 @@ def _process_files(self): file_url=getattr(file_infos, "url", None), dataset=self.get_config_key(), step="process_files", - details={"title": getattr(file_infos, "title", None)} + details={"title": getattr(file_infos, "title", None)}, ) continue @@ -118,7 +118,7 @@ def _process_files(self): file_url=None, dataset=self.get_config_key(), step="process_files", - details={"title": getattr(file_infos, "title", None)} + details={"title": getattr(file_infos, "title", None)}, ) continue @@ -131,7 +131,7 @@ def _process_files(self): message=str(e), file_url=getattr(file_infos, "url", None), dataset=self.get_config_key(), - step="process_files" + step="process_files", ) self._post_process() self._concatenate_files() @@ -159,14 +159,16 @@ def _download_file(self, file_metadata: tuple): urllib.request.urlretrieve(file_metadata.url, output_filename) except HTTPError as error: LOGGER.warning(f"Failed to download file {file_metadata.url}: {error}") - msg = f"HTTP error {error.code} while expecting {file_metadata.resource_status} code" + msg = ( + f"HTTP error {error.code} while expecting {file_metadata.resource_status} code" + ) self._log_error( error_code="HTTP_ERROR", message=msg, file_url=file_metadata.url, dataset=self.get_config_key(), step="download_file", - details={"exception": str(error)} + details={"exception": str(error)}, ) except Exception as e: LOGGER.warning(f"Failed to 
diff --git a/back/scripts/utils/logger_manager.py b/back/scripts/utils/logger_manager.py
index 31a3d647f..c83d8393c 100644
--- a/back/scripts/utils/logger_manager.py
+++ b/back/scripts/utils/logger_manager.py
@@ -1,11 +1,49 @@
+import datetime
+import json
 import logging
 import logging.config
 import os
+from pathlib import Path
+
+from back.scripts.utils.config import get_project_base_path
+
+
+class JsonlFormatter(logging.Formatter):
+    def format(self, record):
+        log = {
+            "timestamp": getattr(
+                record, "timestamp", datetime.datetime.utcnow().isoformat() + "Z"
+            ),
+            "error_code": getattr(record, "error_code", "UNKNOWN"),
+            "message": record.getMessage(),
+            "file_url": getattr(record, "file_url", None),
+            "dataset": getattr(record, "dataset", None),
+            "step": getattr(record, "step", None),
+            "details": getattr(record, "details", {}),
+        }
+        return json.dumps(log, ensure_ascii=False)
 
 
 class LoggerManager:
     @staticmethod
     def configure_logger(config):
+        # Main logger configured via dictConfig
         log_directory = os.path.dirname(config["logging"]["handlers"]["file"]["filename"])
         os.makedirs(log_directory, exist_ok=True)
         logging.config.dictConfig(config["logging"])
+
+        # Structured JSONL logger for errors
+        error_log_path = Path(
+            config.get("logging", {}).get(
+                "errors_filename", get_project_base_path() / "errors.jsonl"
+            )
+        )
+        error_log_path.parent.mkdir(parents=True, exist_ok=True)
+
+        handler = logging.FileHandler(error_log_path, mode="a", encoding="utf-8")
+        handler.setFormatter(JsonlFormatter())
+
+        error_logger = logging.getLogger("errors_logger")
+        error_logger.setLevel(logging.ERROR)
+        error_logger.addHandler(handler)
+        error_logger.propagate = False
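
Taken together, a minimal end-to-end sketch of how a `_log_error` record flows through `JsonlFormatter`. This is standalone wiring for illustration only; in the application the handler is installed by `LoggerManager.configure_logger` and writes to errors.jsonl, and all field values below are made up:

    import logging

    from back.scripts.utils.logger_manager import JsonlFormatter

    # Standalone wiring; configure_logger does the equivalent with a FileHandler.
    handler = logging.StreamHandler()
    handler.setFormatter(JsonlFormatter())

    error_logger = logging.getLogger("errors_logger")
    error_logger.setLevel(logging.ERROR)
    error_logger.addHandler(handler)
    error_logger.propagate = False

    # JsonlFormatter reads these attributes off the record and falls back to
    # "UNKNOWN" / None / {} when one is missing, so partial records still serialize.
    error_logger.error(
        "HTTP error 404 while expecting 200 code",
        extra={
            "error_code": "HTTP_ERROR",
            "file_url": "https://example.org/data.csv",
            "dataset": "my_dataset",
            "step": "download_file",
            "details": {"exception": "HTTP Error 404: Not Found"},
        },
    )
    # Emits one JSON object per record on a single line, e.g.:
    # {"timestamp": "2025-06-11T09:27:21Z", "error_code": "HTTP_ERROR", ...}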