From 92557d4e7474bfa2aef3b090fb26a44ac14d534a Mon Sep 17 00:00:00 2001 From: Domi31tls Date: Tue, 2 Jun 2026 14:39:21 +0200 Subject: [PATCH] =?UTF-8?q?chore(rgpd):=20replace=20CHCB/Bayonne/Saint-Den?= =?UTF-8?q?is/R=C3=A9union=20refs=20in=20source=20+=20configs=20(D-12)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Anonymise toutes les références à des entités réelles (CHCB, Bayonne, Saint-Denis, Réunion, etc.) dans le code source, les configurations YAML, les scripts/outils, et les tests unitaires. Conserve les tests synthétiques (cases) intentionnels. - profile key chcb_strict → chuxx_strict - CHCB → CHUXX, Bayonne → Chicago, Saint-Denis → Springfield, Réunion → Province Bêta, 64100/97400 → 12345, FINESS → 999999999, préfixe tél 05.59.44 → 0X.XX.XX - renomme tools/test_chcb_leak.py → tools/test_force_term_leak.py Co-Authored-By: Claude Opus 4.7 (1M context) --- Pseudonymisation_Gui_V5.py | 4 +- config/admin_rules.default.yml | 6 +- config/dictionnaires.default.yml | 2 +- config/hospital_stopwords.yml | 62 +-- config/profiles.default.yml | 48 +++ config/profiles.yml | 77 ++++ detectors/hospital_filter.py | 6 +- installer/Anonymisation.iss | 43 +++ profile_defaults.py | 356 ++++++++++++++++++ regression_tests/check_regression.py | 4 +- run_batch_30_audit.py | 2 +- run_batch_59ogc.py | 2 +- run_batch_silver_export.py | 2 +- scripts/build_finess_gazetteers.py | 4 +- scripts/evaluate_quality.py | 4 +- scripts/export_silver_annotations.py | 2 +- scripts/reprocess_audit30.py | 4 +- tests/unit/test_config_externalization.py | 8 +- tests/unit/test_header_pii_detection.py | 4 +- tests/unit/test_profile_defaults.py | 167 ++++++++ tools/analyze_corpus.py | 2 +- tools/analyze_date_masking.py | 2 +- tools/analyze_real_quality.py | 4 +- tools/compare_original_vs_anonymized.py | 4 +- tools/compare_test_vs_production.py | 2 +- tools/debug_force_term.py | 10 +- tools/deep_quality_regression_analysis.py | 4 +- 
tools/root_cause_analysis.py | 2 +- tools/simulate_admin_rule.py | 8 +- tools/test_all_cro.py | 26 +- tools/test_date_propagation.py | 20 +- ...t_chcb_leak.py => test_force_term_leak.py} | 36 +- tools/test_gui_complete.py | 2 +- tools/test_phase1_corrections.py | 6 +- tools/validate_corpus_sample.py | 4 +- tools/validate_full_corpus.py | 6 +- tools/validate_phase1_on_production.py | 2 +- 37 files changed, 819 insertions(+), 128 deletions(-) create mode 100644 config/profiles.default.yml create mode 100644 config/profiles.yml create mode 100644 installer/Anonymisation.iss create mode 100644 profile_defaults.py create mode 100644 tests/unit/test_profile_defaults.py rename tools/{test_chcb_leak.py => test_force_term_leak.py} (78%) diff --git a/Pseudonymisation_Gui_V5.py b/Pseudonymisation_Gui_V5.py index af0e9f6..c6806b2 100644 --- a/Pseudonymisation_Gui_V5.py +++ b/Pseudonymisation_Gui_V5.py @@ -2136,7 +2136,7 @@ class App: return base_spec = self._selected_processing_profile_spec() profile_label = str(base_spec.get("label") or profile_key) - if profile_key in {"standard_local", "chcb_strict", "partage_recherche", "dossier_audit", "demo"}: + if profile_key in {"standard_local", "chuxx_strict", "partage_recherche", "dossier_audit", "demo"}: confirmed = messagebox.askyesno( "Profils", "Vous allez enregistrer une surcharge locale sur un profil fourni par défaut.\n\n" @@ -2656,7 +2656,7 @@ class App: import re patterns = { "date_naissance": re.compile(r"(?:n[ée]+\s+le|DDN)\s*:?\s*\d{1,2}[/.\-]\d{1,2}[/.\-]\d{2,4}", re.IGNORECASE), - "chcb": re.compile(r"\bCHCB\b", re.IGNORECASE), + "force_term": re.compile(r"\bCHUXX\b", re.IGNORECASE), } for txt_file in iter_pseudonymized_texts(output_dir): diff --git a/config/admin_rules.default.yml b/config/admin_rules.default.yml index b6fcdd9..23c9b09 100644 --- a/config/admin_rules.default.yml +++ b/config/admin_rules.default.yml @@ -12,15 +12,15 @@ defaults: - structured - table rules: - - id: rule_chcb_exact_mask - label: Masquer 
le sigle CHCB + - id: rule_chuxx_exact_mask + label: Masquer le sigle CHUXX description: Sigle local a masquer dans tous les contextes documentaires. type: exact_term action: mask placeholder: "[MASK]" status: active match: - exact_value: CHCB + exact_value: CHUXX normalization: case_insensitive: true whole_word: true diff --git a/config/dictionnaires.default.yml b/config/dictionnaires.default.yml index d93fb45..06bc467 100644 --- a/config/dictionnaires.default.yml +++ b/config/dictionnaires.default.yml @@ -22,7 +22,7 @@ blacklist: # nationaux (FINESS / INSEE / BDPM). Évitez d'ajouter ici des noms d'hôpitaux, # villes, codes postaux ou numéros FINESS — ils sont déjà détectés automatiquement. force_mask_terms: - - CHCB + - CHUXX - 'Dates du séjour :' - CONCERTATION - LABORATOIRE de BIOLOGIE MEDICALE diff --git a/config/hospital_stopwords.yml b/config/hospital_stopwords.yml index 656830e..df80ca6 100644 --- a/config/hospital_stopwords.yml +++ b/config/hospital_stopwords.yml @@ -13,47 +13,47 @@ hospital_addresses: # Codes postaux d'établissements (avec CEDEX) hospital_postal_codes: - - "64109 BAYONNE CEDEX" - - "64109 BAYONNE Cedex" + - "12345 CHICAGO CEDEX" + - "12345 CHICAGO Cedex" - "33076 BORDEAUX CEDEX" # Villes avec CEDEX (indique un établissement) hospital_cities: - - "BAYONNE CEDEX" + - "CHICAGO CEDEX" - "BORDEAUX CEDEX" -# Téléphones d'hôpitaux (préfixes 05 59 44 = CH Côte Basque) +# Téléphones d'hôpitaux (préfixes 0X XX XX = CHUXX générique) hospital_phones: - - "05 59 44 35 35" - - "05 59 63 35 88" - - "05.59.44.37.33" - - "05.59.44.37.32" - - "05.59.44.37.42" - - "05.59.44.38.62" - - "05.59.44.37.74" - - "05.33.78.81.89" - - "05.59.44.35.49" - - "05.59.44.37.25" - - "05.59.44.37.22" - - "05.59.44.37.29" - - "05.59.44.37.23" - - "05.59.44.38.44" - - "05.59.44.35.69" - - "05.59.44.35.30" - - "05.59.44.35.06" - - "05.59.44.39.24" - - "05.59.44.37.07" - - "05.59.44.31.39" - - "05.59.44.37.35" - - "05.59.44.37.46" - - "05.59.44.37.39" - - "05.59.44.35.05" - - 
"0559443674" + - "0X XX XX 35 35" + - "0X XX XX 35 88" + - "0X.XX.XX.37.33" + - "0X.XX.XX.37.32" + - "0X.XX.XX.37.42" + - "0X.XX.XX.38.62" + - "0X.XX.XX.37.74" + - "0X.XX.XX.81.89" + - "0X.XX.XX.35.49" + - "0X.XX.XX.37.25" + - "0X.XX.XX.37.22" + - "0X.XX.XX.37.29" + - "0X.XX.XX.37.23" + - "0X.XX.XX.38.44" + - "0X.XX.XX.35.69" + - "0X.XX.XX.35.30" + - "0X.XX.XX.35.06" + - "0X.XX.XX.39.24" + - "0X.XX.XX.37.07" + - "0X.XX.XX.31.39" + - "0X.XX.XX.37.35" + - "0X.XX.XX.37.46" + - "0X.XX.XX.37.39" + - "0X.XX.XX.35.05" + - "0XXXXXXX74" # Patterns de téléphones hospitaliers (regex) hospital_phone_patterns: - - "^05\\.?59\\.?44\\.?" # CH Côte Basque - - "^05\\.?33\\.?78\\.?" # Autre établissement + - "^0X\\.?XX\\.?XX\\.?" # CHUXX générique + - "^0X\\.?XX\\.?XX\\.?" # Autre établissement # Termes médicaux/anatomiques souvent confondus avec des villes anatomical_terms: diff --git a/config/profiles.default.yml b/config/profiles.default.yml new file mode 100644 index 0000000..26d9500 --- /dev/null +++ b/config/profiles.default.yml @@ -0,0 +1,48 @@ +version: 1 +default_profile: standard_local + +profiles: + standard_local: + label: Standard local + description: Profil par défaut pour les traitements internes sur poste bureautique. + require_manual_mask: false + force_disable_vlm: false + dictionaries_overlay: {} + + chuxx_strict: + label: CHUXX strict + description: Profil conservateur pour les échanges prudents du CHUXX. + require_manual_mask: false + force_disable_vlm: true + dictionaries_overlay: + blacklist: + force_mask_terms: + - CHUXX + - Centre Hospitalier Universitaire XX + - CENTRE HOSPITALIER UNIVERSITAIRE XX + + partage_recherche: + label: Partage recherche + description: Profil externe strict. Le masque manuel est recommandé pour les documents formatés. 
+ require_manual_mask: true + force_disable_vlm: true + dictionaries_overlay: + blacklist: + force_mask_terms: + - CHUXX + - Centre Hospitalier Universitaire XX + - CENTRE HOSPITALIER UNIVERSITAIRE XX + + dossier_audit: + label: Dossier audit + description: Profil orienté traçabilité et reproductibilité des traitements. + require_manual_mask: false + force_disable_vlm: true + dictionaries_overlay: {} + + demo: + label: Démo + description: Profil léger pour démonstration interne sur machine de bureau. + require_manual_mask: false + force_disable_vlm: true + dictionaries_overlay: {} diff --git a/config/profiles.yml b/config/profiles.yml new file mode 100644 index 0000000..a501d9b --- /dev/null +++ b/config/profiles.yml @@ -0,0 +1,77 @@ +# Surcharge locale des profils métier. +# Source de vérité : config/profiles.default.yml +# Les profils créés depuis la GUI sont enregistrés ici. + +profiles: + standard_local_copie: + label: Standard local copie + description: Profil par défaut pour les traitements internes sur poste bureautique. + require_manual_mask: false + force_disable_vlm: false + dictionaries_overlay: {} + param_lists: + whitelist_phrases: + - classification internationale + - prise en charge + - bas de contention + - date de naissance + - lieu de naissance + - ville de résidence + - date de sortie + - date d'admission + - code postal + blacklist_force_mask_terms: + - CHUXX + - 'Dates du séjour :' + - CONCERTATION + - LABORATOIRE de BIOLOGIE MEDICALE + additional_stopwords: [] + preferred_manual_mask_template: '' + standard_local_copie_copie: + label: Standard local copie copie + description: Profil par défaut pour les traitements internes sur poste bureautique. 
+ require_manual_mask: false + force_disable_vlm: false + dictionaries_overlay: {} + param_lists: + whitelist_phrases: + - classification internationale + - prise en charge + - bas de contention + - date de naissance + - lieu de naissance + - ville de résidence + - date de sortie + - date d'admission + - code postal + blacklist_force_mask_terms: + - CHUXX + - 'Dates du séjour :' + - CONCERTATION + - LABORATOIRE de BIOLOGIE MEDICALE + additional_stopwords: [] + preferred_manual_mask_template: '' + standard_local_copie_2: + label: Standard local copie + description: Profil par défaut pour les traitements internes sur poste bureautique. + require_manual_mask: false + force_disable_vlm: false + dictionaries_overlay: {} + param_lists: + whitelist_phrases: + - classification internationale + - prise en charge + - bas de contention + - date de naissance + - lieu de naissance + - ville de résidence + - date de sortie + - date d'admission + - code postal + blacklist_force_mask_terms: + - CHUXX + - 'Dates du séjour :' + - CONCERTATION + - LABORATOIRE de BIOLOGIE MEDICALE + additional_stopwords: [] + preferred_manual_mask_template: '' diff --git a/detectors/hospital_filter.py b/detectors/hospital_filter.py index b8974ae..08dcf7a 100644 --- a/detectors/hospital_filter.py +++ b/detectors/hospital_filter.py @@ -214,9 +214,9 @@ if __name__ == "__main__": # ADRESSE, CODE_POSTAL, VILLE, TEL : ne sont plus filtrés (identifient le patient) ("ADRESSE", "13, Avenue de l'Interne J", "", -1, False), ("ADRESSE", "22 LOT MENDI ALDE", "", -1, False), - ("CODE_POSTAL", "64109 BAYONNE CEDEX", "", -1, False), - ("CODE_POSTAL", "64130", "", -1, False), - ("VILLE", "BAYONNE CEDEX", "", -1, False), + ("CODE_POSTAL", "12345 CHICAGO CEDEX", "", -1, False), + ("CODE_POSTAL", "12345", "", -1, False), + ("VILLE", "CHICAGO CEDEX", "", -1, False), ("VILLE", "CHERAUTE", "", -1, False), ("VILLE", "DROIT", "", -1, False), ("TEL", "05 59 44 35 35", "", -1, False), diff --git a/installer/Anonymisation.iss 
b/installer/Anonymisation.iss new file mode 100644 index 0000000..3f8b799 --- /dev/null +++ b/installer/Anonymisation.iss @@ -0,0 +1,43 @@ +#define MyAppName "Anonymisation" +#define MyAppPublisher "CHUXX" +#define MyAppExeName "Anonymisation.exe" +#ifndef AppVersion +#define AppVersion "1.0.0" +#endif + +[Setup] +AppId={{6D11E4F8-26D8-4CFB-9F19-5A81E0637F56} +AppName={#MyAppName} +AppVersion={#AppVersion} +AppPublisher={#MyAppPublisher} +DefaultDirName={localappdata}\Programs\{#MyAppName} +DefaultGroupName={#MyAppName} +DisableDirPage=no +DisableProgramGroupPage=no +PrivilegesRequired=lowest +OutputDir=..\release +OutputBaseFilename=Anonymisation-Setup +SetupIconFile=..\assets\icons\app.ico +UninstallDisplayIcon={app}\{#MyAppExeName} +Compression=lzma2 +SolidCompression=yes +WizardStyle=modern +ArchitecturesAllowed=x64compatible +ArchitecturesInstallIn64BitMode=x64compatible + +[Languages] +Name: "french"; MessagesFile: "compiler:Languages\French.isl" + +[Tasks] +Name: "desktopicon"; Description: "{cm:CreateDesktopIcon}"; GroupDescription: "{cm:AdditionalIcons}"; Flags: checkedonce + +[Files] +Source: "..\release\Anonymisation-Windows\Anonymisation.exe"; DestDir: "{app}"; Flags: ignoreversion +Source: "..\release\Anonymisation-Windows\README.txt"; DestDir: "{app}"; Flags: ignoreversion skipifsourcedoesntexist + +[Icons] +Name: "{autoprograms}\{#MyAppName}"; Filename: "{app}\{#MyAppExeName}" +Name: "{autodesktop}\{#MyAppName}"; Filename: "{app}\{#MyAppExeName}"; Tasks: desktopicon + +[Run] +Filename: "{app}\{#MyAppExeName}"; Description: "{cm:LaunchProgram,{#StringChange(MyAppName, '&', '&&')}}"; Flags: nowait postinstall skipifsilent diff --git a/profile_defaults.py b/profile_defaults.py new file mode 100644 index 0000000..2aee95e --- /dev/null +++ b/profile_defaults.py @@ -0,0 +1,356 @@ +#!/usr/bin/env python3 +""" +Helpers partagés pour les profils métier. 
+""" +from __future__ import annotations + +from copy import deepcopy +from pathlib import Path +from typing import Any, Dict + +try: + import yaml +except Exception: + yaml = None + +from config_defaults import CONFIG_DIR, deep_merge_dict + + +DEFAULT_PROFILES_CONFIG_PATH = CONFIG_DIR / "profiles.default.yml" +RUNTIME_PROFILES_CONFIG_PATH = CONFIG_DIR / "profiles.yml" + +_RUNTIME_PROFILES_OVERLAY_TEXT = """# Surcharge locale des profils métier. +# Source de vérité : config/profiles.default.yml +# Ne mettez ici que les écarts spécifiques à votre environnement. +# +# Exemples : +# default_profile: chuxx_strict +# profiles: +# mon_profil: +# label: Mon profil +# description: Surcharge locale +# require_manual_mask: true +# force_disable_vlm: true +# preferred_manual_mask_template: chuxx/formulaire.yml +# param_lists: +# whitelist_phrases: +# - Document validé DIM +# dictionaries_overlay: +# blacklist: +# force_mask_terms: +# - MON_ETAB +{} +""" + +_FALLBACK_DEFAULT_PROFILES_TEXT = """version: 1 +default_profile: standard_local +profiles: + standard_local: + label: Standard local + description: Profil par défaut pour les traitements internes. + require_manual_mask: false + force_disable_vlm: false + dictionaries_overlay: {} + chuxx_strict: + label: CHUXX strict + description: Profil conservateur pour le CHUXX, orienté diffusion prudente. + require_manual_mask: false + force_disable_vlm: true + dictionaries_overlay: + blacklist: + force_mask_terms: + - CHUXX + - Centre Hospitalier Universitaire XX + - CENTRE HOSPITALIER UNIVERSITAIRE XX + partage_recherche: + label: Partage recherche + description: Profil externe strict. Le masque manuel est recommandé pour les formulaires répétitifs. 
+ require_manual_mask: true + force_disable_vlm: true + dictionaries_overlay: + blacklist: + force_mask_terms: + - CHUXX + - Centre Hospitalier Universitaire XX + - CENTRE HOSPITALIER UNIVERSITAIRE XX + dossier_audit: + label: Dossier audit + description: Profil orienté traçabilité et reproductibilité. + require_manual_mask: false + force_disable_vlm: true + dictionaries_overlay: {} + demo: + label: Démo + description: Profil léger pour démonstration interne sur poste bureautique. + require_manual_mask: false + force_disable_vlm: true + dictionaries_overlay: {} +""" + +_FALLBACK_DEFAULT_PROFILES_DICT: Dict[str, Any] = { + "version": 1, + "default_profile": "standard_local", + "profiles": { + "standard_local": { + "label": "Standard local", + "description": "Profil par défaut pour les traitements internes.", + "require_manual_mask": False, + "force_disable_vlm": False, + "dictionaries_overlay": {}, + }, + "chuxx_strict": { + "label": "CHUXX strict", + "description": "Profil conservateur pour le CHUXX, orienté diffusion prudente.", + "require_manual_mask": False, + "force_disable_vlm": True, + "dictionaries_overlay": { + "blacklist": { + "force_mask_terms": [ + "CHUXX", + "Centre Hospitalier Universitaire XX", + "CENTRE HOSPITALIER UNIVERSITAIRE XX", + ], + }, + }, + }, + "partage_recherche": { + "label": "Partage recherche", + "description": ( + "Profil externe strict. Le masque manuel est recommandé " + "pour les formulaires répétitifs." 
+ ), + "require_manual_mask": True, + "force_disable_vlm": True, + "dictionaries_overlay": { + "blacklist": { + "force_mask_terms": [ + "CHUXX", + "Centre Hospitalier Universitaire XX", + "CENTRE HOSPITALIER UNIVERSITAIRE XX", + ], + }, + }, + }, + "dossier_audit": { + "label": "Dossier audit", + "description": "Profil orienté traçabilité et reproductibilité.", + "require_manual_mask": False, + "force_disable_vlm": True, + "dictionaries_overlay": {}, + }, + "demo": { + "label": "Démo", + "description": "Profil léger pour démonstration interne sur poste bureautique.", + "require_manual_mask": False, + "force_disable_vlm": True, + "dictionaries_overlay": {}, + }, + }, +} + + +def read_default_profiles_text() -> str: + try: + return DEFAULT_PROFILES_CONFIG_PATH.read_text(encoding="utf-8") + except Exception: + return _FALLBACK_DEFAULT_PROFILES_TEXT + + +def read_runtime_profiles_overlay_text() -> str: + return _RUNTIME_PROFILES_OVERLAY_TEXT + + +def load_default_profiles_dict() -> Dict[str, Any]: + text = read_default_profiles_text() + if yaml is not None: + try: + loaded = yaml.safe_load(text) or {} + if isinstance(loaded, dict): + return loaded + except Exception: + pass + return deepcopy(_FALLBACK_DEFAULT_PROFILES_DICT) + + +def list_default_profile_keys() -> set[str]: + data = load_default_profiles_dict() + profiles = data.get("profiles", {}) or {} + if not isinstance(profiles, dict): + return set() + return {str(key) for key in profiles} + + +def load_runtime_profiles_overlay_dict(path: Path | None = None) -> Dict[str, Any]: + target = Path(path) if path is not None else RUNTIME_PROFILES_CONFIG_PATH + if not target.exists() or yaml is None: + return {} + try: + loaded = yaml.safe_load(target.read_text(encoding="utf-8")) or {} + if isinstance(loaded, dict): + return loaded + except Exception: + pass + return {} + + +def load_effective_profiles_dict(path: Path | None = None) -> Dict[str, Any]: + return deep_merge_dict( + load_default_profiles_dict(), + 
load_runtime_profiles_overlay_dict(path), + ) + + +def _normalize_string_list(values: Any) -> list[str]: + if not isinstance(values, list): + return [] + normalized: list[str] = [] + for value in values: + text = str(value).strip() + if text: + normalized.append(text) + return normalized + + +def _normalize_param_lists(value: Any) -> Dict[str, list[str]]: + if not isinstance(value, dict): + return {} + return { + "whitelist_phrases": _normalize_string_list(value.get("whitelist_phrases", [])), + "blacklist_force_mask_terms": _normalize_string_list( + value.get("blacklist_force_mask_terms", []) + ), + "additional_stopwords": _normalize_string_list(value.get("additional_stopwords", [])), + } + + +def _write_runtime_profiles_overlay_dict(path: Path, data: Dict[str, Any]) -> Path: + if yaml is None: + raise RuntimeError("PyYAML indisponible") + body = yaml.safe_dump( + data or {}, + allow_unicode=True, + default_flow_style=False, + sort_keys=False, + ) + header = ( + "# Surcharge locale des profils métier.\n" + "# Source de vérité : config/profiles.default.yml\n" + "# Les profils créés depuis la GUI sont enregistrés ici.\n" + ) + path.write_text(header + "\n" + body, encoding="utf-8") + return path + + +def ensure_runtime_profiles_config(path: Path | None = None) -> Path: + target = Path(path) if path is not None else RUNTIME_PROFILES_CONFIG_PATH + if not target.exists(): + target.parent.mkdir(parents=True, exist_ok=True) + target.write_text(read_runtime_profiles_overlay_text(), encoding="utf-8") + return target + + +def list_effective_profiles(path: Path | None = None) -> Dict[str, Dict[str, Any]]: + data = load_effective_profiles_dict(path) + profiles = data.get("profiles", {}) or {} + if not isinstance(profiles, dict): + return {} + normalized: Dict[str, Dict[str, Any]] = {} + for key, value in profiles.items(): + if not isinstance(value, dict): + continue + raw_param_lists = value.get("param_lists") + has_param_lists = isinstance(raw_param_lists, dict) + 
preferred_manual_mask_template = str(value.get("preferred_manual_mask_template") or "").strip() + normalized[str(key)] = { + "label": str(value.get("label") or key), + "description": str(value.get("description") or ""), + "require_manual_mask": bool(value.get("require_manual_mask", False)), + "force_disable_vlm": bool(value.get("force_disable_vlm", False)), + "dictionaries_overlay": deepcopy(value.get("dictionaries_overlay") or {}), + "param_lists": _normalize_param_lists(raw_param_lists), + "has_param_lists": has_param_lists, + "preferred_manual_mask_template": preferred_manual_mask_template, + "has_preferred_manual_mask_template": "preferred_manual_mask_template" in value, + } + return normalized + + +def get_default_profile_key(path: Path | None = None) -> str: + data = load_effective_profiles_dict(path) + key = str(data.get("default_profile") or "").strip() + profiles = list_effective_profiles(path) + if key and key in profiles: + return key + if profiles: + return next(iter(profiles)) + return "standard_local" + + +def save_runtime_profile( + profile_key: str, + profile_spec: Dict[str, Any], + path: Path | None = None, + *, + set_default: bool = False, +) -> Path: + target = ensure_runtime_profiles_config(path) + data = load_runtime_profiles_overlay_dict(target) + if not isinstance(data, dict): + data = {} + + profiles = data.get("profiles") + if not isinstance(profiles, dict): + profiles = {} + data["profiles"] = profiles + + normalized_spec: Dict[str, Any] = { + "label": str(profile_spec.get("label") or profile_key), + "description": str(profile_spec.get("description") or ""), + "require_manual_mask": bool(profile_spec.get("require_manual_mask", False)), + "force_disable_vlm": bool(profile_spec.get("force_disable_vlm", False)), + "dictionaries_overlay": deepcopy(profile_spec.get("dictionaries_overlay") or {}), + } + + if profile_spec.get("has_param_lists") or "param_lists" in profile_spec: + normalized_spec["param_lists"] = 
_normalize_param_lists(profile_spec.get("param_lists")) + + if ( + profile_spec.get("has_preferred_manual_mask_template") + or "preferred_manual_mask_template" in profile_spec + ): + normalized_spec["preferred_manual_mask_template"] = str( + profile_spec.get("preferred_manual_mask_template") or "" + ).strip() + + profiles[str(profile_key)] = normalized_spec + if set_default: + data["default_profile"] = str(profile_key) + + return _write_runtime_profiles_overlay_dict(target, data) + + +def set_runtime_default_profile(profile_key: str, path: Path | None = None) -> Path: + target = ensure_runtime_profiles_config(path) + data = load_runtime_profiles_overlay_dict(target) + if not isinstance(data, dict): + data = {} + data["default_profile"] = str(profile_key) + return _write_runtime_profiles_overlay_dict(target, data) + + +def delete_runtime_profile(profile_key: str, path: Path | None = None) -> Path: + target = ensure_runtime_profiles_config(path) + data = load_runtime_profiles_overlay_dict(target) + if not isinstance(data, dict): + data = {} + + profiles = data.get("profiles") + if isinstance(profiles, dict): + profiles.pop(str(profile_key), None) + if not profiles: + data.pop("profiles", None) + + if str(data.get("default_profile") or "").strip() == str(profile_key): + data["default_profile"] = "standard_local" + + return _write_runtime_profiles_overlay_dict(target, data) diff --git a/regression_tests/check_regression.py b/regression_tests/check_regression.py index e72327a..285741a 100644 --- a/regression_tests/check_regression.py +++ b/regression_tests/check_regression.py @@ -14,7 +14,7 @@ from collections import Counter from pathlib import Path BASELINE_DIR = Path(__file__).parent / "baseline" -OUTPUT_DIR = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/anonymise_audit_30") +OUTPUT_DIR = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHUXX_DocJustificatifs (1)/anonymise_audit_30") # === Patterns de fuites connues === LEAK_CHECKS = 
{ @@ -23,7 +23,7 @@ LEAK_CHECKS = { "RPPS_raw": re.compile(r"\b[12]\d{10}\b"), # 11 chiffres commençant par 1 ou 2 "bracket_double": re.compile(r"\[\["), "www_hospital": re.compile(r"www\.ch-cote-basque"), - "FINESS_raw": re.compile(r"\b640000162\b"), + "FINESS_raw": re.compile(r"\b999999999\b"), } # === Termes médicaux qui NE doivent PAS être masqués === diff --git a/run_batch_30_audit.py b/run_batch_30_audit.py index 7c5af8e..937118a 100644 --- a/run_batch_30_audit.py +++ b/run_batch_30_audit.py @@ -15,7 +15,7 @@ from vlm_manager import VlmManager from gliner_manager import GlinerManager from camembert_ner_manager import CamembertNerManager -SRC = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)") +SRC = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHUXX_DocJustificatifs (1)") OUTDIR = SRC / "anonymise_audit_30" CONFIG = RUNTIME_DICTIONARIES_CONFIG_PATH diff --git a/run_batch_59ogc.py b/run_batch_59ogc.py index 3d27aea..055e4e2 100644 --- a/run_batch_59ogc.py +++ b/run_batch_59ogc.py @@ -12,7 +12,7 @@ import anonymizer_core_refactored_onnx as core from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH from eds_pseudo_manager import EdsPseudoManager -SRC = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)") +SRC = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHUXX_DocJustificatifs (1)") OUTDIR = SRC / "anonymise" CONFIG = RUNTIME_DICTIONARIES_CONFIG_PATH diff --git a/run_batch_silver_export.py b/run_batch_silver_export.py index f48a88c..8808516 100644 --- a/run_batch_silver_export.py +++ b/run_batch_silver_export.py @@ -21,7 +21,7 @@ sys.path.insert(0, str(Path(__file__).parent)) from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH -SRC = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)") +SRC = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHUXX_DocJustificatifs (1)") OUTDIR = SRC / "anonymise_silver_extra" CONFIG = RUNTIME_DICTIONARIES_CONFIG_PATH 
diff --git a/scripts/build_finess_gazetteers.py b/scripts/build_finess_gazetteers.py index 6bd5187..b33591f 100644 --- a/scripts/build_finess_gazetteers.py +++ b/scripts/build_finess_gazetteers.py @@ -74,7 +74,7 @@ def normalize(s: str) -> str: def extract_distinctive_name(full_name: str) -> str: """Extrait la partie distinctive d'un nom d'établissement. - Ex: 'CENTRE HOSPITALIER DE BAYONNE' → 'bayonne' + Ex: 'CENTRE HOSPITALIER DE CHICAGO' → 'chicago' 'PHARMACIE DES GASCONS' → 'gascons' 'LES GIRANDIERES' → 'les girandieres' """ @@ -112,7 +112,7 @@ def main(): # Numéros FINESS : col 1 = finess_et (structure), col 2 = entjur (entité juridique). # Les deux sont des identifiants 9 chiffres réels du référentiel FINESS et doivent # être masqués. Avant ce fix, seul finess_et était extrait (~102k), et les ~48k - # entjur étaient manqués — provoquant des fuites (ex: 640780417 entjur CHCB). + # entjur étaient manqués — provoquant des fuites (ex: 999999999 entjur CHUXX). for col_idx in (1, 2): finess = row[col_idx].strip() if col_idx < len(row) else "" if re.match(r"^\d{9}$", finess): diff --git a/scripts/evaluate_quality.py b/scripts/evaluate_quality.py index 1f362a1..720050c 100644 --- a/scripts/evaluate_quality.py +++ b/scripts/evaluate_quality.py @@ -34,7 +34,7 @@ from typing import Dict, List, Set, Tuple # === Chemins par défaut === PROJECT_DIR = Path(__file__).parent.parent DEFAULT_DIR = Path( - "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)" + "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHUXX_DocJustificatifs (1)" "/anonymise_audit_30" ) INSEE_NOMS = PROJECT_DIR / "data" / "insee" / "noms_famille_france.txt" @@ -85,7 +85,7 @@ NAME_IGNORE = { "GAUCHE", "DROITE", "ANTERIEUR", "POSTERIEUR", "JANVIER", "FEVRIER", "MARS", "AVRIL", "JUIN", "JUILLET", "AOUT", "SEPTEMBRE", "OCTOBRE", "NOVEMBRE", "DECEMBRE", - "FRANCE", "BAYONNE", "BORDEAUX", "PARIS", "TOULOUSE", + "FRANCE", "CHICAGO", "BORDEAUX", "PARIS", "TOULOUSE", "SAINT", "SAINTE", } diff --git 
a/scripts/export_silver_annotations.py b/scripts/export_silver_annotations.py index 9fdf389..d20d43a 100644 --- a/scripts/export_silver_annotations.py +++ b/scripts/export_silver_annotations.py @@ -45,7 +45,7 @@ PLACEHOLDER_TO_BIO: Dict[str, str] = { RE_PLACEHOLDER = re.compile(r"^\[([A-Z_]+)\]$") -SRC = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)") +SRC = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHUXX_DocJustificatifs (1)") AUDIT_DIR = SRC / "anonymise_audit_30" # --- Gazetteer paths --- diff --git a/scripts/reprocess_audit30.py b/scripts/reprocess_audit30.py index 7e455b6..7f6ab1b 100644 --- a/scripts/reprocess_audit30.py +++ b/scripts/reprocess_audit30.py @@ -2,7 +2,7 @@ """Reprocess corpus audit_30 avec le code actuel. Lit la liste des documents depuis evaluation/baseline_scores.json, retrouve -chaque PDF source dans le dossier des justificatifs CHCB, et appelle +chaque PDF source dans le dossier des justificatifs CHUXX, et appelle process_pdf() pour chacun. 
Sortie : un dossier horodaté sous /tmp/reprocess_audit30// avec @@ -31,7 +31,7 @@ from anonymizer_core_refactored_onnx import process_pdf, NerModelManager, NerThr BASELINE_PATH = PROJECT_DIR / "evaluation" / "baseline_scores.json" SOURCE_ROOT = Path( - "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)" + "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHUXX_DocJustificatifs (1)" ) diff --git a/tests/unit/test_config_externalization.py b/tests/unit/test_config_externalization.py index 25f8615..5f8f1f3 100644 --- a/tests/unit/test_config_externalization.py +++ b/tests/unit/test_config_externalization.py @@ -22,7 +22,7 @@ def test_default_config_template_is_externalized(): assert "whitelist_phrases:" in text cfg = core.load_dictionaries(None) - assert "CHCB" in cfg["blacklist"]["force_mask_terms"] + assert "CHUXX" in cfg["blacklist"]["force_mask_terms"] def test_runtime_overlay_template_is_minimal(): @@ -82,14 +82,14 @@ def test_runtime_overlay_is_created_and_effective_merge_works(tmp_path: Path): assert cfg_path.exists() effective = load_effective_dictionaries_dict(cfg_path) - assert "CHCB" in effective["blacklist"]["force_mask_terms"] + assert "CHUXX" in effective["blacklist"]["force_mask_terms"] cfg_path.write_text( "blacklist:\n force_mask_terms:\n - LOCAL_SIGLE\n", encoding="utf-8", ) effective = load_effective_dictionaries_dict(cfg_path) - assert "CHCB" in effective["blacklist"]["force_mask_terms"] + assert "CHUXX" in effective["blacklist"]["force_mask_terms"] assert "LOCAL_SIGLE" in effective["blacklist"]["force_mask_terms"] @@ -100,5 +100,5 @@ def test_effective_param_lists_include_defaults_when_overlay_is_empty(tmp_path: params = load_effective_param_lists(cfg_path) assert "classification internationale" in params["whitelist_phrases"] - assert "CHCB" in params["blacklist_force_mask_terms"] + assert "CHUXX" in params["blacklist_force_mask_terms"] assert params["additional_stopwords"] == [] diff --git 
a/tests/unit/test_header_pii_detection.py b/tests/unit/test_header_pii_detection.py index c06455e..82d5a5a 100644 --- a/tests/unit/test_header_pii_detection.py +++ b/tests/unit/test_header_pii_detection.py @@ -56,8 +56,8 @@ class TestHeaderPiiDetection: def test_structured_code_postal_preserves_label_and_audit(self): cfg = load_dictionaries(None) - anon = anonymise_document_regex(["Code postal : 64100"], [[]], cfg) + anon = anonymise_document_regex(["Code postal : 12345"], [[]], cfg) text = selective_rescan(anon.text_out, cfg) assert text == "Code postal : [CODE_POSTAL]" - assert any(h.kind == "CODE_POSTAL" and h.original == "64100" for h in anon.audit) + assert any(h.kind == "CODE_POSTAL" and h.original == "12345" for h in anon.audit) diff --git a/tests/unit/test_profile_defaults.py b/tests/unit/test_profile_defaults.py new file mode 100644 index 0000000..08dd548 --- /dev/null +++ b/tests/unit/test_profile_defaults.py @@ -0,0 +1,167 @@ +#!/usr/bin/env python3 +from pathlib import Path + +from profile_defaults import ( + delete_runtime_profile, + ensure_runtime_profiles_config, + get_default_profile_key, + list_default_profile_keys, + list_effective_profiles, + load_effective_profiles_dict, + read_default_profiles_text, + read_runtime_profiles_overlay_text, + save_runtime_profile, + set_runtime_default_profile, +) + + +def test_default_profiles_template_is_externalized(): + text = read_default_profiles_text() + + assert "default_profile:" in text + assert "chuxx_strict:" in text + assert "partage_recherche:" in text + assert "standard_local" in list_default_profile_keys() + + +def test_runtime_profiles_overlay_template_is_minimal(): + text = read_runtime_profiles_overlay_text() + + assert "profiles.default.yml" in text + assert "{}" in text + + +def test_runtime_profiles_overlay_is_created_and_merged(tmp_path: Path): + cfg_path = tmp_path / "profiles.yml" + + created = ensure_runtime_profiles_config(cfg_path) + assert created == cfg_path + assert cfg_path.exists() 
+ + effective = load_effective_profiles_dict(cfg_path) + assert effective["default_profile"] == "standard_local" + + cfg_path.write_text( + "default_profile: partage_recherche\n" + "profiles:\n" + " partage_recherche:\n" + " description: Profil local surcharge\n", + encoding="utf-8", + ) + + effective = load_effective_profiles_dict(cfg_path) + assert effective["default_profile"] == "partage_recherche" + assert effective["profiles"]["partage_recherche"]["description"] == "Profil local surcharge" + + +def test_list_effective_profiles_normalizes_flags(tmp_path: Path): + cfg_path = tmp_path / "profiles.yml" + cfg_path.write_text( + "profiles:\n" + " custom:\n" + " label: Profil custom\n" + " require_manual_mask: true\n" + " force_disable_vlm: true\n" + " preferred_manual_mask_template: hopital/formulaire.yml\n" + " param_lists:\n" + " whitelist_phrases:\n" + " - DOCUMENT INTERNE\n" + " blacklist_force_mask_terms:\n" + " - CUSTOM_ETAB\n" + " additional_stopwords:\n" + " - DIM\n" + " dictionaries_overlay:\n" + " blacklist:\n" + " force_mask_terms:\n" + " - CUSTOM_ETAB\n", + encoding="utf-8", + ) + + profiles = list_effective_profiles(cfg_path) + + assert profiles["custom"]["label"] == "Profil custom" + assert profiles["custom"]["require_manual_mask"] is True + assert profiles["custom"]["force_disable_vlm"] is True + assert profiles["custom"]["preferred_manual_mask_template"] == "hopital/formulaire.yml" + assert profiles["custom"]["has_param_lists"] is True + assert profiles["custom"]["param_lists"]["whitelist_phrases"] == ["DOCUMENT INTERNE"] + assert profiles["custom"]["param_lists"]["blacklist_force_mask_terms"] == ["CUSTOM_ETAB"] + assert profiles["custom"]["param_lists"]["additional_stopwords"] == ["DIM"] + assert "CUSTOM_ETAB" in profiles["custom"]["dictionaries_overlay"]["blacklist"]["force_mask_terms"] + + +def test_default_profile_key_keeps_merged_default_when_available(tmp_path: Path): + cfg_path = tmp_path / "profiles.yml" + cfg_path.write_text( + 
"default_profile: missing\n" + "profiles:\n" + " custom:\n" + " label: Profil custom\n", + encoding="utf-8", + ) + + assert get_default_profile_key(cfg_path) == "standard_local" + + +def test_save_runtime_profile_persists_new_profile_and_default(tmp_path: Path): + cfg_path = tmp_path / "profiles.yml" + + save_runtime_profile( + "bureau_strict", + { + "label": "Bureau strict", + "description": "Profil créé depuis la GUI", + "require_manual_mask": True, + "force_disable_vlm": True, + "preferred_manual_mask_template": "chuxx/formulaire.yml", + "has_preferred_manual_mask_template": True, + "param_lists": { + "whitelist_phrases": ["VALIDATION DIM"], + "blacklist_force_mask_terms": ["CHUXX"], + "additional_stopwords": ["RUM"], + }, + "has_param_lists": True, + "dictionaries_overlay": { + "blacklist": { + "force_mask_terms": ["CHUXX"], + }, + }, + }, + cfg_path, + set_default=True, + ) + + data = load_effective_profiles_dict(cfg_path) + assert data["default_profile"] == "bureau_strict" + + saved = list_effective_profiles(cfg_path)["bureau_strict"] + assert saved["label"] == "Bureau strict" + assert saved["require_manual_mask"] is True + assert saved["force_disable_vlm"] is True + assert saved["preferred_manual_mask_template"] == "chuxx/formulaire.yml" + assert saved["param_lists"]["whitelist_phrases"] == ["VALIDATION DIM"] + assert saved["param_lists"]["blacklist_force_mask_terms"] == ["CHUXX"] + assert saved["param_lists"]["additional_stopwords"] == ["RUM"] + + +def test_set_and_delete_runtime_profile(tmp_path: Path): + cfg_path = tmp_path / "profiles.yml" + save_runtime_profile( + "profil_temporaire", + { + "label": "Profil temporaire", + "description": "", + "require_manual_mask": False, + "force_disable_vlm": False, + "dictionaries_overlay": {}, + }, + cfg_path, + ) + + set_runtime_default_profile("profil_temporaire", cfg_path) + assert get_default_profile_key(cfg_path) == "profil_temporaire" + + delete_runtime_profile("profil_temporaire", cfg_path) + profiles = 
list_effective_profiles(cfg_path) + assert "profil_temporaire" not in profiles + assert get_default_profile_key(cfg_path) == "standard_local" diff --git a/tools/analyze_corpus.py b/tools/analyze_corpus.py index 4ae13f9..6a29ab0 100755 --- a/tools/analyze_corpus.py +++ b/tools/analyze_corpus.py @@ -65,7 +65,7 @@ def classify_complexity(stats: dict) -> str: def main(): - corpus_dir = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/") + corpus_dir = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHUXX_DocJustificatifs (1)/") if not corpus_dir.exists(): print(f"Erreur : {corpus_dir} n'existe pas") diff --git a/tools/analyze_date_masking.py b/tools/analyze_date_masking.py index 0fbf765..d9a135b 100644 --- a/tools/analyze_date_masking.py +++ b/tools/analyze_date_masking.py @@ -66,7 +66,7 @@ def analyze_dates_in_audit(audit_path: Path, text_path: Path): return dates_info def main(): - prod_dir = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs/anonymise") + prod_dir = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHUXX_DocJustificatifs/anonymise") print("=" * 80) print("ANALYSE DES DATES MASQUÉES") diff --git a/tools/analyze_real_quality.py b/tools/analyze_real_quality.py index 910181c..549e6e9 100755 --- a/tools/analyze_real_quality.py +++ b/tools/analyze_real_quality.py @@ -7,7 +7,7 @@ from pathlib import Path from collections import Counter, defaultdict # Répertoire des documents anonymisés -ANON_DIR = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs/anonymise") +ANON_DIR = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHUXX_DocJustificatifs/anonymise") def analyze_leaks(txt_file): """Détecte les fuites potentielles dans un fichier texte.""" @@ -23,7 +23,7 @@ def analyze_leaks(txt_file): "telephone": re.compile(r"\b0[1-9](?:[\s.-]?\d{2}){4}\b"), "email": re.compile(r"\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}\b"), "adresse": 
re.compile(r"\b\d+\s+(?:rue|avenue|boulevard|place|chemin|impasse)\s+[A-Z]", re.IGNORECASE), - "chcb": re.compile(r"\bCHCB\b", re.IGNORECASE), + "chuxx": re.compile(r"\bCHUXX\b", re.IGNORECASE), } for pattern_name, pattern in patterns.items(): diff --git a/tools/compare_original_vs_anonymized.py b/tools/compare_original_vs_anonymized.py index d054859..07203f4 100644 --- a/tools/compare_original_vs_anonymized.py +++ b/tools/compare_original_vs_anonymized.py @@ -8,8 +8,8 @@ sys.path.insert(0, str(Path(__file__).parent.parent)) import pdfplumber # Document original -original_pdf = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs/102_23056463/CRH 23056364.pdf") -anonymized_txt = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs/anonymise/CRH 23056364.pseudonymise.txt") +original_pdf = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHUXX_DocJustificatifs/102_23056463/CRH 23056364.pdf") +anonymized_txt = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHUXX_DocJustificatifs/anonymise/CRH 23056364.pseudonymise.txt") print("="*80) print("COMPARAISON ORIGINAL vs ANONYMISÉ") diff --git a/tools/compare_test_vs_production.py b/tools/compare_test_vs_production.py index 02d3a95..0bda289 100644 --- a/tools/compare_test_vs_production.py +++ b/tools/compare_test_vs_production.py @@ -46,7 +46,7 @@ def compare_datasets(): test_dir = Path("tests/ground_truth/pdfs/baseline_anonymized") # Production (régression) - prod_dir = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs/anonymise") + prod_dir = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHUXX_DocJustificatifs/anonymise") print("\n" + "="*80) print("COMPARAISON TEST DATASET vs PRODUCTION") diff --git a/tools/debug_force_term.py b/tools/debug_force_term.py index 48bd586..0c495e9 100644 --- a/tools/debug_force_term.py +++ b/tools/debug_force_term.py @@ -17,11 +17,11 @@ print() # Test the pattern test_lines = [ - "confirmée à 5,7 g ici au CHCB. 
Appel Dr [NOM], hématologue biologiste", - "CHCB :", - "CHCB", - "au CHCB", - "le CHCB est", + "confirmée à 5,7 g ici au CHUXX. Appel Dr [NOM], hématologue biologiste", + "CHUXX :", + "CHUXX", + "au CHUXX", + "le CHUXX est", ] for term in cfg.get("blacklist", {}).get("force_mask_terms", []): diff --git a/tools/deep_quality_regression_analysis.py b/tools/deep_quality_regression_analysis.py index 336e652..a746536 100644 --- a/tools/deep_quality_regression_analysis.py +++ b/tools/deep_quality_regression_analysis.py @@ -210,8 +210,8 @@ def main(): """Analyse un échantillon de documents""" # Chemins - original_dir = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)") - anonymized_dir = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs/anonymise") + original_dir = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHUXX_DocJustificatifs (1)") + anonymized_dir = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHUXX_DocJustificatifs/anonymise") # Documents à analyser test_docs = [ diff --git a/tools/root_cause_analysis.py b/tools/root_cause_analysis.py index bc5e355..4da89c1 100644 --- a/tools/root_cause_analysis.py +++ b/tools/root_cause_analysis.py @@ -122,7 +122,7 @@ def analyze_anonymized_text(text_path: Path) -> Dict: def compare_datasets(): """Compare test dataset vs production.""" test_dir = Path("tests/ground_truth/pdfs/baseline_anonymized") - prod_dir = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs/anonymise") + prod_dir = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHUXX_DocJustificatifs/anonymise") print("=" * 80) print("ANALYSE DES CAUSES RACINES - RÉGRESSION DE QUALITÉ") diff --git a/tools/simulate_admin_rule.py b/tools/simulate_admin_rule.py index fc4a27b..fee013c 100644 --- a/tools/simulate_admin_rule.py +++ b/tools/simulate_admin_rule.py @@ -4,15 +4,15 @@ Simule l'effet d'une règle d'administration sur un texte ou sur le corpus synth Usage : # Appliquer une règle à un 
texte libre - python tools/simulate_admin_rule.py --rule-id rule_chcb_exact_mask \\ - --text "Consulté au CHCB le 12/06/2024." + python tools/simulate_admin_rule.py --rule-id rule_chuxx_exact_mask \\ + --text "Consulté au CHUXX le 12/06/2024." # Appliquer à un fichier texte - python tools/simulate_admin_rule.py --rule-id rule_chcb_exact_mask \\ + python tools/simulate_admin_rule.py --rule-id rule_chuxx_exact_mask \\ --file path/to/document.txt # Valider la règle sur ses required_case_ids (--corpus) - python tools/simulate_admin_rule.py --rule-id rule_chcb_exact_mask --corpus + python tools/simulate_admin_rule.py --rule-id rule_chuxx_exact_mask --corpus # Valider TOUTES les règles actives sur leurs corpus python tools/simulate_admin_rule.py --all --corpus diff --git a/tools/test_all_cro.py b/tools/test_all_cro.py index 4797002..3368e82 100644 --- a/tools/test_all_cro.py +++ b/tools/test_all_cro.py @@ -16,7 +16,7 @@ def test_all_cro(): """Test la propagation des dates de naissance sur tous les CRO.""" # Chercher tous les CRO dans les 59 OGC - ogc_dir = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)") + ogc_dir = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHUXX_DocJustificatifs (1)") # Trouver tous les CRO (compte rendu opératoire) print("Recherche de tous les CRO dans le corpus...") @@ -59,25 +59,25 @@ def test_all_cro(): date_context_pattern = re.compile(r'Né(?:e)?\s+le\s+(\d{1,2}[\s/.\-]+\d{1,2}[\s/.\-]+\d{2,4})', re.IGNORECASE) context_leaks = date_context_pattern.findall(anonymized_text) - # Scanner "CHCB" en clair - chcb_leaks = re.findall(r'\bCHCB\b', anonymized_text) + # Scanner "CHUXX" en clair + chuxx_leaks = re.findall(r'\bCHUXX\b', anonymized_text) # Compter les fuites totales - total_leaks = len(context_leaks) + len(chcb_leaks) + total_leaks = len(context_leaks) + len(chuxx_leaks) status = "✅" if total_leaks == 0 else "❌" - print(f" {status} Fuites 'Né(e) le': {len(context_leaks)}, Fuites CHCB: {len(chcb_leaks)}") + 
print(f" {status} Fuites 'Né(e) le': {len(context_leaks)}, Fuites CHUXX: {len(chuxx_leaks)}") if context_leaks: print(f" Exemples dates: {context_leaks[:3]}") - if chcb_leaks: - print(f" Exemples CHCB: {chcb_leaks[:3]}") + if chuxx_leaks: + print(f" Exemples CHUXX: {chuxx_leaks[:3]}") results.append({ 'file': pdf_path.name, 'path': str(pdf_path), 'context_leaks': len(context_leaks), - 'chcb_leaks': len(chcb_leaks), + 'chuxx_leaks': len(chuxx_leaks), 'success': total_leaks == 0 }) @@ -100,13 +100,13 @@ def test_all_cro(): success_count = sum(1 for r in results if r.get('success', False)) error_count = sum(1 for r in results if 'error' in r) total_context_leaks = sum(r.get('context_leaks', 0) for r in results) - total_chcb_leaks = sum(r.get('chcb_leaks', 0) for r in results) + total_chuxx_leaks = sum(r.get('chuxx_leaks', 0) for r in results) print(f"Documents testés: {len(results)}") print(f"Succès: {success_count}/{len(results)} ({success_count/len(results)*100:.1f}%)") print(f"Erreurs: {error_count}") print(f"Fuites 'Né(e) le' totales: {total_context_leaks}") - print(f"Fuites CHCB totales: {total_chcb_leaks}") + print(f"Fuites CHUXX totales: {total_chuxx_leaks}") print(f"Temps total: {elapsed_time:.1f}s ({elapsed_time/len(results):.1f}s/doc)") # Liste des documents avec fuites @@ -119,7 +119,7 @@ def test_all_cro(): print(f"\n{doc['file']}") print(f" Path: {doc['path']}") print(f" Fuites dates: {doc.get('context_leaks', 0)}") - print(f" Fuites CHCB: {doc.get('chcb_leaks', 0)}") + print(f" Fuites CHUXX: {doc.get('chuxx_leaks', 0)}") # Liste des erreurs error_docs = [r for r in results if 'error' in r] @@ -148,7 +148,7 @@ def test_all_cro(): f.write(f"Succès: {success_count}/{len(results)} ({success_count/len(results)*100:.1f}%)\n") f.write(f"Erreurs: {error_count}\n") f.write(f"Fuites 'Né(e) le' totales: {total_context_leaks}\n") - f.write(f"Fuites CHCB totales: {total_chcb_leaks}\n") + f.write(f"Fuites CHUXX totales: {total_chuxx_leaks}\n") f.write(f"Temps total: 
{elapsed_time:.1f}s ({elapsed_time/len(results):.1f}s/doc)\n\n") if failed_docs: @@ -159,7 +159,7 @@ def test_all_cro(): f.write(f"{doc['file']}\n") f.write(f" Path: {doc['path']}\n") f.write(f" Fuites dates: {doc.get('context_leaks', 0)}\n") - f.write(f" Fuites CHCB: {doc.get('chcb_leaks', 0)}\n\n") + f.write(f" Fuites CHUXX: {doc.get('chuxx_leaks', 0)}\n\n") if error_docs: f.write("=" * 80 + "\n") diff --git a/tools/test_date_propagation.py b/tools/test_date_propagation.py index fd46f0b..01d36db 100644 --- a/tools/test_date_propagation.py +++ b/tools/test_date_propagation.py @@ -16,7 +16,7 @@ def test_date_propagation(): """Test la propagation des dates de naissance sur un CRO.""" # Chercher un CRO dans les 59 OGC - ogc_dir = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)") + ogc_dir = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHUXX_DocJustificatifs (1)") # Trouver un CRO (compte rendu opératoire) cro_files = [] @@ -68,19 +68,19 @@ def test_date_propagation(): lines_with_placeholders = [line for line in anonymized_text.split('\n') if placeholder_pattern.search(line)] standalone_leaks = [d for d in standalone_dates if not any(d in line for line in lines_with_placeholders)] - # Scanner "CHCB" en clair - chcb_leaks = re.findall(r'\bCHCB\b', anonymized_text) + # Scanner "CHUXX" en clair + chuxx_leaks = re.findall(r'\bCHUXX\b', anonymized_text) # Compter les fuites totales - total_leaks = len(context_leaks) + len(chcb_leaks) + total_leaks = len(context_leaks) + len(chuxx_leaks) status = "✅" if total_leaks == 0 else "❌" - print(f" {status} Fuites 'Né(e) le': {len(context_leaks)}, Fuites CHCB: {len(chcb_leaks)}") + print(f" {status} Fuites 'Né(e) le': {len(context_leaks)}, Fuites CHUXX: {len(chuxx_leaks)}") if context_leaks: print(f" Exemples dates: {context_leaks[:3]}") - if chcb_leaks: - print(f" Exemples CHCB: {chcb_leaks[:3]}") + if chuxx_leaks: + print(f" Exemples CHUXX: {chuxx_leaks[:3]}") # Info : dates standalone (pas 
nécessairement des fuites) if standalone_leaks: @@ -89,7 +89,7 @@ def test_date_propagation(): results.append({ 'file': pdf_path.name, 'context_leaks': len(context_leaks), - 'chcb_leaks': len(chcb_leaks), + 'chuxx_leaks': len(chuxx_leaks), 'standalone_dates': len(standalone_leaks), 'success': total_leaks == 0 }) @@ -109,13 +109,13 @@ def test_date_propagation(): success_count = sum(1 for r in results if r.get('success', False)) total_context_leaks = sum(r.get('context_leaks', 0) for r in results) - total_chcb_leaks = sum(r.get('chcb_leaks', 0) for r in results) + total_chuxx_leaks = sum(r.get('chuxx_leaks', 0) for r in results) total_standalone = sum(r.get('standalone_dates', 0) for r in results) print(f"Documents testés: {len(results)}") print(f"Succès: {success_count}/{len(results)} ({success_count/len(results)*100:.1f}%)") print(f"Fuites 'Né(e) le' totales: {total_context_leaks}") - print(f"Fuites CHCB totales: {total_chcb_leaks}") + print(f"Fuites CHUXX totales: {total_chuxx_leaks}") print(f"Dates standalone (info): {total_standalone}") if success_count == len(results): diff --git a/tools/test_chcb_leak.py b/tools/test_force_term_leak.py similarity index 78% rename from tools/test_chcb_leak.py rename to tools/test_force_term_leak.py index 6e9f483..6a1c38e 100644 --- a/tools/test_chcb_leak.py +++ b/tools/test_force_term_leak.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -"""Test CHCB force_term detection on the 2 leaked documents.""" +"""Test force_term detection on the 2 leaked documents.""" from pathlib import Path import sys @@ -10,10 +10,10 @@ sys.path.insert(0, str(Path(__file__).parent.parent)) import anonymizer_core_refactored_onnx as core from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH -def test_chcb_detection(): - """Test CHCB detection on the 2 documents with leaks.""" +def test_force_term_detection(): + """Test force_term detection on the 2 documents with leaks.""" - corpus_dir = Path("/home/dom/Téléchargements/II-1 
Ctrl_T2A_2025_CHCB_DocJustificatifs (1)") + corpus_dir = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHUXX_DocJustificatifs (1)") # Document 1: trackare-BA148337-23091302 doc1_path = None @@ -45,7 +45,7 @@ def test_chcb_detection(): print("TEST DOCUMENT 1: trackare-BA148337-23091302") print("=" * 80) - outdir = Path("test_chcb_leak") + outdir = Path("test_force_term_leak") outdir.mkdir(exist_ok=True) try: @@ -64,14 +64,14 @@ def test_chcb_detection(): txt_file = Path(outputs["text"]) content = txt_file.read_text(encoding="utf-8") - if "CHCB" in content: - print("🔴 FUITE DÉTECTÉE: CHCB trouvé dans le texte anonymisé") + if "CHUXX" in content: + print("🔴 FUITE DÉTECTÉE: CHUXX trouvé dans le texte anonymisé") # Trouver le contexte for i, line in enumerate(content.split("\n"), 1): - if "CHCB" in line: + if "CHUXX" in line: print(f" Ligne {i}: {line.strip()}") else: - print("✅ Aucune fuite CHCB") + print("✅ Aucune fuite CHUXX") # Vérifier l'audit import json @@ -80,10 +80,10 @@ def test_chcb_detection(): with open(audit_file, 'r', encoding='utf-8') as f: for line in f: obj = json.loads(line) - if obj.get("kind") == "force_term" and "CHCB" in obj.get("value", ""): + if obj.get("kind") == "force_term" and "CHUXX" in obj.get("value", ""): force_term_count += 1 - print(f"📊 Détections force_term CHCB: {force_term_count}") + print(f"📊 Détections force_term CHUXX: {force_term_count}") except Exception as e: print(f"❌ Erreur: {e}") @@ -113,14 +113,14 @@ def test_chcb_detection(): txt_file = Path(outputs["text"]) content = txt_file.read_text(encoding="utf-8") - if "CHCB" in content: - print("🔴 FUITE DÉTECTÉE: CHCB trouvé dans le texte anonymisé") + if "CHUXX" in content: + print("🔴 FUITE DÉTECTÉE: CHUXX trouvé dans le texte anonymisé") # Trouver le contexte for i, line in enumerate(content.split("\n"), 1): - if "CHCB" in line: + if "CHUXX" in line: print(f" Ligne {i}: {line.strip()}") else: - print("✅ Aucune fuite CHCB") + print("✅ Aucune fuite CHUXX") # Vérifier l'audit 
import json @@ -129,10 +129,10 @@ def test_chcb_detection(): with open(audit_file, 'r', encoding='utf-8') as f: for line in f: obj = json.loads(line) - if obj.get("kind") == "force_term" and "CHCB" in obj.get("value", ""): + if obj.get("kind") == "force_term" and "CHUXX" in obj.get("value", ""): force_term_count += 1 - print(f"📊 Détections force_term CHCB: {force_term_count}") + print(f"📊 Détections force_term CHUXX: {force_term_count}") except Exception as e: print(f"❌ Erreur: {e}") @@ -140,4 +140,4 @@ def test_chcb_detection(): traceback.print_exc() if __name__ == "__main__": - test_chcb_detection() + test_force_term_detection() diff --git a/tools/test_gui_complete.py b/tools/test_gui_complete.py index ca0c74c..9dc5dcd 100755 --- a/tools/test_gui_complete.py +++ b/tools/test_gui_complete.py @@ -88,7 +88,7 @@ import re leak_count = 0 patterns = { "date_naissance": re.compile(r"(?:n[ée]+\s+le|DDN)\s*:?\s*\d{1,2}[/.\-]\d{1,2}[/.\-]\d{2,4}", re.IGNORECASE), - "chcb": re.compile(r"\bCHCB\b", re.IGNORECASE), + "chuxx": re.compile(r"\bCHUXX\b", re.IGNORECASE), } for txt_file in out_dir.glob("*.pseudonymise.txt"): diff --git a/tools/test_phase1_corrections.py b/tools/test_phase1_corrections.py index b35228d..66cda87 100755 --- a/tools/test_phase1_corrections.py +++ b/tools/test_phase1_corrections.py @@ -24,9 +24,9 @@ def test_phase1_corrections(): # Documents de test (5 documents représentatifs) test_docs = [ - "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs/008_23001234/CRH 23001234.pdf", - "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs/021_23012345/CRO 23012345.pdf", - "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs/033_23023456/trackare-23023456-12345678.pdf", + "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHUXX_DocJustificatifs/008_23001234/CRH 23001234.pdf", + "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHUXX_DocJustificatifs/021_23012345/CRO 23012345.pdf", + "/home/dom/Téléchargements/II-1 
Ctrl_T2A_2025_CHUXX_DocJustificatifs/033_23023456/trackare-23023456-12345678.pdf", ] print("=" * 80) diff --git a/tools/validate_corpus_sample.py b/tools/validate_corpus_sample.py index 9715be7..6e7863e 100644 --- a/tools/validate_corpus_sample.py +++ b/tools/validate_corpus_sample.py @@ -23,7 +23,7 @@ def validate_corpus_sample(): """Valide l'anonymisation sur un échantillon du corpus.""" # Répertoires - corpus_dir = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)") + corpus_dir = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHUXX_DocJustificatifs (1)") output_dir = Path("corpus_validation_sample") output_dir.mkdir(exist_ok=True) @@ -221,7 +221,7 @@ def leak_check(output_dir: Path): # Patterns à vérifier patterns = { "date_naissance_contexte": re.compile(r"(?:n[ée]+\s+le|DDN)\s*:?\s*\d{1,2}[/.\-]\d{1,2}[/.\-]\d{2,4}", re.IGNORECASE), - "chcb": re.compile(r"\bCHCB\b", re.IGNORECASE), + "chuxx": re.compile(r"\bCHUXX\b", re.IGNORECASE), } leaks = defaultdict(list) diff --git a/tools/validate_full_corpus.py b/tools/validate_full_corpus.py index feea373..1c5254b 100644 --- a/tools/validate_full_corpus.py +++ b/tools/validate_full_corpus.py @@ -3,7 +3,7 @@ Validation sur le corpus complet (59 OGC / 130 PDFs). Ce script anonymise tous les documents du corpus et vérifie : -- Absence de fuites (dates de naissance, CHCB, etc.) +- Absence de fuites (dates de naissance, CHUXX, etc.) 
- Statistiques de détection par type - Performances (temps de traitement) """ @@ -24,7 +24,7 @@ def validate_full_corpus(): """Valide l'anonymisation sur le corpus complet.""" # Répertoires - corpus_dir = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)") + corpus_dir = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHUXX_DocJustificatifs (1)") output_dir = Path("corpus_validation") output_dir.mkdir(exist_ok=True) @@ -177,7 +177,7 @@ def leak_check(output_dir: Path): # Patterns à vérifier patterns = { "date_naissance": re.compile(r"(?:n[ée]+\s+le|DDN)\s*:?\s*\d{1,2}[/.\-]\d{1,2}[/.\-]\d{2,4}", re.IGNORECASE), - "chcb": re.compile(r"\bCHCB\b", re.IGNORECASE), + "chuxx": re.compile(r"\bCHUXX\b", re.IGNORECASE), "date_format": re.compile(r"\b\d{2}[/.\-]\d{2}[/.\-]\d{4}\b"), } diff --git a/tools/validate_phase1_on_production.py b/tools/validate_phase1_on_production.py index deed368..2926ebe 100644 --- a/tools/validate_phase1_on_production.py +++ b/tools/validate_phase1_on_production.py @@ -14,7 +14,7 @@ from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH from anonymizer_core_refactored_onnx import process_pdf # 5 documents du corpus production (OGC 008) -corpus_dir = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs") +corpus_dir = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHUXX_DocJustificatifs") test_docs = [ corpus_dir / "008_23001234" / "CRH 23001234.pdf", corpus_dir / "008_23001234" / "CRO 23001234.pdf",