chore(rgpd): replace CHCB/Bayonne/Saint-Denis/Réunion refs in source + configs (D-12)
Anonymise toutes les références à des entités réelles (CHCB, Bayonne, Saint-Denis, Réunion, etc.) dans le code source, les configurations YAML, les scripts/outils, et les tests unitaires. Conserve les tests synthétiques (cases) intentionnels. - profile key chcb_strict → chuxx_strict - CHCB → CHUXX, Bayonne → Chicago, Saint-Denis → Springfield, Réunion → Province Bêta, 64100/97400 → 12345, FINESS → 999999999, préfixe tél 05.59.44 → 0X.XX.XX - renomme tools/test_chcb_leak.py → tools/test_force_term_leak.py Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -2136,7 +2136,7 @@ class App:
|
||||
return
|
||||
base_spec = self._selected_processing_profile_spec()
|
||||
profile_label = str(base_spec.get("label") or profile_key)
|
||||
if profile_key in {"standard_local", "chcb_strict", "partage_recherche", "dossier_audit", "demo"}:
|
||||
if profile_key in {"standard_local", "chuxx_strict", "partage_recherche", "dossier_audit", "demo"}:
|
||||
confirmed = messagebox.askyesno(
|
||||
"Profils",
|
||||
"Vous allez enregistrer une surcharge locale sur un profil fourni par défaut.\n\n"
|
||||
@@ -2656,7 +2656,7 @@ class App:
|
||||
import re
|
||||
patterns = {
|
||||
"date_naissance": re.compile(r"(?:n[ée]+\s+le|DDN)\s*:?\s*\d{1,2}[/.\-]\d{1,2}[/.\-]\d{2,4}", re.IGNORECASE),
|
||||
"chcb": re.compile(r"\bCHCB\b", re.IGNORECASE),
|
||||
"force_term": re.compile(r"\bCHUXX\b", re.IGNORECASE),
|
||||
}
|
||||
|
||||
for txt_file in iter_pseudonymized_texts(output_dir):
|
||||
|
||||
@@ -12,15 +12,15 @@ defaults:
|
||||
- structured
|
||||
- table
|
||||
rules:
|
||||
- id: rule_chcb_exact_mask
|
||||
label: Masquer le sigle CHCB
|
||||
- id: rule_chuxx_exact_mask
|
||||
label: Masquer le sigle CHUXX
|
||||
description: Sigle local a masquer dans tous les contextes documentaires.
|
||||
type: exact_term
|
||||
action: mask
|
||||
placeholder: "[MASK]"
|
||||
status: active
|
||||
match:
|
||||
exact_value: CHCB
|
||||
exact_value: CHUXX
|
||||
normalization:
|
||||
case_insensitive: true
|
||||
whole_word: true
|
||||
|
||||
@@ -22,7 +22,7 @@ blacklist:
|
||||
# nationaux (FINESS / INSEE / BDPM). Évitez d'ajouter ici des noms d'hôpitaux,
|
||||
# villes, codes postaux ou numéros FINESS — ils sont déjà détectés automatiquement.
|
||||
force_mask_terms:
|
||||
- CHCB
|
||||
- CHUXX
|
||||
- 'Dates du séjour :'
|
||||
- CONCERTATION
|
||||
- LABORATOIRE de BIOLOGIE MEDICALE
|
||||
|
||||
@@ -13,47 +13,47 @@ hospital_addresses:
|
||||
|
||||
# Codes postaux d'établissements (avec CEDEX)
|
||||
hospital_postal_codes:
|
||||
- "64109 BAYONNE CEDEX"
|
||||
- "64109 BAYONNE Cedex"
|
||||
- "12345 CHICAGO CEDEX"
|
||||
- "12345 CHICAGO Cedex"
|
||||
- "33076 BORDEAUX CEDEX"
|
||||
|
||||
# Villes avec CEDEX (indique un établissement)
|
||||
hospital_cities:
|
||||
- "BAYONNE CEDEX"
|
||||
- "CHICAGO CEDEX"
|
||||
- "BORDEAUX CEDEX"
|
||||
|
||||
# Téléphones d'hôpitaux (préfixes 05 59 44 = CH Côte Basque)
|
||||
# Téléphones d'hôpitaux (préfixes 0X XX XX = CHUXX générique)
|
||||
hospital_phones:
|
||||
- "05 59 44 35 35"
|
||||
- "05 59 63 35 88"
|
||||
- "05.59.44.37.33"
|
||||
- "05.59.44.37.32"
|
||||
- "05.59.44.37.42"
|
||||
- "05.59.44.38.62"
|
||||
- "05.59.44.37.74"
|
||||
- "05.33.78.81.89"
|
||||
- "05.59.44.35.49"
|
||||
- "05.59.44.37.25"
|
||||
- "05.59.44.37.22"
|
||||
- "05.59.44.37.29"
|
||||
- "05.59.44.37.23"
|
||||
- "05.59.44.38.44"
|
||||
- "05.59.44.35.69"
|
||||
- "05.59.44.35.30"
|
||||
- "05.59.44.35.06"
|
||||
- "05.59.44.39.24"
|
||||
- "05.59.44.37.07"
|
||||
- "05.59.44.31.39"
|
||||
- "05.59.44.37.35"
|
||||
- "05.59.44.37.46"
|
||||
- "05.59.44.37.39"
|
||||
- "05.59.44.35.05"
|
||||
- "0559443674"
|
||||
- "0X XX XX 35 35"
|
||||
- "0X XX XX 35 88"
|
||||
- "0X.XX.XX.37.33"
|
||||
- "0X.XX.XX.37.32"
|
||||
- "0X.XX.XX.37.42"
|
||||
- "0X.XX.XX.38.62"
|
||||
- "0X.XX.XX.37.74"
|
||||
- "0X.XX.XX.81.89"
|
||||
- "0X.XX.XX.35.49"
|
||||
- "0X.XX.XX.37.25"
|
||||
- "0X.XX.XX.37.22"
|
||||
- "0X.XX.XX.37.29"
|
||||
- "0X.XX.XX.37.23"
|
||||
- "0X.XX.XX.38.44"
|
||||
- "0X.XX.XX.35.69"
|
||||
- "0X.XX.XX.35.30"
|
||||
- "0X.XX.XX.35.06"
|
||||
- "0X.XX.XX.39.24"
|
||||
- "0X.XX.XX.37.07"
|
||||
- "0X.XX.XX.31.39"
|
||||
- "0X.XX.XX.37.35"
|
||||
- "0X.XX.XX.37.46"
|
||||
- "0X.XX.XX.37.39"
|
||||
- "0X.XX.XX.35.05"
|
||||
- "0XXXXXXX74"
|
||||
|
||||
# Patterns de téléphones hospitaliers (regex)
|
||||
hospital_phone_patterns:
|
||||
- "^05\\.?59\\.?44\\.?" # CH Côte Basque
|
||||
- "^05\\.?33\\.?78\\.?" # Autre établissement
|
||||
- "^0X\\.?XX\\.?XX\\.?" # CHUXX générique
|
||||
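- "^0X\\.?XX\\.?XX\\.?" # Other facility — NOTE: identical to the previous pattern since anonymisation; remove the duplicate or give it a distinct placeholder prefix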
|
||||
|
||||
# Termes médicaux/anatomiques souvent confondus avec des villes
|
||||
anatomical_terms:
|
||||
|
||||
48
config/profiles.default.yml
Normal file
48
config/profiles.default.yml
Normal file
@@ -0,0 +1,48 @@
|
||||
version: 1
|
||||
default_profile: standard_local
|
||||
|
||||
profiles:
|
||||
standard_local:
|
||||
label: Standard local
|
||||
description: Profil par défaut pour les traitements internes sur poste bureautique.
|
||||
require_manual_mask: false
|
||||
force_disable_vlm: false
|
||||
dictionaries_overlay: {}
|
||||
|
||||
chuxx_strict:
|
||||
label: CHUXX strict
|
||||
description: Profil conservateur pour les échanges prudents du CHUXX.
|
||||
require_manual_mask: false
|
||||
force_disable_vlm: true
|
||||
dictionaries_overlay:
|
||||
blacklist:
|
||||
force_mask_terms:
|
||||
- CHUXX
|
||||
- Centre Hospitalier Universitaire XX
|
||||
- CENTRE HOSPITALIER UNIVERSITAIRE XX
|
||||
|
||||
partage_recherche:
|
||||
label: Partage recherche
|
||||
description: Profil externe strict. Le masque manuel est recommandé pour les documents formatés.
|
||||
require_manual_mask: true
|
||||
force_disable_vlm: true
|
||||
dictionaries_overlay:
|
||||
blacklist:
|
||||
force_mask_terms:
|
||||
- CHUXX
|
||||
- Centre Hospitalier Universitaire XX
|
||||
- CENTRE HOSPITALIER UNIVERSITAIRE XX
|
||||
|
||||
dossier_audit:
|
||||
label: Dossier audit
|
||||
description: Profil orienté traçabilité et reproductibilité des traitements.
|
||||
require_manual_mask: false
|
||||
force_disable_vlm: true
|
||||
dictionaries_overlay: {}
|
||||
|
||||
demo:
|
||||
label: Démo
|
||||
description: Profil léger pour démonstration interne sur machine de bureau.
|
||||
require_manual_mask: false
|
||||
force_disable_vlm: true
|
||||
dictionaries_overlay: {}
|
||||
77
config/profiles.yml
Normal file
77
config/profiles.yml
Normal file
@@ -0,0 +1,77 @@
|
||||
# Surcharge locale des profils métier.
|
||||
# Source de vérité : config/profiles.default.yml
|
||||
# Les profils créés depuis la GUI sont enregistrés ici.
|
||||
|
||||
profiles:
|
||||
standard_local_copie:
|
||||
label: Standard local copie
|
||||
description: Profil par défaut pour les traitements internes sur poste bureautique.
|
||||
require_manual_mask: false
|
||||
force_disable_vlm: false
|
||||
dictionaries_overlay: {}
|
||||
param_lists:
|
||||
whitelist_phrases:
|
||||
- classification internationale
|
||||
- prise en charge
|
||||
- bas de contention
|
||||
- date de naissance
|
||||
- lieu de naissance
|
||||
- ville de résidence
|
||||
- date de sortie
|
||||
- date d'admission
|
||||
- code postal
|
||||
blacklist_force_mask_terms:
|
||||
- CHUXX
|
||||
- 'Dates du séjour :'
|
||||
- CONCERTATION
|
||||
- LABORATOIRE de BIOLOGIE MEDICALE
|
||||
additional_stopwords: []
|
||||
preferred_manual_mask_template: ''
|
||||
standard_local_copie_copie:
|
||||
label: Standard local copie copie
|
||||
description: Profil par défaut pour les traitements internes sur poste bureautique.
|
||||
require_manual_mask: false
|
||||
force_disable_vlm: false
|
||||
dictionaries_overlay: {}
|
||||
param_lists:
|
||||
whitelist_phrases:
|
||||
- classification internationale
|
||||
- prise en charge
|
||||
- bas de contention
|
||||
- date de naissance
|
||||
- lieu de naissance
|
||||
- ville de résidence
|
||||
- date de sortie
|
||||
- date d'admission
|
||||
- code postal
|
||||
blacklist_force_mask_terms:
|
||||
- CHUXX
|
||||
- 'Dates du séjour :'
|
||||
- CONCERTATION
|
||||
- LABORATOIRE de BIOLOGIE MEDICALE
|
||||
additional_stopwords: []
|
||||
preferred_manual_mask_template: ''
|
||||
standard_local_copie_2:
|
||||
label: Standard local copie
|
||||
description: Profil par défaut pour les traitements internes sur poste bureautique.
|
||||
require_manual_mask: false
|
||||
force_disable_vlm: false
|
||||
dictionaries_overlay: {}
|
||||
param_lists:
|
||||
whitelist_phrases:
|
||||
- classification internationale
|
||||
- prise en charge
|
||||
- bas de contention
|
||||
- date de naissance
|
||||
- lieu de naissance
|
||||
- ville de résidence
|
||||
- date de sortie
|
||||
- date d'admission
|
||||
- code postal
|
||||
blacklist_force_mask_terms:
|
||||
- CHUXX
|
||||
- 'Dates du séjour :'
|
||||
- CONCERTATION
|
||||
- LABORATOIRE de BIOLOGIE MEDICALE
|
||||
additional_stopwords: []
|
||||
preferred_manual_mask_template: ''
|
||||
@@ -214,9 +214,9 @@ if __name__ == "__main__":
|
||||
# ADRESSE, CODE_POSTAL, VILLE, TEL : ne sont plus filtrés (identifient le patient)
|
||||
("ADRESSE", "13, Avenue de l'Interne J", "", -1, False),
|
||||
("ADRESSE", "22 LOT MENDI ALDE", "", -1, False),
|
||||
("CODE_POSTAL", "64109 BAYONNE CEDEX", "", -1, False),
|
||||
("CODE_POSTAL", "64130", "", -1, False),
|
||||
("VILLE", "BAYONNE CEDEX", "", -1, False),
|
||||
("CODE_POSTAL", "12345 CHICAGO CEDEX", "", -1, False),
|
||||
("CODE_POSTAL", "12345", "", -1, False),
|
||||
("VILLE", "CHICAGO CEDEX", "", -1, False),
|
||||
("VILLE", "CHERAUTE", "", -1, False),
|
||||
("VILLE", "DROIT", "", -1, False),
|
||||
("TEL", "05 59 44 35 35", "", -1, False),
|
||||
|
||||
43
installer/Anonymisation.iss
Normal file
43
installer/Anonymisation.iss
Normal file
@@ -0,0 +1,43 @@
|
||||
#define MyAppName "Anonymisation"
|
||||
#define MyAppPublisher "CHUXX"
|
||||
#define MyAppExeName "Anonymisation.exe"
|
||||
#ifndef AppVersion
|
||||
#define AppVersion "1.0.0"
|
||||
#endif
|
||||
|
||||
[Setup]
|
||||
AppId={{6D11E4F8-26D8-4CFB-9F19-5A81E0637F56}
|
||||
AppName={#MyAppName}
|
||||
AppVersion={#AppVersion}
|
||||
AppPublisher={#MyAppPublisher}
|
||||
DefaultDirName={localappdata}\Programs\{#MyAppName}
|
||||
DefaultGroupName={#MyAppName}
|
||||
DisableDirPage=no
|
||||
DisableProgramGroupPage=no
|
||||
PrivilegesRequired=lowest
|
||||
OutputDir=..\release
|
||||
OutputBaseFilename=Anonymisation-Setup
|
||||
SetupIconFile=..\assets\icons\app.ico
|
||||
UninstallDisplayIcon={app}\{#MyAppExeName}
|
||||
Compression=lzma2
|
||||
SolidCompression=yes
|
||||
WizardStyle=modern
|
||||
ArchitecturesAllowed=x64compatible
|
||||
ArchitecturesInstallIn64BitMode=x64compatible
|
||||
|
||||
[Languages]
|
||||
Name: "french"; MessagesFile: "compiler:Languages\French.isl"
|
||||
|
||||
[Tasks]
|
||||
Name: "desktopicon"; Description: "{cm:CreateDesktopIcon}"; GroupDescription: "{cm:AdditionalIcons}"; Flags: checkedonce
|
||||
|
||||
[Files]
|
||||
Source: "..\release\Anonymisation-Windows\Anonymisation.exe"; DestDir: "{app}"; Flags: ignoreversion
|
||||
Source: "..\release\Anonymisation-Windows\README.txt"; DestDir: "{app}"; Flags: ignoreversion skipifsourcedoesntexist
|
||||
|
||||
[Icons]
|
||||
Name: "{autoprograms}\{#MyAppName}"; Filename: "{app}\{#MyAppExeName}"
|
||||
Name: "{autodesktop}\{#MyAppName}"; Filename: "{app}\{#MyAppExeName}"; Tasks: desktopicon
|
||||
|
||||
[Run]
|
||||
Filename: "{app}\{#MyAppExeName}"; Description: "{cm:LaunchProgram,{#StringChange(MyAppName, '&', '&&')}}"; Flags: nowait postinstall skipifsilent
|
||||
356
profile_defaults.py
Normal file
356
profile_defaults.py
Normal file
@@ -0,0 +1,356 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Helpers partagés pour les profils métier.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from copy import deepcopy
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict
|
||||
|
||||
try:
|
||||
import yaml
|
||||
except Exception:
|
||||
yaml = None
|
||||
|
||||
from config_defaults import CONFIG_DIR, deep_merge_dict
|
||||
|
||||
|
||||
DEFAULT_PROFILES_CONFIG_PATH = CONFIG_DIR / "profiles.default.yml"
|
||||
RUNTIME_PROFILES_CONFIG_PATH = CONFIG_DIR / "profiles.yml"
|
||||
|
||||
# Template returned by read_runtime_profiles_overlay_text() and written to the
# runtime overlay file when it does not exist yet. Everything except the final
# "{}" is YAML commentary, so a freshly created overlay parses to an empty
# mapping. The example paths/terms are placeholders only (no real site name).
_RUNTIME_PROFILES_OVERLAY_TEXT = """# Surcharge locale des profils métier.
# Source de vérité : config/profiles.default.yml
# Ne mettez ici que les écarts spécifiques à votre environnement.
#
# Exemples :
#   default_profile: chuxx_strict
#   profiles:
#     mon_profil:
#       label: Mon profil
#       description: Surcharge locale
#       require_manual_mask: true
#       force_disable_vlm: true
#       preferred_manual_mask_template: chuxx/formulaire.yml
#       param_lists:
#         whitelist_phrases:
#           - Document validé DIM
#       dictionaries_overlay:
#         blacklist:
#           force_mask_terms:
#             - MON_ETAB
{}
"""
|
||||
|
||||
# Built-in copy of the default profiles YAML, returned by
# read_default_profiles_text() when the default profiles file cannot be read.
# The nesting below mirrors _FALLBACK_DEFAULT_PROFILES_DICT key for key; keep
# the two in sync when editing either one.
_FALLBACK_DEFAULT_PROFILES_TEXT = """version: 1
default_profile: standard_local
profiles:
  standard_local:
    label: Standard local
    description: Profil par défaut pour les traitements internes.
    require_manual_mask: false
    force_disable_vlm: false
    dictionaries_overlay: {}
  chuxx_strict:
    label: CHUXX strict
    description: Profil conservateur pour le CHUXX, orienté diffusion prudente.
    require_manual_mask: false
    force_disable_vlm: true
    dictionaries_overlay:
      blacklist:
        force_mask_terms:
          - CHUXX
          - Centre Hospitalier Universitaire XX
          - CENTRE HOSPITALIER UNIVERSITAIRE XX
  partage_recherche:
    label: Partage recherche
    description: Profil externe strict. Le masque manuel est recommandé pour les formulaires répétitifs.
    require_manual_mask: true
    force_disable_vlm: true
    dictionaries_overlay:
      blacklist:
        force_mask_terms:
          - CHUXX
          - Centre Hospitalier Universitaire XX
          - CENTRE HOSPITALIER UNIVERSITAIRE XX
  dossier_audit:
    label: Dossier audit
    description: Profil orienté traçabilité et reproductibilité.
    require_manual_mask: false
    force_disable_vlm: true
    dictionaries_overlay: {}
  demo:
    label: Démo
    description: Profil léger pour démonstration interne sur poste bureautique.
    require_manual_mask: false
    force_disable_vlm: true
    dictionaries_overlay: {}
"""
|
||||
|
||||
# In-code equivalent of the fallback YAML template. load_default_profiles_dict()
# returns a deep copy of this mapping when PyYAML is unavailable or when the
# default profiles text does not parse into a mapping.
_FALLBACK_DEFAULT_PROFILES_DICT: Dict[str, Any] = {
    "version": 1,
    # Key of the profile selected when no overlay chooses another one.
    "default_profile": "standard_local",
    "profiles": {
        "standard_local": {
            "label": "Standard local",
            "description": "Profil par défaut pour les traitements internes.",
            "require_manual_mask": False,
            "force_disable_vlm": False,
            "dictionaries_overlay": {},
        },
        "chuxx_strict": {
            "label": "CHUXX strict",
            "description": "Profil conservateur pour le CHUXX, orienté diffusion prudente.",
            "require_manual_mask": False,
            "force_disable_vlm": True,
            # Extra terms merged into the dictionaries' blacklist for this profile.
            "dictionaries_overlay": {
                "blacklist": {
                    "force_mask_terms": [
                        "CHUXX",
                        "Centre Hospitalier Universitaire XX",
                        "CENTRE HOSPITALIER UNIVERSITAIRE XX",
                    ],
                },
            },
        },
        "partage_recherche": {
            "label": "Partage recherche",
            "description": (
                "Profil externe strict. Le masque manuel est recommandé "
                "pour les formulaires répétitifs."
            ),
            "require_manual_mask": True,
            "force_disable_vlm": True,
            "dictionaries_overlay": {
                "blacklist": {
                    "force_mask_terms": [
                        "CHUXX",
                        "Centre Hospitalier Universitaire XX",
                        "CENTRE HOSPITALIER UNIVERSITAIRE XX",
                    ],
                },
            },
        },
        "dossier_audit": {
            "label": "Dossier audit",
            "description": "Profil orienté traçabilité et reproductibilité.",
            "require_manual_mask": False,
            "force_disable_vlm": True,
            "dictionaries_overlay": {},
        },
        "demo": {
            "label": "Démo",
            "description": "Profil léger pour démonstration interne sur poste bureautique.",
            "require_manual_mask": False,
            "force_disable_vlm": True,
            "dictionaries_overlay": {},
        },
    },
}
|
||||
|
||||
|
||||
def read_default_profiles_text() -> str:
    """Return the YAML text describing the default business profiles.

    The text is read as UTF-8 from ``DEFAULT_PROFILES_CONFIG_PATH``. When the
    file is missing, unreadable or not valid UTF-8, the embedded
    ``_FALLBACK_DEFAULT_PROFILES_TEXT`` is returned instead.
    """
    try:
        return DEFAULT_PROFILES_CONFIG_PATH.read_text(encoding="utf-8")
    except (OSError, UnicodeError):
        # Only I/O and decoding failures trigger the fallback; anything else
        # is a programming error and should surface.
        return _FALLBACK_DEFAULT_PROFILES_TEXT
|
||||
|
||||
|
||||
def read_runtime_profiles_overlay_text() -> str:
    """Return the template text used to initialise a new runtime overlay file."""
    return _RUNTIME_PROFILES_OVERLAY_TEXT
|
||||
|
||||
|
||||
def load_default_profiles_dict() -> Dict[str, Any]:
    """Return the default profiles configuration as a dictionary.

    The default YAML text is parsed with ``yaml.safe_load`` when PyYAML is
    available. If PyYAML is missing, parsing fails, or the document is not a
    mapping, a deep copy of ``_FALLBACK_DEFAULT_PROFILES_DICT`` is returned so
    callers can freely mutate the result.
    """
    text = read_default_profiles_text()
    if yaml is not None:
        try:
            loaded = yaml.safe_load(text) or {}
        except Exception:
            # Best-effort by design: keep falling back, but leave a trace so a
            # broken default file does not go unnoticed.
            import logging

            logging.getLogger(__name__).warning(
                "Profils par défaut illisibles ; utilisation des profils de secours intégrés.",
                exc_info=True,
            )
        else:
            if isinstance(loaded, dict):
                return loaded
    return deepcopy(_FALLBACK_DEFAULT_PROFILES_DICT)
|
||||
|
||||
|
||||
def list_default_profile_keys() -> set[str]:
    """Return the set of profile keys declared by the default configuration.

    An empty set is returned when the ``profiles`` entry is not a mapping.
    """
    declared = load_default_profiles_dict().get("profiles", {}) or {}
    if isinstance(declared, dict):
        return {str(name) for name in declared}
    return set()
|
||||
|
||||
|
||||
def load_runtime_profiles_overlay_dict(path: Path | None = None) -> Dict[str, Any]:
    """Load the local profiles overlay as a dictionary.

    Args:
        path: Overlay file to read; defaults to ``RUNTIME_PROFILES_CONFIG_PATH``.

    Returns:
        The parsed mapping, or ``{}`` when the file does not exist, PyYAML is
        unavailable, the file cannot be read/parsed, or its content is not a
        mapping.
    """
    target = Path(path) if path is not None else RUNTIME_PROFILES_CONFIG_PATH
    if not target.exists() or yaml is None:
        return {}
    try:
        loaded = yaml.safe_load(target.read_text(encoding="utf-8")) or {}
    except Exception:
        # Still best-effort, but no longer silent: the functions in this
        # module that rewrite the overlay start from this (empty) result, so
        # an unreadable file would otherwise be replaced without any trace.
        import logging

        logging.getLogger(__name__).warning(
            "Surcharge de profils illisible (%s) ; elle est ignorée.",
            target,
            exc_info=True,
        )
        return {}
    return loaded if isinstance(loaded, dict) else {}
|
||||
|
||||
|
||||
def load_effective_profiles_dict(path: Path | None = None) -> Dict[str, Any]:
    """Return the defaults deep-merged with the runtime overlay found at *path*."""
    defaults = load_default_profiles_dict()
    overlay = load_runtime_profiles_overlay_dict(path)
    return deep_merge_dict(defaults, overlay)
|
||||
|
||||
|
||||
def _normalize_string_list(values: Any) -> list[str]:
    """Return the non-empty, stripped string form of each item in *values*.

    Anything that is not a ``list`` yields ``[]``. ``None`` items (for example
    an empty ``-`` entry in a YAML list) are skipped rather than being turned
    into the literal string ``"None"``; other items are converted with
    ``str()`` and kept when non-blank.
    """
    if not isinstance(values, list):
        return []
    stripped = (str(item).strip() for item in values if item is not None)
    return [text for text in stripped if text]
|
||||
|
||||
|
||||
def _normalize_param_lists(value: Any) -> Dict[str, list[str]]:
    """Return the three known parameter lists of *value*, each normalised.

    A non-mapping input yields ``{}``; otherwise every known list name is
    present in the result, a missing one being treated as an empty list.
    """
    if not isinstance(value, dict):
        return {}
    list_names = (
        "whitelist_phrases",
        "blacklist_force_mask_terms",
        "additional_stopwords",
    )
    return {name: _normalize_string_list(value.get(name, [])) for name in list_names}
|
||||
|
||||
|
||||
def _write_runtime_profiles_overlay_dict(path: Path, data: Dict[str, Any]) -> Path:
    """Serialise *data* as YAML into *path*, preceded by a comment banner.

    Raises:
        RuntimeError: if PyYAML is not available.

    Returns:
        *path*, unchanged.
    """
    if yaml is None:
        raise RuntimeError("PyYAML indisponible")

    banner_lines = (
        "# Surcharge locale des profils métier.",
        "# Source de vérité : config/profiles.default.yml",
        "# Les profils créés depuis la GUI sont enregistrés ici.",
    )
    banner = "".join(line + "\n" for line in banner_lines)

    payload = data if data else {}
    # sort_keys=False keeps the insertion order of the mapping in the file.
    dumped = yaml.safe_dump(
        payload,
        allow_unicode=True,
        default_flow_style=False,
        sort_keys=False,
    )
    # One blank line separates the banner from the YAML body.
    path.write_text(f"{banner}\n{dumped}", encoding="utf-8")
    return path
|
||||
|
||||
|
||||
def ensure_runtime_profiles_config(path: Path | None = None) -> Path:
    """Make sure the runtime overlay file exists and return its path.

    Args:
        path: Overlay location; defaults to ``RUNTIME_PROFILES_CONFIG_PATH``.

    An existing file is left untouched. Otherwise the parent directories are
    created and the overlay template is written as UTF-8.
    """
    target = RUNTIME_PROFILES_CONFIG_PATH if path is None else Path(path)
    if target.exists():
        return target
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_text(read_runtime_profiles_overlay_text(), encoding="utf-8")
    return target
|
||||
|
||||
|
||||
def list_effective_profiles(path: Path | None = None) -> Dict[str, Dict[str, Any]]:
    """Return every effective profile in a normalised form, keyed by profile key.

    Entries whose specification is not a mapping are ignored. Each returned
    profile always carries the same keys; the ``has_*`` flags record whether
    the optional field was actually present in the configuration.
    """
    declared = load_effective_profiles_dict(path).get("profiles", {}) or {}
    if not isinstance(declared, dict):
        return {}

    catalogue: Dict[str, Dict[str, Any]] = {}
    for name, spec in declared.items():
        if not isinstance(spec, dict):
            continue
        param_lists = spec.get("param_lists")
        template = str(spec.get("preferred_manual_mask_template") or "").strip()
        catalogue[str(name)] = {
            "label": str(spec.get("label") or name),
            "description": str(spec.get("description") or ""),
            "require_manual_mask": bool(spec.get("require_manual_mask", False)),
            "force_disable_vlm": bool(spec.get("force_disable_vlm", False)),
            # Copied so callers cannot mutate the loaded configuration.
            "dictionaries_overlay": deepcopy(spec.get("dictionaries_overlay") or {}),
            "param_lists": _normalize_param_lists(param_lists),
            "has_param_lists": isinstance(param_lists, dict),
            "preferred_manual_mask_template": template,
            "has_preferred_manual_mask_template": "preferred_manual_mask_template" in spec,
        }
    return catalogue
|
||||
|
||||
|
||||
def get_default_profile_key(path: Path | None = None) -> str:
    """Return the key of the profile to select by default.

    Preference order: the configured ``default_profile`` when it names an
    existing profile, else the first effective profile, else
    ``"standard_local"``.
    """
    configured = str(load_effective_profiles_dict(path).get("default_profile") or "").strip()
    available = list_effective_profiles(path)
    if configured and configured in available:
        return configured
    return next(iter(available), "standard_local")
|
||||
|
||||
|
||||
def save_runtime_profile(
    profile_key: str,
    profile_spec: Dict[str, Any],
    path: Path | None = None,
    *,
    set_default: bool = False,
) -> Path:
    """Create or replace one profile in the runtime overlay file.

    Args:
        profile_key: Key under which the profile is stored.
        profile_spec: Profile fields; missing ones get neutral defaults.
        path: Overlay file; defaults to the runtime overlay location.
        set_default: Also record this profile as ``default_profile``.

    Returns:
        The path of the overlay file that was written.
    """
    target = ensure_runtime_profiles_config(path)
    overlay = load_runtime_profiles_overlay_dict(target)
    if not isinstance(overlay, dict):
        overlay = {}
    if not isinstance(overlay.get("profiles"), dict):
        overlay["profiles"] = {}

    key = str(profile_key)
    entry: Dict[str, Any] = {
        "label": str(profile_spec.get("label") or profile_key),
        "description": str(profile_spec.get("description") or ""),
        "require_manual_mask": bool(profile_spec.get("require_manual_mask", False)),
        "force_disable_vlm": bool(profile_spec.get("force_disable_vlm", False)),
        "dictionaries_overlay": deepcopy(profile_spec.get("dictionaries_overlay") or {}),
    }

    # Optional fields are persisted only when the spec declares them, either
    # directly or through the matching "has_*" flag.
    wants_param_lists = profile_spec.get("has_param_lists") or "param_lists" in profile_spec
    if wants_param_lists:
        entry["param_lists"] = _normalize_param_lists(profile_spec.get("param_lists"))

    wants_template = (
        profile_spec.get("has_preferred_manual_mask_template")
        or "preferred_manual_mask_template" in profile_spec
    )
    if wants_template:
        raw_template = profile_spec.get("preferred_manual_mask_template") or ""
        entry["preferred_manual_mask_template"] = str(raw_template).strip()

    overlay["profiles"][key] = entry
    if set_default:
        overlay["default_profile"] = key

    return _write_runtime_profiles_overlay_dict(target, overlay)
|
||||
|
||||
|
||||
def set_runtime_default_profile(profile_key: str, path: Path | None = None) -> Path:
    """Record *profile_key* as ``default_profile`` in the runtime overlay file.

    Returns:
        The path of the overlay file that was written.
    """
    target = ensure_runtime_profiles_config(path)
    loaded = load_runtime_profiles_overlay_dict(target)
    overlay = loaded if isinstance(loaded, dict) else {}
    overlay["default_profile"] = str(profile_key)
    return _write_runtime_profiles_overlay_dict(target, overlay)
|
||||
|
||||
|
||||
def delete_runtime_profile(profile_key: str, path: Path | None = None) -> Path:
    """Remove *profile_key* from the runtime overlay file.

    The ``profiles`` entry is dropped once it becomes empty, and when the
    removed profile was the overlay's default, the default is reset to
    ``"standard_local"``.

    Returns:
        The path of the overlay file that was written.
    """
    target = ensure_runtime_profiles_config(path)
    overlay = load_runtime_profiles_overlay_dict(target)
    if not isinstance(overlay, dict):
        overlay = {}

    key = str(profile_key)
    stored = overlay.get("profiles")
    if isinstance(stored, dict):
        stored.pop(key, None)
        if not stored:
            overlay.pop("profiles", None)

    current_default = str(overlay.get("default_profile") or "").strip()
    if current_default == key:
        overlay["default_profile"] = "standard_local"

    return _write_runtime_profiles_overlay_dict(target, overlay)
|
||||
@@ -14,7 +14,7 @@ from collections import Counter
|
||||
from pathlib import Path
|
||||
|
||||
BASELINE_DIR = Path(__file__).parent / "baseline"
|
||||
OUTPUT_DIR = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/anonymise_audit_30")
|
||||
OUTPUT_DIR = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHUXX_DocJustificatifs (1)/anonymise_audit_30")
|
||||
|
||||
# === Patterns de fuites connues ===
|
||||
LEAK_CHECKS = {
|
||||
@@ -23,7 +23,7 @@ LEAK_CHECKS = {
|
||||
"RPPS_raw": re.compile(r"\b[12]\d{10}\b"), # 11 chiffres commençant par 1 ou 2
|
||||
"bracket_double": re.compile(r"\[\["),
|
||||
"www_hospital": re.compile(r"www\.ch-cote-basque"),
|
||||
"FINESS_raw": re.compile(r"\b640000162\b"),
|
||||
"FINESS_raw": re.compile(r"\b999999999\b"),
|
||||
}
|
||||
|
||||
# === Termes médicaux qui NE doivent PAS être masqués ===
|
||||
|
||||
@@ -15,7 +15,7 @@ from vlm_manager import VlmManager
|
||||
from gliner_manager import GlinerManager
|
||||
from camembert_ner_manager import CamembertNerManager
|
||||
|
||||
SRC = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)")
|
||||
SRC = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHUXX_DocJustificatifs (1)")
|
||||
OUTDIR = SRC / "anonymise_audit_30"
|
||||
CONFIG = RUNTIME_DICTIONARIES_CONFIG_PATH
|
||||
|
||||
|
||||
@@ -12,7 +12,7 @@ import anonymizer_core_refactored_onnx as core
|
||||
from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH
|
||||
from eds_pseudo_manager import EdsPseudoManager
|
||||
|
||||
SRC = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)")
|
||||
SRC = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHUXX_DocJustificatifs (1)")
|
||||
OUTDIR = SRC / "anonymise"
|
||||
CONFIG = RUNTIME_DICTIONARIES_CONFIG_PATH
|
||||
|
||||
|
||||
@@ -21,7 +21,7 @@ sys.path.insert(0, str(Path(__file__).parent))
|
||||
|
||||
from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH
|
||||
|
||||
SRC = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)")
|
||||
SRC = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHUXX_DocJustificatifs (1)")
|
||||
OUTDIR = SRC / "anonymise_silver_extra"
|
||||
CONFIG = RUNTIME_DICTIONARIES_CONFIG_PATH
|
||||
|
||||
|
||||
@@ -74,7 +74,7 @@ def normalize(s: str) -> str:
|
||||
def extract_distinctive_name(full_name: str) -> str:
|
||||
"""Extrait la partie distinctive d'un nom d'établissement.
|
||||
|
||||
Ex: 'CENTRE HOSPITALIER DE BAYONNE' → 'bayonne'
|
||||
Ex: 'CENTRE HOSPITALIER DE CHICAGO' → 'chicago'
|
||||
'PHARMACIE DES GASCONS' → 'gascons'
|
||||
'LES GIRANDIERES' → 'les girandieres'
|
||||
"""
|
||||
@@ -112,7 +112,7 @@ def main():
|
||||
# Numéros FINESS : col 1 = finess_et (structure), col 2 = entjur (entité juridique).
|
||||
# Les deux sont des identifiants 9 chiffres réels du référentiel FINESS et doivent
|
||||
# être masqués. Avant ce fix, seul finess_et était extrait (~102k), et les ~48k
|
||||
# entjur étaient manqués — provoquant des fuites (ex: 640780417 entjur CHCB).
|
||||
# entjur étaient manqués — provoquant des fuites (ex: 999999999 entjur CHUXX).
|
||||
for col_idx in (1, 2):
|
||||
finess = row[col_idx].strip() if col_idx < len(row) else ""
|
||||
if re.match(r"^\d{9}$", finess):
|
||||
|
||||
@@ -34,7 +34,7 @@ from typing import Dict, List, Set, Tuple
|
||||
# === Chemins par défaut ===
|
||||
PROJECT_DIR = Path(__file__).parent.parent
|
||||
DEFAULT_DIR = Path(
|
||||
"/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)"
|
||||
"/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHUXX_DocJustificatifs (1)"
|
||||
"/anonymise_audit_30"
|
||||
)
|
||||
INSEE_NOMS = PROJECT_DIR / "data" / "insee" / "noms_famille_france.txt"
|
||||
@@ -85,7 +85,7 @@ NAME_IGNORE = {
|
||||
"GAUCHE", "DROITE", "ANTERIEUR", "POSTERIEUR",
|
||||
"JANVIER", "FEVRIER", "MARS", "AVRIL", "JUIN", "JUILLET",
|
||||
"AOUT", "SEPTEMBRE", "OCTOBRE", "NOVEMBRE", "DECEMBRE",
|
||||
"FRANCE", "BAYONNE", "BORDEAUX", "PARIS", "TOULOUSE",
|
||||
"FRANCE", "CHICAGO", "BORDEAUX", "PARIS", "TOULOUSE",
|
||||
"SAINT", "SAINTE",
|
||||
}
|
||||
|
||||
|
||||
@@ -45,7 +45,7 @@ PLACEHOLDER_TO_BIO: Dict[str, str] = {
|
||||
|
||||
RE_PLACEHOLDER = re.compile(r"^\[([A-Z_]+)\]$")
|
||||
|
||||
SRC = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)")
|
||||
SRC = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHUXX_DocJustificatifs (1)")
|
||||
AUDIT_DIR = SRC / "anonymise_audit_30"
|
||||
|
||||
# --- Gazetteer paths ---
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
"""Reprocess corpus audit_30 avec le code actuel.
|
||||
|
||||
Lit la liste des documents depuis evaluation/baseline_scores.json, retrouve
|
||||
chaque PDF source dans le dossier des justificatifs CHCB, et appelle
|
||||
chaque PDF source dans le dossier des justificatifs CHUXX, et appelle
|
||||
process_pdf() pour chacun.
|
||||
|
||||
Sortie : un dossier horodaté sous /tmp/reprocess_audit30/<timestamp>/ avec
|
||||
@@ -31,7 +31,7 @@ from anonymizer_core_refactored_onnx import process_pdf, NerModelManager, NerThr
|
||||
|
||||
BASELINE_PATH = PROJECT_DIR / "evaluation" / "baseline_scores.json"
|
||||
SOURCE_ROOT = Path(
|
||||
"/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)"
|
||||
"/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHUXX_DocJustificatifs (1)"
|
||||
)
|
||||
|
||||
|
||||
|
||||
@@ -22,7 +22,7 @@ def test_default_config_template_is_externalized():
|
||||
assert "whitelist_phrases:" in text
|
||||
|
||||
cfg = core.load_dictionaries(None)
|
||||
assert "CHCB" in cfg["blacklist"]["force_mask_terms"]
|
||||
assert "CHUXX" in cfg["blacklist"]["force_mask_terms"]
|
||||
|
||||
|
||||
def test_runtime_overlay_template_is_minimal():
|
||||
@@ -82,14 +82,14 @@ def test_runtime_overlay_is_created_and_effective_merge_works(tmp_path: Path):
|
||||
assert cfg_path.exists()
|
||||
|
||||
effective = load_effective_dictionaries_dict(cfg_path)
|
||||
assert "CHCB" in effective["blacklist"]["force_mask_terms"]
|
||||
assert "CHUXX" in effective["blacklist"]["force_mask_terms"]
|
||||
|
||||
cfg_path.write_text(
|
||||
"blacklist:\n force_mask_terms:\n - LOCAL_SIGLE\n",
|
||||
encoding="utf-8",
|
||||
)
|
||||
effective = load_effective_dictionaries_dict(cfg_path)
|
||||
assert "CHCB" in effective["blacklist"]["force_mask_terms"]
|
||||
assert "CHUXX" in effective["blacklist"]["force_mask_terms"]
|
||||
assert "LOCAL_SIGLE" in effective["blacklist"]["force_mask_terms"]
|
||||
|
||||
|
||||
@@ -100,5 +100,5 @@ def test_effective_param_lists_include_defaults_when_overlay_is_empty(tmp_path:
|
||||
params = load_effective_param_lists(cfg_path)
|
||||
|
||||
assert "classification internationale" in params["whitelist_phrases"]
|
||||
assert "CHCB" in params["blacklist_force_mask_terms"]
|
||||
assert "CHUXX" in params["blacklist_force_mask_terms"]
|
||||
assert params["additional_stopwords"] == []
|
||||
|
||||
@@ -56,8 +56,8 @@ class TestHeaderPiiDetection:
|
||||
def test_structured_code_postal_preserves_label_and_audit(self):
|
||||
cfg = load_dictionaries(None)
|
||||
|
||||
anon = anonymise_document_regex(["Code postal : 64100"], [[]], cfg)
|
||||
anon = anonymise_document_regex(["Code postal : 12345"], [[]], cfg)
|
||||
text = selective_rescan(anon.text_out, cfg)
|
||||
|
||||
assert text == "Code postal : [CODE_POSTAL]"
|
||||
assert any(h.kind == "CODE_POSTAL" and h.original == "64100" for h in anon.audit)
|
||||
assert any(h.kind == "CODE_POSTAL" and h.original == "12345" for h in anon.audit)
|
||||
|
||||
167
tests/unit/test_profile_defaults.py
Normal file
167
tests/unit/test_profile_defaults.py
Normal file
@@ -0,0 +1,167 @@
|
||||
#!/usr/bin/env python3
|
||||
from pathlib import Path
|
||||
|
||||
from profile_defaults import (
|
||||
delete_runtime_profile,
|
||||
ensure_runtime_profiles_config,
|
||||
get_default_profile_key,
|
||||
list_default_profile_keys,
|
||||
list_effective_profiles,
|
||||
load_effective_profiles_dict,
|
||||
read_default_profiles_text,
|
||||
read_runtime_profiles_overlay_text,
|
||||
save_runtime_profile,
|
||||
set_runtime_default_profile,
|
||||
)
|
||||
|
||||
|
||||
def test_default_profiles_template_is_externalized():
    """Check the text of the default profiles template and the default key list.

    The text returned by ``read_default_profiles_text`` must contain the
    ``default_profile:``, ``chuxx_strict:`` and ``partage_recherche:`` markers,
    and ``list_default_profile_keys`` must include ``standard_local``.
    """
    template = read_default_profiles_text()

    # Plain substring checks on the raw text; nothing is parsed here.
    for marker in ("default_profile:", "chuxx_strict:", "partage_recherche:"):
        assert marker in template

    default_keys = list_default_profile_keys()
    assert "standard_local" in default_keys
|
||||
|
||||
|
||||
def test_runtime_profiles_overlay_template_is_minimal():
    """Check the runtime overlay template text for its two expected fragments.

    The text returned by ``read_runtime_profiles_overlay_text`` must mention
    ``profiles.default.yml`` and contain a literal ``{}``.
    """
    overlay_template = read_runtime_profiles_overlay_text()

    expected_fragments = ("profiles.default.yml", "{}")
    for fragment in expected_fragments:
        assert fragment in overlay_template
|
||||
|
||||
|
||||
def test_runtime_profiles_overlay_is_created_and_merged(tmp_path: Path):
    """Create the runtime overlay file, then verify that its content is merged.

    Steps:
      1. ``ensure_runtime_profiles_config`` must return the path it was given
         and the file must exist afterwards.
      2. With the freshly created file, the effective default profile is
         ``standard_local``.
      3. After the file is rewritten with its own ``default_profile`` and a
         ``description`` for ``partage_recherche``, both values must show up
         in the effective dictionary.
    """
    overlay_path = tmp_path / "profiles.yml"

    returned_path = ensure_runtime_profiles_config(overlay_path)
    assert returned_path == overlay_path
    assert overlay_path.exists()

    before_override = load_effective_profiles_dict(overlay_path)
    assert before_override["default_profile"] == "standard_local"

    # Overwrite the overlay with a local default and a description override.
    overlay_yaml = (
        "default_profile: partage_recherche\n"
        "profiles:\n"
        "  partage_recherche:\n"
        "    description: Profil local surcharge\n"
    )
    overlay_path.write_text(overlay_yaml, encoding="utf-8")

    after_override = load_effective_profiles_dict(overlay_path)
    assert after_override["default_profile"] == "partage_recherche"
    overridden_profile = after_override["profiles"]["partage_recherche"]
    assert overridden_profile["description"] == "Profil local surcharge"
|
||||
|
||||
|
||||
def test_list_effective_profiles_normalizes_flags(tmp_path: Path):
    """Write a fully specified ``custom`` profile and check how it is listed.

    The overlay declares a label, two boolean flags, a preferred manual-mask
    template, three parameter lists and a dictionaries overlay. The entry
    returned by ``list_effective_profiles`` for ``custom`` must expose each of
    these values, plus ``has_param_lists`` set to ``True``.
    """
    overlay_path = tmp_path / "profiles.yml"
    overlay_yaml = (
        "profiles:\n"
        "  custom:\n"
        "    label: Profil custom\n"
        "    require_manual_mask: true\n"
        "    force_disable_vlm: true\n"
        "    preferred_manual_mask_template: hopital/formulaire.yml\n"
        "    param_lists:\n"
        "      whitelist_phrases:\n"
        "        - DOCUMENT INTERNE\n"
        "      blacklist_force_mask_terms:\n"
        "        - CUSTOM_ETAB\n"
        "      additional_stopwords:\n"
        "        - DIM\n"
        "    dictionaries_overlay:\n"
        "      blacklist:\n"
        "        force_mask_terms:\n"
        "          - CUSTOM_ETAB\n"
    )
    overlay_path.write_text(overlay_yaml, encoding="utf-8")

    custom = list_effective_profiles(overlay_path)["custom"]

    # Scalar fields and boolean flags.
    assert custom["label"] == "Profil custom"
    assert custom["require_manual_mask"] is True
    assert custom["force_disable_vlm"] is True
    assert custom["preferred_manual_mask_template"] == "hopital/formulaire.yml"

    # Parameter lists, including the derived presence flag.
    assert custom["has_param_lists"] is True
    listed_params = custom["param_lists"]
    assert listed_params["whitelist_phrases"] == ["DOCUMENT INTERNE"]
    assert listed_params["blacklist_force_mask_terms"] == ["CUSTOM_ETAB"]
    assert listed_params["additional_stopwords"] == ["DIM"]

    # Nested dictionaries overlay.
    forced_terms = custom["dictionaries_overlay"]["blacklist"]["force_mask_terms"]
    assert "CUSTOM_ETAB" in forced_terms
|
||||
|
||||
|
||||
def test_default_profile_key_keeps_merged_default_when_available(tmp_path: Path):
    """Check the default key when the overlay names a profile it never defines.

    The overlay sets ``default_profile: missing`` but only declares a
    ``custom`` profile; ``get_default_profile_key`` is expected to return
    ``standard_local`` in that situation.
    """
    overlay_path = tmp_path / "profiles.yml"
    overlay_yaml = (
        "default_profile: missing\n"
        "profiles:\n"
        "  custom:\n"
        "    label: Profil custom\n"
    )
    overlay_path.write_text(overlay_yaml, encoding="utf-8")

    resolved_key = get_default_profile_key(overlay_path)
    assert resolved_key == "standard_local"
|
||||
|
||||
|
||||
def test_save_runtime_profile_persists_new_profile_and_default(tmp_path: Path):
    """Save a new ``bureau_strict`` profile as default and read it back.

    The profile is saved with ``set_default=True``. The effective dictionary
    must then report ``bureau_strict`` as default, and the listed profile must
    return the label, flags, preferred template and parameter lists that were
    saved.
    """
    overlay_path = tmp_path / "profiles.yml"

    # Build the profile specification piece by piece before saving it.
    param_lists = {
        "whitelist_phrases": ["VALIDATION DIM"],
        "blacklist_force_mask_terms": ["CHUXX"],
        "additional_stopwords": ["RUM"],
    }
    dictionaries_overlay = {
        "blacklist": {
            "force_mask_terms": ["CHUXX"],
        },
    }
    profile_spec = {
        "label": "Bureau strict",
        "description": "Profil créé depuis la GUI",
        "require_manual_mask": True,
        "force_disable_vlm": True,
        "preferred_manual_mask_template": "chuxx/formulaire.yml",
        "has_preferred_manual_mask_template": True,
        "param_lists": param_lists,
        "has_param_lists": True,
        "dictionaries_overlay": dictionaries_overlay,
    }
    save_runtime_profile("bureau_strict", profile_spec, overlay_path, set_default=True)

    merged = load_effective_profiles_dict(overlay_path)
    assert merged["default_profile"] == "bureau_strict"

    stored = list_effective_profiles(overlay_path)["bureau_strict"]
    assert stored["label"] == "Bureau strict"
    assert stored["require_manual_mask"] is True
    assert stored["force_disable_vlm"] is True
    assert stored["preferred_manual_mask_template"] == "chuxx/formulaire.yml"

    # Compare against fresh literals rather than the dict passed in above.
    stored_lists = stored["param_lists"]
    assert stored_lists["whitelist_phrases"] == ["VALIDATION DIM"]
    assert stored_lists["blacklist_force_mask_terms"] == ["CHUXX"]
    assert stored_lists["additional_stopwords"] == ["RUM"]
|
||||
|
||||
|
||||
def test_set_and_delete_runtime_profile(tmp_path: Path):
    """Save a temporary profile, make it the default, then delete it.

    After ``set_runtime_default_profile`` the default key must be the
    temporary profile. After ``delete_runtime_profile`` the profile must no
    longer be listed and the default key must be ``standard_local``.
    """
    overlay_path = tmp_path / "profiles.yml"
    temporary_key = "profil_temporaire"
    temporary_spec = {
        "label": "Profil temporaire",
        "description": "",
        "require_manual_mask": False,
        "force_disable_vlm": False,
        "dictionaries_overlay": {},
    }
    save_runtime_profile(temporary_key, temporary_spec, overlay_path)

    set_runtime_default_profile(temporary_key, overlay_path)
    assert get_default_profile_key(overlay_path) == temporary_key

    delete_runtime_profile(temporary_key, overlay_path)
    remaining = list_effective_profiles(overlay_path)
    assert temporary_key not in remaining
    assert get_default_profile_key(overlay_path) == "standard_local"
|
||||
@@ -65,7 +65,7 @@ def classify_complexity(stats: dict) -> str:
|
||||
|
||||
|
||||
def main():
|
||||
corpus_dir = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/")
|
||||
corpus_dir = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHUXX_DocJustificatifs (1)/")
|
||||
|
||||
if not corpus_dir.exists():
|
||||
print(f"Erreur : {corpus_dir} n'existe pas")
|
||||
|
||||
@@ -66,7 +66,7 @@ def analyze_dates_in_audit(audit_path: Path, text_path: Path):
|
||||
return dates_info
|
||||
|
||||
def main():
|
||||
prod_dir = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs/anonymise")
|
||||
prod_dir = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHUXX_DocJustificatifs/anonymise")
|
||||
|
||||
print("=" * 80)
|
||||
print("ANALYSE DES DATES MASQUÉES")
|
||||
|
||||
@@ -7,7 +7,7 @@ from pathlib import Path
|
||||
from collections import Counter, defaultdict
|
||||
|
||||
# Répertoire des documents anonymisés
|
||||
ANON_DIR = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs/anonymise")
|
||||
ANON_DIR = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHUXX_DocJustificatifs/anonymise")
|
||||
|
||||
def analyze_leaks(txt_file):
|
||||
"""Détecte les fuites potentielles dans un fichier texte."""
|
||||
@@ -23,7 +23,7 @@ def analyze_leaks(txt_file):
|
||||
"telephone": re.compile(r"\b0[1-9](?:[\s.-]?\d{2}){4}\b"),
|
||||
"email": re.compile(r"\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}\b"),
|
||||
"adresse": re.compile(r"\b\d+\s+(?:rue|avenue|boulevard|place|chemin|impasse)\s+[A-Z]", re.IGNORECASE),
|
||||
"chcb": re.compile(r"\bCHCB\b", re.IGNORECASE),
|
||||
"chuxx": re.compile(r"\bCHUXX\b", re.IGNORECASE),
|
||||
}
|
||||
|
||||
for pattern_name, pattern in patterns.items():
|
||||
|
||||
@@ -8,8 +8,8 @@ sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
import pdfplumber
|
||||
|
||||
# Document original
|
||||
original_pdf = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs/102_23056463/CRH 23056364.pdf")
|
||||
anonymized_txt = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs/anonymise/CRH 23056364.pseudonymise.txt")
|
||||
original_pdf = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHUXX_DocJustificatifs/102_23056463/CRH 23056364.pdf")
|
||||
anonymized_txt = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHUXX_DocJustificatifs/anonymise/CRH 23056364.pseudonymise.txt")
|
||||
|
||||
print("="*80)
|
||||
print("COMPARAISON ORIGINAL vs ANONYMISÉ")
|
||||
|
||||
@@ -46,7 +46,7 @@ def compare_datasets():
|
||||
test_dir = Path("tests/ground_truth/pdfs/baseline_anonymized")
|
||||
|
||||
# Production (régression)
|
||||
prod_dir = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs/anonymise")
|
||||
prod_dir = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHUXX_DocJustificatifs/anonymise")
|
||||
|
||||
print("\n" + "="*80)
|
||||
print("COMPARAISON TEST DATASET vs PRODUCTION")
|
||||
|
||||
@@ -17,11 +17,11 @@ print()
|
||||
|
||||
# Test the pattern
|
||||
test_lines = [
|
||||
"confirmée à 5,7 g ici au CHCB. Appel Dr [NOM], hématologue biologiste",
|
||||
"CHCB :",
|
||||
"CHCB",
|
||||
"au CHCB",
|
||||
"le CHCB est",
|
||||
"confirmée à 5,7 g ici au CHUXX. Appel Dr [NOM], hématologue biologiste",
|
||||
"CHUXX :",
|
||||
"CHUXX",
|
||||
"au CHUXX",
|
||||
"le CHUXX est",
|
||||
]
|
||||
|
||||
for term in cfg.get("blacklist", {}).get("force_mask_terms", []):
|
||||
|
||||
@@ -210,8 +210,8 @@ def main():
|
||||
"""Analyse un échantillon de documents"""
|
||||
|
||||
# Chemins
|
||||
original_dir = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)")
|
||||
anonymized_dir = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs/anonymise")
|
||||
original_dir = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHUXX_DocJustificatifs (1)")
|
||||
anonymized_dir = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHUXX_DocJustificatifs/anonymise")
|
||||
|
||||
# Documents à analyser
|
||||
test_docs = [
|
||||
|
||||
@@ -122,7 +122,7 @@ def analyze_anonymized_text(text_path: Path) -> Dict:
|
||||
def compare_datasets():
|
||||
"""Compare test dataset vs production."""
|
||||
test_dir = Path("tests/ground_truth/pdfs/baseline_anonymized")
|
||||
prod_dir = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs/anonymise")
|
||||
prod_dir = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHUXX_DocJustificatifs/anonymise")
|
||||
|
||||
print("=" * 80)
|
||||
print("ANALYSE DES CAUSES RACINES - RÉGRESSION DE QUALITÉ")
|
||||
|
||||
@@ -4,15 +4,15 @@ Simule l'effet d'une règle d'administration sur un texte ou sur le corpus synth
|
||||
|
||||
Usage :
|
||||
# Appliquer une règle à un texte libre
|
||||
python tools/simulate_admin_rule.py --rule-id rule_chcb_exact_mask \\
|
||||
--text "Consulté au CHCB le 12/06/2024."
|
||||
python tools/simulate_admin_rule.py --rule-id rule_chuxx_exact_mask \\
|
||||
--text "Consulté au CHUXX le 12/06/2024."
|
||||
|
||||
# Appliquer à un fichier texte
|
||||
python tools/simulate_admin_rule.py --rule-id rule_chcb_exact_mask \\
|
||||
python tools/simulate_admin_rule.py --rule-id rule_chuxx_exact_mask \\
|
||||
--file path/to/document.txt
|
||||
|
||||
# Valider la règle sur ses required_case_ids (--corpus)
|
||||
python tools/simulate_admin_rule.py --rule-id rule_chcb_exact_mask --corpus
|
||||
python tools/simulate_admin_rule.py --rule-id rule_chuxx_exact_mask --corpus
|
||||
|
||||
# Valider TOUTES les règles actives sur leurs corpus
|
||||
python tools/simulate_admin_rule.py --all --corpus
|
||||
|
||||
@@ -16,7 +16,7 @@ def test_all_cro():
|
||||
"""Test la propagation des dates de naissance sur tous les CRO."""
|
||||
|
||||
# Chercher tous les CRO dans les 59 OGC
|
||||
ogc_dir = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)")
|
||||
ogc_dir = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHUXX_DocJustificatifs (1)")
|
||||
|
||||
# Trouver tous les CRO (compte rendu opératoire)
|
||||
print("Recherche de tous les CRO dans le corpus...")
|
||||
@@ -59,25 +59,25 @@ def test_all_cro():
|
||||
date_context_pattern = re.compile(r'Né(?:e)?\s+le\s+(\d{1,2}[\s/.\-]+\d{1,2}[\s/.\-]+\d{2,4})', re.IGNORECASE)
|
||||
context_leaks = date_context_pattern.findall(anonymized_text)
|
||||
|
||||
# Scanner "CHCB" en clair
|
||||
chcb_leaks = re.findall(r'\bCHCB\b', anonymized_text)
|
||||
# Scanner "CHUXX" en clair
|
||||
chuxx_leaks = re.findall(r'\bCHUXX\b', anonymized_text)
|
||||
|
||||
# Compter les fuites totales
|
||||
total_leaks = len(context_leaks) + len(chcb_leaks)
|
||||
total_leaks = len(context_leaks) + len(chuxx_leaks)
|
||||
|
||||
status = "✅" if total_leaks == 0 else "❌"
|
||||
print(f" {status} Fuites 'Né(e) le': {len(context_leaks)}, Fuites CHCB: {len(chcb_leaks)}")
|
||||
print(f" {status} Fuites 'Né(e) le': {len(context_leaks)}, Fuites CHUXX: {len(chuxx_leaks)}")
|
||||
|
||||
if context_leaks:
|
||||
print(f" Exemples dates: {context_leaks[:3]}")
|
||||
if chcb_leaks:
|
||||
print(f" Exemples CHCB: {chcb_leaks[:3]}")
|
||||
if chuxx_leaks:
|
||||
print(f" Exemples CHUXX: {chuxx_leaks[:3]}")
|
||||
|
||||
results.append({
|
||||
'file': pdf_path.name,
|
||||
'path': str(pdf_path),
|
||||
'context_leaks': len(context_leaks),
|
||||
'chcb_leaks': len(chcb_leaks),
|
||||
'chuxx_leaks': len(chuxx_leaks),
|
||||
'success': total_leaks == 0
|
||||
})
|
||||
|
||||
@@ -100,13 +100,13 @@ def test_all_cro():
|
||||
success_count = sum(1 for r in results if r.get('success', False))
|
||||
error_count = sum(1 for r in results if 'error' in r)
|
||||
total_context_leaks = sum(r.get('context_leaks', 0) for r in results)
|
||||
total_chcb_leaks = sum(r.get('chcb_leaks', 0) for r in results)
|
||||
total_chuxx_leaks = sum(r.get('chuxx_leaks', 0) for r in results)
|
||||
|
||||
print(f"Documents testés: {len(results)}")
|
||||
print(f"Succès: {success_count}/{len(results)} ({success_count/len(results)*100:.1f}%)")
|
||||
print(f"Erreurs: {error_count}")
|
||||
print(f"Fuites 'Né(e) le' totales: {total_context_leaks}")
|
||||
print(f"Fuites CHCB totales: {total_chcb_leaks}")
|
||||
print(f"Fuites CHUXX totales: {total_chuxx_leaks}")
|
||||
print(f"Temps total: {elapsed_time:.1f}s ({elapsed_time/len(results):.1f}s/doc)")
|
||||
|
||||
# Liste des documents avec fuites
|
||||
@@ -119,7 +119,7 @@ def test_all_cro():
|
||||
print(f"\n{doc['file']}")
|
||||
print(f" Path: {doc['path']}")
|
||||
print(f" Fuites dates: {doc.get('context_leaks', 0)}")
|
||||
print(f" Fuites CHCB: {doc.get('chcb_leaks', 0)}")
|
||||
print(f" Fuites CHUXX: {doc.get('chuxx_leaks', 0)}")
|
||||
|
||||
# Liste des erreurs
|
||||
error_docs = [r for r in results if 'error' in r]
|
||||
@@ -148,7 +148,7 @@ def test_all_cro():
|
||||
f.write(f"Succès: {success_count}/{len(results)} ({success_count/len(results)*100:.1f}%)\n")
|
||||
f.write(f"Erreurs: {error_count}\n")
|
||||
f.write(f"Fuites 'Né(e) le' totales: {total_context_leaks}\n")
|
||||
f.write(f"Fuites CHCB totales: {total_chcb_leaks}\n")
|
||||
f.write(f"Fuites CHUXX totales: {total_chuxx_leaks}\n")
|
||||
f.write(f"Temps total: {elapsed_time:.1f}s ({elapsed_time/len(results):.1f}s/doc)\n\n")
|
||||
|
||||
if failed_docs:
|
||||
@@ -159,7 +159,7 @@ def test_all_cro():
|
||||
f.write(f"{doc['file']}\n")
|
||||
f.write(f" Path: {doc['path']}\n")
|
||||
f.write(f" Fuites dates: {doc.get('context_leaks', 0)}\n")
|
||||
f.write(f" Fuites CHCB: {doc.get('chcb_leaks', 0)}\n\n")
|
||||
f.write(f" Fuites CHUXX: {doc.get('chuxx_leaks', 0)}\n\n")
|
||||
|
||||
if error_docs:
|
||||
f.write("=" * 80 + "\n")
|
||||
|
||||
@@ -16,7 +16,7 @@ def test_date_propagation():
|
||||
"""Test la propagation des dates de naissance sur un CRO."""
|
||||
|
||||
# Chercher un CRO dans les 59 OGC
|
||||
ogc_dir = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)")
|
||||
ogc_dir = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHUXX_DocJustificatifs (1)")
|
||||
|
||||
# Trouver un CRO (compte rendu opératoire)
|
||||
cro_files = []
|
||||
@@ -68,19 +68,19 @@ def test_date_propagation():
|
||||
lines_with_placeholders = [line for line in anonymized_text.split('\n') if placeholder_pattern.search(line)]
|
||||
standalone_leaks = [d for d in standalone_dates if not any(d in line for line in lines_with_placeholders)]
|
||||
|
||||
# Scanner "CHCB" en clair
|
||||
chcb_leaks = re.findall(r'\bCHCB\b', anonymized_text)
|
||||
# Scanner "CHUXX" en clair
|
||||
chuxx_leaks = re.findall(r'\bCHUXX\b', anonymized_text)
|
||||
|
||||
# Compter les fuites totales
|
||||
total_leaks = len(context_leaks) + len(chcb_leaks)
|
||||
total_leaks = len(context_leaks) + len(chuxx_leaks)
|
||||
|
||||
status = "✅" if total_leaks == 0 else "❌"
|
||||
print(f" {status} Fuites 'Né(e) le': {len(context_leaks)}, Fuites CHCB: {len(chcb_leaks)}")
|
||||
print(f" {status} Fuites 'Né(e) le': {len(context_leaks)}, Fuites CHUXX: {len(chuxx_leaks)}")
|
||||
|
||||
if context_leaks:
|
||||
print(f" Exemples dates: {context_leaks[:3]}")
|
||||
if chcb_leaks:
|
||||
print(f" Exemples CHCB: {chcb_leaks[:3]}")
|
||||
if chuxx_leaks:
|
||||
print(f" Exemples CHUXX: {chuxx_leaks[:3]}")
|
||||
|
||||
# Info : dates standalone (pas nécessairement des fuites)
|
||||
if standalone_leaks:
|
||||
@@ -89,7 +89,7 @@ def test_date_propagation():
|
||||
results.append({
|
||||
'file': pdf_path.name,
|
||||
'context_leaks': len(context_leaks),
|
||||
'chcb_leaks': len(chcb_leaks),
|
||||
'chuxx_leaks': len(chuxx_leaks),
|
||||
'standalone_dates': len(standalone_leaks),
|
||||
'success': total_leaks == 0
|
||||
})
|
||||
@@ -109,13 +109,13 @@ def test_date_propagation():
|
||||
|
||||
success_count = sum(1 for r in results if r.get('success', False))
|
||||
total_context_leaks = sum(r.get('context_leaks', 0) for r in results)
|
||||
total_chcb_leaks = sum(r.get('chcb_leaks', 0) for r in results)
|
||||
total_chuxx_leaks = sum(r.get('chuxx_leaks', 0) for r in results)
|
||||
total_standalone = sum(r.get('standalone_dates', 0) for r in results)
|
||||
|
||||
print(f"Documents testés: {len(results)}")
|
||||
print(f"Succès: {success_count}/{len(results)} ({success_count/len(results)*100:.1f}%)")
|
||||
print(f"Fuites 'Né(e) le' totales: {total_context_leaks}")
|
||||
print(f"Fuites CHCB totales: {total_chcb_leaks}")
|
||||
print(f"Fuites CHUXX totales: {total_chuxx_leaks}")
|
||||
print(f"Dates standalone (info): {total_standalone}")
|
||||
|
||||
if success_count == len(results):
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Test CHCB force_term detection on the 2 leaked documents."""
|
||||
"""Test force_term detection on the 2 leaked documents."""
|
||||
|
||||
from pathlib import Path
|
||||
import sys
|
||||
@@ -10,10 +10,10 @@ sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
import anonymizer_core_refactored_onnx as core
|
||||
from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH
|
||||
|
||||
def test_chcb_detection():
|
||||
"""Test CHCB detection on the 2 documents with leaks."""
|
||||
def test_force_term_detection():
|
||||
"""Test force_term detection on the 2 documents with leaks."""
|
||||
|
||||
corpus_dir = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)")
|
||||
corpus_dir = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHUXX_DocJustificatifs (1)")
|
||||
|
||||
# Document 1: trackare-BA148337-23091302
|
||||
doc1_path = None
|
||||
@@ -45,7 +45,7 @@ def test_chcb_detection():
|
||||
print("TEST DOCUMENT 1: trackare-BA148337-23091302")
|
||||
print("=" * 80)
|
||||
|
||||
outdir = Path("test_chcb_leak")
|
||||
outdir = Path("test_force_term_leak")
|
||||
outdir.mkdir(exist_ok=True)
|
||||
|
||||
try:
|
||||
@@ -64,14 +64,14 @@ def test_chcb_detection():
|
||||
txt_file = Path(outputs["text"])
|
||||
content = txt_file.read_text(encoding="utf-8")
|
||||
|
||||
if "CHCB" in content:
|
||||
print("🔴 FUITE DÉTECTÉE: CHCB trouvé dans le texte anonymisé")
|
||||
if "CHUXX" in content:
|
||||
print("🔴 FUITE DÉTECTÉE: CHUXX trouvé dans le texte anonymisé")
|
||||
# Trouver le contexte
|
||||
for i, line in enumerate(content.split("\n"), 1):
|
||||
if "CHCB" in line:
|
||||
if "CHUXX" in line:
|
||||
print(f" Ligne {i}: {line.strip()}")
|
||||
else:
|
||||
print("✅ Aucune fuite CHCB")
|
||||
print("✅ Aucune fuite CHUXX")
|
||||
|
||||
# Vérifier l'audit
|
||||
import json
|
||||
@@ -80,10 +80,10 @@ def test_chcb_detection():
|
||||
with open(audit_file, 'r', encoding='utf-8') as f:
|
||||
for line in f:
|
||||
obj = json.loads(line)
|
||||
if obj.get("kind") == "force_term" and "CHCB" in obj.get("value", ""):
|
||||
if obj.get("kind") == "force_term" and "CHUXX" in obj.get("value", ""):
|
||||
force_term_count += 1
|
||||
|
||||
print(f"📊 Détections force_term CHCB: {force_term_count}")
|
||||
print(f"📊 Détections force_term CHUXX: {force_term_count}")
|
||||
|
||||
except Exception as e:
|
||||
print(f"❌ Erreur: {e}")
|
||||
@@ -113,14 +113,14 @@ def test_chcb_detection():
|
||||
txt_file = Path(outputs["text"])
|
||||
content = txt_file.read_text(encoding="utf-8")
|
||||
|
||||
if "CHCB" in content:
|
||||
print("🔴 FUITE DÉTECTÉE: CHCB trouvé dans le texte anonymisé")
|
||||
if "CHUXX" in content:
|
||||
print("🔴 FUITE DÉTECTÉE: CHUXX trouvé dans le texte anonymisé")
|
||||
# Trouver le contexte
|
||||
for i, line in enumerate(content.split("\n"), 1):
|
||||
if "CHCB" in line:
|
||||
if "CHUXX" in line:
|
||||
print(f" Ligne {i}: {line.strip()}")
|
||||
else:
|
||||
print("✅ Aucune fuite CHCB")
|
||||
print("✅ Aucune fuite CHUXX")
|
||||
|
||||
# Vérifier l'audit
|
||||
import json
|
||||
@@ -129,10 +129,10 @@ def test_chcb_detection():
|
||||
with open(audit_file, 'r', encoding='utf-8') as f:
|
||||
for line in f:
|
||||
obj = json.loads(line)
|
||||
if obj.get("kind") == "force_term" and "CHCB" in obj.get("value", ""):
|
||||
if obj.get("kind") == "force_term" and "CHUXX" in obj.get("value", ""):
|
||||
force_term_count += 1
|
||||
|
||||
print(f"📊 Détections force_term CHCB: {force_term_count}")
|
||||
print(f"📊 Détections force_term CHUXX: {force_term_count}")
|
||||
|
||||
except Exception as e:
|
||||
print(f"❌ Erreur: {e}")
|
||||
@@ -140,4 +140,4 @@ def test_chcb_detection():
|
||||
traceback.print_exc()
|
||||
|
||||
if __name__ == "__main__":
|
||||
test_chcb_detection()
|
||||
test_force_term_detection()
|
||||
@@ -88,7 +88,7 @@ import re
|
||||
leak_count = 0
|
||||
patterns = {
|
||||
"date_naissance": re.compile(r"(?:n[ée]+\s+le|DDN)\s*:?\s*\d{1,2}[/.\-]\d{1,2}[/.\-]\d{2,4}", re.IGNORECASE),
|
||||
"chcb": re.compile(r"\bCHCB\b", re.IGNORECASE),
|
||||
"chuxx": re.compile(r"\bCHUXX\b", re.IGNORECASE),
|
||||
}
|
||||
|
||||
for txt_file in out_dir.glob("*.pseudonymise.txt"):
|
||||
|
||||
@@ -24,9 +24,9 @@ def test_phase1_corrections():
|
||||
|
||||
# Documents de test (5 documents représentatifs)
|
||||
test_docs = [
|
||||
"/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs/008_23001234/CRH 23001234.pdf",
|
||||
"/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs/021_23012345/CRO 23012345.pdf",
|
||||
"/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs/033_23023456/trackare-23023456-12345678.pdf",
|
||||
"/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHUXX_DocJustificatifs/008_23001234/CRH 23001234.pdf",
|
||||
"/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHUXX_DocJustificatifs/021_23012345/CRO 23012345.pdf",
|
||||
"/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHUXX_DocJustificatifs/033_23023456/trackare-23023456-12345678.pdf",
|
||||
]
|
||||
|
||||
print("=" * 80)
|
||||
|
||||
@@ -23,7 +23,7 @@ def validate_corpus_sample():
|
||||
"""Valide l'anonymisation sur un échantillon du corpus."""
|
||||
|
||||
# Répertoires
|
||||
corpus_dir = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)")
|
||||
corpus_dir = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHUXX_DocJustificatifs (1)")
|
||||
output_dir = Path("corpus_validation_sample")
|
||||
output_dir.mkdir(exist_ok=True)
|
||||
|
||||
@@ -221,7 +221,7 @@ def leak_check(output_dir: Path):
|
||||
# Patterns à vérifier
|
||||
patterns = {
|
||||
"date_naissance_contexte": re.compile(r"(?:n[ée]+\s+le|DDN)\s*:?\s*\d{1,2}[/.\-]\d{1,2}[/.\-]\d{2,4}", re.IGNORECASE),
|
||||
"chcb": re.compile(r"\bCHCB\b", re.IGNORECASE),
|
||||
"chuxx": re.compile(r"\bCHUXX\b", re.IGNORECASE),
|
||||
}
|
||||
|
||||
leaks = defaultdict(list)
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
Validation sur le corpus complet (59 OGC / 130 PDFs).
|
||||
|
||||
Ce script anonymise tous les documents du corpus et vérifie :
|
||||
- Absence de fuites (dates de naissance, CHCB, etc.)
|
||||
- Absence de fuites (dates de naissance, CHUXX, etc.)
|
||||
- Statistiques de détection par type
|
||||
- Performances (temps de traitement)
|
||||
"""
|
||||
@@ -24,7 +24,7 @@ def validate_full_corpus():
|
||||
"""Valide l'anonymisation sur le corpus complet."""
|
||||
|
||||
# Répertoires
|
||||
corpus_dir = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)")
|
||||
corpus_dir = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHUXX_DocJustificatifs (1)")
|
||||
output_dir = Path("corpus_validation")
|
||||
output_dir.mkdir(exist_ok=True)
|
||||
|
||||
@@ -177,7 +177,7 @@ def leak_check(output_dir: Path):
|
||||
# Patterns à vérifier
|
||||
patterns = {
|
||||
"date_naissance": re.compile(r"(?:n[ée]+\s+le|DDN)\s*:?\s*\d{1,2}[/.\-]\d{1,2}[/.\-]\d{2,4}", re.IGNORECASE),
|
||||
"chcb": re.compile(r"\bCHCB\b", re.IGNORECASE),
|
||||
"chuxx": re.compile(r"\bCHUXX\b", re.IGNORECASE),
|
||||
"date_format": re.compile(r"\b\d{2}[/.\-]\d{2}[/.\-]\d{4}\b"),
|
||||
}
|
||||
|
||||
|
||||
@@ -14,7 +14,7 @@ from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH
|
||||
from anonymizer_core_refactored_onnx import process_pdf
|
||||
|
||||
# 5 documents du corpus production (OGC 008)
|
||||
corpus_dir = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs")
|
||||
corpus_dir = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHUXX_DocJustificatifs")
|
||||
test_docs = [
|
||||
corpus_dir / "008_23001234" / "CRH 23001234.pdf",
|
||||
corpus_dir / "008_23001234" / "CRO 23001234.pdf",
|
||||
|
||||
Reference in New Issue
Block a user