chore: Avant implémentation Phase 1 corrections qualité

This commit is contained in:
2026-03-02 23:34:06 +01:00
parent 93617bab55
commit 47a71df930
8 changed files with 1157 additions and 2 deletions

View File

@@ -97,6 +97,40 @@ def _load_edsnlp_drug_names() -> set:
return set()
# ----------------- Medical Whitelists -----------------
# Lower-cased structural medical terms that must not be masked.
# Empty until load_medical_whitelists() has run.
_MEDICAL_STRUCTURAL_TERMS = set()
# Known medication names (loader output plus a few manual additions).
# Empty until load_medical_whitelists() has run.
_MEDICATION_WHITELIST = set()
def load_medical_whitelists():
    """Load the medical whitelists (structural terms + medication names).

    Rebinds the module-level sets ``_MEDICAL_STRUCTURAL_TERMS`` and
    ``_MEDICATION_WHITELIST``. Loading is best-effort: a missing, empty or
    malformed YAML file never raises, it only leaves the structural-term
    whitelist empty/unchanged and logs a warning where appropriate.

    Returns:
        None. The results are published through the two module globals.
    """
    global _MEDICAL_STRUCTURAL_TERMS, _MEDICATION_WHITELIST

    # 1. Structural medical terms, read from the YAML configuration file.
    #    NOTE(review): the path is relative to the current working directory.
    config_path = Path("config/medical_terms_whitelist.yml")
    if config_path.exists() and yaml:
        try:
            with open(config_path, 'r', encoding='utf-8') as f:
                data = yaml.safe_load(f)
            # An empty document parses to None and the key may be null:
            # both mean "no terms" rather than an error.
            terms = data.get('medical_structural_terms') if isinstance(data, dict) else None
            if isinstance(terms, str):
                # A scalar would otherwise be iterated character by character.
                terms = [terms]
            # Ignore non-string / blank entries instead of losing the whole list.
            _MEDICAL_STRUCTURAL_TERMS = {
                t.strip().lower()
                for t in (terms or [])
                if isinstance(t, str) and t.strip()
            }
            log.info(f"Whitelist termes médicaux chargée: {len(_MEDICAL_STRUCTURAL_TERMS)} termes")
        except Exception as e:
            # Deliberately broad: this runs at import time and must not
            # prevent the module from loading.
            log.warning(f"Erreur chargement whitelist médicale: {e}")

    # 2. Medication whitelist.
    # Build a fresh set so the manual additions below never mutate the object
    # returned by the loader, and lower-case the names because lookups in this
    # file compare against ``w.lower()``.
    drug_names = _load_edsnlp_drug_names() or ()
    _MEDICATION_WHITELIST = {
        name.lower() for name in drug_names if isinstance(name, str)
    }

    # Medications missing from the loaded list.
    additional_meds = {
        "idacio", "salazopyrine", "infliximab", "apranax",
        "ketoprofene", "prevenar", "pneumovax", "bétadine",
    }
    _MEDICATION_WHITELIST.update(additional_meds)
    log.info(f"Whitelist médicaments chargée: {len(_MEDICATION_WHITELIST)} médicaments")
# Populate the whitelists once, when the module is imported.
load_medical_whitelists()
# ----------------- Defaults & Config -----------------
DEFAULTS_CFG = {
"version": 1,
@@ -896,7 +930,18 @@ def _mask_line_by_regex(line: str, audit: List[PiiHit], page_idx: int, cfg: Dict
# Services hospitaliers (service de Cardiologie, unité de soins palliatifs, etc.)
def _repl_service(m: re.Match) -> str:
    """Substitution callback for ``RE_SERVICE``: mask a hospital-service mention.

    The matched text is returned unchanged (and nothing is audited) when it
    is a whitelisted structural term, or when the text just before it shows
    a role/title context. Otherwise one ``ETAB`` hit is appended to ``audit``
    and the mask placeholder is returned.

    Args:
        m: Match produced by ``RE_SERVICE`` on the line being processed.

    Returns:
        Either the original matched text or ``PLACEHOLDERS["MASK"]``.
    """
    full_match = m.group(0)

    # Structural terms from the whitelist are kept verbatim.
    if full_match.lower() in _MEDICAL_STRUCTURAL_TERMS:
        return full_match

    # Inspect up to 25 characters preceding the match ("Chef de", "Praticien", ...).
    # ``m.string`` is the exact string being substituted, so the offsets
    # always refer to the right text.
    start_pos = m.start()
    context_before = m.string[max(0, start_pos - 25):start_pos].lower()

    # Role/title markers: a service named right after one of these is kept.
    preserve_patterns = ('chef de', 'praticien', 'ancien', 'assistant', 'médecin', 'interne')
    if any(pattern in context_before for pattern in preserve_patterns):
        return full_match

    # Record the hit only here, once, when the text is actually masked.
    audit.append(PiiHit(page_idx, "ETAB", full_match, PLACEHOLDERS["MASK"]))
    return PLACEHOLDERS["MASK"]
line = RE_SERVICE.sub(_repl_service, line)
@@ -1414,6 +1459,11 @@ def _mask_with_eds_pseudo(text: str, ents: List[Dict[str, Any]], cfg: Dict[str,
# Filtrer les dosages détectés comme noms (ex: "10MG", "300UI", "1 000")
if re.match(r"^\d[\d\s]*(?:mg|MG|ml|ML|UI|µg|mcg|g|kg|%)?$", w.strip()):
continue
# CORRECTION 1.2: Filtrer les médicaments détectés comme NOM/PRENOM
if label in ("NOM", "PRENOM"):
# Vérifier si c'est un médicament connu
if w.lower() in _MEDICATION_WHITELIST:
continue
# Règles de validation heuristiques par type d'entité
if label in ("NOM", "PRENOM"):
# Rejeter si le contexte précédent (15 chars) contient un dosage