chore: Avant implémentation Phase 1 corrections qualité
This commit is contained in:
@@ -97,6 +97,40 @@ def _load_edsnlp_drug_names() -> set:
|
||||
return set()
|
||||
|
||||
|
||||
# ----------------- Medical whitelists -----------------
# Module-level lookup sets; both start empty and are rebound by
# load_medical_whitelists().
|
||||
_MEDICAL_STRUCTURAL_TERMS = set()
|
||||
_MEDICATION_WHITELIST = set()
|
||||
|
||||
def load_medical_whitelists() -> None:
    """Load the medical whitelists (structural terms + medication names).

    Rebinds the module-level sets ``_MEDICAL_STRUCTURAL_TERMS`` and
    ``_MEDICATION_WHITELIST``. Structural terms are read from the
    ``medical_structural_terms`` key of ``config/medical_terms_whitelist.yml``
    and stored lower-cased. Medication names come from
    ``_load_edsnlp_drug_names()`` plus a small hard-coded supplement.

    Loading is best-effort: a missing or unreadable config file leaves
    ``_MEDICAL_STRUCTURAL_TERMS`` untouched and only emits a warning.
    """
    global _MEDICAL_STRUCTURAL_TERMS, _MEDICATION_WHITELIST

    # 1. Structural medical terms.
    # NOTE: the path is relative, so it is resolved against the current
    # working directory of the process.
    config_path = Path("config/medical_terms_whitelist.yml")
    # `yaml` is tested for truthiness — presumably it is None when PyYAML
    # could not be imported (confirm against the import section).
    if config_path.exists() and yaml:
        try:
            with config_path.open('r', encoding='utf-8') as f:
                # safe_load returns None for an empty document.
                data = yaml.safe_load(f) or {}
            if not isinstance(data, dict):
                raise ValueError(
                    f"expected a mapping at top level, got {type(data).__name__}"
                )
            # A key that is present but null yields None: treat it as empty.
            terms = data.get('medical_structural_terms') or []
            # Ignore non-string entries instead of discarding the whole list.
            _MEDICAL_STRUCTURAL_TERMS = {
                t.lower() for t in terms if isinstance(t, str)
            }
            log.info(f"Whitelist termes médicaux chargée: {len(_MEDICAL_STRUCTURAL_TERMS)} termes")
        except Exception as e:
            # Deliberately broad: this runs at import time and a bad config
            # file must not prevent the module from loading.
            log.warning(f"Erreur chargement whitelist médicale: {e}")

    # 2. Medication whitelist.
    # Copy the helper's result so the update below cannot mutate an object
    # the helper may be sharing or caching.
    medications = set(_load_edsnlp_drug_names())
    # Medications added on top of the helper's list.
    additional_meds = {
        "idacio", "salazopyrine", "infliximab", "apranax",
        "ketoprofene", "prevenar", "pneumovax", "bétadine",
    }
    medications.update(additional_meds)
    _MEDICATION_WHITELIST = medications
    log.info(f"Whitelist médicaments chargée: {len(_MEDICATION_WHITELIST)} médicaments")
|
||||
|
||||
# Load the whitelists when the module is imported
|
||||
load_medical_whitelists()
|
||||
|
||||
|
||||
# ----------------- Defaults & Config -----------------
|
||||
DEFAULTS_CFG = {
|
||||
"version": 1,
|
||||
@@ -896,7 +930,18 @@ def _mask_line_by_regex(line: str, audit: List[PiiHit], page_idx: int, cfg: Dict
|
||||
|
||||
# Services hospitaliers (service de Cardiologie, unité de soins palliatifs, etc.)
|
||||
def _repl_service(m: re.Match) -> str:
    """Return the replacement text for one hospital-service match.

    The match is kept verbatim when it is a whitelisted structural term or
    when the text just before it contains a role keyword (e.g. "chef de",
    "praticien"). Otherwise it is replaced by ``PLACEHOLDERS["MASK"]`` and
    an ``ETAB`` hit is appended to ``audit``.

    The audit entry is written only when masking actually happens, so
    preserved matches are never logged and masked ones are logged once.
    """
    full_match = m.group(0)

    # Whitelisted structural terms are kept as-is.
    if full_match.lower() in _MEDICAL_STRUCTURAL_TERMS:
        return full_match

    # Inspect up to 25 characters preceding the match, lower-cased.
    # `line` comes from the enclosing scope; presumably it is the same
    # string the regex is being applied to, so m.start() indexes into it.
    start_pos = m.start()
    context_before = line[max(0, start_pos - 25):start_pos].lower()

    # Role keywords (plain substring test) that cause the match to be kept.
    preserve_patterns = ('chef de', 'praticien', 'ancien', 'assistant', 'médecin', 'interne')
    if any(pattern in context_before for pattern in preserve_patterns):
        return full_match

    audit.append(PiiHit(page_idx, "ETAB", full_match, PLACEHOLDERS["MASK"]))
    return PLACEHOLDERS["MASK"]
|
||||
line = RE_SERVICE.sub(_repl_service, line)
|
||||
|
||||
@@ -1414,6 +1459,11 @@ def _mask_with_eds_pseudo(text: str, ents: List[Dict[str, Any]], cfg: Dict[str,
|
||||
# Filtrer les dosages détectés comme noms (ex: "10MG", "300UI", "1 000")
|
||||
if re.match(r"^\d[\d\s]*(?:mg|MG|ml|ML|UI|µg|mcg|g|kg|%)?$", w.strip()):
|
||||
continue
|
||||
# CORRECTION 1.2: Filtrer les médicaments détectés comme NOM/PRENOM
|
||||
if label in ("NOM", "PRENOM"):
|
||||
# Vérifier si c'est un médicament connu
|
||||
if w.lower() in _MEDICATION_WHITELIST:
|
||||
continue
|
||||
# Règles de validation heuristiques par type d'entité
|
||||
if label in ("NOM", "PRENOM"):
|
||||
# Rejeter si le contexte précédent (15 chars) contient un dosage
|
||||
|
||||
Reference in New Issue
Block a user