feat: Filtre hospitalier pour éliminer les faux positifs
- Ajout de config/hospital_stopwords.yml avec les adresses et téléphones des hôpitaux
- Ajout de detectors/hospital_filter.py pour filtrer les faux positifs (FP)
- Intégration dans anonymizer_core_refactored_onnx.py
- Test sur un document : 40 -> 32 détections (-8 FP)
- Élimine : adresses d'hôpitaux, codes postaux CEDEX, épisodes dans les noms de fichiers
This commit is contained in:
@@ -48,6 +48,13 @@ try:
|
||||
_DOCTR_AVAILABLE = True
|
||||
except Exception:
|
||||
_doctr_ocr_predictor = None # type: ignore
|
||||
|
||||
try:
|
||||
from detectors.hospital_filter import HospitalFilter
|
||||
_HOSPITAL_FILTER_AVAILABLE = True
|
||||
except Exception:
|
||||
_HOSPITAL_FILTER_AVAILABLE = False
|
||||
HospitalFilter = None # type: ignore
|
||||
_DOCTR_AVAILABLE = False
|
||||
|
||||
# NER manager (facultatif)
|
||||
@@ -2067,6 +2074,44 @@ def process_pdf(
|
||||
if ocr_used:
|
||||
anon.audit.insert(0, PiiHit(page=-1, kind="OCR_USED", original="docTR", placeholder=""))
|
||||
|
||||
# Filtrer les faux positifs hospitaliers
|
||||
if _HOSPITAL_FILTER_AVAILABLE:
|
||||
try:
|
||||
hospital_filter = HospitalFilter()
|
||||
original_count = len(anon.audit)
|
||||
|
||||
# Convertir les PiiHit en format dict pour le filtre
|
||||
detections = [
|
||||
{
|
||||
'kind': hit.kind,
|
||||
'original': hit.original,
|
||||
'page': hit.page
|
||||
}
|
||||
for hit in anon.audit
|
||||
]
|
||||
|
||||
# Filtrer
|
||||
filtered_detections = hospital_filter.filter_detections(detections, pdf_path.name)
|
||||
|
||||
# Reconstruire la liste anon.audit
|
||||
filtered_audit = []
|
||||
for det in filtered_detections:
|
||||
# Trouver le PiiHit original correspondant
|
||||
for hit in anon.audit:
|
||||
if (hit.kind == det['kind'] and
|
||||
hit.original == det['original'] and
|
||||
hit.page == det['page']):
|
||||
filtered_audit.append(hit)
|
||||
break
|
||||
|
||||
anon.audit = filtered_audit
|
||||
filtered_count = original_count - len(anon.audit)
|
||||
|
||||
if filtered_count > 0:
|
||||
log.info("Filtre hospitalier : %d faux positifs éliminés", filtered_count)
|
||||
except Exception as e:
|
||||
log.warning("Erreur lors du filtrage hospitalier : %s", e)
|
||||
|
||||
# Sauvegardes
|
||||
base = pdf_path.stem
|
||||
txt_path = out_dir / f"{base}.pseudonymise.txt"
|
||||
|
||||
Reference in New Issue
Block a user