feat: serveur API FastAPI pour microservice anonymisation

Expose le pipeline complet d'anonymisation (regex + NER ensemble + rescan)
via REST API sur port 8200. Chargement des 3 modèles NER au démarrage
(EDS-Pseudo, CamemBERT-bio ONNX, GLiNER). Endpoints: /anonymize/text,
/anonymize/pdf, /health. Utilisé par T2A v2 comme brique externe.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-10 02:04:52 +01:00
parent 7a2af5c905
commit 274e2fa586

328
server.py Normal file
View File

@@ -0,0 +1,328 @@
#!/usr/bin/env python3
"""
Serveur API d'anonymisation — Brique microservice.
===================================================
Expose le pipeline complet d'anonymisation (regex + NER ensemble + VLM)
via une API REST FastAPI.
Endpoints:
POST /anonymize/pdf — Anonymise un PDF complet (texte + redaction)
POST /anonymize/text — Anonymise du texte brut (regex + NER + rescan)
GET /health — Statut des modèles chargés
Lancement:
cd /home/dom/ai/anonymisation
.venv/bin/uvicorn server:app --host 0.0.0.0 --port 8200
ou en production:
.venv/bin/uvicorn server:app --host 0.0.0.0 --port 8200 --workers 1
"""
from __future__ import annotations
import json
import logging
import tempfile
import time
from pathlib import Path
from typing import Optional
from fastapi import FastAPI, File, Form, UploadFile
from fastapi.responses import JSONResponse
# Configure the root logger once at import time: INFO and above, with a
# timestamp, the level, and the emitting logger's name on every record.
logging.basicConfig(
    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
    level=logging.INFO,
)

# Named logger used by the code in this module.
log = logging.getLogger("anon-server")
# ---------------------------------------------------------------------------
# Pipeline imports (eager: resolved when this module is imported)
# ---------------------------------------------------------------------------
from anonymizer_core_refactored_onnx import (
AnonResult,
PiiHit,
anonymise_document_regex,
apply_eds_pseudo_on_narrative,
apply_hf_ner_on_narrative,
load_dictionaries,
process_pdf,
selective_rescan,
)
# Managers NER (optionnels)
try:
from eds_pseudo_manager import EdsPseudoManager
except ImportError:
EdsPseudoManager = None
try:
from camembert_ner_manager import CamembertNerManager
except ImportError:
CamembertNerManager = None
try:
from gliner_manager import GlinerManager
except ImportError:
GlinerManager = None
try:
from vlm_manager import VlmManager
except ImportError:
VlmManager = None
# ---------------------------------------------------------------------------
# Chargement des modèles au démarrage (singleton)
# ---------------------------------------------------------------------------
# Process-wide singletons. They all start as None and are assigned by
# _load_models(); None means "not loaded / unavailable".
_cfg = None  # value returned by load_dictionaries()
_eds_manager = None  # EdsPseudoManager instance
_camembert_manager = None  # CamembertNerManager instance
_gliner_manager = None  # GlinerManager instance
_vlm_manager = None  # VlmManager instance
def _load_manager(manager_cls, loaded_msg, failed_msg):
    """Instantiate and load one optional NER manager.

    Args:
        manager_cls: Manager class, or None when its import failed.
        loaded_msg: Message logged at INFO level once ``load()`` succeeded.
        failed_msg: Message (with one ``%s`` slot for the exception) logged
            at WARNING level when instantiation or loading raises.

    Returns:
        The loaded manager instance, or None when the class is unavailable
        or loading failed.
    """
    if manager_cls is None:
        return None
    try:
        manager = manager_cls()
        manager.load()
    except Exception as e:
        # Best effort: a broken model must not prevent the server from
        # starting with the remaining ones.
        log.warning(failed_msg, e)
        return None
    log.info(loaded_msg)
    return manager


def _load_models():
    """Load the configuration and every available model manager.

    Populates the module-level singletons ``_cfg``, ``_eds_manager``,
    ``_camembert_manager``, ``_gliner_manager`` and ``_vlm_manager``.
    Every model is optional: a manager whose import failed is skipped, and
    a manager that raises while loading is logged and left as None.

    Raises:
        Whatever ``load_dictionaries`` raises; the configuration load is not
        guarded.
    """
    global _eds_manager, _camembert_manager, _gliner_manager, _vlm_manager, _cfg

    _cfg = load_dictionaries(Path(__file__).parent / "config" / "dictionnaires.yml")

    # EDS-Pseudo (F1=0.97 per the original author's note)
    _eds_manager = _load_manager(
        EdsPseudoManager,
        "EDS-Pseudo chargé",
        "EDS-Pseudo indisponible: %s",
    )

    # CamemBERT-bio ONNX (F1=0.90 per the original author's note)
    _camembert_manager = _load_manager(
        CamembertNerManager,
        "CamemBERT-bio ONNX chargé",
        "CamemBERT-bio indisponible: %s",
    )

    # GLiNER (cross-validation)
    _gliner_manager = _load_manager(
        GlinerManager,
        "GLiNER chargé",
        "GLiNER indisponible: %s",
    )

    # VLM via Ollama (scanned documents). Unlike the NER managers it has no
    # explicit load() call here: it is kept only if it reports itself loaded.
    _vlm_manager = None
    if VlmManager is not None:
        try:
            candidate = VlmManager()
            if candidate.is_loaded():
                _vlm_manager = candidate
                log.info("VLM Ollama chargé")
            else:
                log.warning("VLM Ollama indisponible: modèle non chargé")
        except Exception as e:
            # Previously swallowed silently; log it like the other models so
            # a missing VLM is visible in the startup output.
            log.warning("VLM Ollama indisponible: %s", e)
# ---------------------------------------------------------------------------
# FastAPI
# ---------------------------------------------------------------------------
# ASGI application object; the launch commands refer to it as "server:app".
app = FastAPI(
    version="1.0.0",
    title="Anonymisation API",
    description="Pipeline d'anonymisation de documents médicaux français",
)
# NOTE(review): recent FastAPI releases deprecate on_event() in favour of a
# lifespan handler passed to FastAPI(...) — consider migrating.
@app.on_event("startup")
async def startup():
    """Load all models once when the application starts and log the duration.

    ``_load_models()`` is called synchronously, so the event loop is blocked
    until it returns.
    """
    log.info("Chargement des modèles NER...")
    # perf_counter() is monotonic, so the reported duration cannot be skewed
    # by a wall-clock adjustment during loading.
    started = time.perf_counter()
    _load_models()
    log.info("Modèles chargés en %.1fs", time.perf_counter() - started)
# ---------------------------------------------------------------------------
# GET /health
# ---------------------------------------------------------------------------
@app.get("/health")
async def health():
    """Report the server status and which models are available.

    Returns:
        A dict with a constant ``"status": "ok"`` and a ``"models"`` mapping
        of model name to availability flag.
    """

    def _is_ready(manager):
        # A manager counts as ready only if it exists and reports itself loaded.
        return manager is not None and manager.is_loaded()

    models = {
        "eds_pseudo": _is_ready(_eds_manager),
        "camembert_bio": _is_ready(_camembert_manager),
        "gliner": _is_ready(_gliner_manager),
        # For the VLM only the presence of the manager is checked.
        "vlm": _vlm_manager is not None,
    }
    return {"status": "ok", "models": models}
# ---------------------------------------------------------------------------
# POST /anonymize/text
# ---------------------------------------------------------------------------
@app.post("/anonymize/text")
async def anonymize_text(
    text: str = Form(...),
    use_ner: bool = Form(True),
):
    """Anonymise raw text.

    Pipeline: regex pass -> NER ensemble (when enabled and available) ->
    selective rescan.

    Args:
        text: Medical text to anonymise.
        use_ner: Enable the NER ensemble (EDS-Pseudo, with the GLiNER and
            CamemBERT managers passed along).

    Returns:
        A dict with ``text_anonymized``, ``audit`` (list of detected PII
        entries) and ``stats``.
    """
    # NOTE(review): the pipeline calls below are synchronous, so this async
    # handler blocks the event loop while a request is being processed.
    started = time.perf_counter()
    cfg = _cfg or load_dictionaries(None)

    # Decide once whether the NER phase runs, and report exactly that value
    # in the stats. The flag used to be recomputed without the is_loaded()
    # check, so it could claim NER was active when the phase was skipped.
    ner_active = bool(
        use_ner and _eds_manager is not None and _eds_manager.is_loaded()
    )

    # Phase 1: regex. The text is treated as a one-page document with no
    # table lines.
    pages_text = [text]
    tables_lines: list[list[str]] = [[]]
    anon = anonymise_document_regex(pages_text, tables_lines, cfg)
    final_text = anon.text_out

    # Phase 2: NER ensemble
    if ner_active:
        final_text, hf_hits = apply_eds_pseudo_on_narrative(
            final_text, cfg, _eds_manager,
            gliner_mgr=_gliner_manager,
            camembert_mgr=_camembert_manager,
        )
        anon.audit.extend(hf_hits)

    # Phase 3: selective rescan
    final_text = selective_rescan(final_text, cfg=cfg)

    elapsed = time.perf_counter() - started

    # Entries with page == -1 are omitted (global propagations, per the
    # original author's note).
    audit_list = [
        {
            "kind": hit.kind,
            "original": hit.original,
            "placeholder": hit.placeholder,
            "page": hit.page,
        }
        for hit in anon.audit
        if hit.page != -1
    ]

    return {
        "text_anonymized": final_text,
        "audit": audit_list,
        "stats": {
            "pii_detected": len(audit_list),
            "elapsed_seconds": round(elapsed, 3),
            "ner_active": ner_active,
        },
    }
# ---------------------------------------------------------------------------
# POST /anonymize/pdf
# ---------------------------------------------------------------------------
@app.post("/anonymize/pdf")
async def anonymize_pdf(
    file: UploadFile = File(...),
    use_ner: bool = Form(True),
    vector_redaction: bool = Form(True),
    raster_redaction: bool = Form(False),
):
    """Anonymise a complete PDF document.

    The upload is written to a temporary file and handed to ``process_pdf``;
    the anonymised text and the audit trail are then read back from the
    files it reports.

    Args:
        file: PDF file to anonymise.
        use_ner: Enable the NER ensemble.
        vector_redaction: Produce the vector-redacted PDF.
        raster_redaction: Also produce the raster-redacted PDF (heavier).

    Returns:
        A dict with ``text_anonymized``, ``audit``, ``files`` (the mapping
        returned by ``process_pdf``) and ``stats``.
    """
    # NOTE(review): process_pdf is called synchronously, so this async
    # handler blocks the event loop for the whole duration of the processing.
    started = time.perf_counter()

    # Read the upload before creating anything on disk, so a failed read
    # cannot leave an orphaned temporary file behind.
    payload = await file.read()

    tmp = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False)
    tmp_path = Path(tmp.name)
    try:
        # Closing the handle flushes the PDF to disk before process_pdf
        # opens it by path. The write is inside the try so that a failure
        # here still removes the temporary file.
        with tmp:
            tmp.write(payload)

        # NOTE(review): out_dir is intentionally kept (the response returns
        # paths into it) and is never removed here, so results accumulate
        # in the temp directory.
        out_dir = tmp_path.parent / f"anon_{tmp_path.stem}"
        out_dir.mkdir(exist_ok=True)

        # NER managers are only handed to the pipeline when NER is requested.
        ner_mgr = _eds_manager if use_ner else None
        ner_active = use_ner and ner_mgr is not None

        outputs = process_pdf(
            pdf_path=tmp_path,
            out_dir=out_dir,
            make_vector_redaction=vector_redaction,
            also_make_raster_burn=raster_redaction,
            config_path=Path(__file__).parent / "config" / "dictionnaires.yml",
            use_hf=ner_active,
            ner_manager=ner_mgr,
            gliner_manager=_gliner_manager if use_ner else None,
            camembert_manager=_camembert_manager if use_ner else None,
            vlm_manager=_vlm_manager,
        )

        # Read the anonymised text, if process_pdf produced one.
        text_path = outputs.get("text", "")
        text_anonymized = ""
        if text_path and Path(text_path).exists():
            text_anonymized = Path(text_path).read_text(encoding="utf-8")

        # Read the audit trail: one JSON document per non-blank line.
        audit_path = outputs.get("audit", "")
        audit_list = []
        if audit_path and Path(audit_path).exists():
            with open(audit_path, encoding="utf-8") as f:
                audit_list = [json.loads(raw) for raw in f if raw.strip()]

        elapsed = time.perf_counter() - started

        return {
            "text_anonymized": text_anonymized,
            "audit": audit_list,
            "files": outputs,
            "stats": {
                "pii_detected": len(audit_list),
                "elapsed_seconds": round(elapsed, 3),
                "ner_active": ner_active,
                "filename": file.filename,
            },
        }
    finally:
        # Remove the uploaded copy; the result directory is kept.
        tmp_path.unlink(missing_ok=True)
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import uvicorn

    # Single worker, because the models are held in memory.
    server_options = {
        "host": "0.0.0.0",
        "port": 8200,
        "workers": 1,
        "log_level": "info",
    }
    uvicorn.run("server:app", **server_options)