feat: serveur API FastAPI pour microservice anonymisation
Expose le pipeline complet d'anonymisation (regex + NER ensemble + rescan) via REST API sur port 8200. Chargement des 3 modèles NER au démarrage (EDS-Pseudo, CamemBERT-bio ONNX, GLiNER). Endpoints: /anonymize/text, /anonymize/pdf, /health. Utilisé par T2A v2 comme brique externe. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
328
server.py
Normal file
328
server.py
Normal file
@@ -0,0 +1,328 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Serveur API d'anonymisation — Brique microservice.
|
||||||
|
===================================================
|
||||||
|
Expose le pipeline complet d'anonymisation (regex + NER ensemble + VLM)
|
||||||
|
via une API REST FastAPI.
|
||||||
|
|
||||||
|
Endpoints:
|
||||||
|
POST /anonymize/pdf — Anonymise un PDF complet (texte + redaction)
|
||||||
|
POST /anonymize/text — Anonymise du texte brut (regex + NER + rescan)
|
||||||
|
GET /health — Statut des modèles chargés
|
||||||
|
|
||||||
|
Lancement:
|
||||||
|
cd /home/dom/ai/anonymisation
|
||||||
|
.venv/bin/uvicorn server:app --host 0.0.0.0 --port 8200
|
||||||
|
|
||||||
|
ou en production:
|
||||||
|
.venv/bin/uvicorn server:app --host 0.0.0.0 --port 8200 --workers 1
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import tempfile
|
||||||
|
import time
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
from fastapi import FastAPI, File, Form, UploadFile
|
||||||
|
from fastapi.responses import JSONResponse
|
||||||
|
|
||||||
|
# Configure the root logger once, at import time: INFO level, with each record
# showing its timestamp, level and the name of the emitting logger.
logging.basicConfig(
    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
    level=logging.INFO,
)

# Logger shared by the model loader and the startup hook of this module.
log = logging.getLogger("anon-server")
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Imports pipeline (lazy, au démarrage)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
from anonymizer_core_refactored_onnx import (
|
||||||
|
AnonResult,
|
||||||
|
PiiHit,
|
||||||
|
anonymise_document_regex,
|
||||||
|
apply_eds_pseudo_on_narrative,
|
||||||
|
apply_hf_ner_on_narrative,
|
||||||
|
load_dictionaries,
|
||||||
|
process_pdf,
|
||||||
|
selective_rescan,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Managers NER (optionnels)
|
||||||
|
try:
|
||||||
|
from eds_pseudo_manager import EdsPseudoManager
|
||||||
|
except ImportError:
|
||||||
|
EdsPseudoManager = None
|
||||||
|
|
||||||
|
try:
|
||||||
|
from camembert_ner_manager import CamembertNerManager
|
||||||
|
except ImportError:
|
||||||
|
CamembertNerManager = None
|
||||||
|
|
||||||
|
try:
|
||||||
|
from gliner_manager import GlinerManager
|
||||||
|
except ImportError:
|
||||||
|
GlinerManager = None
|
||||||
|
|
||||||
|
try:
|
||||||
|
from vlm_manager import VlmManager
|
||||||
|
except ImportError:
|
||||||
|
VlmManager = None
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Chargement des modèles au démarrage (singleton)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
# Process-wide singletons filled in by _load_models().  None means the
# corresponding manager is not available (never loaded, or loading failed).
_eds_manager = _camembert_manager = _gliner_manager = _vlm_manager = None

# Dictionary configuration returned by load_dictionaries(); None until loaded.
_cfg = None
|
||||||
|
|
||||||
|
|
||||||
|
def _load_models():
    """Load the dictionaries and every available model manager, once.

    Populates the module-level singletons ``_cfg``, ``_eds_manager``,
    ``_camembert_manager``, ``_gliner_manager`` and ``_vlm_manager``.

    Every manager is optional.  When its class could not be imported the
    singleton is left untouched; when constructing or loading it raises, the
    singleton is reset to ``None`` and a warning is logged, so one failing
    model does not abort the others.  A failure of ``load_dictionaries``
    itself is not caught and propagates to the caller.
    """
    global _eds_manager, _camembert_manager, _gliner_manager, _vlm_manager, _cfg

    _cfg = load_dictionaries(Path(__file__).parent / "config" / "dictionnaires.yml")

    # EDS-Pseudo (F1=0.97)
    if EdsPseudoManager is not None:
        try:
            _eds_manager = EdsPseudoManager()
            _eds_manager.load()
            log.info("EDS-Pseudo chargé")
        except Exception as e:
            log.warning("EDS-Pseudo indisponible: %s", e)
            _eds_manager = None

    # CamemBERT-bio ONNX (F1=0.90)
    if CamembertNerManager is not None:
        try:
            _camembert_manager = CamembertNerManager()
            _camembert_manager.load()
            log.info("CamemBERT-bio ONNX chargé")
        except Exception as e:
            log.warning("CamemBERT-bio indisponible: %s", e)
            _camembert_manager = None

    # GLiNER (cross-validation)
    if GlinerManager is not None:
        try:
            _gliner_manager = GlinerManager()
            _gliner_manager.load()
            log.info("GLiNER chargé")
        except Exception as e:
            log.warning("GLiNER indisponible: %s", e)
            _gliner_manager = None

    # VLM Ollama (scanned documents).  Unlike the NER managers there is no
    # explicit load() call here: the manager is kept only if it reports itself
    # as loaded right after construction.
    if VlmManager is not None:
        try:
            _vlm_manager = VlmManager()
            if _vlm_manager.is_loaded():
                log.info("VLM Ollama chargé")
            else:
                # Previously silent: make the degraded mode visible in the logs.
                log.warning("VLM Ollama indisponible: modèle non chargé")
                _vlm_manager = None
        except Exception as e:
            # Previously swallowed without a trace; log it like the other loaders.
            log.warning("VLM Ollama indisponible: %s", e)
            _vlm_manager = None
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# FastAPI
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
# ASGI application object; the launch commands refer to it as "server:app".
app = FastAPI(
    version="1.0.0",
    title="Anonymisation API",
    description="Pipeline d'anonymisation de documents médicaux français",
)
|
||||||
|
|
||||||
|
|
||||||
|
@app.on_event("startup")
async def startup():
    """Load all models once when the application starts and log the duration."""
    # NOTE(review): recent FastAPI releases deprecate on_event in favour of
    # lifespan handlers; consider migrating when the app construction is touched.
    log.info("Chargement des modèles NER...")
    started = time.time()
    _load_models()
    duration = time.time() - started
    log.info("Modèles chargés en %.1fs", duration)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# GET /health
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
@app.get("/health")
async def health():
    """Report the server status and the availability of each model.

    Returns:
        A dict with a constant ``"status": "ok"`` entry and a ``"models"``
        mapping from model name to an availability flag.
    """

    def _ready(manager):
        # Available only if the manager exists and reports itself as loaded.
        return manager is not None and manager.is_loaded()

    models = {
        "eds_pseudo": _ready(_eds_manager),
        "camembert_bio": _ready(_camembert_manager),
        "gliner": _ready(_gliner_manager),
        # The VLM manager is reported on presence alone.
        "vlm": _vlm_manager is not None,
    }
    return {"status": "ok", "models": models}
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# POST /anonymize/text
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
@app.post("/anonymize/text")
async def anonymize_text(
    text: str = Form(...),
    use_ner: bool = Form(True),
):
    """Anonymise raw text.

    Runs, in order: the regex pass, the NER ensemble (only when requested and
    the EDS-Pseudo manager is loaded), then the selective rescan.

    Args:
        text: Medical text to anonymise.
        use_ner: Enable the NER ensemble (EDS-Pseudo + CamemBERT + GLiNER).

    Returns:
        A dict with ``text_anonymized``, ``audit`` (the detected PII entries)
        and ``stats``.  ``stats["ner_active"]`` is True only when the NER
        phase actually ran for this request.
    """
    t0 = time.time()
    # Fall back to loading the dictionaries when the startup hook has not
    # populated the shared configuration.
    cfg = _cfg or load_dictionaries(None)

    # Phase 1: regex.  The text is treated as a single page with no tables.
    pages_text = [text]
    tables_lines: list[list[str]] = [[]]
    anon = anonymise_document_regex(pages_text, tables_lines, cfg)

    final_text = anon.text_out

    # Decide once whether NER runs, so the flag reported in the stats cannot
    # drift from the condition that actually gates the NER phase.
    ner_active = bool(
        use_ner and _eds_manager is not None and _eds_manager.is_loaded()
    )

    # Phase 2: NER ensemble
    if ner_active:
        final_text, hf_hits = apply_eds_pseudo_on_narrative(
            final_text, cfg, _eds_manager,
            gliner_mgr=_gliner_manager,
            camembert_mgr=_camembert_manager,
        )
        anon.audit.extend(hf_hits)

    # Phase 3: selective rescan
    final_text = selective_rescan(final_text, cfg=cfg)

    elapsed = time.time() - t0
    audit_list = [
        {"kind": h.kind, "original": h.original, "placeholder": h.placeholder, "page": h.page}
        for h in anon.audit
        if h.page != -1  # skip entries flagged as global propagations
    ]

    return {
        "text_anonymized": final_text,
        "audit": audit_list,
        "stats": {
            "pii_detected": len(audit_list),
            "elapsed_seconds": round(elapsed, 3),
            "ner_active": ner_active,
        },
    }
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# POST /anonymize/pdf
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
@app.post("/anonymize/pdf")
async def anonymize_pdf(
    file: UploadFile = File(...),
    use_ner: bool = Form(True),
    vector_redaction: bool = Form(True),
    raster_redaction: bool = Form(False),
):
    """Anonymise a complete PDF document.

    The upload is written to a temporary file, handed to ``process_pdf``, and
    the text and audit files it produced are read back into the response.
    The temporary copy of the uploaded PDF is always deleted, including when
    reading the upload or processing fails; the output directory holding the
    generated files is kept.

    Args:
        file: PDF file to anonymise.
        use_ner: Enable the NER ensemble.
        vector_redaction: Produce the vector-redacted PDF.
        raster_redaction: Produce the raster-redacted PDF (heavier).

    Returns:
        A dict with ``text_anonymized``, ``audit``, ``files`` (the mapping
        returned by ``process_pdf``) and ``stats``.
    """
    t0 = time.time()

    # Reserve a temporary path only.  The upload is written inside the try
    # block below so that a failed read or write cannot leave the original
    # (non-anonymised) PDF behind on disk.
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
        tmp_path = Path(tmp.name)

    try:
        tmp_path.write_bytes(await file.read())

        out_dir = tmp_path.parent / f"anon_{tmp_path.stem}"
        out_dir.mkdir(exist_ok=True)

        # NER managers are passed only when the caller asked for NER.
        ner_mgr = _eds_manager if use_ner else None

        # NOTE(review): this call is not awaited, so the coroutine holds the
        # event loop for the whole processing time.
        outputs = process_pdf(
            pdf_path=tmp_path,
            out_dir=out_dir,
            make_vector_redaction=vector_redaction,
            also_make_raster_burn=raster_redaction,
            config_path=Path(__file__).parent / "config" / "dictionnaires.yml",
            use_hf=use_ner and ner_mgr is not None,
            ner_manager=ner_mgr,
            gliner_manager=_gliner_manager if use_ner else None,
            camembert_manager=_camembert_manager if use_ner else None,
            vlm_manager=_vlm_manager,
        )

        # Read back the anonymised text, if a text file was produced.
        text_path = outputs.get("text", "")
        text_anonymized = ""
        if text_path and Path(text_path).exists():
            text_anonymized = Path(text_path).read_text(encoding="utf-8")

        # Read back the audit: one JSON document per non-blank line.
        audit_path = outputs.get("audit", "")
        audit_list = []
        if audit_path and Path(audit_path).exists():
            with open(audit_path, encoding="utf-8") as f:
                audit_list = [
                    json.loads(entry) for entry in map(str.strip, f) if entry
                ]

        elapsed = time.time() - t0
        return {
            "text_anonymized": text_anonymized,
            "audit": audit_list,
            "files": outputs,
            "stats": {
                "pii_detected": len(audit_list),
                "elapsed_seconds": round(elapsed, 3),
                "ner_active": use_ner and ner_mgr is not None,
                "filename": file.filename,
            },
        }

    finally:
        # Remove the uploaded original; out_dir with the results is kept.
        tmp_path.unlink(missing_ok=True)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Main
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    import uvicorn

    # A single worker, because the models are held in this process's memory.
    uvicorn.run("server:app", host="0.0.0.0", port=8200, log_level="info", workers=1)
|
||||||
Reference in New Issue
Block a user