From 274e2fa58662204058110c2348f3cbff674768a6 Mon Sep 17 00:00:00 2001 From: Domi31tls Date: Tue, 10 Mar 2026 02:04:52 +0100 Subject: [PATCH] feat: serveur API FastAPI pour microservice anonymisation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Expose le pipeline complet d'anonymisation (regex + NER ensemble + rescan) via REST API sur port 8200. Chargement des 3 modèles NER au démarrage (EDS-Pseudo, CamemBERT-bio ONNX, GLiNER). Endpoints: /anonymize/text, /anonymize/pdf, /health. Utilisé par T2A v2 comme brique externe. Co-Authored-By: Claude Opus 4.6 --- server.py | 328 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 328 insertions(+) create mode 100644 server.py diff --git a/server.py b/server.py new file mode 100644 index 0000000..c31b249 --- /dev/null +++ b/server.py @@ -0,0 +1,328 @@ +#!/usr/bin/env python3 +""" +Serveur API d'anonymisation — Brique microservice. +=================================================== +Expose le pipeline complet d'anonymisation (regex + NER ensemble + VLM) +via une API REST FastAPI. 
+ +Endpoints: + POST /anonymize/pdf — Anonymise un PDF complet (texte + redaction) + POST /anonymize/text — Anonymise du texte brut (regex + NER + rescan) + GET /health — Statut des modèles chargés + +Lancement: + cd /home/dom/ai/anonymisation + .venv/bin/uvicorn server:app --host 0.0.0.0 --port 8200 + + ou en production: + .venv/bin/uvicorn server:app --host 0.0.0.0 --port 8200 --workers 1 +""" +from __future__ import annotations + +import json +import logging +import tempfile +import time +from pathlib import Path +from typing import Optional + +from fastapi import FastAPI, File, Form, UploadFile +from fastapi.responses import JSONResponse + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s [%(levelname)s] %(name)s: %(message)s", +) +log = logging.getLogger("anon-server") + +# --------------------------------------------------------------------------- +# Imports pipeline (lazy, au démarrage) +# --------------------------------------------------------------------------- + +from anonymizer_core_refactored_onnx import ( + AnonResult, + PiiHit, + anonymise_document_regex, + apply_eds_pseudo_on_narrative, + apply_hf_ner_on_narrative, + load_dictionaries, + process_pdf, + selective_rescan, +) + +# Managers NER (optionnels) +try: + from eds_pseudo_manager import EdsPseudoManager +except ImportError: + EdsPseudoManager = None + +try: + from camembert_ner_manager import CamembertNerManager +except ImportError: + CamembertNerManager = None + +try: + from gliner_manager import GlinerManager +except ImportError: + GlinerManager = None + +try: + from vlm_manager import VlmManager +except ImportError: + VlmManager = None + +# --------------------------------------------------------------------------- +# Chargement des modèles au démarrage (singleton) +# --------------------------------------------------------------------------- + +_eds_manager = None +_camembert_manager = None +_gliner_manager = None +_vlm_manager = None +_cfg = None + + +def _load_models(): + 
"""Charge tous les modèles NER une seule fois au démarrage.""" + global _eds_manager, _camembert_manager, _gliner_manager, _vlm_manager, _cfg + + _cfg = load_dictionaries(Path(__file__).parent / "config" / "dictionnaires.yml") + + # EDS-Pseudo (F1=0.97) + if EdsPseudoManager is not None: + try: + _eds_manager = EdsPseudoManager() + _eds_manager.load() + log.info("EDS-Pseudo chargé") + except Exception as e: + log.warning("EDS-Pseudo indisponible: %s", e) + _eds_manager = None + + # CamemBERT-bio ONNX (F1=0.90) + if CamembertNerManager is not None: + try: + _camembert_manager = CamembertNerManager() + _camembert_manager.load() + log.info("CamemBERT-bio ONNX chargé") + except Exception as e: + log.warning("CamemBERT-bio indisponible: %s", e) + _camembert_manager = None + + # GLiNER (validation croisée) + if GlinerManager is not None: + try: + _gliner_manager = GlinerManager() + _gliner_manager.load() + log.info("GLiNER chargé") + except Exception as e: + log.warning("GLiNER indisponible: %s", e) + _gliner_manager = None + + # VLM Ollama (scannés) + if VlmManager is not None: + try: + _vlm_manager = VlmManager() + if _vlm_manager.is_loaded(): + log.info("VLM Ollama chargé") + else: + _vlm_manager = None + except Exception: + _vlm_manager = None + + +# --------------------------------------------------------------------------- +# FastAPI +# --------------------------------------------------------------------------- + +app = FastAPI( + title="Anonymisation API", + description="Pipeline d'anonymisation de documents médicaux français", + version="1.0.0", +) + + +@app.on_event("startup") +async def startup(): + log.info("Chargement des modèles NER...") + t0 = time.time() + _load_models() + log.info("Modèles chargés en %.1fs", time.time() - t0) + + +# --------------------------------------------------------------------------- +# GET /health +# --------------------------------------------------------------------------- + +@app.get("/health") +async def health(): + """Statut 
des modèles et du serveur.""" + return { + "status": "ok", + "models": { + "eds_pseudo": _eds_manager is not None and _eds_manager.is_loaded(), + "camembert_bio": _camembert_manager is not None and _camembert_manager.is_loaded(), + "gliner": _gliner_manager is not None and _gliner_manager.is_loaded(), + "vlm": _vlm_manager is not None, + }, + } + + +# --------------------------------------------------------------------------- +# POST /anonymize/text +# --------------------------------------------------------------------------- + +@app.post("/anonymize/text") +async def anonymize_text( + text: str = Form(...), + use_ner: bool = Form(True), +): + """Anonymise du texte brut. + + Applique : regex → NER ensemble (si activé) → rescan sélectif. + + Args: + text: Texte médical à anonymiser. + use_ner: Activer le NER ensemble (EDS-Pseudo + CamemBERT + GLiNER). + + Returns: + JSON avec text_anonymized, audit (liste des PII détectés), stats. + """ + t0 = time.time() + cfg = _cfg or load_dictionaries(None) + + # Phase 1 : regex + pages_text = [text] + tables_lines: list[list[str]] = [[]] + anon = anonymise_document_regex(pages_text, tables_lines, cfg) + + final_text = anon.text_out + + # Phase 2 : NER ensemble + if use_ner and _eds_manager is not None and _eds_manager.is_loaded(): + final_text, hf_hits = apply_eds_pseudo_on_narrative( + final_text, cfg, _eds_manager, + gliner_mgr=_gliner_manager, + camembert_mgr=_camembert_manager, + ) + anon.audit.extend(hf_hits) + + # Phase 3 : rescan sélectif + final_text = selective_rescan(final_text, cfg=cfg) + + elapsed = time.time() - t0 + audit_list = [ + {"kind": h.kind, "original": h.original, "placeholder": h.placeholder, "page": h.page} + for h in anon.audit + if h.page != -1 # exclure les propagations globales + ] + + return { + "text_anonymized": final_text, + "audit": audit_list, + "stats": { + "pii_detected": len(audit_list), + "elapsed_seconds": round(elapsed, 3), + "ner_active": use_ner and _eds_manager is not None, + }, + } 
# ---------------------------------------------------------------------------
# POST /anonymize/pdf
# ---------------------------------------------------------------------------

# Size of the chunks used to copy the upload to disk.
_UPLOAD_CHUNK_BYTES = 1024 * 1024


def _read_audit_jsonl(audit_path):
    """Parse a JSON-lines audit file into a list of decoded objects.

    Args:
        audit_path: Path (or path string) of the audit file; may be empty.

    Returns:
        One decoded JSON value per non-blank line, or an empty list when the
        path is empty or the file does not exist.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    if not audit_path or not Path(audit_path).exists():
        return []
    with open(audit_path, encoding="utf-8") as handle:
        return [json.loads(line) for line in map(str.strip, handle) if line]


@app.post("/anonymize/pdf")
async def anonymize_pdf(
    file: UploadFile = File(...),
    use_ner: bool = Form(True),
    vector_redaction: bool = Form(True),
    raster_redaction: bool = Form(False),
):
    """Anonymize a complete PDF document.

    Runs the full pipeline: text extraction → regex → NER → rescan →
    PDF redaction.

    Args:
        file: PDF file to anonymize.
        use_ner: Enable the NER ensemble.
        vector_redaction: Generate the vector-redacted PDF.
        raster_redaction: Generate the raster-redacted PDF (heavier).

    Returns:
        JSON with ``text_anonymized``, ``audit``, ``files`` (whatever
        ``process_pdf`` returned) and ``stats``.

    Raises:
        HTTPException: 400 when the uploaded file is empty.
    """
    # Local import: keeps this block independent of the file-level imports.
    from fastapi import HTTPException

    t0 = time.perf_counter()

    # delete=False because process_pdf() receives the file by path; the
    # finally clause below is responsible for removing it.
    tmp = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False)
    tmp_path = Path(tmp.name)

    try:
        # Copy the upload in chunks instead of holding it fully in memory.
        # This happens inside the try so a failed read/write cannot leak
        # the temporary file.
        received = 0
        with tmp:
            while chunk := await file.read(_UPLOAD_CHUNK_BYTES):
                tmp.write(chunk)
                received += len(chunk)
        if received == 0:
            raise HTTPException(status_code=400, detail="Le fichier envoyé est vide.")

        out_dir = tmp_path.parent / f"anon_{tmp_path.stem}"
        out_dir.mkdir(exist_ok=True)

        # Only hand over the EDS manager when it is actually loaded, the
        # same condition /anonymize/text and /health rely on.
        ner_mgr = None
        if use_ner and _eds_manager is not None and _eds_manager.is_loaded():
            ner_mgr = _eds_manager
        ner_active = ner_mgr is not None

        # NOTE(review): synchronous call inside an async handler, so the
        # event loop is blocked while it runs (/health cannot answer in the
        # meantime). Left as is because the thread-safety of the model
        # managers is not visible from this file.
        outputs = process_pdf(
            pdf_path=tmp_path,
            out_dir=out_dir,
            make_vector_redaction=vector_redaction,
            also_make_raster_burn=raster_redaction,
            config_path=Path(__file__).parent / "config" / "dictionnaires.yml",
            use_hf=ner_active,
            ner_manager=ner_mgr,
            gliner_manager=_gliner_manager if use_ner else None,
            camembert_manager=_camembert_manager if use_ner else None,
            vlm_manager=_vlm_manager,
        )

        # Read back the anonymized text
        text_path = outputs.get("text", "")
        text_anonymized = ""
        if text_path and Path(text_path).exists():
            text_anonymized = Path(text_path).read_text(encoding="utf-8")

        # Read back the audit trail (one JSON object per line)
        audit_list = _read_audit_jsonl(outputs.get("audit", ""))

        elapsed = time.perf_counter() - t0
        return {
            "text_anonymized": text_anonymized,
            "audit": audit_list,
            "files": outputs,
            "stats": {
                "pii_detected": len(audit_list),
                "elapsed_seconds": round(elapsed, 3),
                "ner_active": ner_active,
                "filename": file.filename,
            },
        }

    finally:
        # Remove the temporary upload (out_dir and its results are kept).
        tmp_path.unlink(missing_ok=True)


# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------

if __name__ == "__main__":
    import uvicorn
    uvicorn.run(
        "server:app",
        host="0.0.0.0",
        port=8200,
        workers=1,  # single worker: the models are held in process memory
        log_level="info",
    )