#!/usr/bin/env python3 """ Serveur API d'anonymisation — Brique microservice. =================================================== Expose le pipeline complet d'anonymisation (regex + NER ensemble + VLM) via une API REST FastAPI. Endpoints: POST /anonymize/pdf — Anonymise un PDF complet (texte + redaction) POST /anonymize/text — Anonymise du texte brut (regex + NER + rescan) GET /health — Statut des modèles chargés Lancement: cd /home/dom/ai/anonymisation .venv/bin/uvicorn server:app --host 0.0.0.0 --port 8200 ou en production: .venv/bin/uvicorn server:app --host 0.0.0.0 --port 8200 --workers 1 """ from __future__ import annotations import json import logging import tempfile import time from pathlib import Path from typing import Optional from fastapi import FastAPI, File, Form, UploadFile from fastapi.responses import JSONResponse logging.basicConfig( level=logging.INFO, format="%(asctime)s [%(levelname)s] %(name)s: %(message)s", ) log = logging.getLogger("anon-server") # --------------------------------------------------------------------------- # Imports pipeline (lazy, au démarrage) # --------------------------------------------------------------------------- from anonymizer_core_refactored_onnx import ( AnonResult, PiiHit, anonymise_document_regex, apply_eds_pseudo_on_narrative, apply_hf_ner_on_narrative, load_dictionaries, process_pdf, selective_rescan, ) # Managers NER (optionnels) try: from eds_pseudo_manager import EdsPseudoManager except ImportError: EdsPseudoManager = None try: from camembert_ner_manager import CamembertNerManager except ImportError: CamembertNerManager = None try: from gliner_manager import GlinerManager except ImportError: GlinerManager = None try: from vlm_manager import VlmManager except ImportError: VlmManager = None # --------------------------------------------------------------------------- # Chargement des modèles au démarrage (singleton) # --------------------------------------------------------------------------- _eds_manager = None _camembert_manager = None _gliner_manager = None _vlm_manager = None _cfg = None def _load_models(): """Charge tous les modèles NER une seule fois au démarrage.""" global _eds_manager, _camembert_manager, _gliner_manager, _vlm_manager, _cfg _cfg = load_dictionaries(Path(__file__).parent / "config" / "dictionnaires.yml") # EDS-Pseudo (F1=0.97) if EdsPseudoManager is not None: try: _eds_manager = EdsPseudoManager() _eds_manager.load() log.info("EDS-Pseudo chargé") except Exception as e: log.warning("EDS-Pseudo indisponible: %s", e) _eds_manager = None # CamemBERT-bio ONNX (F1=0.90) if CamembertNerManager is not None: try: _camembert_manager = CamembertNerManager() _camembert_manager.load() log.info("CamemBERT-bio ONNX chargé") except Exception as e: log.warning("CamemBERT-bio indisponible: %s", e) _camembert_manager = None # GLiNER (validation croisée) if GlinerManager is not None: try: _gliner_manager = GlinerManager() _gliner_manager.load() log.info("GLiNER chargé") except Exception as e: log.warning("GLiNER indisponible: %s", e) _gliner_manager = None # VLM Ollama (scannés) if VlmManager is not None: try: _vlm_manager = VlmManager() if _vlm_manager.is_loaded(): log.info("VLM Ollama chargé") else: _vlm_manager = None except Exception: _vlm_manager = None # --------------------------------------------------------------------------- # FastAPI # --------------------------------------------------------------------------- app = FastAPI( title="Anonymisation API", description="Pipeline d'anonymisation de documents médicaux français", version="1.0.0", ) @app.on_event("startup") async def startup(): log.info("Chargement des modèles NER...") t0 = time.time() _load_models() log.info("Modèles chargés en %.1fs", time.time() - t0) # --------------------------------------------------------------------------- # GET /health # --------------------------------------------------------------------------- @app.get("/health") async def health(): """Statut des modèles et du serveur.""" return { "status": "ok", "models": { "eds_pseudo": _eds_manager is not None and _eds_manager.is_loaded(), "camembert_bio": _camembert_manager is not None and _camembert_manager.is_loaded(), "gliner": _gliner_manager is not None and _gliner_manager.is_loaded(), "vlm": _vlm_manager is not None, }, } # --------------------------------------------------------------------------- # POST /anonymize/text # --------------------------------------------------------------------------- @app.post("/anonymize/text") async def anonymize_text( text: str = Form(...), use_ner: bool = Form(True), ): """Anonymise du texte brut. Applique : regex → NER ensemble (si activé) → rescan sélectif. Args: text: Texte médical à anonymiser. use_ner: Activer le NER ensemble (EDS-Pseudo + CamemBERT + GLiNER). Returns: JSON avec text_anonymized, audit (liste des PII détectés), stats. """ t0 = time.time() cfg = _cfg or load_dictionaries(None) # Phase 1 : regex pages_text = [text] tables_lines: list[list[str]] = [[]] anon = anonymise_document_regex(pages_text, tables_lines, cfg) final_text = anon.text_out # Phase 2 : NER ensemble if use_ner and _eds_manager is not None and _eds_manager.is_loaded(): final_text, hf_hits = apply_eds_pseudo_on_narrative( final_text, cfg, _eds_manager, gliner_mgr=_gliner_manager, camembert_mgr=_camembert_manager, ) anon.audit.extend(hf_hits) # Phase 3 : rescan sélectif final_text = selective_rescan(final_text, cfg=cfg) elapsed = time.time() - t0 # Inclure tous les hits (regex page≥0 + NER page=-1) avec source ner_prefixes = ("NER_", "EDS_") audit_list = [] ner_count = 0 regex_count = 0 for h in anon.audit: is_ner = h.kind.startswith(ner_prefixes) or h.page == -1 entry = { "kind": h.kind, "original": h.original, "placeholder": h.placeholder, "page": h.page, "source": "ner" if is_ner else "regex", } audit_list.append(entry) if is_ner: ner_count += 1 else: regex_count += 1 return { "text_anonymized": final_text, "audit": audit_list, "stats": { "pii_detected": len(audit_list), "regex_count": regex_count, "ner_count": ner_count, "elapsed_seconds": round(elapsed, 3), "ner_active": use_ner and _eds_manager is not None, }, } # --------------------------------------------------------------------------- # POST /anonymize/pdf # --------------------------------------------------------------------------- @app.post("/anonymize/pdf") async def anonymize_pdf( file: UploadFile = File(...), use_ner: bool = Form(True), vector_redaction: bool = Form(True), raster_redaction: bool = Form(False), ): """Anonymise un document PDF complet. Applique le pipeline complet : extraction texte → regex → NER → rescan → redaction PDF. Args: file: Fichier PDF à anonymiser. use_ner: Activer le NER ensemble. vector_redaction: Générer le PDF redacté vectoriel. raster_redaction: Générer le PDF redacté raster (plus lourd). Returns: JSON avec text_anonymized, audit, paths des fichiers générés. """ t0 = time.time() # Sauvegarder le PDF uploadé dans un fichier temporaire with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp: content = await file.read() tmp.write(content) tmp_path = Path(tmp.name) try: out_dir = tmp_path.parent / f"anon_{tmp_path.stem}" out_dir.mkdir(exist_ok=True) # Déterminer le NER manager à passer ner_mgr = _eds_manager if use_ner else None outputs = process_pdf( pdf_path=tmp_path, out_dir=out_dir, make_vector_redaction=vector_redaction, also_make_raster_burn=raster_redaction, config_path=Path(__file__).parent / "config" / "dictionnaires.yml", use_hf=use_ner and ner_mgr is not None, ner_manager=ner_mgr, gliner_manager=_gliner_manager if use_ner else None, camembert_manager=_camembert_manager if use_ner else None, vlm_manager=_vlm_manager, ) # Lire le texte anonymisé text_path = outputs.get("text", "") text_anonymized = "" if text_path and Path(text_path).exists(): text_anonymized = Path(text_path).read_text(encoding="utf-8") # Lire l'audit audit_path = outputs.get("audit", "") audit_list = [] if audit_path and Path(audit_path).exists(): with open(audit_path, encoding="utf-8") as f: for line in f: line = line.strip() if line: audit_list.append(json.loads(line)) elapsed = time.time() - t0 return { "text_anonymized": text_anonymized, "audit": audit_list, "files": outputs, "stats": { "pii_detected": len(audit_list), "elapsed_seconds": round(elapsed, 3), "ner_active": use_ner and ner_mgr is not None, "filename": file.filename, }, } finally: # Nettoyer le fichier temporaire (garder out_dir avec les résultats) tmp_path.unlink(missing_ok=True) # --------------------------------------------------------------------------- # Main # --------------------------------------------------------------------------- if __name__ == "__main__": import uvicorn uvicorn.run( "server:app", host="0.0.0.0", port=8200, workers=1, # 1 worker car modèles en mémoire log_level="info", )