# rpa_vision_v3/tests/unit/test_pii_blur.py

# tests/unit/test_pii_blur.py
"""Tests for the server-side anonymisation pipeline (core.anonymisation.pii_blur).

Covered here:
1. Regex logic: PERSON / LOCATION (postal addresses) / PHONE / NIR / EMAIL are
   detected; ICD-10 (CIM) codes, CCAM codes, amounts, dates and technical IDs
   are left alone.
2. Merging of overlapping spans.
3. Full pipeline on a synthetic PIL image (no real OCR — `_doctr_ocr` is patched),
   with pixel-level assertions: PII regions are blurred, the others are not.
"""

from __future__ import annotations

import os
from pathlib import Path
from unittest.mock import patch

import pytest
from PIL import Image, ImageDraw, ImageFont


# Éviter de charger docTR pendant les tests rapides
os.environ.setdefault("RPA_PII_BLUR_SERVER", "true")


# --- Import du module sous test --------------------------------------------
from core.anonymisation import pii_blur as mod  # noqa: E402


# ===========================================================================
# Tests regex — pas besoin d'image
# ===========================================================================

class TestRegexPII:
    """Checks on `_regex_find_pii`: what must be flagged and what must not."""

    @staticmethod
    def _labels_for(text):
        """Return the set of labels (first item of each hit) found in *text*."""
        return {hit[0] for hit in mod._regex_find_pii(text)}

    # --- Positives: each PII family must be reported -----------------------
    def test_detect_person_with_title(self):
        """A name introduced by a civility title yields a PERSON hit."""
        found = self._labels_for("Patient : M. Dupont Jean, né le 12/03/1965")
        assert "PERSON" in found

    def test_detect_email(self):
        """An e-mail address yields an EMAIL hit."""
        found = self._labels_for("Contact : jean.dupont@hopital.fr")
        assert "EMAIL" in found

    def test_detect_phone_fr(self):
        """A French phone number written in digit pairs yields a PHONE hit."""
        found = self._labels_for("Tél : 06 12 34 56 78")
        assert "PHONE" in found

    def test_detect_nir(self):
        """A space-separated social security number yields a NIR hit."""
        found = self._labels_for("NIR : 2 85 03 75 115 120 42")
        assert "NIR" in found

    def test_detect_address(self):
        """A street address is reported under the LOCATION label."""
        found = self._labels_for("Adresse : 12 rue de la Paix, Paris")
        assert "LOCATION" in found

    # --- Negatives: these patterns MUST NOT be reported --------------------
    def test_icd10_not_flagged(self):
        """An ICD-10 (CIM) code produces no hit at all."""
        found = mod._regex_find_pii("Code CIM : F32.1 (épisode dépressif)")
        assert not found, f"CIM ne doit pas être floué, hits={found}"

    def test_ccam_not_flagged(self):
        """A CCAM procedure code produces no hit at all."""
        assert not mod._regex_find_pii("Acte CCAM : DEQP003")

    def test_money_not_flagged(self):
        """A monetary amount produces no hit at all."""
        assert not mod._regex_find_pii("Montant facturé : 1250,50 €")

    def test_date_not_flagged(self):
        """Plain dates produce no hit at all."""
        assert not mod._regex_find_pii("Séjour du 01/03/2026 au 15/03/2026")

    def test_tech_id_not_flagged(self):
        """File names and session identifiers produce no hit at all."""
        assert not mod._regex_find_pii("Fichier : shot_0007_full.png session_abc123")

    # --- Realistic mix ------------------------------------------------------
    def test_realistic_hospital_text(self):
        """On a mixed line, every PII family is found and protected values stay uncovered."""
        text = (
            "Patient : M. Dupont Jean - NIR : 1 85 03 75 115 120 42 "
            "- Tél : 06 12 34 56 78 - Adresse : 12 rue de la Paix "
            "- Code CIM : F32.1 - Montant : 1250,50 € "
            "- Séjour du 01/03/2026 - Email : jean@test.fr"
        )
        hits = mod._regex_find_pii(text)
        seen = {hit[0] for hit in hits}
        for expected in ("PERSON", "NIR", "PHONE", "LOCATION", "EMAIL"):
            assert expected in seen

        # No reported span may contain the ICD code, the amount or the date.
        must_stay_clear = ("F32.1", "1250,50", "01/03/2026")
        for label, start, end in hits:
            span = text[start:end]
            for prot in must_stay_clear:
                assert prot not in span, f"{label} '{span}' couvre {prot}"


# ===========================================================================
# Tests de fusion de spans
# ===========================================================================

class TestMergeSpans:
    """Checks on `_merge_spans` with (label, start, end) tuples."""

    def test_non_overlapping_preserved(self):
        """Disjoint spans come back unchanged."""
        disjoint = [("PERSON", 0, 5), ("EMAIL", 20, 30)]
        outcome = mod._merge_spans(disjoint)
        assert outcome == disjoint

    def test_overlap_kept_widest_label(self):
        """Two overlapping spans collapse into one that keeps the widest span's label."""
        merged = mod._merge_spans([("PERSON", 0, 10), ("LOCATION", 5, 20)])
        assert len(merged) == 1
        kept_label, start, end = merged[0]
        assert (start, end) == (0, 20)
        # LOCATION spans 15 characters, PERSON only 10, so LOCATION wins.
        assert kept_label == "LOCATION"

    def test_identical_spans_dedup(self):
        """Exact duplicates are reduced to a single span."""
        duplicated = [("EMAIL", 3, 9), ("EMAIL", 3, 9)]
        merged = mod._merge_spans(duplicated)
        assert len(merged) == 1


# ===========================================================================
# Tests pipeline complet avec OCR mocké
# ===========================================================================

@pytest.fixture
def synthetic_screenshot(tmp_path: Path) -> Path:
    """Write a white 900x300 PNG with four black text lines and return its path.

    Every line is drawn at x=20; the vertical offsets are 20, 70, 120 and 170.
    The file is saved as ``synth.png`` inside ``tmp_path``.
    """
    canvas = Image.new("RGB", (900, 300), color="white")
    pen = ImageDraw.Draw(canvas)

    # Use DejaVu Sans at 22 px when it can be loaded, Pillow's default font otherwise.
    try:
        typeface = ImageFont.truetype("DejaVuSans.ttf", 22)
    except Exception:
        typeface = ImageFont.load_default()

    # Vertical offset -> text drawn on that row.
    rows = {
        20: "Nom : Dupont",
        70: "Code CIM : F32.1",
        120: "Adresse : 12 rue de la Paix",
        170: "Montant : 1250€",
    }
    for top, caption in rows.items():
        pen.text((20, top), caption, fill="black", font=typeface)

    target = tmp_path / "synth.png"
    canvas.save(target, format="PNG")
    return target


def _fake_doctr_ocr(image_path: Path):
    """Stand-in for the docTR OCR call: return fixed word boxes for the synthetic screenshot.

    ``image_path`` is ignored. The result is ``(words, 900, 300)`` where each
    word is a dict with ``text``, ``x1``, ``y1``, ``x2``, ``y2`` and ``line``
    keys. Boxes are deliberately coarse approximations of the fixture layout.
    """
    first_x = 20      # x of the first word on every line
    spacing = 8       # horizontal gap between consecutive words
    box_height = 30   # every box spans y .. y + 30

    # One entry per text line: (y, [(word, width), ...]).
    layout = [
        # The fixture draws "Nom : Dupont"; the OCR output uses the title "M."
        # instead so that the PERSON regex is triggered.
        (20, [("M.", 30), ("Dupont", 90)]),
        # "Code CIM : F32.1" -> must NOT be blurred.
        (70, [("Code", 60), ("CIM", 50), (":", 10), ("F32.1", 80)]),
        # "Adresse : 12 rue de la Paix" -> LOCATION.
        (120, [("Adresse", 90), (":", 10), ("12", 30), ("rue", 40), ("de", 30),
               ("la", 30), ("Paix", 60)]),
        # "Montant : 1250€" -> must NOT be blurred.
        (170, [("Montant", 90), (":", 10), ("1250€", 80)]),
    ]

    words = []
    for row_no, (top, tokens) in enumerate(layout):
        cursor = first_x
        for token, width in tokens:
            words.append({
                "text": token, "x1": cursor, "y1": top,
                "x2": cursor + width, "y2": top + box_height,
                "line": row_no,
            })
            cursor += width + spacing

    return words, 900, 300


def _pixel_variance(img: Image.Image, bbox) -> float:
    """Return the population variance of a region, averaged over the R, G and B channels.

    ``bbox`` is ``(x1, y1, x2, y2)`` and is passed to ``img.crop``; the crop is
    converted to RGB before measuring. Used as a proxy for "is there visible
    detail": a blurred area is expected to score lower than a sharp one.
    Regions with fewer than two pixels yield ``0.0``.
    """
    from statistics import pvariance

    left, top, right, bottom = bbox
    region = img.crop((left, top, right, bottom)).convert("RGB")
    samples = list(region.getdata())
    if len(samples) < 2:
        return 0.0

    # One variance per colour channel (index 0 = R, 1 = G, 2 = B), then the mean.
    channel_variances = [
        pvariance([px[channel] for px in samples]) for channel in range(3)
    ]
    return sum(channel_variances) / 3

class TestPIIBlurrerPipeline:
    """End-to-end checks of `PIIBlurrer.blur_image` with `_doctr_ocr` patched out."""

    def test_blur_only_pii_regions(self, tmp_path, synthetic_screenshot):
        """PERSON and LOCATION are reported and blurred; the CIM code and the amount are not."""
        with patch.object(mod, "_doctr_ocr", side_effect=_fake_doctr_ocr):
            blurrer = mod.PIIBlurrer(use_edsnlp=False)
            out = tmp_path / "synth_blurred.png"
            result = blurrer.blur_image(synthetic_screenshot, out)

        # Global assertions on the reported entities.
        assert result.count >= 2, (
            f"Attendu au moins 2 PII (PERSON + LOCATION), reçu {result.count} : "
            f"{[e.label for e in result.entities]}"
        )
        labels = {e.label for e in result.entities}
        assert "PERSON" in labels
        assert "LOCATION" in labels
        # Neither the ICD code nor the amount may appear among the blurred entities.
        assert not any("F32.1" in e.text for e in result.entities)
        assert not any("1250" in e.text for e in result.entities)

        # Pixel check: the CIM row (y=70..100) must stay sharp while the
        # address row (y=120..150) must lose detail.
        cim_bbox = (20, 68, 280, 105)
        addr_bbox = (20, 118, 400, 155)
        # Context managers make sure both image files are closed even when a
        # measurement raises.
        with Image.open(synthetic_screenshot) as original, Image.open(out) as blurred:
            var_orig_cim = _pixel_variance(original, cim_bbox)
            var_blur_cim = _pixel_variance(blurred, cim_bbox)
            var_orig_addr = _pixel_variance(original, addr_bbox)
            var_blur_addr = _pixel_variance(blurred, addr_bbox)

        # Without this guard the sharpness check below would pass vacuously
        # (0 >= 0) if the fixture failed to draw any text.
        assert var_orig_cim > 0, "Synthetic screenshot has no detail in the CIM row"

        # Tolerance: the CIM row must keep at least 50% of its original variance.
        assert var_blur_cim >= 0.5 * var_orig_cim, (
            f"Code CIM a été flouté ! var_orig={var_orig_cim:.1f}, "
            f"var_blur={var_blur_cim:.1f}"
        )

        # Address row: the variance must drop by more than 15%.
        assert var_blur_addr < var_orig_addr * 0.85, (
            f"Adresse pas suffisamment floutée : var_orig={var_orig_addr:.1f}, "
            f"var_blur={var_blur_addr:.1f}"
        )

    def test_no_pii_copies_file(self, tmp_path):
        """With no PII in the OCR output, zero entities are reported and the output file exists."""
        img = Image.new("RGB", (400, 100), "white")
        p = tmp_path / "clean.png"
        img.save(p)

        def fake_clean_ocr(path):
            # Same word schema as `_fake_doctr_ocr`, including the "line" index.
            # NOTE(review): whether the pipeline requires "line" is not visible
            # from this file — confirm against `_doctr_ocr`.
            return (
                [{"text": "Bonjour", "x1": 10, "y1": 10, "x2": 100, "y2": 40, "line": 0},
                 {"text": "monde",   "x1": 110, "y1": 10, "x2": 200, "y2": 40, "line": 0}],
                400, 100,
            )

        with patch.object(mod, "_doctr_ocr", side_effect=fake_clean_ocr):
            res = mod.PIIBlurrer(use_edsnlp=False).blur_image(p, tmp_path / "clean_out.png")

        assert res.count == 0
        assert (tmp_path / "clean_out.png").is_file()

    def test_ocr_failure_falls_back_to_copy(self, tmp_path):
        """When the OCR call raises, zero entities are reported and the output file still exists."""
        img = Image.new("RGB", (100, 100), "white")
        p = tmp_path / "fail.png"
        img.save(p)

        def boom(path):
            raise RuntimeError("OCR indispo")

        with patch.object(mod, "_doctr_ocr", side_effect=boom):
            res = mod.PIIBlurrer(use_edsnlp=False).blur_image(p, tmp_path / "fail_out.png")

        assert res.count == 0
        assert (tmp_path / "fail_out.png").is_file()


# ===========================================================================
# Sanity check helper
# ===========================================================================

def test_pii_labels_contains_expected():
    """`PII_LABELS` exposes every label the tests in this module rely on."""
    for required in ("PERSON", "LOCATION", "EMAIL", "NIR", "PHONE"):
        assert required in mod.PII_LABELS