# anonymisation/scripts/export_silver_annotations.py

#!/usr/bin/env python3
"""
Export silver annotations — Génère des données d'entraînement BIO à partir du pipeline existant.
================================================================================================
Utilise le pipeline regex+NER+VLM actuel pour produire des annotations "silver standard"
sur les 706 OGC. Ces annotations servent de base pour fine-tuner CamemBERT-bio.

Usage:
    python scripts/export_silver_annotations.py [--input-dir DIR] [--out-dir DIR] [--limit N]

Output: data/silver_annotations/ avec un fichier .bio par document
Format BIO: TOKEN\tLABEL (un token par ligne, lignes vides entre phrases)
"""
import sys
import re
import json
import argparse
from pathlib import Path
from typing import List, Tuple

# Prepend the directory two levels above this file to the import path.
# NOTE(review): presumably so that project modules can be imported when the
# script is run directly; nothing in this file imports them — confirm.
sys.path.insert(0, str(Path(__file__).parent.parent))

# Regex that detects the bracketed placeholders (e.g. "[NOM]") and is used to
# rebuild the alignment. The single capture group holds the placeholder name,
# so `PLACEHOLDER_RE.split(text)` alternates plain text and placeholder names.
PLACEHOLDER_RE = re.compile(
    r"\[(NOM|TEL|EMAIL|NIR|IPP|DOSSIER|NDA|EPISODE|RPPS|DATE_NAISSANCE|"
    r"ADRESSE|CODE_POSTAL|VILLE|MASK|FINESS|OGC|AGE|ETAB|IBAN)\]"
)

# Mapping placeholder name -> entity type used in the BIO labels.
# Several placeholders collapse onto one entity type
# (DOSSIER/NDA/EPISODE/OGC -> NDA, ETAB/FINESS -> HOPITAL).
PH_TO_BIO = {
    "NOM": "PER",
    "TEL": "TEL",
    "EMAIL": "EMAIL",
    "NIR": "NIR",
    "IPP": "IPP",
    "DOSSIER": "NDA",
    "NDA": "NDA",
    "EPISODE": "NDA",
    "RPPS": "RPPS",
    "DATE_NAISSANCE": "DATE_NAISSANCE",
    "ADRESSE": "ADRESSE",
    "CODE_POSTAL": "ZIP",
    "VILLE": "VILLE",
    "ETAB": "HOPITAL",
    "FINESS": "HOPITAL",
    "IBAN": "IBAN",
    "AGE": "AGE",
    "OGC": "NDA",
    "MASK": "O",  # generic MASK = no specific annotation
}


def text_to_bio(pseudonymised_text: str) -> List[Tuple[str, str]]:
    """Turn pseudonymised text into a flat list of ``(token, label)`` pairs.

    Plain text is split on whitespace and every resulting word is labelled
    ``O``. Each ``[PLACEHOLDER]`` becomes a single token (brackets kept)
    labelled ``B-<TYPE>``, where ``<TYPE>`` comes from ``PH_TO_BIO``;
    placeholders mapped to ``O`` (e.g. ``[MASK]``) keep the ``O`` label.

    Note that a placeholder is always emitted as one token, so only ``B-``
    tags are produced here — no ``I-`` tag is ever generated.

    Args:
        pseudonymised_text: Text in which sensitive spans were replaced by
            bracketed placeholders.

    Returns:
        The tokens in document order, each paired with its label. Line and
        sentence boundaries are not preserved.
    """
    # Splitting on a pattern with one capture group yields an alternation:
    # even positions are plain text, odd positions are placeholder names.
    segments = PLACEHOLDER_RE.split(pseudonymised_text)

    tagged: List[Tuple[str, str]] = []
    for position, segment in enumerate(segments):
        if position % 2:
            # Placeholder name captured by the regex.
            entity = PH_TO_BIO.get(segment, "O")
            tag = "O" if entity == "O" else f"B-{entity}"
            tagged.append((f"[{segment}]", tag))
            continue
        # Ordinary text: str.split() already drops empty/whitespace pieces.
        tagged.extend((chunk, "O") for chunk in segment.split())

    return tagged


def export_document(pseudo_path: Path, out_dir: Path) -> int:
    """Export one pseudonymised text file as a CoNLL-style ``.bio`` file.

    The input is read as UTF-8 (undecodable bytes are replaced), tokenised
    with ``text_to_bio`` and written to ``out_dir`` with one
    ``TOKEN<TAB>LABEL`` row per token. A blank row (sentence separator) is
    emitted:

    * at every line break of the source text, and
    * after a stand-alone ``.``, ``!`` or ``?`` token labelled ``O``.

    Tokens are produced by whitespace splitting, so punctuation is normally
    glued to the preceding word; without the line-break rule the separator
    would almost never be emitted and a whole document would form a single
    "sentence".

    Args:
        pseudo_path: Path of the ``*.pseudonymise.txt`` file to convert.
        out_dir: Existing directory receiving the ``.bio`` file. The output
            name is the input name with ``.pseudonymise.txt`` replaced by
            ``.bio``.

    Returns:
        The number of tokens written. ``0`` means the document produced no
        token, in which case no file is written.
    """
    text = pseudo_path.read_text(encoding="utf-8", errors="replace")

    rows: List[str] = []
    token_count = 0
    # Tokenising line by line yields the same tokens as tokenising the whole
    # text (every line boundary is whitespace, and placeholders never span
    # lines) while letting us see where the line breaks are.
    for source_line in text.splitlines():
        for token, label in text_to_bio(source_line):
            rows.append(f"{token}\t{label}")
            token_count += 1
            if label == "O" and token in (".", "!", "?"):
                rows.append("")  # sentence separator
        # Close the sentence at the line break unless it is already closed
        # (also avoids stacking blank rows for empty source lines).
        if rows and rows[-1] != "":
            rows.append("")

    if not token_count:
        return 0

    # Drop the trailing separator so the file ends with exactly one newline.
    while rows and rows[-1] == "":
        rows.pop()

    out_path = out_dir / pseudo_path.name.replace(".pseudonymise.txt", ".bio")
    out_path.write_text("\n".join(rows) + "\n", encoding="utf-8")
    return token_count


def main():
    """Command-line entry point: export every pseudonymised file as BIO.

    Parses ``--input-dir``, ``--out-dir`` and ``--limit``, converts each
    ``*.pseudonymise.txt`` file of the input directory (in sorted order) with
    ``export_document`` and prints per-file and total token/entity counts.
    An "entity" is a row of the produced ``.bio`` file whose label is not
    ``O``.

    Exits through ``parser.error`` when the input directory does not exist.
    """
    parser = argparse.ArgumentParser(description="Export silver annotations BIO")
    parser.add_argument("--input-dir", type=Path,
                        default=Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/anonymise_audit_30"),
                        help="Répertoire contenant les .pseudonymise.txt")
    parser.add_argument("--out-dir", type=Path,
                        default=Path(__file__).parent.parent / "data" / "silver_annotations",
                        help="Répertoire de sortie")
    parser.add_argument("--limit", type=int, default=0, help="Limiter à N fichiers (0=tous)")
    args = parser.parse_args()

    # The default input directory is machine-specific; fail loudly instead of
    # silently reporting "0 fichiers" when it is missing.
    if not args.input_dir.is_dir():
        parser.error(f"Répertoire d'entrée introuvable: {args.input_dir}")

    args.out_dir.mkdir(parents=True, exist_ok=True)

    pseudo_files = sorted(args.input_dir.glob("*.pseudonymise.txt"))
    if args.limit > 0:
        pseudo_files = pseudo_files[:args.limit]

    print(f"Export silver annotations: {len(pseudo_files)} fichiers → {args.out_dir}")

    total_tokens = 0
    total_entities = 0
    for f in pseudo_files:
        n = export_document(f, args.out_dir)
        ent_count = 0
        if n:
            # export_document writes nothing when it returns 0, so the .bio
            # file is only read back when tokens were actually exported
            # (otherwise it may be missing, or stale from a previous run).
            bio_path = args.out_dir / f.name.replace(".pseudonymise.txt", ".bio")
            # Read with the same encoding the file was written with.
            bio_rows = bio_path.read_text(encoding="utf-8").splitlines()
            ent_count = sum(1 for line in bio_rows
                            if line and not line.endswith("\tO"))
        total_tokens += n
        total_entities += ent_count
        print(f"  {f.name}: {n} tokens, {ent_count} entités")

    print(f"\nTotal: {total_tokens} tokens, {total_entities} entités annotées")
    print(f"Sortie: {args.out_dir}")


if __name__ == "__main__":
    main()