#!/usr/bin/env python3
"""
CLI annotation tool used to build the annotated test dataset.

Usage:
    python tools/annotation_tool.py PDF_PATH
    python tools/annotation_tool.py --list
    python tools/annotation_tool.py --resume
"""

import json
import re
import sys
from collections import Counter
from datetime import datetime
from pathlib import Path
from typing import List, Dict, Optional, Tuple

try:
    import pymupdf as fitz
except ImportError:
    try:
        import fitz
    except ImportError:
        # PyMuPDF is only required to open a PDF. Keeping the import optional
        # lets --list and the usage message work without it; load_pdf()
        # reports the missing dependency when a document is actually opened.
        fitz = None


# Directory scanned by --list and --resume (relative to the working directory).
PDFS_DIR = Path("tests/ground_truth/pdfs")


def _annotations_path_for(pdf_path: Path) -> Path:
    """Return the path of the JSON sidecar file holding a PDF's annotations."""
    return pdf_path.parent / f"{pdf_path.stem}.annotations.json"


class AnnotationTool:
    """Interactive annotation tool for PDF documents.

    Attributes:
        pdf_path: PDF being annotated.
        annotations_path: JSON file the annotations are written to (next to
            the PDF, named ``<stem>.annotations.json``).
        doc: Open PyMuPDF document, or ``None`` when nothing is loaded.
        annotations: Annotation dicts collected so far.
        medical_terms: Terms the annotator asked to keep unmasked.
        metadata: Document-level metadata stored alongside the annotations.
    """

    PII_TYPES = [
        "NOM", "PRENOM", "DATE_NAISSANCE", "AGE",
        "TEL", "EMAIL", "ADRESSE", "CODE_POSTAL", "VILLE",
        "NIR", "IPP", "NDA", "RPPS", "FINESS", "OGC",
        "NUMERO_PATIENT", "NUMERO_LOT", "NUMERO_ORDONNANCE", "NUMERO_SEJOUR",
        "ETABLISSEMENT", "SERVICE", "DATE", "AUTRE"
    ]

    # Detection methods recorded when the annotator just presses Enter.
    DEFAULT_METHODS = ("regex", "ner")

    def __init__(self, pdf_path: Path):
        self.pdf_path = pdf_path
        self.annotations_path = _annotations_path_for(pdf_path)
        self.doc = None
        self.annotations = []
        self.medical_terms = []
        self.metadata = {
            "annotator": "annotator_1",
            "annotation_date": datetime.now().isoformat(),
            "document_type": "unknown",
            "page_count": 0,
            "difficulty": "medium"
        }

    def load_pdf(self) -> bool:
        """Open the PDF and record its page count.

        Returns:
            True when the document was opened, False otherwise (the reason is
            printed rather than raised).
        """
        if fitz is None:
            print("✗ PyMuPDF n'est pas installé (pip install pymupdf)")
            return False
        try:
            self.doc = fitz.open(self.pdf_path)
            self.metadata["page_count"] = len(self.doc)
            print(f"✓ PDF chargé: {self.pdf_path.name}")
            print(f" Pages: {len(self.doc)}")
            return True
        except Exception as e:
            # Top-level boundary of an interactive tool: report and carry on.
            print(f"✗ Erreur lors du chargement du PDF: {e}")
            return False

    def close(self) -> None:
        """Release the open PDF document, if any. Safe to call repeatedly."""
        if self.doc is not None:
            self.doc.close()
            self.doc = None

    def extract_text(self, page_num: int) -> str:
        """Return the text of page ``page_num`` (0-based).

        Returns an empty string when no document is loaded or the page number
        is past the last page.
        """
        if not self.doc or page_num >= len(self.doc):
            return ""
        page = self.doc[page_num]
        return page.get_text()

    def display_page(self, page_num: int):
        """Print the text of a page framed by separator lines."""
        text = self.extract_text(page_num)
        print(f"\n{'='*80}")
        print(f"PAGE {page_num + 1}/{len(self.doc)}")
        print(f"{'='*80}")
        print(text)
        print(f"{'='*80}\n")

    def get_context(self, text: str, pii_text: str, window: int = 50) -> str:
        """Return the text surrounding the first occurrence of a PII.

        Args:
            text: Page text to search in.
            pii_text: PII string typed by the annotator.
            window: Number of characters kept on each side of the match.

        Returns:
            The surrounding text with every whitespace run collapsed to a
            single space, or "" when the PII cannot be located.
        """
        start_pos = text.find(pii_text)
        end_pos = start_pos + len(pii_text)
        if start_pos == -1:
            # Extracted PDF text often breaks a value across lines
            # ("Jean\nDupont"); retry allowing any whitespace run between the
            # words the annotator typed.
            words = pii_text.split()
            if not words:
                return ""
            pattern = r"\s+".join(re.escape(word) for word in words)
            match = re.search(pattern, text)
            if match is None:
                return ""
            start_pos, end_pos = match.span()

        start = max(0, start_pos - window)
        end = min(len(text), end_pos + window)
        # \s+ also covers newlines, so one substitution is enough.
        context = re.sub(r'\s+', ' ', text[start:end])
        return context.strip()

    def input_with_default(self, prompt: str, default: str = "") -> str:
        """Prompt for a value, returning ``default`` on empty input."""
        if default:
            user_input = input(f"{prompt} [{default}]: ").strip()
            return user_input if user_input else default
        return input(f"{prompt}: ").strip()

    def select_from_list(self, prompt: str, options: List[str],
                         default: Optional[str] = None) -> str:
        """Ask the user to pick one of ``options`` by its 1-based number.

        Loops until a valid number is entered; empty input selects ``default``
        when one is given.
        """
        print(f"\n{prompt}")
        for i, option in enumerate(options, 1):
            marker = " (défaut)" if option == default else ""
            print(f" {i}. {option}{marker}")

        while True:
            choice = input(f"Choix [1-{len(options)}]: ").strip()
            if not choice and default:
                return default
            try:
                idx = int(choice) - 1
            except ValueError:
                idx = -1
            if 0 <= idx < len(options):
                return options[idx]
            print(f"✗ Choix invalide. Entrez un nombre entre 1 et {len(options)}")

    @classmethod
    def _parse_methods(cls, raw: str) -> List[str]:
        """Split a comma-separated method list, ignoring blank entries.

        Falls back to ``DEFAULT_METHODS`` when nothing usable was entered, so
        stray commas never produce empty method names.
        """
        methods = [part.strip() for part in raw.split(',') if part.strip()]
        return methods if methods else list(cls.DEFAULT_METHODS)

    def _prompt_context(self, text: str, pii_text: str) -> str:
        """Propose the detected context for a PII, or ask for a manual one."""
        context = self.get_context(text, pii_text)
        if not context:
            return input("Contexte: ").strip()
        print(f"Contexte détecté: {context[:100]}...")
        use_context = input("Utiliser ce contexte? [O/n]: ").strip().lower()
        if use_context == 'n':
            return input("Contexte manuel: ").strip()
        return context

    def annotate_pii(self, page_num: int, text: str) -> List[Dict]:
        """Interactively collect the PII annotations of one page.

        Args:
            page_num: 0-based page index stored in each annotation.
            text: Page text, used to propose a context for each PII.

        Returns:
            The annotations entered for this page. They are NOT appended to
            ``self.annotations``; the caller does that.
        """
        page_annotations = []

        print(f"\n--- Annotation de la page {page_num + 1} ---")
        print("Commandes: 'q' pour terminer la page, 's' pour sauter")

        # Ids continue from the annotations already stored for earlier pages.
        ann_id = len(self.annotations) + 1

        while True:
            print(f"\n[Annotation #{ann_id}]")

            pii_text = input("Texte du PII (ou 'q' pour terminer, 's' pour sauter): ").strip()
            command = pii_text.lower()
            if command == 'q':
                break
            if command == 's':
                continue
            if not pii_text:
                print("✗ Le texte ne peut pas être vide")
                continue

            pii_type = self.select_from_list(
                "Type de PII:",
                self.PII_TYPES,
                default="NOM"
            )

            context = self._prompt_context(text, pii_text)

            # Anything other than an explicit 'n' counts as mandatory.
            mandatory_input = input("PII obligatoire (RGPD)? [O/n]: ").strip().lower()
            mandatory = mandatory_input != 'n'

            difficulty = self.select_from_list(
                "Difficulté de détection:",
                ["easy", "medium", "hard"],
                default="medium"
            )

            print("\nMéthodes de détection attendues (séparées par des virgules):")
            print(" Options: regex, vlm, ner, contextual, trackare")
            methods = self._parse_methods(input("Méthodes [regex,ner]: ").strip())

            annotation = {
                "id": f"ann_{ann_id:03d}",
                "page": page_num,
                "type": pii_type,
                "text": pii_text,
                "bbox": None,  # No bounding box yet (manual annotation)
                "context": context,
                "mandatory": mandatory,
                "difficulty": difficulty,
                "detection_method_expected": methods
            }

            page_annotations.append(annotation)
            print(f"✓ Annotation ajoutée: {pii_type} = '{pii_text}'")

            ann_id += 1

        return page_annotations

    def annotate_document(self):
        """Run the full interactive annotation of the document.

        Loads the PDF, asks for document metadata, annotates each page the
        user accepts, then collects the medical terms to preserve.

        Returns:
            True on completion, False when the PDF could not be loaded.
        """
        if not self.load_pdf():
            return False

        print("\n=== Métadonnées du document ===")
        self.metadata["annotator"] = self.input_with_default(
            "Nom de l'annotateur",
            default="annotator_1"
        )
        self.metadata["document_type"] = self.select_from_list(
            "Type de document:",
            ["compte_rendu", "trackare", "anapath", "bacterio", "consultation", "autre"],
            default="compte_rendu"
        )
        self.metadata["difficulty"] = self.select_from_list(
            "Difficulté globale du document:",
            ["simple", "moyen", "complexe"],
            default="moyen"
        )

        for page_num in range(len(self.doc)):
            self.display_page(page_num)

            annotate_page = input(f"\nAnnoter cette page? [O/n]: ").strip().lower()
            if annotate_page == 'n':
                continue

            text = self.extract_text(page_num)
            self.annotations.extend(self.annotate_pii(page_num, text))

        print("\n=== Termes médicaux à préserver ===")
        print("Entrez les termes médicaux qui ne doivent PAS être masqués")
        print("(un par ligne, ligne vide pour terminer)")

        while True:
            term = input("Terme médical: ").strip()
            if not term:
                break
            self.medical_terms.append(term)
            print(f"✓ Ajouté: {term}")

        return True

    def save_annotations(self):
        """Write annotations, metadata and per-type statistics as JSON."""
        by_type = dict(Counter(ann["type"] for ann in self.annotations))
        stats = {
            "total_pii": len(self.annotations),
            "by_type": by_type
        }

        output = {
            "pdf_path": str(self.pdf_path),
            "metadata": self.metadata,
            "annotations": self.annotations,
            "medical_terms_to_preserve": self.medical_terms,
            "statistics": stats
        }

        with open(self.annotations_path, 'w', encoding='utf-8') as f:
            json.dump(output, f, indent=2, ensure_ascii=False)

        print(f"\n✓ Annotations sauvegardées: {self.annotations_path}")
        print(f" Total PII: {stats['total_pii']}")
        print(f" Types: {', '.join(f'{k}={v}' for k, v in by_type.items())}")
        print(f" Termes médicaux: {len(self.medical_terms)}")

    def load_existing_annotations(self) -> bool:
        """Load previously saved annotations, if the JSON file exists.

        Returns:
            True when the file was read and applied, False when it is missing
            or unreadable (the error is printed).
        """
        if not self.annotations_path.exists():
            return False

        try:
            with open(self.annotations_path, 'r', encoding='utf-8') as f:
                data = json.load(f)

            self.metadata = data.get("metadata", self.metadata)
            self.annotations = data.get("annotations", [])
            self.medical_terms = data.get("medical_terms_to_preserve", [])

            print(f"✓ Annotations existantes chargées: {len(self.annotations)} PII")
            return True
        except (OSError, ValueError, AttributeError) as e:
            # ValueError covers malformed JSON and bad encodings;
            # AttributeError covers a top-level JSON value that is not an object.
            print(f"✗ Erreur lors du chargement des annotations: {e}")
            return False

    def run(self):
        """Run the tool end to end: confirm overwrite, annotate, save.

        The PDF handle is always released before returning.

        Returns:
            True when annotations were saved, False when the user declined to
            overwrite or the PDF could not be loaded.
        """
        print(f"\n{'='*80}")
        print(f"OUTIL D'ANNOTATION - {self.pdf_path.name}")
        print(f"{'='*80}")

        if self.annotations_path.exists():
            overwrite = input("\n⚠ Des annotations existent déjà. Écraser? [o/N]: ").strip().lower()
            if overwrite != 'o':
                print("Annotation annulée.")
                return False

        try:
            if not self.annotate_document():
                return False
        finally:
            # Saving only needs the collected data, not the open document.
            self.close()

        self.save_annotations()
        return True


def list_documents():
    """Print every PDF in ``PDFS_DIR`` with its annotation status."""
    pdfs_dir = PDFS_DIR

    if not pdfs_dir.exists():
        print(f"✗ Répertoire introuvable: {pdfs_dir}")
        return

    pdfs = sorted(pdfs_dir.glob("*.pdf"))

    if not pdfs:
        print(f"✗ Aucun PDF trouvé dans {pdfs_dir}")
        return

    print(f"\n{'='*80}")
    print(f"DOCUMENTS DISPONIBLES ({len(pdfs)})")
    print(f"{'='*80}\n")

    for pdf in pdfs:
        status = "✓ Annoté" if _annotations_path_for(pdf).exists() else "○ À annoter"
        print(f"{status} {pdf.name}")


def find_next_unannotated() -> Optional[Path]:
    """Return the first PDF (in sorted order) without an annotation file.

    Returns None when the directory is missing or every PDF is annotated.
    """
    if not PDFS_DIR.exists():
        return None

    candidates = sorted(PDFS_DIR.glob("*.pdf"))
    return next(
        (pdf for pdf in candidates if not _annotations_path_for(pdf).exists()),
        None,
    )


def main():
    """Command-line entry point; exits non-zero when annotation fails."""
    args = sys.argv[1:]
    if not args:
        print("Usage:")
        print(" python tools/annotation_tool.py PDF_PATH")
        print(" python tools/annotation_tool.py --list")
        print(" python tools/annotation_tool.py --resume")
        sys.exit(1)

    option = args[0]

    if option == "--list":
        list_documents()
        return

    if option == "--resume":
        next_pdf = find_next_unannotated()
        if next_pdf is None:
            print("✓ Tous les documents sont annotés!")
            return
        print(f"Prochain document à annoter: {next_pdf.name}")
        # Report failure through the exit status, like the explicit-path mode.
        sys.exit(0 if AnnotationTool(next_pdf).run() else 1)

    pdf_path = Path(option)
    if not pdf_path.exists():
        print(f"✗ Fichier introuvable: {pdf_path}")
        sys.exit(1)

    tool = AnnotationTool(pdf_path)
    success = tool.run()
    sys.exit(0 if success else 1)


if __name__ == "__main__":
    main()