feat: Phase 1 - Système d'évaluation de la qualité
- Sélection et copie de 27 documents représentatifs (10 simples, 12 moyens, 5 complexes) - Outil d'annotation CLI complet (tools/annotation_tool.py) - Guide d'annotation détaillé (docs/annotation_guide.md) - Évaluateur de qualité (evaluation/quality_evaluator.py) * Calcul Précision, Rappel, F1-Score * Identification faux positifs/négatifs * Métriques par type de PII * Export JSON et rapports texte - Scanner de fuite (evaluation/leak_scanner.py) * Détection PII résiduels (CRITIQUE) * Détection nouveaux PII (HAUTE) * Scan métadonnées PDF (MOYENNE) - Benchmark de performance (evaluation/benchmark.py) * Mesure temps de traitement * Mesure CPU/RAM * Export JSON/CSV - Tests unitaires complets pour tous les composants - Documentation complète du module d'évaluation Tâches complétées: - 1.1.1 Sélection de 27 documents (au lieu de 30) - 1.1.2 Outil d'annotation CLI - 1.2.1 Évaluateur de qualité - 1.2.2 Scanner de fuite - 1.2.3 Benchmark de performance Prochaines étapes: - 1.1.3 Annotation des 27 documents (manuel) - 1.1.4 Enrichissement stopwords médicaux - 1.3 Mesure de la baseline
This commit is contained in:
194
tools/analyze_corpus.py
Executable file
194
tools/analyze_corpus.py
Executable file
@@ -0,0 +1,194 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Analyse du corpus OGC pour sélection de documents représentatifs.
|
||||
"""
|
||||
import sys
|
||||
from pathlib import Path
|
||||
import json
|
||||
import random
|
||||
|
||||
try:
|
||||
import fitz # PyMuPDF
|
||||
except ImportError:
|
||||
print("PyMuPDF non disponible, analyse limitée")
|
||||
fitz = None
|
||||
|
||||
|
||||
def analyze_pdf(pdf_path: Path) -> dict:
    """Profile a single PDF: size, page count and document type.

    Args:
        pdf_path: Path to an existing PDF file (``stat()`` is called on it).

    Returns:
        dict with keys ``path``, ``folder``, ``filename``, ``size_mb``,
        ``pages`` and ``type``. ``pages`` stays 0 when PyMuPDF is not
        installed or the file cannot be opened; ``type`` stays
        ``"unknown"`` when no filename keyword matches.
    """
    stats = {
        "path": str(pdf_path),
        "folder": pdf_path.parent.name,
        "filename": pdf_path.name,
        "size_mb": round(pdf_path.stat().st_size / (1024 * 1024), 2),
        "pages": 0,
        "type": "unknown",
    }

    # Infer the document type from filename keywords, most specific first.
    # BUGFIX: the previous order tested the very broad substring "cr"
    # before "cro", so the "cro" branch was unreachable and CRO files
    # were mislabelled as "compte_rendu".
    name_lower = pdf_path.name.lower()
    if "trackare" in name_lower:
        stats["type"] = "trackare"
    elif "anapath" in name_lower:
        stats["type"] = "anapath"
    elif "lettre" in name_lower or "sortie" in name_lower:
        stats["type"] = "lettre_sortie"
    elif "cro" in name_lower:
        stats["type"] = "cro"
    elif "crh" in name_lower or "cr" in name_lower:
        stats["type"] = "compte_rendu"

    # Count pages only when PyMuPDF is available; unreadable files keep 0.
    if fitz:
        try:
            doc = fitz.open(str(pdf_path))
            stats["pages"] = len(doc)
            doc.close()
        except Exception:
            pass

    return stats
|
||||
|
||||
|
||||
def classify_complexity(stats: dict) -> str:
    """Classify a document as "simple", "moyen" or "complexe".

    The decision is based only on the ``pages`` and ``size_mb`` entries
    of *stats*: small short documents are simple, long or heavy ones are
    complex, everything else is medium.
    """
    page_count = stats["pages"]
    megabytes = stats["size_mb"]

    # Guard clauses: cheap-to-process documents first, heavy ones next.
    if page_count <= 2 and megabytes < 0.3:
        return "simple"
    if page_count >= 6 or megabytes > 1.0:
        return "complexe"
    return "moyen"
|
||||
|
||||
|
||||
def _sample_tier(pool: list, count: int, label: str) -> list:
    """Draw *count* random documents from *pool* for one complexity tier.

    Falls back to the whole pool (with a warning) when fewer than
    *count* documents are available.
    """
    if len(pool) >= count:
        return random.sample(pool, count)
    print(f"⚠️ Seulement {len(pool)} documents {label} disponibles")
    return list(pool)


def main():
    """Analyze the OGC corpus and pick ~30 representative documents.

    Profiles a random sample of up to 100 PDFs, prints type, complexity,
    page-count and size distributions, then selects 10 simple + 15 medium
    + 5 complex documents and writes the selection to
    ``tests/ground_truth/selected_documents.json``.

    Returns:
        0 on success, 1 when the corpus directory does not exist.
    """
    corpus_dir = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/")

    if not corpus_dir.exists():
        print(f"Erreur : {corpus_dir} n'existe pas")
        return 1

    print("Analyse du corpus OGC...")
    print(f"Répertoire : {corpus_dir}")

    # Collect every PDF (corpus layout: one folder level, PDFs inside).
    all_pdfs = list(corpus_dir.glob("*/*.pdf"))
    print(f"Total PDFs trouvés : {len(all_pdfs)}")

    # Profile a random sample to estimate corpus-wide statistics.
    sample_size = min(100, len(all_pdfs))
    sample = random.sample(all_pdfs, sample_size)

    print(f"\nAnalyse d'un échantillon de {sample_size} documents...")

    analyzed = []
    for i, pdf_path in enumerate(sample, 1):
        if i % 20 == 0:
            print(f" Analysé {i}/{sample_size}...")
        stats = analyze_pdf(pdf_path)
        stats["complexity"] = classify_complexity(stats)
        analyzed.append(stats)

    # Global statistics
    print("\n" + "="*60)
    print("STATISTIQUES GLOBALES")
    print("="*60)

    # Distribution by document type, most frequent first.
    types_count = {}
    for s in analyzed:
        types_count[s["type"]] = types_count.get(s["type"], 0) + 1

    print("\nRépartition par type :")
    for doc_type, count in sorted(types_count.items(), key=lambda x: -x[1]):
        pct = (count / len(analyzed)) * 100
        print(f" {doc_type:20s} : {count:3d} ({pct:5.1f}%)")

    # Distribution by complexity, alphabetical.
    complexity_count = {}
    for s in analyzed:
        complexity_count[s["complexity"]] = complexity_count.get(s["complexity"], 0) + 1

    print("\nRépartition par complexité :")
    for complexity, count in sorted(complexity_count.items()):
        pct = (count / len(analyzed)) * 100
        print(f" {complexity:20s} : {count:3d} ({pct:5.1f}%)")

    # Page statistics; documents whose page count could not be read are 0
    # and excluded.
    pages_list = [s["pages"] for s in analyzed if s["pages"] > 0]
    if pages_list:
        print(f"\nNombre de pages :")
        print(f" Min : {min(pages_list)}")
        print(f" Max : {max(pages_list)}")
        print(f" Moy : {sum(pages_list) / len(pages_list):.1f}")

    # Size statistics. BUGFIX: guarded — min()/max() raise ValueError on an
    # empty list when the sample itself is empty (no PDFs found).
    sizes_list = [s["size_mb"] for s in analyzed]
    if sizes_list:
        print(f"\nTaille (MB) :")
        print(f" Min : {min(sizes_list):.2f}")
        print(f" Max : {max(sizes_list):.2f}")
        print(f" Moy : {sum(sizes_list) / len(sizes_list):.2f}")

    # Selection of 30 representative documents
    print("\n" + "="*60)
    print("SÉLECTION DE 30 DOCUMENTS REPRÉSENTATIFS")
    print("="*60)

    # Strategy: 10 simple, 15 medium, 5 complex; type variety comes from
    # the random draw within each tier.
    simples = [s for s in analyzed if s["complexity"] == "simple"]
    moyens = [s for s in analyzed if s["complexity"] == "moyen"]
    complexes = [s for s in analyzed if s["complexity"] == "complexe"]

    print(f"\nDisponibles : {len(simples)} simples, {len(moyens)} moyens, {len(complexes)} complexes")

    selected = []
    selected.extend(_sample_tier(simples, 10, "simples"))
    selected.extend(_sample_tier(moyens, 15, "moyens"))
    selected.extend(_sample_tier(complexes, 5, "complexes"))

    print(f"\nTotal sélectionnés : {len(selected)}")

    # Persist the selection for the copy / annotation steps.
    output_file = Path("tests/ground_truth/selected_documents.json")
    output_file.parent.mkdir(parents=True, exist_ok=True)

    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(selected, f, indent=2, ensure_ascii=False)

    print(f"\nSélection sauvegardée dans : {output_file}")

    # Human-readable recap of the final selection.
    print("\nDocuments sélectionnés :")
    print("-" * 80)
    for i, doc in enumerate(selected, 1):
        print(f"{i:2d}. [{doc['complexity']:8s}] {doc['folder']}/{doc['filename']}")
        print(f" {doc['pages']} pages, {doc['size_mb']} MB, type: {doc['type']}")

    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Propagate main()'s return value (0 or 1) as the process exit code.
    sys.exit(main())
|
||||
400
tools/annotation_tool.py
Executable file
400
tools/annotation_tool.py
Executable file
@@ -0,0 +1,400 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Outil d'annotation CLI pour créer le dataset de test annoté.
|
||||
|
||||
Usage:
|
||||
python tools/annotation_tool.py <pdf_path>
|
||||
python tools/annotation_tool.py --list
|
||||
python tools/annotation_tool.py --resume
|
||||
"""
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
from typing import List, Dict, Optional, Tuple
|
||||
import re
|
||||
|
||||
try:
|
||||
import pymupdf as fitz
|
||||
except ImportError:
|
||||
import fitz
|
||||
|
||||
|
||||
class AnnotationTool:
    """Interactive CLI tool for annotating PII in PDF documents.

    Walks the user through each page of a PDF, collects PII annotations
    (text, type, context, difficulty, expected detection methods) plus a
    list of medical terms that must NOT be masked, and saves everything
    to ``<pdf_stem>.annotations.json`` next to the source PDF.
    """

    # Closed vocabulary of PII categories offered during annotation.
    PII_TYPES = [
        "NOM", "PRENOM", "DATE_NAISSANCE", "AGE",
        "TEL", "EMAIL", "ADRESSE", "CODE_POSTAL", "VILLE",
        "NIR", "IPP", "NDA", "RPPS", "FINESS", "OGC",
        "NUMERO_PATIENT", "NUMERO_LOT", "NUMERO_ORDONNANCE", "NUMERO_SEJOUR",
        "ETABLISSEMENT", "SERVICE", "DATE", "AUTRE"
    ]

    def __init__(self, pdf_path: Path):
        """Prepare annotation state for *pdf_path*; nothing is opened yet."""
        self.pdf_path = pdf_path
        # Annotation sidecar file lives next to the PDF.
        self.annotations_path = pdf_path.parent / f"{pdf_path.stem}.annotations.json"
        self.doc = None  # fitz.Document, populated by load_pdf()
        self.annotations = []  # annotation dicts (shape built in annotate_pii)
        self.medical_terms = []  # terms that must never be masked
        self.metadata = {
            "annotator": "annotator_1",
            "annotation_date": datetime.now().isoformat(),
            "document_type": "unknown",
            "page_count": 0,
            "difficulty": "medium"
        }

    def load_pdf(self) -> bool:
        """Open the PDF and record its page count.

        Returns:
            True on success, False (with an error message) on failure.
        """
        try:
            self.doc = fitz.open(self.pdf_path)
            self.metadata["page_count"] = len(self.doc)
            print(f"✓ PDF chargé: {self.pdf_path.name}")
            print(f" Pages: {len(self.doc)}")
            return True
        except Exception as e:
            print(f"✗ Erreur lors du chargement du PDF: {e}")
            return False

    def extract_text(self, page_num: int) -> str:
        """Return the raw text of page *page_num*, or "" when the page
        is out of range or no document is loaded."""
        if not self.doc or page_num >= len(self.doc):
            return ""

        page = self.doc[page_num]
        return page.get_text()

    def display_page(self, page_num: int):
        """Print the full text of a page framed by separator lines."""
        text = self.extract_text(page_num)

        print(f"\n{'='*80}")
        print(f"PAGE {page_num + 1}/{len(self.doc)}")
        print(f"{'='*80}")
        print(text)
        print(f"{'='*80}\n")

    def get_context(self, text: str, pii_text: str, window: int = 50) -> str:
        """Return up to *window* characters of context on each side of the
        first occurrence of *pii_text* in *text*.

        Newlines and runs of whitespace are collapsed to single spaces.
        Returns "" when *pii_text* is not found.
        """
        pos = text.find(pii_text)
        if pos == -1:
            return ""

        start = max(0, pos - window)
        end = min(len(text), pos + len(pii_text) + window)
        context = text[start:end]

        # Collapse newlines and repeated whitespace for a one-line context.
        context = re.sub(r'\n+', ' ', context)
        context = re.sub(r'\s+', ' ', context)

        return context.strip()

    def input_with_default(self, prompt: str, default: str = "") -> str:
        """Prompt for input; an empty answer falls back to *default*."""
        if default:
            user_input = input(f"{prompt} [{default}]: ").strip()
            return user_input if user_input else default
        else:
            return input(f"{prompt}: ").strip()

    def select_from_list(self, prompt: str, options: List[str], default: Optional[str] = None) -> str:
        """Prompt until the user picks a valid 1-based option.

        An empty answer returns *default* when one is provided; invalid
        answers re-prompt.
        """
        print(f"\n{prompt}")
        for i, option in enumerate(options, 1):
            marker = " (défaut)" if option == default else ""
            print(f" {i}. {option}{marker}")

        while True:
            choice = input(f"Choix [1-{len(options)}]: ").strip()

            if not choice and default:
                return default

            try:
                idx = int(choice) - 1
                if 0 <= idx < len(options):
                    return options[idx]
            except ValueError:
                pass

            print(f"✗ Choix invalide. Entrez un nombre entre 1 et {len(options)}")

    def annotate_pii(self, page_num: int, text: str) -> List[Dict]:
        """Interactively collect PII annotations for one page.

        The user enters PII text snippets one by one ('q' ends the page,
        's' skips the current entry). Each annotation records type,
        context, RGPD-mandatory flag, difficulty and expected detection
        methods. Annotation ids continue from ``self.annotations`` so
        they stay unique across pages.
        """
        page_annotations = []

        print(f"\n--- Annotation de la page {page_num + 1} ---")
        print("Commandes: 'q' pour terminer la page, 's' pour sauter")

        ann_id = len(self.annotations) + 1

        while True:
            print(f"\n[Annotation #{ann_id}]")

            # PII text snippet (verbatim, as it appears on the page)
            pii_text = input("Texte du PII (ou 'q' pour terminer, 's' pour sauter): ").strip()

            if pii_text.lower() == 'q':
                break
            if pii_text.lower() == 's':
                continue
            if not pii_text:
                print("✗ Le texte ne peut pas être vide")
                continue

            # PII category, picked from the closed PII_TYPES vocabulary
            pii_type = self.select_from_list(
                "Type de PII:",
                self.PII_TYPES,
                default="NOM"
            )

            # Surrounding context (auto-extracted, user may override)
            context = self.get_context(text, pii_text)
            if context:
                print(f"Contexte détecté: {context[:100]}...")
                use_context = input("Utiliser ce contexte? [O/n]: ").strip().lower()
                if use_context == 'n':
                    context = input("Contexte manuel: ").strip()
            else:
                context = input("Contexte: ").strip()

            # RGPD-mandatory flag (defaults to yes; only 'n' opts out)
            mandatory_input = input("PII obligatoire (RGPD)? [O/n]: ").strip().lower()
            mandatory = mandatory_input != 'n'

            # Expected detection difficulty
            difficulty = self.select_from_list(
                "Difficulté de détection:",
                ["easy", "medium", "hard"],
                default="medium"
            )

            # Detection methods expected to catch this PII
            print("\nMéthodes de détection attendues (séparées par des virgules):")
            print(" Options: regex, vlm, ner, contextual, trackare")
            methods_input = input("Méthodes [regex,ner]: ").strip()
            if not methods_input:
                methods = ["regex", "ner"]
            else:
                methods = [m.strip() for m in methods_input.split(',')]

            # Assemble the annotation record
            annotation = {
                "id": f"ann_{ann_id:03d}",
                "page": page_num,
                "type": pii_type,
                "text": pii_text,
                "bbox": None,  # no bounding box for now (manual annotation)
                "context": context,
                "mandatory": mandatory,
                "difficulty": difficulty,
                "detection_method_expected": methods
            }

            page_annotations.append(annotation)
            print(f"✓ Annotation ajoutée: {pii_type} = '{pii_text}'")

            ann_id += 1

        return page_annotations

    def annotate_document(self):
        """Run the full annotation flow for the document.

        Collects document metadata, iterates over every page (each one may
        be skipped), then gathers the medical terms to preserve.
        Returns True on completion, False when the PDF cannot be loaded.
        """
        if not self.load_pdf():
            return False

        # Document-level metadata
        print("\n=== Métadonnées du document ===")

        self.metadata["annotator"] = self.input_with_default(
            "Nom de l'annotateur",
            default="annotator_1"
        )

        self.metadata["document_type"] = self.select_from_list(
            "Type de document:",
            ["compte_rendu", "trackare", "anapath", "bacterio", "consultation", "autre"],
            default="compte_rendu"
        )

        self.metadata["difficulty"] = self.select_from_list(
            "Difficulté globale du document:",
            ["simple", "moyen", "complexe"],
            default="moyen"
        )

        # Annotate each page (the user may skip any of them)
        for page_num in range(len(self.doc)):
            self.display_page(page_num)

            annotate_page = input(f"\nAnnoter cette page? [O/n]: ").strip().lower()
            if annotate_page == 'n':
                continue

            text = self.extract_text(page_num)
            page_annotations = self.annotate_pii(page_num, text)
            self.annotations.extend(page_annotations)

        # Medical terms that must be preserved (never masked)
        print("\n=== Termes médicaux à préserver ===")
        print("Entrez les termes médicaux qui ne doivent PAS être masqués")
        print("(un par ligne, ligne vide pour terminer)")

        while True:
            term = input("Terme médical: ").strip()
            if not term:
                break
            self.medical_terms.append(term)
            print(f"✓ Ajouté: {term}")

        return True

    def save_annotations(self):
        """Write annotations, metadata and per-type statistics to the
        JSON sidecar file, then print a summary."""
        # Compute per-type statistics
        stats = {
            "total_pii": len(self.annotations),
            "by_type": {}
        }

        for ann in self.annotations:
            pii_type = ann["type"]
            stats["by_type"][pii_type] = stats["by_type"].get(pii_type, 0) + 1

        # Assemble the final structure
        output = {
            "pdf_path": str(self.pdf_path),
            "metadata": self.metadata,
            "annotations": self.annotations,
            "medical_terms_to_preserve": self.medical_terms,
            "statistics": stats
        }

        # Persist to disk (UTF-8, accents kept readable)
        with open(self.annotations_path, 'w', encoding='utf-8') as f:
            json.dump(output, f, indent=2, ensure_ascii=False)

        print(f"\n✓ Annotations sauvegardées: {self.annotations_path}")
        print(f" Total PII: {stats['total_pii']}")
        print(f" Types: {', '.join(f'{k}={v}' for k, v in stats['by_type'].items())}")
        print(f" Termes médicaux: {len(self.medical_terms)}")

    def load_existing_annotations(self) -> bool:
        """Load a previous annotation sidecar file if one exists.

        Returns:
            True when annotations were loaded, False when the file is
            absent or unreadable.
        """
        if not self.annotations_path.exists():
            return False

        try:
            with open(self.annotations_path, 'r', encoding='utf-8') as f:
                data = json.load(f)

            self.metadata = data.get("metadata", self.metadata)
            self.annotations = data.get("annotations", [])
            self.medical_terms = data.get("medical_terms_to_preserve", [])

            print(f"✓ Annotations existantes chargées: {len(self.annotations)} PII")
            return True
        except Exception as e:
            print(f"✗ Erreur lors du chargement des annotations: {e}")
            return False

    def run(self):
        """Top-level flow: confirm overwrite, annotate, save.

        Returns:
            True when annotation completed and was saved, False when the
            user declined to overwrite or the document failed to load.
        """
        print(f"\n{'='*80}")
        print(f"OUTIL D'ANNOTATION - {self.pdf_path.name}")
        print(f"{'='*80}")

        # Ask before clobbering an existing annotation file
        if self.annotations_path.exists():
            overwrite = input(f"\n⚠ Des annotations existent déjà. Écraser? [o/N]: ").strip().lower()
            if overwrite != 'o':
                print("Annotation annulée.")
                return False

        # Annotate the document
        if not self.annotate_document():
            return False

        # Persist the results
        self.save_annotations()

        return True
|
||||
|
||||
|
||||
def list_documents():
    """Print every PDF in the ground-truth folder with its annotation status.

    A document counts as annotated when a ``<stem>.annotations.json``
    sidecar file exists next to it. Prints an error and returns early
    when the folder is missing or empty.
    """
    pdfs_dir = Path("tests/ground_truth/pdfs")

    if not pdfs_dir.exists():
        print(f"✗ Répertoire introuvable: {pdfs_dir}")
        return

    documents = sorted(pdfs_dir.glob("*.pdf"))
    if not documents:
        print(f"✗ Aucun PDF trouvé dans {pdfs_dir}")
        return

    banner = "=" * 80
    print(f"\n{banner}")
    print(f"DOCUMENTS DISPONIBLES ({len(documents)})")
    print(f"{banner}\n")

    for document in documents:
        sidecar = document.parent / f"{document.stem}.annotations.json"
        status = "✓ Annoté" if sidecar.exists() else "○ À annoter"
        print(f"{status} {document.name}")
|
||||
|
||||
|
||||
def find_next_unannotated() -> Optional[Path]:
    """Return the first PDF (in sorted order) that has no annotation
    sidecar file, or None when the folder is missing or everything is
    already annotated."""
    ground_truth_dir = Path("tests/ground_truth/pdfs")

    if not ground_truth_dir.exists():
        return None

    # First candidate without a <stem>.annotations.json companion file.
    return next(
        (
            candidate
            for candidate in sorted(ground_truth_dir.glob("*.pdf"))
            if not (candidate.parent / f"{candidate.stem}.annotations.json").exists()
        ),
        None,
    )
|
||||
|
||||
|
||||
def main():
    """CLI entry point.

    Dispatches on the first argument: ``--list`` shows document status,
    ``--resume`` opens the next unannotated PDF, anything else is taken
    as a PDF path to annotate. Exits 1 on usage errors or a failed run.
    """
    if len(sys.argv) <= 1:
        # No argument at all: show usage and fail.
        print("Usage:")
        print(" python tools/annotation_tool.py <pdf_path>")
        print(" python tools/annotation_tool.py --list")
        print(" python tools/annotation_tool.py --resume")
        sys.exit(1)

    argument = sys.argv[1]

    if argument == "--list":
        list_documents()
        return

    if argument == "--resume":
        pending = find_next_unannotated()
        if pending is None:
            print("✓ Tous les documents sont annotés!")
        else:
            print(f"Prochain document à annoter: {pending.name}")
            AnnotationTool(pending).run()
        return

    # Anything else is treated as an explicit PDF path.
    pdf_path = Path(argument)
    if not pdf_path.exists():
        print(f"✗ Fichier introuvable: {pdf_path}")
        sys.exit(1)

    annotator = AnnotationTool(pdf_path)
    sys.exit(0 if annotator.run() else 1)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point; main() parses arguments and exits itself.
    main()
|
||||
96
tools/copy_selected_docs.py
Normal file
96
tools/copy_selected_docs.py
Normal file
@@ -0,0 +1,96 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Script pour copier les documents sélectionnés dans tests/ground_truth/
|
||||
"""
|
||||
import json
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
|
||||
def _dest_filename(index: int, doc: dict) -> str:
    """Build the unique destination name for a selected document.

    Format: ``{index:03d}_{complexity}_{type}_{original_name}`` with
    spaces in the original name replaced by underscores.
    """
    doc_type = doc.get('type', 'unknown')
    complexity = doc.get('complexity', 'unknown')
    safe_name = doc['filename'].replace(' ', '_')
    return f"{index:03d}_{complexity}_{doc_type}_{safe_name}"


def copy_selected_documents():
    """Copy the selected documents into ``tests/ground_truth/pdfs``.

    Reads ``tests/ground_truth/selected_documents.json``, copies each
    source PDF under a descriptive unique name, and writes a
    ``mapping.json`` linking destination names back to their originals.
    The destination name was previously computed twice by duplicated
    code (copy loop and mapping loop); it is now built once per document
    by ``_dest_filename`` so the two can never diverge.

    Returns:
        (copied_count, error_count) tuple.
    """
    # Load the list of selected documents
    selected_file = Path("tests/ground_truth/selected_documents.json")
    with open(selected_file, 'r', encoding='utf-8') as f:
        documents = json.load(f)

    # Create the destination directory
    dest_dir = Path("tests/ground_truth/pdfs")
    dest_dir.mkdir(parents=True, exist_ok=True)

    copied = 0
    errors = []
    mapping = []

    # Copy each document and record its mapping entry in the same pass
    for i, doc in enumerate(documents, 1):
        src_path = Path(doc['path'])
        dest_name = _dest_filename(i, doc)
        dest_path = dest_dir / dest_name

        try:
            if src_path.exists():
                shutil.copy2(src_path, dest_path)
                print(f"✓ Copié: {dest_name}")
                copied += 1
            else:
                error_msg = f"✗ Fichier introuvable: {src_path}"
                print(error_msg)
                errors.append(error_msg)
        except Exception as e:
            error_msg = f"✗ Erreur lors de la copie de {src_path}: {e}"
            print(error_msg)
            errors.append(error_msg)

        # Mapping entries cover every selected document, copied or not,
        # matching the previous behavior.
        mapping.append({
            "id": i,
            "dest_filename": dest_name,
            "original_path": doc['path'],
            "folder": doc['folder'],
            "original_filename": doc['filename'],
            "type": doc.get('type', 'unknown'),
            "complexity": doc.get('complexity', 'unknown'),
            "pages": doc.get('pages', 0),
            "size_mb": doc.get('size_mb', 0)
        })

    # Summary
    print(f"\n{'='*60}")
    print(f"Résumé:")
    print(f" Documents copiés: {copied}/{len(documents)}")
    print(f" Erreurs: {len(errors)}")
    print(f" Destination: {dest_dir.absolute()}")

    if errors:
        print(f"\nErreurs rencontrées:")
        for error in errors:
            print(f" {error}")

    # Persist the destination -> original mapping
    mapping_file = dest_dir / "mapping.json"
    with open(mapping_file, 'w', encoding='utf-8') as f:
        json.dump(mapping, f, indent=2, ensure_ascii=False)

    print(f"\nFichier de mapping créé: {mapping_file}")

    return copied, len(errors)
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: exit 0 only when every selected document copied.
    copied, errors = copy_selected_documents()
    # Use SystemExit rather than the site-provided exit() helper, which is
    # not guaranteed to exist when Python runs without the site module (-S).
    raise SystemExit(0 if errors == 0 else 1)
|
||||
Reference in New Issue
Block a user