#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Analyse the OGC corpus and select a representative set of documents.

The script samples PDFs found one directory level below the corpus root,
records basic statistics for each (size, page count, type guessed from the
file name), prints summary distributions and writes a stratified selection
of documents to ``tests/ground_truth/selected_documents.json``.
"""

import json
import random
import sys
from collections import Counter
from pathlib import Path

try:
    import fitz  # PyMuPDF
except ImportError:
    print("PyMuPDF non disponible, analyse limitée")
    fitz = None

# Corpus location used when no directory is supplied.
DEFAULT_CORPUS_DIR = Path(
    "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/"
)

# Where the selection is written (relative to the current working directory).
OUTPUT_FILE = Path("tests/ground_truth/selected_documents.json")

# Upper bound on the number of PDFs analysed.
MAX_SAMPLE_SIZE = 100

# Ordered (type, keywords) rules: the first rule with a keyword contained in
# the lower-cased file name wins. "cro" must come before "cr" because every
# name containing "cro" also contains "cr"; in the opposite order the "cro"
# type can never be assigned.
_TYPE_RULES = (
    ("trackare", ("trackare",)),
    ("cro", ("cro",)),
    ("compte_rendu", ("crh", "cr")),
    ("anapath", ("anapath",)),
    ("lettre_sortie", ("lettre", "sortie")),
)

# (complexity key, plural label used in messages, number of documents wanted).
_SELECTION_QUOTAS = (
    ("simple", "simples", 10),
    ("moyen", "moyens", 15),
    ("complexe", "complexes", 5),
)


def _guess_type(filename: str) -> str:
    """Return the document type inferred from *filename*, or ``"unknown"``."""
    name_lower = filename.lower()
    for doc_type, keywords in _TYPE_RULES:
        if any(keyword in name_lower for keyword in keywords):
            return doc_type
    return "unknown"


def analyze_pdf(pdf_path: Path) -> dict:
    """Collect basic statistics about one PDF file.

    Args:
        pdf_path: Path of the PDF to inspect; the file must exist because its
            size is read with ``stat()``.

    Returns:
        A dict with the keys ``path``, ``folder`` (name of the parent
        directory), ``filename``, ``size_mb`` (rounded to two decimals),
        ``pages`` and ``type``. ``pages`` stays 0 when PyMuPDF is not
        installed or the file cannot be opened.
    """
    stats = {
        "path": str(pdf_path),
        "folder": pdf_path.parent.name,
        "filename": pdf_path.name,
        "size_mb": round(pdf_path.stat().st_size / (1024 * 1024), 2),
        "pages": 0,
        "type": _guess_type(pdf_path.name),
    }

    # Count pages only when PyMuPDF could be imported.
    if fitz:
        try:
            doc = fitz.open(str(pdf_path))
            try:
                stats["pages"] = len(doc)
            finally:
                # Release the handle even if reading the page count fails.
                doc.close()
        except Exception as exc:
            # Best effort: an unreadable PDF must not abort the whole run,
            # but the failure is reported instead of being silently dropped.
            print(
                f"⚠️ Lecture impossible de {pdf_path.name} : {exc}",
                file=sys.stderr,
            )

    return stats


def classify_complexity(stats: dict) -> str:
    """Classify a document as ``"simple"``, ``"moyen"`` or ``"complexe"``.

    Args:
        stats: Mapping providing at least ``pages`` and ``size_mb``.

    Returns:
        ``"simple"`` for at most 2 pages and less than 0.3 MB, ``"complexe"``
        for 6 pages or more or over 1.0 MB, ``"moyen"`` otherwise.
    """
    pages = stats["pages"]
    size_mb = stats["size_mb"]

    if pages <= 2 and size_mb < 0.3:
        return "simple"
    if pages >= 6 or size_mb > 1.0:
        return "complexe"
    return "moyen"


def _print_distribution(title: str, counts: list, total: int) -> None:
    """Print *title* followed by one count/percentage line per entry."""
    print(title)
    for label, count in counts:
        pct = (count / total) * 100
        print(f" {label:20s} : {count:3d} ({pct:5.1f}%)")


def _pick(pool: list, quota: int, label: str) -> list:
    """Return *quota* random documents from *pool*, or all of it if too small."""
    if len(pool) >= quota:
        return random.sample(pool, quota)
    print(f"⚠️ Seulement {len(pool)} documents {label} disponibles")
    return list(pool)


def main(corpus_dir: Path = DEFAULT_CORPUS_DIR) -> int:
    """Analyse a sample of the corpus and save a representative selection.

    Args:
        corpus_dir: Root directory of the corpus; PDFs are looked up one
            directory level below it. Defaults to ``DEFAULT_CORPUS_DIR``.

    Returns:
        Process exit code: 0 on success, 1 when the directory does not exist
        or contains no PDF.
    """
    corpus_dir = Path(corpus_dir)
    if not corpus_dir.exists():
        print(f"Erreur : {corpus_dir} n'existe pas")
        return 1

    print("Analyse du corpus OGC...")
    print(f"Répertoire : {corpus_dir}")

    # Collect every PDF located in a direct sub-directory of the corpus.
    all_pdfs = list(corpus_dir.glob("*/*.pdf"))
    print(f"Total PDFs trouvés : {len(all_pdfs)}")
    if not all_pdfs:
        # Without this guard the min()/max() statistics below would raise
        # ValueError on an empty list.
        print(f"Erreur : aucun PDF trouvé dans {corpus_dir}")
        return 1

    # Analyse a random sample to estimate the corpus make-up.
    sample_size = min(MAX_SAMPLE_SIZE, len(all_pdfs))
    sample = random.sample(all_pdfs, sample_size)

    print(f"\nAnalyse d'un échantillon de {sample_size} documents...")

    analyzed = []
    for i, pdf_path in enumerate(sample, 1):
        if i % 20 == 0:
            print(f" Analysé {i}/{sample_size}...")
        stats = analyze_pdf(pdf_path)
        stats["complexity"] = classify_complexity(stats)
        analyzed.append(stats)

    # Global statistics.
    print("\n" + "=" * 60)
    print("STATISTIQUES GLOBALES")
    print("=" * 60)

    # By type, most frequent first (ties keep first-seen order).
    types_count = Counter(s["type"] for s in analyzed)
    _print_distribution(
        "\nRépartition par type :", types_count.most_common(), len(analyzed)
    )

    # By complexity, in alphabetical order of the label.
    complexity_count = Counter(s["complexity"] for s in analyzed)
    _print_distribution(
        "\nRépartition par complexité :",
        sorted(complexity_count.items()),
        len(analyzed),
    )

    # Page statistics; documents whose page count is unknown (0) are ignored.
    pages_list = [s["pages"] for s in analyzed if s["pages"] > 0]
    if pages_list:
        print("\nNombre de pages :")
        print(f" Min : {min(pages_list)}")
        print(f" Max : {max(pages_list)}")
        print(f" Moy : {sum(pages_list) / len(pages_list):.1f}")

    # Size statistics.
    sizes_list = [s["size_mb"] for s in analyzed]
    print("\nTaille (MB) :")
    print(f" Min : {min(sizes_list):.2f}")
    print(f" Max : {max(sizes_list):.2f}")
    print(f" Moy : {sum(sizes_list) / len(sizes_list):.2f}")

    # Selection of representative documents.
    print("\n" + "=" * 60)
    print("SÉLECTION DE 30 DOCUMENTS REPRÉSENTATIFS")
    print("=" * 60)

    # Strategy: 10 simple, 15 medium and 5 complex documents.
    pools = {
        key: [s for s in analyzed if s["complexity"] == key]
        for key, _label, _quota in _SELECTION_QUOTAS
    }

    print(
        f"\nDisponibles : {len(pools['simple'])} simples, "
        f"{len(pools['moyen'])} moyens, {len(pools['complexe'])} complexes"
    )

    selected = []
    for key, label, quota in _SELECTION_QUOTAS:
        selected.extend(_pick(pools[key], quota, label))

    print(f"\nTotal sélectionnés : {len(selected)}")

    # Save the selection.
    OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        json.dump(selected, f, indent=2, ensure_ascii=False)

    print(f"\nSélection sauvegardée dans : {OUTPUT_FILE}")

    # Display the list.
    print("\nDocuments sélectionnés :")
    print("-" * 80)
    for i, doc in enumerate(selected, 1):
        print(f"{i:2d}. [{doc['complexity']:8s}] {doc['folder']}/{doc['filename']}")
        print(f" {doc['pages']} pages, {doc['size_mb']} MB, type: {doc['type']}")

    return 0


if __name__ == "__main__":
    # An optional first command-line argument overrides the default corpus
    # directory.
    sys.exit(main(Path(sys.argv[1])) if len(sys.argv) > 1 else main())