#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Compare the test dataset (100% quality) against production (regression).

Aggregates the ``*.audit.jsonl`` files found in both directories and prints
the differences in how the documents were processed.
"""

import json
import re
from collections import Counter
from pathlib import Path
from typing import Dict, List, Optional

# Default locations and sample size used when the script is run without arguments.
DEFAULT_TEST_DIR = Path("tests/ground_truth/pdfs/baseline_anonymized")
DEFAULT_PROD_DIR = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs/anonymise")
DEFAULT_SAMPLE_SIZE = 5


def analyze_audit_file(audit_path: Path) -> Dict:
    """Compute summary statistics for one JSONL audit file.

    Each non-blank line is parsed as a JSON object that must contain a
    ``kind`` key; ``page`` is optional and counted under ``-1`` when absent.

    Args:
        audit_path: Path of the ``.audit.jsonl`` file to read (UTF-8).

    Returns:
        A dict with the keys ``total`` (number of entries), ``by_kind`` and
        ``by_page`` (occurrence counts), ``global_tokens`` (entries whose
        kind ends with ``_GLOBAL``) and ``extracted_tokens`` (entries whose
        kind is exactly ``NOM_EXTRACTED``).

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If an entry has no ``kind`` key.
    """
    by_kind: Counter = Counter()
    by_page: Counter = Counter()

    with open(audit_path, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue
            entry = json.loads(line)
            by_kind[entry['kind']] += 1
            by_page[entry.get('page', -1)] += 1

    return {
        "total": sum(by_kind.values()),
        "by_kind": dict(by_kind),
        "by_page": dict(by_page),
        "global_tokens": sum(
            count for kind, count in by_kind.items() if kind.endswith("_GLOBAL")
        ),
        "extracted_tokens": by_kind.get("NOM_EXTRACTED", 0),
    }


def _average_stats(stats_list: List[Dict]) -> Dict[str, float]:
    """Return per-document averages for a non-empty list of file statistics."""
    count = len(stats_list)
    return {
        "total": sum(s["total"] for s in stats_list) / count,
        "global": sum(s["global_tokens"] for s in stats_list) / count,
        "extracted": sum(s["extracted_tokens"] for s in stats_list) / count,
    }


def _format_percent(diff: float, baseline: float) -> str:
    """Format ``diff`` as a signed percentage of ``baseline`` ("n/a" if it is 0)."""
    if not baseline:
        # A relative change against a zero baseline is undefined.
        return "n/a"
    return f"{diff / baseline * 100:+.1f}%"


def _summarize_dataset(directory: Path, heading: str, label: str,
                       sample_size: Optional[int]):
    """Analyze up to ``sample_size`` audit files of ``directory`` and print a summary.

    Returns a ``(stats_list, averages)`` tuple; ``averages`` is ``None`` when
    the directory contains no audit file.
    """
    print(heading)

    # Sort so that the sampled files are the same on every run.
    audit_files = sorted(directory.glob("*.audit.jsonl"))[:sample_size]

    stats_list = []
    for audit_file in audit_files:
        stats = analyze_audit_file(audit_file)
        stats_list.append(stats)
        print(f" • {audit_file.name}: {stats['total']} PII, {stats['global_tokens']} global, {stats['extracted_tokens']} extracted")

    if not stats_list:
        print(f" ⚠️ Aucun fichier *.audit.jsonl trouvé dans {directory}")
        return stats_list, None

    averages = _average_stats(stats_list)
    print(f"\n Moyennes {label}:")
    print(f" - PII/doc: {averages['total']:.1f}")
    print(f" - Global/doc: {averages['global']:.1f}")
    print(f" - Extracted/doc: {averages['extracted']:.1f}")
    return stats_list, averages


def _merge_kind_counts(stats_list: List[Dict]) -> Counter:
    """Sum the ``by_kind`` counters of every file into a single counter."""
    merged: Counter = Counter()
    for stats in stats_list:
        merged.update(stats["by_kind"])
    return merged


def _detect_problems(test_avg: Dict[str, float], prod_avg: Dict[str, float]) -> List[str]:
    """Return the warning messages triggered by the production averages."""
    problems = []

    # NOM_EXTRACTED
    if prod_avg['extracted'] > 0:
        problems.append("⚠️ NOM_EXTRACTED activé en production (devrait être désactivé)")

    # *_GLOBAL
    if prod_avg['global'] > test_avg['global'] * 2:
        problems.append(f"⚠️ Trop de tokens _GLOBAL en production ({prod_avg['global']:.1f} vs {test_avg['global']:.1f})")

    # Total PII
    if prod_avg['total'] > test_avg['total'] * 1.5:
        problems.append(f"⚠️ Trop de PII détectés en production ({prod_avg['total']:.1f} vs {test_avg['total']:.1f})")

    return problems


def compare_datasets(test_dir: Optional[Path] = None,
                     prod_dir: Optional[Path] = None,
                     sample_size: Optional[int] = DEFAULT_SAMPLE_SIZE) -> None:
    """Compare the test dataset against production and print a report.

    Args:
        test_dir: Directory holding the reference ``*.audit.jsonl`` files.
            Defaults to ``DEFAULT_TEST_DIR``.
        prod_dir: Directory holding the production ``*.audit.jsonl`` files.
            Defaults to ``DEFAULT_PROD_DIR``.
        sample_size: Maximum number of audit files analyzed per directory
            (files are taken in sorted order); ``None`` analyzes all of them.

    The report is written to standard output. If either directory contains
    no audit file, a warning is printed and the comparison is skipped.
    """
    test_dir = DEFAULT_TEST_DIR if test_dir is None else Path(test_dir)
    prod_dir = DEFAULT_PROD_DIR if prod_dir is None else Path(prod_dir)

    print("\n" + "="*80)
    print("COMPARAISON TEST DATASET vs PRODUCTION")
    print("="*80 + "\n")

    # Analyze the test dataset (good quality)
    test_stats_all, test_avg = _summarize_dataset(
        test_dir, "📊 Analyse TEST DATASET (bonne qualité)...", "TEST", sample_size
    )

    # Analyze production (regression)
    prod_stats_all, prod_avg = _summarize_dataset(
        prod_dir, "\n📊 Analyse PRODUCTION (régression)...", "PRODUCTION", sample_size
    )

    if test_avg is None or prod_avg is None:
        # Averages are undefined without data; stop instead of dividing by zero.
        print("\n ❌ Comparaison impossible: au moins un jeu de données est vide")
        return

    # Comparison of the averages
    print("\n" + "="*80)
    print("DIFFÉRENCES")
    print("="*80)

    diff_total = prod_avg['total'] - test_avg['total']
    diff_global = prod_avg['global'] - test_avg['global']
    diff_extracted = prod_avg['extracted'] - test_avg['extracted']

    print(f"\n PII/doc: {diff_total:+.1f} ({_format_percent(diff_total, test_avg['total'])})")
    print(f" Global/doc: {diff_global:+.1f} ({_format_percent(diff_global, test_avg['global'])})")
    print(f" Extracted/doc: {diff_extracted:+.1f} ({_format_percent(diff_extracted, test_avg['extracted'])})")

    # Breakdown by PII kind
    print("\n" + "="*80)
    print("RÉPARTITION PAR TYPE")
    print("="*80)

    test_by_kind = _merge_kind_counts(test_stats_all)
    prod_by_kind = _merge_kind_counts(prod_stats_all)

    kind_diffs = [
        (kind, test_by_kind[kind], prod_by_kind[kind], prod_by_kind[kind] - test_by_kind[kind])
        for kind in set(test_by_kind) | set(prod_by_kind)
    ]
    # Largest absolute difference first; ties broken by name for a stable report.
    kind_diffs.sort(key=lambda row: (-abs(row[3]), row[0]))

    print("\n Top 10 différences:")
    print(f" {'Type':<25} {'Test':<10} {'Prod':<10} {'Diff':<10}")
    print(f" {'-'*60}")
    for kind, test_c, prod_c, diff in kind_diffs[:10]:
        # "<+10": left-align with an explicit sign ("+<10" would pad with '+').
        print(f" {kind:<25} {test_c:<10} {prod_c:<10} {diff:<+10}")

    # Report detected problems
    print("\n" + "="*80)
    print("PROBLÈMES IDENTIFIÉS")
    print("="*80 + "\n")

    problems = _detect_problems(test_avg, prod_avg)
    if problems:
        for p in problems:
            print(f" {p}")
    else:
        print(" ✅ Aucun problème majeur détecté")


if __name__ == "__main__":
    compare_datasets()