docs: Analyse complète de la régression de qualité - Causes racines identifiées
This commit is contained in:
172
tools/compare_test_vs_production.py
Normal file
172
tools/compare_test_vs_production.py
Normal file
@@ -0,0 +1,172 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Comparaison entre test dataset (100% qualité) et production (régression)
|
||||
Identifie les différences de traitement
|
||||
"""
|
||||
import json
|
||||
from pathlib import Path
|
||||
from typing import Dict, List
|
||||
import re
|
||||
|
||||
def analyze_audit_file(audit_path: Path) -> Dict:
    """Summarize the PII hits recorded in one JSONL audit file.

    Each non-blank line of the file is a JSON object carrying a 'kind'
    field and an optional 'page' field (missing pages count as -1).

    Returns a dict with:
        total            -- number of records
        by_kind          -- record count per 'kind'
        by_page          -- record count per page
        global_tokens    -- records whose kind ends with "_GLOBAL"
        extracted_tokens -- records whose kind is "NOM_EXTRACTED"
    """
    records = [
        json.loads(raw)
        for raw in audit_path.read_text(encoding='utf-8').splitlines()
        if raw.strip()
    ]

    summary = {
        "total": len(records),
        "by_kind": {},
        "by_page": {},
        "global_tokens": 0,
        "extracted_tokens": 0,
    }

    for record in records:
        kind = record['kind']
        page = record.get('page', -1)

        summary["by_kind"][kind] = summary["by_kind"].get(kind, 0) + 1
        summary["by_page"][page] = summary["by_page"].get(page, 0) + 1

        if kind.endswith("_GLOBAL"):
            summary["global_tokens"] += 1
        if kind == "NOM_EXTRACTED":
            summary["extracted_tokens"] += 1

    return summary
|
||||
|
||||
def _collect_stats(directory: Path, limit: int = 5) -> List[Dict]:
    """Analyze up to *limit* audit files in *directory*, printing one line per file.

    Sorted glob makes the selected "first 5" deterministic across runs.
    """
    stats_all = []
    for audit_file in sorted(directory.glob("*.audit.jsonl"))[:limit]:
        stats = analyze_audit_file(audit_file)
        stats_all.append(stats)
        print(f"  • {audit_file.name}: {stats['total']} PII, {stats['global_tokens']} global, {stats['extracted_tokens']} extracted")
    return stats_all


def _average_stats(stats_all: List[Dict]) -> Dict:
    """Per-document averages of total / global / extracted counts (stats_all must be non-empty)."""
    n = len(stats_all)
    return {
        "total": sum(s["total"] for s in stats_all) / n,
        "global": sum(s["global_tokens"] for s in stats_all) / n,
        "extracted": sum(s["extracted_tokens"] for s in stats_all) / n,
    }


def _aggregate_by_kind(stats_all: List[Dict]) -> Dict:
    """Sum the per-kind counts across all analyzed documents."""
    totals = {}
    for stats in stats_all:
        for kind, count in stats["by_kind"].items():
            totals[kind] = totals.get(kind, 0) + count
    return totals


def compare_datasets():
    """Compare test dataset (good quality) vs production (regression) audits.

    Reads up to 5 *.audit.jsonl files from each directory, prints per-file
    and averaged PII statistics, the per-kind breakdown, and a list of
    detected quality problems. Prints a warning and returns early if either
    directory contains no audit files (avoids ZeroDivisionError).
    """
    # Test dataset (good quality baseline)
    test_dir = Path("tests/ground_truth/pdfs/baseline_anonymized")

    # Production (regression)
    prod_dir = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs/anonymise")

    print("\n" + "="*80)
    print("COMPARAISON TEST DATASET vs PRODUCTION")
    print("="*80 + "\n")

    # Analyze test dataset
    print("📊 Analyse TEST DATASET (bonne qualité)...")
    test_stats_all = _collect_stats(test_dir)

    # Analyze production
    print("\n📊 Analyse PRODUCTION (régression)...")
    prod_stats_all = _collect_stats(prod_dir)

    # Guard: averaging an empty list would divide by zero.
    if not test_stats_all or not prod_stats_all:
        print("\n⚠️ Aucun fichier *.audit.jsonl trouvé dans au moins un des répertoires")
        return

    test_avg = _average_stats(test_stats_all)
    print(f"\n  Moyennes TEST:")
    print(f"    - PII/doc: {test_avg['total']:.1f}")
    print(f"    - Global/doc: {test_avg['global']:.1f}")
    print(f"    - Extracted/doc: {test_avg['extracted']:.1f}")

    prod_avg = _average_stats(prod_stats_all)
    print(f"\n  Moyennes PRODUCTION:")
    print(f"    - PII/doc: {prod_avg['total']:.1f}")
    print(f"    - Global/doc: {prod_avg['global']:.1f}")
    print(f"    - Extracted/doc: {prod_avg['extracted']:.1f}")

    # Comparison
    print("\n" + "="*80)
    print("DIFFÉRENCES")
    print("="*80)

    diff_total = prod_avg['total'] - test_avg['total']
    diff_global = prod_avg['global'] - test_avg['global']
    diff_extracted = prod_avg['extracted'] - test_avg['extracted']

    # max(1, …) guards a zero average in the denominator (total was missing it)
    print(f"\n  PII/doc: {diff_total:+.1f} ({diff_total/max(1,test_avg['total'])*100:+.1f}%)")
    print(f"  Global/doc: {diff_global:+.1f} ({diff_global/max(1,test_avg['global'])*100:+.1f}%)")
    print(f"  Extracted/doc: {diff_extracted:+.1f} ({diff_extracted/max(1,test_avg['extracted'])*100:+.1f}%)")

    # Per-kind breakdown
    print("\n" + "="*80)
    print("RÉPARTITION PAR TYPE")
    print("="*80)

    test_by_kind = _aggregate_by_kind(test_stats_all)
    prod_by_kind = _aggregate_by_kind(prod_stats_all)

    # Top 10 kinds, sorted by absolute difference
    all_kinds = set(test_by_kind) | set(prod_by_kind)
    kind_diffs = []
    for kind in all_kinds:
        test_count = test_by_kind.get(kind, 0)
        prod_count = prod_by_kind.get(kind, 0)
        kind_diffs.append((kind, test_count, prod_count, prod_count - test_count))

    kind_diffs.sort(key=lambda x: abs(x[3]), reverse=True)

    print("\n  Top 10 différences:")
    print(f"  {'Type':<25} {'Test':<10} {'Prod':<10} {'Diff':<10}")
    print(f"  {'-'*60}")
    for kind, test_c, prod_c, diff in kind_diffs[:10]:
        # NOTE: fixed format spec — ':+<10' used '+' as the FILL char
        # (e.g. '3+++++++++'); ':<+10' left-aligns a signed number in width 10.
        print(f"  {kind:<25} {test_c:<10} {prod_c:<10} {diff:<+10}")

    # Identify the problems
    print("\n" + "="*80)
    print("PROBLÈMES IDENTIFIÉS")
    print("="*80 + "\n")

    problems = []

    # NOM_EXTRACTED should be disabled in production
    if prod_avg['extracted'] > 0:
        problems.append("⚠️ NOM_EXTRACTED activé en production (devrait être désactivé)")

    # *_GLOBAL tokens more than doubled
    if prod_avg['global'] > test_avg['global'] * 2:
        problems.append(f"⚠️ Trop de tokens _GLOBAL en production ({prod_avg['global']:.1f} vs {test_avg['global']:.1f})")

    # Total PII inflated by more than 50%
    if prod_avg['total'] > test_avg['total'] * 1.5:
        problems.append(f"⚠️ Trop de PII détectés en production ({prod_avg['total']:.1f} vs {test_avg['total']:.1f})")

    if problems:
        for p in problems:
            print(f"  {p}")
    else:
        print("  ✅ Aucun problème majeur détecté")
|
||||
|
||||
# Script entry point: run the test-vs-production comparison when executed directly.
if __name__ == "__main__":
    compare_datasets()
|
||||
Reference in New Issue
Block a user