analysis: Analyse réelle de la qualité - Identification des faux positifs médicaux
This commit is contained in:
39
tools/compare_original_vs_anonymized.py
Normal file
39
tools/compare_original_vs_anonymized.py
Normal file
@@ -0,0 +1,39 @@
|
||||
#!/usr/bin/env python3
"""Compare an original document with its anonymized version.

Extracts the text of a hard-coded original PDF with pdfplumber, reads the
matching pseudonymized text file, then prints the length of each text
followed by the first 100 lines of each.
"""

import sys
from pathlib import Path

# Prepend the parent of this file's directory to the import path before the
# remaining imports run (presumably the repository root -- confirm).
sys.path.insert(0, str(Path(__file__).parent.parent))

import pdfplumber

# Horizontal rule printed around every section title.
RULE = "=" * 80
# Number of leading lines shown for each text.
PREVIEW_LINES = 100

# Original document and its pseudonymized counterpart
original_pdf = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs/102_23056463/CRH 23056364.pdf")
anonymized_txt = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs/anonymise/CRH 23056364.pseudonymise.txt")


def _head(text, limit=PREVIEW_LINES):
    """Return the first *limit* newline-separated lines of *text*, rejoined."""
    return "\n".join(text.split("\n")[:limit])


print(RULE)
print("COMPARAISON ORIGINAL vs ANONYMISÉ")
print(RULE)

# Extract the original text page by page; a page whose extraction yields a
# falsy value (e.g. None) contributes an empty string.
print("\n📄 Extraction du texte original...")
with pdfplumber.open(original_pdf) as pdf:
    page_texts = [page.extract_text() or "" for page in pdf.pages]
original_text = "\n".join(page_texts)

# Read the anonymized text
anonymized_text = anonymized_txt.read_text(encoding="utf-8")

print(f"\n📊 Longueur texte original: {len(original_text)} caractères")
print(f"📊 Longueur texte anonymisé: {len(anonymized_text)} caractères")

# Show the first 100 lines of each text under its own banner.
for title, text in (
    ("TEXTE ORIGINAL (100 premières lignes):", original_text),
    ("TEXTE ANONYMISÉ (100 premières lignes):", anonymized_text),
):
    print("\n" + RULE)
    print(title)
    print(RULE)
    print(_head(text))
|
||||
Reference in New Issue
Block a user