#!/usr/bin/env python3
"""
Convert annotations from the page-structured format to the flat format
expected by the evaluator.

Source format (structured per page):
{
  "pages": [
    {
      "page_number": 0,
      "pii": {
        "NOM": ["text1", "text2"],
        "TEL": ["text3"]
      }
    }
  ]
}

Target format (flat list):
{
  "annotations": [
    {"page": 0, "type": "NOM", "text": "text1"},
    {"page": 0, "type": "NOM", "text": "text2"},
    {"page": 0, "type": "TEL", "text": "text3"}
  ]
}
"""

import sys
import json
from pathlib import Path


def _flatten_pages(pages):
    """Yield one flat ``{"page", "type", "text"}`` dict per PII text in *pages*."""
    for page_data in pages:
        page_num = page_data.get("page_number", 0)
        # A page may carry "pii": null; treat it like a page without PII.
        pii = page_data.get("pii") or {}
        for pii_type, texts in pii.items():
            # A bare string would otherwise be iterated character by character.
            if isinstance(texts, str):
                texts = [texts]
            for text in texts or ():
                yield {"page": page_num, "type": pii_type, "text": text}


def convert_annotation(input_file: Path, output_file: Path) -> None:
    """Convert one annotation file from the structured format to the flat list format.

    The conversion is idempotent: when the input has no "pages" key but
    already holds an "annotations" list, that list is written through
    unchanged. This matters because ``main`` converts files in place, so a
    second run would otherwise replace every annotation with an empty list.

    Args:
        input_file: Path of the JSON file to read.
        output_file: Path of the JSON file to write (may equal ``input_file``).

    Raises:
        OSError: If a file cannot be read or written.
        json.JSONDecodeError: If the input is not valid JSON.
    """
    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    if "pages" not in data and isinstance(data.get("annotations"), list):
        # Already in the target format: keep the existing annotations.
        annotations = data["annotations"]
    else:
        annotations = list(_flatten_pages(data.get("pages") or []))

    output_data = {
        "pdf_path": data.get("pdf_path", ""),
        "annotations": annotations,
    }

    # Serialize before opening the output so that a serialization error cannot
    # leave a truncated file behind (the output is often the input itself).
    payload = json.dumps(output_data, indent=2, ensure_ascii=False)
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(payload)


def main() -> int:
    """Convert, in place, every ``*.annotations.json`` file of the ground-truth directory.

    Returns:
        0, used as the process exit status.
    """
    pdfs_dir = Path("tests/ground_truth/pdfs")
    annotation_files = sorted(pdfs_dir.glob("*.annotations.json"))

    print(f"Conversion de {len(annotation_files)} fichiers d'annotations...")

    for ann_file in annotation_files:
        convert_annotation(ann_file, ann_file)
        print(f"  ✓ {ann_file.name}")

    print("\n✓ Conversion terminée")
    return 0


if __name__ == "__main__":
    sys.exit(main())