Files
anonymisation/tools/convert_annotations_format.py

78 lines
1.9 KiB
Python
Executable File

#!/usr/bin/env python3
"""
Convert annotations from the structured format to the format expected by the evaluator.

Source format (structured per page):
{
    "pages": [
        {
            "page_number": 0,
            "pii": {
                "NOM": ["text1", "text2"],
                "TEL": ["text3"]
            }
        }
    ]
}

Target format (flat list):
{
    "annotations": [
        {"page": 0, "type": "NOM", "text": "text1"},
        {"page": 0, "type": "NOM", "text": "text2"},
        {"page": 0, "type": "TEL", "text": "text3"}
    ]
}
"""
import sys
import json
from pathlib import Path
def convert_annotation(input_file: Path, output_file: Path):
    """Convert one annotation file from the per-page layout to the flat list layout.

    The input JSON is read from ``input_file``. For every entry of its
    ``"pages"`` list, one ``{"page", "type", "text"}`` record is produced per
    text listed under that page's ``"pii"`` mapping. The result is written to
    ``output_file`` as ``{"pdf_path": ..., "annotations": [...]}``.

    The conversion is idempotent: when the input has no ``"pages"`` key but
    already holds an ``"annotations"`` list, that list is kept as-is. Without
    this, converting a file in place a second time would replace its
    annotations with an empty list.

    Args:
        input_file: Path of the JSON file to read.
        output_file: Path of the JSON file to write. It may be the same path
            as ``input_file``; the input is fully loaded before writing.
    """
    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    if "pages" not in data and isinstance(data.get("annotations"), list):
        # Already in the flat target format: preserve the existing records.
        annotations = data["annotations"]
    else:
        annotations = []
        for page_data in data.get("pages", []):
            page_num = page_data.get("page_number", 0)
            for pii_type, texts in page_data.get("pii", {}).items():
                if isinstance(texts, str):
                    # A bare string is a single text, not a sequence of
                    # characters to emit one by one.
                    texts = [texts]
                annotations.extend(
                    {"page": page_num, "type": pii_type, "text": text}
                    for text in texts
                )

    output_data = {
        "pdf_path": data.get("pdf_path", ""),
        "annotations": annotations,
    }

    with open(output_file, 'w', encoding='utf-8') as f:
        # ensure_ascii=False keeps accented characters readable in the file.
        json.dump(output_data, f, indent=2, ensure_ascii=False)
def main():
    """Convert every ``*.annotations.json`` file of the ground-truth folder.

    Files are looked up in ``tests/ground_truth/pdfs`` (relative to the
    current working directory), processed in sorted order, and each one is
    overwritten with its converted content. Progress is printed to stdout.

    Returns:
        0, used as the process exit status.
    """
    source_dir = Path("tests/ground_truth/pdfs")
    targets = sorted(source_dir.glob("*.annotations.json"))
    total = len(targets)

    print(f"Conversion de {total} fichiers d'annotations...")

    for target in targets:
        # Same path for input and output: the file is rewritten in place.
        convert_annotation(target, target)
        print(target.name)

    print("\n✓ Conversion terminée")
    return 0
# Script entry point: run the conversion and use its return value as exit code.
if __name__ == "__main__":
    raise SystemExit(main())