78 lines
1.9 KiB
Python
Executable File
78 lines
1.9 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
|
|
Convert annotations from the structured format to the format expected by the evaluator.

Source format (structured per page):
{
  "pages": [
    {
      "page_number": 0,
      "pii": {
        "NOM": ["text1", "text2"],
        "TEL": ["text3"]
      }
    }
  ]
}

Target format (flat list):
{
  "pdf_path": "...",
  "annotations": [
    {"page": 0, "type": "NOM", "text": "text1"},
    {"page": 0, "type": "NOM", "text": "text2"},
    {"page": 0, "type": "TEL", "text": "text3"}
  ]
}

The top-level "pdf_path" of the source, when present, is copied to the
output; it is written as an empty string otherwise.
|
|
"""
|
|
import sys
|
|
import json
|
|
from pathlib import Path
|
|
|
|
def convert_annotation(input_file: Path, output_file: Path):
    """Convert one annotation file from the per-page layout to the flat layout.

    The input JSON object is read for a ``"pages"`` list; each page supplies a
    ``"page_number"`` (0 when missing) and a ``"pii"`` mapping of PII type to
    a list of texts. The output JSON object contains ``"pdf_path"`` (copied
    from the input, ``""`` when missing) and ``"annotations"``, holding one
    ``{"page", "type", "text"}`` entry per text in input order.

    An input that is already flat (it has ``"annotations"`` but no
    ``"pages"``) keeps its annotations unchanged, so converting a file in
    place more than once does not erase it.

    Args:
        input_file: Path of the JSON file to read.
        output_file: Path of the JSON file to write. May be the same path as
            ``input_file``; the input is fully read and closed first.

    Raises:
        OSError: If either file cannot be opened.
        json.JSONDecodeError: If the input is not valid JSON.
    """
    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    if "pages" not in data and "annotations" in data:
        # Already in the target layout: rebuilding from the (absent) pages
        # would replace the existing annotations with an empty list.
        annotations = data["annotations"]
    else:
        annotations = []
        # `or` fallbacks let an explicit JSON null behave like a missing key.
        for page_data in data.get("pages") or []:
            page_num = page_data.get("page_number", 0)
            for pii_type, texts in (page_data.get("pii") or {}).items():
                if isinstance(texts, str):
                    # A bare string is one text, not a sequence of characters.
                    texts = [texts]
                annotations.extend(
                    {"page": page_num, "type": pii_type, "text": text}
                    for text in texts or []
                )

    output_data = {
        "pdf_path": data.get("pdf_path", ""),
        "annotations": annotations,
    }

    # Serialize before opening the output: opening with 'w' truncates the
    # file, which may be the input itself.
    serialized = json.dumps(output_data, indent=2, ensure_ascii=False)
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(serialized)
|
|
|
|
|
|
def main(pdfs_dir: "Path | None" = None):
    """Convert every ``*.annotations.json`` file in a directory, in place.

    Each matching file is passed to ``convert_annotation`` as both input and
    output, so it is overwritten with its converted form. A file that cannot
    be read or parsed is reported on stderr and skipped; the remaining files
    are still processed.

    Args:
        pdfs_dir: Directory to scan. Defaults to ``tests/ground_truth/pdfs``,
            resolved against the current working directory.

    Returns:
        0 when every file was converted, 1 when at least one file failed.
    """
    if pdfs_dir is None:
        pdfs_dir = Path("tests/ground_truth/pdfs")
    # Sorted so the progress output is in a stable order.
    annotation_files = sorted(Path(pdfs_dir).glob("*.annotations.json"))

    print(f"Conversion de {len(annotation_files)} fichiers d'annotations...")

    failures = 0
    for ann_file in annotation_files:
        try:
            convert_annotation(ann_file, ann_file)
        except (OSError, ValueError) as exc:
            # json.JSONDecodeError is a ValueError; keep going so one bad
            # file does not block the rest of the batch.
            failures += 1
            print(f" ✗ {ann_file.name}: {exc}", file=sys.stderr)
        else:
            print(f" ✓ {ann_file.name}")

    if failures:
        print(f"\n✗ {failures} fichier(s) en échec", file=sys.stderr)
        return 1

    print("\n✓ Conversion terminée")
    return 0
|
|
|
|
|
|
# Script entry point: use main()'s return value as the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|