feat: Annotation automatique et évaluation qualité baseline - Rappel 100%, Précision 18.97%
This commit is contained in:
77
tools/convert_annotations_format.py
Executable file
77
tools/convert_annotations_format.py
Executable file
@@ -0,0 +1,77 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
Convert annotations from the structured per-page format to the flat format
expected by the evaluator.

Source format (structured per page):
{
  "pages": [
    {
      "page_number": 0,
      "pii": {
        "NOM": ["text1", "text2"],
        "TEL": ["text3"]
      }
    }
  ]
}

Target format (flat list):
{
  "pdf_path": "...",
  "annotations": [
    {"page": 0, "type": "NOM", "text": "text1"},
    {"page": 0, "type": "NOM", "text": "text2"},
    {"page": 0, "type": "TEL", "text": "text3"}
  ]
}
"""
|
||||
import sys
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
def convert_annotation(input_file: Path, output_file: Path):
    """Convert one annotation file from the per-page layout to a flat list.

    The source layout is ``{"pages": [{"page_number": N, "pii": {TYPE:
    [text, ...]}}]}``. The document written to *output_file* is
    ``{"pdf_path": ..., "annotations": [{"page": N, "type": TYPE,
    "text": text}, ...]}``.

    The conversion is idempotent: an input that has no ``"pages"`` key but
    already carries an ``"annotations"`` list is treated as already
    converted and its annotations are written back unchanged, instead of
    being replaced by an empty list.

    Args:
        input_file: Path of the UTF-8 JSON file to read.
        output_file: Path of the JSON file to write. It may be the same
            path as *input_file*: the input is fully parsed before the
            output is opened for writing.

    Raises:
        OSError: If either file cannot be opened.
        json.JSONDecodeError: If *input_file* is not valid JSON.
    """
    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    if "pages" not in data and isinstance(data.get("annotations"), list):
        # Already in the flat format (e.g. a second in-place run): keep the
        # existing entries rather than wiping them.
        annotations = data["annotations"]
    else:
        annotations = []
        # `or` guards tolerate explicit JSON nulls as well as missing keys.
        for page_data in data.get("pages") or []:
            page_num = page_data.get("page_number", 0)

            for pii_type, texts in (page_data.get("pii") or {}).items():
                if isinstance(texts, str):
                    # A bare string would otherwise be split into characters.
                    texts = [texts]
                annotations.extend(
                    {"page": page_num, "type": pii_type, "text": text}
                    for text in texts or []
                )

    output_data = {
        "pdf_path": data.get("pdf_path", ""),
        "annotations": annotations,
    }

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(output_data, f, indent=2, ensure_ascii=False)
|
||||
|
||||
|
||||
def main(pdfs_dir: Path = Path("tests/ground_truth/pdfs")):
    """Convert every ``*.annotations.json`` file found in *pdfs_dir*.

    Each file is rewritten in place (it is passed to ``convert_annotation``
    as both input and output). Files are processed in sorted order. A file
    that cannot be read, parsed or written is reported on stderr and does
    not stop the remaining files from being processed.

    Args:
        pdfs_dir: Directory to scan (not recursive). Defaults to the
            ground-truth PDF directory, relative to the current working
            directory. A ``str`` path is accepted as well.

    Returns:
        0 when every file was converted, 1 when at least one file failed.
    """
    pdfs_dir = Path(pdfs_dir)
    annotation_files = sorted(pdfs_dir.glob("*.annotations.json"))

    print(f"Conversion de {len(annotation_files)} fichiers d'annotations...")

    failures = 0
    for ann_file in annotation_files:
        try:
            convert_annotation(ann_file, ann_file)
        except (OSError, ValueError) as err:
            # json.JSONDecodeError is a ValueError subclass.
            failures += 1
            print(f" ✗ {ann_file.name}: {err}", file=sys.stderr)
        else:
            print(f" ✓ {ann_file.name}")

    if failures:
        print(f"\n✗ {failures} fichier(s) en échec", file=sys.stderr)
        return 1

    print("\n✓ Conversion terminée")

    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: use main()'s return value as the exit status.
    exit_code = main()
    sys.exit(exit_code)
|
||||
Reference in New Issue
Block a user