feat: modules batch paths + masquage manuel + templates de masque
- gui_batch_paths.py : listing documents + construction chemins de sortie batch
- manual_masking.py : masquage manuel piloté par templates YAML
- config/mask_templates/ : template FC19

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
18
config/mask_templates/FC19_template.yml
Normal file
18
config/mask_templates/FC19_template.yml
Normal file
@@ -0,0 +1,18 @@
|
|||||||
|
# Manual-mask template "FC19_template": a page size plus a list of mask entries.
version: 1
name: FC19_template
page_size:
  # NOTE(review): 595 x 842 matches A4 expressed in PostScript points — confirm the unit.
  width: 595.0
  height: 842.0
masks:
  # Each entry carries a page number, two corner coordinates (x0, y0) / (x1, y1) and a label.
  # NOTE(review): `page` looks like a zero-based index — confirm against the template loader.
  - page: 0
    x0: 123.2
    y0: 25.6
    x1: 485.6
    y1: 66.4
    label: MASK
  - page: 0
    x0: 205.6
    y0: 351.2
    x1: 341.6
    y1: 367.2
    label: MASK
|
||||||
43
gui_batch_paths.py
Normal file
43
gui_batch_paths.py
Normal file
@@ -0,0 +1,43 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Iterable
|
||||||
|
|
||||||
|
|
||||||
|
def _is_relative_to(path: Path, other: Path) -> bool:
    """Return True when *path* is *other* itself or lies somewhere below it.

    Wraps ``Path.relative_to`` and converts its ``ValueError`` into ``False``.
    The comparison is purely lexical; the filesystem is not consulted.
    """
    try:
        path.relative_to(other)
    except ValueError:
        return False
    else:
        return True
|
||||||
|
|
||||||
|
|
||||||
|
def list_supported_documents(
    root_dir: Path,
    supported_extensions: Iterable[str],
    output_dirname: str = "anonymise",
) -> list[Path]:
    """List supported input documents while ignoring the GUI output subtree.

    Args:
        root_dir: Directory that is searched recursively.
        supported_extensions: File extensions to keep. Matching is
            case-insensitive and the leading dot is optional (``"pdf"`` and
            ``".PDF"`` are equivalent). A single string is treated as one
            extension rather than as an iterable of characters.
        output_dirname: Name of the sub-directory of *root_dir* whose content
            is skipped. Defaults to ``"anonymise"``.

    Returns:
        The matching files, sorted.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(supported_extensions, str):
        supported_extensions = (supported_extensions,)

    wanted_suffixes = set()
    for raw_ext in supported_extensions:
        ext = raw_ext.lower()
        # Path.suffix always includes the dot, so add it when the caller left
        # it out. The empty string is kept as-is: it matches suffix-less files.
        if ext and not ext.startswith("."):
            ext = "." + ext
        wanted_suffixes.add(ext)

    output_dir = root_dir / output_dirname

    return sorted(
        path
        for path in root_dir.rglob("*")
        # Cheap lexical checks first; is_file() touches the filesystem.
        if path.suffix.lower() in wanted_suffixes
        and not (path == output_dir or output_dir in path.parents)
        and path.is_file()
    )
|
||||||
|
|
||||||
|
|
||||||
|
def build_batch_output_dir(root_dir: Path, output_root: Path, source_path: Path) -> Path:
    """Mirror the source file's parent directory under the batch output root.

    The parent of *source_path*, taken relative to *root_dir*, is appended to
    *output_root*. A file sitting directly in *root_dir* maps to *output_root*
    itself.

    Raises:
        ValueError: If *source_path* is not located under *root_dir*.
    """
    sub_dir = source_path.relative_to(root_dir).parent
    # An empty ``parts`` tuple means the relative parent is ".".
    has_sub_dir = bool(sub_dir.parts)
    return output_root / sub_dir if has_sub_dir else output_root
|
||||||
|
|
||||||
|
|
||||||
|
def iter_pseudonymized_texts(output_dir: Path) -> Iterable[Path]:
    """Return an iterator over anonymized text outputs for post-run checks.

    The search is recursive and matches every path under *output_dir* whose
    name ends with ``.pseudonymise.txt``.
    """
    pattern = "*.pseudonymise.txt"
    return output_dir.rglob(pattern)
|
||||||
56
manual_masking.py
Normal file
56
manual_masking.py
Normal file
@@ -0,0 +1,56 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
|
||||||
|
# Location of the mask templates, relative to an application base directory.
MASK_TEMPLATES_SUBDIR = Path("config", "mask_templates")
# Lower-case file suffixes accepted as mask templates.
MASK_TEMPLATE_EXTENSIONS = {".json", ".yaml", ".yml"}
# Default directory names for masking output and preview.
DEFAULT_MASK_OUTPUT_DIRNAME = "anonymise"
DEFAULT_MASK_PREVIEW_DIRNAME = "anonymise_preview"
|
||||||
|
|
||||||
|
|
||||||
|
def mask_templates_dir(base_dir: Path) -> Path:
    """Return the mask-template directory located under *base_dir*.

    Only builds the path; nothing is created on disk.
    """
    return base_dir.joinpath(MASK_TEMPLATES_SUBDIR)
|
||||||
|
|
||||||
|
|
||||||
|
def ensure_mask_templates_dir(base_dir: Path) -> Path:
    """Create the mask-template directory under *base_dir* if needed and return it.

    Missing parent directories are created as well; an already existing
    directory is not an error.
    """
    templates_root = mask_templates_dir(base_dir)
    templates_root.mkdir(parents=True, exist_ok=True)
    return templates_root
|
||||||
|
|
||||||
|
|
||||||
|
def resolve_manual_mask_pdf(single_file: Optional[Path]) -> Optional[Path]:
    """Return *single_file* when it names a PDF, otherwise ``None``.

    The decision is based on the file suffix only (case-insensitive); the
    file is neither opened nor required to exist. ``None`` is passed through.
    """
    is_pdf = single_file is not None and single_file.suffix.lower() == ".pdf"
    return single_file if is_pdf else None
|
||||||
|
|
||||||
|
|
||||||
|
def list_mask_templates(base_dir: Path) -> list[Path]:
    """Return the sorted mask-template files found under *base_dir*.

    The template directory is created first when it does not exist yet, then
    searched recursively for files whose suffix (case-insensitive) is one of
    ``MASK_TEMPLATE_EXTENSIONS``.
    """
    found: list[Path] = []
    for candidate in ensure_mask_templates_dir(base_dir).rglob("*"):
        if not candidate.is_file():
            continue
        if candidate.suffix.lower() in MASK_TEMPLATE_EXTENSIONS:
            found.append(candidate)
    found.sort()
    return found
|
||||||
|
|
||||||
|
|
||||||
|
def mask_template_label(path: Path, base_dir: Optional[Path] = None) -> str:
    """Build a display label for the template file *path*.

    With a *base_dir*, the label is the path relative to that base's template
    directory. Without one, or when *path* is not inside that directory, the
    bare file name is used.
    """
    if base_dir is not None:
        try:
            relative = path.relative_to(mask_templates_dir(base_dir))
        except ValueError:
            # Template lives outside the template directory: use the file name.
            pass
        else:
            return str(relative)
    return path.name
|
||||||
|
|
||||||
|
|
||||||
|
def append_jsonl_file(target_path: Path, extra_path: Path) -> None:
    """Append the records of one JSONL file to the end of another.

    Nothing happens when either file is missing or when *extra_path* holds
    only whitespace; *target_path* is never created by this function.
    Leading and trailing whitespace of the extra content is stripped and a
    single trailing newline is written after it.

    Args:
        target_path: Existing JSONL file that receives the extra records.
        extra_path: JSONL file whose content is appended (read as UTF-8).
    """
    if not target_path.exists() or not extra_path.exists():
        return

    extra_text = extra_path.read_text(encoding="utf-8").strip()
    if not extra_text:
        return

    # If the target does not end with a newline, the first appended record
    # would be glued onto the last existing one and corrupt both lines.
    needs_separator = False
    target_size = target_path.stat().st_size
    if target_size:
        with target_path.open("rb") as existing:
            existing.seek(target_size - 1)
            needs_separator = existing.read(1) != b"\n"

    with target_path.open("a", encoding="utf-8") as target:
        if needs_separator:
            target.write("\n")
        target.write(extra_text + "\n")
|
||||||
Reference in New Issue
Block a user