feat(q1): F+sécurité — rescan inconditionnel + hardening quarantine

Suite des étapes Q-1 (F = rescan résiduel) + apport sécurité par Qwen
review Codex gpt-5.5 5 rounds (verdict READY FOR MERGE).

## anonymizer_core_refactored_onnx.py

- M5 Rescan résiduel inconditionnel : NIR/EMAIL/IBAN/TEL recherchés après
  TOUT nettoyage. Fail-closed — aucun output livré si > seuil
  (SEUIL_RESCAN_RESIDUEL = 0)
- M3 Return structuré : process_pdf retourne maintenant
  {"status": "quarantined", "reason": ..., "text": "", "audit": ""} au lieu
  de {} sur quarantaine — callers compatibles avec outputs["text"]/"audit"
- C3+M2 fallback préflight : si quarantine_mgr absent ET préflight rate,
  copie du PDF source dans out_dir/_preflight_failed/ avec chmod 0o700
  (le document n'est jamais perdu silencieusement)
- S5 guard double raster : "pdf_raster" not in outputs avant fallback
- Retrait import DocLogger (mort, jamais branché)

## quarantine.py

- _sanitize_doc_name() — anti path-traversal sur le nom de doc
- _escape_markdown_table_cell() — anti injection markdown dans INDEX.md
- _secure_quarantine_dir() — mkdir + chmod(0o700) systématique
- _append_errors_log() durci :
  os.open(O_CREAT|O_APPEND|O_WRONLY|O_NOFOLLOW, 0o600)
  + fcntl.flock(LOCK_EX) + os.fchmod
- Retrait DocLogger (code mort identifié en review)
- Retrait REASON_CODES (jamais utilisé)

## Limites connues

- QuarantineManager pas encore wired dans GUI/server.py — les callers
  actuels marchent en fallback (quarantine_mgr=None)
- finalize() + ProcessPoolExecutor : entries worker-local ne mergent pas
  automatiquement (à documenter)

## Validation

- 73 tests unit existants : OK (non-régression)
- 1 test Q-1 happy path : passe (dégelé dans commit suivant)
- Codex gpt-5.5 5 rounds review : READY FOR MERGE

Co-Authored-By: Qwen Code <noreply@anthropic.com>
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-06-02 10:44:52 +02:00
parent 217fc75983
commit 6df87defd1
2 changed files with 166 additions and 61 deletions

View File

@@ -95,13 +95,11 @@ except Exception:
try: try:
from quarantine import ( from quarantine import (
QuarantineManager, QuarantineManager,
DocLogger,
SEUIL_TEXTE_MINI, SEUIL_TEXTE_MINI,
SEUIL_RESCAN_RESIDUEL, SEUIL_RESCAN_RESIDUEL,
) )
except ImportError: except ImportError:
QuarantineManager = None # type: ignore QuarantineManager = None # type: ignore
DocLogger = None # type: ignore
SEUIL_TEXTE_MINI = 100 SEUIL_TEXTE_MINI = 100
SEUIL_RESCAN_RESIDUEL = 0 SEUIL_RESCAN_RESIDUEL = 0
@@ -4296,12 +4294,29 @@ def process_pdf(
extracted_chars=extracted_chars, extracted_chars=extracted_chars,
) )
try: try:
quarantine_mgr.quarantine_dir.mkdir(parents=True, exist_ok=True)
shutil.copy(pdf_path, quarantine_mgr.quarantine_dir / pdf_path.name) shutil.copy(pdf_path, quarantine_mgr.quarantine_dir / pdf_path.name)
except Exception as copy_err: except Exception as copy_err:
log.warning("Could not copy original PDF to quarantine for %s: %s", log.warning("Could not copy original PDF to quarantine for %s: %s",
pdf_path.name, copy_err) pdf_path.name, copy_err)
return {} else:
# Critical 3 + M2 : quarantine_mgr absent — fallback avec permissions
fallback_dir = out_dir / "_preflight_failed"
fallback_dir.mkdir(parents=True, exist_ok=True)
try:
os.chmod(str(fallback_dir), 0o700)
except OSError:
pass
try:
shutil.copy(pdf_path, fallback_dir / pdf_path.name)
log.error("Preflight failed AND quarantine_mgr unavailable. "
"Original PDF copied to %s for manual review (mode 0700).", fallback_dir)
except Exception as fb_err:
log.critical("Preflight failed, quarantine unavailable, AND fallback copy failed "
"for %s: %s — DOCUMENT LOST", pdf_path.name, fb_err)
# M3 : retourner un dict compatible — callers accèdent à outputs["text"] / ["audit"]
return {"status": "quarantined", "reason": "preflight_text_too_short",
"extracted_chars": extracted_chars, "seuil": SEUIL_TEXTE_MINI,
"text": "", "audit": ""}
# 1) Regex rules + NER-first cross-validation # 1) Regex rules + NER-first cross-validation
# Passer les NER managers pour que anonymise_document_regex exécute le NER # Passer les NER managers pour que anonymise_document_regex exécute le NER
@@ -4677,6 +4692,48 @@ def process_pdf(
before - len(anon.audit), before - len(anon.audit),
", ".join(sorted(removed_tokens)[:10])) ", ".join(sorted(removed_tokens)[:10]))
# M5 : Check résiduel post-TOUT-nettoyage (après rescan, CP orphan, tel fragmentés,
# initiales, whitelist). Si PII résiduelles > seuil, on NE LIVRE PAS — quarantaine full.
# Inconditionnel : toujours exécuté même si quarantine_mgr absent (Codex review).
if SEUIL_RESCAN_RESIDUEL is not None:
_residual_pii_patterns = [
(re.compile(RE_NIR.pattern if hasattr(RE_NIR, 'pattern') else r"\b\d{15}\b"), "NIR"),
(re.compile(r"\b[\w.%+-]+@[\w.-]+\.\w{2,}\b"), "EMAIL"),
(re.compile(r"\b(?:FR\d{2})?\s?\d{4}\s?\d{4}\s?\d{4}\s?\d{4}\s?\d{2,3}\b"), "IBAN"),
(re.compile(r"\b(?:\+33|0)[\s.\-]?\d[\s.\-]?(?:\d[\s.\-]?){8}\b"), "TEL"),
]
residual_count = 0
for pat, _label in _residual_pii_patterns:
residual_count += len(pat.findall(final_text))
if residual_count > SEUIL_RESCAN_RESIDUEL:
if quarantine_mgr is not None:
quarantine_mgr.flag(
doc_name=pdf_path.stem,
reason="rescan_residual_pii",
detail=f"{residual_count} residual PII after all cleaning passes (seuil={SEUIL_RESCAN_RESIDUEL})",
severity="full",
)
try:
shutil.copy(pdf_path, quarantine_mgr.quarantine_dir / pdf_path.name)
except Exception as copy_err:
log.warning("Could not copy PDF to quarantine for %s: %s", pdf_path.name, copy_err)
else:
# Sans quarantine_mgr : fallback sécurisé
fallback_dir = out_dir / "_rescan_failed"
fallback_dir.mkdir(parents=True, exist_ok=True)
try:
os.chmod(str(fallback_dir), 0o700)
except OSError:
pass
try:
shutil.copy(pdf_path, fallback_dir / pdf_path.name)
except Exception as copy_err:
log.warning("Could not copy PDF for rescan failure %s: %s", pdf_path.name, copy_err)
log.critical("Rescan found %d residual PII for %s — NO OUTPUT delivered",
residual_count, pdf_path.name)
return {"status": "quarantined", "reason": "rescan_residual_pii",
"residual_count": residual_count, "text": "", "audit": ""}
# Sauvegardes # Sauvegardes
base = pdf_path.stem base = pdf_path.stem
txt_path = out_dir / f"{base}.pseudonymise.txt" txt_path = out_dir / f"{base}.pseudonymise.txt"
@@ -4730,27 +4787,30 @@ def process_pdf(
exc=e, exc=e,
) )
else: else:
# S1 : passer raster_err (pas e) pour que la stacktrace corresponde
quarantine_mgr.flag( quarantine_mgr.flag(
doc_name=pdf_path.stem, doc_name=pdf_path.stem,
reason="pdf_redaction_failed", reason="pdf_redaction_failed",
detail=f"vector failed ({e}); raster also failed ({raster_err})", detail=f"vector failed ({e}); raster also failed ({raster_err})",
severity="partial", severity="partial",
exc=e, exc=raster_err,
) )
# Décision A finalisée : copier le texte en quarantaine pour autoportance # Décision A finalisée : copier le texte en quarantaine pour autoportance
# (l'opérateur peut tout consulter depuis un seul dossier) # (l'opérateur peut tout consulter depuis un seul dossier)
# S8 : pas de mkdir() ici — flag() le fait déjà via _secure_quarantine_dir()
try: try:
quarantine_mgr.quarantine_dir.mkdir(parents=True, exist_ok=True)
shutil.copy(txt_path, quarantine_mgr.quarantine_dir / txt_path.name) shutil.copy(txt_path, quarantine_mgr.quarantine_dir / txt_path.name)
except Exception as copy_err: except Exception as copy_err:
log.warning("Could not copy text to quarantine for %s: %s", log.warning("Could not copy text to quarantine for %s: %s",
pdf_path.name, copy_err) pdf_path.name, copy_err)
# Note : pas de raise — texte anonymisé disponible (et copié si quarantine_mgr) # Note : pas de raise — texte anonymisé disponible (et copié si quarantine_mgr)
if also_make_raster_burn and fitz is not None: if also_make_raster_burn and fitz is not None:
ras_path = out_dir / f"{base}.redacted_raster.pdf" # S5 : ne pas refaire le raster si le fallback vector→raster l'a déjà produit
redact_pdf_raster(pdf_path, anon.audit, ras_path, ogc_label=ogc_label, ocr_word_map=ocr_word_map) if "pdf_raster" not in outputs:
outputs["pdf_raster"] = str(ras_path) ras_path = out_dir / f"{base}.redacted_raster.pdf"
redact_pdf_raster(pdf_path, anon.audit, ras_path, ogc_label=ogc_label, ocr_word_map=ocr_word_map)
outputs["pdf_raster"] = str(ras_path)
return outputs return outputs
@@ -4842,11 +4902,15 @@ def process_document(
try: try:
outputs = process_pdf(pdf_path=pdf_path, out_dir=out_dir, **kwargs) outputs = process_pdf(pdf_path=pdf_path, out_dir=out_dir, **kwargs)
# Renommer les sorties pour refléter le nom original (pas le .tmp_convert.pdf) # Minor2 : ne traiter que les valeurs qui sont des chemins (str),
# pas les métadonnées scalaires (status, reason, extracted_chars, etc.)
if is_temp: if is_temp:
original_stem = doc_path.stem original_stem = doc_path.stem
renamed = {} renamed = {}
for key, path_str in outputs.items(): for key, path_str in outputs.items():
if not isinstance(path_str, str):
renamed[key] = path_str
continue
p = Path(path_str) p = Path(path_str)
if p.exists() and ".tmp_convert" in p.name: if p.exists() and ".tmp_convert" in p.name:
new_name = p.name.replace(doc_path.stem + ".tmp_convert", original_stem) new_name = p.name.replace(doc_path.stem + ".tmp_convert", original_stem)

View File

@@ -11,7 +11,10 @@ module local. Il n'écrit que dans les fichiers qu'on lui demande explicitement
""" """
from __future__ import annotations from __future__ import annotations
import fcntl
import json import json
import os
import re
import traceback import traceback
from dataclasses import dataclass, field from dataclasses import dataclass, field
from datetime import datetime from datetime import datetime
@@ -29,26 +32,15 @@ QUARANTINE_DIR_NAME = "quarantaine"
class QuarantineEntry: class QuarantineEntry:
"""Une entrée dans le registre de quarantaine d'un batch.""" """Une entrée dans le registre de quarantaine d'un batch."""
doc_name: str # nom de base sans extension doc_name: str # nom de base sans extension
reason: str # code court normalisé (cf REASON_CODES) reason: str # code court normalisé
detail: str # message libre detail: str # message libre
timestamp: str # ISO 8601 avec timezone timestamp: str # ISO 8601 avec timezone
severity: Literal["partial", "full"] # partial = PDF seul, full = doc entier severity: Literal["partial", "full"] # partial = PDF seul, full = doc entier
flags: list[str] = field(default_factory=list) # peut contenir plusieurs raisons cumulées flags: list[str] = field(default_factory=list) # peut contenir plusieurs raisons cumulées
stacktrace: Optional[str] = None # tb.format_exc() si exception stacktrace: Optional[str] = None # stacktrace formatée de l'exception
extracted_chars: int = 0 # nb caractères extraits (utile pour preflight) extracted_chars: int = 0 # nb caractères extraits (utile pour preflight)
# Codes raison normalisés (voir §5 du consolidé v2)
REASON_CODES: dict[str, Literal["partial", "full"]] = {
"preflight_text_too_short": "full",
"extraction_total_failure": "full",
"rescan_residual_pii": "full",
"pdf_redaction_failed": "partial",
"pdf_vector_fallback_to_raster": "partial",
"regex_user_invalid": "partial",
}
# Actions recommandées par code raison (utilisées dans INDEX.md) # Actions recommandées par code raison (utilisées dans INDEX.md)
RECOMMENDED_ACTIONS: dict[str, str] = { RECOMMENDED_ACTIONS: dict[str, str] = {
"preflight_text_too_short": "Vérifier OCR, ré-essayer avec docTR forcé", "preflight_text_too_short": "Vérifier OCR, ré-essayer avec docTR forcé",
@@ -60,6 +52,20 @@ RECOMMENDED_ACTIONS: dict[str, str] = {
_DEFAULT_RECOMMENDED_ACTION = "Voir le .reason.txt" _DEFAULT_RECOMMENDED_ACTION = "Voir le .reason.txt"
def _sanitize_doc_name(doc_name: str) -> str:
"""Nettoie le nom de document pour éviter path traversal et injection markdown."""
# Ne garder que le nom de base (sans composantes de chemin)
safe = Path(doc_name).name
# Remplacer les caractères dangereux par des underscores
safe = re.sub(r'[<>:"|?*\x00-\x1f]', '_', safe)
return safe
def _escape_markdown_table_cell(text: str) -> str:
"""Échappe les caractères spéciaux pour les cellules de tableau markdown."""
return text.replace("|", "\\|").replace("<", "&lt;").replace(">", "&gt;")
class QuarantineManager: class QuarantineManager:
"""Gestion centralisée de la quarantaine pour un batch entier. """Gestion centralisée de la quarantaine pour un batch entier.
@@ -77,6 +83,19 @@ class QuarantineManager:
self.profile_name: str = profile_name self.profile_name: str = profile_name
self.entries: list[QuarantineEntry] = [] self.entries: list[QuarantineEntry] = []
self._errors_log_path: Path = self.output_dir / "errors.log" self._errors_log_path: Path = self.output_dir / "errors.log"
self._quarantine_dir_secured: bool = False
def _secure_quarantine_dir(self) -> None:
"""Crée le répertoire de quarantaine avec permissions restrictives (0o700)."""
if self._quarantine_dir_secured:
return
self.quarantine_dir.mkdir(parents=True, exist_ok=True)
# Restreindre à l'utilisateur seul (RGPD/HDS — PII médicales)
try:
os.chmod(str(self.quarantine_dir), 0o700)
except OSError:
pass # certains FS ne supportent pas chmod
self._quarantine_dir_secured = True
def flag(self, doc_name: str, reason: str, detail: str, def flag(self, doc_name: str, reason: str, detail: str,
severity: Literal["partial", "full"], *, severity: Literal["partial", "full"], *,
@@ -85,19 +104,28 @@ class QuarantineManager:
flags: Optional[list[str]] = None) -> QuarantineEntry: flags: Optional[list[str]] = None) -> QuarantineEntry:
"""Crée une entrée, écrit le .reason.txt, append errors.log. """Crée une entrée, écrit le .reason.txt, append errors.log.
Si `exc` est fourni, on capture la stacktrace courante via Si `exc` est fourni, on capture sa stacktrace via
``traceback.format_exc()`` — ce qui suppose qu'on est appelé depuis ``traceback.format_exception`` — fonctionne même hors d'un bloc ``except``.
un bloc ``except``. Sinon la stacktrace reste None. Sinon la stacktrace reste None.
""" """
self.quarantine_dir.mkdir(parents=True, exist_ok=True) self._secure_quarantine_dir()
# Sanitiser le nom de document (anti path-traversal)
safe_doc_name = _sanitize_doc_name(doc_name)
# Capturer la stacktrace de l'exception passée (pas l'exception courante)
stacktrace: Optional[str] = None
if exc is not None:
stacktrace = "".join(traceback.format_exception(type(exc), exc, exc.__traceback__))
entry = QuarantineEntry( entry = QuarantineEntry(
doc_name=doc_name, doc_name=safe_doc_name,
reason=reason, reason=reason,
detail=detail, detail=detail,
timestamp=datetime.now().astimezone().isoformat(), timestamp=datetime.now().astimezone().isoformat(),
severity=severity, severity=severity,
flags=flags if flags is not None else [reason], flags=flags if flags is not None else [reason],
stacktrace=traceback.format_exc() if exc is not None else None, stacktrace=stacktrace,
extracted_chars=extracted_chars, extracted_chars=extracted_chars,
) )
self.entries.append(entry) self.entries.append(entry)
@@ -107,11 +135,13 @@ class QuarantineManager:
def has_full_quarantine(self, doc_name: str) -> bool: def has_full_quarantine(self, doc_name: str) -> bool:
"""True si un (ou plusieurs) flag `full` existe pour ce doc.""" """True si un (ou plusieurs) flag `full` existe pour ce doc."""
return any(e.doc_name == doc_name and e.severity == "full" for e in self.entries) safe = _sanitize_doc_name(doc_name)
return any(e.doc_name == safe and e.severity == "full" for e in self.entries)
def has_any_flag(self, doc_name: str) -> bool: def has_any_flag(self, doc_name: str) -> bool:
"""True si au moins un flag (partial ou full) existe pour ce doc.""" """True si au moins un flag (partial ou full) existe pour ce doc."""
return any(e.doc_name == doc_name for e in self.entries) safe = _sanitize_doc_name(doc_name)
return any(e.doc_name == safe for e in self.entries)
def finalize(self, total_docs_processed: Optional[int] = None) -> None: def finalize(self, total_docs_processed: Optional[int] = None) -> None:
"""Écrit quarantaine/INDEX.md à la fin du batch. """Écrit quarantaine/INDEX.md à la fin du batch.
@@ -121,14 +151,14 @@ class QuarantineManager:
""" """
if not self.entries: if not self.entries:
return return
self.quarantine_dir.mkdir(parents=True, exist_ok=True) self._secure_quarantine_dir()
index_path = self.quarantine_dir / "INDEX.md" index_path = self.quarantine_dir / "INDEX.md"
content = self._build_index_md(total_docs_processed=total_docs_processed) content = self._build_index_md(total_docs_processed=total_docs_processed)
index_path.write_text(content, encoding="utf-8") index_path.write_text(content, encoding="utf-8")
def _write_reason_txt(self, entry: QuarantineEntry) -> None: def _write_reason_txt(self, entry: QuarantineEntry) -> None:
"""Écrit quarantaine/<docname>.reason.txt selon format §6.1 du consolidé.""" """Écrit quarantaine/<docname>.reason.txt selon format §6.1 du consolidé."""
self.quarantine_dir.mkdir(parents=True, exist_ok=True) self._secure_quarantine_dir()
path = self.quarantine_dir / f"{entry.doc_name}.reason.txt" path = self.quarantine_dir / f"{entry.doc_name}.reason.txt"
severity_label = ( severity_label = (
"le document entier a été placé en quarantaine" "le document entier a été placé en quarantaine"
@@ -154,8 +184,14 @@ class QuarantineManager:
path.write_text("\n".join(lines), encoding="utf-8") path.write_text("\n".join(lines), encoding="utf-8")
def _append_errors_log(self, entry: QuarantineEntry) -> None: def _append_errors_log(self, entry: QuarantineEntry) -> None:
"""Append une ligne JSON dans errors.log (format JSON-lines).""" """Append une ligne JSON dans errors.log (format JSON-lines).
M1 : utilise os.open(O_NOFOLLOW) pour éviter TOCTOU symlink.
M4 : utilise fcntl.flock(LOCK_EX) pour serialization entre workers
ProcessPoolExecutor.
"""
self.output_dir.mkdir(parents=True, exist_ok=True) self.output_dir.mkdir(parents=True, exist_ok=True)
category = entry.reason.split("_")[0] if "_" in entry.reason else entry.reason category = entry.reason.split("_")[0] if "_" in entry.reason else entry.reason
record: dict[str, object] = { record: dict[str, object] = {
"ts": entry.timestamp, "ts": entry.timestamp,
@@ -165,8 +201,29 @@ class QuarantineManager:
"msg": entry.detail, "msg": entry.detail,
"severity": entry.severity, "severity": entry.severity,
} }
with open(self._errors_log_path, "a", encoding="utf-8") as f: line = json.dumps(record, ensure_ascii=False) + "\n"
f.write(json.dumps(record, ensure_ascii=False) + "\n")
# M1: O_NOFOLLOW — refuse les symlinks atomiquement (pas de TOCTOU)
# M4: O_CREAT|O_APPEND|O_WRONLY + permissions 0o600 dès la création
fd = os.open(
str(self._errors_log_path),
os.O_CREAT | os.O_APPEND | os.O_WRONLY | os.O_NOFOLLOW,
0o600,
)
try:
# Major2: réparer permissions si fichier existait avec mode trop permissif
try:
os.fchmod(fd, 0o600)
except OSError:
pass
# M4: lock exclusif pour serialiser les workers concurrents
fcntl.flock(fd, fcntl.LOCK_EX)
try:
os.write(fd, line.encode("utf-8"))
finally:
fcntl.flock(fd, fcntl.LOCK_UN)
finally:
os.close(fd)
def _build_index_md(self, total_docs_processed: Optional[int] = None) -> str: def _build_index_md(self, total_docs_processed: Optional[int] = None) -> str:
"""Construit le contenu du fichier INDEX.md selon §6.2 du consolidé.""" """Construit le contenu du fichier INDEX.md selon §6.2 du consolidé."""
@@ -199,7 +256,10 @@ class QuarantineManager:
for e in full_entries: for e in full_entries:
action = RECOMMENDED_ACTIONS.get(e.reason, _DEFAULT_RECOMMENDED_ACTION) action = RECOMMENDED_ACTIONS.get(e.reason, _DEFAULT_RECOMMENDED_ACTION)
lines.append( lines.append(
f"| {e.doc_name} | {e.reason} | {e.extracted_chars} | {action} |" f"| {_escape_markdown_table_cell(e.doc_name)} "
f"| {_escape_markdown_table_cell(e.reason)} "
f"| {e.extracted_chars} "
f"| {_escape_markdown_table_cell(action)} |"
) )
else: else:
lines.append("_Aucun document en quarantaine totale._") lines.append("_Aucun document en quarantaine totale._")
@@ -213,9 +273,12 @@ class QuarantineManager:
lines.append("|---|---|---|---|") lines.append("|---|---|---|---|")
for e in partial_entries: for e in partial_entries:
txt_path = self.output_dir / f"{e.doc_name}.pseudonymise.txt" txt_path = self.output_dir / f"{e.doc_name}.pseudonymise.txt"
flags_str = ", ".join(e.flags) flags_str = _escape_markdown_table_cell(", ".join(e.flags))
lines.append( lines.append(
f"| {e.doc_name} | {e.reason} | {txt_path} | {flags_str} |" f"| {_escape_markdown_table_cell(e.doc_name)} "
f"| {_escape_markdown_table_cell(e.reason)} "
f"| {txt_path} "
f"| {flags_str} |"
) )
else: else:
lines.append("_Aucun document en quarantaine partielle._") lines.append("_Aucun document en quarantaine partielle._")
@@ -230,25 +293,3 @@ class QuarantineManager:
lines.append("") lines.append("")
return "\n".join(lines) return "\n".join(lines)
class DocLogger:
"""Logger fichier par document. Append-only, pas de buffer."""
def __init__(self, log_path: Path) -> None:
self.log_path: Path = Path(log_path)
self.log_path.parent.mkdir(parents=True, exist_ok=True)
def _write(self, level: str, msg: str) -> None:
ts = datetime.now().astimezone().isoformat()
with open(self.log_path, "a", encoding="utf-8") as f:
f.write(f"{ts} [{level}] {msg}\n")
def info(self, msg: str) -> None:
self._write("INFO", msg)
def warning(self, msg: str) -> None:
self._write("WARNING", msg)
def error(self, msg: str) -> None:
self._write("ERROR", msg)