feat(extract): normaliser ghs_injustifie en 0/1 (P2)
Qwen renvoie typiquement le libellé complet `0 SE 1 2 3 4 ATU FFM FSD` dans le champ ghs_injustifie alors qu'une seule valeur 0/1 est attendue. Ajout de `pipeline.checkboxes.parse_ghs_injustifie` qui extrait par regex le chiffre 0/1 placé en tête de chaîne, ou renvoie "" si la valeur est illisible (p. ex. `SE 1 2 3 4 ATU FFM FSD`, sans chiffre de tête). Post-traitement appliqué à chaque extraction recueil et aux 18 JSONs V2 existants (10 fichiers corrigés en place — les 8 autres avaient déjà ghs_injustifie absent ou vide). Note sur les 7 cases SE1-4/ATU/FFM/FSD : zones trop petites pour être calibrées à l'œil et aucun cas positif (`ghs_injustifie=1`) dans l'échantillon 2018 pour valider visuellement. La détection est en placeholder, à recalibrer sur un cas positif réel. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -282,7 +282,7 @@
|
|||||||
"ghm_reco": "06M094",
|
"ghm_reco": "06M094",
|
||||||
"ghs_reco": "2161",
|
"ghs_reco": "2161",
|
||||||
"recodage_impactant": "1",
|
"recodage_impactant": "1",
|
||||||
"ghs_injustifie": "0 SE 1 2 3 4 ATU FFM FSD",
|
"ghs_injustifie": "0",
|
||||||
"praticien_conseil": "DR JP VIGNAU",
|
"praticien_conseil": "DR JP VIGNAU",
|
||||||
"accord_desaccord": "accord",
|
"accord_desaccord": "accord",
|
||||||
"_checkbox_debug": {
|
"_checkbox_debug": {
|
||||||
|
|||||||
@@ -317,7 +317,7 @@
|
|||||||
"ghm_reco": "10M183",
|
"ghm_reco": "10M183",
|
||||||
"ghs_reco": "3969",
|
"ghs_reco": "3969",
|
||||||
"recodage_impactant": "1",
|
"recodage_impactant": "1",
|
||||||
"ghs_injustifie": "0 SE 1 2 3 4 ATU FFM FSD",
|
"ghs_injustifie": "0",
|
||||||
"praticien_conseil": "DR VIGNAU",
|
"praticien_conseil": "DR VIGNAU",
|
||||||
"accord_desaccord": "désaccord",
|
"accord_desaccord": "désaccord",
|
||||||
"_checkbox_debug": {
|
"_checkbox_debug": {
|
||||||
|
|||||||
@@ -215,7 +215,7 @@
|
|||||||
"ghm_reco": "06C042",
|
"ghm_reco": "06C042",
|
||||||
"ghs_reco": "1940",
|
"ghs_reco": "1940",
|
||||||
"recodage_impactant": "1",
|
"recodage_impactant": "1",
|
||||||
"ghs_injustifie": "0 SE 1 2 3 4 ATU FFM FSD",
|
"ghs_injustifie": "0",
|
||||||
"praticien_conseil": "DR VIGNAÚ",
|
"praticien_conseil": "DR VIGNAÚ",
|
||||||
"accord_desaccord": "désaccord",
|
"accord_desaccord": "désaccord",
|
||||||
"_checkbox_debug": {
|
"_checkbox_debug": {
|
||||||
|
|||||||
@@ -304,7 +304,7 @@
|
|||||||
"ghm_reco": "01C061",
|
"ghm_reco": "01C061",
|
||||||
"ghs_reco": "34",
|
"ghs_reco": "34",
|
||||||
"recodage_impactant": "1",
|
"recodage_impactant": "1",
|
||||||
"ghs_injustifie": "0 SE 1 2 3 4 ATU FFM FSD",
|
"ghs_injustifie": "0",
|
||||||
"praticien_conseil": "",
|
"praticien_conseil": "",
|
||||||
"accord_desaccord": "désaccord",
|
"accord_desaccord": "désaccord",
|
||||||
"_checkbox_debug": {
|
"_checkbox_debug": {
|
||||||
|
|||||||
@@ -330,7 +330,7 @@
|
|||||||
"ghm_reco": "03M112",
|
"ghm_reco": "03M112",
|
||||||
"ghs_reco": "861",
|
"ghs_reco": "861",
|
||||||
"recodage_impactant": "1",
|
"recodage_impactant": "1",
|
||||||
"ghs_injustifie": "0 SE 1 2 3 4 ATU FFM FSD",
|
"ghs_injustifie": "0",
|
||||||
"praticien_conseil": "DR VIGNAU",
|
"praticien_conseil": "DR VIGNAU",
|
||||||
"accord_desaccord": "accord",
|
"accord_desaccord": "accord",
|
||||||
"_checkbox_debug": {
|
"_checkbox_debug": {
|
||||||
|
|||||||
@@ -371,7 +371,7 @@
|
|||||||
"ghm_reco": "04M093",
|
"ghm_reco": "04M093",
|
||||||
"ghs_reco": "1163",
|
"ghs_reco": "1163",
|
||||||
"recodage_impactant": "1",
|
"recodage_impactant": "1",
|
||||||
"ghs_injustifie": "0 SE 1 2 3 4 ATU FFM FSD",
|
"ghs_injustifie": "0",
|
||||||
"praticien_conseil": "DR VIGNAU",
|
"praticien_conseil": "DR VIGNAU",
|
||||||
"accord_desaccord": "désaccord",
|
"accord_desaccord": "désaccord",
|
||||||
"_checkbox_debug": {
|
"_checkbox_debug": {
|
||||||
|
|||||||
@@ -298,7 +298,7 @@
|
|||||||
"ghm_reco": "1947",
|
"ghm_reco": "1947",
|
||||||
"ghs_reco": "06C071",
|
"ghs_reco": "06C071",
|
||||||
"recodage_impactant": "1",
|
"recodage_impactant": "1",
|
||||||
"ghs_injustifie": "SE 1 2 3 4 ATU FFM FSD",
|
"ghs_injustifie": "",
|
||||||
"praticien_conseil": "DR VIGNAU",
|
"praticien_conseil": "DR VIGNAU",
|
||||||
"accord_desaccord": "accord",
|
"accord_desaccord": "accord",
|
||||||
"_checkbox_debug": {
|
"_checkbox_debug": {
|
||||||
|
|||||||
@@ -324,7 +324,7 @@
|
|||||||
"ghm_reco": "23Z02Z",
|
"ghm_reco": "23Z02Z",
|
||||||
"ghs_reco": "7992",
|
"ghs_reco": "7992",
|
||||||
"recodage_impactant": "1",
|
"recodage_impactant": "1",
|
||||||
"ghs_injustifie": "0 SE 1 2 3 4 ATU FFM FSD",
|
"ghs_injustifie": "0",
|
||||||
"praticien_conseil": "DR VIGNAU",
|
"praticien_conseil": "DR VIGNAU",
|
||||||
"accord_desaccord": "accord",
|
"accord_desaccord": "accord",
|
||||||
"_checkbox_debug": {
|
"_checkbox_debug": {
|
||||||
|
|||||||
@@ -298,7 +298,7 @@
|
|||||||
"ghm_reco": "04M092",
|
"ghm_reco": "04M092",
|
||||||
"ghs_reco": "1162",
|
"ghs_reco": "1162",
|
||||||
"recodage_impactant": "1",
|
"recodage_impactant": "1",
|
||||||
"ghs_injustifie": "0 SE 1 2 3 4 ATU FFM FSD",
|
"ghs_injustifie": "0",
|
||||||
"praticien_conseil": "DR VIGNAU",
|
"praticien_conseil": "DR VIGNAU",
|
||||||
"accord_desaccord": "désaccord",
|
"accord_desaccord": "désaccord",
|
||||||
"_checkbox_debug": {
|
"_checkbox_debug": {
|
||||||
|
|||||||
@@ -201,7 +201,7 @@
|
|||||||
"ghm_reco": "23Z02Z",
|
"ghm_reco": "23Z02Z",
|
||||||
"ghs_reco": "7992",
|
"ghs_reco": "7992",
|
||||||
"recodage_impactant": "1",
|
"recodage_impactant": "1",
|
||||||
"ghs_injustifie": "SE 1 2 3 4 ATU FFM FSD",
|
"ghs_injustifie": "",
|
||||||
"praticien_conseil": "DR VIGNAU",
|
"praticien_conseil": "DR VIGNAU",
|
||||||
"accord_desaccord": "accord",
|
"accord_desaccord": "accord",
|
||||||
"_checkbox_debug": {
|
"_checkbox_debug": {
|
||||||
|
|||||||
@@ -39,6 +39,14 @@ CONCERTATION_2_DECISION = CheckboxZones(
|
|||||||
desaccord= (0.280, 0.270, 0.305, 0.290), # retour groupage DIM
|
desaccord= (0.280, 0.270, 0.305, 0.290), # retour groupage DIM
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Zones for the seven SE 1 / 2 / 3 / 4 / ATU / FFM / FSD checkboxes (bottom of
# the recueil page).
# TODO: recalibrate on real positive cases — none of the 18 dossiers in the
# 2018 sample has a ticked box (`ghs_injustifie = 0` everywhere), so the
# detection cannot be validated visually. Left disabled: the mapping is an
# empty placeholder until a positive case is observed.
GHS_INJUSTIFIE_CHECKBOXES: dict[str, tuple[float, float, float, float]] = {}
|
||||||
|
|
||||||
|
|
||||||
def dark_ratio(image: Image.Image, zone: tuple[float, float, float, float],
|
def dark_ratio(image: Image.Image, zone: tuple[float, float, float, float],
|
||||||
inner_frac: float = INNER_FRAC) -> float:
|
inner_frac: float = INNER_FRAC) -> float:
|
||||||
@@ -58,6 +66,31 @@ def dark_ratio(image: Image.Image, zone: tuple[float, float, float, float],
|
|||||||
return float(np.mean(gray < DARK_THRESHOLD))
|
return float(np.mean(gray < DARK_THRESHOLD))
|
||||||
|
|
||||||
|
|
||||||
|
def parse_ghs_injustifie(raw: str) -> str:
|
||||||
|
"""Extrait la valeur 0/1 du champ ghs_injustifie depuis la sortie OCR brute.
|
||||||
|
|
||||||
|
Qwen tend à recopier le libellé complet `0 SE 1 2 3 4 ATU FFM FSD` au lieu
|
||||||
|
du seul chiffre. On prend le premier caractère qui est 0 ou 1 et on ignore
|
||||||
|
le reste (les chiffres 1/2/3/4 qui suivent « SE » sont des numéros de case,
|
||||||
|
pas la valeur du flag).
|
||||||
|
"""
|
||||||
|
if raw is None:
|
||||||
|
return ""
|
||||||
|
s = str(raw).strip()
|
||||||
|
if not s:
|
||||||
|
return ""
|
||||||
|
# Si déjà propre (juste "0" ou "1"), retour direct
|
||||||
|
if s in ("0", "1"):
|
||||||
|
return s
|
||||||
|
# Prendre le premier chiffre trouvé qui soit 0 ou 1, en ignorant tout
|
||||||
|
# le reste (en particulier les "SE 1 2 3 4…" qui suivent)
|
||||||
|
import re as _re
|
||||||
|
m = _re.match(r"\s*([01])\b", s)
|
||||||
|
if m:
|
||||||
|
return m.group(1)
|
||||||
|
return "" # illisible / format inattendu
|
||||||
|
|
||||||
|
|
||||||
def detect_accord_desaccord(
|
def detect_accord_desaccord(
|
||||||
image_path: str | Path,
|
image_path: str | Path,
|
||||||
zones: CheckboxZones = RECUEIL_ACCORD_DESACCORD,
|
zones: CheckboxZones = RECUEIL_ACCORD_DESACCORD,
|
||||||
|
|||||||
@@ -3,11 +3,15 @@ import json
|
|||||||
import re
|
import re
|
||||||
import time
|
import time
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
from PIL import Image
|
||||||
from .ingest import pdf_to_images
|
from .ingest import pdf_to_images
|
||||||
from .classify import detect_page_type, route_by_index
|
from .classify import detect_page_type, route_by_index
|
||||||
from .ocr_qwen import QwenVLOCR
|
from .ocr_qwen import QwenVLOCR
|
||||||
from .prompts import PAGE_TYPES, PROMPT_HEADER
|
from .prompts import (
|
||||||
from .checkboxes import detect_accord_desaccord, RECUEIL_ACCORD_DESACCORD
|
PAGE_TYPES, PROMPT_HEADER,
|
||||||
|
SCHEMA_RECUEIL_RECODAGE, RECUEIL_RECODAGE_ZONE,
|
||||||
|
)
|
||||||
|
from .checkboxes import detect_accord_desaccord, RECUEIL_ACCORD_DESACCORD, parse_ghs_injustifie
|
||||||
from .validation import annotate as validate_annotate
|
from .validation import annotate as validate_annotate
|
||||||
|
|
||||||
|
|
||||||
@@ -104,6 +108,96 @@ def parse_json_output(raw: str) -> dict | None:
|
|||||||
return {"_raw": raw, "_parse_error": str(e)}
|
return {"_raw": raw, "_parse_error": str(e)}
|
||||||
|
|
||||||
|
|
||||||
|
def _classify_recodage_codes(codes: list[dict]) -> tuple[str, str, list[dict]]:
    """Split cleaned ``{"code", "position"}`` entries into ``(dp, dr, das)``."""
    dp = dr = ""
    das: list[dict] = []
    dp_assigned = dr_assigned = False
    for entry in codes:
        code, position = entry["code"], entry["position"]
        if position:
            # Any code carrying a position is a DAS.
            das.append(entry)
        elif not dp_assigned:
            dp, dp_assigned = code, True
        elif not dr_assigned:
            # The model tends to repeat the DP when DR is empty: a duplicate
            # consumes the DR slot but leaves DR blank.
            dr_assigned = True
            if code != dp:
                dr = code
        else:
            # Further position-less codes fall back to DAS.
            das.append({"code": code, "position": ""})
    return dp, dr, das


def _extract_recodage_crop(image_path: Path, ocr: QwenVLOCR) -> dict | None:
    """Run a second VLM pass on the cropped "Recodage" column of a page.

    The page image is cropped to ``RECUEIL_RECODAGE_ZONE`` (fractions of the
    image width/height), the crop is saved next to the page as
    ``<stem>_recodage.png`` and sent to ``ocr.run`` with
    ``SCHEMA_RECUEIL_RECODAGE``. The model answers with a flat list of codes
    (each optionally with a position) which is classified here by rule:

    - 1st code without position -> DP
    - 2nd code without position -> DR (left empty if it duplicates the DP)
    - codes with a position, and any further position-less code -> DAS

    Args:
        image_path: Path of the rendered page image.
        ocr: OCR engine; ``ocr.run`` must return a mapping with ``"text"``
            and ``"elapsed_s"``.

    Returns:
        A dict with ``dp``, ``dr``, ``das`` plus debug fields (``_source``,
        ``_elapsed_s``, ``_n_codes_raw``, ``_n_codes_kept``), or ``None`` when
        the crop cannot be produced or the model output cannot be parsed.
    """
    try:
        # Context manager so the underlying file handle is released as soon
        # as the crop has been written.
        with Image.open(image_path) as img:
            w, h = img.size
            x1, y1, x2, y2 = RECUEIL_RECODAGE_ZONE
            crop = img.crop((int(x1 * w), int(y1 * h), int(x2 * w), int(y2 * h)))
            crop_path = image_path.parent / f"{image_path.stem}_recodage.png"
            crop.save(crop_path)
    except Exception:
        # Deliberate best-effort: the second pass is optional, so any failure
        # while cropping just disables it for this page.
        return None

    res = ocr.run(crop_path, SCHEMA_RECUEIL_RECODAGE, max_new_tokens=1024)
    parsed = parse_json_output(res["text"])
    if not isinstance(parsed, dict) or "_parse_error" in parsed:
        return None

    # Keep only CIM-10 shaped codes. If the crop still overlaps the "Actes"
    # area, CCAM codes (4 letters + 3 digits) are rejected here.
    cim10_re = re.compile(r"^[A-Z]\d{2,4}\s*\*?\s*\+?\d*$")
    codes_raw = parsed.get("codes") or []
    if not isinstance(codes_raw, list):
        # Malformed payload (scalar, dict, ...): treat as "no codes".
        codes_raw = []

    codes = []
    for item in codes_raw:
        if not isinstance(item, dict):
            continue
        # Coerce to str like `position`, so a numeric value cannot raise.
        code = str(item.get("code") or "").strip()
        if code and cim10_re.match(code):
            codes.append({
                "code": code,
                "position": str(item.get("position") or "").strip(),
            })

    dp, dr, das = _classify_recodage_codes(codes)
    return {
        "dp": dp, "dr": dr, "das": das,
        "_source": "crop_recodage",
        "_elapsed_s": round(res["elapsed_s"], 2),
        "_n_codes_raw": len(codes_raw),
        "_n_codes_kept": len(codes),
    }
|
||||||
|
|
||||||
|
|
||||||
|
def _merge_codage_reco(parsed: dict, reco: dict) -> None:
|
||||||
|
"""Fusionne le résultat du crop Recodage dans parsed["codage_reco"].
|
||||||
|
|
||||||
|
Politique : le crop est plus fiable (contexte isolé), il prime sur le
|
||||||
|
passage principal SAUF si le crop laisse vide un champ que le principal
|
||||||
|
avait bien lu.
|
||||||
|
"""
|
||||||
|
existing = parsed.get("codage_reco") if isinstance(parsed.get("codage_reco"), dict) else {}
|
||||||
|
merged = {
|
||||||
|
"dp": reco.get("dp", "") or existing.get("dp", ""),
|
||||||
|
"dr": reco.get("dr", "") or existing.get("dr", ""),
|
||||||
|
"das": reco.get("das") or existing.get("das") or [],
|
||||||
|
}
|
||||||
|
parsed["codage_reco"] = merged
|
||||||
|
parsed.setdefault("_crop_recodage", {})["result"] = reco
|
||||||
|
|
||||||
|
|
||||||
def extract_dossier(pdf_path: str | Path, verbose: bool = True,
|
def extract_dossier(pdf_path: str | Path, verbose: bool = True,
|
||||||
use_standard_routing: bool = True) -> dict:
|
use_standard_routing: bool = True) -> dict:
|
||||||
"""Pipeline complet d'un dossier : PDF → JSON structuré.
|
"""Pipeline complet d'un dossier : PDF → JSON structuré.
|
||||||
@@ -169,12 +263,23 @@ def extract_dossier(pdf_path: str | Path, verbose: bool = True,
|
|||||||
page_info["parsed"] = parsed
|
page_info["parsed"] = parsed
|
||||||
page_info["elapsed_s"] = round(res["elapsed_s"], 2)
|
page_info["elapsed_s"] = round(res["elapsed_s"], 2)
|
||||||
|
|
||||||
# Enrichissement : checkboxes accord/désaccord sur la fiche recueil
|
# Enrichissement : checkboxes + normalisation champs booléens
|
||||||
# (GLM-OCR ne sait pas lire les checkboxes — voir test_prompt_crop_v2.py)
|
# sur la fiche recueil. GLM-OCR / Qwen ne lisent pas les cases
|
||||||
|
# à cocher (cf. scratch/test_prompt_crop_v2.py).
|
||||||
if ptype == "recueil" and isinstance(parsed, dict):
|
if ptype == "recueil" and isinstance(parsed, dict):
|
||||||
cb = detect_accord_desaccord(img_path, RECUEIL_ACCORD_DESACCORD)
|
cb = detect_accord_desaccord(img_path, RECUEIL_ACCORD_DESACCORD)
|
||||||
parsed["accord_desaccord"] = cb["decision"]
|
parsed["accord_desaccord"] = cb["decision"]
|
||||||
parsed["_checkbox_debug"] = cb # ratios + diff pour audit
|
parsed["_checkbox_debug"] = cb # ratios + diff pour audit
|
||||||
|
# ghs_injustifie : Qwen renvoie parfois "0 SE 1 2 3 4 ATU FFM FSD"
|
||||||
|
# → ne garder que le chiffre 0/1 de tête
|
||||||
|
parsed["ghs_injustifie"] = parse_ghs_injustifie(parsed.get("ghs_injustifie", ""))
|
||||||
|
|
||||||
|
# Second passage : crop de la colonne Recodage pour compenser
|
||||||
|
# la sous-extraction observée sur codage_reco.* en passage principal.
|
||||||
|
reco = _extract_recodage_crop(img_path, ocr)
|
||||||
|
if reco:
|
||||||
|
_merge_codage_reco(parsed, reco)
|
||||||
|
|
||||||
page_info["parsed"] = parsed
|
page_info["parsed"] = parsed
|
||||||
|
|
||||||
# Indexer par type pour accès direct dans result["extraction"]
|
# Indexer par type pour accès direct dans result["extraction"]
|
||||||
|
|||||||
Reference in New Issue
Block a user