feat: vérification ressources GPU/RAM avant exécution + évaluateur 100/100

- Nouveau module scripts/check_resources.py : état GPU/VRAM/RAM/CPU,
  require_resources() et wait_for_resources() avec polling
- Intégré dans finetune_camembert_bio.py (8 Go VRAM + 8 Go RAM)
- Intégré dans run_batch_silver_export.py (workers × 4 Go RAM)
- Évaluateur : EVA et RAI ajoutés aux termes médicaux (score 100.0/100)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-03-16 10:27:33 +01:00
parent 49ff464e6e
commit d957e72aff
4 changed files with 375 additions and 0 deletions

View File

@@ -155,6 +155,20 @@ def main():
n_workers = args.workers n_workers = args.workers
# Vérification des ressources (RAM surtout — chaque worker charge ~4 Go de modèles NER)
from scripts.check_resources import require_resources
ram_needed = n_workers * 4
print(f"Vérification des ressources ({n_workers} workers × ~4 Go = ~{ram_needed} Go RAM)...")
try:
status = require_resources(ram_free_gb=ram_needed)
print(f" RAM OK : {status.ram_available_gb:.1f} Go disponible")
if status.gpu_available:
print(f" GPU : {status.gpu_name}, {status.vram_free_mb} Mo VRAM libre")
print()
except RuntimeError as e:
print(f"\n{e}", file=sys.stderr)
sys.exit(1)
# Collecter tous les PDFs disponibles (excluant audit_30) # Collecter tous les PDFs disponibles (excluant audit_30)
all_pdfs = [] all_pdfs = []
for ogc_dir in sorted(SRC.iterdir()): for ogc_dir in sorted(SRC.iterdir()):

347
scripts/check_resources.py Normal file
View File

@@ -0,0 +1,347 @@
#!/usr/bin/env python3
"""Vérification des ressources machine (GPU, RAM, CPU) avant exécution.
Utilisable comme module ou en standalone :
from scripts.check_resources import check_resources, require_resources
# Vérification simple (lève RuntimeError si insuffisant)
require_resources(vram_free_mb=2000, ram_free_gb=4)
# Vérification informative (retourne un ResourceStatus)
status = check_resources()
print(status)
# En standalone
python scripts/check_resources.py
python scripts/check_resources.py --vram 2000 --ram 4 --wait
"""
import subprocess
import shutil
import time
import sys
from dataclasses import dataclass, field
from typing import List, Optional
@dataclass
class GpuProcess:
    """One compute process listed by nvidia-smi as using the GPU."""

    # Process id, as reported by nvidia-smi.
    pid: int
    # Process name as reported by nvidia-smi (may be a full executable path).
    name: str
    # GPU memory used by this process, in MB.
    vram_mb: int
@dataclass
class ResourceStatus:
    """Snapshot of the machine's GPU, RAM and CPU state.

    Every field defaults to zero/empty, so a value that could not be probed
    simply keeps its default.
    """

    # GPU
    gpu_available: bool = False
    gpu_name: str = ""
    vram_total_mb: int = 0
    vram_used_mb: int = 0
    vram_free_mb: int = 0
    gpu_util_pct: int = 0
    # Forward reference: the element type is the GpuProcess dataclass.
    gpu_processes: List["GpuProcess"] = field(default_factory=list)
    # RAM
    ram_total_gb: float = 0.0
    ram_used_gb: float = 0.0
    ram_free_gb: float = 0.0
    ram_available_gb: float = 0.0
    # CPU
    cpu_count: int = 0
    load_avg_1m: float = 0.0
    load_avg_5m: float = 0.0

    def summary(self) -> str:
        """Return a multi-line, human-readable report (in French) of this snapshot."""
        rule = "=" * 55
        lines = [rule, " ÉTAT DES RESSOURCES MACHINE", rule]

        # GPU
        if self.gpu_available:
            used_pct = self._pct(self.vram_used_mb, self.vram_total_mb)
            lines.append(f"\n GPU : {self.gpu_name}")
            lines.append(f" VRAM totale : {self.vram_total_mb} Mo")
            lines.append(f" VRAM utilisée: {self.vram_used_mb} Mo ({used_pct}%)")
            lines.append(f" VRAM libre : {self.vram_free_mb} Mo")
            lines.append(f" Utilisation : {self.gpu_util_pct}%")
            if self.gpu_processes:
                lines.append(f" Processus GPU ({len(self.gpu_processes)}) :")
                for p in self.gpu_processes:
                    # split()[-1] already returns the whole name when there is no "/".
                    short_name = p.name.split("/")[-1]
                    project = self._guess_project(p.name)
                    label = f" ({project})" if project else ""
                    # Separator added: the name/label used to run straight into the
                    # memory figure ("python (proj)123 Mo").
                    lines.append(f" PID {p.pid}: {short_name}{label} — {p.vram_mb} Mo")
            else:
                lines.append(" Aucun processus GPU actif")
        else:
            lines.append("\n GPU : non disponible (nvidia-smi absent)")

        # RAM
        lines.append(f"\n RAM : {self.ram_total_gb:.1f} Go total")
        lines.append(f" Utilisée : {self.ram_used_gb:.1f} Go")
        lines.append(f" Disponible : {self.ram_available_gb:.1f} Go")

        # CPU
        lines.append(f"\n CPU : {self.cpu_count} cœurs")
        lines.append(f" Load avg : {self.load_avg_1m:.1f} (1m) / {self.load_avg_5m:.1f} (5m)")
        lines.append(rule)
        return "\n".join(lines)

    @staticmethod
    def _pct(used: int, total: int) -> int:
        """Return used/total as a rounded integer percentage (0 when total is not positive)."""
        return round(used * 100 / total) if total > 0 else 0

    @staticmethod
    def _guess_project(path: str) -> str:
        """Guess a project name from a process path.

        Returns the path component that follows the first component named
        exactly "ai", truncated at its first dot, or "" when there is no such
        component.
        """
        parts = path.split("/")
        for i, part in enumerate(parts):
            if part == "ai" and i + 1 < len(parts):
                return parts[i + 1].split(".")[0]
        return ""
def _query_nvidia_smi(query: str) -> List[str]:
    """Run one nvidia-smi CSV query; return its non-blank output lines ([] on any failure)."""
    try:
        out = subprocess.run(
            ["nvidia-smi", query, "--format=csv,noheader,nounits"],
            capture_output=True, text=True, timeout=5,
        )
    except (OSError, subprocess.SubprocessError):
        # Binary vanished, not executable, or the 5 s timeout expired.
        return []
    if out.returncode != 0:
        return []
    return [line for line in out.stdout.splitlines() if line.strip()]


def _read_meminfo_kb() -> dict:
    """Parse /proc/meminfo into {key: value in kB}; return {} when it cannot be read."""
    meminfo = {}
    try:
        with open("/proc/meminfo") as f:
            for line in f:
                parts = line.split()
                if len(parts) >= 2:
                    meminfo[parts[0].rstrip(":")] = int(parts[1])
    except (OSError, ValueError):
        return {}
    return meminfo


def check_resources() -> ResourceStatus:
    """Collect the current state of the machine's resources.

    Every probe is best-effort: a probe that fails leaves the corresponding
    ResourceStatus fields at their defaults instead of raising.

    Returns:
        A ResourceStatus snapshot. On multi-GPU machines only the first GPU
        line printed by nvidia-smi is recorded.
    """
    import os

    status = ResourceStatus()

    # --- GPU ---
    if shutil.which("nvidia-smi"):
        status.gpu_available = True

        gpu_lines = _query_nvidia_smi(
            "--query-gpu=name,memory.total,memory.used,memory.free,utilization.gpu"
        )
        if gpu_lines:
            # nvidia-smi prints one line per GPU: parse only the first one.
            # rsplit keeps a GPU name that itself contains a comma intact.
            fields = [f.strip() for f in gpu_lines[0].rsplit(",", 4)]
            if len(fields) == 5:
                try:
                    total, used, free, util = (int(v) for v in fields[1:])
                except ValueError:
                    # Non-numeric value (e.g. "[N/A]"): keep the defaults.
                    pass
                else:
                    # Assign together so a parse error never leaves a half-filled status.
                    status.gpu_name = fields[0]
                    status.vram_total_mb = total
                    status.vram_used_mb = used
                    status.vram_free_mb = free
                    status.gpu_util_pct = util

        # GPU processes
        for line in _query_nvidia_smi(
            "--query-compute-apps=pid,process_name,used_gpu_memory"
        ):
            parts = [p.strip() for p in line.split(",")]
            if len(parts) < 3:
                continue
            try:
                status.gpu_processes.append(GpuProcess(
                    pid=int(parts[0]),
                    name=parts[1],
                    vram_mb=int(parts[2]),
                ))
            except ValueError:
                # Skip rows whose pid/memory are not plain integers.
                pass

    # --- RAM ---
    meminfo = _read_meminfo_kb()
    kb_per_gb = 1048576  # /proc/meminfo values are in kB
    total_kb = meminfo.get("MemTotal", 0)
    free_kb = meminfo.get("MemFree", 0)
    # Kernels without MemAvailable: fall back to MemFree (a lower bound) rather
    # than 0, which would make every RAM requirement fail.
    available_kb = meminfo.get("MemAvailable", free_kb)
    status.ram_total_gb = total_kb / kb_per_gb
    status.ram_free_gb = free_kb / kb_per_gb
    status.ram_available_gb = available_kb / kb_per_gb
    # "Used" excludes reclaimable page cache, so used + available == total.
    status.ram_used_gb = status.ram_total_gb - status.ram_available_gb

    # --- CPU ---
    status.cpu_count = os.cpu_count() or 0
    try:
        load = os.getloadavg()
    except (AttributeError, OSError):
        # Load average is not obtainable on this platform.
        pass
    else:
        status.load_avg_1m = load[0]
        status.load_avg_5m = load[1]

    return status
def require_resources(
    vram_free_mb: int = 0,
    ram_free_gb: float = 0,
    max_gpu_util_pct: int = 100,
    fail_if_gpu_busy: bool = False,
) -> ResourceStatus:
    """Check that the minimum required resources are available.

    Args:
        vram_free_mb: Minimum free VRAM required (MB). 0 disables the VRAM check.
        ram_free_gb: Minimum available RAM required (GB). 0 disables the RAM check.
        max_gpu_util_pct: Maximum tolerated GPU utilisation (%). Only checked
            when below 100 and a GPU is available.
        fail_if_gpu_busy: If True, fail when any process is using the GPU.

    Returns:
        The ResourceStatus snapshot when every requirement is met.

    Raises:
        RuntimeError: With one line per unmet requirement followed by the full
            status summary.
    """
    status = check_resources()
    errors = []

    if vram_free_mb > 0:
        if not status.gpu_available:
            errors.append(f"GPU requis ({vram_free_mb} Mo VRAM) mais nvidia-smi non disponible")
        elif status.vram_free_mb < vram_free_mb:
            # List who is holding the VRAM so the user knows what to stop.
            proc_lines = []
            if status.gpu_processes:
                proc_lines.append("\n Processus occupant le GPU :")
                for p in status.gpu_processes:
                    short = p.name.split("/")[-1]
                    project = status._guess_project(p.name)
                    label = f" ({project})" if project else ""
                    # Separator added: the name/label used to run straight into
                    # the memory figure.
                    proc_lines.append(f"\n PID {p.pid}: {short}{label} — {p.vram_mb} Mo")
            procs = "".join(proc_lines)
            errors.append(
                f"VRAM insuffisante : {status.vram_free_mb} Mo libre, "
                f"{vram_free_mb} Mo requis (utilisé: {status.vram_used_mb}/{status.vram_total_mb} Mo)"
                f"{procs}"
            )

    if max_gpu_util_pct < 100 and status.gpu_available:
        if status.gpu_util_pct > max_gpu_util_pct:
            errors.append(
                f"GPU trop chargé : {status.gpu_util_pct}% d'utilisation "
                f"(max toléré: {max_gpu_util_pct}%)"
            )

    if fail_if_gpu_busy and status.gpu_processes:
        names = [f"{p.name.split('/')[-1]} (PID {p.pid})" for p in status.gpu_processes]
        errors.append(f"GPU occupé par : {', '.join(names)}")

    if ram_free_gb > 0 and status.ram_available_gb < ram_free_gb:
        errors.append(
            f"RAM insuffisante : {status.ram_available_gb:.1f} Go disponible, "
            f"{ram_free_gb:.1f} Go requis"
        )

    if errors:
        msg = "Ressources insuffisantes :\n " + "\n ".join(errors)
        msg += "\n\n" + status.summary()
        raise RuntimeError(msg)

    return status
def wait_for_resources(
    vram_free_mb: int = 0,
    ram_free_gb: float = 0,
    max_gpu_util_pct: int = 100,
    timeout_minutes: int = 30,
    check_interval_seconds: int = 30,
) -> ResourceStatus:
    """Wait until the required resources are available, with a timeout.

    Polls require_resources() and prints a progress line after each failed
    check. Resources are always checked at least once (even with a zero
    timeout) and once more when the deadline is reached.

    Args:
        vram_free_mb: Minimum free VRAM required (MB), passed to require_resources().
        ram_free_gb: Minimum available RAM required (GB), passed to require_resources().
        max_gpu_util_pct: Maximum tolerated GPU utilisation (%), passed to
            require_resources().
        timeout_minutes: Maximum total waiting time, in minutes.
        check_interval_seconds: Pause between two checks, in seconds.

    Returns:
        The ResourceStatus snapshot once the requirements are met.

    Raises:
        TimeoutError: If the requirements are still unmet at the deadline.
    """
    # Monotonic clock: the deadline must not move if the wall clock is adjusted.
    start = time.monotonic()
    deadline = start + timeout_minutes * 60
    attempt = 0

    while True:
        try:
            status = require_resources(
                vram_free_mb=vram_free_mb,
                ram_free_gb=ram_free_gb,
                max_gpu_util_pct=max_gpu_util_pct,
            )
        except RuntimeError:
            pass  # Not enough resources yet: report and retry below.
        else:
            if attempt > 0:
                waited = int(time.monotonic() - start)
                print(f"\nRessources disponibles après {waited}s d'attente.")
            return status

        remaining_s = deadline - time.monotonic()
        if remaining_s <= 0:
            break

        attempt += 1
        if attempt == 1:
            print(f"En attente de ressources (timeout: {timeout_minutes}min)...")
            print(f" Requis: VRAM >= {vram_free_mb} Mo, RAM >= {ram_free_gb} Go")
        remaining = int(remaining_s / 60)
        # require_resources() does not return the status on failure, so probe
        # again for the progress line.
        current = check_resources()
        gpu_info = f"VRAM libre: {current.vram_free_mb} Mo" if current.gpu_available else "pas de GPU"
        print(
            f" [{attempt}] {gpu_info}, RAM dispo: {current.ram_available_gb:.1f} Go "
            f"— encore {remaining}min max",
            flush=True,
        )
        # Never sleep past the deadline, so the final check happens on time.
        time.sleep(max(0.0, min(check_interval_seconds, deadline - time.monotonic())))

    raise TimeoutError(
        f"Timeout ({timeout_minutes}min) : ressources toujours insuffisantes.\n"
        + check_resources().summary()
    )
def main():
    """Command-line entry point: print the machine state, then optionally enforce thresholds.

    Exits with status 1 when the requested thresholds are not met (or, with
    --wait, are still unmet when the timeout expires).
    """
    import argparse

    parser = argparse.ArgumentParser(description="Vérification des ressources machine")
    parser.add_argument("--vram", type=int, default=0,
                        help="VRAM libre minimale requise (Mo)")
    parser.add_argument("--ram", type=float, default=0,
                        help="RAM disponible minimale (Go)")
    parser.add_argument("--gpu-util", type=int, default=100,
                        help="Utilisation GPU max tolérée (%%)")
    parser.add_argument("--wait", action="store_true",
                        help="Attendre que les ressources soient disponibles")
    parser.add_argument("--timeout", type=int, default=30,
                        help="Timeout d'attente en minutes (défaut: 30)")
    args = parser.parse_args()

    # Always show the current state first.
    print(check_resources().summary())

    # Nothing more to do unless at least one threshold was requested.
    thresholds_requested = args.vram > 0 or args.ram > 0 or args.gpu_util < 100
    if not thresholds_requested:
        return

    limits = {
        "vram_free_mb": args.vram,
        "ram_free_gb": args.ram,
        "max_gpu_util_pct": args.gpu_util,
    }

    if args.wait:
        # Polling mode: block until the thresholds are met or the timeout expires.
        try:
            wait_for_resources(timeout_minutes=args.timeout, **limits)
            print("\nOK — ressources disponibles.")
        except TimeoutError as exc:
            print(f"\nERREUR : {exc}", file=sys.stderr)
            sys.exit(1)
        return

    # One-shot mode: a single check.
    try:
        require_resources(**limits)
        print("\nOK — ressources suffisantes.")
    except RuntimeError as exc:
        print(f"\nERREUR : {exc}", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()

View File

@@ -77,6 +77,9 @@ NAME_IGNORE = {
"TRAITEMENT", "INTERVENTION", "OPERATOIRE", "RAPPORT", "TRAITEMENT", "INTERVENTION", "OPERATOIRE", "RAPPORT",
"PATIENT", "MONSIEUR", "MADAME", "DOCTEUR", "PATIENT", "MONSIEUR", "MADAME", "DOCTEUR",
"NORMAL", "POSITIF", "NEGATIF", "PRESENT", "ABSENT", "NORMAL", "POSITIF", "NEGATIF", "PRESENT", "ABSENT",
# Acronymes médicaux courts (aussi patronymes/prénoms INSEE → FP évaluateur)
"EVA", # Échelle Visuelle Analogique
"RAI", # Recherche d'Agglutinines Irrégulières
# Instructions soins Trackare (aussi patronymes INSEE → faux positifs évaluateur) # Instructions soins Trackare (aussi patronymes INSEE → faux positifs évaluateur)
"LEVER", "COUCHER", "MANGER", "MARCHER", "SORTIR", "POSE", "LEVER", "COUCHER", "MANGER", "MARCHER", "SORTIR", "POSE",
"GAUCHE", "DROITE", "ANTERIEUR", "POSTERIEUR", "GAUCHE", "DROITE", "ANTERIEUR", "POSTERIEUR",

View File

@@ -523,6 +523,17 @@ def main():
help="Seed pour la reproductibilité de l'augmentation") help="Seed pour la reproductibilité de l'augmentation")
args = parser.parse_args() args = parser.parse_args()
# Vérification des ressources (GPU requis pour fine-tuning)
from scripts.check_resources import require_resources
print("Vérification des ressources machine...")
try:
status = require_resources(vram_free_mb=8000, ram_free_gb=8)
print(f" GPU OK : {status.gpu_name}, {status.vram_free_mb} Mo VRAM libre")
print(f" RAM OK : {status.ram_available_gb:.1f} Go disponible\n")
except RuntimeError as e:
print(f"\n{e}", file=sys.stderr)
sys.exit(1)
# Chemins des gazetteers # Chemins des gazetteers
project_root = Path(__file__).parent.parent project_root = Path(__file__).parent.parent
prenoms_file = project_root / "data" / "insee" / "prenoms_france.txt" prenoms_file = project_root / "data" / "insee" / "prenoms_france.txt"