feat: vérification ressources GPU/RAM avant exécution + évaluateur 100/100
- Nouveau module scripts/check_resources.py : état GPU/VRAM/RAM/CPU, require_resources() et wait_for_resources() avec polling
- Intégré dans finetune_camembert_bio.py (8 Go VRAM + 8 Go RAM)
- Intégré dans run_batch_silver_export.py (workers × 4 Go RAM)
- Évaluateur : EVA et RAI ajoutés aux termes médicaux (score 100.0/100)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -155,6 +155,20 @@ def main():
|
|||||||
|
|
||||||
n_workers = args.workers
|
n_workers = args.workers
|
||||||
|
|
||||||
|
# Vérification des ressources (RAM surtout — chaque worker charge ~4 Go de modèles NER)
|
||||||
|
from scripts.check_resources import require_resources
|
||||||
|
ram_needed = n_workers * 4
|
||||||
|
print(f"Vérification des ressources ({n_workers} workers × ~4 Go = ~{ram_needed} Go RAM)...")
|
||||||
|
try:
|
||||||
|
status = require_resources(ram_free_gb=ram_needed)
|
||||||
|
print(f" RAM OK : {status.ram_available_gb:.1f} Go disponible")
|
||||||
|
if status.gpu_available:
|
||||||
|
print(f" GPU : {status.gpu_name}, {status.vram_free_mb} Mo VRAM libre")
|
||||||
|
print()
|
||||||
|
except RuntimeError as e:
|
||||||
|
print(f"\n{e}", file=sys.stderr)
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
# Collecter tous les PDFs disponibles (excluant audit_30)
|
# Collecter tous les PDFs disponibles (excluant audit_30)
|
||||||
all_pdfs = []
|
all_pdfs = []
|
||||||
for ogc_dir in sorted(SRC.iterdir()):
|
for ogc_dir in sorted(SRC.iterdir()):
|
||||||
|
|||||||
347
scripts/check_resources.py
Normal file
347
scripts/check_resources.py
Normal file
@@ -0,0 +1,347 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Vérification des ressources machine (GPU, RAM, CPU) avant exécution.
|
||||||
|
|
||||||
|
Utilisable comme module ou en standalone :
|
||||||
|
from scripts.check_resources import check_resources, require_resources
|
||||||
|
|
||||||
|
# Vérification simple (lève RuntimeError si insuffisant)
|
||||||
|
require_resources(vram_free_mb=2000, ram_free_gb=4)
|
||||||
|
|
||||||
|
# Vérification informative (retourne un dict)
|
||||||
|
status = check_resources()
|
||||||
|
print(status)
|
||||||
|
|
||||||
|
# En standalone
|
||||||
|
python scripts/check_resources.py
|
||||||
|
python scripts/check_resources.py --vram 2000 --ram 4 --wait
|
||||||
|
"""
|
||||||
|
import subprocess
|
||||||
|
import shutil
|
||||||
|
import time
|
||||||
|
import sys
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from typing import List, Optional
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class GpuProcess:
    """One process currently holding GPU memory, as reported by nvidia-smi."""

    # Process id (nvidia-smi --query-compute-apps=pid).
    pid: int
    # Full process name/path as reported by nvidia-smi (may contain "/").
    name: str
    # VRAM used by this process, in megabytes.
    vram_mb: int
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class ResourceStatus:
    """Snapshot of machine resources (GPU/VRAM, RAM, CPU) at one instant.

    All fields default to "nothing available"; probes that fail leave their
    fields untouched, so a zeroed field means "unknown or absent".
    """

    # GPU
    gpu_available: bool = False
    gpu_name: str = ""
    vram_total_mb: int = 0
    vram_used_mb: int = 0
    vram_free_mb: int = 0
    gpu_util_pct: int = 0
    gpu_processes: List[GpuProcess] = field(default_factory=list)
    # RAM
    ram_total_gb: float = 0.0
    ram_used_gb: float = 0.0
    ram_free_gb: float = 0.0
    ram_available_gb: float = 0.0
    # CPU
    cpu_count: int = 0
    load_avg_1m: float = 0.0
    load_avg_5m: float = 0.0

    def summary(self) -> str:
        """Render a human-readable, multi-line report of this snapshot."""
        sep = "=" * 55
        out = [sep, " ÉTAT DES RESSOURCES MACHINE", sep]

        # GPU section
        if not self.gpu_available:
            out.append("\n GPU : non disponible (nvidia-smi absent)")
        else:
            out += [
                f"\n GPU : {self.gpu_name}",
                f" VRAM totale : {self.vram_total_mb} Mo",
                f" VRAM utilisée: {self.vram_used_mb} Mo ({self._pct(self.vram_used_mb, self.vram_total_mb)}%)",
                f" VRAM libre : {self.vram_free_mb} Mo",
                f" Utilisation : {self.gpu_util_pct}%",
            ]
            if not self.gpu_processes:
                out.append(" Aucun processus GPU actif")
            else:
                out.append(f" Processus GPU ({len(self.gpu_processes)}) :")
                for proc in self.gpu_processes:
                    base = proc.name.split("/")[-1] if "/" in proc.name else proc.name
                    proj = self._guess_project(proc.name)
                    suffix = f" ({proj})" if proj else ""
                    out.append(f" PID {proc.pid}: {base}{suffix} — {proc.vram_mb} Mo")

        # RAM section
        out += [
            f"\n RAM : {self.ram_total_gb:.1f} Go total",
            f" Utilisée : {self.ram_used_gb:.1f} Go",
            f" Disponible : {self.ram_available_gb:.1f} Go",
        ]

        # CPU section
        out += [
            f"\n CPU : {self.cpu_count} cœurs",
            f" Load avg : {self.load_avg_1m:.1f} (1m) / {self.load_avg_5m:.1f} (5m)",
        ]

        out.append(sep)
        return "\n".join(out)

    @staticmethod
    def _pct(used: int, total: int) -> int:
        """Integer percentage of used/total; 0 when total is not positive."""
        if total <= 0:
            return 0
        return round(used * 100 / total)

    @staticmethod
    def _guess_project(path: str) -> str:
        """Guess the owning project from a process path.

        Returns the path component right after an "ai" component (stripped of
        any dotted suffix), or "" when no such component exists.
        """
        segments = path.split("/")
        for idx, seg in enumerate(segments[:-1]):
            if seg == "ai":
                return segments[idx + 1].split(".")[0]
        return ""
|
||||||
|
|
||||||
|
|
||||||
|
def check_resources() -> ResourceStatus:
    """Collect the current state of machine resources (GPU, RAM, CPU).

    Best-effort: each probe (nvidia-smi, /proc/meminfo, os.getloadavg) is
    wrapped in a broad try/except, so a missing tool or unsupported platform
    leaves the corresponding fields at their zero defaults instead of raising.

    Returns:
        A populated ResourceStatus (fields stay at defaults when a probe fails).
    """
    status = ResourceStatus()

    # --- GPU ---
    # NOTE: gpu_available only means nvidia-smi is on PATH; the queries below
    # may still fail (and be swallowed), leaving the VRAM fields at 0.
    if shutil.which("nvidia-smi"):
        status.gpu_available = True
        try:
            out = subprocess.run(
                ["nvidia-smi", "--query-gpu=name,memory.total,memory.used,memory.free,utilization.gpu",
                 "--format=csv,noheader,nounits"],
                capture_output=True, text=True, timeout=5
            )
            if out.returncode == 0:
                # Expected output: one CSV line "name, total, used, free, util"
                # (nounits -> bare integers in MB / percent).
                parts = [p.strip() for p in out.stdout.strip().split(",")]
                if len(parts) >= 5:
                    status.gpu_name = parts[0]
                    status.vram_total_mb = int(parts[1])
                    status.vram_used_mb = int(parts[2])
                    status.vram_free_mb = int(parts[3])
                    status.gpu_util_pct = int(parts[4])
        except Exception:
            pass  # query failed or timed out: keep zero defaults

        # GPU processes (one CSV line per compute app)
        try:
            out = subprocess.run(
                ["nvidia-smi", "--query-compute-apps=pid,process_name,used_gpu_memory",
                 "--format=csv,noheader,nounits"],
                capture_output=True, text=True, timeout=5
            )
            if out.returncode == 0 and out.stdout.strip():
                for line in out.stdout.strip().splitlines():
                    parts = [p.strip() for p in line.split(",")]
                    if len(parts) >= 3:
                        try:
                            status.gpu_processes.append(GpuProcess(
                                pid=int(parts[0]),
                                name=parts[1],
                                vram_mb=int(parts[2]),
                            ))
                        except ValueError:
                            pass  # malformed row (e.g. "[N/A]" memory): skip it
        except Exception:
            pass

    # --- RAM ---
    # Linux-only: parsed from /proc/meminfo; on other platforms the open()
    # fails and RAM fields stay at 0.
    try:
        with open("/proc/meminfo") as f:
            meminfo = {}
            for line in f:
                parts = line.split()
                if len(parts) >= 2:
                    key = parts[0].rstrip(":")
                    meminfo[key] = int(parts[1])  # in kB

        # 1048576 = 1024 * 1024 (kB -> GB).
        status.ram_total_gb = meminfo.get("MemTotal", 0) / 1048576
        status.ram_free_gb = meminfo.get("MemFree", 0) / 1048576
        status.ram_available_gb = meminfo.get("MemAvailable", 0) / 1048576
        status.ram_used_gb = status.ram_total_gb - status.ram_free_gb
    except Exception:
        pass

    # --- CPU ---
    try:
        import os
        status.cpu_count = os.cpu_count() or 0
        # os.getloadavg() is unavailable on some platforms (e.g. Windows);
        # the broad except keeps the load fields at 0.0 there.
        load = os.getloadavg()
        status.load_avg_1m = load[0]
        status.load_avg_5m = load[1]
    except Exception:
        pass

    return status
|
||||||
|
|
||||||
|
|
||||||
|
def require_resources(
    vram_free_mb: int = 0,
    ram_free_gb: float = 0,
    max_gpu_util_pct: int = 100,
    fail_if_gpu_busy: bool = False,
) -> ResourceStatus:
    """Check that the minimum required resources are currently available.

    Args:
        vram_free_mb: Minimum free VRAM required (MB). 0 disables the GPU check.
        ram_free_gb: Minimum available RAM required (GB).
        max_gpu_util_pct: Maximum tolerated GPU utilization (%).
        fail_if_gpu_busy: If True, fail when other processes are using the GPU.

    Returns:
        The current ResourceStatus when every requirement is met.

    Raises:
        RuntimeError: with a detailed message (including the full resource
            summary) when at least one requirement is not met.
    """
    status = check_resources()
    problems = []

    # GPU / VRAM requirement
    if vram_free_mb > 0:
        if not status.gpu_available:
            problems.append(f"GPU requis ({vram_free_mb} Mo VRAM) mais nvidia-smi non disponible")
        elif status.vram_free_mb < vram_free_mb:
            # List the processes currently holding VRAM to help diagnosis.
            procs = ""
            if status.gpu_processes:
                procs = "\n Processus occupant le GPU :"
                for proc in status.gpu_processes:
                    short = proc.name.split("/")[-1]
                    project = status._guess_project(proc.name)
                    label = f" ({project})" if project else ""
                    procs += f"\n PID {proc.pid}: {short}{label} — {proc.vram_mb} Mo"
            problems.append(
                f"VRAM insuffisante : {status.vram_free_mb} Mo libre, "
                f"{vram_free_mb} Mo requis (utilisé: {status.vram_used_mb}/{status.vram_total_mb} Mo)"
                f"{procs}"
            )

    # GPU utilization ceiling (only meaningful when a GPU is present)
    if status.gpu_available and max_gpu_util_pct < 100 and status.gpu_util_pct > max_gpu_util_pct:
        problems.append(
            f"GPU trop chargé : {status.gpu_util_pct}% d'utilisation "
            f"(max toléré: {max_gpu_util_pct}%)"
        )

    # Exclusive-GPU requirement
    if fail_if_gpu_busy and status.gpu_processes:
        names = [f"{p.name.split('/')[-1]} (PID {p.pid})" for p in status.gpu_processes]
        problems.append(f"GPU occupé par : {', '.join(names)}")

    # RAM requirement
    if ram_free_gb > 0 and status.ram_available_gb < ram_free_gb:
        problems.append(
            f"RAM insuffisante : {status.ram_available_gb:.1f} Go disponible, "
            f"{ram_free_gb:.1f} Go requis"
        )

    if problems:
        raise RuntimeError(
            "Ressources insuffisantes :\n " + "\n ".join(problems)
            + "\n\n" + status.summary()
        )

    return status
|
||||||
|
|
||||||
|
|
||||||
|
def wait_for_resources(
    vram_free_mb: int = 0,
    ram_free_gb: float = 0,
    max_gpu_util_pct: int = 100,
    timeout_minutes: int = 30,
    check_interval_seconds: int = 30,
) -> ResourceStatus:
    """Poll until the requested resources become available (with a timeout).

    Prints a status line on each failed poll so long waits stay visible.
    Useful before a fine-tuning run or a heavy batch job.

    Returns:
        The ResourceStatus once the requirements are met.

    Raises:
        TimeoutError: if the requirements are still unmet after
            `timeout_minutes` minutes.
    """
    deadline = time.time() + timeout_minutes * 60
    polls = 0

    while time.time() < deadline:
        try:
            ok_status = require_resources(
                vram_free_mb=vram_free_mb,
                ram_free_gb=ram_free_gb,
                max_gpu_util_pct=max_gpu_util_pct,
            )
        except RuntimeError:
            polls += 1
            if polls == 1:
                # First failure: announce what we are waiting for.
                print(f"En attente de ressources (timeout: {timeout_minutes}min)...")
                print(f" Requis: VRAM >= {vram_free_mb} Mo, RAM >= {ram_free_gb} Go")

            remaining = int((deadline - time.time()) / 60)
            snapshot = check_resources()
            gpu_info = f"VRAM libre: {snapshot.vram_free_mb} Mo" if snapshot.gpu_available else "pas de GPU"
            print(
                f" [{polls}] {gpu_info}, RAM dispo: {snapshot.ram_available_gb:.1f} Go "
                f"— encore {remaining}min max",
                flush=True,
            )
            time.sleep(check_interval_seconds)
        else:
            if polls:
                print(f"\nRessources disponibles après {polls * check_interval_seconds}s d'attente.")
            return ok_status

    raise TimeoutError(
        f"Timeout ({timeout_minutes}min) : ressources toujours insuffisantes.\n"
        + check_resources().summary()
    )
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """CLI entry point: print the resource summary, then optionally enforce thresholds."""
    import argparse

    parser = argparse.ArgumentParser(description="Vérification des ressources machine")
    parser.add_argument("--vram", type=int, default=0,
                        help="VRAM libre minimale requise (Mo)")
    parser.add_argument("--ram", type=float, default=0,
                        help="RAM disponible minimale (Go)")
    parser.add_argument("--gpu-util", type=int, default=100,
                        help="Utilisation GPU max tolérée (%%)")
    parser.add_argument("--wait", action="store_true",
                        help="Attendre que les ressources soient disponibles")
    parser.add_argument("--timeout", type=int, default=30,
                        help="Timeout d'attente en minutes (défaut: 30)")
    args = parser.parse_args()

    # Always show the current state first.
    print(check_resources().summary())

    # Enforce thresholds only when at least one was actually requested.
    if not (args.vram > 0 or args.ram > 0 or args.gpu_util < 100):
        return

    thresholds = dict(
        vram_free_mb=args.vram,
        ram_free_gb=args.ram,
        max_gpu_util_pct=args.gpu_util,
    )
    if args.wait:
        try:
            wait_for_resources(timeout_minutes=args.timeout, **thresholds)
        except TimeoutError as e:
            print(f"\nERREUR : {e}", file=sys.stderr)
            sys.exit(1)
        print("\nOK — ressources disponibles.")
    else:
        try:
            require_resources(**thresholds)
        except RuntimeError as e:
            print(f"\nERREUR : {e}", file=sys.stderr)
            sys.exit(1)
        print("\nOK — ressources suffisantes.")
|
||||||
|
|
||||||
|
|
||||||
|
# Standalone usage: print the resource summary and optionally enforce
# thresholds (see --vram/--ram/--gpu-util/--wait).
if __name__ == "__main__":
    main()
|
||||||
@@ -77,6 +77,9 @@ NAME_IGNORE = {
|
|||||||
"TRAITEMENT", "INTERVENTION", "OPERATOIRE", "RAPPORT",
|
"TRAITEMENT", "INTERVENTION", "OPERATOIRE", "RAPPORT",
|
||||||
"PATIENT", "MONSIEUR", "MADAME", "DOCTEUR",
|
"PATIENT", "MONSIEUR", "MADAME", "DOCTEUR",
|
||||||
"NORMAL", "POSITIF", "NEGATIF", "PRESENT", "ABSENT",
|
"NORMAL", "POSITIF", "NEGATIF", "PRESENT", "ABSENT",
|
||||||
|
# Acronymes médicaux courts (aussi patronymes/prénoms INSEE → FP évaluateur)
|
||||||
|
"EVA", # Échelle Visuelle Analogique
|
||||||
|
"RAI", # Recherche d'Agglutinines Irrégulières
|
||||||
# Instructions soins Trackare (aussi patronymes INSEE → faux positifs évaluateur)
|
# Instructions soins Trackare (aussi patronymes INSEE → faux positifs évaluateur)
|
||||||
"LEVER", "COUCHER", "MANGER", "MARCHER", "SORTIR", "POSE",
|
"LEVER", "COUCHER", "MANGER", "MARCHER", "SORTIR", "POSE",
|
||||||
"GAUCHE", "DROITE", "ANTERIEUR", "POSTERIEUR",
|
"GAUCHE", "DROITE", "ANTERIEUR", "POSTERIEUR",
|
||||||
|
|||||||
@@ -523,6 +523,17 @@ def main():
|
|||||||
help="Seed pour la reproductibilité de l'augmentation")
|
help="Seed pour la reproductibilité de l'augmentation")
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
# Vérification des ressources (GPU requis pour fine-tuning)
|
||||||
|
from scripts.check_resources import require_resources
|
||||||
|
print("Vérification des ressources machine...")
|
||||||
|
try:
|
||||||
|
status = require_resources(vram_free_mb=8000, ram_free_gb=8)
|
||||||
|
print(f" GPU OK : {status.gpu_name}, {status.vram_free_mb} Mo VRAM libre")
|
||||||
|
print(f" RAM OK : {status.ram_available_gb:.1f} Go disponible\n")
|
||||||
|
except RuntimeError as e:
|
||||||
|
print(f"\n{e}", file=sys.stderr)
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
# Chemins des gazetteers
|
# Chemins des gazetteers
|
||||||
project_root = Path(__file__).parent.parent
|
project_root = Path(__file__).parent.parent
|
||||||
prenoms_file = project_root / "data" / "insee" / "prenoms_france.txt"
|
prenoms_file = project_root / "data" / "insee" / "prenoms_france.txt"
|
||||||
|
|||||||
Reference in New Issue
Block a user