diff --git a/run_batch_silver_export.py b/run_batch_silver_export.py
index 43a93e0..55b5ba7 100644
--- a/run_batch_silver_export.py
+++ b/run_batch_silver_export.py
@@ -155,6 +155,20 @@ def main():
 
     n_workers = args.workers
 
+    # Vérification des ressources (RAM surtout — chaque worker charge ~4 Go de modèles NER)
+    from scripts.check_resources import require_resources
+    ram_needed = n_workers * 4
+    print(f"Vérification des ressources ({n_workers} workers × ~4 Go = ~{ram_needed} Go RAM)...")
+    try:
+        status = require_resources(ram_free_gb=ram_needed)
+        print(f"  RAM OK : {status.ram_available_gb:.1f} Go disponible")
+        if status.gpu_available:
+            print(f"  GPU : {status.gpu_name}, {status.vram_free_mb} Mo VRAM libre")
+        print()
+    except RuntimeError as e:
+        print(f"\n{e}", file=sys.stderr)
+        sys.exit(1)
+
     # Collecter tous les PDFs disponibles (excluant audit_30)
     all_pdfs = []
     for ogc_dir in sorted(SRC.iterdir()):
diff --git a/scripts/check_resources.py b/scripts/check_resources.py
new file mode 100644
index 0000000..165bec0
--- /dev/null
+++ b/scripts/check_resources.py
@@ -0,0 +1,347 @@
+#!/usr/bin/env python3
+"""Vérification des ressources machine (GPU, RAM, CPU) avant exécution.
+
+Utilisable comme module ou en standalone :
+    from scripts.check_resources import check_resources, require_resources
+
+    # Vérification simple (lève RuntimeError si insuffisant)
+    require_resources(vram_free_mb=2000, ram_free_gb=4)
+
+    # Vérification informative (retourne un ResourceStatus)
+    status = check_resources()
+    print(status)
+
+    # En standalone
+    python scripts/check_resources.py
+    python scripts/check_resources.py --vram 2000 --ram 4 --wait
+"""
+import subprocess
+import shutil
+import time
+import sys
+from dataclasses import dataclass, field
+from typing import List
+
+
+@dataclass
+class GpuProcess:
+    pid: int
+    name: str
+    vram_mb: int
+
+
+@dataclass
+class ResourceStatus:
+    # GPU
+    gpu_available: bool = False
+    gpu_name: str = ""
+    vram_total_mb: int = 0
+    vram_used_mb: int = 0
+    vram_free_mb: int = 0
+    gpu_util_pct: int = 0
+    gpu_processes: List[GpuProcess] = field(default_factory=list)
+    # RAM
+    ram_total_gb: float = 0.0
+    ram_used_gb: float = 0.0
+    ram_free_gb: float = 0.0
+    ram_available_gb: float = 0.0
+    # CPU
+    cpu_count: int = 0
+    load_avg_1m: float = 0.0
+    load_avg_5m: float = 0.0
+
+    def summary(self) -> str:
+        lines = []
+        lines.append("=" * 55)
+        lines.append("  ÉTAT DES RESSOURCES MACHINE")
+        lines.append("=" * 55)
+
+        # GPU
+        if self.gpu_available:
+            lines.append(f"\n  GPU : {self.gpu_name}")
+            lines.append(f"    VRAM totale  : {self.vram_total_mb} Mo")
+            lines.append(f"    VRAM utilisée: {self.vram_used_mb} Mo ({self._pct(self.vram_used_mb, self.vram_total_mb)}%)")
+            lines.append(f"    VRAM libre   : {self.vram_free_mb} Mo")
+            lines.append(f"    Utilisation  : {self.gpu_util_pct}%")
+            if self.gpu_processes:
+                lines.append(f"    Processus GPU ({len(self.gpu_processes)}) :")
+                for p in self.gpu_processes:
+                    short_name = p.name.split("/")[-1] if "/" in p.name else p.name
+                    project = self._guess_project(p.name)
+                    label = f" ({project})" if project else ""
+                    lines.append(f"      PID {p.pid}: {short_name}{label} — {p.vram_mb} Mo")
+            else:
+                lines.append("    Aucun processus GPU actif")
+        else:
+            lines.append("\n  GPU : non disponible (nvidia-smi absent)")
+
+        # RAM
+        lines.append(f"\n  RAM : {self.ram_total_gb:.1f} Go total")
+        lines.append(f"    Utilisée   : {self.ram_used_gb:.1f} Go")
+        lines.append(f"    Disponible : {self.ram_available_gb:.1f} Go")
+
+        # CPU
+        lines.append(f"\n  CPU : {self.cpu_count} cœurs")
+        lines.append(f"    Load avg : {self.load_avg_1m:.1f} (1m) / {self.load_avg_5m:.1f} (5m)")
+
+        lines.append("=" * 55)
+        return "\n".join(lines)
+
+    @staticmethod
+    def _pct(used: int, total: int) -> int:
+        return round(used * 100 / total) if total > 0 else 0
+
+    @staticmethod
+    def _guess_project(path: str) -> str:
+        """Devine le projet à partir du chemin du processus."""
+        parts = path.split("/")
+        for i, p in enumerate(parts):
+            if p == "ai" and i + 1 < len(parts):
+                return parts[i + 1].split(".")[0]
+        return ""
+
+
+def check_resources() -> ResourceStatus:
+    """Collecte l'état actuel des ressources machine."""
+    status = ResourceStatus()
+
+    # --- GPU ---
+    if shutil.which("nvidia-smi"):
+        status.gpu_available = True
+        try:
+            out = subprocess.run(
+                ["nvidia-smi", "--query-gpu=name,memory.total,memory.used,memory.free,utilization.gpu",
+                 "--format=csv,noheader,nounits"],
+                capture_output=True, text=True, timeout=5
+            )
+            if out.returncode == 0:
+                parts = [p.strip() for p in out.stdout.strip().splitlines()[0].split(",")]  # first line only: one CSV line per GPU on multi-GPU hosts
+                if len(parts) >= 5:
+                    status.gpu_name = parts[0]
+                    status.vram_total_mb = int(parts[1])
+                    status.vram_used_mb = int(parts[2])
+                    status.vram_free_mb = int(parts[3])
+                    status.gpu_util_pct = int(parts[4])
+        except Exception:
+            pass
+
+        # Processus GPU
+        try:
+            out = subprocess.run(
+                ["nvidia-smi", "--query-compute-apps=pid,process_name,used_gpu_memory",
+                 "--format=csv,noheader,nounits"],
+                capture_output=True, text=True, timeout=5
+            )
+            if out.returncode == 0 and out.stdout.strip():
+                for line in out.stdout.strip().splitlines():
+                    parts = [p.strip() for p in line.split(",")]
+                    if len(parts) >= 3:
+                        try:
+                            status.gpu_processes.append(GpuProcess(
+                                pid=int(parts[0]),
+                                name=parts[1],
+                                vram_mb=int(parts[2]),
+                            ))
+                        except ValueError:
+                            pass
+        except Exception:
+            pass
+
+    # --- RAM ---
+    try:
+        with open("/proc/meminfo") as f:
+            meminfo = {}
+            for line in f:
+                parts = line.split()
+                if len(parts) >= 2:
+                    key = parts[0].rstrip(":")
+                    meminfo[key] = int(parts[1])  # en kB
+
+        status.ram_total_gb = meminfo.get("MemTotal", 0) / 1048576
+        status.ram_free_gb = meminfo.get("MemFree", 0) / 1048576
+        status.ram_available_gb = meminfo.get("MemAvailable", 0) / 1048576
+        status.ram_used_gb = status.ram_total_gb - status.ram_free_gb
+    except Exception:
+        pass
+
+    # --- CPU ---
+    try:
+        import os
+        status.cpu_count = os.cpu_count() or 0
+        load = os.getloadavg()
+        status.load_avg_1m = load[0]
+        status.load_avg_5m = load[1]
+    except Exception:
+        pass
+
+    return status
+
+
+def require_resources(
+    vram_free_mb: int = 0,
+    ram_free_gb: float = 0,
+    max_gpu_util_pct: int = 100,
+    fail_if_gpu_busy: bool = False,
+) -> ResourceStatus:
+    """Vérifie que les ressources minimales sont disponibles.
+
+    Args:
+        vram_free_mb: VRAM libre minimale requise (Mo). 0 = pas de vérification GPU.
+        ram_free_gb: RAM disponible minimale (Go).
+        max_gpu_util_pct: Utilisation GPU max tolérée (%).
+        fail_if_gpu_busy: Si True, échoue si d'autres processus utilisent le GPU.
+
+    Returns:
+        ResourceStatus si tout est ok.
+
+    Raises:
+        RuntimeError avec détails si ressources insuffisantes.
+    """
+    status = check_resources()
+    errors = []
+
+    if vram_free_mb > 0:
+        if not status.gpu_available:
+            errors.append(f"GPU requis ({vram_free_mb} Mo VRAM) mais nvidia-smi non disponible")
+        elif status.vram_free_mb < vram_free_mb:
+            procs = ""
+            if status.gpu_processes:
+                procs = "\n  Processus occupant le GPU :"
+                for p in status.gpu_processes:
+                    short = p.name.split("/")[-1]
+                    project = status._guess_project(p.name)
+                    label = f" ({project})" if project else ""
+                    procs += f"\n    PID {p.pid}: {short}{label} — {p.vram_mb} Mo"
+            errors.append(
+                f"VRAM insuffisante : {status.vram_free_mb} Mo libre, "
+                f"{vram_free_mb} Mo requis (utilisé: {status.vram_used_mb}/{status.vram_total_mb} Mo)"
+                f"{procs}"
+            )
+
+    if max_gpu_util_pct < 100 and status.gpu_available:
+        if status.gpu_util_pct > max_gpu_util_pct:
+            errors.append(
+                f"GPU trop chargé : {status.gpu_util_pct}% d'utilisation "
+                f"(max toléré: {max_gpu_util_pct}%)"
+            )
+
+    if fail_if_gpu_busy and status.gpu_processes:
+        names = [f"{p.name.split('/')[-1]} (PID {p.pid})" for p in status.gpu_processes]
+        errors.append(f"GPU occupé par : {', '.join(names)}")
+
+    if ram_free_gb > 0 and status.ram_available_gb < ram_free_gb:
+        errors.append(
+            f"RAM insuffisante : {status.ram_available_gb:.1f} Go disponible, "
+            f"{ram_free_gb:.1f} Go requis"
+        )
+
+    if errors:
+        msg = "Ressources insuffisantes :\n  " + "\n  ".join(errors)
+        msg += "\n\n" + status.summary()
+        raise RuntimeError(msg)
+
+    return status
+
+
+def wait_for_resources(
+    vram_free_mb: int = 0,
+    ram_free_gb: float = 0,
+    max_gpu_util_pct: int = 100,
+    timeout_minutes: int = 30,
+    check_interval_seconds: int = 30,
+) -> ResourceStatus:
+    """Attend que les ressources soient disponibles (avec timeout).
+
+    Affiche un message périodique tant que les ressources sont insuffisantes.
+    Utile avant un fine-tuning ou un batch lourd.
+
+    Returns:
+        ResourceStatus quand les ressources sont disponibles.
+
+    Raises:
+        TimeoutError si le timeout est atteint.
+    """
+    deadline = time.time() + timeout_minutes * 60
+    attempt = 0
+
+    while time.time() < deadline:
+        try:
+            status = require_resources(
+                vram_free_mb=vram_free_mb,
+                ram_free_gb=ram_free_gb,
+                max_gpu_util_pct=max_gpu_util_pct,
+            )
+            if attempt > 0:
+                print(f"\nRessources disponibles après {attempt * check_interval_seconds}s d'attente.")
+            return status
+        except RuntimeError:
+            attempt += 1
+            if attempt == 1:
+                print(f"En attente de ressources (timeout: {timeout_minutes}min)...")
+                print(f"  Requis: VRAM >= {vram_free_mb} Mo, RAM >= {ram_free_gb} Go")
+
+            remaining = int((deadline - time.time()) / 60)
+            status = check_resources()
+            gpu_info = f"VRAM libre: {status.vram_free_mb} Mo" if status.gpu_available else "pas de GPU"
+            print(
+                f"  [{attempt}] {gpu_info}, RAM dispo: {status.ram_available_gb:.1f} Go "
+                f"— encore {remaining}min max",
+                flush=True,
+            )
+            time.sleep(check_interval_seconds)
+
+    raise TimeoutError(
+        f"Timeout ({timeout_minutes}min) : ressources toujours insuffisantes.\n"
+        + check_resources().summary()
+    )
+
+
+def main():
+    import argparse
+
+    parser = argparse.ArgumentParser(description="Vérification des ressources machine")
+    parser.add_argument("--vram", type=int, default=0,
+                        help="VRAM libre minimale requise (Mo)")
+    parser.add_argument("--ram", type=float, default=0,
+                        help="RAM disponible minimale (Go)")
+    parser.add_argument("--gpu-util", type=int, default=100,
+                        help="Utilisation GPU max tolérée (%%)")
+    parser.add_argument("--wait", action="store_true",
+                        help="Attendre que les ressources soient disponibles")
+    parser.add_argument("--timeout", type=int, default=30,
+                        help="Timeout d'attente en minutes (défaut: 30)")
+    args = parser.parse_args()
+
+    # Afficher l'état actuel
+    status = check_resources()
+    print(status.summary())
+
+    # Vérifier les seuils si demandés
+    if args.vram > 0 or args.ram > 0 or args.gpu_util < 100:
+        if args.wait:
+            try:
+                wait_for_resources(
+                    vram_free_mb=args.vram,
+                    ram_free_gb=args.ram,
+                    max_gpu_util_pct=args.gpu_util,
+                    timeout_minutes=args.timeout,
+                )
+                print("\nOK — ressources disponibles.")
+            except TimeoutError as e:
+                print(f"\nERREUR : {e}", file=sys.stderr)
+                sys.exit(1)
+        else:
+            try:
+                require_resources(
+                    vram_free_mb=args.vram,
+                    ram_free_gb=args.ram,
+                    max_gpu_util_pct=args.gpu_util,
+                )
+                print("\nOK — ressources suffisantes.")
+            except RuntimeError as e:
+                print(f"\nERREUR : {e}", file=sys.stderr)
+                sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/evaluate_quality.py b/scripts/evaluate_quality.py
index 60479ed..1f362a1 100644
--- a/scripts/evaluate_quality.py
+++ b/scripts/evaluate_quality.py
@@ -77,6 +77,9 @@ NAME_IGNORE = {
     "TRAITEMENT", "INTERVENTION", "OPERATOIRE", "RAPPORT", "PATIENT",
     "MONSIEUR", "MADAME", "DOCTEUR", "NORMAL", "POSITIF", "NEGATIF",
     "PRESENT", "ABSENT",
+    # Acronymes médicaux courts (aussi patronymes/prénoms INSEE → FP évaluateur)
+    "EVA",  # Échelle Visuelle Analogique
+    "RAI",  # Recherche d'Agglutinines Irrégulières
     # Instructions soins Trackare (aussi patronymes INSEE → faux positifs évaluateur)
     "LEVER", "COUCHER", "MANGER", "MARCHER", "SORTIR",
     "POSE", "GAUCHE", "DROITE", "ANTERIEUR", "POSTERIEUR",
diff --git a/scripts/finetune_camembert_bio.py b/scripts/finetune_camembert_bio.py
index bfbedf1..c0301c7 100644
--- a/scripts/finetune_camembert_bio.py
+++ b/scripts/finetune_camembert_bio.py
@@ -523,6 +523,17 @@ def main():
                         help="Seed pour la reproductibilité de l'augmentation")
     args = parser.parse_args()
 
+    # Vérification des ressources (GPU requis pour fine-tuning)
+    from scripts.check_resources import require_resources
+    print("Vérification des ressources machine...")
+    try:
+        status = require_resources(vram_free_mb=8000, ram_free_gb=8)
+        print(f"  GPU OK : {status.gpu_name}, {status.vram_free_mb} Mo VRAM libre")
+        print(f"  RAM OK : {status.ram_available_gb:.1f} Go disponible\n")
+    except RuntimeError as e:
+        print(f"\n{e}", file=sys.stderr)
+        sys.exit(1)
+
     # Chemins des gazetteers
     project_root = Path(__file__).parent.parent
     prenoms_file = project_root / "data" / "insee" / "prenoms_france.txt"