feat(phase2): FINESS gazetteers (102K facilities) + CamemBERT-bio fine-tuning (F1 = 89%)

FINESS gazetteers (data.gouv.fr open data):
- 102K FINESS numbers → detected by exact lookup in _mask_admin_label + selective_rescan
- 122K facility names, 113K phone numbers, 76K addresses (available)
- A 9-digit number matching a real FINESS is masked even without a "FINESS" label

CamemBERT-bio fine-tuning (almanach/camembert-bio-base):
- Silver-annotation export rewritten: original ↔ pseudonymized alignment (difflib) → 6862 B- entities (vs. 3344 with the old audit-only export) over 222K tokens
- Sliding windows (200 tokens, stride 100) for long documents
- WeightedNERTrainer with capped class weights (max 10x) + label smoothing
- Result: Precision = 88.1%, Recall = 89.8%, F1 = 88.9% (20 epochs, lr = 1e-5)
- Model saved under models/camembert-bio-deid/best (not committed)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
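The gazetteer lookup itself is not part of this diff. As a rough sketch of the exact-lookup idea described above — the function names and the one-number-per-line file layout are hypothetical; the real integration lives in _mask_admin_label and selective_rescan:

import re
from pathlib import Path

# FINESS identifiers are 9-digit numbers; \b keeps the match on whole numbers.
FINESS_RE = re.compile(r"\b\d{9}\b")

def load_finess_gazetteer(path: Path) -> frozenset:
    """Load one FINESS number per line into a set for O(1) exact lookup.
    (Hypothetical layout; the real data.gouv.fr dump is a CSV whose FINESS
    column would first need to be extracted.)"""
    return frozenset(
        line.strip()
        for line in path.read_text(encoding="utf-8").splitlines()
        if line.strip()
    )

def mask_known_finess(text: str, gazetteer: frozenset) -> str:
    """Mask any 9-digit number that matches a real FINESS, even when no
    'FINESS' label appears nearby (the behavior described above)."""
    return FINESS_RE.sub(
        lambda m: "[FINESS]" if m.group(0) in gazetteer else m.group(0),
        text,
    )

A set lookup keeps the membership check O(1) per candidate, so scanning every 9-digit number in a document stays cheap.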
@@ -1,9 +1,10 @@
#!/usr/bin/env python3
"""
Export silver annotations — Generates BIO training data from the existing pipeline.
===================================================================================
Uses the current regex+NER+VLM pipeline to produce "silver standard" annotations
on the 706 OGCs. These annotations serve as the basis for fine-tuning CamemBERT-bio.
Export silver annotations — BIO via alignment of original ↔ pseudonymized text.
===============================================================================
Aligns the text extracted from the original PDF with the pseudonymized text
(.pseudonymise.txt) to build reliable BIO annotations. The placeholders [NOM],
[TEL], etc. in the pseudonymized text indicate exactly which tokens were masked.

Usage:
    python scripts/export_silver_annotations.py [--limit N] [--out-dir DIR]
@@ -13,21 +14,15 @@ BIO format: TOKEN\tLABEL (one token per line, blank lines between sentences)
"""
import sys
import re
import json
import difflib
import argparse
from pathlib import Path
from typing import List, Tuple
from typing import Dict, List, Tuple

sys.path.insert(0, str(Path(__file__).parent.parent))

# Regex to detect placeholders and rebuild the alignment
PLACEHOLDER_RE = re.compile(
    r"\[(NOM|TEL|EMAIL|NIR|IPP|DOSSIER|NDA|EPISODE|RPPS|DATE_NAISSANCE|"
    r"ADRESSE|CODE_POSTAL|VILLE|MASK|FINESS|OGC|AGE|ETAB|IBAN)\]"
)

# Mapping placeholder → BIO label
PH_TO_BIO = {
PLACEHOLDER_TO_BIO: Dict[str, str] = {
    "NOM": "PER",
    "TEL": "TEL",
    "EMAIL": "EMAIL",
@@ -41,78 +36,178 @@ PH_TO_BIO = {
    "ADRESSE": "ADRESSE",
    "CODE_POSTAL": "ZIP",
    "VILLE": "VILLE",
    "ETAB": "HOPITAL",
    "FINESS": "HOPITAL",
    "HOPITAL": "HOPITAL",
    "MASK": "HOPITAL",  # [MASK] = hospital masked by force_regex
    "IBAN": "IBAN",
    "AGE": "AGE",
    "OGC": "NDA",
    "MASK": "O",  # generic MASK = no specific annotation
}

RE_PLACEHOLDER = re.compile(r"^\[([A-Z_]+)\]$")
def text_to_bio(pseudonymised_text: str) -> List[Tuple[str, str]]:
    """Converts a pseudonymized text into a BIO sequence.
SRC = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)")
AUDIT_DIR = SRC / "anonymise_audit_30"

    The [PLACEHOLDER] tokens become B-TYPE / I-TYPE.
    Normal tokens become O.

def extract_original_text(pdf_path: Path) -> str:
    """Extracts the raw text from a PDF (same method as the pipeline)."""
    import anonymizer_core_refactored_onnx as core
    pages_text, _, _, _ = core.extract_text_with_fallback_ocr(pdf_path)
    return "\f".join(pages_text)


def tokenize_text(text: str) -> List[str]:
    """Whitespace-split into tokens, cleaning control characters."""
    # Replace \f and \r with \n for the alignment
    text = text.replace("\f", "\n").replace("\r", "")
    tokens = []
    for line in text.split("\n"):
        line_toks = line.split()
        if line_toks:
            tokens.extend(line_toks)
    return tokens


def align_and_annotate(original_text: str, pseudo_text: str) -> List[Tuple[str, str]]:
    """Aligns original and pseudonymized text to build the BIO annotations.

    Uses SequenceMatcher to find the differences.
    When the pseudo side contains [PLACEHOLDER], the corresponding original
    tokens receive the appropriate BIO label.
    """
    orig_tokens = tokenize_text(original_text)
    pseudo_tokens = tokenize_text(pseudo_text)

    # Normalize for the alignment (lowercase, surrounding punctuation stripped, for better matching)
    def normalize(tok):
        return tok.lower().strip(".,;:!?()[]{}\"'")

    orig_norm = [normalize(t) for t in orig_tokens]
    pseudo_norm = [normalize(t) for t in pseudo_tokens]

    sm = difflib.SequenceMatcher(None, orig_norm, pseudo_norm, autojunk=False)
    opcodes = sm.get_opcodes()

    bio_tokens: List[Tuple[str, str]] = []

    # Split the text into segments: alternating normal text / placeholder
    parts = PLACEHOLDER_RE.split(pseudonymised_text)
    # parts = [text, label, text, label, text, ...]
    for tag, i1, i2, j1, j2 in opcodes:
        if tag == "equal":
            # Identical tokens → O
            for t in orig_tokens[i1:i2]:
                bio_tokens.append((t, "O"))

    i = 0
    while i < len(parts):
        if i % 2 == 0:
            # Normal text
            text_part = parts[i]
            for word in text_part.split():
                word = word.strip()
                if word:
                    bio_tokens.append((word, "O"))
        else:
            # Placeholder label
            label = parts[i]
            bio_label = PH_TO_BIO.get(label, "O")
            if bio_label != "O":
                # The placeholder replaces one or more tokens
                bio_tokens.append((f"[{label}]", f"B-{bio_label}"))
        elif tag == "replace":
            # Inspect the pseudo side: which tokens are placeholders?
            pseudo_chunk = pseudo_tokens[j1:j2]
            placeholder_labels = []  # (index_in_pseudo, bio_label) for each placeholder
            non_placeholder_norms = set()
            for pi, pt in enumerate(pseudo_chunk):
                m = RE_PLACEHOLDER.match(pt)
                if m:
                    bio_label = PLACEHOLDER_TO_BIO.get(m.group(1))
                    if bio_label:
                        placeholder_labels.append((pi, bio_label))
                else:
                    non_placeholder_norms.add(normalize(pt))

            if not placeholder_labels:
                # No placeholder → O
                for t in orig_tokens[i1:i2]:
                    bio_tokens.append((t, "O"))
            elif len(placeholder_labels) == 1:
                # A single placeholder: all original tokens (except those that
                # match a non-placeholder token on the pseudo side) take this label
                label = placeholder_labels[0][1]
                first = True
                for t in orig_tokens[i1:i2]:
                    if normalize(t) in non_placeholder_norms:
                        bio_tokens.append((t, "O"))
                        first = True
                    else:
                        prefix = "B-" if first else "I-"
                        bio_tokens.append((t, f"{prefix}{label}"))
                        first = False
            else:
                bio_tokens.append((f"[{label}]", "O"))
        i += 1
                # Several placeholders: distribute the original tokens
                # Strategy: split proportionally, each group starts with B-
                n_orig = i2 - i1
                n_placeholders = len(placeholder_labels)
                # First exclude the tokens that match non-placeholders
                orig_assignments = []
                for t in orig_tokens[i1:i2]:
                    if normalize(t) in non_placeholder_norms:
                        orig_assignments.append(("O", None))
                    else:
                        orig_assignments.append(("PII", None))

                # Distribute the PII tokens among the placeholders
                pii_indices = [k for k, (tp, _) in enumerate(orig_assignments) if tp == "PII"]
                n_pii = len(pii_indices)
                if n_pii > 0 and n_placeholders > 0:
                    chunk_size = max(1, n_pii // n_placeholders)
                    for pi_idx, (_, label) in enumerate(placeholder_labels):
                        start_pii = pi_idx * chunk_size
                        end_pii = (pi_idx + 1) * chunk_size if pi_idx < n_placeholders - 1 else n_pii
                        for k in range(start_pii, min(end_pii, n_pii)):
                            orig_assignments[pii_indices[k]] = ("PII", label)

                # Emit the BIO tokens
                prev_label = None
                for k, (t, (tp, label)) in enumerate(zip(orig_tokens[i1:i2], orig_assignments)):
                    if tp == "O" or label is None:
                        bio_tokens.append((t, "O"))
                        prev_label = None
                    else:
                        prefix = "B-" if label != prev_label else "I-"
                        bio_tokens.append((t, f"{prefix}{label}"))
                        prev_label = label

        elif tag == "delete":
            # Tokens present only in the original → O
            for t in orig_tokens[i1:i2]:
                bio_tokens.append((t, "O"))

        elif tag == "insert":
            # Tokens added in the pseudo text (rare) → ignore
            pass

    return bio_tokens


def export_document(pseudo_path: Path, out_dir: Path) -> int:
    """Exports a pseudonymized file to BIO format. Returns the number of tokens."""
    text = pseudo_path.read_text(encoding="utf-8", errors="replace")
def export_document(pdf_path: Path, pseudo_path: Path, out_dir: Path) -> Tuple[int, int]:
    """Exports a document to BIO format. Returns (n_tokens, n_entities)."""
    # Extract the original text
    original_text = extract_original_text(pdf_path)
    if not original_text.strip():
        return 0, 0

    bio_tokens = text_to_bio(text)
    if not bio_tokens:
        return 0
    # Read the pseudonymized text
    pseudo_text = pseudo_path.read_text(encoding="utf-8")
    if not pseudo_text.strip():
        return 0, 0

    # Write in CoNLL format (TOKEN\tLABEL)
    out_path = out_dir / pseudo_path.name.replace(".pseudonymise.txt", ".bio")
    # Align and annotate
    bio_tokens = align_and_annotate(original_text, pseudo_text)

    # Write in CoNLL format
    out_name = pdf_path.stem + ".bio"
    out_path = out_dir / out_name
    lines = []
    for token, label in bio_tokens:
        # Separate "sentences" with blank lines (heuristic: final period or line break)
        # Separate sentences with blank lines (final punctuation)
        if token in (".", "!", "?") and label == "O":
            lines.append(f"{token}\t{label}")
            lines.append("")  # sentence separator
            lines.append("")
        else:
            lines.append(f"{token}\t{label}")

    out_path.write_text("\n".join(lines), encoding="utf-8")
    return len(bio_tokens)

    n_ents = sum(1 for _, l in bio_tokens if l.startswith("B-"))
    return len(bio_tokens), n_ents


def main():
    parser = argparse.ArgumentParser(description="Export silver BIO annotations")
    parser.add_argument("--input-dir", type=Path,
                        default=Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/anonymise_audit_30"),
                        help="Directory containing the .pseudonymise.txt files")
    parser = argparse.ArgumentParser(description="Export silver BIO annotations (original ↔ pseudo alignment)")
    parser.add_argument("--out-dir", type=Path,
                        default=Path(__file__).parent.parent / "data" / "silver_annotations",
                        help="Output directory")
@@ -121,23 +216,34 @@ def main():

    args.out_dir.mkdir(parents=True, exist_ok=True)

    pseudo_files = sorted(args.input_dir.glob("*.pseudonymise.txt"))
    if args.limit > 0:
        pseudo_files = pseudo_files[:args.limit]
    # Find the PDF + pseudo pairs
    pseudo_files = sorted(AUDIT_DIR.glob("*.pseudonymise.txt"))
    pairs = []
    for pseudo_path in pseudo_files:
        # Find the source PDF
        base_name = pseudo_path.name.replace(".pseudonymise.txt", ".pdf")
        # Look in the OGC subfolders
        found = list(SRC.glob(f"*/{base_name}"))
        if found:
            pairs.append((found[0], pseudo_path))

    print(f"Export silver annotations: {len(pseudo_files)} files → {args.out_dir}")
    if args.limit > 0:
        pairs = pairs[:args.limit]

    print(f"Export silver annotations: {len(pairs)} documents → {args.out_dir}")

    total_tokens = 0
    total_entities = 0
    for f in pseudo_files:
        n = export_document(f, args.out_dir)
        ent_count = sum(1 for line in (args.out_dir / f.name.replace(".pseudonymise.txt", ".bio")).read_text().splitlines()
                        if line and not line.endswith("\tO"))
        total_tokens += n
        total_entities += ent_count
        print(f"  {f.name}: {n} tokens, {ent_count} entities")
    for pdf_path, pseudo_path in pairs:
        try:
            n_tok, n_ent = export_document(pdf_path, pseudo_path, args.out_dir)
            total_tokens += n_tok
            total_entities += n_ent
            print(f"  {pdf_path.name}: {n_tok} tokens, {n_ent} entities")
        except Exception as e:
            print(f"  {pdf_path.name}: ERROR {e}")

    print(f"\nTotal: {total_tokens} tokens, {total_entities} annotated entities")
    print(f"\nTotal: {total_tokens} tokens, {total_entities} B- entities")
    print(f"Output: {args.out_dir}")



@@ -15,8 +15,11 @@ import sys
import argparse
from pathlib import Path
from typing import Dict, List
from collections import Counter

import numpy as np
import torch
from torch import nn

# Check the dependencies
try:
@@ -59,38 +62,60 @@ ID2LABEL = {i: l for l, i in LABEL2ID.items()}
MODEL_NAME = "almanach/camembert-bio-base"


def load_bio_files(data_dir: Path) -> Dict[str, List]:
    """Loads the .bio files into HuggingFace datasets format."""
def load_bio_files(data_dir: Path, window_size: int = 200, stride: int = 100) -> Dict[str, List]:
    """Loads the .bio files and slices them into sliding windows.

    Clinical documents are very long, so each one is sliced into windows of
    ~window_size tokens, advancing by stride tokens (consecutive windows
    overlap). Only windows containing at least one entity are kept, plus a
    small share of entity-free windows, for class balance.
    """
    tokens_list: List[List[str]] = []
    labels_list: List[List[int]] = []

    for bio_file in sorted(data_dir.glob("*.bio")):
        text = bio_file.read_text(encoding="utf-8")
        current_tokens: List[str] = []
        current_labels: List[int] = []
        # Load every token of the document
        all_tokens: List[str] = []
        all_labels: List[int] = []

        for line in text.splitlines():
            line = line.strip()
            if not line:
                # End of sentence
                if current_tokens:
                    tokens_list.append(current_tokens)
                    labels_list.append(current_labels)
                    current_tokens = []
                    current_labels = []
                continue

            parts = line.split("\t")
            if len(parts) != 2:
                continue
            token, label = parts
            label_id = LABEL2ID.get(label, LABEL2ID["O"])
            current_tokens.append(token)
            current_labels.append(label_id)
            all_tokens.append(token)
            all_labels.append(label_id)

        if current_tokens:
            tokens_list.append(current_tokens)
            labels_list.append(current_labels)
        if not all_tokens:
            continue

        # Slice into sliding windows
        n = len(all_tokens)
        for start in range(0, n, stride):
            end = min(start + window_size, n)
            chunk_tokens = all_tokens[start:end]
            chunk_labels = all_labels[start:end]

            # Fix I- labels at the start of a window → B-
            if chunk_labels and chunk_labels[0] > 0:
                lbl_name = LABEL_LIST[chunk_labels[0]]
                if lbl_name.startswith("I-"):
                    b_name = "B-" + lbl_name[2:]
                    if b_name in LABEL2ID:
                        chunk_labels[0] = LABEL2ID[b_name]

            # Keep windows with entities + a few all-"O" windows (~10%)
            has_entities = any(l != 0 for l in chunk_labels)
            if has_entities or (start % (stride * 10) == 0):
                tokens_list.append(chunk_tokens)
                labels_list.append(chunk_labels)

            if end >= n:
                break

    return {"tokens": tokens_list, "ner_tags": labels_list}

@@ -131,6 +156,59 @@ def tokenize_and_align(examples, tokenizer):
    return tokenized


class WeightedNERTrainer(Trainer):
    """Trainer with class weights to counter the O vs. entity imbalance."""

    def __init__(self, class_weights=None, **kwargs):
        super().__init__(**kwargs)
        if class_weights is not None:
            self.class_weights = class_weights.to(self.args.device)
        else:
            self.class_weights = None

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits

        if self.class_weights is not None:
            loss_fct = nn.CrossEntropyLoss(
                weight=self.class_weights,
                ignore_index=-100,
                label_smoothing=0.1,
            )
        else:
            loss_fct = nn.CrossEntropyLoss(ignore_index=-100, label_smoothing=0.1)

        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss


def compute_class_weights(raw_data: Dict, num_labels: int, max_weight: float = 10.0) -> torch.FloatTensor:
    """Computes weights inversely proportional to frequency, capped after normalization."""
    counts = Counter()
    for labels in raw_data["ner_tags"]:
        for l in labels:
            counts[l] += 1

    total = sum(counts.values())
    weights = torch.ones(num_labels)
    for label_id, count in counts.items():
        if count > 0:
            weights[label_id] = total / (num_labels * count)

    # Normalize: O = 1.0
    if weights[0] > 0:
        scale = 1.0 / weights[0]
        weights *= scale

    # Cap AFTER normalization to bound the imbalance
    weights = torch.clamp(weights, max=max_weight)

    print(f"  Class weights (O={weights[0]:.1f}, non-O mean={weights[1:].mean():.1f}, max={weights[1:].max():.1f})")
    return weights


def main():
    parser = argparse.ArgumentParser(description="Fine-tune CamemBERT-bio for de-identification")
    parser.add_argument("--data-dir", type=Path,
@@ -203,6 +281,10 @@ def main():
        "f1": results["overall_f1"],
    }

    # Class weights to counter the ~97% O imbalance
    print("\nComputing class weights...")
    weights = compute_class_weights(raw_data, len(LABEL_LIST))

    # Training
    args.output_dir.mkdir(parents=True, exist_ok=True)
    training_args = TrainingArguments(
@@ -218,13 +300,14 @@ def main():
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        logging_steps=50,
        fp16=False,  # CPU training
        fp16=True,   # GPU training with mixed precision
        report_to="none",
        save_total_limit=2,
    )

    data_collator = DataCollatorForTokenClassification(tokenizer)
    trainer = Trainer(
    trainer = WeightedNERTrainer(
        class_weights=weights,
        model=model,
        args=training_args,
        train_dataset=tokenized["train"],
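The diff is truncated before the trainer call completes, and loading the saved model is not shown anywhere. As a minimal usage sketch with the standard Hugging Face pipeline API — the checkpoint path comes from the commit message; the sample sentence and the aggregation_strategy choice are assumptions:

from transformers import pipeline

# Token-classification pipeline over the fine-tuned checkpoint.
# "simple" aggregation merges B-/I- pieces into whole entity spans.
ner = pipeline(
    "token-classification",
    model="models/camembert-bio-deid/best",
    aggregation_strategy="simple",
)

# Hypothetical input; entity group names follow LABEL_LIST (PER, TEL, ...).
for ent in ner("Patient Jean Dupont, tél. 06 12 34 56 78"):
    print(ent["entity_group"], ent["word"], round(ent["score"], 3))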