Files
t2a-finetune/data/datasets/stats.json
2026-03-05 00:37:36 +01:00

31 lines
738 B
JSON

{
"sources": {
"ccam_chatml.jsonl": 1500,
"cim10_chatml.jsonl": 1500,
"cocoa_chatml.jsonl": 2000,
"discrimination_chatml.jsonl": 799,
"fascicule_reasoning_chatml.jsonl": 453,
"guide_metho_chatml.jsonl": 364,
"negative_chatml.jsonl": 1000,
"pipeline_chatml.jsonl": 2795,
"reasoning_chatml.jsonl": 1359,
"referentiels_chatml.jsonl": 5336,
"synthetic_chatml.jsonl": 600
},
"total": 17706,
"train": {
"count": 15936,
"total_tokens_approx": 3739820,
"avg_tokens": 235,
"max_tokens": 1042,
"min_tokens": 41
},
"eval": {
"count": 1770,
"total_tokens_approx": 405094,
"avg_tokens": 229,
"max_tokens": 1030,
"min_tokens": 42
},
"eval_ratio": 0.1
}