Transformers

Here’s an example script that uses torchrunx with transformers.Trainer to fine-tune any causal language model (from transformers) on any text dataset (from datasets) with any number of GPUs or nodes.

https://torchrun.xyz/transformers_train.py

python transformers_train.py --help

(expand)

usage: transformers_train.py [-h] [OPTIONS]

╭─ options ──────────────────────────────────────────────────────────────────╮
│ -h, --help                                                                 │
│     show this help message and exit                                        │
╰────────────────────────────────────────────────────────────────────────────╯
╭─ launcher options ─────────────────────────────────────────────────────────╮
│ For configuring the function launch environment.                           │
│ ────────────────────────────────────────────────────────────────────────── │
│ --launcher.hostnames {[STR [STR ...]]}|{auto,slurm}                        │
│     Nodes to launch the function on. By default, infer from SLURM, else    │
│     ``["localhost"]``. (default: auto)                                     │
│ --launcher.workers-per-host INT|{[INT [INT ...]]}|{cpu,gpu}                │
│     Number of processes to run per node. By default, number of GPUs per    │
│     host. (default: gpu)                                                   │
│ --launcher.ssh-config-file {None}|STR|PATHLIKE                             │
│     For connecting to nodes. By default, ``"~/.ssh/config"`` or            │
│     ``"/etc/ssh/ssh_config"``. (default: None)                             │
│ --launcher.backend {None,nccl,gloo,mpi,ucc}                                │
│     `Backend                                                               │
│     <https://pytorch.org/docs/stable/distributed.html#torch.distributed.Ba │
│     ckend>`_                                                               │
│             for worker process group. By default, NCCL (GPU backend).      │
│             Use GLOO for CPU backend. ``None`` for no process group.       │
│     (default: nccl)                                                        │
│ --launcher.worker-timeout INT                                              │
│     Worker process group timeout (seconds). (default: 600)                 │
│ --launcher.agent-timeout INT                                               │
│     Agent communication timeout (seconds). (default: 180)                  │
│ --launcher.copy-env-vars [STR [STR ...]]                                   │
│     Environment variables to copy from the launcher process to workers.    │
│     Supports Unix pattern matching syntax. (default: PATH LD_LIBRARY       │
│     LIBRARY_PATH 'PYTHON*' 'CUDA*' 'TORCH*' 'PYTORCH*' 'NCCL*')            │
│ --launcher.extra-env-vars {None}|{[STR STR [STR STR ...]]}                 │
│     Additional environment variables to load onto workers. (default: None) │
│ --launcher.env-file {None}|STR|PATHLIKE                                    │
│     Path to a ``.env`` file, containing environment variables to load onto │
│     workers. (default: None)                                               │
╰────────────────────────────────────────────────────────────────────────────╯
╭─ model options ────────────────────────────────────────────────────────────╮
│ --model.name STR                                                           │
│     (required)                                                             │
╰────────────────────────────────────────────────────────────────────────────╯
╭─ dataset options ──────────────────────────────────────────────────────────╮
│ --dataset.path STR                                                         │
│     (required)                                                             │
│ --dataset.name {None}|STR                                                  │
│     (default: None)                                                        │
│ --dataset.split {None}|STR                                                 │
│     (default: None)                                                        │
│ --dataset.text-column STR                                                  │
│     (default: text)                                                        │
│ --dataset.num-samples {None}|INT                                           │
│     (default: None)                                                        │
╰────────────────────────────────────────────────────────────────────────────╯
╭─ trainer options ──────────────────────────────────────────────────────────╮
│ --trainer.output-dir {None}|STR                                            │
│     The output directory where the model predictions and checkpoints will  │
│     be written. Defaults to 'trainer_output' if not provided. (default:    │
│     None)                                                                  │
│ --trainer.overwrite-output-dir, --trainer.no-overwrite-output-dir          │
│     Overwrite the content of the output directory. Use this to continue    │
│     training if output_dir points to a checkpoint directory. (default:     │
│     False)                                                                 │
│ --trainer.do-train, --trainer.no-do-train                                  │
│     Whether to run training. (default: False)                              │
│ --trainer.do-eval, --trainer.no-do-eval                                    │
│     Whether to run eval on the dev set. (default: False)                   │
│ --trainer.do-predict, --trainer.no-do-predict                              │
│     Whether to run predictions on the test set. (default: False)           │
│ --trainer.eval-strategy {NO,STEPS,EPOCH}|STR                               │
│     The evaluation strategy to use. (default: no)                          │
│ --trainer.prediction-loss-only, --trainer.no-prediction-loss-only          │
│     When performing evaluation and predictions, only returns the loss.     │
│     (default: False)                                                       │
│ --trainer.per-device-train-batch-size INT                                  │
│     Batch size per device accelerator core/CPU for training. (default: 8)  │
│ --trainer.per-device-eval-batch-size INT                                   │
│     Batch size per device accelerator core/CPU for evaluation. (default:   │
│     8)                                                                     │
│ --trainer.per-gpu-train-batch-size {None}|INT                              │
│     Deprecated, the use of `--per_device_train_batch_size` is preferred.   │
│     Batch size per GPU/TPU core/CPU for training. (default: None)          │
│ --trainer.per-gpu-eval-batch-size {None}|INT                               │
│     Deprecated, the use of `--per_device_eval_batch_size` is preferred.    │
│     Batch size per GPU/TPU core/CPU for evaluation. (default: None)        │
│ --trainer.gradient-accumulation-steps INT                                  │
│     Number of update steps to accumulate before performing a               │
│     backward/update pass. (default: 1)                                     │
│ --trainer.eval-accumulation-steps {None}|INT                               │
│     Number of predictions steps to accumulate before moving the tensors to │
│     the CPU. (default: None)                                               │
│ --trainer.eval-delay {None}|FLOAT                                          │
│     Number of epochs or steps to wait for before the first evaluation can  │
│     be performed, depending on the eval_strategy. (default: 0)             │
│ --trainer.torch-empty-cache-steps {None}|INT                               │
│     Number of steps to wait before calling                                 │
│     `torch.<device>.empty_cache()`. This can help avoid CUDA out-of-memory │
│     errors by lowering peak VRAM usage at a cost of about [10% slower      │
│     performance](https://github.com/huggingface/transformers/issues/31372) │
│     .If left unset or set to None, cache will not be emptied. (default:    │
│     None)                                                                  │
│ --trainer.learning-rate FLOAT                                              │
│     The initial learning rate for AdamW. (default: 5e-05)                  │
│ --trainer.weight-decay FLOAT                                               │
│     Weight decay for AdamW if we apply some. (default: 0.0)                │
│ --trainer.adam-beta1 FLOAT                                                 │
│     Beta1 for AdamW optimizer (default: 0.9)                               │
│ --trainer.adam-beta2 FLOAT                                                 │
│     Beta2 for AdamW optimizer (default: 0.999)                             │
│ --trainer.adam-epsilon FLOAT                                               │
│     Epsilon for AdamW optimizer. (default: 1e-08)                          │
│ --trainer.max-grad-norm FLOAT                                              │
│     Max gradient norm. (default: 1.0)                                      │
│ --trainer.num-train-epochs FLOAT                                           │
│     Total number of training epochs to perform. (default: 3.0)             │
│ --trainer.max-steps INT                                                    │
│     If > 0: set total number of training steps to perform. Override        │
│     num_train_epochs. (default: -1)                                        │
│ --trainer.lr-scheduler-type                                                │
│ {LINEAR,COSINE,COSINE_WITH_RESTARTS,POLYNOMIAL,CONSTANT,CONSTANT_WITH_WARM │
│ UP,INVERSE_SQRT,REDUCE_ON_PLATEAU,COSINE_WITH_MIN_LR,WARMUP_STABLE_DECAY}| │
│ STR                                                                        │
│     The scheduler type to use. (default: linear)                           │
│ --trainer.lr-scheduler-kwargs {None}|{[STR STR [STR STR ...]]}|STR         │
│     Extra parameters for the lr_scheduler such as {'num_cycles': 1} for    │
│     the cosine with hard restarts. (default: )                             │
│ --trainer.warmup-ratio FLOAT                                               │
│     Linear warmup over warmup_ratio fraction of total steps. (default:     │
│     0.0)                                                                   │
│ --trainer.warmup-steps INT                                                 │
│     Linear warmup over warmup_steps. (default: 0)                          │
│ --trainer.log-level {None}|STR                                             │
│     Logger log level to use on the main node. Possible choices are the log │
│     levels as strings: 'debug', 'info', 'warning', 'error' and 'critical', │
│     plus a 'passive' level which doesn't set anything and lets the         │
│     application set the level. Defaults to 'passive'. (default: passive)   │
│ --trainer.log-level-replica {None}|STR                                     │
│     Logger log level to use on replica nodes. Same choices and defaults as │
│     ``log_level`` (default: warning)                                       │
│ --trainer.log-on-each-node, --trainer.no-log-on-each-node                  │
│     When doing a multinode distributed training, whether to log once per   │
│     node or just once on the main node. (default: True)                    │
│ --trainer.logging-dir {None}|STR                                           │
│     Tensorboard log dir. (default: None)                                   │
│ --trainer.logging-strategy {NO,STEPS,EPOCH}|STR                            │
│     The logging strategy to use. (default: steps)                          │
│ --trainer.logging-first-step, --trainer.no-logging-first-step              │
│     Log the first global_step (default: False)                             │
│ --trainer.logging-steps FLOAT                                              │
│     Log every X updates steps. Should be an integer or a float in range    │
│     `[0,1)`. If smaller than 1, will be interpreted as ratio of total      │
│     training steps. (default: 500)                                         │
│ --trainer.logging-nan-inf-filter, --trainer.no-logging-nan-inf-filter      │
│     Filter nan and inf losses for logging. (default: True)                 │
│ --trainer.save-strategy {NO,STEPS,EPOCH,BEST}|STR                          │
│     The checkpoint save strategy to use. (default: steps)                  │
│ --trainer.save-steps FLOAT                                                 │
│     Save checkpoint every X updates steps. Should be an integer or a float │
│     in range `[0,1)`. If smaller than 1, will be interpreted as ratio of   │
│     total training steps. (default: 500)                                   │
│ --trainer.save-total-limit {None}|INT                                      │
│     If a value is passed, will limit the total amount of checkpoints.      │
│     Deletes the older checkpoints in `output_dir`. When                    │
│     `load_best_model_at_end` is enabled, the 'best' checkpoint according   │
│     to `metric_for_best_model` will always be retained in addition to the  │
│     most recent ones. For example, for `save_total_limit=5` and            │
│     `load_best_model_at_end=True`, the four last checkpoints will always   │
│     be retained alongside the best model. When `save_total_limit=1` and    │
│     `load_best_model_at_end=True`, it is possible that two checkpoints are │
│     saved: the last one and the best one (if they are different). Default  │
│     is unlimited checkpoints (default: None)                               │
│ --trainer.save-safetensors {None,True,False}                               │
│     Use safetensors saving and loading for state dicts instead of default  │
│     torch.load and torch.save. (default: True)                             │
│ --trainer.save-on-each-node, --trainer.no-save-on-each-node                │
│     When doing multi-node distributed training, whether to save models and │
│     checkpoints on each node, or only on the main one (default: False)     │
│ --trainer.save-only-model, --trainer.no-save-only-model                    │
│     When checkpointing, whether to only save the model, or also the        │
│     optimizer, scheduler & rng state. Note that when this is true, you     │
│     won't be able to resume training from checkpoint. This enables you to  │
│     save storage by not storing the optimizer, scheduler & rng state. You  │
│     can only load the model using from_pretrained with this option set to  │
│     True. (default: False)                                                 │
│ --trainer.restore-callback-states-from-checkpoint,                         │
│ --trainer.no-restore-callback-states-from-checkpoint                       │
│     Whether to restore the callback states from the checkpoint. If `True`, │
│     will override callbacks passed to the `Trainer` if they exist in the   │
│     checkpoint. (default: False)                                           │
│ --trainer.no-cuda, --trainer.no-no-cuda                                    │
│     This argument is deprecated. It will be removed in version 5.0 of 🤗   │
│     Transformers. (default: False)                                         │
│ --trainer.use-cpu, --trainer.no-use-cpu                                    │
│     Whether or not to use cpu. If left to False, we will use the available │
│     torch device/backend (cuda/mps/xpu/hpu etc.) (default: False)          │
│ --trainer.use-mps-device, --trainer.no-use-mps-device                      │
│     This argument is deprecated. `mps` device will be used if available    │
│     similar to `cuda` device. It will be removed in version 5.0 of 🤗      │
│     Transformers (default: False)                                          │
│ --trainer.seed INT                                                         │
│     Random seed that will be set at the beginning of training. (default:   │
│     42)                                                                    │
│ --trainer.data-seed {None}|INT                                             │
│     Random seed to be used with data samplers. (default: None)             │
│ --trainer.jit-mode-eval, --trainer.no-jit-mode-eval                        │
│     Whether or not to use PyTorch jit trace for inference (default: False) │
│ --trainer.use-ipex, --trainer.no-use-ipex                                  │
│     Use Intel extension for PyTorch when it is available, installation:    │
│     'https://github.com/intel/intel-extension-for-pytorch' (default:       │
│     False)                                                                 │
│ --trainer.bf16, --trainer.no-bf16                                          │
│     Whether to use bf16 (mixed) precision instead of 32-bit. Requires      │
│     Ampere or higher NVIDIA architecture or using CPU (use_cpu) or Ascend  │
│     NPU. This is an experimental API and it may change. (default: False)   │
│ --trainer.fp16, --trainer.no-fp16                                          │
│     Whether to use fp16 (mixed) precision instead of 32-bit (default:      │
│     False)                                                                 │
│ --trainer.fp16-opt-level STR                                               │
│     For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2',   │
│     and 'O3']. See details at https://nvidia.github.io/apex/amp.html       │
│     (default: O1)                                                          │
│ --trainer.half-precision-backend STR                                       │
│     The backend to be used for half precision. (default: auto)             │
│ --trainer.bf16-full-eval, --trainer.no-bf16-full-eval                      │
│     Whether to use full bfloat16 evaluation instead of 32-bit. This is an  │
│     experimental API and it may change. (default: False)                   │
│ --trainer.fp16-full-eval, --trainer.no-fp16-full-eval                      │
│     Whether to use full float16 evaluation instead of 32-bit (default:     │
│     False)                                                                 │
│ --trainer.tf32 {None,True,False}                                           │
│     Whether to enable tf32 mode, available in Ampere and newer GPU         │
│     architectures. This is an experimental API and it may change.          │
│     (default: None)                                                        │
│ --trainer.local-rank INT                                                   │
│     For distributed training: local_rank (default: -1)                     │
│ --trainer.ddp-backend {None}|STR                                           │
│     The backend to be used for distributed training (default: None)        │
│ --trainer.tpu-num-cores {None}|INT                                         │
│     TPU: Number of TPU cores (automatically passed by launcher script)     │
│     (default: None)                                                        │
│ --trainer.tpu-metrics-debug, --trainer.no-tpu-metrics-debug                │
│     Deprecated, the use of `--debug tpu_metrics_debug` is preferred. TPU:  │
│     Whether to print debug metrics (default: False)                        │
│ --trainer.debug STR|{[{UNDERFLOW_OVERFLOW,TPU_METRICS_DEBUG} [...]]}       │
│     Whether or not to enable debug mode. Current options:                  │
│     `underflow_overflow` (Detect underflow and overflow in activations and │
│     weights), `tpu_metrics_debug` (print debug metrics on TPU). (default:  │
│     '')                                                                    │
│ --trainer.dataloader-drop-last, --trainer.no-dataloader-drop-last          │
│     Drop the last incomplete batch if it is not divisible by the batch     │
│     size. (default: False)                                                 │
│ --trainer.eval-steps {None}|FLOAT                                          │
│     Run an evaluation every X steps. Should be an integer or a float in    │
│     range `[0,1)`. If smaller than 1, will be interpreted as ratio of      │
│     total training steps. (default: None)                                  │
│ --trainer.dataloader-num-workers INT                                       │
│     Number of subprocesses to use for data loading (PyTorch only). 0 means │
│     that the data will be loaded in the main process. (default: 0)         │
│ --trainer.dataloader-prefetch-factor {None}|INT                            │
│     Number of batches loaded in advance by each worker. 2 means there will │
│     be a total of 2 * num_workers batches prefetched across all workers.   │
│     (default: None)                                                        │
│ --trainer.past-index INT                                                   │
│     If >=0, uses the corresponding part of the output as the past state    │
│     for next step. (default: -1)                                           │
│ --trainer.run-name {None}|STR                                              │
│     An optional descriptor for the run. Notably used for wandb, mlflow,    │
│     comet and swanlab logging. (default: None)                             │
│ --trainer.disable-tqdm {None,True,False}                                   │
│     Whether or not to disable the tqdm progress bars. (default: None)      │
│ --trainer.remove-unused-columns {None,True,False}                          │
│     Remove columns not required by the model when using an nlp.Dataset.    │
│     (default: True)                                                        │
│ --trainer.label-names {None}|{[STR [STR ...]]}                             │
│     The list of keys in your dictionary of inputs that correspond to the   │
│     labels. (default: None)                                                │
│ --trainer.load-best-model-at-end {None,True,False}                         │
│     Whether or not to load the best model found during training at the end │
│     of training. When this option is enabled, the best checkpoint will     │
│     always be saved. See `save_total_limit` for more. (default: False)     │
│ --trainer.metric-for-best-model {None}|STR                                 │
│     The metric to use to compare two different models. (default: None)     │
│ --trainer.greater-is-better {None,True,False}                              │
│     Whether the `metric_for_best_model` should be maximized or not.        │
│     (default: None)                                                        │
│ --trainer.ignore-data-skip, --trainer.no-ignore-data-skip                  │
│     When resuming training, whether or not to skip the first epochs and    │
│     batches to get to the same training data. (default: False)             │
│ --trainer.fsdp                                                             │
│ {None}|{[{FULL_SHARD,SHARD_GRAD_OP,NO_SHARD,HYBRID_SHARD,HYBRID_SHARD_ZERO │
│ 2,OFFLOAD,AUTO_WRAP} [...]]}|STR                                           │
│     Whether or not to use PyTorch Fully Sharded Data Parallel (FSDP)       │
│     training (in distributed training only). The base option should be     │
│     `full_shard`, `shard_grad_op` or `no_shard` and you can add            │
│     CPU-offload to `full_shard` or `shard_grad_op` like this: `full_shard  │
│     offload` or `shard_grad_op offload`. You can add auto-wrap to          │
│     `full_shard` or `shard_grad_op` with the same syntax: `full_shard      │
│     auto_wrap` or `shard_grad_op auto_wrap`. (default: '')                 │
│ --trainer.fsdp-min-num-params INT                                          │
│     This parameter is deprecated. FSDP's minimum number of parameters for  │
│     Default Auto Wrapping. (useful only when `fsdp` field is passed).      │
│     (default: 0)                                                           │
│ --trainer.fsdp-config {None}|{[STR STR [STR STR ...]]}|STR                 │
│     Config to be used with FSDP (PyTorch Fully Sharded Data Parallel).     │
│     The value is either a fsdp json config file (e.g., `fsdp_config.json`) │
│     or an already loaded json file as `dict`. (default: None)              │
│ --trainer.fsdp-transformer-layer-cls-to-wrap {None}|STR                    │
│     This parameter is deprecated. Transformer layer class name             │
│     (case-sensitive) to wrap, e.g, `BertLayer`, `GPTJBlock`, `T5Block`     │
│     .... (useful only when `fsdp` flag is passed). (default: None)         │
│ --trainer.accelerator-config {None}|{[STR STR [STR STR ...]]}|STR          │
│     Config to be used with the internal Accelerator object initialization. │
│     The value is either an accelerator json config file (e.g.,             │
│     `accelerator_config.json`) or an already loaded json file as `dict`.   │
│     (default: None)                                                        │
│ --trainer.deepspeed {None}|{[STR STR [STR STR ...]]}|STR                   │
│     Enable deepspeed and pass the path to deepspeed json config file (e.g. │
│     `ds_config.json`) or an already loaded json file as a dict (default:   │
│     None)                                                                  │
│ --trainer.label-smoothing-factor FLOAT                                     │
│     The label smoothing epsilon to apply (zero means no label smoothing).  │
│     (default: 0.0)                                                         │
│ --trainer.optim                                                            │
│ {ADAMW_TORCH,ADAMW_TORCH_FUSED,ADAMW_TORCH_XLA,ADAMW_TORCH_NPU_FUSED,ADAMW │
│ _APEX_FUSED,ADAFACTOR,ADAMW_ANYPRECISION,ADAMW_TORCH_4BIT,ADAMW_TORCH_8BIT │
│ ,ADEMAMIX,SGD,ADAGRAD,ADAMW_BNB,ADAMW_8BIT,ADEMAMIX_8BIT,LION_8BIT,LION,PA │
│ GED_ADAMW,PAGED_ADAMW_8BIT,PAGED_ADEMAMIX,PAGED_ADEMAMIX_8BIT,PAGED_LION,P │
│ AGED_LION_8BIT,RMSPROP,RMSPROP_BNB,RMSPROP_8BIT,RMSPROP_32BIT,GALORE_ADAMW │
│ ,GALORE_ADAMW_8BIT,GALORE_ADAFACTOR,GALORE_ADAMW_LAYERWISE,GALORE_ADAMW_8B │
│ IT_LAYERWISE,GALORE_ADAFACTOR_LAYERWISE,LOMO,ADALOMO,GROKADAMW,SCHEDULE_FR │
│ EE_RADAM,SCHEDULE_FREE_ADAMW,SCHEDULE_FREE_SGD,APOLLO_ADAMW,APOLLO_ADAMW_L │
│ AYERWISE}|STR                                                              │
│     The optimizer to use. (default: adamw_torch)                           │
│ --trainer.optim-args {None}|STR                                            │
│     Optional arguments to supply to optimizer. (default: None)             │
│ --trainer.adafactor, --trainer.no-adafactor                                │
│     Whether or not to replace AdamW by Adafactor. (default: False)         │
│ --trainer.group-by-length, --trainer.no-group-by-length                    │
│     Whether or not to group samples of roughly the same length together    │
│     when batching. (default: False)                                        │
│ --trainer.length-column-name {None}|STR                                    │
│     Column name with precomputed lengths to use when grouping by length.   │
│     (default: length)                                                      │
│ --trainer.report-to {None}|STR|{[STR [STR ...]]}                           │
│     The list of integrations to report the results and logs to. (default:  │
│     None)                                                                  │
│ --trainer.ddp-find-unused-parameters {None,True,False}                     │
│     When using distributed training, the value of the flag                 │
│     `find_unused_parameters` passed to `DistributedDataParallel`.          │
│     (default: None)                                                        │
│ --trainer.ddp-bucket-cap-mb {None}|INT                                     │
│     When using distributed training, the value of the flag `bucket_cap_mb` │
│     passed to `DistributedDataParallel`. (default: None)                   │
│ --trainer.ddp-broadcast-buffers {None,True,False}                          │
│     When using distributed training, the value of the flag                 │
│     `broadcast_buffers` passed to `DistributedDataParallel`. (default:     │
│     None)                                                                  │
│ --trainer.dataloader-pin-memory, --trainer.no-dataloader-pin-memory        │
│     Whether or not to pin memory for DataLoader. (default: True)           │
│ --trainer.dataloader-persistent-workers,                                   │
│ --trainer.no-dataloader-persistent-workers                                 │
│     If True, the data loader will not shut down the worker processes after │
│     a dataset has been consumed once. This keeps the workers'              │
│     Dataset instances alive. Can potentially speed up training, but will   │
│     increase RAM usage. (default: False)                                   │
│ --trainer.skip-memory-metrics, --trainer.no-skip-memory-metrics            │
│     Whether or not to skip adding of memory profiler reports to metrics.   │
│     (default: True)                                                        │
│ --trainer.use-legacy-prediction-loop,                                      │
│ --trainer.no-use-legacy-prediction-loop                                    │
│     Whether or not to use the legacy prediction_loop in the Trainer.       │
│     (default: False)                                                       │
│ --trainer.push-to-hub, --trainer.no-push-to-hub                            │
│     Whether or not to upload the trained model to the model hub after      │
│     training. (default: False)                                             │
│ --trainer.resume-from-checkpoint {None}|STR                                │
│     The path to a folder with a valid checkpoint for your model. (default: │
│     None)                                                                  │
│ --trainer.hub-model-id {None}|STR                                          │
│     The name of the repository to keep in sync with the local              │
│     `output_dir`. (default: None)                                          │
│ --trainer.hub-strategy {END,EVERY_SAVE,CHECKPOINT,ALL_CHECKPOINTS}|STR     │
│     The hub strategy to use when `--push_to_hub` is activated. (default:   │
│     every_save)                                                            │
│ --trainer.hub-token {None}|STR                                             │
│     The token to use to push to the Model Hub. (default: None)             │
│ --trainer.hub-private-repo {None,True,False}                               │
│     Whether to make the repo private. If `None` (default), the repo will   │
│     be public unless the organization's default is private. This value is  │
│     ignored if the repo already exists. (default: None)                    │
│ --trainer.hub-always-push, --trainer.no-hub-always-push                    │
│     Unless `True`, the Trainer will skip pushes if the previous one wasn't │
│     finished yet. (default: False)                                         │
│ --trainer.gradient-checkpointing, --trainer.no-gradient-checkpointing      │
│     If True, use gradient checkpointing to save memory at the expense of   │
│     slower backward pass. (default: False)                                 │
│ --trainer.gradient-checkpointing-kwargs {None}|{[STR STR [STR STR          │
│ ...]]}|STR                                                                 │
│     Gradient checkpointing key word arguments such as `use_reentrant`.     │
│     Will be passed to `torch.utils.checkpoint.checkpoint` through          │
│     `model.gradient_checkpointing_enable`. (default: None)                 │
│ --trainer.include-inputs-for-metrics,                                      │
│ --trainer.no-include-inputs-for-metrics                                    │
│     This argument is deprecated and will be removed in version 5 of 🤗     │
│     Transformers. Use `include_for_metrics` instead. (default: False)      │
│ --trainer.include-for-metrics [STR [STR ...]]                              │
│     List of strings to specify additional data to include in the           │
│     `compute_metrics` function.Options: 'inputs', 'loss'. (default: )      │
│ --trainer.eval-do-concat-batches, --trainer.no-eval-do-concat-batches      │
│     Whether to recursively concat inputs/losses/labels/predictions across  │
│     batches. If `False`, will instead store them as lists, with each batch │
│     kept separate. (default: True)                                         │
│ --trainer.fp16-backend STR                                                 │
│     Deprecated. Use half_precision_backend instead (default: auto)         │
│ --trainer.push-to-hub-model-id {None}|STR                                  │
│     The name of the repository to which push the `Trainer`. (default:      │
│     None)                                                                  │
│ --trainer.push-to-hub-organization {None}|STR                              │
│     The name of the organization to which to push the `Trainer`.           │
│     (default: None)                                                        │
│ --trainer.push-to-hub-token {None}|STR                                     │
│     The token to use to push to the Model Hub. (default: None)             │
│ --trainer.mp-parameters STR                                                │
│     Used by the SageMaker launcher to send mp-specific args. Ignored in    │
│     Trainer (default: '')                                                  │
│ --trainer.auto-find-batch-size, --trainer.no-auto-find-batch-size          │
│     Whether to automatically decrease the batch size in half and rerun the │
│     training loop again each time a CUDA Out-of-Memory was reached         │
│     (default: False)                                                       │
│ --trainer.full-determinism, --trainer.no-full-determinism                  │
│     Whether to call enable_full_determinism instead of set_seed for        │
│     reproducibility in distributed training. Important: this will          │
│     negatively impact the performance, so only use it for debugging.       │
│     (default: False)                                                       │
│ --trainer.torchdynamo {None}|STR                                           │
│     This argument is deprecated, use `--torch_compile_backend` instead.    │
│     (default: None)                                                        │
│ --trainer.ray-scope {None}|STR                                             │
│     The scope to use when doing hyperparameter search with Ray. By         │
│     default, `"last"` will be used. Ray will then use the last checkpoint  │
│     of all trials, compare those, and select the best one. However, other  │
│     options are also available. See the Ray documentation                  │
│     (https://docs.ray.io/en/latest/tune/api_docs/analysis.html#ray.tune.Ex │
│     perimentAnalysis.get_best_trial) for more options. (default: last)     │
│ --trainer.ddp-timeout {None}|INT                                           │
│     Overrides the default timeout for distributed training (value should   │
│     be given in seconds). (default: 1800)                                  │
│ --trainer.torch-compile, --trainer.no-torch-compile                        │
│     If set to `True`, the model will be wrapped in `torch.compile`.        │
│     (default: False)                                                       │
│ --trainer.torch-compile-backend {None}|STR                                 │
│     Which backend to use with `torch.compile`, passing one will trigger a  │
│     model compilation. (default: None)                                     │
│ --trainer.torch-compile-mode {None}|STR                                    │
│     Which mode to use with `torch.compile`, passing one will trigger a     │
│     model compilation. (default: None)                                     │
│ --trainer.include-tokens-per-second {None,True,False}                      │
│     If set to `True`, the speed metrics will include `tgs` (tokens per     │
│     second per device). (default: False)                                   │
│ --trainer.include-num-input-tokens-seen {None,True,False}                  │
│     If set to `True`, will track the number of input tokens seen           │
│     throughout training. (May be slower in distributed training) (default: │
│     False)                                                                 │
│ --trainer.neftune-noise-alpha {None}|FLOAT                                 │
│     Activates neftune noise embeddings into the model. NEFTune has been    │
│     proven to drastically improve model performances for instruction       │
│     fine-tuning. Check out the original paper here:                        │
│     https://arxiv.org/abs/2310.05914 and the original code here:           │
│     https://github.com/neelsjain/NEFTune. Only supported for               │
│     `PreTrainedModel` and `PeftModel` classes. (default: None)             │
│ --trainer.optim-target-modules {None}|STR|{[STR [STR ...]]}                │
│     Target modules for the optimizer defined in the `optim` argument. Only │
│     used for the GaLore optimizer at the moment. (default: None)           │
│ --trainer.batch-eval-metrics, --trainer.no-batch-eval-metrics              │
│     Break eval metrics calculation into batches to save memory. (default:  │
│     False)                                                                 │
│ --trainer.eval-on-start, --trainer.no-eval-on-start                        │
│     Whether to run through the entire `evaluation` step at the very        │
│     beginning of training as a sanity check. (default: False)              │
│ --trainer.use-liger-kernel {None,True,False}                               │
│     Whether or not to enable the Liger Kernel for model training.          │
│     (default: False)                                                       │
│ --trainer.eval-use-gather-object {None,True,False}                         │
│     Whether to run recursively gather object in a nested                   │
│     list/tuple/dictionary of objects from all devices. (default: False)    │
│ --trainer.average-tokens-across-devices {None,True,False}                  │
│     Whether or not to average tokens across devices. If enabled, will use  │
│     all_reduce to synchronize num_tokens_in_batch for precise loss         │
│     calculation. Reference:                                                │
│     https://github.com/huggingface/transformers/issues/34242 (default:     │
│     False)                                                                 │
╰────────────────────────────────────────────────────────────────────────────╯

Training GPT-2 on WikiText in One Line¶

The following command runs our script end-to-end: installing all dependencies, downloading the model and data, training, logging to TensorBoard, etc. For multi-node training when you are not using SLURM, you should also pass the node hostnames explicitly, e.g. --launcher.hostnames node1 node2.

Pre-requisite: uv

uv run --python "3.12" https://torchrun.xyz/transformers_train.py \
   --model.name gpt2 \
   --dataset.path "Salesforce/wikitext" --dataset.name "wikitext-2-v1" --dataset.split "train" --dataset.num-samples 80 \
   --trainer.output-dir output --trainer.per-device-train-batch-size 4 --trainer.report-to tensorboard

You can visualize the logs with:

uv run --with tensorboard tensorboard --logdir output/runs

Script¶

from __future__ import annotations

import functools
import logging
import os
from dataclasses import dataclass
from typing import Annotated

import tyro
from datasets import Dataset, load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    PreTrainedModel,
    Trainer,
    TrainingArguments,
    trainer_utils,
)

import torchrunx

# Configure the root logger so that INFO-level (and more severe) records are emitted.
logging.basicConfig(level=logging.INFO)

@dataclass
class ModelConfig:
    """Selects the pretrained model to fine-tune."""

    # Identifier passed to `from_pretrained` for both the causal LM and its tokenizer.
    name: str


@dataclass
class DatasetConfig:
    """Describes which dataset to load and how to read text from it."""

    # Forwarded to `load_dataset` as its first positional argument.
    path: str
    # Forwarded to `load_dataset` as `name`.
    name: str | None = None
    # Forwarded to `load_dataset` as `split`.
    split: str | None = None
    # Column that is fed to the tokenizer and then removed from the dataset.
    text_column: str = "text"
    # If set, only the first `num_samples` rows of the loaded dataset are used.
    num_samples: int | None = None


def load_training_data(
    tokenizer_name: str,
    dataset_config: DatasetConfig,
) -> Dataset:
    """Load a text dataset and tokenize it for causal language modeling.

    Side effect: sets the ``TOKENIZERS_PARALLELISM`` environment variable to
    ``"false"`` for the current process.

    Args:
        tokenizer_name: Identifier passed to ``AutoTokenizer.from_pretrained``.
        dataset_config: Which dataset to load, which column holds the text, and
            (optionally) how many leading rows to keep.

    Returns:
        The dataset with the text column removed and the tokenizer's outputs
        added, plus a ``labels`` column that is a copy of ``input_ids``.
    """
    # Load dataset

    dataset = load_dataset(
        dataset_config.path, name=dataset_config.name, split=dataset_config.split
    )
    if dataset_config.num_samples is not None:
        # Clamp to the rows that actually exist: selecting indices past the end
        # of the dataset would otherwise raise an IndexError.
        num_available = len(dataset)
        num_samples = min(dataset_config.num_samples, num_available)
        if num_samples < dataset_config.num_samples:
            logging.warning(
                "Requested %d samples but the dataset only has %d; using all of them.",
                dataset_config.num_samples,
                num_available,
            )
        dataset = dataset.select(range(num_samples))

    # Build tokenizer

    os.environ["TOKENIZERS_PARALLELISM"] = "false"  # to suppress warnings
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    if tokenizer.pad_token is None:
        # Padding is requested below, so a pad token must exist; reuse EOS.
        tokenizer.pad_token = tokenizer.eos_token
    # Every example is truncated/padded to the tokenizer's `model_max_length`.
    tokenize_fn = functools.partial(
        tokenizer,
        max_length=tokenizer.model_max_length,
        truncation=True,
        padding="max_length",
    )

    # Tokenize dataset

    tokenized = dataset.map(
        tokenize_fn,
        batched=True,
        input_columns=[dataset_config.text_column],
        remove_columns=[dataset_config.text_column],
    )
    # Labels are a copy of the input ids.
    # NOTE(review): this copy includes padded positions; masking them out of the
    # loss (e.g. with -100) may be desirable -- confirm before changing.
    return tokenized.map(lambda x: {"labels": x["input_ids"]})


def train(
    model_config: ModelConfig,
    dataset_config: DatasetConfig,
    training_args: TrainingArguments,
) -> str | None:
    """Fine-tune a causal language model and report its latest checkpoint.

    Args:
        model_config: Names the pretrained model; the same name is used to load
            the tokenizer.
        dataset_config: Describes the training dataset to load and tokenize.
        training_args: Passed unchanged to ``Trainer``.

    Returns:
        The value of ``trainer_utils.get_last_checkpoint`` for
        ``training_args.output_dir``: the path of the most recent checkpoint,
        or ``None`` when that directory contains no checkpoint.
    """
    model = AutoModelForCausalLM.from_pretrained(model_config.name)
    train_dataset = load_training_data(
        tokenizer_name=model_config.name, dataset_config=dataset_config
    )

    trainer = Trainer(model=model, train_dataset=train_dataset, args=training_args)
    trainer.train()
    # May be None, e.g. if no checkpoint was written to the output directory.
    return trainer_utils.get_last_checkpoint(training_args.output_dir)


def main(
    launcher: torchrunx.Launcher,
    model_config: Annotated[ModelConfig, tyro.conf.arg(name="model")],
    dataset_config: Annotated[DatasetConfig, tyro.conf.arg(name="dataset")],
    training_args: Annotated[TrainingArguments, tyro.conf.arg(name="trainer", help="")],
) -> None:
    """Launch distributed fine-tuning, then reload the trained model from its checkpoint."""
    # Run `train` through the launcher with the parsed configuration objects.
    results = launcher.run(train, model_config, dataset_config, training_args)

    # Loading trained model from checkpoint
    checkpoint_path = results.rank(0)  # value reported for rank 0
    if checkpoint_path is None:
        # `train` found no checkpoint; passing None to `from_pretrained` would
        # only fail with an unrelated-looking error, so stop here instead.
        logging.warning(
            "No checkpoint found in %s; skipping reload of the trained model.",
            training_args.output_dir,
        )
        return
    trained_model = AutoModelForCausalLM.from_pretrained(checkpoint_path)
    logging.info(
        "Loaded trained %s from %s", type(trained_model).__name__, checkpoint_path
    )


if __name__ == "__main__":
    # Script entry point: hand `main` to tyro, which parses the command line
    # and calls it.
    tyro.cli(main)