|
- # -*- coding: UTF-8 -*-
- """
- -----------------------------------
- @Author : Encore
- @Date : 2024/3/29
- -----------------------------------
- """
- import os
-
- from tqdm import tqdm
-
- import torch
- import torch.nn.functional as F
- from torch.utils.data import Dataset, DataLoader
- from torch import nn
-
- from torch.utils.data.distributed import DistributedSampler
- from torch.nn.parallel import DistributedDataParallel
- from torch.distributed import init_process_group, destroy_process_group
- from torch.optim import AdamW
-
- import transformers
- from transformers import DataCollatorForSeq2Seq, TrainingArguments, get_linear_schedule_with_warmup
- import copy
- import logging
- import json
- from dataclasses import dataclass, field
- from typing import Callable, List, Optional, Set, Tuple, Union
-
- from data import TranslationDataset
-
-
- @dataclass
- class ModelArguments:
- model_name_or_path: Optional[str] = field(default=None)
-
-
- @dataclass
- class DataArguments:
- data_path: str = field(default=None, metadata={"help": "Path to the training data."})
-
-
- def ddp_setup():
- init_process_group(backend="nccl")
- # torch.cuda.set_device(int(os.environ["LOCAL_RANK"]))
-
-
- def get_parameter_names(model, forbidden_layer_types):
- """
- Returns the names of the model parameters that are not inside a forbidden layer.
- """
- result = []
- for name, child in model.named_children():
- result += [
- f"{name}.{n}"
- for n in get_parameter_names(child, forbidden_layer_types)
- if not isinstance(child, tuple(forbidden_layer_types))
- ]
- # Add model specific parameters (defined with nn.Parameter) since they are not in any child.
- result += list(model._parameters.keys())
- return result
-
-
- def get_decay_parameter_names(model) -> List[str]:
- """
- Get all parameter names that weight decay will be applied to
-
- Note that some models implement their own layernorm instead of calling nn.LayerNorm, weight decay could still
- apply to those modules since this function only filter out instance of nn.LayerNorm
- """
- decay_parameters = get_parameter_names(model, [nn.LayerNorm])
- decay_parameters = [name for name in decay_parameters if "bias" not in name]
- return decay_parameters
-
-
- def train():
- parser = transformers.HfArgumentParser((ModelArguments, DataArguments, TrainingArguments))
- model_args, data_args, training_args = parser.parse_args_into_dataclasses()
-
- local_rank = int(os.environ["LOCAL_RANK"])
- global_rank = os.environ["RANK"]
-
- model = transformers.MBartForConditionalGeneration.from_pretrained(
- model_args.model_name_or_path,
- )
- print(model.dtype)
- model = model.to(local_rank)
- # model = model.to(f"cuda:{local_rank}")
-
- model_wrapped = DistributedDataParallel(model)
-
- tokenizer = transformers.MBart50TokenizerFast.from_pretrained(
- model_args.model_name_or_path,
- src_lang="zh_CN",
- tgt_lang="en_XX",
- )
-
- train_dataset = TranslationDataset(tokenizer=tokenizer)
- train_dataset.read_file(data_args.data_path)
- data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer)
- train_dataloader = DataLoader(train_dataset,
- batch_size=training_args.per_device_train_batch_size,
- shuffle=True,
- collate_fn=data_collator)
-
- decay_parameters = get_decay_parameter_names(model)
- optimizer_kwargs = {
- "betas": (training_args.adam_beta1, training_args.adam_beta2),
- "eps": training_args.adam_epsilon,
- "lr": training_args.learning_rate
- }
- optimizer_grouped_parameters = [
- {
- "params": [
- p for n, p in model.named_parameters() if (n in decay_parameters and p.requires_grad)
- ],
- "weight_decay": training_args.weight_decay,
- },
- {
- "params": [
- p for n, p in model.named_parameters() if (n not in decay_parameters and p.requires_grad)
- ],
- "weight_decay": 0.0,
- },
- ]
- optimizer = AdamW(optimizer_grouped_parameters, **optimizer_kwargs)
-
- num_examples = len(train_dataset)
- max_steps = training_args.max_steps
- step_per_epoch = len(train_dataloader)
- warmup_step = int(max_steps * training_args.warmup_ratio)
- num_train_epochs = training_args.max_steps // step_per_epoch + int(training_args.max_steps % step_per_epoch > 0)
- lr_scheduler = get_linear_schedule_with_warmup(optimizer, warmup_step, max_steps)
-
- step = 0
- tr_loss = 0
-
- print("***** Running training *****")
- print(f" Num examples = {num_examples:,}")
- print(f" Max steps = {max_steps:,}")
- print(f" num train epochs = {num_train_epochs:,}")
-
- pbar = tqdm(total=max_steps, desc="Training")
- model_wrapped.train()
- for epoch in range(num_train_epochs):
- for inputs in train_dataloader:
- # print(inputs)
- inputs = inputs.to(local_rank)
- outputs = model_wrapped(**inputs)
- loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0]
- optimizer.zero_grad()
- loss.backward()
- optimizer.step()
- lr_scheduler.step()
-
- step += 1
- tr_loss += loss.detach().item()
-
- pbar.update()
-
- if step % training_args.logging_steps == 0:
- print({
- "loss": tr_loss / training_args.logging_steps,
- "step": step
- })
- tr_loss = 0
-
- if step % training_args.save_steps == 0:
- output_dir = f"{training_args.output_dir}/checkpoint-{step}"
- os.makedirs(output_dir, exist_ok=True)
- print(f"Saving model checkpoint to {output_dir}")
- model.save_pretrained(output_dir)
-
- if step == max_steps:
- break
-
- pbar.close()
- print("train done!")
- destroy_process_group()
-
-
- if __name__ == "__main__":
- train()
|