|
- import argparse
- import logging
- import os
-
- import numpy as np
- import torch
- from torch import nn
- import torch.optim as optim
- from torch.utils.data.sampler import RandomSampler
- from tqdm import tqdm
-
- import utils
- import model.net as net
- from evaluate import evaluate
- from dataloader import *
-
- import matplotlib
- matplotlib.use('Agg')
- import matplotlib.pyplot as plt
-
- logger = logging.getLogger('DeepAR.Train')
-
- parser = argparse.ArgumentParser()
- parser.add_argument('--dataset', default='elect', help='Name of the dataset')
- parser.add_argument('--data-folder', default='data', help='Parent dir of the dataset')
- parser.add_argument('--model-name', default='base_model', help='Directory containing params.json')
- parser.add_argument('--relative-metrics', action='store_true', help='Whether to normalize the metrics by label scales') # 是否通过标签比例标准化度量标准
- parser.add_argument('--sampling', action='store_true', help='Whether to sample during evaluation')
- parser.add_argument('--save-best', action='store_true', help='Whether to save best ND to param_search.txt')
- parser.add_argument('--restore-file', default=None,
- help='Optional, name of the file in --model_dir containing weights to reload before \
- training') # 'best' or 'epoch_#'
-
-
- def train(model: nn.Module,
- optimizer: optim,
- loss_fn,
- train_loader: DataLoader,
- test_loader: DataLoader,
- params: utils.Params,
- epoch: int) -> float:
- '''Train the model on one epoch by batches.就是训练一个epoch的步骤
- Args:
- model: (torch.nn.Module) the neural network
- optimizer: (torch.optim) optimizer for parameters of model
- loss_fn: a function that takes outputs and labels per timestep, and then computes the loss for the batch 一个函数,它在每个时间步中获取输出和标签,然后计算批处理的损失
- train_loader: load train data and labels
- test_loader: load test data and labels
- params: (Params) hyperparameters
- epoch: (int) the current training epoch
- '''
- model.train() # 打开训练模式
- loss_epoch = np.zeros(len(train_loader))
- # Train_loader:
- # train_batch ([batch_size, train_window, 1+cov_dim]): z_{0:T-1} + x_{1:T}, note that z_0 = 0;
- # idx ([batch_size]): one integer denoting the time series id; 一个整数,表示时间序列id
- # labels_batch ([batch_size, train_window]): z_{1:T}.
- for i, (train_batch, idx, labels_batch) in enumerate(tqdm(train_loader)):
- optimizer.zero_grad() # 梯度清零
- batch_size = train_batch.shape[0] # 获取 batch_size
-
- train_batch = train_batch.permute(1, 0, 2).to(torch.float32).to(params.device) # not scaled [W, B, F]其中,W为滑动窗口的大小
- labels_batch = labels_batch.permute(1, 0).to(torch.float32).to(params.device) # not scaled [W, F]
- idx = idx.unsqueeze(0).to(params.device) # [1, B] 用户的ID
-
- loss = torch.zeros(1, device=params.device)
- hidden = model.init_hidden(batch_size)
- cell = model.init_cell(batch_size)
-
- for t in range(params.train_window): # 按序列来遍历,这样就保证了前后的时序关系
- # if z_t is missing, replace it by output mu from the last time step 如果z_t缺失,则将其替换为上一时间步的输出mu
- zero_index = (train_batch[t, :, 0] == 0) # 返回True or False
- if t > 0 and torch.sum(zero_index) > 0:
- train_batch[t, zero_index, 0] = mu[zero_index] # 第0列之前是空的(用零填充的),现在需要替换为上一时间步的输出mu
- mu, sigma, hidden, cell = model(train_batch[t].unsqueeze_(0).clone(), idx, hidden, cell)
- loss += loss_fn(mu, sigma, labels_batch[t]) # 每一个时间步骤的loss累加
-
- loss.backward()
- optimizer.step()
- loss = loss.item() / params.train_window # loss per timestep 之前累加了所有时间步的loss,所以取平均
- loss_epoch[i] = loss
- if i % 1000 == 0: # 1000个batch测试一次
- test_metrics = evaluate(model, loss_fn, test_loader, params, epoch, sample=args.sampling)
- model.train()
- logger.info(f'train_loss: {loss}')
- if i == 0:
- logger.info(f'train_loss: {loss}')
- return loss_epoch # 返回一个epoch的损失
-
-
- def train_and_evaluate(model: nn.Module,
- train_loader: DataLoader,
- test_loader: DataLoader,
- optimizer: optim, loss_fn,
- params: utils.Params,
- restore_file: str = None) -> None:
- '''Train the model and evaluate every epoch.
- Args:
- model: (torch.nn.Module) the Deep AR model
- train_loader: load train data and labels
- test_loader: load test data and labels
- optimizer: (torch.optim) optimizer for parameters of model
- loss_fn: a function that takes outputs and labels per timestep, and then computes the loss for the batch
- params: (Params) hyperparameters
- restore_file: (string) optional- name of file to restore from (without its extension .pth.tar)
- '''
- # reload weights from restore_file if specified
- if restore_file is not None:
- restore_path = os.path.join(params.model_dir, restore_file + '.pth.tar') # 保存模型路径
- logger.info('Restoring parameters from {}'.format(restore_path))
- utils.load_checkpoint(restore_path, model, optimizer)
- logger.info('begin training and evaluation')
- best_test_ND = float('inf') # 初始设置成正无穷大
- train_len = len(train_loader) # 训练集的长度
- ND_summary = np.zeros(params.num_epochs) # 一维,长度为epoch
- loss_summary = np.zeros((train_len * params.num_epochs))
- for epoch in range(params.num_epochs): # 进行epoch个轮次训练
- logger.info('Epoch {}/{}'.format(epoch + 1, params.num_epochs))
- loss_summary[epoch * train_len:(epoch + 1) * train_len] = train(model, optimizer, loss_fn, train_loader,
- test_loader, params, epoch) # 训练
- test_metrics = evaluate(model, loss_fn, test_loader, params, epoch, sample=args.sampling) # 每个epoch完了之后,测试一次
- ND_summary[epoch] = test_metrics['ND'] # 把测试的ND结果,保存起来
- is_best = ND_summary[epoch] <= best_test_ND # 判断是不是当前的ND比之前的要好,若好,则返回True,否则返回False
-
- # Save weights
- utils.save_checkpoint({'epoch': epoch + 1,
- 'state_dict': model.state_dict(),
- 'optim_dict': optimizer.state_dict()},
- epoch=epoch,
- is_best=is_best,
- checkpoint=params.model_dir)
-
- if is_best: # 当前训练的epoch结果在测试结果上最好,则保存测试的ND,RMSE,test_loss等
- logger.info('- Found new best ND')
- best_test_ND = ND_summary[epoch]
- best_json_path = os.path.join(params.model_dir, 'metrics_test_best_weights.json')
- utils.save_dict_to_json(test_metrics, best_json_path)
-
- logger.info('Current Best ND is: %.5f' % best_test_ND)
-
- utils.plot_all_epoch(ND_summary[:epoch + 1], args.dataset + '_ND', params.plot_dir) # 画图
- utils.plot_all_epoch(loss_summary[:(epoch + 1) * train_len], args.dataset + '_loss', params.plot_dir)
-
- last_json_path = os.path.join(params.model_dir, 'metrics_test_last_weights.json') # 保存最后一个epoch的测试
- utils.save_dict_to_json(test_metrics, last_json_path)
-
- if args.save_best:
- f = open('./param_search.txt', 'w')
- f.write('-----------\n')
- list_of_params = args.search_params.split(',')
- print_params = ''
- for param in list_of_params:
- param_value = getattr(params, param)
- print_params += f'{param}: {param_value:.2f}'
- print_params = print_params[:-1]
- f.write(print_params + '\n')
- f.write('Best ND: ' + str(best_test_ND) + '\n')
- logger.info(print_params)
- logger.info(f'Best ND: {best_test_ND}')
- f.close()
- utils.plot_all_epoch(ND_summary, print_params + '_ND', location=params.plot_dir) # 画图
- utils.plot_all_epoch(loss_summary, print_params + '_loss', location=params.plot_dir)
-
-
- if __name__ == '__main__':
-
- # Load the parameters from json file
- args = parser.parse_args()
- model_dir = os.path.join('experiments', args.model_name)
- json_path = os.path.join(model_dir, 'params.json')
- data_dir = os.path.join(args.data_folder, args.dataset)
- assert os.path.isfile(json_path), f'No json configuration file found at {json_path}'
- params = utils.Params(json_path)
-
- # 下面这几个就是字典增加键值对
- params.relative_metrics = args.relative_metrics
- params.sampling = args.sampling
- params.model_dir = model_dir
- params.plot_dir = os.path.join(model_dir, 'figures')
-
- # create missing directories
- try:
- os.mkdir(params.plot_dir)
- except FileExistsError:
- pass
-
- # use GPU if available
- cuda_exist = torch.cuda.is_available()
- # Set random seeds for reproducible experiments if necessary
- if cuda_exist:
- params.device = torch.device('cuda')
- # torch.cuda.manual_seed(240)
- logger.info('Using Cuda...')
- model = net.Net(params).cuda() # 把模型放入GPU的显存
- else:
- params.device = torch.device('cpu')
- # torch.manual_seed(230)
- logger.info('Not using cuda...')
- model = net.Net(params) # 模型放入内存
-
- utils.set_logger(os.path.join(model_dir, 'train.log'))
- logger.info('Loading the datasets...')
-
- train_set = TrainDataset(data_dir, args.dataset, params.num_class) # 训练集
- test_set = TestDataset(data_dir, args.dataset, params.num_class) # 测试集
- sampler = WeightedSampler(data_dir, args.dataset) # Use weighted sampler instead of random sampler。使用加权取样器代替随机取样器
- train_loader = DataLoader(train_set, batch_size=params.batch_size, sampler=sampler, num_workers=4) # 使用pytorch的DataLoader包装一下
- test_loader = DataLoader(test_set, batch_size=params.predict_batch, sampler=RandomSampler(test_set), num_workers=4) # 同上
- logger.info('Loading complete.')
-
- logger.info(f'Model: \n{str(model)}')
- optimizer = optim.Adam(model.parameters(), lr=params.learning_rate) # Adam优化器
-
- # fetch loss function
- loss_fn = net.loss_fn
-
- # Train the model
- logger.info('Starting training for {} epoch(s)'.format(params.num_epochs))
- train_and_evaluate(model,
- train_loader,
- test_loader,
- optimizer,
- loss_fn,
- params,
- args.restore_file) # 模型训练
|