From c7f43fd63918221b1bec57e6e923db1f31145cf0 Mon Sep 17 00:00:00 2001
From: LeiZhang
Date: Thu, 2 Jun 2022 15:24:05 +0800
Subject: [PATCH] fix PBT bug

---
 .../PopulationBasedTraining/mnist_model.py    | 79 +++++++--------
 examples/PopulationBasedTraining/mnist_pbt.py | 19 ++--
 examples/PopulationBasedTraining/toy_model.py |  4 +-
 xbbo/problem/pbt_toy.py                       | 99 -------------------
 xbbo/search_algorithm/pbt_optimizer.py        |  1 +
 5 files changed, 51 insertions(+), 151 deletions(-)
 delete mode 100644 xbbo/problem/pbt_toy.py

diff --git a/examples/PopulationBasedTraining/mnist_model.py b/examples/PopulationBasedTraining/mnist_model.py
index a588d46..23bedf1 100644
--- a/examples/PopulationBasedTraining/mnist_model.py
+++ b/examples/PopulationBasedTraining/mnist_model.py
@@ -65,6 +65,7 @@ class Model(Abstract_PBT_Model):
         self.device = torch.device(kwargs.get('device', 'cpu'))
         self.train_loader = DataLoader(Model.trn_dataset, batch_size=64, shuffle=False, num_workers=0)
         self.test_loader = DataLoader(Model.tst_dataset, batch_size=64, shuffle=False, num_workers=0)
+        self.iter_train_loader = iter(self.train_loader)
         self.net = ConvNet().to(self.device)
         # self.opt_wrap = lambda params: optim.SGD(self.net.parameters(), lr=lr, momentum=momentum)
         self.opt = optim.SGD(self.net.parameters(), lr=0.11, momentum=0.9)
@@ -73,8 +74,8 @@ class Model(Abstract_PBT_Model):
         self.ready = False  # not ready
         self.history_hp = []  # for record strategy
         self.trajectory_hp = []
-        self.trajectory_score = []  # record this individual's score over time
-        self.history_loss = []  # record the score trajectory under the applied hp-strategy (incl. weight transfer)
+        self.trajectory_loss = []  # record this individual's loss over time
+        self.history_loss = []  # record the loss trajectory under the applied hp-strategy (incl. weight transfer)
 
     def __len__(self):  # how many batches in one epoch
         return len(self.train_loader)
@@ -86,43 +87,34 @@ class Model(Abstract_PBT_Model):
         for param_group in self.opt.param_groups:
             param_group[hyperparam_name] = v
 
-    def step(self, num):  # train need training(optimizer)
+    def _one_step(self, **kwargs) -> float:  # run one optimizer step on the next batch
+        try:
+            inp, target = next(self.iter_train_loader)
+        except StopIteration:  # epoch exhausted: restart the persistent iterator
+            self.iter_train_loader = iter(self.train_loader)
+            inp, target = next(self.iter_train_loader)
+        inp = inp.to(self.device)
+        target = target.to(self.device)
+        output = self.net(inp)
+        loss = self.loss_fn(output, target)
+        if np.isnan(loss.item()):
+            print("Loss is NaN.")
+            return np.inf
+            # raise LossIsNaN
+        self.opt.zero_grad()
+        loss.backward()
+        self.opt.step()
+        return loss.item()
+
+    def step(self, num, **kwargs):  # train: run `num` optimizer steps
         self.net.train()
-        st = self.step_num % len(self.train_loader)
-        ed = st + num
-        it = 0
-        while it < ed:
-
-            for (inp, target) in (self.train_loader):
-
-                if it < st:
-                    it += 1
-                    continue
-                # it += 1
-                inp = inp.to(self.device)
-                target = target.to(self.device)
-                output = self.net(inp)
-                loss = self.loss_fn(output, target)
-                if np.isnan(loss.item()):
-                    print("Loss is NaN.")
-                    self.step_num += ed - it
-                    it = ed
-                    break
-                    # raise LossIsNaN
-                self.opt.zero_grad()
-                loss.backward()
-                self.opt.step()
-                self.step_num += 1
-                it += 1
-            if ed == it:
-                break
-            # inp, target = next(self.train_loader)
-
-
-        # if self.step_num % len(Model.trn_dataset) == 0:
-        #     self.ready = True
-
-
+        for it in range(num):
+            loss: float = self._one_step(**kwargs)
+            if not np.isfinite(loss):  # NaN loss: fast-forward the counter and stop early
+                self.step_num += num - it
+                return
+            self.step_num += 1
+
     def evaluate(self):  # val no training need(optimizer)
         correct = 0
@@ -134,16 +126,19 @@ class Model(Abstract_PBT_Model):
             output = self.net(inp)
             correct += (output.max(1)[1] == target).sum().cpu().item()
         acc = 100 * correct / len(self.tst_dataset)
-        self.score = -1 if np.isnan(acc) else acc
-        self.trajectory_score.append((self.step_num, self.score))
-        self.history_loss.append((self.step_num, -self.score))
-        return -self.score
+        self.loss = np.inf if np.isnan(acc) else -acc
+        self.trajectory_loss.append((self.step_num, self.loss))
+        self.history_loss.append((self.step_num, self.loss))
+        return self.loss
 
     def load_checkpoint(self, checkpoint):
         self.net.load_state_dict(checkpoint['model_state_dict'])
         self.opt.load_state_dict(checkpoint['optim_state_dict'])
 
     def save_checkpoint(self):
+        '''
+        Return an in-memory checkpoint dict; serializing it to disk is optional.
+        '''
         checkpoint = dict(model_state_dict=self.net.state_dict(),
                           optim_state_dict=self.opt.state_dict())
         return checkpoint
diff --git a/examples/PopulationBasedTraining/mnist_pbt.py b/examples/PopulationBasedTraining/mnist_pbt.py
index 1a351b5..f9f3f7e 100644
--- a/examples/PopulationBasedTraining/mnist_pbt.py
+++ b/examples/PopulationBasedTraining/mnist_pbt.py
@@ -30,6 +30,9 @@ class MnistPBT(PBT):
             top_id = self.rng.choice(top_ids)
             checkpoint = population_model[top_id].save_checkpoint()
             population_model[bot_id].load_checkpoint(checkpoint)
+            # Keep the dataloader iterators synchronized (when a loss went NaN or training stopped early)
+            population_model[bot_id].iter_train_loader = population_model[top_id].iter_train_loader
+
             self.population_hp_array[bot_id] = self.population_hp_array[
                 top_id].copy()
             # explore
@@ -45,7 +48,7 @@ class MnistPBT(PBT):
             # x_unwarped = DenseConfiguration.array_to_dict(self.space, x_array)
             # self.population_hp[bot_id] = x_unwarped
             population_model[bot_id].history_hp = copy.copy(
-                population_model[top_id].history_loss)
+                population_model[top_id].history_hp)
             population_model[bot_id].history_loss = copy.copy(
                 population_model[top_id].history_loss)
             population_model[bot_id].update_hp(new_config.get_dictionary())
@@ -53,12 +56,12 @@ class MnistPBT(PBT):
 
 if __name__ == "__main__":
     device = "cuda" if torch.cuda.is_available() else "cpu"
-    epoch_num = 100
+    epoch_num = 10
     rng = np.random.RandomState(42)
 
     config_space = Model.get_configuration_space(rng.randint(MAXINT))
     # define black box optimizer
-    pbt = MnistPBT(space=config_space, pop_size=2, seed=rng.randint(MAXINT))
+    pbt = MnistPBT(space=config_space, pop_size=5, seed=rng.randint(MAXINT))
     population_model = [
         Model(seed=rng.randint(MAXINT), device=device)
         for _ in range(pbt.pop_size)
@@ -74,7 +77,7 @@ if __name__ == "__main__":
     for i in range(pbt.pop_size):
         desc_data = np.array(population_model[i].history_loss)
         desc_data[:, 0] /= len(population_model[-1])
-        ax1.plot(desc_data[:, 0], desc_data[:, 1], alpha=0.5)
+        ax1.plot(desc_data[:, 0], -desc_data[:, 1], alpha=0.5)
     ax1.set_xlabel("epoch")
     ax1.set_ylabel("score")
     # for i in range(self.pop_size):
@@ -86,15 +89,15 @@ if __name__ == "__main__":
     for i in range(pbt.pop_size):
         desc_data = np.array([[x[0], x[-1]['lr']] for x in population_model[i].history_hp])
         desc_data[:, 0] /= len(population_model[-1])
-        desc_data = np.append(desc_data, [[pbt.epoch, desc_data[-1, 1]]], axis=0)
+        desc_data = np.append(desc_data, [[epoch_num, desc_data[-1, 1]]], axis=0)
         ax2.plot(desc_data[:, 0], desc_data[:, 1], label='best individual' if i==best_individual_index else None)
     ax2.set_xlabel("epoch")
     ax2.set_ylabel("lr")
     plt.legend()
     plt.suptitle("PBT search (lr, momentum) in MNIST")
     plt.tight_layout()
-    plt.savefig('./out/PBT_mnist.png')
+    # plt.savefig('./a.png')
     plt.show()
-    print('-----\nBest hyper-param strategy: {}'.format(pbt.population_model[best_individual_index].history_hp))
-    print('final score: {}'.format(-pbt.population_model[best_individual_index].history_loss[-1]))
\ No newline at end of file
+    print('-----\nBest hyper-param strategy: {}'.format(population_model[best_individual_index].history_hp))
+    print('final loss: {}'.format(population_model[best_individual_index].history_loss[-1]))
\ No newline at end of file
diff --git a/examples/PopulationBasedTraining/toy_model.py b/examples/PopulationBasedTraining/toy_model.py
index 91ffb7e..bdd101a 100644
--- a/examples/PopulationBasedTraining/toy_model.py
+++ b/examples/PopulationBasedTraining/toy_model.py
@@ -20,8 +20,8 @@ class Model(Abstract_PBT_Model):
         self.opt = SGD([self.theta], lr=0.01)
         self.history_hp = []  # for record strategy
         self.trajectory_hp = []
-        self.trajectory_loss = []  # record this individual's score over time
-        self.history_loss = []  # record the score trajectory under the applied hp-strategy (incl. weight transfer)
+        self.trajectory_loss = []  # record this individual's loss over time
+        self.history_loss = []  # record the loss trajectory under the applied hp-strategy (incl. weight transfer)
         self.hp = torch.empty(2, device=self.device)
         self.obj_val_func = lambda theta: 1.2 - (theta**2).sum()
         self.obj_train_func = lambda theta, h: 1.2 - ((h * theta)**2).sum()
diff --git a/xbbo/problem/pbt_toy.py b/xbbo/problem/pbt_toy.py
deleted file mode 100644
index 6439368..0000000
--- a/xbbo/problem/pbt_toy.py
+++ /dev/null
@@ -1,99 +0,0 @@
-import numpy as np
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from torch import optim
-from torch.autograd import Variable
-from torch.nn.parameter import Parameter
-from torchvision import datasets, transforms
-from torch.utils.data import DataLoader, Dataset
-
-from xbbo.core.constants import MAXINT
-
-from xbbo.core import TestFunction
-
-
-class LossIsNaN(Exception):
-    pass
-
-
-class Model(TestFunction):
-
-    def __init__(self, cfg, seed, **kwargs):
-        # np.random.seed(cfg.GENERAL.random_seed)
-        self.cfg = cfg
-        # self.dim = 30
-        # assert self.dim % 2 == 0
-        super().__init__(seed=seed)
-
-        self.api_config = self._load_api_config()
-        torch.seed(self.rng.randint(MAXINT))
-        torch.manual_seed(self.rng.randint(MAXINT))
-        self.device = torch.device(kwargs.get('device', 'cpu'))
-
-        self.theta = Parameter(torch.FloatTensor([0.9, 0.9]).to(self.device))
-        # self.opt_wrap = lambda params: optim.SGD(self.net.parameters(), lr=lr, momentum=momentum)
-        self.opt = optim.SGD([self.theta], lr=0.01)
-        self.step_num = 0
-        self.history_hp = []  # for record strategy
-        self.trajectory_hp = []
-        self.trajectory_loss = []  # record this individual's score over time
-        self.history_loss = []  # record the score trajectory under the applied hp-strategy (incl. weight transfer)
-        self.hp = torch.empty(2, device=self.device)
-        self.obj_val_func = lambda theta: 1.2 - (theta ** 2).sum()
-        self.obj_train_func = lambda theta, h: 1.2 - ((h * theta) ** 2).sum()
-
-        self.trajectory_theta = []
-
-    def __len__(self):  # how many batches in one epoch
-        return 1
-
-    def update_hp(self, params: dict):
-        self.history_hp.append((self.step_num, params))  # hyperparams changed at this step; the acc at this step reflects every hyperparam set *before* it
-        self.trajectory_hp.append((self.step_num, params))
-        self.trajectory_theta.append(self.theta.detach().cpu().numpy())
-        self.hp[0] = params['h1']
-        self.hp[1] = params['h2']
-
-    def step(self, num):  # train need training(optimizer)
-        for it in range(num):
-            self.trajectory_theta.append(self.theta.detach().cpu().numpy())
-            loss = self.obj_train_func(self.theta, self.hp)
-            if np.isnan(loss.item()):
-                print("Loss is NaN.")
-                self.step_num += 1
-                return
-                # raise LossIsNaN
-            self.opt.zero_grad()
-            loss.backward()
-            self.opt.step()
-            self.step_num += 1
-
-    def evaluate(self):  # val no training need(optimizer)
-        with torch.no_grad():
-            loss = self.obj_val_func(self.theta).item()
-        self.loss = np.inf if np.isnan(loss) else loss
-        self.trajectory_loss.append((self.step_num, self.loss))
-        self.history_loss.append((self.step_num, self.loss))
-        return self.loss
-
-    def load_checkpoint(self, checkpoint):
-        with torch.no_grad():
-            self.theta.set_(checkpoint['model_state_dict'])
-        # self.opt.load_state_dict(checkpoint['optim_state_dict'])
-
-    def save_checkpoint(self):
-        checkpoint = dict(model_state_dict=self.theta.data.clone())
-        return checkpoint
-
-    def _load_api_config(self):
-        return {
-            'h1': {
-                'type': 'float', 'warp': 'linear', 'range': [0, 1]},
-            'h2': {
-                'type': 'float', 'warp': 'linear', 'range': [0, 1]
-            }
-        }
diff --git a/xbbo/search_algorithm/pbt_optimizer.py b/xbbo/search_algorithm/pbt_optimizer.py
index c40f996..6bbb61d 100644
--- a/xbbo/search_algorithm/pbt_optimizer.py
+++ b/xbbo/search_algorithm/pbt_optimizer.py
@@ -137,6 +137,7 @@ class PBT(AbstractOptimizer):
         for i in range(self.pop_size):
             population_model[i].evaluate()
         losses = [net.loss for net in population_model]
+        assert np.any(np.isfinite(losses)), "ERROR: all losses are non-finite; at least one must be finite"
         if finished:
             break
         # Update respective config
-- 
2.34.1
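
The core of this patch is the persistent, resumable dataloader iterator: `_one_step` pulls batches from `iter_train_loader` and re-creates the iterator on `StopIteration`, so `step(num)` resumes mid-epoch instead of re-scanning the loader from the front as the old offset-counting loop did. Below is a minimal standalone sketch of that pattern in plain Python; the `CyclingLoader` name and the list standing in for a torch DataLoader are illustrative assumptions, not part of the patch.

    class CyclingLoader:
        def __init__(self, loader):
            self.loader = loader    # any re-iterable source, e.g. a DataLoader
            self.it = iter(loader)  # persistent iterator, survives across calls

        def next_batch(self):
            try:
                return next(self.it)
            except StopIteration:   # epoch exhausted: restart and keep going
                self.it = iter(self.loader)
                return next(self.it)

    batches = CyclingLoader([1, 2, 3])
    assert [batches.next_batch() for _ in range(7)] == [1, 2, 3, 1, 2, 3, 1]

Because the iterator is now mutable state on each worker, the exploit step has to copy it along with the weights (the `iter_train_loader` assignment in mnist_pbt.py); otherwise a cloned worker would resume from an unrelated position in its own epoch.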