#55 fix PBT bug

Merged
isleizhang merged 1 commits from mathcoder/XBBO:dev into dev 1 year ago
  1. +37
    -42
      examples/PopulationBasedTraining/mnist_model.py
  2. +11
    -8
      examples/PopulationBasedTraining/mnist_pbt.py
  3. +2
    -2
      examples/PopulationBasedTraining/toy_model.py
  4. +0
    -99
      xbbo/problem/pbt_toy.py
  5. +1
    -0
      xbbo/search_algorithm/pbt_optimizer.py

+ 37
- 42
examples/PopulationBasedTraining/mnist_model.py View File

@@ -65,6 +65,7 @@ class Model(Abstract_PBT_Model):
self.device = torch.device(kwargs.get('device', 'cpu'))
self.train_loader = DataLoader(Model.trn_dataset, batch_size=64, shuffle=False,num_workers=0)
self.test_loader = DataLoader(Model.tst_dataset, batch_size=64, shuffle=False, num_workers=0)
self.iter_train_loader = iter(self.train_loader)
self.net = ConvNet().to(self.device)
# self.opt_wrap = lambda params: optim.SGD(self.net.parameters(), lr=lr, momentum=momentum)
self.opt = optim.SGD(self.net.parameters(), lr=0.11, momentum=0.9)
@@ -73,8 +74,8 @@ class Model(Abstract_PBT_Model):
self.ready = False # not ready
self.history_hp = [] # for record strategy
self.trajectory_hp = []
self.trajectory_score = [] # 记录该个体score过程
self.history_loss = [] # 记录使用了(考虑权重迁移)hp-stategy后的score过程
self.trajectory_loss = [] # 记录该个体loss过程
self.history_loss = [] # 记录使用了(考虑权重迁移)hp-stategy后的loss过程

def __len__(self): # one epoch has how many batchs
return len(self.train_loader)
@@ -86,43 +87,34 @@ class Model(Abstract_PBT_Model):
for param_group in self.opt.param_groups:
param_group[hyperparam_name] = v

def step(self, num): # train need training(optimizer)
def _one_step(self, **kwargs) -> float:
    """Run one optimizer update on the next training mini-batch.

    The batch is taken from the persistent iterator
    ``self.iter_train_loader``; when that iterator is exhausted it is
    rebuilt from ``self.train_loader`` so successive calls keep cycling
    through the data.

    Args:
        **kwargs: Accepted for interface compatibility; not used here.

    Returns:
        The batch loss as a Python float, or ``np.inf`` when the loss is
        not finite (NaN or +/-inf). In that case neither the backward pass
        nor the optimizer step is executed.
    """
    try:
        inp, target = next(self.iter_train_loader)
    except StopIteration:
        # End of the epoch: restart the iterator and use its first batch.
        self.iter_train_loader = iter(self.train_loader)
        inp, target = next(self.iter_train_loader)
    inp = inp.to(self.device)
    target = target.to(self.device)
    loss = self.loss_fn(self.net(inp), target)
    # Read the scalar once and reuse it for both the check and the return.
    loss_value = loss.item()
    if not np.isfinite(loss_value):
        # The caller stops on any non-finite loss, so bail out before an
        # infinite (not only NaN) loss is back-propagated into the weights.
        print("Loss is NaN." if np.isnan(loss_value) else "Loss is infinite.")
        return np.inf
    self.opt.zero_grad()
    loss.backward()
    self.opt.step()
    return loss_value

def step(self, num, **kwargs): # train need training(optimizer)
self.net.train()
st = self.step_num % len(self.train_loader)
ed = st + num
it = 0
while it < ed:

for (inp, target) in (self.train_loader):

if it < st:
it += 1
continue
# it += 1
inp = inp.to(self.device)
target = target.to(self.device)
output = self.net(inp)
loss = self.loss_fn(output, target)
if np.isnan(loss.item()):
print("Loss is NaN.")
self.step_num += ed - it
it = ed
break
# raise LossIsNaN
self.opt.zero_grad()
loss.backward()
self.opt.step()
self.step_num += 1
it += 1
if ed == it:
break
# inp, target = next(self.train_loader)


# if self.step_num % len(Model.trn_dataset) == 0:
# self.ready = True


for it in range(num):
loss: float = self._one_step(**kwargs)
if not np.isfinite(loss):
self.step_num += num - it
return
self.step_num += 1

def evaluate(self): # val no training need(optimizer)
correct = 0
@@ -134,16 +126,19 @@ class Model(Abstract_PBT_Model):
output = self.net(inp)
correct += (output.max(1)[1] == target).sum().cpu().item()
acc = 100 * correct / len(self.tst_dataset)
self.score = -1 if np.isnan(acc) else acc
self.trajectory_score.append((self.step_num, self.score))
self.history_loss.append((self.step_num, -self.score))
return -self.score
self.loss = np.inf if np.isnan(acc) else -acc
self.trajectory_loss.append((self.step_num, self.loss))
self.history_loss.append((self.step_num, self.loss))
return self.loss

def load_checkpoint(self, checkpoint):
    """Restore the network and optimizer state from ``checkpoint``.

    Args:
        checkpoint: Mapping holding ``'model_state_dict'`` for ``self.net``
            and ``'optim_state_dict'`` for ``self.opt``.
    """
    # Network first, then optimizer, each fed from its own checkpoint entry.
    for attr, key in (('net', 'model_state_dict'), ('opt', 'optim_state_dict')):
        getattr(self, attr).load_state_dict(checkpoint[key])

def save_checkpoint(self):
    """Return the current network and optimizer state as a checkpoint dict.

    Nothing is written to disk here; the state dicts of ``self.net`` and
    ``self.opt`` are returned under the keys ``'model_state_dict'`` and
    ``'optim_state_dict'``.
    """
    return {
        'model_state_dict': self.net.state_dict(),
        'optim_state_dict': self.opt.state_dict(),
    }


+ 11
- 8
examples/PopulationBasedTraining/mnist_pbt.py View File

@@ -30,6 +30,9 @@ class MnistPBT(PBT):
top_id = self.rng.choice(top_ids)
checkpoint = population_model[top_id].save_checkpoint()
population_model[bot_id].load_checkpoint(checkpoint)
# Keep the dataloader iterators synchronized (when the loss is NaN or on early stopping)
population_model[bot_id].iter_train_loader = population_model[top_id].iter_train_loader
self.population_hp_array[bot_id] = self.population_hp_array[
top_id].copy()
# explore
@@ -45,7 +48,7 @@ class MnistPBT(PBT):
# x_unwarped = DenseConfiguration.array_to_dict(self.space, x_array)
# self.population_hp[bot_id] = x_unwarped
population_model[bot_id].history_hp = copy.copy(
population_model[top_id].history_loss)
population_model[top_id].history_hp)
population_model[bot_id].history_loss = copy.copy(
population_model[top_id].history_loss)
population_model[bot_id].update_hp(new_config.get_dictionary())
@@ -53,12 +56,12 @@ class MnistPBT(PBT):
if __name__ == "__main__":
device = "cuda" if torch.cuda.is_available() else "cpu"
epoch_num = 100
epoch_num = 10
rng = np.random.RandomState(42)
config_space = Model.get_configuration_space(rng.randint(MAXINT))
# define black box optimizer
pbt = MnistPBT(space=config_space, pop_size=2, seed=rng.randint(MAXINT))
pbt = MnistPBT(space=config_space, pop_size=5, seed=rng.randint(MAXINT))

population_model = [
Model(seed=rng.randint(MAXINT), device=device) for _ in range(pbt.pop_size)
@@ -74,7 +77,7 @@ if __name__ == "__main__":
for i in range(pbt.pop_size):
desc_data = np.array(population_model[i].history_loss)
desc_data[:, 0] /= len(population_model[-1])
ax1.plot(desc_data[:, 0], desc_data[:, 1], alpha=0.5)
ax1.plot(desc_data[:, 0], -desc_data[:, 1], alpha=0.5)
ax1.set_xlabel("epoch")
ax1.set_ylabel("score")
# for i in range(self.pop_size):
@@ -86,15 +89,15 @@ if __name__ == "__main__":
for i in range(pbt.pop_size):
desc_data = np.array([[x[0], x[-1]['lr']] for x in population_model[i].history_hp])
desc_data[:, 0] /= len(population_model[-1])
desc_data = np.append(desc_data, [[pbt.epoch, desc_data[-1, 1]]], axis=0)
desc_data = np.append(desc_data, [[epoch_num, desc_data[-1, 1]]], axis=0)
ax2.plot(desc_data[:, 0], desc_data[:, 1], label='best individual' if i==best_individual_index else None)
ax2.set_xlabel("epoch")
ax2.set_ylabel("lr")
plt.legend()
plt.suptitle("PBT search (lr, momentum) in MNIST")
plt.tight_layout()
plt.savefig('./out/PBT_mnist.png')
# plt.savefig('./a.png')
plt.show()

print('-----\nBest hyper-param strategy: {}'.format(pbt.population_model[best_individual_index].history_hp))
print('final score: {}'.format(-pbt.population_model[best_individual_index].history_loss[-1]))
print('-----\nBest hyper-param strategy: {}'.format(population_model[best_individual_index].history_hp))
print('final -score: {}'.format(population_model[best_individual_index].history_loss[-1]))

+ 2
- 2
examples/PopulationBasedTraining/toy_model.py View File

@@ -20,8 +20,8 @@ class Model(Abstract_PBT_Model):
self.opt = SGD([self.theta], lr=0.01)
self.history_hp = [] # for record strategy
self.trajectory_hp = []
self.trajectory_loss = [] # 记录该个体score过程
self.history_loss = [] # 记录使用了(考虑权重迁移)hp-stategy后的score过程
self.trajectory_loss = [] # 记录该个体loss过程
self.history_loss = [] # 记录使用了(考虑权重迁移)hp-stategy后的loss过程
self.hp = torch.empty(2, device=self.device)
self.obj_val_func = lambda theta: 1.2 - (theta**2).sum()
self.obj_train_func = lambda theta, h: 1.2 - ((h * theta)**2).sum()


+ 0
- 99
xbbo/problem/pbt_toy.py View File

@@ -1,99 +0,0 @@
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
from torch.autograd import Variable
from torch.nn.parameter import Parameter
from torchvision import datasets, transforms
from torch.utils.data import DataLoader, Dataset

from xbbo.core.constants import MAXINT

from xbbo.core import TestFunction

class LossIsNaN(Exception):
    """Exception type for a NaN training loss.

    NOTE(review): the only visible use in this file is a commented-out
    ``raise`` in ``Model.step``.
    """




class Model(TestFunction):
    """Toy PBT individual: a two-element parameter ``theta`` trained by SGD.

    Training minimizes ``obj_train_func(theta, hp) = 1.2 - sum((hp * theta)**2)``
    and evaluation reports ``obj_val_func(theta) = 1.2 - sum(theta**2)``.
    The two hyper-parameters ``h1`` and ``h2`` are stored in ``self.hp``.
    """

    def __init__(self, cfg, seed, **kwargs):
        """Create the toy model.

        Args:
            cfg: Configuration object; only stored on the instance here.
            seed: Seed forwarded to ``TestFunction.__init__``.
            **kwargs: ``device`` (default ``'cpu'``) selects the torch device.
        """
        self.cfg = cfg
        super().__init__(seed=seed)

        self.api_config = self._load_api_config()
        # torch.seed() accepts no argument (it picks a non-deterministic
        # seed), so reproducible seeding goes through torch.manual_seed only.
        # NOTE(review): self.rng is presumably created by TestFunction.__init__.
        torch.manual_seed(self.rng.randint(MAXINT))
        self.device = torch.device(kwargs.get('device', 'cpu'))

        self.theta = Parameter(torch.FloatTensor([0.9, 0.9]).to(self.device))
        self.opt = optim.SGD([self.theta], lr=0.01)
        self.step_num = 0
        self.history_hp = []  # records the hyper-parameter strategy
        self.trajectory_hp = []
        self.trajectory_loss = []  # loss trajectory of this individual
        # loss history under the hp strategy (taking weight transfer into account)
        self.history_loss = []
        self.hp = torch.empty(2, device=self.device)
        self.obj_val_func = lambda theta: 1.2 - (theta ** 2).sum()
        self.obj_train_func = lambda theta, h: 1.2 - ((h * theta) ** 2).sum()

        self.trajectory_theta = []

    def __len__(self):
        """Return the number of batches in one epoch (always 1 here)."""
        return 1

    def _theta_snapshot(self):
        """Return an independent NumPy copy of the current ``theta``."""
        # On CPU, Tensor.numpy() shares memory with the tensor, so without the
        # copy every stored entry would follow later in-place SGD updates.
        return self.theta.detach().cpu().numpy().copy()

    def update_hp(self, params: dict):
        """Install new hyper-parameters and record the change.

        Args:
            params: Mapping with the keys ``'h1'`` and ``'h2'``.
        """
        # The hyper-parameters change at this step; the result evaluated at
        # this step is influenced by all hyper-parameters used *before* it.
        self.history_hp.append((self.step_num, params))
        self.trajectory_hp.append((self.step_num, params))
        self.trajectory_theta.append(self._theta_snapshot())
        self.hp[0] = params['h1']
        self.hp[1] = params['h2']

    def step(self, num):
        """Run up to ``num`` SGD updates on the training objective.

        Training stops early, after counting the current step, when the
        training loss is NaN.

        Args:
            num: Number of optimizer steps to attempt.
        """
        for _ in range(num):
            self.trajectory_theta.append(self._theta_snapshot())
            loss = self.obj_train_func(self.theta, self.hp)
            if np.isnan(loss.item()):
                print("Loss is NaN.")
                self.step_num += 1
                return
            self.opt.zero_grad()
            loss.backward()
            self.opt.step()
            self.step_num += 1

    def evaluate(self):
        """Evaluate the validation objective without tracking gradients.

        Returns:
            ``obj_val_func(theta)`` as a float, or ``np.inf`` when that value
            is NaN. The value is also stored in ``self.loss`` and appended,
            with the current step number, to both loss records.
        """
        with torch.no_grad():
            loss = self.obj_val_func(self.theta).item()
        self.loss = np.inf if np.isnan(loss) else loss
        self.trajectory_loss.append((self.step_num, self.loss))
        self.history_loss.append((self.step_num, self.loss))
        return self.loss

    def load_checkpoint(self, checkpoint):
        """Replace ``theta`` with the tensor in ``checkpoint['model_state_dict']``.

        The optimizer state is not restored.
        """
        with torch.no_grad():
            self.theta.set_(checkpoint['model_state_dict'])

    def save_checkpoint(self):
        """Return a checkpoint dict holding a cloned copy of ``theta``."""
        checkpoint = dict(model_state_dict=self.theta.data.clone())
        return checkpoint

    def _load_api_config(self):
        """Return the search-space description for ``h1`` and ``h2``."""
        return {
            'h1': {
                'type': 'float', 'warp': 'linear', 'range': [0, 1]},
            'h2': {
                'type': 'float', 'warp': 'linear', 'range': [0, 1]
            }
        }


+ 1
- 0
xbbo/search_algorithm/pbt_optimizer.py View File

@@ -137,6 +137,7 @@ class PBT(AbstractOptimizer):
for i in range(self.pop_size):
population_model[i].evaluate()
losses = [net.loss for net in population_model]
assert np.any(np.isfinite(losses)), "ERROR: At Least 1 loss is finite"
if finished:
break
# Update respective config


Loading…
Cancel
Save