paddle 单独训练
def model_train(model, train_loader, dev_loader):
    """Fit the model on the training data, then evaluate it on the dev data.

    The wall-clock duration of each phase is written to the module-level
    ``module_run_time_dict`` under the keys ``"model_train"`` and
    ``"model_eval"``.

    :param model: object exposing ``fit``; also handed to ``model_eval``
    :param train_loader: training data passed to ``model.fit``
    :param dev_loader: evaluation data passed to ``model.fit`` and ``model_eval``
    :return: tuple ``(model, accu)`` where ``accu`` is what ``model_eval`` returns
    """
    fit_began = datetime.datetime.now()

    # Train with a progress-bar logger attached.
    progress_logger = paddle.callbacks.ProgBarLogger(log_freq=10, verbose=3)
    model.fit(
        train_loader,
        dev_loader,
        epochs=args.epochs,
        save_dir=args.save_dir,
        callbacks=progress_logger,
    )
    fit_finished = datetime.datetime.now()
    module_run_time_dict["model_train"] = fit_finished - fit_began

    # Evaluation is timed from the moment fitting finished.
    accu = model_eval(model, dev_loader)
    module_run_time_dict["model_eval"] = datetime.datetime.now() - fit_finished

    return model, accu
paddle 云际训练
# Joint-cloud training flow: set up the parameter manager and API client,
# pull initial parameters, train locally, report the round's result, then
# upload the trained parameters. The duration of each stage is stored in
# module_run_time_dict.
start_time = datetime.datetime.now()
print("Start joint learning...")
# === joint-cloud learning: start ===
# Parameter manager
print("Initial para_hunter")
param_hunter = ParamHunter(model, debug=False)
# API client
print("Initial api_client")
api_client = THGYApiClient()
end_time = datetime.datetime.now()
module_run_time_dict["joint_server_initial"] = end_time - start_time
start_time = end_time

# Before training, the model's parameters must be initialised from JCCE.agent
print("Initial init_params")
model, init_params_num = param_hunter.init_params(initial)
print("Initialed model. init_params_num=%d " % init_params_num)
# Reported for display in the UI
print("api_client.add_training_parameters task_id=%d" % task_id)
api_client.add_training_parameters(0, task_id, init_params_num)
end_time = datetime.datetime.now()
module_run_time_dict["initial_params_from_server"] = end_time - start_time

# Start time of the current round
round_start_time = datetime.datetime.now()
# === joint-cloud learning: end ===

print("Start training...")
model, accu = model_train(model, train_loader, dev_loader)

# === joint-cloud learning: parameter sync start ===
# End time of the current round
round_end_time = datetime.datetime.now()
# Reported for display in the UI
print("api_client.add_task_training_data task_id=%d, global_step=%d" % (task_id, global_step))
api_client.add_task_training_data(
    group_id, task_id, global_step,
    recall=0, precision=accu,
    startTime=round_start_time.strftime("%Y-%m-%d %H:%M:%S.%f"),
    endTime=round_end_time.strftime("%Y-%m-%d %H:%M:%S.%f"),
)
end_time = datetime.datetime.now()
# Time only the result upload: measure from the end of the round, not from
# its start, so training/eval time (recorded by model_train) is not counted
# a second time under this key.
module_run_time_dict["upload_train_result"] = end_time - round_end_time
start_time = end_time

# After training, upload the parameters to JCCE.agent
print("param_hunter.upload_params...")
print(uuid, step_per_round)
upload_param_nums = param_hunter.upload_params(model, uuid, step_per_round)
print("param_hunter.upload_params, upload_param_nums=%d" % upload_param_nums)
# Reported for display in the UI
api_client.add_training_parameters(1, task_id, upload_param_nums)
end_time = datetime.datetime.now()
module_run_time_dict["upload_params_to_server"] = end_time - start_time
# === joint-cloud learning: end ===