# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse

import numpy as np
from parl.algorithms import PPO
from parl.utils import logger, summary

from agent import PPOAgent
from atari_config import atari_config
from atari_model import AtariModel
from env_utils import LocalEnv, ParallelEnv
from mujoco_config import mujoco_config
from mujoco_model import MujocoModel
from storage import RolloutStorage


# Run the policy for `eval_episodes` episodes and return the mean episode reward
# A fixed seed is used for the eval environment
def run_evaluate_episodes(agent, eval_env, eval_episodes):
    eval_episode_rewards = []
    while len(eval_episode_rewards) < eval_episodes:
        obs = eval_env.reset()
        done = False
        while not done:
            action = agent.predict(obs)
            obs, reward, done, info = eval_env.step(action)
            if "episode" in info.keys():
                eval_reward = info["episode"]["r"]
                eval_episode_rewards.append(eval_reward)
    return np.mean(eval_episode_rewards)


def main():
    config = mujoco_config if args.continuous_action else atari_config
    if args.env_num:
        config['env_num'] = args.env_num
    config['env'] = args.env
    config['seed'] = args.seed
    config['xparl_addr'] = args.xparl_addr
    config['test_every_steps'] = args.test_every_steps
    config['train_total_steps'] = args.train_total_steps

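    # One update consumes a rollout of step_nums steps from each of the
    # env_num parallel envs, i.e. batch_size transitions per update.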
    config['batch_size'] = int(config['env_num'] * config['step_nums'])
    config['num_updates'] = int(
        config['train_total_steps'] // config['batch_size'])

    logger.info("------------------- PPO ---------------------")
    logger.info('Env: {}, seed: {}'.format(config['env'], config['seed']))
    logger.info("---------------------------------------------")
    logger.set_dir('./train_logs/{}_{}'.format(config['env'], config['seed']))

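    # Vectorized training envs (run remotely via xparl when env_num > 1)
    # and a single local env used only for evaluation.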
    envs = ParallelEnv(config)
    eval_env = LocalEnv(config['env'], test=True)

    obs_space = eval_env.obs_space
    act_space = eval_env.act_space

    if config['continuous_action']:
        model = MujocoModel(obs_space, act_space)
    else:
        model = AtariModel(obs_space, act_space)
    ppo = PPO(
        model,
        clip_param=config['clip_param'],
        entropy_coef=config['entropy_coef'],
        initial_lr=config['initial_lr'],
        continuous_action=config['continuous_action'])
    agent = PPOAgent(ppo, config)

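    # Buffer holding one rollout: step_nums transitions from each env.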
    rollout = RolloutStorage(config['step_nums'], config['env_num'],
                             obs_space, act_space)

    obs = envs.reset()
    done = np.zeros(config['env_num'], dtype='float32')

    test_flag = 0
    total_steps = 0
    for update in range(1, config['num_updates'] + 1):
        for step in range(config['step_nums']):
            total_steps += config['env_num']

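            # Sample actions from the current policy and store the
            # transition. Note `done` is the flag from the previous step,
            # i.e. whether each env was already done before this step.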
            value, action, logprob, _ = agent.sample(obs)
            next_obs, reward, next_done, info = envs.step(action)
            rollout.append(obs, action, logprob, reward, done, value.flatten())
            obs, done = next_obs, next_done

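            # When an episode ends, the episode-statistics wrapper reports
            # the episodic return in info[k]['episode']['r'].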
            for k in range(config['env_num']):
                if done[k] and "episode" in info[k].keys():
                    logger.info(
                        "Training: total steps: {}, episode rewards: {}".
                        format(total_steps, info[k]['episode']['r']))
                    summary.add_scalar("train/episode_reward",
                                       info[k]["episode"]["r"], total_steps)

        # Bootstrap value if not done
        value = agent.value(obs)
        rollout.compute_returns(value, done)

        # Optimizing the policy and value network
        value_loss, action_loss, entropy_loss, lr = agent.learn(rollout)

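        # Evaluate each time total_steps crosses a multiple of
        # test_every_steps; one update adds batch_size steps, so the inner
        # while advances test_flag past every boundary crossed this update.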
        if (total_steps + 1) // config['test_every_steps'] >= test_flag:
            while (total_steps + 1) // config['test_every_steps'] >= test_flag:
                test_flag += 1

            if config['continuous_action']:
                # Sync the running mean/variance of the obs normalizer
                # from the training envs to the eval env
                ob_rms = envs.eval_ob_rms
                eval_env.env.set_ob_rms(ob_rms)

            avg_reward = run_evaluate_episodes(agent, eval_env,
                                               config['eval_episode'])
            summary.add_scalar('eval/episode_reward', avg_reward, total_steps)
            logger.info('Evaluation over: {} episodes, Reward: {}'.format(
                config['eval_episode'], avg_reward))


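# Example invocations (train.py and HalfCheetah-v2 are illustrative names):
#     python train.py --env PongNoFrameskip-v4
#     python train.py --env HalfCheetah-v2 --continuous_action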
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--env",
        type=str,
        default="PongNoFrameskip-v4",
        help="OpenAI gym environment name")
    parser.add_argument(
        "--seed", type=int, default=None, help="seed of the experiment")
    parser.add_argument(
        "--env_num",
        type=int,
        default=None,
        help="number of parallel environments. Note: if greater than 1, xparl is needed")
    parser.add_argument(
        '--continuous_action',
        action='store_true',
        default=False,
        help='use a continuous action space (e.g. MuJoCo); defaults to discrete (Atari)')
    parser.add_argument(
        "--xparl_addr",
        type=str,
        default=None,
        help="xparl address for distributed training")
    parser.add_argument(
        '--train_total_steps',
        type=int,
        default=int(10e6),
        help='number of total time steps to train (default: 10e6)')
    parser.add_argument(
        '--test_every_steps',
        type=int,
        default=int(5e3),
        help='the step interval between two consecutive evaluations')

    args = parser.parse_args()
    main()