|
- #! /usr/bin/python
- # -*- coding: utf-8 -*-
-
- import numpy as np
- import tensorflow as tf
- from six.moves import xrange
-
- __all__ = [
- 'discount_episode_rewards',
- 'cross_entropy_reward_loss',
- 'log_weight',
- 'choice_action_by_probs',
- ]
-
-
- def discount_episode_rewards(rewards=None, gamma=0.99, mode=0):
- """Take 1D float array of rewards and compute discounted rewards for an
- episode. When encount a non-zero value, consider as the end a of an episode.
-
- Parameters
- ----------
- rewards : list
- List of rewards
- gamma : float
- Discounted factor
- mode : int
- Mode for computing the discount rewards.
- - If mode == 0, reset the discount process when encount a non-zero reward (Ping-pong game).
- - If mode == 1, would not reset the discount process.
-
- Returns
- --------
- list of float
- The discounted rewards.
-
- Examples
- ----------
- >>> rewards = np.asarray([0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1])
- >>> gamma = 0.9
- >>> discount_rewards = tl.rein.discount_episode_rewards(rewards, gamma)
- >>> print(discount_rewards)
- [ 0.72899997 0.81 0.89999998 1. 0.72899997 0.81
- 0.89999998 1. 0.72899997 0.81 0.89999998 1. ]
- >>> discount_rewards = tl.rein.discount_episode_rewards(rewards, gamma, mode=1)
- >>> print(discount_rewards)
- [ 1.52110755 1.69011939 1.87791049 2.08656716 1.20729685 1.34144104
- 1.49048996 1.65610003 0.72899997 0.81 0.89999998 1. ]
-
- """
- if rewards is None:
- raise Exception("rewards should be a list")
- discounted_r = np.zeros_like(rewards, dtype=np.float32)
- running_add = 0
- for t in reversed(xrange(0, rewards.size)):
- if mode == 0:
- if rewards[t] != 0: running_add = 0
-
- running_add = running_add * gamma + rewards[t]
- discounted_r[t] = running_add
- return discounted_r
-
-
- def cross_entropy_reward_loss(logits, actions, rewards, name=None):
- """Calculate the loss for Policy Gradient Network.
-
- Parameters
- ----------
- logits : tensor
- The network outputs without softmax. This function implements softmax inside.
- actions : tensor or placeholder
- The agent actions.
- rewards : tensor or placeholder
- The rewards.
-
- Returns
- --------
- Tensor
- The TensorFlow loss function.
-
- Examples
- ----------
- >>> states_batch_pl = tf.placeholder(tf.float32, shape=[None, D])
- >>> network = InputLayer(states_batch_pl, name='input')
- >>> network = DenseLayer(network, n_units=H, act=tf.ops.relu, name='relu1')
- >>> network = DenseLayer(network, n_units=3, name='out')
- >>> probs = network.outputs
- >>> sampling_prob = tf.ops.softmax(probs)
- >>> actions_batch_pl = tf.placeholder(tf.int32, shape=[None])
- >>> discount_rewards_batch_pl = tf.placeholder(tf.float32, shape=[None])
- >>> loss = tl.rein.cross_entropy_reward_loss(probs, actions_batch_pl, discount_rewards_batch_pl)
- >>> train_op = tf.train.RMSPropOptimizer(learning_rate, decay_rate).minimize(loss)
-
- """
- cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=actions, logits=logits, name=name)
-
- return tf.reduce_sum(tf.multiply(cross_entropy, rewards))
-
-
- def log_weight(probs, weights, name='log_weight'):
- """Log weight.
-
- Parameters
- -----------
- probs : tensor
- If it is a network output, usually we should scale it to [0, 1] via softmax.
- weights : tensor
- The weights.
-
- Returns
- --------
- Tensor
- The Tensor after appling the log weighted expression.
-
- """
- with tf.variable_scope(name):
- exp_v = tf.reduce_mean(tf.log(probs) * weights)
- return exp_v
-
-
- def choice_action_by_probs(probs=(0.5, 0.5), action_list=None):
- """Choice and return an an action by given the action probability distribution.
-
- Parameters
- ------------
- probs : list of float.
- The probability distribution of all actions.
- action_list : None or a list of int or others
- A list of action in integer, string or others. If None, returns an integer range between 0 and len(probs)-1.
-
- Returns
- --------
- float int or str
- The chosen action.
-
- Examples
- ----------
- >>> for _ in range(5):
- >>> a = choice_action_by_probs([0.2, 0.4, 0.4])
- >>> print(a)
- 0
- 1
- 1
- 2
- 1
- >>> for _ in range(3):
- >>> a = choice_action_by_probs([0.5, 0.5], ['a', 'b'])
- >>> print(a)
- a
- b
- b
-
- """
- if action_list is None:
- n_action = len(probs)
- action_list = np.arange(n_action)
- else:
- if len(action_list) != len(probs):
- raise Exception("number of actions should equal to number of probabilities.")
- return np.random.choice(action_list, p=probs)
|