Sunday, January 14, 2018

Simple bandit algorithm in TensorFlow

I find it nice to post here, so there will probably be more posts later.

Thanks to Prof. Richard S. Sutton and Andrew G. Barto for open-sourcing their wonderful textbook Reinforcement Learning: An Introduction. Google boasts that TensorFlow is a general numerical library, so presumably it can do just about anything. I therefore decided to implement most of the examples in the textbook in TensorFlow. This post kicks off the attempt with the first example: the simple bandit algorithm.

Please refer to Figure 2.1 in the textbook and the pseudocode in Section 2.3 to understand the code. Here we go!
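Before the TensorFlow version, here is a minimal plain-Python sketch of that pseudocode (epsilon-greedy action selection with incremental sample-average updates), just as a reference point; the function and argument names here are mine, not the book's:

import random

def simple_bandit(k, epsilon, num_iters, pull):
    # pull(a) samples a reward for arm a
    Q = [0.0] * k  # estimated action values
    N = [0] * k    # action counts
    for _ in range(num_iters):
        if random.random() < epsilon:
            a = random.randrange(k)                # explore
        else:
            a = max(range(k), key=lambda i: Q[i])  # exploit
        r = pull(a)
        N[a] += 1
        Q[a] += (r - Q[a]) / N[a]  # incremental sample average
    return Q

means = [0.2, -0.8, 1.6, 0.4, 1.4, -1.6, -0.2, -1.0, 0.8, -0.6]
print(simple_bandit(10, 0.1, 10000, lambda a: random.gauss(means[a], 1.0)))

The TensorFlow version below does the same thing, except that the random numbers and rewards are pre-sampled as tensors and the updates become graph operations.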

import tensorflow as tf
import sonnet as snt

class Bandit(snt.AbstractModule):
    def __init__(self, k, epsilon, num_iters, name = "bandit"):
        super(Bandit, self).__init__(name = name)
        self._k = k
        assert num_iters > 0, "invalid number of iterations"
        self._num_iters = num_iters
        assert epsilon > 0 and epsilon < 1, "invalid epsilon value"
        self._epsilon = epsilon
        with self._enter_variable_scope():
            # true action values q*(a), one per arm
            self._means = [0.2, -0.8, 1.6, 0.4, 1.4, -1.6, -0.2, -1.0, 0.8, -0.6]
            assert self._k == len(self._means), "k must match the number of arms"
            # pre-sample a reward stream for every arm (truncated normal around each true value)
            self._R = tf.stack([tf.truncated_normal([self._num_iters], mean) for mean in self._means], axis = 0)
            # Q(a) and N(a) from the pseudocode; N starts at zero because it is
            # incremented before the division in the update rule
            self._Q = tf.get_variable("values", [self._k], tf.float32, tf.zeros_initializer, trainable = False)
            self._N = tf.get_variable("occurs", [self._k], tf.int32, tf.zeros_initializer, trainable = False)

    def _build(self, it):
        # pre-sampled coin flips and random arms for the epsilon-greedy choice at every step
        probs = tf.random_uniform([self._num_iters], 0.0, 1.0)
        acts = tf.random_uniform([self._num_iters], 0, self._k, tf.int32)
        # exploit (greedy) with probability 1 - epsilon, otherwise explore
        A = tf.cond(tf.gather(probs, it) >= self._epsilon, lambda: tf.argmax(self._Q, output_type=tf.int32), lambda: tf.gather(acts, it))
        R = tf.gather_nd(self._R, [A, it])
        # N(A) <- N(A) + 1; Q(A) <- Q(A) + (R - Q(A)) / N(A)
        self._N = tf.scatter_add(self._N, A, 1)
        R_incr = tf.squeeze(1.0 / tf.cast(tf.gather(self._N, A), tf.float32) * (R - tf.gather(self._Q, A)))
        self._Q = tf.scatter_add(self._Q, A, R_incr)

        # make sure both state updates have run before the outputs are consumed
        with tf.control_dependencies([self._Q, self._N]):
            A = tf.identity(A)
            R = tf.identity(R)
            R_incr = tf.identity(R_incr)

        return A, R, R_incr

    def get_values(self):
        return self._Q

    def get_means(self):
        return self._means

def test():
    num_iters = 10000

    bandit10 = Bandit(10, 0.1, num_iters)

    it = tf.placeholder(tf.int32, [])
    a, r, r_incr = bandit10(it)
    q = bandit10.get_values()

    # running sum of rewards; dividing by it + 1 gives the average after each step
    R_avg = tf.get_variable("average_reward", [], dtype = tf.float32, initializer = tf.zeros_initializer)
    R_avg = tf.assign_add(R_avg, r)
    tf.summary.scalar("action", a)
    tf.summary.scalar("reward", r)
    tf.summary.scalar("incremental_reward", r_incr)
    tf.summary.scalar("average_reward", tf.divide(R_avg, tf.cast(it + 1, tf.float32)))
    tf.summary.text("estimated_values", tf.as_string(q))
    summ_op = tf.summary.merge_all()

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        writer = tf.summary.FileWriter("output", sess.graph)
        for i in range(num_iters):
            # Uncomment for verbose per-step logging:
            # a_v, r_v, r_incr_v, q_v = sess.run([a, r, r_incr, q], feed_dict = {it: i})
            # print("iteration {}: action {}, reward {}, incremental value {}".format(i, a_v, r_v, r_incr_v))
            # print("estimated values are {}".format(q_v))

            print("iteration {}".format(i))
            summ_op_str = sess.run(summ_op, feed_dict = {it: i})
            writer.add_summary(summ_op_str, i)

        writer.close()

if __name__ == "__main__":
    test()
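After a run finishes, the logged summaries can be inspected with TensorBoard by pointing it at the output directory (tensorboard --logdir output). If everything works, the average_reward curve should climb toward the best arm's true mean of 1.6, minus the small penalty paid for the epsilon fraction of exploratory pulls.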