Monday, July 2, 2018

Dynamic Programming code in TensorFlow

Below is TensorFlow code implementing the dynamic-programming solution to Example 4.1 (the 4x4 gridworld) from the great book Reinforcement Learning: An Introduction. As promised, all of the book's pseudocode will eventually be implemented in TensorFlow.
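
For reference, what the code computes is iterative policy evaluation for the equiprobable random policy on the 4x4 gridworld of Example 4.1: every transition yields reward -1, an action that would step off the grid leaves the state unchanged, the two corner states (0 and 15) are terminal with value 0, and there is no discounting. Each sweep applies the backup

V_{k+1}(s) = \frac{1}{4} \sum_{a} \left( -1 + V_k(s'_a) \right) = -\frac{1}{4} \sum_{a} \left( 1 - V_k(s'_a) \right)

where s'_a is the state reached from s by action a; the second, factored form is the one written out state by state in the code below.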

Enjoy it, and further discussion is welcome.

import tensorflow as tf

num_iters = 1000   # maximum number of evaluation sweeps
num_states = 16    # 4x4 gridworld, states 0..15; 0 and 15 are terminal

# One scalar float64 variable per state, initialized to zero.
V = [tf.get_variable("V%d" % i, [], tf.float64, initializer = tf.zeros_initializer()) for i in range(num_states)]

# One Bellman expectation backup per state for the equiprobable random policy:
# each of the four moves costs -1, a move off the grid leaves the state unchanged,
# and the terminal states 0 and 15 keep a value of 0.
V0 = V[0]
V1 = -0.25 * (1 - V[0] + 1 - V[1] + 1 - V[2] + 1 - V[5])
V2 = -0.25 * (1 - V[1] + 1 - V[2] + 1 - V[3] + 1 - V[6])
V3 = -0.25 * (1 - V[2] + 1 - V[3] + 1 - V[3] + 1 - V[7])
V4 = -0.25 * (1 - V[4] + 1 - V[0] + 1 - V[5] + 1 - V[8])
V5 = -0.25 * (1 - V[4] + 1 - V[1] + 1 - V[6] + 1 - V[9])
V6 = -0.25 * (1 - V[5] + 1 - V[2] + 1 - V[7] + 1 - V[10])
V7 = -0.25 * (1 - V[6] + 1 - V[3] + 1 - V[7] + 1 - V[11])
V8 = -0.25 * (1 - V[8] + 1 - V[4] + 1 - V[9] + 1 - V[12])
V9 = -0.25 * (1 - V[8] + 1 - V[5] + 1 - V[10] + 1 - V[13])
V10 = -0.25 * (1 - V[9] + 1 - V[6] + 1 - V[11] + 1 - V[14])
V11 = -0.25 * (1 - V[10] + 1 - V[7] + 1 - V[11] + 1 - V[15])
V12 = -0.25 * (1 - V[12] + 1 - V[8] + 1 - V[13] + 1 - V[12])
V13 = -0.25 * (1 - V[12] + 1 - V[9] + 1 - V[14] + 1 - V[13])
V14 = -0.25 * (1 - V[13] + 1 - V[10] + 1 - V[15] + 1 - V[14])
V15 = V[15]


# For every state, build ops that (1) print the current value, (2) measure how
# much the backup would change it, and (3) write the new value into the variable.
delta_lst = []
for i in range(num_states):
    verbose_op = tf.Print(V[i], [tf.round(V[i])], message = "value of V(%d) = " % i)
    delta_lst.append(tf.abs(V[i] - eval("V%d" % i)))
    with tf.control_dependencies([verbose_op]):
        V[i] = tf.assign(V[i], eval("V%d" % i))   # V[i] now refers to the assign op

# Largest change over all states; evaluating it does not run any assign ops.
delta = tf.reduce_max(delta_lst)

stop_op = tf.less(delta, 0.0001)   # converged once no value moves by more than 1e-4

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

    for i in range(num_iters):
        # Check convergence before running the next sweep.
        if sess.run(stop_op):
            print("\nconverged after {} sweeps".format(i))
            break

        # Running V[j] (now the assign op) performs one in-place backup for state j,
        # so later states in the sweep already see the updated earlier values.
        for j in range(num_states):
            sess.run(V[j])
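
As a quick sanity check on the numbers the graph prints, here is a minimal plain-NumPy sketch of the same in-place sweep. It is not part of the TensorFlow listing; the grid layout, the reward of -1 per move, and the 1e-4 stopping threshold are simply assumed to match the code above.

import numpy as np

# 4x4 gridworld, states 0..15; states 0 and 15 are terminal.
# A move that would leave the grid keeps the agent in place.
def neighbors(s):
    r, c = divmod(s, 4)
    result = []
    for dr, dc in [(-1, 0), (1, 0), (0, -1), (0, 1)]:   # up, down, left, right
        nr, nc = r + dr, c + dc
        result.append(nr * 4 + nc if 0 <= nr < 4 and 0 <= nc < 4 else s)
    return result

V = np.zeros(16)
for sweep in range(1000):
    delta = 0.0
    for s in range(16):
        if s in (0, 15):                  # terminal states keep value 0
            continue
        new_v = -0.25 * sum(1 - V[n] for n in neighbors(s))
        delta = max(delta, abs(V[s] - new_v))
        V[s] = new_v                      # in-place update, like the graph version
    if delta < 0.0001:
        print("converged after", sweep, "sweeps")
        break

print(np.round(V.reshape(4, 4)))

Both versions should settle near the value function the book reports for the random policy, roughly 0, -14, -20, -22 across the top row and its mirror image along the bottom.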
