简介
用一个具体的例子来演示策略梯度的应用。这里使用OpenAI Gym中的CartPole-v0游戏作为算法的应用示例。这个游戏比较简单,基本要求就是控制下面的cart移动,使连接在上面的pole保持垂直不倒。这个任务只有两个离散动作,要么向左用力,要么向右用力。而state状态就是这个cart的位置和速度,以及pole的角度和角速度,共4维特征。坚持到200分的奖励则为过关。
代码使用了一个三层的神经网络:输入层、一个隐藏层和一个输出层。下面我们看看关键部分的代码。代码参考了博客园的一个例子,并做了适当修改,适用于TensorFlow 2.0以上的版本。
#######################################################################
# Copyright (C) #
# 2016 - 2019 Pinard Liu(liujianping-ok@163.com) #
# https://www.cnblogs.com/pinard #
# Permission given to modify the code as long as you keep this #
# declaration at the top #
#######################################################################
## https://www.cnblogs.com/pinard/p/10137696.html ##
## 强化学习(十三) 策略梯度(Policy Gradient) ##
import gym
import tensorflow as tf
import numpy as np
import random
from collections import deque
# Hyper Parameters
GAMMA = 0.95  # discount factor
LEARNING_RATE = 0.01  # step size handed to the Adam optimizer

# Returns whose standard deviation is at or below this are centred but not scaled,
# so a constant-return episode cannot produce NaN/inf training targets.
_RETURN_STD_EPS = 1e-8


class Policy_Gradient():
    """Episode-based policy-gradient agent with a softmax policy network.

    The network has one hidden ReLU layer of 20 units and a softmax output
    over the discrete actions.  Transitions are buffered with
    ``store_transition`` and consumed by ``learn``, which performs a single
    optimizer step on the whole buffered episode.
    """

    def __init__(self, env):
        """Build the network for ``env`` and start a TF1-style session.

        Args:
            env: Environment exposing ``observation_space.shape`` and
                ``action_space.n``.
        """
        # init some parameters
        self.time_step = 0
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.n
        # Per-episode buffers: observations, actions and rewards.
        self.ep_obs, self.ep_as, self.ep_rs = [], [], []
        self.create_softmax_network()
        # Init session
        self.session = tf.compat.v1.InteractiveSession()
        self.session.run(tf.compat.v1.global_variables_initializer())

    def create_softmax_network(self):
        """Create the placeholders, the policy network, the loss and the train op."""
        # Placeholders and Session.run need graph mode, so eager execution is
        # switched off before any variable is created.
        tf.compat.v1.disable_eager_execution()
        # network weights
        W1 = self.weight_variable([self.state_dim, 20])
        b1 = self.bias_variable([20])
        W2 = self.weight_variable([20, self.action_dim])
        b2 = self.bias_variable([self.action_dim])
        # input layer
        self.state_input = tf.compat.v1.placeholder("float", [None, self.state_dim])
        self.tf_acts = tf.compat.v1.placeholder(tf.int32, [None, ], name="actions_num")
        self.tf_vt = tf.compat.v1.placeholder(tf.float32, [None, ], name="actions_value")
        # hidden layer
        h_layer = tf.nn.relu(tf.matmul(self.state_input, W1) + b1)
        # logits fed to the softmax
        self.softmax_input = tf.matmul(h_layer, W2) + b2
        # softmax output: one probability per action
        self.all_act_prob = tf.nn.softmax(self.softmax_input, name='act_prob')
        # Cross entropy against the action actually taken, i.e. -log pi(a|s).
        self.neg_log_prob = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=self.softmax_input, labels=self.tf_acts)
        # reward guided loss: -log pi(a|s) weighted by the fed-in return
        self.loss = tf.reduce_mean(self.neg_log_prob * self.tf_vt)
        self.train_op = tf.compat.v1.train.AdamOptimizer(LEARNING_RATE).minimize(self.loss)

    def weight_variable(self, shape):
        """Return a variable of ``shape`` initialised from a truncated normal."""
        initial = tf.compat.v1.truncated_normal(shape)
        return tf.Variable(initial)

    def bias_variable(self, shape):
        """Return a variable of ``shape`` with every entry initialised to 0.01."""
        initial = tf.constant(0.01, shape=shape)
        return tf.Variable(initial)

    def choose_action(self, observation):
        """Sample an action index from the policy's distribution for ``observation``.

        Args:
            observation: 1-D array holding a single state; a batch axis is
                added before it is fed to the network.

        Returns:
            The sampled action index.
        """
        prob_weights = self.session.run(
            self.all_act_prob,
            feed_dict={self.state_input: observation[np.newaxis, :]})
        # select action w.r.t the actions prob
        action = np.random.choice(prob_weights.shape[1], p=prob_weights.ravel())
        return action

    def store_transition(self, s, a, r):
        """Append one (state, action, reward) step to the episode buffers."""
        self.ep_obs.append(s)
        self.ep_as.append(a)
        self.ep_rs.append(r)

    @staticmethod
    def _discounted_returns(rewards, gamma):
        """Return the standardised discounted return for each step of an episode.

        Args:
            rewards: Sequence of per-step rewards in time order.
            gamma: Discount factor applied per step.

        Returns:
            A float64 array of ``len(rewards)`` discounted returns, shifted to
            zero mean and, unless they are (numerically) constant, scaled to
            unit standard deviation.
        """
        # Always accumulate in float: inheriting the dtype of an integer
        # reward list would truncate the discounted values.
        returns = np.zeros(len(rewards), dtype=np.float64)
        running_add = 0.0
        for t in reversed(range(len(rewards))):
            running_add = running_add * gamma + rewards[t]
            returns[t] = running_add
        returns -= returns.mean()
        std = returns.std()
        if std > _RETURN_STD_EPS:
            returns /= std
        return returns

    def learn(self):
        """Run one training step on the buffered episode, then clear the buffers.

        Does nothing when no transition has been stored since the last call.
        """
        if not self.ep_rs:
            return
        discounted_ep_rs = self._discounted_returns(self.ep_rs, GAMMA)
        # train on episode
        self.session.run(self.train_op, feed_dict={
            self.state_input: np.vstack(self.ep_obs),
            self.tf_acts: np.array(self.ep_as),
            self.tf_vt: discounted_ep_rs,
        })
        self.ep_obs, self.ep_as, self.ep_rs = [], [], []  # empty episode data
# Hyper Parameters
ENV_NAME = 'CartPole-v0'
EPISODE = 3000  # Episode limitation
STEP = 3000  # Step limitation in an episode
TEST = 10  # Number of evaluation episodes run every 100 training episodes


def _reset_env(env):
    """Reset ``env`` and return only the initial observation."""
    result = env.reset()
    # gym 0.26+ returns ``(observation, info)``; older releases return the
    # bare observation.
    if isinstance(result, tuple):
        return result[0]
    return result


def _step_env(env, action):
    """Advance ``env`` one step and return ``(observation, reward, done)``."""
    result = env.step(action)
    if len(result) == 5:
        # gym 0.26+ splits ``done`` into ``terminated`` and ``truncated``.
        observation, reward, terminated, truncated, _ = result
        return observation, reward, terminated or truncated
    observation, reward, done, _ = result
    return observation, reward, done


def _evaluate(env, agent):
    """Play ``TEST`` rendered episodes and return the average episode reward."""
    total_reward = 0
    for _ in range(TEST):
        state = _reset_env(env)
        for _ in range(STEP):
            env.render()
            # Evaluation still samples from the policy; it is not greedy.
            action = agent.choose_action(state)
            state, reward, done = _step_env(env, action)
            total_reward += reward
            if done:
                break
    return total_reward / TEST


def main():
    """Train the policy-gradient agent on ``ENV_NAME``, evaluating periodically.

    Each training episode is played with actions sampled from the policy and
    is followed by exactly one ``agent.learn()`` call.  Every 100th episode
    (starting with episode 0) the average reward over ``TEST`` evaluation
    episodes is printed.
    """
    # initialize OpenAI Gym env and the policy-gradient agent
    env = gym.make(ENV_NAME)
    try:
        agent = Policy_Gradient(env)
        for episode in range(EPISODE):
            # initialize task
            state = _reset_env(env)
            # Train
            for _ in range(STEP):
                action = agent.choose_action(state)  # sampled from the policy
                next_state, reward, done = _step_env(env, action)
                agent.store_transition(state, action, reward)
                state = next_state
                if done:
                    break
            # Learn after every episode, including one cut off by the STEP
            # limit; otherwise its transitions would leak into the next
            # episode's return computation.
            agent.learn()
            # Test every 100 episodes
            if episode % 100 == 0:
                ave_reward = _evaluate(env, agent)
                print('episode: ', episode, 'Evaluation Average Reward:', ave_reward)
    finally:
        # Release the environment (and any render window) even on error.
        env.close()


if __name__ == '__main__':
    main()
总结
在(蒙特卡罗)策略梯度中,每一步的价值(折扣回报)需要用完整的序列来计算,所以必须每运行完一个序列才能更新一次策略参数,学习效率相对较低;但相对于Q学习,策略梯度更适用于连续动作空间的环境。