import numpy as np
from collections import defaultdict


class SARSA():
    def __init__(self, env, num_episodes, discount=1.0, alpha=0.5, epsilon=0.1, n_bins=10):
        self.nA = env.action_space.n                      # number of actions
        self.nS = env.observation_space.shape[0]          # dimension of the (continuous) state space
        self.env = env                                    # environment
        self.num_episodes = num_episodes                  # number of training episodes
        self.epsilon = epsilon                            # exploration rate of the ε-greedy policy
        self.discount = discount                          # discount factor
        self.alpha = alpha                                # learning rate for the TD update
        self.n_bins = n_bins                              # bins per state dimension for discretization
        self.Q = defaultdict(lambda: np.zeros(self.nA))   # action-value function Q(s, a)
    def __epsilon_greedy_policy(self, epsilon, nA):        # ε-greedy policy
        def policy(state):
            A = np.ones(nA, dtype=float) * epsilon / nA    # every action gets probability ε / |A|
            best_action = np.argmax(self.Q[state])         # greedy action w.r.t. the current Q
            A[best_action] += (1 - epsilon)                 # greedy action gets the remaining 1 - ε
            return A
        return policy

    def __next_action(self, prob):                          # sample an action from the given probabilities
        return np.random.choice(np.arange(len(prob)), p=prob)
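
    # NOTE: the original listing calls self.__get_bins_states but does not show it.
    # Below is a minimal sketch, assuming a CartPole-style environment whose four
    # observation dimensions are clipped and discretized into self.n_bins bins each;
    # the per-dimension ranges are illustrative assumptions, not values from the source.
    def __get_bins_states(self, state):
        bounds = [(-2.4, 2.4), (-3.0, 3.0), (-0.21, 0.21), (-3.5, 3.5)]   # assumed ranges
        discretized = []
        for value, (low, high) in zip(state, bounds):
            cuts = np.linspace(low, high, self.n_bins + 1)[1:-1]          # n_bins - 1 interior cut points
            discretized.append(int(np.digitize(value, cuts)))             # bin index in [0, n_bins - 1]
        return tuple(discretized)                                         # hashable key for the Q defaultdict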
    def sarsa(self):
        policy = self.__epsilon_greedy_policy(self.epsilon, self.nA)   # build the behaviour policy
        sumlist = []                                                    # episode lengths, for monitoring
        for i_episode in range(self.num_episodes):                      # loop over episodes
            step = 0
            state__ = self.env.reset()                                  # initial observation
            state = self.__get_bins_states(state__)                     # discretize the continuous state
            prob_actions = policy(state)                                # action probabilities for this state
            action = self.__next_action(prob_actions)                   # choose the first action
            while True:
                next_state__, reward, done, info = self.env.step(action)   # next observation, reward, done flag
                next_state = self.__get_bins_states(next_state__)
                prob_next_actions = policy(next_state)                     # action probabilities for the next state
                next_action = self.__next_action(prob_next_actions)        # choose the next action
                step += 1
                if done:
                    reward = -200                                          # penalize termination before the update
                # temporal-difference (SARSA) update
                td_target = reward + self.discount * self.Q[next_state][next_action]
                td_delta = td_target - self.Q[state][action]
                self.Q[state][action] += self.alpha * td_delta
                if done:
                    sumlist.append(step)                                   # record how long the episode lasted
                    break
                else:
                    state = next_state
                    action = next_action
        return self.Q
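

# Minimal usage sketch (assumption: a classic OpenAI Gym CartPole-v0 environment with
# the old 4-tuple step API, matching the calls above; the environment name and episode
# count are illustrative, not taken from the source).
if __name__ == "__main__":
    import gym
    env = gym.make("CartPole-v0")
    agent = SARSA(env, num_episodes=500)
    Q = agent.sarsa()
    print("learned Q-values for", len(Q), "discretized states")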