import numpy as np
from collections import defaultdict


class SARSA():
    def __init__(self, env, num_episodes, discount=1.0, alpha=0.5, epsilon=0.1, n_bins=10):
        self.nA = env.action_space.n  # number of discrete actions
        self.nS = env.observation_space.shape[0]  # dimensionality of the observation space
        self.env = env  # environment
        self.num_episodes = num_episodes  # number of training episodes
        self.epsilon = epsilon  # exploration rate for the epsilon-greedy policy
        self.discount = discount  # discount factor
        self.alpha = alpha  # learning rate for the temporal-difference update
        self.n_bins = n_bins  # number of buckets per observation dimension used for discretization
        self.Q = defaultdict(lambda: np.zeros(self.nA))  # action-value function

    def __epsilon_greedy_policy(self, epsilon, nA):  # epsilon-greedy policy
        def policy(state):
            A = np.ones(nA, dtype=float) * epsilon / nA  # every action gets a base probability of epsilon / nA
            best_action = np.argmax(self.Q[state])
            A[best_action] += (1 - epsilon)  # the greedy action receives the remaining probability mass
            return A
        return policy

    def __next_action(self, prob):  # sample an action from the given probability distribution
        return np.random.choice(np.arange(len(prob)), p=prob)

    def sarsa(self):
        policy = self.__epsilon_greedy_policy(self.epsilon, self.nA)  # build the epsilon-greedy policy
        sumlist = []
        for i_episode in range(self.num_episodes):  # run the training episodes
            step = 0
            state__ = self.env.reset()  # initial observation
            state = self.__get_bins_states(state__)  # discretize the continuous observation (approximates the continuous state space with a discrete one)
            prob_actions = policy(state)  # action probabilities for the current state
            action = self.__next_action(prob_actions)  # sample the first action
            while True:
                next_state__, reward, done, info = self.env.step(action)  # take a step: next observation, reward, done flag
                next_state = self.__get_bins_states(next_state__)
                if done:
                    reward = -200  # penalize termination before the update so the penalty actually enters Q
                prob_next_actions = policy(next_state)  # action probabilities for the next state
                next_action = self.__next_action(prob_next_actions)  # sample the next action
                # temporal-difference (SARSA) update: Q(s,a) += alpha * (r + gamma * Q(s',a') - Q(s,a))
                td_target = reward + self.discount * self.Q[next_state][next_action]
                td_delta = td_target - self.Q[state][action]
                self.Q[state][action] += self.alpha * td_delta
                if done:
                    break
                state = next_state
                action = next_action
        return self.Q
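
The loop above calls self.__get_bins_states, but that method is not included in the snippet. Below is a minimal sketch of what it might look like, to be added inside the SARSA class: it assumes a CartPole-style 4-dimensional observation, hand-picked value ranges, and self.n_bins buckets per dimension mapped with np.digitize. The bounds and the bucketing scheme are assumptions, not the original author's code.

    def __get_bins_states(self, state):
        # ASSUMED implementation -- the original method is not shown in the snippet.
        # Map each continuous observation dimension to one of self.n_bins buckets and
        # return the bucket indices as a tuple, so the result can be used as a key into self.Q.
        bounds = [(-2.4, 2.4), (-3.0, 3.0), (-0.21, 0.21), (-3.5, 3.5)]  # rough CartPole-style ranges (assumed)
        indices = []
        for value, (low, high) in zip(state, bounds):
            edges = np.linspace(low, high, self.n_bins - 1)  # n_bins buckets need n_bins - 1 inner edges
            indices.append(int(np.digitize(value, edges)))
        return tuple(indices)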
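A minimal usage sketch, assuming the classic gym CartPole-v0 environment and the old 4-tuple env.step API that the code above relies on (gym < 0.26); the environment name and episode count are illustrative.

import gym

env = gym.make('CartPole-v0')         # assumed environment with a continuous, low-dimensional observation
agent = SARSA(env, num_episodes=500)  # illustrative episode count
Q = agent.sarsa()                     # learned action-value table keyed by discretized state tuples
print(len(Q), 'distinct discretized states visited')
env.close()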