Implementing the Q-learning algorithm in Python

For background on the Q-learning algorithm, refer to the blog post below. I am only reproducing the author's algorithm here; if you spot any mistakes, please message me and I will correct them.

A Painless Q-learning Tutorial (一个 Q-learning 算法的简明教程)_peghoty-CSDN博客
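For reference, the update performed by the code below is the simplified, learning-rate-free form used in that tutorial: when the agent in state s picks the action of moving to state a, the table entry is set to

Q(s, a) = R(s, a) + gamma * max(Q(a, a') over all actions a')

where R is the fixed reward matrix and gamma is the discount factor (0.8 by default in the class below).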

 

import numpy as np
import pandas as pd

class QL:
    def __init__(self, actions, gamma=0.8, e_greedy=0.9):
        self.actions = actions
        self.gamma = gamma
        self.e_greedy = e_greedy
        self.q_table = pd.DataFrame(columns=actions, dtype=np.float64)  # rows are states, columns are actions; in this simplified setup an action is also the next state

    def choose_action(self, state):
        self.check_state(state)
        if np.random.uniform(0, 1) < self.e_greedy:
            action_list = self.q_table.loc[state, :]  # Q-values for the current state
            # actions tied for the maximum value (there may be several, e.g. all zeros at the start)
            best_actions = action_list[action_list == action_list.max()].index
            action = np.random.choice(best_actions)   # break the tie randomly
        else:
            action = np.random.choice(self.actions)   # explore: pick a random action
        return action

    def learn(self, state_now, state_next, reward_value):
        # Simplified Q-learning update with no learning rate:
        #   Q(s, a) = R(s, a) + gamma * max_a' Q(s', a'), where the action a is also the next state s'
        state_next_list = self.q_table.loc[state_next, :]
        self.q_table.loc[state_now, state_next] = reward_value + self.gamma * state_next_list.max()


    def check_state(self, state):
        # Add an all-zero row for states that have not been visited yet.
        # (DataFrame.append was removed in pandas 2.0, so assign via .loc instead.)
        if state not in self.q_table.index:
            self.q_table.loc[state] = [0.0] * len(self.actions)



terminal = 5   # the exit is state 5
times = 1000   # number of training episodes
actions = np.array([0, 1, 2, 3, 4, 5])  # 6 actions, one per state

agent = QL(actions)
# Reward matrix R: R[s, a] = -1 means the move from state s to state a is not allowed,
# 0 means an allowed move, and 100 means a move that reaches the exit.
reward_rect = [[-1, -1, -1, -1,  0, -1],
               [-1, -1, -1,  0, -1, 100],
               [-1, -1, -1,  0, -1, -1],
               [-1,  0,  0, -1,  0, -1],
               [ 0, -1, -1,  0, -1, 100],
               [-1,  0, -1, -1,  0, 100]]
reward_rect = np.array(reward_rect)
for episode in range(times):
    state_now = np.random.choice(agent.actions)  # start each episode from a random state
    agent.check_state(state_now)                 # make sure this state has a row in the Q-table
    # `route` records the states visited this episode; to inspect it, lower `times`
    # to about 50 and uncomment the print at the end of the loop.
    route = [state_now]

    # start exploring
    while True:
        state_next = agent.choose_action(state_now)  # pick the next state
        agent.check_state(state_next)
        route.append(state_next)
        if reward_rect[state_now, state_next] == -1:
            break  # invalid move (no connection between the two states): end the episode
        agent.learn(state_now, state_next, reward_rect[state_now, state_next])
        if state_next == terminal:
            break  # reached the exit
        state_now = state_next
    # print("Route taken:", route)
    
print(agent.q_table)
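
Once training finishes, the greedy policy can be read straight from the Q-table. Below is a minimal sketch (not part of the original code) that follows the highest-valued action from each starting state until the exit; it assumes `agent` and `terminal` from the script above are still in scope.

# Minimal sketch: read the greedy route from every state to the exit out of the
# learned Q-table. Assumes `agent` and `terminal` from the script above exist.
for start in agent.actions:
    state = start
    path = [state]
    for _ in range(10):  # cap the path length as a safety net
        if state == terminal or state not in agent.q_table.index:
            break
        state = agent.q_table.loc[state, :].idxmax()  # greedy action = next state
        path.append(state)
    print("from", start, ":", path)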
