First, the Maze class from the previous post needs a way to generate sample trajectories. A simple scheme is used here: at every step an action is drawn uniformly at random, i.e., episodes are rolled out under a uniform random policy. (Strictly speaking, this is plain on-policy Monte Carlo sampling, not importance sampling; importance sampling would reweight trajectories drawn from one policy in order to evaluate a different one.)
# Continuing the Maze class from the previous post
# (assumes `import random` at the top of env_maze.py)
# Generate `num` episodes by rolling out a uniform random policy
def gen_randompi_sample(self, num):
    state_sample = []
    action_sample = []
    reward_sample = []
    for _ in range(num):
        s_tmp = []
        a_tmp = []
        r_tmp = []
        # start each episode from a uniformly random state
        s = self.states[int(random.random() * len(self.states))]
        t = False
        while not t:
            # choose one of the legal actions uniformly at random
            actions = self.state_action(s)
            a = actions[int(random.random() * len(actions))]
            # the first value returned by transform is not used here
            _, s1, r = self.transform(s, a)
            s_tmp.append(s)
            r_tmp.append(r)
            a_tmp.append(a)
            s = s1
            t = self.is_done(s1)
        state_sample.append(s_tmp)
        reward_sample.append(r_tmp)
        action_sample.append(a_tmp)
    return state_sample, action_sample, reward_sample
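The method above relies on four members of Maze that were defined in the previous post but are not reproduced here: states, state_action, transform, and is_done. For readers following along, below is a minimal, hypothetical sketch of that interface on a 2x2 grid; every name, the grid layout, the rewards, and the (flag, next_state, reward) return convention of transform are assumptions for illustration, not the original implementation.

class MazeSketch:
    # Hypothetical stand-in for the Maze environment used above.
    def __init__(self):
        # states are (x, y) cells on a 2x2 grid; (1, 1) is the goal
        self.states = [(0, 0), (0, 1), (1, 0), (1, 1)]

    def state_action(self, s):
        # legal moves from s: the four directions that stay on the grid
        moves = [(1, 0), (-1, 0), (0, 1), (0, -1)]
        return [m for m in moves
                if 0 <= s[0] + m[0] <= 1 and 0 <= s[1] + m[1] <= 1]

    def transform(self, s, a):
        # returns (flag, next_state, reward), matching the unpacking above
        s1 = (s[0] + a[0], s[1] + a[1])
        r = 1.0 if self.is_done(s1) else -0.1
        return self.is_done(s1), s1, r

    def is_done(self, s):
        return s == (1, 1)

Grafting gen_randompi_sample onto this class (together with import random) yields runnable episodes for testing the MC code below.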
Implementing and running the MC method
from env_maze import Maze

class MC:
    # Every-visit Monte Carlo policy evaluation over a batch of sampled episodes.
    #   gamma         : discount factor
    #   state_sample  : list of episodes, each a list of states
    #   action_sample : list of episodes, each a list of actions (unused here)
    #   reward_sample : list of episodes, each a list of rewards
    #   mdp_states    : all states of the MDP
    def mc(self, gamma, state_sample, action_sample, reward_sample, mdp_states):
        # initialize accumulated return v and visit count n to 0 for every state
        vfunc = dict()
        nfunc = dict()
        for s in mdp_states:
            vfunc[self.encode_state(s)] = 0.0
            nfunc[self.encode_state(s)] = 0.0
        # accumulate returns episode by episode
        for iter1 in range(len(state_sample)):
            # backward pass: build G, the discounted return from step 0
            G = 0.0
            for step in range(len(state_sample[iter1]) - 1, -1, -1):
                G *= gamma
                G += reward_sample[iter1][step]
            # forward pass: peel off one reward at a time so that G is always
            # the return from the current step; every visit of a state
            # contributes its return (every-visit MC)
            for step in range(len(state_sample[iter1])):
                s = state_sample[iter1][step]
                vfunc[self.encode_state(s)] += G
                nfunc[self.encode_state(s)] += 1.0
                G -= reward_sample[iter1][step]
                G /= gamma
        # average the accumulated returns over the visit counts
        for s in mdp_states:
            if nfunc[self.encode_state(s)] > 0.000001:
                vfunc[self.encode_state(s)] /= nfunc[self.encode_state(s)]
        return vfunc

    # encode a state (x, y) as a string key such as "3_4"
    def encode_state(self, state):
        return "%d_%d" % (state[0], state[1])

if __name__ == "__main__":
    env = Maze()
    worker = MC()
    state_sample, action_sample, reward_sample = env.gen_randompi_sample(10000)
    vfunc = worker.mc(0.2, state_sample, action_sample, reward_sample, env.states)
    print(vfunc)
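To make the two passes inside mc explicit: for an episode of length T with rewards r_0, ..., r_{T-1}, the backward loop builds the discounted return from step 0, and the forward loop recovers the return from every subsequent step by inverting the recurrence, which is valid because gamma > 0 here (gamma = 0.2). Averaging the accumulated returns over the visit counts N(s) gives the every-visit Monte Carlo estimate:

G_t = r_t + \gamma G_{t+1}, \qquad G_{T-1} = r_{T-1}
\quad\Longrightarrow\quad G_{t+1} = \frac{G_t - r_t}{\gamma}

V(s) \approx \frac{1}{N(s)} \sum_{i,\,t \,:\, s_t^{(i)} = s} G_t^{(i)}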
(To be continued: next we will try sampling with the Metropolis-Hastings algorithm.)