一个最基本的深度强化学习训练流程(pipeline)应该是这样的:
- 初始化环境、网络、经验池
- 在环境中探索,并把数据存入经验池
- 从经验池中取出数据,更新网络参数
- 对训练得到的策略进行评估,然后循环执行第 2、3、4 步(探索、更新、评估)
# initialization: environment, policy network, value network, replay buffer
env = BuildEnv()
actor = PolicyNetwork()
critic = ValueNetwork()
# NOTE(review): `ExperimenceRelayBuffer` looks like a typo of
# "ExperienceReplayBuffer" -- confirm against the class definition.
buffer = ExperimenceRelayBuffer()

# training loop: each iteration explores one episode, then updates the networks
for i in range(training_episode):
    # explore in env: roll out one episode of at most max_step steps
    state = env.reset()
    for _ in range(max_step):
        # `action` was previously passed to env.step() without ever being
        # assigned; obtain it from the current policy.
        action = actor(state)
        next_state, reward, done, info_dict = env.step(action)
        # Store the action as well: the critic below is evaluated as
        # critic(state, action), so the update step needs it.
        buffer.append((state, action, reward, done, next_state))  # transition
        state = next_state
        if done:
            break

    # update network parameters
    for _ in range(...):  # number of update steps is left unspecified
        batch_data = buffer.random_sample()
        Q_label = ...  # label fed to the critic loss; computation left unspecified
        # NOTE(review): `cirterion` looks like a typo of "criterion" -- confirm
        # against its definition before renaming.
        critic_object = critic_loss = cirterion(Q_label, critic(...))  # loss function
        # NOTE(review): `state` here is the last exploration state, not data
        # drawn from `batch_data` -- presumably the sampled states are intended.
        actor_object = Q_value_est = critic(state, actor(...))  # Q value estimation