I wrote an Actor-Critic (AC) implementation for the inverted pendulum, following the AC algorithm together with what was covered earlier. Why does it perform so incredibly badly?!?! Please help me find the mistake QAQ
import gym

####################### Define the environment ########################
class MyWrapper(gym.Wrapper):
    def __init__(self):
        env = gym.make('Pendulum-v1', render_mode='rgb_array')
        super().__init__(env)
        self.env = env
        self.step_n = 0

    def reset(self):
        state, _ = self.env.reset()
        self.step_n = 0
        return state

    def step(self, action):
        state, reward, terminated, truncated, info = self.env.step(action)
        done = terminated or truncated
        # Cap each episode at 200 steps
        self.step_n += 1
        if self.step_n >= 200:
            done = True
        return state, reward, done, info

env = MyWrapper()
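A quick smoke test of the wrapper (a minimal sketch, assuming the cell above has run): Pendulum-v1 observations are 3-dimensional (cos θ, sin θ, angular velocity), and the action is a 1-element torque array in [-2, 2].

```python
# Sanity-check the wrapper: 3-dim observation, scalar reward, done flag
state = env.reset()
print(state.shape)                           # (3,)
state, reward, done, info = env.step([0.0])  # apply zero torque
print(reward, done)                          # reward is <= 0 for Pendulum; done is False on the first step
```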
#################### Render the game ######################################
from matplotlib import pyplot as plt
%matplotlib inline

# Display the current frame
def show():
    plt.imshow(env.render())
    plt.show()
################## Create the networks #################################
# Create two networks: Actor and Critic
import torch

# Actor: maps a 3-dim state to a distribution over 11 discrete actions
model_actor = torch.nn.Sequential(
    torch.nn.Linear(3, 128),
    torch.nn.ReLU(),
    torch.nn.Linear(128, 11),
    torch.nn.Softmax(dim=1),
)

# Critic: maps a 3-dim state to a scalar state value
model_critic = torch.nn.Sequential(
    torch.nn.Linear(3, 128),
    torch.nn.ReLU(),
    torch.nn.Linear(128, 1),
)
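As a quick check (a small sketch, not from the original post), the actor should output a valid probability distribution over the 11 discrete actions:

```python
# The 11 action probabilities should be non-negative and sum to 1
with torch.no_grad():
    p = model_actor(torch.randn(1, 3))
print(p.shape, p.sum().item())  # torch.Size([1, 11]), ~1.0
```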
#################### Sample an action ##################################
import random

def get_action(state):
    state = torch.FloatTensor(state).reshape(1, 3)
    prob = model_actor(state)
    # Sample one of the 11 discrete actions according to the policy
    action = random.choices(range(11), weights=prob[0].tolist(), k=1)[0]
    # Map the discrete action index 0..10 to a torque in [-2, 2]
    action_continuous = action / 10 * 4 - 2
    return action, action_continuous
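The index-to-torque mapping can be checked directly: index 0 should give the minimum torque -2, index 5 the midpoint 0, and index 10 the maximum 2 (a small sanity check of the mapping above):

```python
# Discrete index -> continuous torque: i / 10 * 4 - 2
for i in (0, 5, 10):
    print(i, i / 10 * 4 - 2)  # -> -2.0, 0.0, 2.0
```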
######################## Collect data ##################################
def get_data():
    states = []
    rewards = []
    actions = []
    next_states = []
    overs = []

    # Start a new episode
    state = env.reset()
    over = False
    while not over:
        action, action_continuous = get_action(state)
        # print(f'action: {action_continuous}')
        next_state, reward, over, _ = env.step([action_continuous])

        states.append(state)
        rewards.append(reward)
        actions.append(action)
        next_states.append(next_state)
        overs.append(over)

        state = next_state

    # Stack the transitions into tensors
    states = torch.FloatTensor(states).reshape(-1, 3)
    rewards = torch.FloatTensor(rewards).reshape(-1, 1)
    actions = torch.LongTensor(actions).reshape(-1, 1)
    next_states = torch.FloatTensor(next_states).reshape(-1, 3)
    overs = torch.LongTensor(overs).reshape(-1, 1)

    return states, rewards, actions, next_states, overs
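Since the wrapper caps each episode at 200 steps, the tensors returned by get_data should have a leading dimension of 200 (a quick shape check, assuming the code above):

```python
# Collect one episode and inspect the tensor shapes
states, rewards, actions, next_states, overs = get_data()
print(states.shape, rewards.shape, actions.shape, next_states.shape, overs.shape)
# -> torch.Size([200, 3]) torch.Size([200, 1]) torch.Size([200, 1]) torch.Size([200, 3]) torch.Size([200, 1])
```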
######################### Evaluate the policy #########################################
from IPython import display

def test(play):
    # Start a new episode
    state = env.reset()

    # Accumulated reward over one episode; higher is better
    reward_sum = 0

    # Play until the episode ends
    over = False
    while not over:
        # Pick an action for the current state
        action, action_continuous = get_action(state)

        # Step the environment and collect the reward
        state, reward, over, _ = env.step([action_continuous])
        reward_sum += reward

        # Render an animation frame
        if play:
            display.clear_output(wait=True)
            show()

    return reward_sum
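To watch the learned policy after training (assuming a notebook environment, since show() relies on inline matplotlib rendering):

```python
# Render one episode and print its total reward
print(test(play=True))
```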
########################### Training loop ##################################
def train():
    optimizer_actor = torch.optim.Adam(model_actor.parameters(), lr=2e-3)
    optimizer_critic = torch.optim.Adam(model_critic.parameters(), lr=1e-2)
    loss_fn = torch.nn.MSELoss()

    for i in range(2000):
        states, rewards, actions, next_states, overs = get_data()

        # Current state values and one-step TD targets
        values = model_critic(states)
        targets = model_critic(next_states) * 0.98
        targets *= 1 - overs
        targets += rewards

        # One-step TD error (advantage): delta = r + 0.98 * V(s') - V(s).
        # The order must be targets - values; reversing it flips the sign of
        # the policy gradient, so the actor reinforces bad actions.
        delta = (targets - values).detach()

        # Policy-gradient loss for the actor
        probs = model_actor(states)
        probs = probs.gather(dim=1, index=actions)
        loss = (-probs.log() * delta).mean()

        # Regression loss for the critic
        loss_critic = loss_fn(values, targets.detach())

        optimizer_actor.zero_grad()
        loss.backward()
        optimizer_actor.step()

        optimizer_critic.zero_grad()
        loss_critic.backward()
        optimizer_critic.step()

        if i % 100 == 0:
            test_result = sum([test(play=False) for _ in range(10)]) / 10
            print(f"epoch:{i}, score:{test_result}")

train()
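For reference, this is the one-step Actor-Critic update the loop is meant to implement, with discount γ = 0.98. The sign of the TD error δ matters: the actor must increase the log-probability of actions with positive δ.

$$y_t = r_t + \gamma\,(1 - d_t)\,V(s_{t+1}), \qquad \delta_t = y_t - V(s_t)$$

$$\mathcal{L}_{\text{actor}} = -\log \pi(a_t \mid s_t)\,\delta_t, \qquad \mathcal{L}_{\text{critic}} = \big(V(s_t) - y_t\big)^2$$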

[13AC倒立摆.pdf](url)