baselines_test.py

###
### V2
###
import gym
import numpy as np
import torch  # used only by the commented-out state-dict loading below
from gym.envs import box2d  # makes gym.envs.box2d.BipedalWalker accessible
from stable_baselines3 import SAC, PPO
from envs import HumanoidEnv, ANYMalStandupEnv

class BipedalWalkerEnv(gym.envs.box2d.BipedalWalker):
    """BipedalWalker with velocity tracking, a step cap, and an optional
    'cheat' mode that keeps the lidar readings in the observation."""

    def __init__(self, config):
        self.velocities = []
        self.do_render = config.get("render", False)
        self.cheat = config.get("cheat", False)
        super().__init__()
        # Set after the parent init; takes effect when terrain is
        # regenerated on the next reset().
        self.hardcore = config.get("hardcore", False)
        self.time = 0
        # Clip lidar for the non-cheating agent: keep only the 14
        # proprioceptive entries of the 24-dim observation.
        if not self.cheat:
            high = np.array([np.inf] * 14)
            self.observation_space = gym.spaces.Box(-high, high, dtype=np.float32)
    def step(self, a):
        ob, reward, done, info = super().step(a)
        vel = self.hull.linearVelocity.x
        self.velocities.append(vel)
        info.update(dict(avg_speed=np.mean(self.velocities),
                         checkpoints=-1))
        if self.do_render:
            self.render()
        self.time += 1
        if self.time > 2500:  # hard cap on episode length
            done = True
        # Clip lidar for the non-cheating agent.
        if not self.cheat:
            ob = ob[:14]
        return ob, reward, done, info
    def reset(self):
        self.time = 0
        self.velocities = []
        return super().reset()
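
# Optional sanity check for the wrapper -- a minimal sketch, assuming the
# standard stable_baselines3 env checker is available; the config values
# here are illustrative only and not tied to the experiment below.
# from stable_baselines3.common.env_checker import check_env
# check_env(BipedalWalkerEnv({"cheat": True, "render": False}), warn=True)
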
model_path = "pretrained/BipedalWalkerHardcore_pretrained.zip"
experiment_conf = {"render": True,
                   "terrain": "flat",
                   # "cheat": "cheat" in model_path,
                   "cheat": True,
                   "perturb_magnitude": 1,
                   "hardcore": True,
                   "desc": ""}

# env = ANYMalStandupEnv(experiment_conf)
env = BipedalWalkerEnv(experiment_conf)
# env = HumanoidEnv(experiment_conf)
# env = HumanoidEnvGym(experiment_conf)
env.render()
# env = InvertedPendulumEnvR(experiment_conf)

model = SAC.load(model_path)
# torch.save(model_teacher.policy.state_dict(), "models/baselines/BipedalWalkerHardcore-statedict.th")
# model_teacher = SAC("MlpPolicy", env, policy_kwargs=dict(net_arch=[400, 300]))
# model_teacher.policy.load_state_dict(torch.load("BipedalWalkerHardcore-statedict.th"))

scores = []
for ep in range(100):
    obs = env.reset()
    score = 0
    for t in range(2000):  # per-episode step budget
        # predict() samples stochastically by default; pass
        # deterministic=True for a greedy evaluation.
        action, _states = model.predict(obs)
        # print(action)
        # action = np.array([np.sin(t / 50)] * 12)
        # action = env.action_space.sample() * 0
        obs, reward, done, info = env.step(action)
        # print()
        # print("direction_reward", info["direction_reward"])
        # print("speed_reward", info["speed_reward"])
        score += reward
        env.render()
        if done:
            break
    scores.append(score)
    print("SCORE", score)
print("avg", np.mean(scores))
# pendulum cheat: avg 233.75
# pendulum: avg 233.75
# pendulum cheat 2x perturb: avg 102.83
# pendulum 2x perturb: avg 141.27 +