import math

import gym
import numpy as np
from gym import spaces, logger
from gym.utils import seeding


def semi_circle_reward(x, saturated_value, reward_max=1):
    """Return the maximum reward when x = 0. The reward decreases in the shape of a
    semi-circle until it reaches 0 at abs(x) = saturated_value; for
    abs(x) > saturated_value the reward stays at 0."""
    x_normalized = min(abs(x / saturated_value), 1)
    return reward_max * math.sqrt(1 - x_normalized ** 2)


class ContinuousCartPoleEnv(gym.Env):

    def __init__(self):
        self.gravity = 9.8
        self.masscart = 1.0
        self.masspole = 0.1
        self.total_mass = self.masspole + self.masscart
        self.length = 0.5  # actually half the pole's length
        self.polemass_length = self.masspole * self.length
        self.force_mag = 30.0
        self.tau = 0.02  # seconds between state updates
        self.min_action = -1.0
        self.max_action = 1.0
        self.x_goal = 0.0  # initial goal location

        # Angle at which to fail the episode
        self.theta_threshold_radians = 12 * 2 * math.pi / 360
        self.x_threshold = 2.4

        # Angle limit set to 2 * theta_threshold_radians so failing observation
        # is still within bounds. The fifth entry bounds the goal position,
        # assumed to stay within the track limits, so that the 5-element state
        # fits inside the observation space.
        high = np.array([
            self.x_threshold * 2,
            np.finfo(np.float32).max,
            self.theta_threshold_radians * 2,
            np.finfo(np.float32).max,
            self.x_threshold * 2])

        self.action_space = spaces.Box(
            low=self.min_action,
            high=self.max_action,
            shape=(1,),
            dtype=np.float32)
        self.observation_space = spaces.Box(-high, high, dtype=np.float32)

        self.seed()
        self.steps_beyond_done = None

    def seed(self, seed=None):
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def stepPhysics(self, force):
        x, x_dot, theta, theta_dot, x_goal = self.state
        costheta = math.cos(theta)
        sintheta = math.sin(theta)
        temp = (force + self.polemass_length * theta_dot * theta_dot * sintheta) / self.total_mass
        thetaacc = (self.gravity * sintheta - costheta * temp) / \
            (self.length * (4.0 / 3.0 - self.masspole * costheta * costheta / self.total_mass))
        xacc = temp - self.polemass_length * thetaacc * costheta / self.total_mass
        x = x + self.tau * x_dot
        x_dot = x_dot + self.tau * xacc
        theta = theta + self.tau * theta_dot
        theta_dot = theta_dot + self.tau * thetaacc
        # The goal position is not affected by the dynamics; carry it through.
        return (x, x_dot, theta, theta_dot, self.x_goal)

    def step(self, action):
        assert self.action_space.contains(action), \
            "%r (%s) invalid" % (action, type(action))
        # Cast action to float to strip np trappings
        force = self.force_mag * float(action)
        self.state = self.stepPhysics(force)
        x, x_dot, theta, theta_dot, x_goal = self.state

        done = x < -self.x_threshold \
            or x > self.x_threshold \
            or theta < -self.theta_threshold_radians \
            or theta > self.theta_threshold_radians
        done = bool(done)

        # reward = int(abs(x - x_goal) < 0.1)
        reward = 0
        # reward += semi_circle_reward((x - x_goal), saturated_value=0.5, reward_max=10)
        # (gives a reward of 10 if x = x_goal and no reward once abs(x - x_goal) > 0.5)
        reward += (abs(x - x_goal) < 0.1) * 10
        reward += semi_circle_reward(x_dot, saturated_value=0.5, reward_max=1)
        reward += semi_circle_reward(theta, saturated_value=0.5, reward_max=1)
        reward += semi_circle_reward(theta_dot, saturated_value=0.5, reward_max=1)

        if done:
            if self.steps_beyond_done is None:
                # Pole just fell!
                self.steps_beyond_done = 0
                reward = 0.0
            else:
                if self.steps_beyond_done == 0:
                    logger.warn(
                        "You are calling 'step()' even though this environment "
                        "has already returned done = True. You should always call "
                        "'reset()' once you receive 'done = True' -- any further "
                        "steps are undefined behavior.")
                self.steps_beyond_done += 1
                reward = 0.0

        return np.array(self.state), reward, done, {}

    def reset(self):
        self.state = self.np_random.uniform(low=-0.05, high=0.05, size=(5,))
        # Keep the goal component of the state consistent with self.x_goal.
        self.state[4] = self.x_goal
        self.steps_beyond_done = None
        return np.array(self.state)
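

# --- Minimal usage sketch (an illustrative addition, not part of the original
# environment definition). It shows how the environment above is typically
# driven: reset once, then step with random continuous actions sampled from
# the action space until the episode terminates, accumulating the reward.
if __name__ == "__main__":
    env = ContinuousCartPoleEnv()
    obs = env.reset()
    total_reward = 0.0
    done = False
    while not done:
        action = env.action_space.sample()  # random force command in [-1, 1]
        obs, reward, done, info = env.step(action)
        total_reward += reward
    print("Episode finished with total reward: %.2f" % total_reward)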