Source code for qualia2.rl.envs.classic_control

# -*- coding: utf-8 -*- 
from ..core import Env, Tensor

class CartPole(Env):
    ''' CartPole\n
    A pole is attached by an un-actuated joint to a cart, which moves along a
    frictionless track. The pendulum starts upright, and the goal is to prevent
    it from falling over by increasing and reducing the cart's velocity.

    Observation:
        Type: Box(4)
        Num Observation             Min     Max
        0   Cart Position           -4.8    4.8
        1   Cart Velocity           -Inf    Inf
        2   Pole Angle              -24 deg 24 deg
        3   Pole Velocity At Tip    -Inf    Inf

    Actions:
        Type: Discrete(2)
        Num Action
        0   Push cart to the left
        1   Push cart to the right

    Reward:
        0 for each step
        -1 if the termination condition is met before max_steps-5
        1 if the termination condition is met after max_steps-5
        (Note: the original reward of the gym environment is not used)

    Reference:
        https://github.com/openai/gym/wiki/CartPole-v0
    '''
    def __init__(self):
        super().__init__('CartPole-v0')

    def step(self, action):
        self.steps += 1
        next_state, reward, done, info = self.env.step(action)
        return self.state_transformer(next_state), self.reward_transformer(reward, done), done, info

    def reward_transformer(self, reward, done):
        # clip rewards
        if done:
            if self.steps < self.max_steps-5:
                return Tensor(-1.0)
            else:
                return Tensor(1.0)
        else:
            return Tensor(0.0)
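
# A minimal usage sketch for the reward shaping above (illustrative only; it
# assumes the Env base class exposes gym-style reset()/step() and a max_steps
# attribute, as the methods above suggest). The helper name below is
# hypothetical and not part of this module:
def _cartpole_reward_demo():
    env = CartPole()
    state = env.reset()
    done = False
    while not done:
        action = 0  # placeholder policy: always push the cart to the left
        state, reward, done, info = env.step(action)
    # At termination, reward is Tensor(-1.0) if the pole fell before
    # max_steps-5 and Tensor(1.0) if the episode lasted to the step limit.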

class MountainCar(Env):
    ''' MountainCar\n
    Get an underpowered car to the top of a hill (top = 0.5 position)

    Observation:
        Type: Box(2)
        Num Observation Min     Max
        0   position    -1.2    0.6
        1   velocity    -0.07   0.07

    Actions:
        Type: Discrete(3)
        Num Action
        0   push left
        1   no push
        2   push right

    Reward:
        -1 for each step
        (Note: the original reward of the gym environment is replaced by
        reward_transformer below)

    Reference:
        https://github.com/openai/gym/wiki/MountainCar-v0
    '''
    def __init__(self):
        super().__init__('MountainCar-v0')

    def step(self, action):
        self.steps += 1
        next_state, reward, done, info = self.env.step(action)
        return self.state_transformer(next_state), self.reward_transformer(reward, done), done, info

    def reward_transformer(self, reward, done):
        if done:
            return Tensor(1 if self.steps < self.max_steps else -1)
        else:
            return Tensor(0)

class MountainCarContinuous(Env):
    ''' MountainCarContinuous\n
    Get an underpowered car to the top of a hill (top = 0.5 position)

    Observation:
        Type: Box(2)
        Num Observation Min     Max
        0   position    -1.2    0.6
        1   velocity    -0.07   0.07

    Actions:
        Type: Box(1)
        Num Action
        0   Push car to the left (negative value) or to the right (positive value)

    Reward:
        Reward is 100 for reaching the target of the hill on the right hand
        side, minus the squared sum of actions from start to goal.

    Reference:
        https://github.com/openai/gym/wiki/MountainCarContinuous-v0
    '''
    def __init__(self):
        super().__init__('MountainCarContinuous-v0')

    def step(self, action):
        self.steps += 1
        next_state, reward, done, info = self.env.step(action)
        return self.state_transformer(next_state), self.reward_transformer(reward, done), done, info

    def reward_transformer(self, reward, done):
        if done:
            return Tensor(1 if self.steps < self.max_steps else -1)
        else:
            return Tensor(0)
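
# Unlike MountainCar's Discrete(3) actions, MountainCarContinuous takes a
# Box(1) action: a single float whose sign picks the push direction and whose
# magnitude scales the force. A hedged sketch (it assumes the wrapped gym env
# accepts a one-element list for a Box(1) space, as gym conventionally does;
# the helper name is hypothetical):
def _continuous_action_demo():
    env = MountainCarContinuous()
    state = env.reset()
    state, reward, done, info = env.step([0.5])   # push right at half strength
    state, reward, done, info = env.step([-1.0])  # push left at full strength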

class Pendulum(Env):
    ''' Pendulum\n
    Try to keep a frictionless pendulum standing up.

    Observation:
        Type: Box(3)
        Num Observation Min     Max
        0   cos(theta)  -1.0    1.0
        1   sin(theta)  -1.0    1.0
        2   theta dot   -8.0    8.0

    Actions:
        Type: Box(1)
        Num Action          Min     Max
        0   Joint effort    -2.0    2.0

    Reward:
        The precise equation for reward:
            -(theta^2 + 0.1*theta_dt^2 + 0.001*action^2)
        Theta is normalized between -pi and pi. Therefore, the lowest cost is
        -(pi^2 + 0.1*8^2 + 0.001*2^2) = -16.2736044, and the highest cost is 0.
        In essence, the goal is to remain at zero angle (vertical), with the
        least rotational velocity and the least effort.

    Starting State:
        Random angle from -pi to pi, and random velocity between -1 and 1
    '''
    def __init__(self):
        super().__init__('Pendulum-v0')
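
# Sanity check of the cost bound quoted in the Pendulum docstring: with
# |theta| <= pi, |theta_dt| <= 8 and |action| <= 2, the worst-case one-step
# cost works out to -(pi^2 + 0.1*8^2 + 0.001*2^2) ~= -16.2736044
# (illustrative snippet, not part of the original module):
def _pendulum_cost_bound():
    import math
    lowest = -(math.pi**2 + 0.1 * 8**2 + 0.001 * 2**2)
    assert abs(lowest - (-16.2736044)) < 1e-6
    return lowest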

class Acrobot(Env):
    ''' Acrobot\n
    The acrobot system includes two joints and two links, where the joint
    between the two links is actuated.
    '''
    def __init__(self):
        super().__init__('Acrobot-v1')