Added multi cartpole example #48

Open
wants to merge 20 commits into dev
3 changes: 3 additions & 0 deletions .gitignore
@@ -102,3 +102,6 @@ venv.bak/

# mypy
.mypy_cache/

# logs
examples/evolutionary/cartpole_evolutionary/controllers/supervisor_manager/wandb/*
27 changes: 27 additions & 0 deletions examples/multi_cartpole/README.md
@@ -0,0 +1,27 @@
# Multi Cartpole

This example is a multi-agent variant of the classic [Cartpole](https://www.gymlibrary.ml/environments/classic_control/cart_pole/) Reinforcement Learning problem, implemented in [Webots](https://cyberbotics.com/).

## Environment Details

* The environment consists of 9 cartpole robots placed in a 3D grid world.
* Each robot works independently of the others.
* A reward of +1 is received for each time step in which **all** carts manage to keep their poles upright.
* An episode ends as soon as any one of the carts loses the balance of its pole.
* The environment is considered solved when the average score of the 100 most recent episodes exceeds 195.0 (see the sketch below).
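
For reference, the rules above could be checked on the supervisor side roughly as follows (a minimal sketch; the function names, the 15° pole-angle threshold, and the episode-count check are illustrative assumptions, not necessarily this PR's actual implementation):

```python
import numpy as np

POLE_ANGLE_LIMIT = 0.2618  # ~15 degrees in radians (assumed failure threshold)

def get_reward(pole_angles):
    # +1 only for steps in which *all* nine poles are still within the limit
    return 1.0 if all(abs(angle) < POLE_ANGLE_LIMIT for angle in pole_angles) else 0.0

def is_done(pole_angles):
    # The episode ends as soon as any single cart loses its pole
    return any(abs(angle) >= POLE_ANGLE_LIMIT for angle in pole_angles)

def solved(episode_score_list):
    # Solved when the mean score of the 100 most recent episodes exceeds 195.0
    return len(episode_score_list) >= 100 and np.mean(episode_score_list[-100:]) > 195.0
```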

## Implementation Details

* This example demonstrates how the [Emitter-Receiver scheme](https://github.com/aidudezzz/deepbots#emitter---receiver-scheme) can be used for communication between a central supervisor controller and multiple robots, as sketched below.
* The included solution uses a PPO policy with both the actor and the critic shared across all the agents.
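
As a rough illustration of the channel-per-robot idea, the supervisor side could send actions to and collect observations from the 9 robots along these lines (a sketch only; the `supervisor.emitter`/`supervisor.receiver` attributes and the convention of prepending the robot index to each message are assumptions, not this PR's actual code):

```python
def handle_emitter(supervisor, actions):
    # Address each robot on the channel matching its index, mirroring how each
    # CartPoleRobot sets its own emitter/receiver channel from its name.
    for robot_num, action in enumerate(actions):
        supervisor.emitter.setChannel(robot_num)
        supervisor.emitter.send(str(action).encode("utf-8"))

def handle_receiver(supervisor, num_robots=9):
    # Collect one observation per robot; this sketch assumes the supervisor's
    # receiver listens on all channels (channel -1 in Webots) and that each
    # robot prepends its index to its comma-separated message.
    observations = [None] * num_robots
    while supervisor.receiver.getQueueLength() > 0:
        message = supervisor.receiver.getData().decode("utf-8").split(",")
        robot_num = int(message[0])
        observations[robot_num] = [float(value) for value in message[1:]]
        supervisor.receiver.nextPacket()
    return observations
```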

## Contents

* [Controllers](./controllers/)
* [Worlds](./worlds/)

## Graphics

Training using PPO (shared actor and shared critic)

![](./doc/images/multi_cartpole.gif)
@@ -0,0 +1,124 @@
from deepbots.robots.controllers.robot_emitter_receiver_csv import RobotEmitterReceiverCSV


class CartPoleRobot(RobotEmitterReceiverCSV):
"""
    The CartPole robot has 4 wheels and a pole connected to its body by an unactuated hinge.
The hinge contains a Position Sensor device to measure the angle from vertical needed in the observation.
Hinge: https://cyberbotics.com/doc/reference/hingejoint
Position Sensor: https://cyberbotics.com/doc/reference/positionsensor
"""

def __init__(self):
"""
        The constructor derives the robot's index from its name, sets up communication on the matching channel,
        gets and enables the Position Sensor, and initializes the wheels.
"""
super().__init__()

self.robot_num = int(self.robot.getName()[-1])
self.timestep = int(self.robot.getBasicTimeStep())

        # Communicate with the supervisor on this robot's own channel
        self.emitter, self.receiver = self.initialize_comms()
        self.emitter.setChannel(self.robot_num)
        self.receiver.setChannel(self.robot_num)

self.positionSensor = self.robot.getDevice("polePosSensor")
self.positionSensor.enable(self.timestep)

self.wheels = [None for _ in range(4)]
self.setup_motors()

def initialize_comms(self, emitter_name="emitter", receiver_name="receiver"):
emitter = self.robot.getDevice(emitter_name)
receiver = self.robot.getDevice(receiver_name)
receiver.enable(self.timestep)

return emitter, receiver

def setup_motors(self):
"""
This method initializes the four wheels, storing the references inside a list and setting the starting
positions and velocities.
"""
self.wheels[0] = self.robot.getDevice('wheel1')
self.wheels[1] = self.robot.getDevice('wheel2')
self.wheels[2] = self.robot.getDevice('wheel3')
self.wheels[3] = self.robot.getDevice('wheel4')
for i in range(len(self.wheels)):
self.wheels[i].setPosition(float('inf'))
self.wheels[i].setVelocity(0.0)

def create_message(self):
"""
This method packs the robot's observation into a list of strings to be sent to the supervisor.
        The message contains only the Position Sensor value, i.e. the pole's angle from the vertical position in radians.
From Webots documentation:
'The getValue function returns the most recent value measured by the specified position sensor. Depending on
the type, it will return a value in radians (angular position sensor) or in meters (linear position sensor).'

:return: A list of strings with the robot's observations.
:rtype: list
"""
message = [self.positionSensor.getValue()]
return message

def handle_receiver(self):
"""
Modified handle_receiver from the basic implementation of deepbots.
This one consumes all available messages in the queue during the step it is called.
"""
while self.receiver.getQueueLength() > 0:
# Receive and decode message from supervisor
message = self.receiver.getData().decode("utf-8")
# Convert string message into a list
message = message.split(",")

self.use_message_data(message)

self.receiver.nextPacket()

def use_message_data(self, message):
"""
This method unpacks the supervisor's message, which contains the next action to be executed by the robot.
In this case it contains an integer denoting the action, either 0 or 1, with 0 being forward and
1 being backward movement. The corresponding motorSpeed value is applied to the wheels.

:param message: The message the supervisor sent containing the next action.
:type message: list of strings
"""
action = int(message[0])

assert action == 0 or action == 1, "CartPoleRobot controller got incorrect action value: " + str(action)

if action == 0:
motorSpeed = 5.0
else:
motorSpeed = -5.0

for i in range(len(self.wheels)):
self.wheels[i].setPosition(float('inf'))
self.wheels[i].setVelocity(motorSpeed)

def handle_emitter(self):
data = self.create_message()
string_message = ""

if type(data) is list:
string_message = ",".join(map(str, data))
elif type(data) is str:
string_message = data
else:
raise TypeError(
"message must be either a comma-separater string or a 1D list")

string_message = string_message.encode("utf-8")
self.emitter.send(string_message)

def run(self):
        while self.robot.step(self.timestep) != -1:
self.handle_receiver()
self.handle_emitter()

# Create the robot controller object and run it
robot_controller = CartPoleRobot()
robot_controller.run()
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
from numpy import convolve, ones, mean

from supervisor_controller import CartPoleSupervisor
from agent.PPO_agent import PPOAgent, Transition
from utilities import plotData

# Change these variables if needed
EPISODE_LIMIT = 2000
STEPS_PER_EPISODE = 200
NUM_ROBOTS = 9

def run():
# Initialize supervisor object
supervisorEnv = CartPoleSupervisor()

episodeCount = 0

# The agent used here is trained with the PPO algorithm (https://arxiv.org/abs/1707.06347).
agent = PPOAgent(supervisorEnv.observationSpace, supervisorEnv.actionSpace)

solved = False # Whether the solved requirement is met

# Run outer loop until the episodes limit is reached or the task is solved
while not solved and episodeCount < EPISODE_LIMIT:
        state = supervisorEnv.reset()  # Reset robots and get starting observation
supervisorEnv.episodeScore = 0
actionProbs = [[] for i in range(NUM_ROBOTS)]

# Inner loop is the episode loop
for step in range(STEPS_PER_EPISODE):
# In training mode the agent samples from the probability distribution, naturally implementing exploration
selectedActions = []
for i in range(NUM_ROBOTS):
selectedAction, actionProb = agent.work(state[i], type_="selectAction")
actionProbs[i].append(actionProb)
selectedActions.append(selectedAction)

# Step the supervisor to get the current selectedAction reward, the new state and whether we reached the
# done condition
newState, reward, done, info = supervisorEnv.step([*selectedActions])

# Save the current state transitions from all robots in agent's memory
for i in range(NUM_ROBOTS):
trans = Transition(state[i], selectedActions[i], actionProbs[i][-1], reward, newState[i])
agent.storeTransition(trans)

supervisorEnv.episodeScore += reward # Accumulate episode reward
if done:
# Save the episode's score
supervisorEnv.episodeScoreList.append(supervisorEnv.episodeScore)
agent.trainStep(batchSize=step + 1)
solved = supervisorEnv.solved() # Check whether the task is solved
break

state = newState # state for next step is current step's newState

avgActionProb = [round(mean(actionProbs[i]), 4) for i in range(NUM_ROBOTS)]

# The average action probability tells us how confident the agent was of its actions.
# By looking at this we can check whether the agent is converging to a certain policy.
print(f"Episode: {episodeCount} Score = {supervisorEnv.episodeScore} | Average Action Probabilities = {avgActionProb}")

episodeCount += 1 # Increment episode counter

movingAvgN = 10
plotData(convolve(supervisorEnv.episodeScoreList, ones((movingAvgN,))/movingAvgN, mode='valid'),
"episode", "episode score", "Episode scores over episodes")

if not solved and not supervisorEnv.test:
print("Reached episode limit and task was not solved.")
else:
if not solved:
print("Task is not solved, deploying agent for testing...")
elif solved:
print("Task is solved, deploying agent for testing...")

state = supervisorEnv.reset()
supervisorEnv.test = True
supervisorEnv.episodeScore = 0
while True:
selectedActions = []
for i in range(NUM_ROBOTS):
selectedAction, actionProb = agent.work(state[i], type_="selectAction")
actionProbs[i].append(actionProb)
selectedActions.append(selectedAction)

state, reward, done, _ = supervisorEnv.step(selectedActions)
supervisorEnv.episodeScore += reward # Accumulate episode reward

if done:
print("Reward accumulated =", supervisorEnv.episodeScore)
supervisorEnv.episodeScore = 0
state = supervisorEnv.reset()