default actor=adam3-64, critic=adam3-64, b=256, ep=245k
This commit is contained in:
parent 5c11322e05
commit 0a78c5a7d9
@ -1 +1,4 @@
.ipynb*
/model/*/*/*
!/model/*/*/actor*
!/model/*/*/critic*
@ -0,0 +1,28 @@
import numpy as np
import torch
import os
from maddpg.maddpg import MADDPG


class Agent:
    def __init__(self, agent_id, args):
        self.args = args
        self.agent_id = agent_id
        self.policy = MADDPG(args, agent_id)

    def select_action(self, o, noise_rate, epsilon):
        if np.random.uniform() < epsilon:
            u = np.random.uniform(-self.args.high_action, self.args.high_action, self.args.action_shape[self.agent_id])
        else:
            inputs = torch.tensor(o, dtype=torch.float32).unsqueeze(0)
            pi = self.policy.actor_network(inputs).squeeze(0)
            # print('{} : {}'.format(self.name, pi))
            u = pi.cpu().numpy()
            noise = noise_rate * self.args.high_action * np.random.randn(*u.shape)  # gaussian noise
            u += noise
            u = np.clip(u, -self.args.high_action, self.args.high_action)
        return u.copy()

    def learn(self, transitions, other_agents):
        self.policy.train(transitions, other_agents)
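For reference, a self-contained sketch of the exploration scheme used in select_action above (epsilon-greedy choice between a uniform random action and the policy action plus Gaussian noise); the numbers are illustrative, not taken from this commit:

import numpy as np

high_action = 1.0                            # make_env sets args.high_action = 1
noise_rate, epsilon = 0.1, 0.1
pi = np.array([0.3, -0.8, 0.5, 0.0, 0.1])    # stand-in for the actor output
if np.random.uniform() < epsilon:
    u = np.random.uniform(-high_action, high_action, pi.shape)       # explore uniformly
else:
    u = pi + noise_rate * high_action * np.random.randn(*pi.shape)   # policy action + gaussian noise
    u = np.clip(u, -high_action, high_action)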
@ -0,0 +1,38 @@
import argparse

"""
Parameters for training.
"""


def get_args():
    parser = argparse.ArgumentParser("Reinforcement Learning experiments for multiagent environments")
    # Environment
    parser.add_argument("--scenario-name", type=str, default="simple_tag", help="name of the scenario script")
    parser.add_argument("--max-episode-len", type=int, default=100, help="maximum episode length")
    parser.add_argument("--time-steps", type=int, default=2000000, help="number of time steps")
    # Agents
    parser.add_argument("--num-adversaries", type=int, default=1, help="number of adversaries")
    # Core training parameters
    parser.add_argument("--lr-actor", type=float, default=1e-4, help="learning rate of actor")
    parser.add_argument("--lr-critic", type=float, default=1e-3, help="learning rate of critic")
    parser.add_argument("--epsilon", type=float, default=0.1, help="epsilon greedy")
    parser.add_argument("--noise_rate", type=float, default=0.1, help="noise rate for sampling from a standard normal distribution")
    parser.add_argument("--gamma", type=float, default=0.95, help="discount factor")
    parser.add_argument("--tau", type=float, default=0.01, help="parameter for updating the target network")
    parser.add_argument("--buffer-size", type=int, default=int(5e5), help="number of transitions that can be stored in the buffer")
    parser.add_argument("--batch-size", type=int, default=256, help="number of transitions to optimize at the same time")
    # Checkpointing
    parser.add_argument("--save-dir", type=str, default="./model", help="directory in which training state and model should be saved")
    parser.add_argument("--save-rate", type=int, default=2000, help="save model once every time this many episodes are completed")
    parser.add_argument("--model-dir", type=str, default="", help="directory in which training state and model are loaded")

    # Evaluate
    parser.add_argument("--evaluate-episodes", type=int, default=10, help="number of episodes for evaluating")
    parser.add_argument("--evaluate-episode-len", type=int, default=100, help="length of episodes for evaluating")
    parser.add_argument("--evaluate", type=bool, default=False, help="whether to evaluate the model")
    parser.add_argument("--evaluate-rate", type=int, default=1000, help="how often to evaluate the model")
    args = parser.parse_args()

    return args
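One caveat on the flags above: because --evaluate uses type=bool, passing --evaluate False still yields True (bool("False") is truthy). A more robust pattern, shown here purely as an illustration and not as a change to the commit, is a store_true flag:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--evaluate", action="store_true", help="whether to evaluate the model")
print(parser.parse_args([]).evaluate)               # False
print(parser.parse_args(["--evaluate"]).evaluate)   # True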
@ -0,0 +1,53 @@
import threading
import numpy as np


class Buffer:
    def __init__(self, args):
        self.size = args.buffer_size
        self.args = args
        # memory management
        self.current_size = 0
        # create the buffer to store info
        self.buffer = dict()
        for i in range(self.args.n_agents):
            self.buffer['o_%d' % i] = np.empty([self.size, self.args.obs_shape[i]])
            self.buffer['u_%d' % i] = np.empty([self.size, self.args.action_shape[i]])
            self.buffer['r_%d' % i] = np.empty([self.size])
            self.buffer['o_next_%d' % i] = np.empty([self.size, self.args.obs_shape[i]])
        # thread lock
        self.lock = threading.Lock()

    # store the episode
    def store_episode(self, o, u, r, o_next):
        idxs = self._get_storage_idx(inc=1)
        for i in range(self.args.n_agents):
            with self.lock:
                self.buffer['o_%d' % i][idxs] = o[i]
                self.buffer['u_%d' % i][idxs] = u[i]
                self.buffer['r_%d' % i][idxs] = r[i]
                self.buffer['o_next_%d' % i][idxs] = o_next[i]

    # sample the data from the replay buffer
    def sample(self, batch_size):
        temp_buffer = {}
        idx = np.random.randint(0, self.current_size, batch_size)
        for key in self.buffer.keys():
            temp_buffer[key] = self.buffer[key][idx]
        return temp_buffer

    def _get_storage_idx(self, inc=None):
        inc = inc or 1
        if self.current_size + inc <= self.size:
            idx = np.arange(self.current_size, self.current_size + inc)
        elif self.current_size < self.size:
            overflow = inc - (self.size - self.current_size)
            idx_a = np.arange(self.current_size, self.size)
            idx_b = np.random.randint(0, self.current_size, overflow)
            idx = np.concatenate([idx_a, idx_b])
        else:
            idx = np.random.randint(0, self.size, inc)
        self.current_size = min(self.size, self.current_size + inc)
        if inc == 1:
            idx = idx[0]
        return idx
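A minimal usage sketch of the buffer (illustration only; SimpleNamespace stands in for the parsed args, the shapes are made up, and the Buffer class above is assumed to be in scope):

from types import SimpleNamespace
import numpy as np

args = SimpleNamespace(buffer_size=1000, n_agents=2, obs_shape=[8, 10], action_shape=[5, 5])
buffer = Buffer(args)
o = [np.zeros(8), np.zeros(10)]              # one observation per agent
u = [np.zeros(5), np.zeros(5)]               # one action per agent
buffer.store_episode(o, u, [0.0, 0.0], o)    # stores a single transition per call
batch = buffer.sample(1)                     # dict keyed by 'o_0', 'u_0', 'r_0', 'o_next_0', ...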
@ -0,0 +1,54 @@
import numpy as np
import inspect
import functools


def store_args(method):
    """Stores provided method args as instance attributes.
    """
    argspec = inspect.getfullargspec(method)
    defaults = {}
    if argspec.defaults is not None:
        defaults = dict(
            zip(argspec.args[-len(argspec.defaults):], argspec.defaults))
    if argspec.kwonlydefaults is not None:
        defaults.update(argspec.kwonlydefaults)
    arg_names = argspec.args[1:]

    @functools.wraps(method)
    def wrapper(*positional_args, **keyword_args):
        self = positional_args[0]
        # Get default arg values
        args = defaults.copy()
        # Add provided arg values
        for name, value in zip(arg_names, positional_args[1:]):
            args[name] = value
        args.update(keyword_args)
        self.__dict__.update(args)
        return method(*positional_args, **keyword_args)

    return wrapper


def make_env(args):
    from multiagent.environment import MultiAgentEnv
    import multiagent.scenarios as scenarios

    # load scenario from script
    scenario = scenarios.load(args.scenario_name + ".py").Scenario()

    # create world
    world = scenario.make_world()
    # create multiagent environment
    env = MultiAgentEnv(world, scenario.reset_world, scenario.reward, scenario.observation)
    # env = MultiAgentEnv(world)
    args.n_players = env.n
    args.n_agents = env.n - args.num_adversaries
    args.obs_shape = [env.observation_space[i].shape[0] for i in range(args.n_agents)]
    action_shape = []
    for content in env.action_space:
        action_shape.append(content.n)
    args.action_shape = action_shape[:args.n_agents]
    args.high_action = 1
    args.low_action = -1
    return env, args
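A quick self-contained check of the store_args decorator above (illustrative; assumes store_args is in scope):

class Example:
    @store_args
    def __init__(self, lr, gamma=0.95):
        pass

e = Example(0.01)
print(e.lr, e.gamma)   # 0.01 0.95 -- both arguments stored as attributes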
@ -0,0 +1,44 @@
import torch
import torch.nn as nn
import torch.nn.functional as F


# define the actor network
class Actor(nn.Module):
    def __init__(self, args, agent_id):
        super(Actor, self).__init__()
        self.max_action = args.high_action
        self.fc1 = nn.Linear(args.obs_shape[agent_id], 64)
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, 64)
        self.action_out = nn.Linear(64, args.action_shape[agent_id])  # fc3 outputs 64 features, so the output head takes 64 inputs

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        actions = self.max_action * torch.tanh(self.action_out(x))

        return actions


# define the critic network
class Critic(nn.Module):
    def __init__(self, args):
        super(Critic, self).__init__()
        self.max_action = args.high_action
        self.fc1 = nn.Linear(sum(args.obs_shape) + sum(args.action_shape), 64)
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, 64)
        self.q_out = nn.Linear(64, 1)

    def forward(self, state, action):
        state = torch.cat(state, dim=1)
        for i in range(len(action)):
            action[i] /= self.max_action
        action = torch.cat(action, dim=1)
        x = torch.cat([state, action], dim=1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        q_value = self.q_out(x)
        return q_value
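A quick shape check for the two networks (illustration only; the observation and action sizes are arbitrary stand-ins for what make_env would fill in):

from types import SimpleNamespace
import torch

args = SimpleNamespace(high_action=1, obs_shape=[16, 14], action_shape=[5, 5])
actor = Actor(args, agent_id=0)
critic = Critic(args)
obs = [torch.randn(4, 16), torch.randn(4, 14)]   # batch of 4 observations per agent
acts = [actor(obs[0]), torch.randn(4, 5)]        # joint action of both agents
print(actor(obs[0]).shape)      # torch.Size([4, 5])
print(critic(obs, acts).shape)  # torch.Size([4, 1]) -- one Q value per sample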
@ -0,0 +1,114 @@
import torch
import os
from maddpg.actor_critic import Actor, Critic


class MADDPG:
    def __init__(self, args, agent_id):
        self.args = args
        self.agent_id = agent_id
        self.train_step = 0

        # create the networks
        self.actor_network = Actor(args, agent_id)
        self.critic_network = Critic(args)

        # build up the target networks
        self.actor_target_network = Actor(args, agent_id)
        self.critic_target_network = Critic(args)

        # load the weights into the target networks
        self.actor_target_network.load_state_dict(self.actor_network.state_dict())
        self.critic_target_network.load_state_dict(self.critic_network.state_dict())

        # create the optimizers
        self.actor_optim = torch.optim.Adam(self.actor_network.parameters(), lr=self.args.lr_actor)
        self.critic_optim = torch.optim.Adam(self.critic_network.parameters(), lr=self.args.lr_critic)

        # create the directory for storing the model
        if not os.path.exists(self.args.save_dir):
            os.mkdir(self.args.save_dir)
        # path to save the model
        self.model_path = self.args.save_dir + '/' + self.args.scenario_name
        if not os.path.exists(self.model_path):
            os.mkdir(self.model_path)
        self.model_path = self.model_path + '/' + 'agent_%d' % agent_id
        if not os.path.exists(self.model_path):
            os.mkdir(self.model_path)

        if os.path.exists(self.model_path + '/actor_params.pkl'):
            self.actor_network.load_state_dict(torch.load(self.model_path + '/actor_params.pkl'))
            self.critic_network.load_state_dict(torch.load(self.model_path + '/critic_params.pkl'))
            print('Agent {} successfully loaded actor_network: {}'.format(self.agent_id,
                                                                          self.model_path + '/actor_params.pkl'))
            print('Agent {} successfully loaded critic_network: {}'.format(self.agent_id,
                                                                           self.model_path + '/critic_params.pkl'))

    # soft update
    def _soft_update_target_network(self):
        for target_param, param in zip(self.actor_target_network.parameters(), self.actor_network.parameters()):
            target_param.data.copy_((1 - self.args.tau) * target_param.data + self.args.tau * param.data)

        for target_param, param in zip(self.critic_target_network.parameters(), self.critic_network.parameters()):
            target_param.data.copy_((1 - self.args.tau) * target_param.data + self.args.tau * param.data)

    # update the networks
    def train(self, transitions, other_agents):
        for key in transitions.keys():
            transitions[key] = torch.tensor(transitions[key], dtype=torch.float32)
        r = transitions['r_%d' % self.agent_id]  # reward of this agent
        o, u, o_next = [], [], []  # observations, actions and next observations of every agent
        for agent_id in range(self.args.n_agents):
            o.append(transitions['o_%d' % agent_id])
            u.append(transitions['u_%d' % agent_id])
            o_next.append(transitions['o_next_%d' % agent_id])

        # calculate the target Q value function
        u_next = []
        with torch.no_grad():
            index = 0
            for agent_id in range(self.args.n_agents):
                if agent_id == self.agent_id:
                    u_next.append(self.actor_target_network(o_next[agent_id]))
                else:
                    # other_agents holds every agent except this one, so it is walked with its own index
                    u_next.append(other_agents[index].policy.actor_target_network(o_next[agent_id]))
                    index += 1
            q_next = self.critic_target_network(o_next, u_next).detach()

            target_q = (r.unsqueeze(1) + self.args.gamma * q_next).detach()

        # the q loss
        q_value = self.critic_network(o, u)
        critic_loss = (target_q - q_value).pow(2).mean()

        # the actor loss
        u[self.agent_id] = self.actor_network(o[self.agent_id])
        actor_loss = - self.critic_network(o, u).mean()
        # if self.agent_id == 0:
        #     print('critic_loss is {}, actor_loss is {}'.format(critic_loss, actor_loss))
        # update the networks
        self.actor_optim.zero_grad()
        actor_loss.backward()
        self.actor_optim.step()
        self.critic_optim.zero_grad()
        critic_loss.backward()
        self.critic_optim.step()

        self._soft_update_target_network()
        if self.train_step > 0 and self.train_step % self.args.save_rate == 0:
            self.save_model(self.train_step)
        self.train_step += 1

    def save_model(self, train_step):
        num = str(train_step // self.args.save_rate)
        model_path = os.path.join(self.args.save_dir, self.args.scenario_name)
        if not os.path.exists(model_path):
            os.makedirs(model_path)
        model_path = os.path.join(model_path, 'agent_%d' % self.agent_id)
        if not os.path.exists(model_path):
            os.makedirs(model_path)
        torch.save(self.actor_network.state_dict(), model_path + '/' + num + '_actor_params.pkl')
        torch.save(self.critic_network.state_dict(), model_path + '/' + num + '_critic_params.pkl')
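The soft update above is standard Polyak averaging, target <- (1 - tau) * target + tau * online; a self-contained toy version for reference (illustrative, not part of the commit):

import torch.nn as nn

tau = 0.01
net, target = nn.Linear(4, 2), nn.Linear(4, 2)
target.load_state_dict(net.state_dict())     # start identical, as in __init__ above
# ... after some optimizer steps on net ...
for t_p, p in zip(target.parameters(), net.parameters()):
    t_p.data.copy_((1 - tau) * t_p.data + tau * p.data)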
@ -0,0 +1,331 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "11c84981",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Looking in indexes: https://nexus.c68.spacecorp.ru/repository/pypi_group/simple/\n",
|
||||
"Requirement already satisfied: torch==1.7.1 in /home/ovchinnikov_ii@RISDE.ru/Software/Jupyter/venv/lib/python3.9/site-packages (1.7.1)\n",
|
||||
"Requirement already satisfied: typing-extensions in /home/ovchinnikov_ii@RISDE.ru/Software/Jupyter/venv/lib/python3.9/site-packages (from torch==1.7.1) (4.4.0)\n",
|
||||
"Requirement already satisfied: numpy in /home/ovchinnikov_ii@RISDE.ru/Software/Jupyter/venv/lib/python3.9/site-packages (from torch==1.7.1) (1.24.1)\n",
|
||||
"Note: you may need to restart the kernel to use updated packages.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"pip install torch==1.7.1"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "735f4c82",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Looking in indexes: https://nexus.c68.spacecorp.ru/repository/pypi_group/simple/\n",
|
||||
"Collecting supersuit==2.6.5\n",
|
||||
" Using cached SuperSuit-2.6.5-py3-none-any.whl\n",
|
||||
"Requirement already satisfied: cloudpickle in /home/ovchinnikov_ii@RISDE.ru/Software/Jupyter/venv/lib/python3.9/site-packages (from supersuit==2.6.5) (2.2.0)\n",
|
||||
"Collecting pettingzoo>=1.6.0\n",
|
||||
" Using cached https://nexus.c68.spacecorp.ru/repository/pypi_group/packages/pettingzoo/1.22.3/PettingZoo-1.22.3-py3-none-any.whl (816 kB)\n",
|
||||
"Collecting opencv-python~=3.4.0\n",
|
||||
" Using cached https://nexus.c68.spacecorp.ru/repository/pypi_group/packages/opencv-python/3.4.18.65/opencv_python-3.4.18.65-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (58.4 MB)\n",
|
||||
"Requirement already satisfied: numpy>=1.19.3 in /home/ovchinnikov_ii@RISDE.ru/Software/Jupyter/venv/lib/python3.9/site-packages (from opencv-python~=3.4.0->supersuit==2.6.5) (1.24.1)\n",
|
||||
"Requirement already satisfied: gymnasium>=0.26.0 in /home/ovchinnikov_ii@RISDE.ru/Software/Jupyter/venv/lib/python3.9/site-packages (from pettingzoo>=1.6.0->supersuit==2.6.5) (0.27.0)\n",
|
||||
"Requirement already satisfied: typing-extensions>=4.3.0 in /home/ovchinnikov_ii@RISDE.ru/Software/Jupyter/venv/lib/python3.9/site-packages (from gymnasium>=0.26.0->pettingzoo>=1.6.0->supersuit==2.6.5) (4.4.0)\n",
|
||||
"Requirement already satisfied: importlib-metadata>=4.8.0 in /home/ovchinnikov_ii@RISDE.ru/Software/Jupyter/venv/lib/python3.9/site-packages (from gymnasium>=0.26.0->pettingzoo>=1.6.0->supersuit==2.6.5) (6.0.0)\n",
|
||||
"Requirement already satisfied: shimmy<1.0,>=0.1.0 in /home/ovchinnikov_ii@RISDE.ru/Software/Jupyter/venv/lib/python3.9/site-packages (from gymnasium>=0.26.0->pettingzoo>=1.6.0->supersuit==2.6.5) (0.2.0)\n",
|
||||
"Requirement already satisfied: jax-jumpy>=0.2.0 in /home/ovchinnikov_ii@RISDE.ru/Software/Jupyter/venv/lib/python3.9/site-packages (from gymnasium>=0.26.0->pettingzoo>=1.6.0->supersuit==2.6.5) (0.2.0)\n",
|
||||
"Requirement already satisfied: gymnasium-notices>=0.0.1 in /home/ovchinnikov_ii@RISDE.ru/Software/Jupyter/venv/lib/python3.9/site-packages (from gymnasium>=0.26.0->pettingzoo>=1.6.0->supersuit==2.6.5) (0.0.1)\n",
|
||||
"Requirement already satisfied: zipp>=0.5 in /home/ovchinnikov_ii@RISDE.ru/Software/Jupyter/venv/lib/python3.9/site-packages (from importlib-metadata>=4.8.0->gymnasium>=0.26.0->pettingzoo>=1.6.0->supersuit==2.6.5) (3.11.0)\n",
|
||||
"Installing collected packages: pettingzoo, opencv-python, supersuit\n",
|
||||
" Attempting uninstall: pettingzoo\n",
|
||||
" Found existing installation: PettingZoo 1.3.3\n",
|
||||
" Uninstalling PettingZoo-1.3.3:\n",
|
||||
" Successfully uninstalled PettingZoo-1.3.3\n",
|
||||
" Attempting uninstall: opencv-python\n",
|
||||
" Found existing installation: opencv-python 4.7.0.68\n",
|
||||
" Uninstalling opencv-python-4.7.0.68:\n",
|
||||
" Successfully uninstalled opencv-python-4.7.0.68\n",
|
||||
" Attempting uninstall: supersuit\n",
|
||||
" Found existing installation: SuperSuit 3.7.1\n",
|
||||
" Uninstalling SuperSuit-3.7.1:\n",
|
||||
" Successfully uninstalled SuperSuit-3.7.1\n",
|
||||
"Successfully installed opencv-python-3.4.18.65 pettingzoo-1.22.3 supersuit-2.6.5\n",
|
||||
"Note: you may need to restart the kernel to use updated packages.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"pip install supersuit==2.6.5"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "3b8272be",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Looking in indexes: https://nexus.c68.spacecorp.ru/repository/pypi_group/simple/\n",
|
||||
"Collecting tqdm\n",
|
||||
" Using cached https://nexus.c68.spacecorp.ru/repository/pypi_group/packages/tqdm/4.64.1/tqdm-4.64.1-py2.py3-none-any.whl (78 kB)\n",
|
||||
"Installing collected packages: tqdm\n",
|
||||
"Successfully installed tqdm-4.64.1\n",
|
||||
"Note: you may need to restart the kernel to use updated packages.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"pip install tqdm"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "acc570b8",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Looking in indexes: https://nexus.c68.spacecorp.ru/repository/pypi_group/simple/\n",
|
||||
"Requirement already satisfied: matplotlib in /home/ovchinnikov_ii@RISDE.ru/Software/Jupyter/venv/lib/python3.9/site-packages (3.6.3)\n",
|
||||
"Requirement already satisfied: contourpy>=1.0.1 in /home/ovchinnikov_ii@RISDE.ru/Software/Jupyter/venv/lib/python3.9/site-packages (from matplotlib) (1.0.6)\n",
|
||||
"Requirement already satisfied: numpy>=1.19 in /home/ovchinnikov_ii@RISDE.ru/Software/Jupyter/venv/lib/python3.9/site-packages (from matplotlib) (1.24.1)\n",
|
||||
"Requirement already satisfied: pyparsing>=2.2.1 in /home/ovchinnikov_ii@RISDE.ru/Software/Jupyter/venv/lib/python3.9/site-packages (from matplotlib) (3.0.9)\n",
|
||||
"Requirement already satisfied: packaging>=20.0 in /home/ovchinnikov_ii@RISDE.ru/Software/Jupyter/venv/lib/python3.9/site-packages (from matplotlib) (23.0)\n",
|
||||
"Requirement already satisfied: fonttools>=4.22.0 in /home/ovchinnikov_ii@RISDE.ru/Software/Jupyter/venv/lib/python3.9/site-packages (from matplotlib) (4.38.0)\n",
|
||||
"Requirement already satisfied: cycler>=0.10 in /home/ovchinnikov_ii@RISDE.ru/Software/Jupyter/venv/lib/python3.9/site-packages (from matplotlib) (0.11.0)\n",
|
||||
"Requirement already satisfied: kiwisolver>=1.0.1 in /home/ovchinnikov_ii@RISDE.ru/Software/Jupyter/venv/lib/python3.9/site-packages (from matplotlib) (1.4.4)\n",
|
||||
"Requirement already satisfied: pillow>=6.2.0 in /home/ovchinnikov_ii@RISDE.ru/Software/Jupyter/venv/lib/python3.9/site-packages (from matplotlib) (9.4.0)\n",
|
||||
"Requirement already satisfied: python-dateutil>=2.7 in /home/ovchinnikov_ii@RISDE.ru/Software/Jupyter/venv/lib/python3.9/site-packages (from matplotlib) (2.8.2)\n",
|
||||
"Requirement already satisfied: six>=1.5 in /home/ovchinnikov_ii@RISDE.ru/Software/Jupyter/venv/lib/python3.9/site-packages (from python-dateutil>=2.7->matplotlib) (1.16.0)\n",
|
||||
"Note: you may need to restart the kernel to use updated packages.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"pip install matplotlib"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "262ae5d6",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Looking in indexes: https://nexus.c68.spacecorp.ru/repository/pypi_group/simple/\n",
|
||||
"Collecting gym==0.10.5\n",
|
||||
" Using cached gym-0.10.5-py3-none-any.whl\n",
|
||||
"Requirement already satisfied: six in /home/ovchinnikov_ii@RISDE.ru/Software/Jupyter/venv/lib/python3.9/site-packages (from gym==0.10.5) (1.16.0)\n",
|
||||
"Collecting pyglet>=1.2.0\n",
|
||||
" Using cached https://nexus.c68.spacecorp.ru/repository/pypi_group/packages/pyglet/2.0.3/pyglet-2.0.3-py3-none-any.whl (968 kB)\n",
|
||||
"Requirement already satisfied: numpy>=1.10.4 in /home/ovchinnikov_ii@RISDE.ru/Software/Jupyter/venv/lib/python3.9/site-packages (from gym==0.10.5) (1.24.1)\n",
|
||||
"Requirement already satisfied: requests>=2.0 in /home/ovchinnikov_ii@RISDE.ru/Software/Jupyter/venv/lib/python3.9/site-packages (from gym==0.10.5) (2.28.1)\n",
|
||||
"Requirement already satisfied: idna<4,>=2.5 in /home/ovchinnikov_ii@RISDE.ru/Software/Jupyter/venv/lib/python3.9/site-packages (from requests>=2.0->gym==0.10.5) (3.4)\n",
|
||||
"Requirement already satisfied: charset-normalizer<3,>=2 in /home/ovchinnikov_ii@RISDE.ru/Software/Jupyter/venv/lib/python3.9/site-packages (from requests>=2.0->gym==0.10.5) (2.1.1)\n",
|
||||
"Requirement already satisfied: urllib3<1.27,>=1.21.1 in /home/ovchinnikov_ii@RISDE.ru/Software/Jupyter/venv/lib/python3.9/site-packages (from requests>=2.0->gym==0.10.5) (1.26.14)\n",
|
||||
"Requirement already satisfied: certifi>=2017.4.17 in /home/ovchinnikov_ii@RISDE.ru/Software/Jupyter/venv/lib/python3.9/site-packages (from requests>=2.0->gym==0.10.5) (2022.12.7)\n",
|
||||
"Installing collected packages: pyglet, gym\n",
|
||||
" Attempting uninstall: gym\n",
|
||||
" Found existing installation: gym 0.22.0\n",
|
||||
" Uninstalling gym-0.22.0:\n",
|
||||
" Successfully uninstalled gym-0.22.0\n",
|
||||
"Successfully installed gym-0.10.5 pyglet-2.0.3\n",
|
||||
"Note: you may need to restart the kernel to use updated packages.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"pip install gym==0.10.5"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "9c651810",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Looking in indexes: https://nexus.c68.spacecorp.ru/repository/pypi_group/simple/\n",
|
||||
"Collecting pyglet==1.5.27\n",
|
||||
" Using cached https://nexus.c68.spacecorp.ru/repository/pypi_group/packages/pyglet/1.5.27/pyglet-1.5.27-py3-none-any.whl (1.1 MB)\n",
|
||||
"Installing collected packages: pyglet\n",
|
||||
" Attempting uninstall: pyglet\n",
|
||||
" Found existing installation: pyglet 2.0.3\n",
|
||||
" Uninstalling pyglet-2.0.3:\n",
|
||||
" Successfully uninstalled pyglet-2.0.3\n",
|
||||
"Successfully installed pyglet-1.5.27\n",
|
||||
"Note: you may need to restart the kernel to use updated packages.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"pip install pyglet==1.5.27"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "cb877007",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Пытаемся загрузить данные!\n",
|
||||
"Пытаемся загрузить данные!\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" 0%| | 203/2000000 [00:00<32:10, 1036.07it/s]/home/ovchinnikov_ii@RISDE.ru/Software/Jupyter/MADDPG/maddpg/maddpg.py:60: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
|
||||
" transitions[key] = torch.tensor(transitions[key], dtype=torch.float32)\n",
|
||||
" 0%| | 307/2000000 [00:01<2:18:56, 239.88it/s]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Пытаемся сохранить данные по пути = ./model/simple_adversary/agent_0/1_actor_params.pkl\n",
|
||||
"Пытаемся сохранить данные по пути = ./model/simple_adversary/agent_1/1_actor_params.pkl\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" 0%| | 459/2000000 [00:03<6:02:10, 92.02it/s]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Пытаемся сохранить данные по пути = ./model/simple_adversary/agent_0/2_actor_params.pkl\n",
|
||||
"Пытаемся сохранить данные по пути = ./model/simple_adversary/agent_1/2_actor_params.pkl\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" 0%| | 566/2000000 [00:04<7:30:23, 73.99it/s]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Пытаемся сохранить данные по пути = ./model/simple_adversary/agent_0/3_actor_params.pkl\n",
|
||||
"Пытаемся сохранить данные по пути = ./model/simple_adversary/agent_1/3_actor_params.pkl\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" 0%| | 667/2000000 [00:06<8:44:54, 63.48it/s]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Пытаемся сохранить данные по пути = ./model/simple_adversary/agent_0/4_actor_params.pkl\n",
|
||||
"Пытаемся сохранить данные по пути = ./model/simple_adversary/agent_1/4_actor_params.pkl\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" 0%| | 717/2000000 [00:07<5:36:33, 99.01it/s]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"ename": "KeyboardInterrupt",
|
||||
"evalue": "",
|
||||
"output_type": "error",
|
||||
"traceback": [
|
||||
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
||||
"\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
|
||||
"File \u001b[0;32m~/Software/Jupyter/MADDPG/main.py:18\u001b[0m\n\u001b[1;32m 16\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mAverage returns is\u001b[39m\u001b[38;5;124m'\u001b[39m, returns)\n\u001b[1;32m 17\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m---> 18\u001b[0m \u001b[43mrunner\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n",
|
||||
"File \u001b[0;32m~/Software/Jupyter/MADDPG/runner.py:52\u001b[0m, in \u001b[0;36mRunner.run\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 50\u001b[0m other_agents \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39magents\u001b[38;5;241m.\u001b[39mcopy()\n\u001b[1;32m 51\u001b[0m other_agents\u001b[38;5;241m.\u001b[39mremove(agent)\n\u001b[0;32m---> 52\u001b[0m \u001b[43magent\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mlearn\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtransitions\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mother_agents\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 53\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m time_step \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m0\u001b[39m \u001b[38;5;129;01mand\u001b[39;00m time_step \u001b[38;5;241m%\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39margs\u001b[38;5;241m.\u001b[39mevaluate_rate \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m0\u001b[39m:\n\u001b[1;32m 54\u001b[0m returns\u001b[38;5;241m.\u001b[39mappend(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mevaluate())\n",
|
||||
"File \u001b[0;32m~/Software/Jupyter/MADDPG/agent.py:27\u001b[0m, in \u001b[0;36mAgent.learn\u001b[0;34m(self, transitions, other_agents)\u001b[0m\n\u001b[1;32m 26\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mlearn\u001b[39m(\u001b[38;5;28mself\u001b[39m, transitions, other_agents):\n\u001b[0;32m---> 27\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpolicy\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtrain\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtransitions\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mother_agents\u001b[49m\u001b[43m)\u001b[49m\n",
|
||||
"File \u001b[0;32m~/Software/Jupyter/MADDPG/maddpg/maddpg.py:95\u001b[0m, in \u001b[0;36mMADDPG.train\u001b[0;34m(self, transitions, other_agents)\u001b[0m\n\u001b[1;32m 93\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mactor_optim\u001b[38;5;241m.\u001b[39mzero_grad()\n\u001b[1;32m 94\u001b[0m actor_loss\u001b[38;5;241m.\u001b[39mbackward()\n\u001b[0;32m---> 95\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mactor_optim\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mstep\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 96\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcritic_optim\u001b[38;5;241m.\u001b[39mzero_grad()\n\u001b[1;32m 97\u001b[0m critic_loss\u001b[38;5;241m.\u001b[39mbackward()\n",
|
||||
"File \u001b[0;32m~/Software/Jupyter/venv/lib/python3.9/site-packages/torch/autograd/grad_mode.py:26\u001b[0m, in \u001b[0;36m_DecoratorContextManager.__call__.<locals>.decorate_context\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 23\u001b[0m \u001b[38;5;129m@functools\u001b[39m\u001b[38;5;241m.\u001b[39mwraps(func)\n\u001b[1;32m 24\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mdecorate_context\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m 25\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__class__\u001b[39m():\n\u001b[0;32m---> 26\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
|
||||
"File \u001b[0;32m~/Software/Jupyter/venv/lib/python3.9/site-packages/torch/optim/adamw.py:116\u001b[0m, in \u001b[0;36mAdamW.step\u001b[0;34m(self, closure)\u001b[0m\n\u001b[1;32m 112\u001b[0m denom \u001b[38;5;241m=\u001b[39m (exp_avg_sq\u001b[38;5;241m.\u001b[39msqrt() \u001b[38;5;241m/\u001b[39m math\u001b[38;5;241m.\u001b[39msqrt(bias_correction2))\u001b[38;5;241m.\u001b[39madd_(group[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124meps\u001b[39m\u001b[38;5;124m'\u001b[39m])\n\u001b[1;32m 114\u001b[0m step_size \u001b[38;5;241m=\u001b[39m group[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mlr\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m/\u001b[39m bias_correction1\n\u001b[0;32m--> 116\u001b[0m p\u001b[38;5;241m.\u001b[39maddcdiv_(exp_avg, denom, value\u001b[38;5;241m=\u001b[39m\u001b[38;5;241;43m-\u001b[39;49m\u001b[43mstep_size\u001b[49m)\n\u001b[1;32m 118\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m loss\n",
|
||||
"\u001b[0;31mKeyboardInterrupt\u001b[0m: "
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"%run ./main.py --scenario-name=simple_adversary --evaluate-episodes=10000 --save-rate=100"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "d71a2b22",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "d079aff2",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "venv",
|
||||
"language": "python",
|
||||
"name": "venv"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.2"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
|
@ -0,0 +1,18 @@
from runner import Runner
from common.arguments import get_args
from common.utils import make_env
import numpy as np
import random
import torch


if __name__ == '__main__':
    # get the params
    args = get_args()
    env, args = make_env(args)
    runner = Runner(args, env)
    if args.evaluate:
        returns = runner.evaluate()
        print('Average returns is', returns)
    else:
        runner.run()
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -0,0 +1,18 @@
from gym.envs.registration import register

# Multiagent envs
# ----------------------------------------

register(
    id='MultiagentSimple-v0',
    entry_point='multiagent.envs:SimpleEnv',
    # FIXME(cathywu) currently has to be exactly max_path_length parameters in
    # rllab run script
    max_episode_steps=100,
)

register(
    id='MultiagentSimpleSpeakerListener-v0',
    entry_point='multiagent.envs:SimpleSpeakerListenerEnv',
    max_episode_steps=100,
)
@ -0,0 +1,196 @@
import numpy as np

# physical/external base state of all entities
class EntityState(object):
    def __init__(self):
        # physical position
        self.p_pos = None
        # physical velocity
        self.p_vel = None

# state of agents (including communication and internal/mental state)
class AgentState(EntityState):
    def __init__(self):
        super(AgentState, self).__init__()
        # communication utterance
        self.c = None

# action of the agent
class Action(object):
    def __init__(self):
        # physical action
        self.u = None
        # communication action
        self.c = None

# properties and state of physical world entity
class Entity(object):
    def __init__(self):
        # name
        self.name = ''
        # properties:
        self.size = 0.050
        # entity can move / be pushed
        self.movable = False
        # entity collides with others
        self.collide = True
        # material density (affects mass)
        self.density = 25.0
        # color
        self.color = None
        # max speed and accel
        self.max_speed = None
        self.accel = None
        # state
        self.state = EntityState()
        # mass
        self.initial_mass = 1.0

    @property
    def mass(self):
        return self.initial_mass

# properties of landmark entities
class Landmark(Entity):
    def __init__(self):
        super(Landmark, self).__init__()

# properties of agent entities
class Agent(Entity):
    def __init__(self):
        super(Agent, self).__init__()
        # agents are movable by default
        self.movable = True
        # cannot send communication signals
        self.silent = False
        # cannot observe the world
        self.blind = False
        # physical motor noise amount
        self.u_noise = None
        # communication noise amount
        self.c_noise = None
        # control range
        self.u_range = 1.0
        # state
        self.state = AgentState()
        # action
        self.action = Action()
        # script behavior to execute
        self.action_callback = None

# multi-agent world
class World(object):
    def __init__(self):
        # list of agents and entities (can change at execution-time!)
        self.agents = []
        self.landmarks = []
        # communication channel dimensionality
        self.dim_c = 0
        # position dimensionality
        self.dim_p = 2
        # color dimensionality
        self.dim_color = 3
        # simulation timestep
        self.dt = 0.1
        # physical damping
        self.damping = 0.25
        # contact response parameters
        self.contact_force = 1e+2
        self.contact_margin = 1e-3

    # return all entities in the world
    @property
    def entities(self):
        return self.agents + self.landmarks

    # return all agents controllable by external policies
    @property
    def policy_agents(self):
        return [agent for agent in self.agents if agent.action_callback is None]

    # return all agents controlled by world scripts
    @property
    def scripted_agents(self):
        return [agent for agent in self.agents if agent.action_callback is not None]

    # update state of the world
    def step(self):
        # set actions for scripted agents
        for agent in self.scripted_agents:
            agent.action = agent.action_callback(agent, self)
        # gather forces applied to entities
        p_force = [None] * len(self.entities)
        # apply agent physical controls
        p_force = self.apply_action_force(p_force)
        # apply environment forces
        p_force = self.apply_environment_force(p_force)
        # integrate physical state
        self.integrate_state(p_force)
        # update agent state
        for agent in self.agents:
            self.update_agent_state(agent)

    # gather agent action forces
    def apply_action_force(self, p_force):
        # set applied forces
        for i, agent in enumerate(self.agents):
            if agent.movable:
                noise = np.random.randn(*agent.action.u.shape) * agent.u_noise if agent.u_noise else 0.0
                p_force[i] = agent.action.u + noise
        return p_force

    # gather physical forces acting on entities
    def apply_environment_force(self, p_force):
        # simple (but inefficient) collision response
        for a, entity_a in enumerate(self.entities):
            for b, entity_b in enumerate(self.entities):
                if b <= a: continue
                [f_a, f_b] = self.get_collision_force(entity_a, entity_b)
                if f_a is not None:
                    if p_force[a] is None: p_force[a] = 0.0
                    p_force[a] = f_a + p_force[a]
                if f_b is not None:
                    if p_force[b] is None: p_force[b] = 0.0
                    p_force[b] = f_b + p_force[b]
        return p_force

    # integrate physical state
    def integrate_state(self, p_force):
        for i, entity in enumerate(self.entities):
            if not entity.movable: continue
            entity.state.p_vel = entity.state.p_vel * (1 - self.damping)
            if p_force[i] is not None:
                entity.state.p_vel += (p_force[i] / entity.mass) * self.dt
            if entity.max_speed is not None:
                speed = np.sqrt(np.square(entity.state.p_vel[0]) + np.square(entity.state.p_vel[1]))
                if speed > entity.max_speed:
                    entity.state.p_vel = entity.state.p_vel / np.sqrt(np.square(entity.state.p_vel[0]) +
                                                                      np.square(entity.state.p_vel[1])) * entity.max_speed
            entity.state.p_pos += entity.state.p_vel * self.dt

    def update_agent_state(self, agent):
        # set communication state (directly for now)
        if agent.silent:
            agent.state.c = np.zeros(self.dim_c)
        else:
            noise = np.random.randn(*agent.action.c.shape) * agent.c_noise if agent.c_noise else 0.0
            agent.state.c = agent.action.c + noise

    # get collision forces for any contact between two entities
    def get_collision_force(self, entity_a, entity_b):
        if (not entity_a.collide) or (not entity_b.collide):
            return [None, None]  # not a collider
        if entity_a is entity_b:
            return [None, None]  # don't collide against itself
        # compute actual distance between entities
        delta_pos = entity_a.state.p_pos - entity_b.state.p_pos
        dist = np.sqrt(np.sum(np.square(delta_pos)))
        # minimum allowable distance
        dist_min = entity_a.size + entity_b.size
        # softmax penetration
        k = self.contact_margin
        penetration = np.logaddexp(0, -(dist - dist_min) / k) * k
        force = self.contact_force * delta_pos / dist * penetration
        force_a = +force if entity_a.movable else None
        force_b = -force if entity_b.movable else None
        return [force_a, force_b]
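The collision response above uses a softplus-style penetration term, k * logaddexp(0, -(dist - dist_min)/k); a small numeric illustration with made-up values:

import numpy as np

k, dist_min = 1e-3, 0.10                  # contact_margin and combined entity sizes
for dist in (0.20, 0.10, 0.05):
    penetration = np.logaddexp(0, -(dist - dist_min) / k) * k
    print(dist, penetration)              # ~0 while apart, grows smoothly once overlapping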
@ -0,0 +1,336 @@
import gym
from gym import spaces
from gym.envs.registration import EnvSpec
import numpy as np
from multiagent.multi_discrete import MultiDiscrete

# environment for all agents in the multiagent world
# currently code assumes that no agents will be created/destroyed at runtime!
class MultiAgentEnv(gym.Env):
    metadata = {
        'render.modes': ['human', 'rgb_array']
    }

    def __init__(self, world, reset_callback=None, reward_callback=None,
                 observation_callback=None, info_callback=None,
                 done_callback=None, shared_viewer=True):

        self.world = world
        self.agents = self.world.policy_agents
        # set required vectorized gym env property
        self.n = len(world.policy_agents)
        # scenario callbacks
        self.reset_callback = reset_callback
        self.reward_callback = reward_callback
        self.observation_callback = observation_callback
        self.info_callback = info_callback
        self.done_callback = done_callback
        # environment parameters
        self.discrete_action_space = True
        # if true, action is a number 0...N, otherwise action is a one-hot N-dimensional vector
        self.discrete_action_input = False
        # if true, even continuous actions are converted to a discrete choice before being applied
        self.force_discrete_action = world.discrete_action if hasattr(world, 'discrete_action') else False
        # if true, every agent has the same reward
        self.shared_reward = world.collaborative if hasattr(world, 'collaborative') else False
        self.time = 0

        # configure spaces
        self.action_space = []
        self.observation_space = []
        for agent in self.agents:
            total_action_space = []
            # physical action space
            if self.discrete_action_space:
                u_action_space = spaces.Discrete(world.dim_p * 2 + 1)
            else:
                u_action_space = spaces.Box(low=-agent.u_range, high=+agent.u_range, shape=(world.dim_p,), dtype=np.float32)
            if agent.movable:
                total_action_space.append(u_action_space)
            # communication action space
            if self.discrete_action_space:
                c_action_space = spaces.Discrete(world.dim_c)
            else:
                c_action_space = spaces.Box(low=0.0, high=1.0, shape=(world.dim_c,), dtype=np.float32)
            if not agent.silent:
                total_action_space.append(c_action_space)
            # total action space
            if len(total_action_space) > 1:
                # all action spaces are discrete, so simplify to MultiDiscrete action space
                if all([isinstance(act_space, spaces.Discrete) for act_space in total_action_space]):
                    act_space = MultiDiscrete([[0, act_space.n - 1] for act_space in total_action_space])
                    # print(act_space.n)
                else:
                    act_space = spaces.Tuple(total_action_space)
                self.action_space.append(act_space)
            else:
                self.action_space.append(total_action_space[0])
            # observation space
            obs_dim = len(observation_callback(agent, self.world))
            self.observation_space.append(spaces.Box(low=-np.inf, high=+np.inf, shape=(obs_dim,), dtype=np.float32))
            agent.action.c = np.zeros(self.world.dim_c)

        # rendering
        self.shared_viewer = shared_viewer
        if self.shared_viewer:
            self.viewers = [None]
        else:
            self.viewers = [None] * self.n
        self._reset_render()

    def step(self, action_n):
        obs_n = []
        reward_n = []
        done_n = []
        info_n = {'n': []}
        self.agents = self.world.policy_agents
        # set action for each agent
        for i, agent in enumerate(self.agents):
            self._set_action(action_n[i], agent, self.action_space[i])
        # advance world state
        self.world.step()
        # record observation for each agent
        for agent in self.agents:
            obs_n.append(self._get_obs(agent))
            reward_n.append(self._get_reward(agent))
            done_n.append(self._get_done(agent))

            info_n['n'].append(self._get_info(agent))

        # all agents get total reward in cooperative case
        reward = np.sum(reward_n)
        if self.shared_reward:
            reward_n = [reward] * self.n

        return obs_n, reward_n, done_n, info_n

    def reset(self):
        # reset world
        self.reset_callback(self.world)
        # reset renderer
        self._reset_render()
        # record observations for each agent
        obs_n = []
        self.agents = self.world.policy_agents
        for agent in self.agents:
            obs_n.append(self._get_obs(agent))
        return obs_n

    # get info used for benchmarking
    def _get_info(self, agent):
        if self.info_callback is None:
            return {}
        return self.info_callback(agent, self.world)

    # get observation for a particular agent
    def _get_obs(self, agent):
        if self.observation_callback is None:
            return np.zeros(0)
        return self.observation_callback(agent, self.world)

    # get dones for a particular agent
    # unused right now -- agents are allowed to go beyond the viewing screen
    def _get_done(self, agent):
        if self.done_callback is None:
            return False
        return self.done_callback(agent, self.world)

    # get reward for a particular agent
    def _get_reward(self, agent):
        if self.reward_callback is None:
            return 0.0
        return self.reward_callback(agent, self.world)

    # set env action for a particular agent
    def _set_action(self, action, agent, action_space, time=None):
        agent.action.u = np.zeros(self.world.dim_p)
        agent.action.c = np.zeros(self.world.dim_c)
        # process action
        if isinstance(action_space, MultiDiscrete):
            act = []
            size = action_space.high - action_space.low + 1
            index = 0
            for s in size:
                act.append(action[index:(index + s)])
                index += s
            action = act
        else:
            action = [action]

        if agent.movable:
            # physical action
            if self.discrete_action_input:
                agent.action.u = np.zeros(self.world.dim_p)
                # process discrete action
                if action[0] == 1: agent.action.u[0] = -1.0
                if action[0] == 2: agent.action.u[0] = +1.0
                if action[0] == 3: agent.action.u[1] = -1.0
                if action[0] == 4: agent.action.u[1] = +1.0
            else:
                if self.force_discrete_action:
                    d = np.argmax(action[0])
                    action[0][:] = 0.0
                    action[0][d] = 1.0
                if self.discrete_action_space:
                    agent.action.u[0] += action[0][1] - action[0][2]
                    agent.action.u[1] += action[0][3] - action[0][4]
                else:
                    agent.action.u = action[0]
            sensitivity = 5.0
            if agent.accel is not None:
                sensitivity = agent.accel
            agent.action.u *= sensitivity
            action = action[1:]
        if not agent.silent:
            # communication action
            if self.discrete_action_input:
                agent.action.c = np.zeros(self.world.dim_c)
                agent.action.c[action[0]] = 1.0
            else:
                agent.action.c = action[0]
            action = action[1:]
        # make sure we used all elements of action
        assert len(action) == 0

    # reset rendering assets
    def _reset_render(self):
        self.render_geoms = None
        self.render_geoms_xform = None

    # render environment
    def render(self, mode='human'):
        if mode == 'human':
            alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
            message = ''
            for agent in self.world.agents:
                comm = []
                for other in self.world.agents:
                    if other is agent: continue
                    if np.all(other.state.c == 0):
                        word = '_'
                    else:
                        word = alphabet[np.argmax(other.state.c)]
                    message += (other.name + ' to ' + agent.name + ': ' + word + ' ')
            print(message)

        for i in range(len(self.viewers)):
            # create viewers (if necessary)
            if self.viewers[i] is None:
                # import rendering only if we need it (and don't import for headless machines)
                # from gym.envs.classic_control import rendering
                from multiagent import rendering
                self.viewers[i] = rendering.Viewer(700, 700)

        # create rendering geometry
        if self.render_geoms is None:
            # import rendering only if we need it (and don't import for headless machines)
            # from gym.envs.classic_control import rendering
            from multiagent import rendering
            self.render_geoms = []
            self.render_geoms_xform = []
            for entity in self.world.entities:
                geom = rendering.make_circle(entity.size)
                xform = rendering.Transform()
                if 'agent' in entity.name:
                    geom.set_color(*entity.color, alpha=0.5)
                else:
                    geom.set_color(*entity.color)
                geom.add_attr(xform)
                self.render_geoms.append(geom)
                self.render_geoms_xform.append(xform)

            # add geoms to viewer
            for viewer in self.viewers:
                viewer.geoms = []
                for geom in self.render_geoms:
                    viewer.add_geom(geom)

        results = []
        for i in range(len(self.viewers)):
            from multiagent import rendering
            # update bounds to center around agent
            cam_range = 1
            if self.shared_viewer:
                pos = np.zeros(self.world.dim_p)
            else:
                pos = self.agents[i].state.p_pos
            self.viewers[i].set_bounds(pos[0] - cam_range, pos[0] + cam_range, pos[1] - cam_range, pos[1] + cam_range)
            # update geometry positions
            for e, entity in enumerate(self.world.entities):
                self.render_geoms_xform[e].set_translation(*entity.state.p_pos)
            # render to display or array
            results.append(self.viewers[i].render(return_rgb_array=(mode == 'rgb_array')))

        return results

    # create receptor field locations in local coordinate frame
    def _make_receptor_locations(self, agent):
        receptor_type = 'polar'
        range_min = 0.05 * 2.0
        range_max = 1.00
        dx = []
        # circular receptive field
        if receptor_type == 'polar':
            for angle in np.linspace(-np.pi, +np.pi, 8, endpoint=False):
                for distance in np.linspace(range_min, range_max, 3):
                    dx.append(distance * np.array([np.cos(angle), np.sin(angle)]))
            # add origin
            dx.append(np.array([0.0, 0.0]))
        # grid receptive field
        if receptor_type == 'grid':
            for x in np.linspace(-range_max, +range_max, 5):
                for y in np.linspace(-range_max, +range_max, 5):
                    dx.append(np.array([x, y]))
        return dx


# vectorized wrapper for a batch of multi-agent environments
# assumes all environments have the same observation and action space
class BatchMultiAgentEnv(gym.Env):
    metadata = {
        'runtime.vectorized': True,
        'render.modes': ['human', 'rgb_array']
    }

    def __init__(self, env_batch):
        self.env_batch = env_batch

    @property
    def n(self):
        return np.sum([env.n for env in self.env_batch])

    @property
    def action_space(self):
        return self.env_batch[0].action_space

    @property
    def observation_space(self):
        return self.env_batch[0].observation_space

    def step(self, action_n, time):
        obs_n = []
        reward_n = []
        done_n = []
        info_n = {'n': []}
        i = 0
        for env in self.env_batch:
            obs, reward, done, _ = env.step(action_n[i:(i + env.n)], time)
            i += env.n
            obs_n += obs
            # reward = [r / len(self.env_batch) for r in reward]
            reward_n += reward
            done_n += done
        return obs_n, reward_n, done_n, info_n

    def reset(self):
        obs_n = []
        for env in self.env_batch:
            obs_n += env.reset()
        return obs_n

    # render environment
    def render(self, mode='human', close=True):
        results_n = []
        for env in self.env_batch:
            results_n += env.render(mode, close)
        return results_n
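In the continuous branch of _set_action above, a 5-dimensional action vector is decoded into 2-D forces; a self-contained sketch of that decoding with made-up values:

import numpy as np

a = np.array([0.0, 0.7, 0.1, 0.2, 0.9])   # [no-op, +x, -x, +y, -y] weights from the policy
u = np.zeros(2)
u[0] += a[1] - a[2]
u[1] += a[3] - a[4]
print(u)   # [ 0.6 -0.7], before the sensitivity (accel) scaling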
@ -0,0 +1,44 @@
|
|||
# An old version of OpenAI Gym's multi_discrete.py. (Was getting affected by Gym updates)
|
||||
# (https://github.com/openai/gym/blob/1fb81d4e3fb780ccf77fec731287ba07da35eb84/gym/spaces/multi_discrete.py)
|
||||
|
||||
import numpy as np
|
||||
|
||||
import gym
|
||||
from gym.spaces import prng
|
||||
|
||||
class MultiDiscrete(gym.Space):
|
||||
"""
|
||||
- The multi-discrete action space consists of a series of discrete action spaces with different parameters
|
||||
- It can be adapted to both a Discrete action space or a continuous (Box) action space
|
||||
- It is useful to represent game controllers or keyboards where each key can be represented as a discrete action space
|
||||
- It is parametrized by passing an array of arrays containing [min, max] for each discrete action space
|
||||
where the discrete action space can take any integers from `min` to `max` (both inclusive)
|
||||
Note: A value of 0 always need to represent the NOOP action.
|
||||
e.g. Nintendo Game Controller
|
||||
- Can be conceptualized as 3 discrete action spaces:
|
||||
1) Arrow Keys: Discrete 5 - NOOP[0], UP[1], RIGHT[2], DOWN[3], LEFT[4] - params: min: 0, max: 4
|
||||
2) Button A: Discrete 2 - NOOP[0], Pressed[1] - params: min: 0, max: 1
|
||||
3) Button B: Discrete 2 - NOOP[0], Pressed[1] - params: min: 0, max: 1
|
||||
- Can be initialized as
|
||||
MultiDiscrete([ [0,4], [0,1], [0,1] ])
|
||||
"""
|
||||
def __init__(self, array_of_param_array):
|
||||
self.low = np.array([x[0] for x in array_of_param_array])
|
||||
self.high = np.array([x[1] for x in array_of_param_array])
|
||||
self.num_discrete_space = self.low.shape[0]
|
||||
|
||||
def sample(self):
|
||||
""" Returns a array with one sample from each discrete action space """
|
||||
# For each row: round(random .* (max - min) + min, 0)
|
||||
random_array = prng.np_random.rand(self.num_discrete_space)
|
||||
return [int(x) for x in np.floor(np.multiply((self.high - self.low + 1.), random_array) + self.low)]
|
||||
def contains(self, x):
|
||||
return len(x) == self.num_discrete_space and (np.array(x) >= self.low).all() and (np.array(x) <= self.high).all()
|
||||
|
||||
@property
|
||||
def shape(self):
|
||||
return self.num_discrete_space
|
||||
def __repr__(self):
|
||||
return "MultiDiscrete" + str(self.num_discrete_space)
|
||||
def __eq__(self, other):
|
||||
return np.array_equal(self.low, other.low) and np.array_equal(self.high, other.high)
|
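A quick, illustrative check of how this space behaves, following the controller example in the docstring:

# Illustrative usage of the vendored MultiDiscrete space.
space = MultiDiscrete([[0, 4], [0, 1], [0, 1]])
sample = space.sample()            # one integer per sub-space, e.g. [3, 0, 1]
assert space.contains(sample)
assert space.shape == 3            # three discrete sub-spaces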
|
@ -0,0 +1,52 @@
|
|||
import numpy as np
|
||||
from pyglet.window import key
|
||||
|
||||
# individual agent policy
|
||||
class Policy(object):
|
||||
def __init__(self):
|
||||
pass
|
||||
def action(self, obs):
|
||||
raise NotImplementedError()
|
||||
|
||||
# interactive policy based on keyboard input
|
||||
# hard-coded to deal only with movement, not communication
|
||||
class InteractivePolicy(Policy):
|
||||
def __init__(self, env, agent_index):
|
||||
super(InteractivePolicy, self).__init__()
|
||||
self.env = env
|
||||
# hard-coded keyboard events
|
||||
self.move = [False for i in range(4)]
|
||||
self.comm = [False for i in range(env.world.dim_c)]
|
||||
# register keyboard events with this environment's window
|
||||
env.viewers[agent_index].window.on_key_press = self.key_press
|
||||
env.viewers[agent_index].window.on_key_release = self.key_release
|
||||
|
||||
def action(self, obs):
|
||||
# ignore observation and just act based on keyboard events
|
||||
if self.env.discrete_action_input:
|
||||
u = 0
|
||||
if self.move[0]: u = 1
|
||||
if self.move[1]: u = 2
|
||||
if self.move[2]: u = 4
|
||||
if self.move[3]: u = 3
|
||||
else:
|
||||
u = np.zeros(5) # 5-d because of no-move action
|
||||
if self.move[0]: u[1] += 1.0
|
||||
if self.move[1]: u[2] += 1.0
|
||||
if self.move[3]: u[3] += 1.0
|
||||
if self.move[2]: u[4] += 1.0
|
||||
if True not in self.move:
|
||||
u[0] += 1.0
|
||||
return np.concatenate([u, np.zeros(self.env.world.dim_c)])
|
||||
|
||||
# keyboard event callbacks
|
||||
def key_press(self, k, mod):
|
||||
if k==key.LEFT: self.move[0] = True
|
||||
if k==key.RIGHT: self.move[1] = True
|
||||
if k==key.UP: self.move[2] = True
|
||||
if k==key.DOWN: self.move[3] = True
|
||||
def key_release(self, k, mod):
|
||||
if k==key.LEFT: self.move[0] = False
|
||||
if k==key.RIGHT: self.move[1] = False
|
||||
if k==key.UP: self.move[2] = False
|
||||
if k==key.DOWN: self.move[3] = False
|
|
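A rough sketch of how this policy is meant to be wired up (assumes a MultiAgentEnv built elsewhere in this commit; render() must be called once so env.viewers exists before the key handlers are attached):

# Hypothetical driver loop; env construction is not shown in this file.
obs_n = env.reset()
env.render()                                   # creates env.viewers[agent_index]
policy = InteractivePolicy(env, agent_index=0)
while True:
    act_n = [policy.action(obs_n[0])]          # arrow keys steer the single agent
    obs_n, reward_n, done_n, _ = env.step(act_n)
    env.render()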
@ -0,0 +1,345 @@
|
|||
"""
|
||||
2D rendering framework
|
||||
"""
|
||||
from __future__ import division
|
||||
import os
|
||||
import six
|
||||
import sys
|
||||
|
||||
if "Apple" in sys.version:
|
||||
if 'DYLD_FALLBACK_LIBRARY_PATH' in os.environ:
|
||||
os.environ['DYLD_FALLBACK_LIBRARY_PATH'] += ':/usr/lib'
|
||||
# (JDS 2016/04/15): avoid bug on Anaconda 2.3.0 / Yosemite
|
||||
|
||||
from gym.utils import reraise
|
||||
from gym import error
|
||||
|
||||
try:
|
||||
import pyglet
|
||||
except ImportError as e:
|
||||
reraise(suffix="HINT: you can install pyglet directly via 'pip install pyglet'. But if you really just want to install all Gym dependencies and not have to think about it, 'pip install -e .[all]' or 'pip install gym[all]' will do it.")
|
||||
|
||||
try:
|
||||
from pyglet.gl import *
|
||||
except ImportError as e:
|
||||
reraise(prefix="Error occurred while running `from pyglet.gl import *`",suffix="HINT: make sure you have OpenGL installed. On Ubuntu, you can run 'apt-get install python-opengl'. If you're running on a server, you may need a virtual frame buffer; something like this should work: 'xvfb-run -s \"-screen 0 1400x900x24\" python <your_script.py>'")
|
||||
|
||||
import math
|
||||
import numpy as np
|
||||
|
||||
RAD2DEG = 57.29577951308232
|
||||
|
||||
def get_display(spec):
|
||||
"""Convert a display specification (such as :0) into an actual Display
|
||||
object.
|
||||
|
||||
Pyglet only supports multiple Displays on Linux.
|
||||
"""
|
||||
if spec is None:
|
||||
return None
|
||||
elif isinstance(spec, six.string_types):
|
||||
return pyglet.canvas.Display(spec)
|
||||
else:
|
||||
raise error.Error('Invalid display specification: {}. (Must be a string like :0 or None.)'.format(spec))
|
||||
|
||||
class Viewer(object):
|
||||
def __init__(self, width, height, display=None):
|
||||
display = get_display(display)
|
||||
|
||||
self.width = width
|
||||
self.height = height
|
||||
|
||||
self.window = pyglet.window.Window(width=width, height=height, display=display)
|
||||
self.window.on_close = self.window_closed_by_user
|
||||
self.geoms = []
|
||||
self.onetime_geoms = []
|
||||
self.transform = Transform()
|
||||
|
||||
glEnable(GL_BLEND)
|
||||
# glEnable(GL_MULTISAMPLE)
|
||||
glEnable(GL_LINE_SMOOTH)
|
||||
# glHint(GL_LINE_SMOOTH_HINT, GL_DONT_CARE)
|
||||
glHint(GL_LINE_SMOOTH_HINT, GL_NICEST)
|
||||
glLineWidth(2.0)
|
||||
glBlendFunc(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA)
|
||||
|
||||
def close(self):
|
||||
self.window.close()
|
||||
|
||||
def window_closed_by_user(self):
|
||||
self.close()
|
||||
|
||||
def set_bounds(self, left, right, bottom, top):
|
||||
assert right > left and top > bottom
|
||||
scalex = self.width/(right-left)
|
||||
scaley = self.height/(top-bottom)
|
||||
self.transform = Transform(
|
||||
translation=(-left*scalex, -bottom*scaley),
|
||||
scale=(scalex, scaley))
|
||||
|
||||
def add_geom(self, geom):
|
||||
self.geoms.append(geom)
|
||||
|
||||
def add_onetime(self, geom):
|
||||
self.onetime_geoms.append(geom)
|
||||
|
||||
def render(self, return_rgb_array=False):
|
||||
glClearColor(1,1,1,1)
|
||||
self.window.clear()
|
||||
self.window.switch_to()
|
||||
self.window.dispatch_events()
|
||||
self.transform.enable()
|
||||
for geom in self.geoms:
|
||||
geom.render()
|
||||
for geom in self.onetime_geoms:
|
||||
geom.render()
|
||||
self.transform.disable()
|
||||
arr = None
|
||||
if return_rgb_array:
|
||||
buffer = pyglet.image.get_buffer_manager().get_color_buffer()
|
||||
image_data = buffer.get_image_data()
|
||||
arr = np.fromstring(image_data.data, dtype=np.uint8, sep='')
|
||||
# In https://github.com/openai/gym-http-api/issues/2, we
|
||||
# discovered that someone using Xmonad on Arch was having
|
||||
# a window of size 598 x 398, though a 600 x 400 window
|
||||
# was requested. (Guess Xmonad was preserving a pixel for
|
||||
# the boundary.) So we use the buffer height/width rather
|
||||
# than the requested one.
|
||||
arr = arr.reshape(buffer.height, buffer.width, 4)
|
||||
arr = arr[::-1,:,0:3]
|
||||
self.window.flip()
|
||||
self.onetime_geoms = []
|
||||
return arr
|
||||
|
||||
# Convenience
|
||||
def draw_circle(self, radius=10, res=30, filled=True, **attrs):
|
||||
geom = make_circle(radius=radius, res=res, filled=filled)
|
||||
_add_attrs(geom, attrs)
|
||||
self.add_onetime(geom)
|
||||
return geom
|
||||
|
||||
def draw_polygon(self, v, filled=True, **attrs):
|
||||
geom = make_polygon(v=v, filled=filled)
|
||||
_add_attrs(geom, attrs)
|
||||
self.add_onetime(geom)
|
||||
return geom
|
||||
|
||||
def draw_polyline(self, v, **attrs):
|
||||
geom = make_polyline(v=v)
|
||||
_add_attrs(geom, attrs)
|
||||
self.add_onetime(geom)
|
||||
return geom
|
||||
|
||||
def draw_line(self, start, end, **attrs):
|
||||
geom = Line(start, end)
|
||||
_add_attrs(geom, attrs)
|
||||
self.add_onetime(geom)
|
||||
return geom
|
||||
|
||||
def get_array(self):
|
||||
self.window.flip()
|
||||
image_data = pyglet.image.get_buffer_manager().get_color_buffer().get_image_data()
|
||||
self.window.flip()
|
||||
arr = np.fromstring(image_data.data, dtype=np.uint8, sep='')
|
||||
arr = arr.reshape(self.height, self.width, 4)
|
||||
return arr[::-1,:,0:3]
|
||||
|
||||
def _add_attrs(geom, attrs):
|
||||
if "color" in attrs:
|
||||
geom.set_color(*attrs["color"])
|
||||
if "linewidth" in attrs:
|
||||
geom.set_linewidth(attrs["linewidth"])
|
||||
|
||||
class Geom(object):
|
||||
def __init__(self):
|
||||
self._color=Color((0, 0, 0, 1.0))
|
||||
self.attrs = [self._color]
|
||||
def render(self):
|
||||
for attr in reversed(self.attrs):
|
||||
attr.enable()
|
||||
self.render1()
|
||||
for attr in self.attrs:
|
||||
attr.disable()
|
||||
def render1(self):
|
||||
raise NotImplementedError
|
||||
def add_attr(self, attr):
|
||||
self.attrs.append(attr)
|
||||
def set_color(self, r, g, b, alpha=1):
|
||||
self._color.vec4 = (r, g, b, alpha)
|
||||
|
||||
class Attr(object):
|
||||
def enable(self):
|
||||
raise NotImplementedError
|
||||
def disable(self):
|
||||
pass
|
||||
|
||||
class Transform(Attr):
|
||||
def __init__(self, translation=(0.0, 0.0), rotation=0.0, scale=(1,1)):
|
||||
self.set_translation(*translation)
|
||||
self.set_rotation(rotation)
|
||||
self.set_scale(*scale)
|
||||
def enable(self):
|
||||
glPushMatrix()
|
||||
glTranslatef(self.translation[0], self.translation[1], 0) # translate to GL location point
|
||||
glRotatef(RAD2DEG * self.rotation, 0, 0, 1.0)
|
||||
glScalef(self.scale[0], self.scale[1], 1)
|
||||
def disable(self):
|
||||
glPopMatrix()
|
||||
def set_translation(self, newx, newy):
|
||||
self.translation = (float(newx), float(newy))
|
||||
def set_rotation(self, new):
|
||||
self.rotation = float(new)
|
||||
def set_scale(self, newx, newy):
|
||||
self.scale = (float(newx), float(newy))
|
||||
|
||||
class Color(Attr):
|
||||
def __init__(self, vec4):
|
||||
self.vec4 = vec4
|
||||
def enable(self):
|
||||
glColor4f(*self.vec4)
|
||||
|
||||
class LineStyle(Attr):
|
||||
def __init__(self, style):
|
||||
self.style = style
|
||||
def enable(self):
|
||||
glEnable(GL_LINE_STIPPLE)
|
||||
glLineStipple(1, self.style)
|
||||
def disable(self):
|
||||
glDisable(GL_LINE_STIPPLE)
|
||||
|
||||
class LineWidth(Attr):
|
||||
def __init__(self, stroke):
|
||||
self.stroke = stroke
|
||||
def enable(self):
|
||||
glLineWidth(self.stroke)
|
||||
|
||||
class Point(Geom):
|
||||
def __init__(self):
|
||||
Geom.__init__(self)
|
||||
def render1(self):
|
||||
glBegin(GL_POINTS) # draw point
|
||||
glVertex3f(0.0, 0.0, 0.0)
|
||||
glEnd()
|
||||
|
||||
class FilledPolygon(Geom):
|
||||
def __init__(self, v):
|
||||
Geom.__init__(self)
|
||||
self.v = v
|
||||
def render1(self):
|
||||
if len(self.v) == 4 : glBegin(GL_QUADS)
|
||||
elif len(self.v) > 4 : glBegin(GL_POLYGON)
|
||||
else: glBegin(GL_TRIANGLES)
|
||||
for p in self.v:
|
||||
glVertex3f(p[0], p[1],0) # draw each vertex
|
||||
glEnd()
|
||||
|
||||
color = (self._color.vec4[0] * 0.5, self._color.vec4[1] * 0.5, self._color.vec4[2] * 0.5, self._color.vec4[3] * 0.5)
|
||||
glColor4f(*color)
|
||||
glBegin(GL_LINE_LOOP)
|
||||
for p in self.v:
|
||||
glVertex3f(p[0], p[1],0) # draw each vertex
|
||||
glEnd()
|
||||
|
||||
def make_circle(radius=10, res=30, filled=True):
|
||||
points = []
|
||||
for i in range(res):
|
||||
ang = 2*math.pi*i / res
|
||||
points.append((math.cos(ang)*radius, math.sin(ang)*radius))
|
||||
if filled:
|
||||
return FilledPolygon(points)
|
||||
else:
|
||||
return PolyLine(points, True)
|
||||
|
||||
def make_polygon(v, filled=True):
|
||||
if filled: return FilledPolygon(v)
|
||||
else: return PolyLine(v, True)
|
||||
|
||||
def make_polyline(v):
|
||||
return PolyLine(v, False)
|
||||
|
||||
def make_capsule(length, width):
|
||||
l, r, t, b = 0, length, width/2, -width/2
|
||||
box = make_polygon([(l,b), (l,t), (r,t), (r,b)])
|
||||
circ0 = make_circle(width/2)
|
||||
circ1 = make_circle(width/2)
|
||||
circ1.add_attr(Transform(translation=(length, 0)))
|
||||
geom = Compound([box, circ0, circ1])
|
||||
return geom
|
||||
|
||||
class Compound(Geom):
|
||||
def __init__(self, gs):
|
||||
Geom.__init__(self)
|
||||
self.gs = gs
|
||||
for g in self.gs:
|
||||
g.attrs = [a for a in g.attrs if not isinstance(a, Color)]
|
||||
def render1(self):
|
||||
for g in self.gs:
|
||||
g.render()
|
||||
|
||||
class PolyLine(Geom):
|
||||
def __init__(self, v, close):
|
||||
Geom.__init__(self)
|
||||
self.v = v
|
||||
self.close = close
|
||||
self.linewidth = LineWidth(1)
|
||||
self.add_attr(self.linewidth)
|
||||
def render1(self):
|
||||
glBegin(GL_LINE_LOOP if self.close else GL_LINE_STRIP)
|
||||
for p in self.v:
|
||||
glVertex3f(p[0], p[1],0) # draw each vertex
|
||||
glEnd()
|
||||
def set_linewidth(self, x):
|
||||
self.linewidth.stroke = x
|
||||
|
||||
class Line(Geom):
|
||||
def __init__(self, start=(0.0, 0.0), end=(0.0, 0.0)):
|
||||
Geom.__init__(self)
|
||||
self.start = start
|
||||
self.end = end
|
||||
self.linewidth = LineWidth(1)
|
||||
self.add_attr(self.linewidth)
|
||||
|
||||
def render1(self):
|
||||
glBegin(GL_LINES)
|
||||
glVertex2f(*self.start)
|
||||
glVertex2f(*self.end)
|
||||
glEnd()
|
||||
|
||||
class Image(Geom):
|
||||
def __init__(self, fname, width, height):
|
||||
Geom.__init__(self)
|
||||
self.width = width
|
||||
self.height = height
|
||||
img = pyglet.image.load(fname)
|
||||
self.img = img
|
||||
self.flip = False
|
||||
def render1(self):
|
||||
self.img.blit(-self.width/2, -self.height/2, width=self.width, height=self.height)
|
||||
|
||||
# ================================================================
|
||||
|
||||
class SimpleImageViewer(object):
|
||||
def __init__(self, display=None):
|
||||
self.window = None
|
||||
self.isopen = False
|
||||
self.display = display
|
||||
def imshow(self, arr):
|
||||
if self.window is None:
|
||||
height, width, channels = arr.shape
|
||||
self.window = pyglet.window.Window(width=width, height=height, display=self.display)
|
||||
self.width = width
|
||||
self.height = height
|
||||
self.isopen = True
|
||||
assert arr.shape == (self.height, self.width, 3), "You passed in an image with the wrong shape"
|
||||
image = pyglet.image.ImageData(self.width, self.height, 'RGB', arr.tobytes(), pitch=self.width * -3)
|
||||
self.window.clear()
|
||||
self.window.switch_to()
|
||||
self.window.dispatch_events()
|
||||
image.blit(0,0)
|
||||
self.window.flip()
|
||||
def close(self):
|
||||
if self.isopen:
|
||||
self.window.close()
|
||||
self.isopen = False
|
||||
def __del__(self):
|
||||
self.close()
|
|
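The Viewer above is a thin pyglet/OpenGL wrapper; a rough, illustrative sketch of drawing with it on a machine with a working display:

# Illustrative use of the Viewer and geometry helpers defined above.
viewer = Viewer(600, 400)
viewer.set_bounds(-1, 1, -1, 1)                # map world coordinates onto the window
circle = make_circle(radius=0.1)
circle.set_color(0.85, 0.35, 0.35)
circle.add_attr(Transform(translation=(0.5, 0.0)))
viewer.add_geom(circle)                        # persistent geom, redrawn each frame
frame = viewer.render(return_rgb_array=True)   # (height, width, 3) uint8 array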
@ -0,0 +1,10 @@
|
|||
import numpy as np
|
||||
|
||||
# defines scenario upon which the world is built
|
||||
class BaseScenario(object):
|
||||
# create elements of the world
|
||||
def make_world(self):
|
||||
raise NotImplementedError()
|
||||
# create initial conditions of the world
|
||||
def reset_world(self, world):
|
||||
raise NotImplementedError()
|
|
@ -0,0 +1,7 @@
|
|||
import imp
|
||||
import os.path as osp
|
||||
|
||||
|
||||
def load(name):
|
||||
pathname = osp.join(osp.dirname(__file__), name)
|
||||
return imp.load_source('', pathname)
|
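Typical usage of this loader, sketched here with the simple_adversary scenario from this commit (the MultiAgentEnv constructor call is an assumption about the accompanying environment module):

# Illustrative: load a scenario module by filename and build its world.
scenario = load("simple_adversary.py").Scenario()
world = scenario.make_world()
# The callbacks are then handed to the environment wrapper, e.g.
# env = MultiAgentEnv(world, scenario.reset_world, scenario.reward, scenario.observation)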
|
@ -0,0 +1,139 @@
|
|||
import numpy as np
|
||||
from multiagent.core import World, Agent, Landmark
|
||||
from multiagent.scenario import BaseScenario
|
||||
|
||||
|
||||
class Scenario(BaseScenario):
|
||||
|
||||
def make_world(self):
|
||||
world = World()
|
||||
# set any world properties first
|
||||
world.dim_c = 2
|
||||
num_agents = 3
|
||||
world.num_agents = num_agents
|
||||
num_adversaries = 1
|
||||
num_landmarks = num_agents - 1
|
||||
# add agents
|
||||
world.agents = [Agent() for i in range(num_agents)]
|
||||
for i, agent in enumerate(world.agents):
|
||||
agent.name = 'agent %d' % i
|
||||
agent.collide = False
|
||||
agent.silent = True
|
||||
agent.adversary = True if i < num_adversaries else False
|
||||
agent.size = 0.15
|
||||
# add landmarks
|
||||
world.landmarks = [Landmark() for i in range(num_landmarks)]
|
||||
for i, landmark in enumerate(world.landmarks):
|
||||
landmark.name = 'landmark %d' % i
|
||||
landmark.collide = False
|
||||
landmark.movable = False
|
||||
landmark.size = 0.08
|
||||
# make initial conditions
|
||||
self.reset_world(world)
|
||||
return world
|
||||
|
||||
def reset_world(self, world):
|
||||
# random properties for agents
|
||||
world.agents[0].color = np.array([0.85, 0.35, 0.35])
|
||||
for i in range(1, world.num_agents):
|
||||
world.agents[i].color = np.array([0.35, 0.35, 0.85])
|
||||
# random properties for landmarks
|
||||
for i, landmark in enumerate(world.landmarks):
|
||||
landmark.color = np.array([0.15, 0.15, 0.15])
|
||||
# set goal landmark
|
||||
goal = np.random.choice(world.landmarks)
|
||||
goal.color = np.array([0.15, 0.65, 0.15])
|
||||
for agent in world.agents:
|
||||
agent.goal_a = goal
|
||||
# set random initial states
|
||||
for agent in world.agents:
|
||||
agent.state.p_pos = np.random.uniform(-1, +1, world.dim_p)
|
||||
agent.state.p_vel = np.zeros(world.dim_p)
|
||||
agent.state.c = np.zeros(world.dim_c)
|
||||
for i, landmark in enumerate(world.landmarks):
|
||||
landmark.state.p_pos = np.random.uniform(-1, +1, world.dim_p)
|
||||
landmark.state.p_vel = np.zeros(world.dim_p)
|
||||
|
||||
def benchmark_data(self, agent, world):
|
||||
# returns data for benchmarking purposes
|
||||
if agent.adversary:
|
||||
return np.sum(np.square(agent.state.p_pos - agent.goal_a.state.p_pos))
|
||||
else:
|
||||
dists = []
|
||||
for l in world.landmarks:
|
||||
dists.append(np.sum(np.square(agent.state.p_pos - l.state.p_pos)))
|
||||
dists.append(np.sum(np.square(agent.state.p_pos - agent.goal_a.state.p_pos)))
|
||||
return tuple(dists)
|
||||
|
||||
# return all agents that are not adversaries
|
||||
def good_agents(self, world):
|
||||
return [agent for agent in world.agents if not agent.adversary]
|
||||
|
||||
# return all adversarial agents
|
||||
def adversaries(self, world):
|
||||
return [agent for agent in world.agents if agent.adversary]
|
||||
|
||||
def reward(self, agent, world):
|
||||
# reward depends on the agent's role: adversary reward vs. good-agent reward
|
||||
return self.adversary_reward(agent, world) if agent.adversary else self.agent_reward(agent, world)
|
||||
|
||||
def agent_reward(self, agent, world):
|
||||
# Rewarded based on how close any good agent is to the goal landmark, and how far the adversary is from it
|
||||
shaped_reward = True
|
||||
shaped_adv_reward = True
|
||||
|
||||
# Calculate negative reward for adversary
|
||||
adversary_agents = self.adversaries(world)
|
||||
if shaped_adv_reward: # distance-based adversary reward
|
||||
adv_rew = sum([np.sqrt(np.sum(np.square(a.state.p_pos - a.goal_a.state.p_pos))) for a in adversary_agents])
|
||||
else: # proximity-based adversary reward (binary)
|
||||
adv_rew = 0
|
||||
for a in adversary_agents:
|
||||
if np.sqrt(np.sum(np.square(a.state.p_pos - a.goal_a.state.p_pos))) < 2 * a.goal_a.size:
|
||||
adv_rew -= 5
|
||||
|
||||
# Calculate positive reward for agents
|
||||
good_agents = self.good_agents(world)
|
||||
if shaped_reward: # distance-based agent reward
|
||||
pos_rew = -min(
|
||||
[np.sqrt(np.sum(np.square(a.state.p_pos - a.goal_a.state.p_pos))) for a in good_agents])
|
||||
else: # proximity-based agent reward (binary)
|
||||
pos_rew = 0
|
||||
if min([np.sqrt(np.sum(np.square(a.state.p_pos - a.goal_a.state.p_pos))) for a in good_agents]) \
|
||||
< 2 * agent.goal_a.size:
|
||||
pos_rew += 5
|
||||
pos_rew -= min(
|
||||
[np.sqrt(np.sum(np.square(a.state.p_pos - a.goal_a.state.p_pos))) for a in good_agents])
|
||||
return pos_rew + adv_rew
|
||||
|
||||
def adversary_reward(self, agent, world):
|
||||
# Rewarded based on proximity to the goal landmark
|
||||
shaped_reward = True
|
||||
if shaped_reward: # distance-based reward
|
||||
return -np.sum(np.square(agent.state.p_pos - agent.goal_a.state.p_pos))
|
||||
else: # proximity-based reward (binary)
|
||||
adv_rew = 0
|
||||
if np.sqrt(np.sum(np.square(agent.state.p_pos - agent.goal_a.state.p_pos))) < 2 * agent.goal_a.size:
|
||||
adv_rew += 5
|
||||
return adv_rew
|
||||
|
||||
|
||||
def observation(self, agent, world):
|
||||
# get positions of all entities in this agent's reference frame
|
||||
entity_pos = []
|
||||
for entity in world.landmarks:
|
||||
entity_pos.append(entity.state.p_pos - agent.state.p_pos)
|
||||
# entity colors
|
||||
entity_color = []
|
||||
for entity in world.landmarks:
|
||||
entity_color.append(entity.color)
|
||||
# positions of all other agents (communication is not observed in this scenario)
|
||||
other_pos = []
|
||||
for other in world.agents:
|
||||
if other is agent: continue
|
||||
other_pos.append(other.state.p_pos - agent.state.p_pos)
|
||||
|
||||
if not agent.adversary:
|
||||
return np.concatenate([agent.goal_a.state.p_pos - agent.state.p_pos] + entity_pos + other_pos)
|
||||
else:
|
||||
return np.concatenate(entity_pos + other_pos)
|
|
@ -0,0 +1,86 @@
|
|||
from tqdm import tqdm
|
||||
from agent import Agent
|
||||
from common.replay_buffer import Buffer
|
||||
import torch
|
||||
import os
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
class Runner:
|
||||
def __init__(self, args, env):
|
||||
self.args = args
|
||||
self.noise = args.noise_rate
|
||||
self.epsilon = args.epsilon
|
||||
self.episode_limit = args.max_episode_len
|
||||
self.env = env
|
||||
self.agents = self._init_agents()
|
||||
self.buffer = Buffer(args)
|
||||
self.save_path = self.args.save_dir + '/' + self.args.scenario_name
|
||||
if not os.path.exists(self.save_path):
|
||||
os.makedirs(self.save_path)
|
||||
|
||||
def _init_agents(self):
|
||||
agents = []
|
||||
for i in range(self.args.n_agents):
|
||||
agent = Agent(i, self.args)
|
||||
agents.append(agent)
|
||||
return agents
|
||||
|
||||
def run(self):
|
||||
returns = []
|
||||
for time_step in tqdm(range(self.args.time_steps)):
|
||||
# reset the environment
|
||||
if time_step % self.episode_limit == 0:
|
||||
s = self.env.reset()
|
||||
u = []
|
||||
actions = []
|
||||
with torch.no_grad():
|
||||
for agent_id, agent in enumerate(self.agents):
|
||||
action = agent.select_action(s[agent_id], self.noise, self.epsilon)
|
||||
u.append(action)
|
||||
actions.append(action)
|
||||
for i in range(self.args.n_agents, self.args.n_players):
|
||||
actions.append([0, np.random.rand() * 2 - 1, 0, np.random.rand() * 2 - 1, 0])
|
||||
s_next, r, done, info = self.env.step(actions)
|
||||
self.buffer.store_episode(s[:self.args.n_agents], u, r[:self.args.n_agents], s_next[:self.args.n_agents])
|
||||
s = s_next
|
||||
if self.buffer.current_size >= self.args.batch_size:
|
||||
transitions = self.buffer.sample(self.args.batch_size)
|
||||
for agent in self.agents:
|
||||
other_agents = self.agents.copy()
|
||||
other_agents.remove(agent)
|
||||
agent.learn(transitions, other_agents)
|
||||
if time_step > 0 and time_step % self.args.evaluate_rate == 0:
|
||||
returns.append(self.evaluate())
|
||||
plt.figure()
|
||||
plt.plot(range(len(returns)), returns)
|
||||
plt.xlabel('episode * ' + str(self.args.evaluate_rate / self.episode_limit))
|
||||
plt.ylabel('average returns')
|
||||
plt.savefig(self.save_path + '/plt.png', format='png')
|
||||
self.noise = max(0.05, self.noise - 0.0000005)
|
||||
self.epsilon = max(0.05, self.epsilon - 0.0000005)
|
||||
np.save(self.save_path + '/returns.pkl', returns)  # note: np.save appends '.npy', so this writes returns.pkl.npy
|
||||
|
||||
def evaluate(self):
|
||||
returns = []
|
||||
for episode in range(self.args.evaluate_episodes):
|
||||
# reset the environment
|
||||
s = self.env.reset()
|
||||
rewards = 0
|
||||
for time_step in range(self.args.evaluate_episode_len):
|
||||
# if (episode > self.args.evaluate_episode_len - 50):
|
||||
#self.env.render()
|
||||
actions = []
|
||||
with torch.no_grad():
|
||||
for agent_id, agent in enumerate(self.agents):
|
||||
action = agent.select_action(s[agent_id], 0, 0)
|
||||
actions.append(action)
|
||||
for i in range(self.args.n_agents, self.args.n_players):
|
||||
actions.append([0, np.random.rand() * 2 - 1, 0, np.random.rand() * 2 - 1, 0])
|
||||
s_next, r, done, info = self.env.step(actions)
|
||||
rewards += r[0]
|
||||
s = s_next
|
||||
returns.append(rewards)
|
||||
if (episode % 1000 == 0):
|
||||
print('Returns is', rewards)
|
||||
return sum(returns) / self.args.evaluate_episodes
|
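For completeness, a hedged sketch of an entry point that drives the Runner (the get_args/make_env helpers and their module paths are assumptions about this repo's common utilities, which are outside this section):

# Hypothetical main script; helper names are assumptions, not confirmed by this diff.
from common.arguments import get_args
from common.utils import make_env
from runner import Runner

if __name__ == '__main__':
    args = get_args()
    env, args = make_env(args)      # assumed to fill n_agents, n_players, obs/action shapes
    runner = Runner(args, env)
    runner.run()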