diff --git a/.gitignore b/.gitignore index c4d7fc4..26f8b6a 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,4 @@ -.ipynb* \ No newline at end of file +.ipynb* +/model/*/*/* +!/model/*/*/actor* +!/model/*/*/critic* diff --git a/agent.py b/agent.py new file mode 100644 index 0000000..5b91c07 --- /dev/null +++ b/agent.py @@ -0,0 +1,28 @@ +import numpy as np +import torch +import os +from maddpg.maddpg import MADDPG + + +class Agent: + def __init__(self, agent_id, args): + self.args = args + self.agent_id = agent_id + self.policy = MADDPG(args, agent_id) + + def select_action(self, o, noise_rate, epsilon): + if np.random.uniform() < epsilon: + u = np.random.uniform(-self.args.high_action, self.args.high_action, self.args.action_shape[self.agent_id]) + else: + inputs = torch.tensor(o, dtype=torch.float32).unsqueeze(0) + pi = self.policy.actor_network(inputs).squeeze(0) + # print('{} : {}'.format(self.name, pi)) + u = pi.cpu().numpy() + noise = noise_rate * self.args.high_action * np.random.randn(*u.shape) # gaussian noise + u += noise + u = np.clip(u, -self.args.high_action, self.args.high_action) + return u.copy() + + def learn(self, transitions, other_agents): + self.policy.train(transitions, other_agents) + diff --git a/common/arguments.py b/common/arguments.py new file mode 100644 index 0000000..76965c3 --- /dev/null +++ b/common/arguments.py @@ -0,0 +1,38 @@ +import argparse + +""" +Here are the param for the training + +""" + + +def get_args(): + parser = argparse.ArgumentParser("Reinforcement Learning experiments for multiagent environments") + # Environment + parser.add_argument("--scenario-name", type=str, default="simple_tag", help="name of the scenario script") + parser.add_argument("--max-episode-len", type=int, default=100, help="maximum episode length") + parser.add_argument("--time-steps", type=int, default=2000000, help="number of time steps") + # agents + parser.add_argument("--num-adversaries", type=int, default=1, help="number of adversaries") + # Core training parameters + parser.add_argument("--lr-actor", type=float, default=1e-4, help="learning rate of actor") + parser.add_argument("--lr-critic", type=float, default=1e-3, help="learning rate of critic") + parser.add_argument("--epsilon", type=float, default=0.1, help="epsilon greedy") + parser.add_argument("--noise_rate", type=float, default=0.1, help="noise rate for sampling from a standard normal distribution ") + parser.add_argument("--gamma", type=float, default=0.95, help="discount factor") + parser.add_argument("--tau", type=float, default=0.01, help="parameter for updating the target network") + parser.add_argument("--buffer-size", type=int, default=int(5e5), help="number of transitions can be stored in buffer") + parser.add_argument("--batch-size", type=int, default=256, help="number of episodes to optimize at the same time") + # Checkpointing + parser.add_argument("--save-dir", type=str, default="./model", help="directory in which training state and model should be saved") + parser.add_argument("--save-rate", type=int, default=2000, help="save model once every time this many episodes are completed") + parser.add_argument("--model-dir", type=str, default="", help="directory in which training state and model are loaded") + + # Evaluate + parser.add_argument("--evaluate-episodes", type=int, default=10, help="number of episodes for evaluating") + parser.add_argument("--evaluate-episode-len", type=int, default=100, help="length of episodes for evaluating") + parser.add_argument("--evaluate", type=bool, default=False, 
help="whether to evaluate the model") + parser.add_argument("--evaluate-rate", type=int, default=1000, help="how often to evaluate model") + args = parser.parse_args() + + return args diff --git a/common/replay_buffer.py b/common/replay_buffer.py new file mode 100644 index 0000000..eb8652f --- /dev/null +++ b/common/replay_buffer.py @@ -0,0 +1,53 @@ +import threading +import numpy as np + + +class Buffer: + def __init__(self, args): + self.size = args.buffer_size + self.args = args + # memory management + self.current_size = 0 + # create the buffer to store info + self.buffer = dict() + for i in range(self.args.n_agents): + self.buffer['o_%d' % i] = np.empty([self.size, self.args.obs_shape[i]]) + self.buffer['u_%d' % i] = np.empty([self.size, self.args.action_shape[i]]) + self.buffer['r_%d' % i] = np.empty([self.size]) + self.buffer['o_next_%d' % i] = np.empty([self.size, self.args.obs_shape[i]]) + # thread lock + self.lock = threading.Lock() + + # store the episode + def store_episode(self, o, u, r, o_next): + idxs = self._get_storage_idx(inc=1) + for i in range(self.args.n_agents): + with self.lock: + self.buffer['o_%d' % i][idxs] = o[i] + self.buffer['u_%d' % i][idxs] = u[i] + self.buffer['r_%d' % i][idxs] = r[i] + self.buffer['o_next_%d' % i][idxs] = o_next[i] + + # sample the data from the replay buffer + def sample(self, batch_size): + temp_buffer = {} + idx = np.random.randint(0, self.current_size, batch_size) + for key in self.buffer.keys(): + temp_buffer[key] = self.buffer[key][idx] + return temp_buffer + + def _get_storage_idx(self, inc=None): + inc = inc or 1 + if self.current_size+inc <= self.size: + idx = np.arange(self.current_size, self.current_size+inc) + elif self.current_size < self.size: + overflow = inc - (self.size - self.current_size) + idx_a = np.arange(self.current_size, self.size) + idx_b = np.random.randint(0, self.current_size, overflow) + idx = np.concatenate([idx_a, idx_b]) + else: + idx = np.random.randint(0, self.size, inc) + self.current_size = min(self.size, self.current_size+inc) + if inc == 1: + idx = idx[0] + return idx diff --git a/common/utils.py b/common/utils.py new file mode 100644 index 0000000..a5c59e3 --- /dev/null +++ b/common/utils.py @@ -0,0 +1,54 @@ +import numpy as np +import inspect +import functools + + +def store_args(method): + """Stores provided method args as instance attributes. 
+ """ + argspec = inspect.getfullargspec(method) + defaults = {} + if argspec.defaults is not None: + defaults = dict( + zip(argspec.args[-len(argspec.defaults):], argspec.defaults)) + if argspec.kwonlydefaults is not None: + defaults.update(argspec.kwonlydefaults) + arg_names = argspec.args[1:] + + @functools.wraps(method) + def wrapper(*positional_args, **keyword_args): + self = positional_args[0] + # Get default arg values + args = defaults.copy() + # Add provided arg values + for name, value in zip(arg_names, positional_args[1:]): + args[name] = value + args.update(keyword_args) + self.__dict__.update(args) + return method(*positional_args, **keyword_args) + + return wrapper + + +def make_env(args): + from multiagent.environment import MultiAgentEnv + import multiagent.scenarios as scenarios + + # load scenario from script + scenario = scenarios.load(args.scenario_name + ".py").Scenario() + + # create world + world = scenario.make_world() + # create multiagent environment + env = MultiAgentEnv(world, scenario.reset_world, scenario.reward, scenario.observation) + # env = MultiAgentEnv(world) + args.n_players = env.n + args.n_agents = env.n - args.num_adversaries + args.obs_shape = [env.observation_space[i].shape[0] for i in range(args.n_agents)] + action_shape = [] + for content in env.action_space: + action_shape.append(content.n) + args.action_shape = action_shape[:args.n_agents] + args.high_action = 1 + args.low_action = -1 + return env, args diff --git a/maddpg/actor_critic.py b/maddpg/actor_critic.py new file mode 100644 index 0000000..e6a520f --- /dev/null +++ b/maddpg/actor_critic.py @@ -0,0 +1,44 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + + +# define the actor network +class Actor(nn.Module): + def __init__(self, args, agent_id): + super(Actor, self).__init__() + self.max_action = args.high_action + self.fc1 = nn.Linear(args.obs_shape[agent_id], 64) + self.fc2 = nn.Linear(64, 64) + self.fc3 = nn.Linear(64, 64) + self.action_out = nn.Linear(64, args.action_shape[agent_id]) + + def forward(self, x): + x = F.relu(self.fc1(x)) + x = F.relu(self.fc2(x)) + x = F.relu(self.fc3(x)) + actions = self.max_action * torch.tanh(self.action_out(x)) + + return actions + + +class Critic(nn.Module): + def __init__(self, args): + super(Critic, self).__init__() + self.max_action = args.high_action + self.fc1 = nn.Linear(sum(args.obs_shape) + sum(args.action_shape), 64) + self.fc2 = nn.Linear(64, 64) + self.fc3 = nn.Linear(64, 64) + self.q_out = nn.Linear(64, 1) + + def forward(self, state, action): + state = torch.cat(state, dim=1) + for i in range(len(action)): + action[i] /= self.max_action + action = torch.cat(action, dim=1) + x = torch.cat([state, action], dim=1) + x = F.relu(self.fc1(x)) + x = F.relu(self.fc2(x)) + x = F.relu(self.fc3(x)) + q_value = self.q_out(x) + return q_value diff --git a/maddpg/maddpg.py b/maddpg/maddpg.py new file mode 100644 index 0000000..096b55f --- /dev/null +++ b/maddpg/maddpg.py @@ -0,0 +1,114 @@ +import torch +import os +from maddpg.actor_critic import Actor, Critic + + +class MADDPG: + def __init__(self, args, agent_id): + self.args = args + self.agent_id = agent_id + self.train_step = 0 + + # create the network + self.actor_network = Actor(args, agent_id) + self.critic_network = Critic(args) + + # build up the target network + self.actor_target_network = Actor(args, agent_id) + self.critic_target_network = Critic(args) + + # load the weights into the target networks +
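The two target networks created above start as exact copies of the online actor and critic (the `load_state_dict` calls that follow) and are afterwards only nudged toward them at rate `--tau`, which keeps the bootstrap target `r + gamma * Q'(o', a')` used in `train()` slowly moving. A minimal standalone sketch of that soft-update rule, mirroring `_soft_update_target_network` defined further down in this class:

```python
import torch.nn as nn

def soft_update(target: nn.Module, source: nn.Module, tau: float) -> None:
    # target <- (1 - tau) * target + tau * source, applied parameter by parameter
    for t_param, s_param in zip(target.parameters(), source.parameters()):
        t_param.data.copy_((1.0 - tau) * t_param.data + tau * s_param.data)
```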
self.actor_target_network.load_state_dict(self.actor_network.state_dict()) + self.critic_target_network.load_state_dict(self.critic_network.state_dict()) + + # create the optimizer + self.actor_optim = torch.optim.Adam(self.actor_network.parameters(), lr=self.args.lr_actor) + self.critic_optim = torch.optim.Adam(self.critic_network.parameters(), lr=self.args.lr_critic) + + # create the dict for store the model + if not os.path.exists(self.args.save_dir): + os.mkdir(self.args.save_dir) + # path to save the model + self.model_path = self.args.save_dir + '/' + self.args.scenario_name + if not os.path.exists(self.model_path): + os.mkdir(self.model_path) + self.model_path = self.model_path + '/' + 'agent_%d' % agent_id + if not os.path.exists(self.model_path): + os.mkdir(self.model_path) + + if os.path.exists(self.model_path + '/actor_params.pkl'): + self.actor_network.load_state_dict(torch.load(self.model_path + '/actor_params.pkl')) + self.critic_network.load_state_dict(torch.load(self.model_path + '/critic_params.pkl')) + print('Agent {} successfully loaded actor_network: {}'.format(self.agent_id, + self.model_path + '/actor_params.pkl')) + print('Agent {} successfully loaded critic_network: {}'.format(self.agent_id, + self.model_path + '/critic_params.pkl')) + + # soft update + def _soft_update_target_network(self): + for target_param, param in zip(self.actor_target_network.parameters(), self.actor_network.parameters()): + target_param.data.copy_((1 - self.args.tau) * target_param.data + self.args.tau * param.data) + + for target_param, param in zip(self.critic_target_network.parameters(), self.critic_network.parameters()): + target_param.data.copy_((1 - self.args.tau) * target_param.data + self.args.tau * param.data) + + # update the network + def train(self, transitions, other_agents): + for key in transitions.keys(): + transitions[key] = torch.tensor(transitions[key], dtype=torch.float32) + r = transitions['r_%d' % self.agent_id] # reward + o, u, o_next = [], [], [] # agent + for agent_id in range(self.args.n_agents): + o.append(transitions['o_%d' % agent_id]) + u.append(transitions['u_%d' % agent_id]) + o_next.append(transitions['o_next_%d' % agent_id]) + + # calculate the target Q value function + u_next = [] + with torch.no_grad(): + index = 0 + for agent_id in range(self.args.n_agents): + if agent_id == self.agent_id: + u_next.append(self.actor_target_network(o_next[agent_id])) + else: + # other_agents + u_next.append(other_agents[index].policy.actor_target_network(o_next[agent_id])) + index += 1 + q_next = self.critic_target_network(o_next, u_next).detach() + + target_q = (r.unsqueeze(1) + self.args.gamma * q_next).detach() + + # the q loss + q_value = self.critic_network(o, u) + critic_loss = (target_q - q_value).pow(2).mean() + + # the actor loss + u[self.agent_id] = self.actor_network(o[self.agent_id]) + actor_loss = - self.critic_network(o, u).mean() + #if self.agent_id == 0: + # print('critic_loss is {}, actor_loss is {}'.format(critic_loss, actor_loss)) + # update the network + self.actor_optim.zero_grad() + actor_loss.backward() + self.actor_optim.step() + self.critic_optim.zero_grad() + critic_loss.backward() + self.critic_optim.step() + + self._soft_update_target_network() + if self.train_step > 0 and self.train_step % self.args.save_rate == 0: + self.save_model(self.train_step) + self.train_step += 1 + + def save_model(self, train_step): + num = str(train_step // self.args.save_rate) + model_path = os.path.join(self.args.save_dir, self.args.scenario_name) + if not 
os.path.exists(model_path): + os.makedirs(model_path) + model_path = os.path.join(model_path, 'agent_%d' % self.agent_id) + if not os.path.exists(model_path): + os.makedirs(model_path) + torch.save(self.actor_network.state_dict(), model_path + '/' + num + '_actor_params.pkl') + torch.save(self.critic_network.state_dict(), model_path + '/' + num + '_critic_params.pkl') + + diff --git a/main.ipynb b/main.ipynb new file mode 100644 index 0000000..5924edf --- /dev/null +++ b/main.ipynb @@ -0,0 +1,331 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "11c84981", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Looking in indexes: https://nexus.c68.spacecorp.ru/repository/pypi_group/simple/\n", + "Requirement already satisfied: torch==1.7.1 in /home/ovchinnikov_ii@RISDE.ru/Software/Jupyter/venv/lib/python3.9/site-packages (1.7.1)\n", + "Requirement already satisfied: typing-extensions in /home/ovchinnikov_ii@RISDE.ru/Software/Jupyter/venv/lib/python3.9/site-packages (from torch==1.7.1) (4.4.0)\n", + "Requirement already satisfied: numpy in /home/ovchinnikov_ii@RISDE.ru/Software/Jupyter/venv/lib/python3.9/site-packages (from torch==1.7.1) (1.24.1)\n", + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], + "source": [ + "pip install torch==1.7.1" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "735f4c82", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Looking in indexes: https://nexus.c68.spacecorp.ru/repository/pypi_group/simple/\n", + "Collecting supersuit==2.6.5\n", + " Using cached SuperSuit-2.6.5-py3-none-any.whl\n", + "Requirement already satisfied: cloudpickle in /home/ovchinnikov_ii@RISDE.ru/Software/Jupyter/venv/lib/python3.9/site-packages (from supersuit==2.6.5) (2.2.0)\n", + "Collecting pettingzoo>=1.6.0\n", + " Using cached https://nexus.c68.spacecorp.ru/repository/pypi_group/packages/pettingzoo/1.22.3/PettingZoo-1.22.3-py3-none-any.whl (816 kB)\n", + "Collecting opencv-python~=3.4.0\n", + " Using cached https://nexus.c68.spacecorp.ru/repository/pypi_group/packages/opencv-python/3.4.18.65/opencv_python-3.4.18.65-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (58.4 MB)\n", + "Requirement already satisfied: numpy>=1.19.3 in /home/ovchinnikov_ii@RISDE.ru/Software/Jupyter/venv/lib/python3.9/site-packages (from opencv-python~=3.4.0->supersuit==2.6.5) (1.24.1)\n", + "Requirement already satisfied: gymnasium>=0.26.0 in /home/ovchinnikov_ii@RISDE.ru/Software/Jupyter/venv/lib/python3.9/site-packages (from pettingzoo>=1.6.0->supersuit==2.6.5) (0.27.0)\n", + "Requirement already satisfied: typing-extensions>=4.3.0 in /home/ovchinnikov_ii@RISDE.ru/Software/Jupyter/venv/lib/python3.9/site-packages (from gymnasium>=0.26.0->pettingzoo>=1.6.0->supersuit==2.6.5) (4.4.0)\n", + "Requirement already satisfied: importlib-metadata>=4.8.0 in /home/ovchinnikov_ii@RISDE.ru/Software/Jupyter/venv/lib/python3.9/site-packages (from gymnasium>=0.26.0->pettingzoo>=1.6.0->supersuit==2.6.5) (6.0.0)\n", + "Requirement already satisfied: shimmy<1.0,>=0.1.0 in /home/ovchinnikov_ii@RISDE.ru/Software/Jupyter/venv/lib/python3.9/site-packages (from gymnasium>=0.26.0->pettingzoo>=1.6.0->supersuit==2.6.5) (0.2.0)\n", + "Requirement already satisfied: jax-jumpy>=0.2.0 in /home/ovchinnikov_ii@RISDE.ru/Software/Jupyter/venv/lib/python3.9/site-packages (from gymnasium>=0.26.0->pettingzoo>=1.6.0->supersuit==2.6.5) (0.2.0)\n", + "Requirement 
already satisfied: gymnasium-notices>=0.0.1 in /home/ovchinnikov_ii@RISDE.ru/Software/Jupyter/venv/lib/python3.9/site-packages (from gymnasium>=0.26.0->pettingzoo>=1.6.0->supersuit==2.6.5) (0.0.1)\n", + "Requirement already satisfied: zipp>=0.5 in /home/ovchinnikov_ii@RISDE.ru/Software/Jupyter/venv/lib/python3.9/site-packages (from importlib-metadata>=4.8.0->gymnasium>=0.26.0->pettingzoo>=1.6.0->supersuit==2.6.5) (3.11.0)\n", + "Installing collected packages: pettingzoo, opencv-python, supersuit\n", + " Attempting uninstall: pettingzoo\n", + " Found existing installation: PettingZoo 1.3.3\n", + " Uninstalling PettingZoo-1.3.3:\n", + " Successfully uninstalled PettingZoo-1.3.3\n", + " Attempting uninstall: opencv-python\n", + " Found existing installation: opencv-python 4.7.0.68\n", + " Uninstalling opencv-python-4.7.0.68:\n", + " Successfully uninstalled opencv-python-4.7.0.68\n", + " Attempting uninstall: supersuit\n", + " Found existing installation: SuperSuit 3.7.1\n", + " Uninstalling SuperSuit-3.7.1:\n", + " Successfully uninstalled SuperSuit-3.7.1\n", + "Successfully installed opencv-python-3.4.18.65 pettingzoo-1.22.3 supersuit-2.6.5\n", + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], + "source": [ + "pip install supersuit==2.6.5" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "3b8272be", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Looking in indexes: https://nexus.c68.spacecorp.ru/repository/pypi_group/simple/\n", + "Collecting tqdm\n", + " Using cached https://nexus.c68.spacecorp.ru/repository/pypi_group/packages/tqdm/4.64.1/tqdm-4.64.1-py2.py3-none-any.whl (78 kB)\n", + "Installing collected packages: tqdm\n", + "Successfully installed tqdm-4.64.1\n", + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], + "source": [ + "pip install tqdm" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "acc570b8", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Looking in indexes: https://nexus.c68.spacecorp.ru/repository/pypi_group/simple/\n", + "Requirement already satisfied: matplotlib in /home/ovchinnikov_ii@RISDE.ru/Software/Jupyter/venv/lib/python3.9/site-packages (3.6.3)\n", + "Requirement already satisfied: contourpy>=1.0.1 in /home/ovchinnikov_ii@RISDE.ru/Software/Jupyter/venv/lib/python3.9/site-packages (from matplotlib) (1.0.6)\n", + "Requirement already satisfied: numpy>=1.19 in /home/ovchinnikov_ii@RISDE.ru/Software/Jupyter/venv/lib/python3.9/site-packages (from matplotlib) (1.24.1)\n", + "Requirement already satisfied: pyparsing>=2.2.1 in /home/ovchinnikov_ii@RISDE.ru/Software/Jupyter/venv/lib/python3.9/site-packages (from matplotlib) (3.0.9)\n", + "Requirement already satisfied: packaging>=20.0 in /home/ovchinnikov_ii@RISDE.ru/Software/Jupyter/venv/lib/python3.9/site-packages (from matplotlib) (23.0)\n", + "Requirement already satisfied: fonttools>=4.22.0 in /home/ovchinnikov_ii@RISDE.ru/Software/Jupyter/venv/lib/python3.9/site-packages (from matplotlib) (4.38.0)\n", + "Requirement already satisfied: cycler>=0.10 in /home/ovchinnikov_ii@RISDE.ru/Software/Jupyter/venv/lib/python3.9/site-packages (from matplotlib) (0.11.0)\n", + "Requirement already satisfied: kiwisolver>=1.0.1 in /home/ovchinnikov_ii@RISDE.ru/Software/Jupyter/venv/lib/python3.9/site-packages (from matplotlib) (1.4.4)\n", + "Requirement already satisfied: pillow>=6.2.0 in 
/home/ovchinnikov_ii@RISDE.ru/Software/Jupyter/venv/lib/python3.9/site-packages (from matplotlib) (9.4.0)\n", + "Requirement already satisfied: python-dateutil>=2.7 in /home/ovchinnikov_ii@RISDE.ru/Software/Jupyter/venv/lib/python3.9/site-packages (from matplotlib) (2.8.2)\n", + "Requirement already satisfied: six>=1.5 in /home/ovchinnikov_ii@RISDE.ru/Software/Jupyter/venv/lib/python3.9/site-packages (from python-dateutil>=2.7->matplotlib) (1.16.0)\n", + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], + "source": [ + "pip install matplotlib" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "262ae5d6", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Looking in indexes: https://nexus.c68.spacecorp.ru/repository/pypi_group/simple/\n", + "Collecting gym==0.10.5\n", + " Using cached gym-0.10.5-py3-none-any.whl\n", + "Requirement already satisfied: six in /home/ovchinnikov_ii@RISDE.ru/Software/Jupyter/venv/lib/python3.9/site-packages (from gym==0.10.5) (1.16.0)\n", + "Collecting pyglet>=1.2.0\n", + " Using cached https://nexus.c68.spacecorp.ru/repository/pypi_group/packages/pyglet/2.0.3/pyglet-2.0.3-py3-none-any.whl (968 kB)\n", + "Requirement already satisfied: numpy>=1.10.4 in /home/ovchinnikov_ii@RISDE.ru/Software/Jupyter/venv/lib/python3.9/site-packages (from gym==0.10.5) (1.24.1)\n", + "Requirement already satisfied: requests>=2.0 in /home/ovchinnikov_ii@RISDE.ru/Software/Jupyter/venv/lib/python3.9/site-packages (from gym==0.10.5) (2.28.1)\n", + "Requirement already satisfied: idna<4,>=2.5 in /home/ovchinnikov_ii@RISDE.ru/Software/Jupyter/venv/lib/python3.9/site-packages (from requests>=2.0->gym==0.10.5) (3.4)\n", + "Requirement already satisfied: charset-normalizer<3,>=2 in /home/ovchinnikov_ii@RISDE.ru/Software/Jupyter/venv/lib/python3.9/site-packages (from requests>=2.0->gym==0.10.5) (2.1.1)\n", + "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /home/ovchinnikov_ii@RISDE.ru/Software/Jupyter/venv/lib/python3.9/site-packages (from requests>=2.0->gym==0.10.5) (1.26.14)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /home/ovchinnikov_ii@RISDE.ru/Software/Jupyter/venv/lib/python3.9/site-packages (from requests>=2.0->gym==0.10.5) (2022.12.7)\n", + "Installing collected packages: pyglet, gym\n", + " Attempting uninstall: gym\n", + " Found existing installation: gym 0.22.0\n", + " Uninstalling gym-0.22.0:\n", + " Successfully uninstalled gym-0.22.0\n", + "Successfully installed gym-0.10.5 pyglet-2.0.3\n", + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], + "source": [ + "pip install gym==0.10.5" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "9c651810", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Looking in indexes: https://nexus.c68.spacecorp.ru/repository/pypi_group/simple/\n", + "Collecting pyglet==1.5.27\n", + " Using cached https://nexus.c68.spacecorp.ru/repository/pypi_group/packages/pyglet/1.5.27/pyglet-1.5.27-py3-none-any.whl (1.1 MB)\n", + "Installing collected packages: pyglet\n", + " Attempting uninstall: pyglet\n", + " Found existing installation: pyglet 2.0.3\n", + " Uninstalling pyglet-2.0.3:\n", + " Successfully uninstalled pyglet-2.0.3\n", + "Successfully installed pyglet-1.5.27\n", + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], + "source": [ + "pip install pyglet==1.5.27" + ] + }, + { + "cell_type": "code", 
+ "execution_count": 2, + "id": "cb877007", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Пытаемся загрузить данные!\n", + "Пытаемся загрузить данные!\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 0%| | 203/2000000 [00:00<32:10, 1036.07it/s]/home/ovchinnikov_ii@RISDE.ru/Software/Jupyter/MADDPG/maddpg/maddpg.py:60: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n", + " transitions[key] = torch.tensor(transitions[key], dtype=torch.float32)\n", + " 0%| | 307/2000000 [00:01<2:18:56, 239.88it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Пытаемся сохранить данные по пути = ./model/simple_adversary/agent_0/1_actor_params.pkl\n", + "Пытаемся сохранить данные по пути = ./model/simple_adversary/agent_1/1_actor_params.pkl\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 0%| | 459/2000000 [00:03<6:02:10, 92.02it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Пытаемся сохранить данные по пути = ./model/simple_adversary/agent_0/2_actor_params.pkl\n", + "Пытаемся сохранить данные по пути = ./model/simple_adversary/agent_1/2_actor_params.pkl\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 0%| | 566/2000000 [00:04<7:30:23, 73.99it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Пытаемся сохранить данные по пути = ./model/simple_adversary/agent_0/3_actor_params.pkl\n", + "Пытаемся сохранить данные по пути = ./model/simple_adversary/agent_1/3_actor_params.pkl\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 0%| | 667/2000000 [00:06<8:44:54, 63.48it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Пытаемся сохранить данные по пути = ./model/simple_adversary/agent_0/4_actor_params.pkl\n", + "Пытаемся сохранить данные по пути = ./model/simple_adversary/agent_1/4_actor_params.pkl\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 0%| | 717/2000000 [00:07<5:36:33, 99.01it/s]\n" + ] + }, + { + "ename": "KeyboardInterrupt", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", + "File \u001b[0;32m~/Software/Jupyter/MADDPG/main.py:18\u001b[0m\n\u001b[1;32m 16\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mAverage returns is\u001b[39m\u001b[38;5;124m'\u001b[39m, returns)\n\u001b[1;32m 17\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m---> 18\u001b[0m \u001b[43mrunner\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/Software/Jupyter/MADDPG/runner.py:52\u001b[0m, in \u001b[0;36mRunner.run\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 50\u001b[0m other_agents \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39magents\u001b[38;5;241m.\u001b[39mcopy()\n\u001b[1;32m 51\u001b[0m other_agents\u001b[38;5;241m.\u001b[39mremove(agent)\n\u001b[0;32m---> 52\u001b[0m 
\u001b[43magent\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mlearn\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtransitions\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mother_agents\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 53\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m time_step \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m0\u001b[39m \u001b[38;5;129;01mand\u001b[39;00m time_step \u001b[38;5;241m%\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39margs\u001b[38;5;241m.\u001b[39mevaluate_rate \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m0\u001b[39m:\n\u001b[1;32m 54\u001b[0m returns\u001b[38;5;241m.\u001b[39mappend(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mevaluate())\n", + "File \u001b[0;32m~/Software/Jupyter/MADDPG/agent.py:27\u001b[0m, in \u001b[0;36mAgent.learn\u001b[0;34m(self, transitions, other_agents)\u001b[0m\n\u001b[1;32m 26\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mlearn\u001b[39m(\u001b[38;5;28mself\u001b[39m, transitions, other_agents):\n\u001b[0;32m---> 27\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpolicy\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtrain\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtransitions\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mother_agents\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/Software/Jupyter/MADDPG/maddpg/maddpg.py:95\u001b[0m, in \u001b[0;36mMADDPG.train\u001b[0;34m(self, transitions, other_agents)\u001b[0m\n\u001b[1;32m 93\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mactor_optim\u001b[38;5;241m.\u001b[39mzero_grad()\n\u001b[1;32m 94\u001b[0m actor_loss\u001b[38;5;241m.\u001b[39mbackward()\n\u001b[0;32m---> 95\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mactor_optim\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mstep\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 96\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcritic_optim\u001b[38;5;241m.\u001b[39mzero_grad()\n\u001b[1;32m 97\u001b[0m critic_loss\u001b[38;5;241m.\u001b[39mbackward()\n", + "File \u001b[0;32m~/Software/Jupyter/venv/lib/python3.9/site-packages/torch/autograd/grad_mode.py:26\u001b[0m, in \u001b[0;36m_DecoratorContextManager.__call__..decorate_context\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 23\u001b[0m \u001b[38;5;129m@functools\u001b[39m\u001b[38;5;241m.\u001b[39mwraps(func)\n\u001b[1;32m 24\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mdecorate_context\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m 25\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__class__\u001b[39m():\n\u001b[0;32m---> 26\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/Software/Jupyter/venv/lib/python3.9/site-packages/torch/optim/adamw.py:116\u001b[0m, in \u001b[0;36mAdamW.step\u001b[0;34m(self, closure)\u001b[0m\n\u001b[1;32m 112\u001b[0m denom \u001b[38;5;241m=\u001b[39m (exp_avg_sq\u001b[38;5;241m.\u001b[39msqrt() \u001b[38;5;241m/\u001b[39m 
math\u001b[38;5;241m.\u001b[39msqrt(bias_correction2))\u001b[38;5;241m.\u001b[39madd_(group[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124meps\u001b[39m\u001b[38;5;124m'\u001b[39m])\n\u001b[1;32m 114\u001b[0m step_size \u001b[38;5;241m=\u001b[39m group[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mlr\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m/\u001b[39m bias_correction1\n\u001b[0;32m--> 116\u001b[0m p\u001b[38;5;241m.\u001b[39maddcdiv_(exp_avg, denom, value\u001b[38;5;241m=\u001b[39m\u001b[38;5;241;43m-\u001b[39;49m\u001b[43mstep_size\u001b[49m)\n\u001b[1;32m 118\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m loss\n", + "\u001b[0;31mKeyboardInterrupt\u001b[0m: " + ] + } + ], + "source": [ + "%run ./main.py --scenario-name=simple_adversary --evaluate-episodes=10000 --save-rate=100" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d71a2b22", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d079aff2", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "venv", + "language": "python", + "name": "venv" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.2" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/main.py b/main.py new file mode 100644 index 0000000..810aa59 --- /dev/null +++ b/main.py @@ -0,0 +1,18 @@ +from runner import Runner +from common.arguments import get_args +from common.utils import make_env +import numpy as np +import random +import torch + + +if __name__ == '__main__': + # get the params + args = get_args() + env, args = make_env(args) + runner = Runner(args, env) + if args.evaluate: + returns = runner.evaluate() + print('Average returns is', returns) + else: + runner.run() diff --git a/model/simple_adversary/agent_0/actor_params.pkl b/model/simple_adversary/agent_0/actor_params.pkl new file mode 100644 index 0000000..1c1ca2d Binary files /dev/null and b/model/simple_adversary/agent_0/actor_params.pkl differ diff --git a/model/simple_adversary/agent_0/critic_params.pkl b/model/simple_adversary/agent_0/critic_params.pkl new file mode 100644 index 0000000..8dbb118 Binary files /dev/null and b/model/simple_adversary/agent_0/critic_params.pkl differ diff --git a/model/simple_adversary/agent_1/actor_params.pkl b/model/simple_adversary/agent_1/actor_params.pkl new file mode 100644 index 0000000..b3c9823 Binary files /dev/null and b/model/simple_adversary/agent_1/actor_params.pkl differ diff --git a/model/simple_adversary/agent_1/critic_params.pkl b/model/simple_adversary/agent_1/critic_params.pkl new file mode 100644 index 0000000..6dbfcc3 Binary files /dev/null and b/model/simple_adversary/agent_1/critic_params.pkl differ diff --git a/model/simple_adversary/returns.pkl.npy b/model/simple_adversary/returns.pkl.npy new file mode 100644 index 0000000..1b7ce3f Binary files /dev/null and b/model/simple_adversary/returns.pkl.npy differ diff --git a/multiagent/__init__.py b/multiagent/__init__.py new file mode 100644 index 0000000..90b6007 --- /dev/null +++ b/multiagent/__init__.py @@ -0,0 +1,18 @@ +from gym.envs.registration import register + +# Multiagent envs +# ---------------------------------------- + +register( + id='MultiagentSimple-v0', + entry_point='multiagent.envs:SimpleEnv', + # FIXME(cathywu) currently has to 
be exactly max_path_length parameters in + # rllab run script + max_episode_steps=100, +) + +register( + id='MultiagentSimpleSpeakerListener-v0', + entry_point='multiagent.envs:SimpleSpeakerListenerEnv', + max_episode_steps=100, +) diff --git a/multiagent/core.py b/multiagent/core.py new file mode 100644 index 0000000..c0662ca --- /dev/null +++ b/multiagent/core.py @@ -0,0 +1,196 @@ +import numpy as np + +# physical/external base state of all entites +class EntityState(object): + def __init__(self): + # physical position + self.p_pos = None + # physical velocity + self.p_vel = None + +# state of agents (including communication and internal/mental state) +class AgentState(EntityState): + def __init__(self): + super(AgentState, self).__init__() + # communication utterance + self.c = None + +# action of the agent +class Action(object): + def __init__(self): + # physical action + self.u = None + # communication action + self.c = None + +# properties and state of physical world entity +class Entity(object): + def __init__(self): + # name + self.name = '' + # properties: + self.size = 0.050 + # entity can move / be pushed + self.movable = False + # entity collides with others + self.collide = True + # material density (affects mass) + self.density = 25.0 + # color + self.color = None + # max speed and accel + self.max_speed = None + self.accel = None + # state + self.state = EntityState() + # mass + self.initial_mass = 1.0 + + @property + def mass(self): + return self.initial_mass + +# properties of landmark entities +class Landmark(Entity): + def __init__(self): + super(Landmark, self).__init__() + +# properties of agent entities +class Agent(Entity): + def __init__(self): + super(Agent, self).__init__() + # agents are movable by default + self.movable = True + # cannot send communication signals + self.silent = False + # cannot observe the world + self.blind = False + # physical motor noise amount + self.u_noise = None + # communication noise amount + self.c_noise = None + # control range + self.u_range = 1.0 + # state + self.state = AgentState() + # action + self.action = Action() + # script behavior to execute + self.action_callback = None + +# multi-agent world +class World(object): + def __init__(self): + # list of agents and entities (can change at execution-time!) 
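The classes above are plain state containers: an `Agent` owns an `AgentState` (position `p_pos`, velocity `p_vel`, utterance `c`) and an `Action` (`u` for movement, `c` for communication), and the `World` being defined here advances them with damped Euler integration in `step()`. A small usage sketch under those definitions; a scenario normally does this wiring inside `make_world`/`reset_world`:

```python
import numpy as np
from multiagent.core import Agent, World

world = World()
agent = Agent()
agent.name = 'agent 0'
agent.silent = True                          # dim_c defaults to 0, so no communication state
agent.state.p_pos = np.zeros(world.dim_p)    # start at the origin
agent.state.p_vel = np.zeros(world.dim_p)
agent.action.u = np.array([1.0, 0.0])        # constant push along +x
world.agents = [agent]

world.step()                                 # apply forces, damping and one dt = 0.1 step
print(agent.state.p_pos)                     # position after one step: roughly [0.01, 0.0]
```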
+ self.agents = [] + self.landmarks = [] + # communication channel dimensionality + self.dim_c = 0 + # position dimensionality + self.dim_p = 2 + # color dimensionality + self.dim_color = 3 + # simulation timestep + self.dt = 0.1 + # physical damping + self.damping = 0.25 + # contact response parameters + self.contact_force = 1e+2 + self.contact_margin = 1e-3 + + # return all entities in the world + @property + def entities(self): + return self.agents + self.landmarks + + # return all agents controllable by external policies + @property + def policy_agents(self): + return [agent for agent in self.agents if agent.action_callback is None] + + # return all agents controlled by world scripts + @property + def scripted_agents(self): + return [agent for agent in self.agents if agent.action_callback is not None] + + # update state of the world + def step(self): + # set actions for scripted agents + for agent in self.scripted_agents: + agent.action = agent.action_callback(agent, self) + # gather forces applied to entities + p_force = [None] * len(self.entities) + # apply agent physical controls + p_force = self.apply_action_force(p_force) + # apply environment forces + p_force = self.apply_environment_force(p_force) + # integrate physical state + self.integrate_state(p_force) + # update agent state + for agent in self.agents: + self.update_agent_state(agent) + + # gather agent action forces + def apply_action_force(self, p_force): + # set applied forces + for i,agent in enumerate(self.agents): + if agent.movable: + noise = np.random.randn(*agent.action.u.shape) * agent.u_noise if agent.u_noise else 0.0 + p_force[i] = agent.action.u + noise + return p_force + + # gather physical forces acting on entities + def apply_environment_force(self, p_force): + # simple (but inefficient) collision response + for a,entity_a in enumerate(self.entities): + for b,entity_b in enumerate(self.entities): + if(b <= a): continue + [f_a, f_b] = self.get_collision_force(entity_a, entity_b) + if(f_a is not None): + if(p_force[a] is None): p_force[a] = 0.0 + p_force[a] = f_a + p_force[a] + if(f_b is not None): + if(p_force[b] is None): p_force[b] = 0.0 + p_force[b] = f_b + p_force[b] + return p_force + + # integrate physical state + def integrate_state(self, p_force): + for i,entity in enumerate(self.entities): + if not entity.movable: continue + entity.state.p_vel = entity.state.p_vel * (1 - self.damping) + if (p_force[i] is not None): + entity.state.p_vel += (p_force[i] / entity.mass) * self.dt + if entity.max_speed is not None: + speed = np.sqrt(np.square(entity.state.p_vel[0]) + np.square(entity.state.p_vel[1])) + if speed > entity.max_speed: + entity.state.p_vel = entity.state.p_vel / np.sqrt(np.square(entity.state.p_vel[0]) + + np.square(entity.state.p_vel[1])) * entity.max_speed + entity.state.p_pos += entity.state.p_vel * self.dt + + def update_agent_state(self, agent): + # set communication state (directly for now) + if agent.silent: + agent.state.c = np.zeros(self.dim_c) + else: + noise = np.random.randn(*agent.action.c.shape) * agent.c_noise if agent.c_noise else 0.0 + agent.state.c = agent.action.c + noise + + # get collision forces for any contact between two entities + def get_collision_force(self, entity_a, entity_b): + if (not entity_a.collide) or (not entity_b.collide): + return [None, None] # not a collider + if (entity_a is entity_b): + return [None, None] # don't collide against itself + # compute actual distance between entities + delta_pos = entity_a.state.p_pos - entity_b.state.p_pos + dist = 
np.sqrt(np.sum(np.square(delta_pos))) + # minimum allowable distance + dist_min = entity_a.size + entity_b.size + # softmax penetration + k = self.contact_margin + penetration = np.logaddexp(0, -(dist - dist_min)/k)*k + force = self.contact_force * delta_pos / dist * penetration + force_a = +force if entity_a.movable else None + force_b = -force if entity_b.movable else None + return [force_a, force_b] \ No newline at end of file diff --git a/multiagent/environment.py b/multiagent/environment.py new file mode 100644 index 0000000..44e487d --- /dev/null +++ b/multiagent/environment.py @@ -0,0 +1,336 @@ +import gym +from gym import spaces +from gym.envs.registration import EnvSpec +import numpy as np +from multiagent.multi_discrete import MultiDiscrete + +# environment for all agents in the multiagent world +# currently code assumes that no agents will be created/destroyed at runtime! +class MultiAgentEnv(gym.Env): + metadata = { + 'render.modes' : ['human', 'rgb_array'] + } + + def __init__(self, world, reset_callback=None, reward_callback=None, + observation_callback=None, info_callback=None, + done_callback=None, shared_viewer=True): + + self.world = world + self.agents = self.world.policy_agents + # set required vectorized gym env property + self.n = len(world.policy_agents) + # scenario callbacks + self.reset_callback = reset_callback + self.reward_callback = reward_callback + self.observation_callback = observation_callback + self.info_callback = info_callback + self.done_callback = done_callback + # environment parameters + self.discrete_action_space = True + # if true, action is a number 0...N, otherwise action is a one-hot N-dimensional vector + self.discrete_action_input = False + # if true, even the action is continuous, action will be performed discretely + self.force_discrete_action = world.discrete_action if hasattr(world, 'discrete_action') else False + # if true, every agent has the same reward + self.shared_reward = world.collaborative if hasattr(world, 'collaborative') else False + self.time = 0 + + # configure spaces + self.action_space = [] + self.observation_space = [] + for agent in self.agents: + total_action_space = [] + # physical action space + if self.discrete_action_space: + u_action_space = spaces.Discrete(world.dim_p * 2 + 1) + else: + u_action_space = spaces.Box(low=-agent.u_range, high=+agent.u_range, shape=(world.dim_p,), dtype=np.float32) + if agent.movable: + total_action_space.append(u_action_space) + # communication action space + if self.discrete_action_space: + c_action_space = spaces.Discrete(world.dim_c) + else: + c_action_space = spaces.Box(low=0.0, high=1.0, shape=(world.dim_c,), dtype=np.float32) + if not agent.silent: + total_action_space.append(c_action_space) + # total action space + if len(total_action_space) > 1: + # all action spaces are discrete, so simplify to MultiDiscrete action space + if all([isinstance(act_space, spaces.Discrete) for act_space in total_action_space]): + act_space = MultiDiscrete([[0, act_space.n - 1] for act_space in total_action_space]) + #print(act_space.n) + else: + act_space = spaces.Tuple(total_action_space) + self.action_space.append(act_space) + else: + self.action_space.append(total_action_space[0]) + # observation space + obs_dim = len(observation_callback(agent, self.world)) + self.observation_space.append(spaces.Box(low=-np.inf, high=+np.inf, shape=(obs_dim,), dtype=np.float32)) + agent.action.c = np.zeros(self.world.dim_c) + + # rendering + self.shared_viewer = shared_viewer + if self.shared_viewer: + 
self.viewers = [None] + else: + self.viewers = [None] * self.n + self._reset_render() + + def step(self, action_n): + obs_n = [] + reward_n = [] + done_n = [] + info_n = {'n': []} + self.agents = self.world.policy_agents + # set action for each agent + for i, agent in enumerate(self.agents): + self._set_action(action_n[i], agent, self.action_space[i]) + # advance world state + self.world.step() + # record observation for each agent + for agent in self.agents: + obs_n.append(self._get_obs(agent)) + reward_n.append(self._get_reward(agent)) + done_n.append(self._get_done(agent)) + + info_n['n'].append(self._get_info(agent)) + + # all agents get total reward in cooperative case + reward = np.sum(reward_n) + if self.shared_reward: + reward_n = [reward] * self.n + + return obs_n, reward_n, done_n, info_n + + def reset(self): + # reset world + self.reset_callback(self.world) + # reset renderer + self._reset_render() + # record observations for each agent + obs_n = [] + self.agents = self.world.policy_agents + for agent in self.agents: + obs_n.append(self._get_obs(agent)) + return obs_n + + # get info used for benchmarking + def _get_info(self, agent): + if self.info_callback is None: + return {} + return self.info_callback(agent, self.world) + + # get observation for a particular agent + def _get_obs(self, agent): + if self.observation_callback is None: + return np.zeros(0) + return self.observation_callback(agent, self.world) + + # get dones for a particular agent + # unused right now -- agents are allowed to go beyond the viewing screen + def _get_done(self, agent): + if self.done_callback is None: + return False + return self.done_callback(agent, self.world) + + # get reward for a particular agent + def _get_reward(self, agent): + if self.reward_callback is None: + return 0.0 + return self.reward_callback(agent, self.world) + + # set env action for a particular agent + def _set_action(self, action, agent, action_space, time=None): + agent.action.u = np.zeros(self.world.dim_p) + agent.action.c = np.zeros(self.world.dim_c) + # process action + if isinstance(action_space, MultiDiscrete): + act = [] + size = action_space.high - action_space.low + 1 + index = 0 + for s in size: + act.append(action[index:(index+s)]) + index += s + action = act + else: + action = [action] + + if agent.movable: + # physical action + if self.discrete_action_input: + agent.action.u = np.zeros(self.world.dim_p) + # process discrete action + if action[0] == 1: agent.action.u[0] = -1.0 + if action[0] == 2: agent.action.u[0] = +1.0 + if action[0] == 3: agent.action.u[1] = -1.0 + if action[0] == 4: agent.action.u[1] = +1.0 + else: + if self.force_discrete_action: + d = np.argmax(action[0]) + action[0][:] = 0.0 + action[0][d] = 1.0 + if self.discrete_action_space: + agent.action.u[0] += action[0][1] - action[0][2] + agent.action.u[1] += action[0][3] - action[0][4] + else: + agent.action.u = action[0] + sensitivity = 5.0 + if agent.accel is not None: + sensitivity = agent.accel + agent.action.u *= sensitivity + action = action[1:] + if not agent.silent: + # communication action + if self.discrete_action_input: + agent.action.c = np.zeros(self.world.dim_c) + agent.action.c[action[0]] = 1.0 + else: + agent.action.c = action[0] + action = action[1:] + # make sure we used all elements of action + assert len(action) == 0 + + # reset rendering assets + def _reset_render(self): + self.render_geoms = None + self.render_geoms_xform = None + + # render environment + def render(self, mode='human'): + if mode == 'human': + alphabet = 
'ABCDEFGHIJKLMNOPQRSTUVWXYZ' + message = '' + for agent in self.world.agents: + comm = [] + for other in self.world.agents: + if other is agent: continue + if np.all(other.state.c == 0): + word = '_' + else: + word = alphabet[np.argmax(other.state.c)] + message += (other.name + ' to ' + agent.name + ': ' + word + ' ') + print(message) + + for i in range(len(self.viewers)): + # create viewers (if necessary) + if self.viewers[i] is None: + # import rendering only if we need it (and don't import for headless machines) + #from gym.envs.classic_control import rendering + from multiagent import rendering + self.viewers[i] = rendering.Viewer(700,700) + + # create rendering geometry + if self.render_geoms is None: + # import rendering only if we need it (and don't import for headless machines) + #from gym.envs.classic_control import rendering + from multiagent import rendering + self.render_geoms = [] + self.render_geoms_xform = [] + for entity in self.world.entities: + geom = rendering.make_circle(entity.size) + xform = rendering.Transform() + if 'agent' in entity.name: + geom.set_color(*entity.color, alpha=0.5) + else: + geom.set_color(*entity.color) + geom.add_attr(xform) + self.render_geoms.append(geom) + self.render_geoms_xform.append(xform) + + # add geoms to viewer + for viewer in self.viewers: + viewer.geoms = [] + for geom in self.render_geoms: + viewer.add_geom(geom) + + results = [] + for i in range(len(self.viewers)): + from multiagent import rendering + # update bounds to center around agent + cam_range = 1 + if self.shared_viewer: + pos = np.zeros(self.world.dim_p) + else: + pos = self.agents[i].state.p_pos + self.viewers[i].set_bounds(pos[0]-cam_range,pos[0]+cam_range,pos[1]-cam_range,pos[1]+cam_range) + # update geometry positions + for e, entity in enumerate(self.world.entities): + self.render_geoms_xform[e].set_translation(*entity.state.p_pos) + # render to display or array + results.append(self.viewers[i].render(return_rgb_array = mode=='rgb_array')) + + return results + + # create receptor field locations in local coordinate frame + def _make_receptor_locations(self, agent): + receptor_type = 'polar' + range_min = 0.05 * 2.0 + range_max = 1.00 + dx = [] + # circular receptive field + if receptor_type == 'polar': + for angle in np.linspace(-np.pi, +np.pi, 8, endpoint=False): + for distance in np.linspace(range_min, range_max, 3): + dx.append(distance * np.array([np.cos(angle), np.sin(angle)])) + # add origin + dx.append(np.array([0.0, 0.0])) + # grid receptive field + if receptor_type == 'grid': + for x in np.linspace(-range_max, +range_max, 5): + for y in np.linspace(-range_max, +range_max, 5): + dx.append(np.array([x,y])) + return dx + + +# vectorized wrapper for a batch of multi-agent environments +# assumes all environments have the same observation and action space +class BatchMultiAgentEnv(gym.Env): + metadata = { + 'runtime.vectorized': True, + 'render.modes' : ['human', 'rgb_array'] + } + + def __init__(self, env_batch): + self.env_batch = env_batch + + @property + def n(self): + return np.sum([env.n for env in self.env_batch]) + + @property + def action_space(self): + return self.env_batch[0].action_space + + @property + def observation_space(self): + return self.env_batch[0].observation_space + + def step(self, action_n, time): + obs_n = [] + reward_n = [] + done_n = [] + info_n = {'n': []} + i = 0 + for env in self.env_batch: + obs, reward, done, _ = env.step(action_n[i:(i+env.n)], time) + i += env.n + obs_n += obs + # reward = [r / len(self.env_batch) for r in 
reward] + reward_n += reward + done_n += done + return obs_n, reward_n, done_n, info_n + + def reset(self): + obs_n = [] + for env in self.env_batch: + obs_n += env.reset() + return obs_n + + # render environment + def render(self, mode='human', close=True): + results_n = [] + for env in self.env_batch: + results_n += env.render(mode, close) + return results_n diff --git a/multiagent/multi_discrete.py b/multiagent/multi_discrete.py new file mode 100644 index 0000000..d7108ad --- /dev/null +++ b/multiagent/multi_discrete.py @@ -0,0 +1,44 @@ +# An old version of OpenAI Gym's multi_discrete.py. (Was getting affected by Gym updates) +# (https://github.com/openai/gym/blob/1fb81d4e3fb780ccf77fec731287ba07da35eb84/gym/spaces/multi_discrete.py) + +import numpy as np + +import gym +from gym.spaces import prng + +class MultiDiscrete(gym.Space): + """ + - The multi-discrete action space consists of a series of discrete action spaces with different parameters + - It can be adapted to both a Discrete action space or a continuous (Box) action space + - It is useful to represent game controllers or keyboards where each key can be represented as a discrete action space + - It is parametrized by passing an array of arrays containing [min, max] for each discrete action space + where the discrete action space can take any integers from `min` to `max` (both inclusive) + Note: A value of 0 always need to represent the NOOP action. + e.g. Nintendo Game Controller + - Can be conceptualized as 3 discrete action spaces: + 1) Arrow Keys: Discrete 5 - NOOP[0], UP[1], RIGHT[2], DOWN[3], LEFT[4] - params: min: 0, max: 4 + 2) Button A: Discrete 2 - NOOP[0], Pressed[1] - params: min: 0, max: 1 + 3) Button B: Discrete 2 - NOOP[0], Pressed[1] - params: min: 0, max: 1 + - Can be initialized as + MultiDiscrete([ [0,4], [0,1], [0,1] ]) + """ + def __init__(self, array_of_param_array): + self.low = np.array([x[0] for x in array_of_param_array]) + self.high = np.array([x[1] for x in array_of_param_array]) + self.num_discrete_space = self.low.shape[0] + + def sample(self): + """ Returns a array with one sample from each discrete action space """ + # For each row: round(random .* (max - min) + min, 0) + random_array = prng.np_random.rand(self.num_discrete_space) + return [int(x) for x in np.floor(np.multiply((self.high - self.low + 1.), random_array) + self.low)] + def contains(self, x): + return len(x) == self.num_discrete_space and (np.array(x) >= self.low).all() and (np.array(x) <= self.high).all() + + @property + def shape(self): + return self.num_discrete_space + def __repr__(self): + return "MultiDiscrete" + str(self.num_discrete_space) + def __eq__(self, other): + return np.array_equal(self.low, other.low) and np.array_equal(self.high, other.high) \ No newline at end of file diff --git a/multiagent/policy.py b/multiagent/policy.py new file mode 100644 index 0000000..cf9ad0e --- /dev/null +++ b/multiagent/policy.py @@ -0,0 +1,52 @@ +import numpy as np +from pyglet.window import key + +# individual agent policy +class Policy(object): + def __init__(self): + pass + def action(self, obs): + raise NotImplementedError() + +# interactive policy based on keyboard input +# hard-coded to deal only with movement, not communication +class InteractivePolicy(Policy): + def __init__(self, env, agent_index): + super(InteractivePolicy, self).__init__() + self.env = env + # hard-coded keyboard events + self.move = [False for i in range(4)] + self.comm = [False for i in range(env.world.dim_c)] + # register keyboard events with this 
environment's window + env.viewers[agent_index].window.on_key_press = self.key_press + env.viewers[agent_index].window.on_key_release = self.key_release + + def action(self, obs): + # ignore observation and just act based on keyboard events + if self.env.discrete_action_input: + u = 0 + if self.move[0]: u = 1 + if self.move[1]: u = 2 + if self.move[2]: u = 4 + if self.move[3]: u = 3 + else: + u = np.zeros(5) # 5-d because of no-move action + if self.move[0]: u[1] += 1.0 + if self.move[1]: u[2] += 1.0 + if self.move[3]: u[3] += 1.0 + if self.move[2]: u[4] += 1.0 + if True not in self.move: + u[0] += 1.0 + return np.concatenate([u, np.zeros(self.env.world.dim_c)]) + + # keyboard event callbacks + def key_press(self, k, mod): + if k==key.LEFT: self.move[0] = True + if k==key.RIGHT: self.move[1] = True + if k==key.UP: self.move[2] = True + if k==key.DOWN: self.move[3] = True + def key_release(self, k, mod): + if k==key.LEFT: self.move[0] = False + if k==key.RIGHT: self.move[1] = False + if k==key.UP: self.move[2] = False + if k==key.DOWN: self.move[3] = False diff --git a/multiagent/rendering.py b/multiagent/rendering.py new file mode 100644 index 0000000..cd00c7f --- /dev/null +++ b/multiagent/rendering.py @@ -0,0 +1,345 @@ +""" +2D rendering framework +""" +from __future__ import division +import os +import six +import sys + +if "Apple" in sys.version: + if 'DYLD_FALLBACK_LIBRARY_PATH' in os.environ: + os.environ['DYLD_FALLBACK_LIBRARY_PATH'] += ':/usr/lib' + # (JDS 2016/04/15): avoid bug on Anaconda 2.3.0 / Yosemite + +from gym.utils import reraise +from gym import error + +try: + import pyglet +except ImportError as e: + reraise(suffix="HINT: you can install pyglet directly via 'pip install pyglet'. But if you really just want to install all Gym dependencies and not have to think about it, 'pip install -e .[all]' or 'pip install gym[all]' will do it.") + +try: + from pyglet.gl import * +except ImportError as e: + reraise(prefix="Error occured while running `from pyglet.gl import *`",suffix="HINT: make sure you have OpenGL install. On Ubuntu, you can run 'apt-get install python-opengl'. If you're running on a server, you may need a virtual frame buffer; something like this should work: 'xvfb-run -s \"-screen 0 1400x900x24\" python '") + +import math +import numpy as np + +RAD2DEG = 57.29577951308232 + +def get_display(spec): + """Convert a display specification (such as :0) into an actual Display + object. + + Pyglet only supports multiple Displays on Linux. + """ + if spec is None: + return None + elif isinstance(spec, six.string_types): + return pyglet.canvas.Display(spec) + else: + raise error.Error('Invalid display specification: {}. 
(Must be a string like :0 or None.)'.format(spec)) + +class Viewer(object): + def __init__(self, width, height, display=None): + display = get_display(display) + + self.width = width + self.height = height + + self.window = pyglet.window.Window(width=width, height=height, display=display) + self.window.on_close = self.window_closed_by_user + self.geoms = [] + self.onetime_geoms = [] + self.transform = Transform() + + glEnable(GL_BLEND) + # glEnable(GL_MULTISAMPLE) + glEnable(GL_LINE_SMOOTH) + # glHint(GL_LINE_SMOOTH_HINT, GL_DONT_CARE) + glHint(GL_LINE_SMOOTH_HINT, GL_NICEST) + glLineWidth(2.0) + glBlendFunc(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA) + + def close(self): + self.window.close() + + def window_closed_by_user(self): + self.close() + + def set_bounds(self, left, right, bottom, top): + assert right > left and top > bottom + scalex = self.width/(right-left) + scaley = self.height/(top-bottom) + self.transform = Transform( + translation=(-left*scalex, -bottom*scaley), + scale=(scalex, scaley)) + + def add_geom(self, geom): + self.geoms.append(geom) + + def add_onetime(self, geom): + self.onetime_geoms.append(geom) + + def render(self, return_rgb_array=False): + glClearColor(1,1,1,1) + self.window.clear() + self.window.switch_to() + self.window.dispatch_events() + self.transform.enable() + for geom in self.geoms: + geom.render() + for geom in self.onetime_geoms: + geom.render() + self.transform.disable() + arr = None + if return_rgb_array: + buffer = pyglet.image.get_buffer_manager().get_color_buffer() + image_data = buffer.get_image_data() + arr = np.fromstring(image_data.data, dtype=np.uint8, sep='') + # In https://github.com/openai/gym-http-api/issues/2, we + # discovered that someone using Xmonad on Arch was having + # a window of size 598 x 398, though a 600 x 400 window + # was requested. (Guess Xmonad was preserving a pixel for + # the boundary.) So we use the buffer height/width rather + # than the requested one. 
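When `return_rgb_array` is true, the viewer reads back the pyglet color buffer and, just below, reshapes it to `(buffer.height, buffer.width, 4)` before flipping it and stripping the alpha channel; this is what `MultiAgentEnv.render(mode='rgb_array')` ultimately returns, one array per viewer. A small sketch of grabbing a frame this way, assuming pyglet and a display are available:

```python
from common.arguments import get_args
from common.utils import make_env

args = get_args()                      # e.g. run with --scenario-name=simple_adversary
env, args = make_env(args)
env.reset()
frames = env.render(mode='rgb_array')  # list with one RGB uint8 array per viewer
print(frames[0].shape)                 # roughly (700, 700, 3) for the 700x700 window
```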
+ arr = arr.reshape(buffer.height, buffer.width, 4) + arr = arr[::-1,:,0:3] + self.window.flip() + self.onetime_geoms = [] + return arr + + # Convenience + def draw_circle(self, radius=10, res=30, filled=True, **attrs): + geom = make_circle(radius=radius, res=res, filled=filled) + _add_attrs(geom, attrs) + self.add_onetime(geom) + return geom + + def draw_polygon(self, v, filled=True, **attrs): + geom = make_polygon(v=v, filled=filled) + _add_attrs(geom, attrs) + self.add_onetime(geom) + return geom + + def draw_polyline(self, v, **attrs): + geom = make_polyline(v=v) + _add_attrs(geom, attrs) + self.add_onetime(geom) + return geom + + def draw_line(self, start, end, **attrs): + geom = Line(start, end) + _add_attrs(geom, attrs) + self.add_onetime(geom) + return geom + + def get_array(self): + self.window.flip() + image_data = pyglet.image.get_buffer_manager().get_color_buffer().get_image_data() + self.window.flip() + arr = np.fromstring(image_data.data, dtype=np.uint8, sep='') + arr = arr.reshape(self.height, self.width, 4) + return arr[::-1,:,0:3] + +def _add_attrs(geom, attrs): + if "color" in attrs: + geom.set_color(*attrs["color"]) + if "linewidth" in attrs: + geom.set_linewidth(attrs["linewidth"]) + +class Geom(object): + def __init__(self): + self._color=Color((0, 0, 0, 1.0)) + self.attrs = [self._color] + def render(self): + for attr in reversed(self.attrs): + attr.enable() + self.render1() + for attr in self.attrs: + attr.disable() + def render1(self): + raise NotImplementedError + def add_attr(self, attr): + self.attrs.append(attr) + def set_color(self, r, g, b, alpha=1): + self._color.vec4 = (r, g, b, alpha) + +class Attr(object): + def enable(self): + raise NotImplementedError + def disable(self): + pass + +class Transform(Attr): + def __init__(self, translation=(0.0, 0.0), rotation=0.0, scale=(1,1)): + self.set_translation(*translation) + self.set_rotation(rotation) + self.set_scale(*scale) + def enable(self): + glPushMatrix() + glTranslatef(self.translation[0], self.translation[1], 0) # translate to GL loc ppint + glRotatef(RAD2DEG * self.rotation, 0, 0, 1.0) + glScalef(self.scale[0], self.scale[1], 1) + def disable(self): + glPopMatrix() + def set_translation(self, newx, newy): + self.translation = (float(newx), float(newy)) + def set_rotation(self, new): + self.rotation = float(new) + def set_scale(self, newx, newy): + self.scale = (float(newx), float(newy)) + +class Color(Attr): + def __init__(self, vec4): + self.vec4 = vec4 + def enable(self): + glColor4f(*self.vec4) + +class LineStyle(Attr): + def __init__(self, style): + self.style = style + def enable(self): + glEnable(GL_LINE_STIPPLE) + glLineStipple(1, self.style) + def disable(self): + glDisable(GL_LINE_STIPPLE) + +class LineWidth(Attr): + def __init__(self, stroke): + self.stroke = stroke + def enable(self): + glLineWidth(self.stroke) + +class Point(Geom): + def __init__(self): + Geom.__init__(self) + def render1(self): + glBegin(GL_POINTS) # draw point + glVertex3f(0.0, 0.0, 0.0) + glEnd() + +class FilledPolygon(Geom): + def __init__(self, v): + Geom.__init__(self) + self.v = v + def render1(self): + if len(self.v) == 4 : glBegin(GL_QUADS) + elif len(self.v) > 4 : glBegin(GL_POLYGON) + else: glBegin(GL_TRIANGLES) + for p in self.v: + glVertex3f(p[0], p[1],0) # draw each vertex + glEnd() + + color = (self._color.vec4[0] * 0.5, self._color.vec4[1] * 0.5, self._color.vec4[2] * 0.5, self._color.vec4[3] * 0.5) + glColor4f(*color) + glBegin(GL_LINE_LOOP) + for p in self.v: + glVertex3f(p[0], p[1],0) # draw each vertex + 
glEnd() + +def make_circle(radius=10, res=30, filled=True): + points = [] + for i in range(res): + ang = 2*math.pi*i / res + points.append((math.cos(ang)*radius, math.sin(ang)*radius)) + if filled: + return FilledPolygon(points) + else: + return PolyLine(points, True) + +def make_polygon(v, filled=True): + if filled: return FilledPolygon(v) + else: return PolyLine(v, True) + +def make_polyline(v): + return PolyLine(v, False) + +def make_capsule(length, width): + l, r, t, b = 0, length, width/2, -width/2 + box = make_polygon([(l,b), (l,t), (r,t), (r,b)]) + circ0 = make_circle(width/2) + circ1 = make_circle(width/2) + circ1.add_attr(Transform(translation=(length, 0))) + geom = Compound([box, circ0, circ1]) + return geom + +class Compound(Geom): + def __init__(self, gs): + Geom.__init__(self) + self.gs = gs + for g in self.gs: + g.attrs = [a for a in g.attrs if not isinstance(a, Color)] + def render1(self): + for g in self.gs: + g.render() + +class PolyLine(Geom): + def __init__(self, v, close): + Geom.__init__(self) + self.v = v + self.close = close + self.linewidth = LineWidth(1) + self.add_attr(self.linewidth) + def render1(self): + glBegin(GL_LINE_LOOP if self.close else GL_LINE_STRIP) + for p in self.v: + glVertex3f(p[0], p[1],0) # draw each vertex + glEnd() + def set_linewidth(self, x): + self.linewidth.stroke = x + +class Line(Geom): + def __init__(self, start=(0.0, 0.0), end=(0.0, 0.0)): + Geom.__init__(self) + self.start = start + self.end = end + self.linewidth = LineWidth(1) + self.add_attr(self.linewidth) + + def render1(self): + glBegin(GL_LINES) + glVertex2f(*self.start) + glVertex2f(*self.end) + glEnd() + +class Image(Geom): + def __init__(self, fname, width, height): + Geom.__init__(self) + self.width = width + self.height = height + img = pyglet.image.load(fname) + self.img = img + self.flip = False + def render1(self): + self.img.blit(-self.width/2, -self.height/2, width=self.width, height=self.height) + +# ================================================================ + +class SimpleImageViewer(object): + def __init__(self, display=None): + self.window = None + self.isopen = False + self.display = display + def imshow(self, arr): + if self.window is None: + height, width, channels = arr.shape + self.window = pyglet.window.Window(width=width, height=height, display=self.display) + self.width = width + self.height = height + self.isopen = True + assert arr.shape == (self.height, self.width, 3), "You passed in an image with the wrong number shape" + image = pyglet.image.ImageData(self.width, self.height, 'RGB', arr.tobytes(), pitch=self.width * -3) + self.window.clear() + self.window.switch_to() + self.window.dispatch_events() + image.blit(0,0) + self.window.flip() + def close(self): + if self.isopen: + self.window.close() + self.isopen = False + def __del__(self): + self.close() \ No newline at end of file diff --git a/multiagent/scenario.py b/multiagent/scenario.py new file mode 100644 index 0000000..02d8677 --- /dev/null +++ b/multiagent/scenario.py @@ -0,0 +1,10 @@ +import numpy as np + +# defines scenario upon which the world is built +class BaseScenario(object): + # create elements of the world + def make_world(self): + raise NotImplementedError() + # create initial conditions of the world + def reset_world(self, world): + raise NotImplementedError() diff --git a/multiagent/scenarios/__init__.py b/multiagent/scenarios/__init__.py new file mode 100644 index 0000000..9a4b6f7 --- /dev/null +++ b/multiagent/scenarios/__init__.py @@ -0,0 +1,7 @@ +import imp +import os.path 
as osp + + +def load(name): + pathname = osp.join(osp.dirname(__file__), name) + return imp.load_source('', pathname) diff --git a/multiagent/scenarios/simple_adversary.py b/multiagent/scenarios/simple_adversary.py new file mode 100644 index 0000000..d897bb8 --- /dev/null +++ b/multiagent/scenarios/simple_adversary.py @@ -0,0 +1,139 @@ +import numpy as np +from multiagent.core import World, Agent, Landmark +from multiagent.scenario import BaseScenario + + +class Scenario(BaseScenario): + + def make_world(self): + world = World() + # set any world properties first + world.dim_c = 2 + num_agents = 3 + world.num_agents = num_agents + num_adversaries = 1 + num_landmarks = num_agents - 1 + # add agents + world.agents = [Agent() for i in range(num_agents)] + for i, agent in enumerate(world.agents): + agent.name = 'agent %d' % i + agent.collide = False + agent.silent = True + agent.adversary = True if i < num_adversaries else False + agent.size = 0.15 + # add landmarks + world.landmarks = [Landmark() for i in range(num_landmarks)] + for i, landmark in enumerate(world.landmarks): + landmark.name = 'landmark %d' % i + landmark.collide = False + landmark.movable = False + landmark.size = 0.08 + # make initial conditions + self.reset_world(world) + return world + + def reset_world(self, world): + # random properties for agents + world.agents[0].color = np.array([0.85, 0.35, 0.35]) + for i in range(1, world.num_agents): + world.agents[i].color = np.array([0.35, 0.35, 0.85]) + # random properties for landmarks + for i, landmark in enumerate(world.landmarks): + landmark.color = np.array([0.15, 0.15, 0.15]) + # set goal landmark + goal = np.random.choice(world.landmarks) + goal.color = np.array([0.15, 0.65, 0.15]) + for agent in world.agents: + agent.goal_a = goal + # set random initial states + for agent in world.agents: + agent.state.p_pos = np.random.uniform(-1, +1, world.dim_p) + agent.state.p_vel = np.zeros(world.dim_p) + agent.state.c = np.zeros(world.dim_c) + for i, landmark in enumerate(world.landmarks): + landmark.state.p_pos = np.random.uniform(-1, +1, world.dim_p) + landmark.state.p_vel = np.zeros(world.dim_p) + + def benchmark_data(self, agent, world): + # returns data for benchmarking purposes + if agent.adversary: + return np.sum(np.square(agent.state.p_pos - agent.goal_a.state.p_pos)) + else: + dists = [] + for l in world.landmarks: + dists.append(np.sum(np.square(agent.state.p_pos - l.state.p_pos))) + dists.append(np.sum(np.square(agent.state.p_pos - agent.goal_a.state.p_pos))) + return tuple(dists) + + # return all agents that are not adversaries + def good_agents(self, world): + return [agent for agent in world.agents if not agent.adversary] + + # return all adversarial agents + def adversaries(self, world): + return [agent for agent in world.agents if agent.adversary] + + def reward(self, agent, world): + # Agents are rewarded based on minimum agent distance to each landmark + return self.adversary_reward(agent, world) if agent.adversary else self.agent_reward(agent, world) + + def agent_reward(self, agent, world): + # Rewarded based on how close any good agent is to the goal landmark, and how far the adversary is from it + shaped_reward = True + shaped_adv_reward = True + + # Calculate negative reward for adversary + adversary_agents = self.adversaries(world) + if shaped_adv_reward: # distance-based adversary reward + adv_rew = sum([np.sqrt(np.sum(np.square(a.state.p_pos - a.goal_a.state.p_pos))) for a in adversary_agents]) + else: # proximity-based adversary reward (binary) + adv_rew 
= 0 + for a in adversary_agents: + if np.sqrt(np.sum(np.square(a.state.p_pos - a.goal_a.state.p_pos))) < 2 * a.goal_a.size: + adv_rew -= 5 + + # Calculate positive reward for agents + good_agents = self.good_agents(world) + if shaped_reward: # distance-based agent reward + pos_rew = -min( + [np.sqrt(np.sum(np.square(a.state.p_pos - a.goal_a.state.p_pos))) for a in good_agents]) + else: # proximity-based agent reward (binary) + pos_rew = 0 + if min([np.sqrt(np.sum(np.square(a.state.p_pos - a.goal_a.state.p_pos))) for a in good_agents]) \ + < 2 * agent.goal_a.size: + pos_rew += 5 + pos_rew -= min( + [np.sqrt(np.sum(np.square(a.state.p_pos - a.goal_a.state.p_pos))) for a in good_agents]) + return pos_rew + adv_rew + + def adversary_reward(self, agent, world): + # Rewarded based on proximity to the goal landmark + shaped_reward = True + if shaped_reward: # distance-based reward + return -np.sum(np.square(agent.state.p_pos - agent.goal_a.state.p_pos)) + else: # proximity-based reward (binary) + adv_rew = 0 + if np.sqrt(np.sum(np.square(agent.state.p_pos - agent.goal_a.state.p_pos))) < 2 * agent.goal_a.size: + adv_rew += 5 + return adv_rew + + + def observation(self, agent, world): + # get positions of all entities in this agent's reference frame + entity_pos = [] + for entity in world.landmarks: + entity_pos.append(entity.state.p_pos - agent.state.p_pos) + # entity colors + entity_color = [] + for entity in world.landmarks: + entity_color.append(entity.color) + # communication of all other agents + other_pos = [] + for other in world.agents: + if other is agent: continue + other_pos.append(other.state.p_pos - agent.state.p_pos) + + if not agent.adversary: + return np.concatenate([agent.goal_a.state.p_pos - agent.state.p_pos] + entity_pos + other_pos) + else: + return np.concatenate(entity_pos + other_pos) diff --git a/runner.py b/runner.py new file mode 100644 index 0000000..1418a6d --- /dev/null +++ b/runner.py @@ -0,0 +1,86 @@ +from tqdm import tqdm +from agent import Agent +from common.replay_buffer import Buffer +import torch +import os +import numpy as np +import matplotlib.pyplot as plt + +class Runner: + def __init__(self, args, env): + self.args = args + self.noise = args.noise_rate + self.epsilon = args.epsilon + self.episode_limit = args.max_episode_len + self.env = env + self.agents = self._init_agents() + self.buffer = Buffer(args) + self.save_path = self.args.save_dir + '/' + self.args.scenario_name + if not os.path.exists(self.save_path): + os.makedirs(self.save_path) + + def _init_agents(self): + agents = [] + for i in range(self.args.n_agents): + agent = Agent(i, self.args) + agents.append(agent) + return agents + + def run(self): + returns = [] + for time_step in tqdm(range(self.args.time_steps)): + # reset the environment + if time_step % self.episode_limit == 0: + s = self.env.reset() + u = [] + actions = [] + with torch.no_grad(): + for agent_id, agent in enumerate(self.agents): + action = agent.select_action(s[agent_id], self.noise, self.epsilon) + u.append(action) + actions.append(action) + for i in range(self.args.n_agents, self.args.n_players): + actions.append([0, np.random.rand() * 2 - 1, 0, np.random.rand() * 2 - 1, 0]) + s_next, r, done, info = self.env.step(actions) + self.buffer.store_episode(s[:self.args.n_agents], u, r[:self.args.n_agents], s_next[:self.args.n_agents]) + s = s_next + if self.buffer.current_size >= self.args.batch_size: + transitions = self.buffer.sample(self.args.batch_size) + for agent in self.agents: + other_agents = self.agents.copy() + 
                    other_agents.remove(agent)
+                    agent.learn(transitions, other_agents)
+            if time_step > 0 and time_step % self.args.evaluate_rate == 0:
+                returns.append(self.evaluate())
+                plt.figure()
+                plt.plot(range(len(returns)), returns)
+                plt.xlabel('episode * ' + str(self.args.evaluate_rate / self.episode_limit))
+                plt.ylabel('average returns')
+                plt.savefig(self.save_path + '/plt.png', format='png')
+                plt.close()  # release the figure so repeated evaluations do not accumulate open figures
+            self.noise = max(0.05, self.noise - 0.0000005)
+            self.epsilon = max(0.05, self.epsilon - 0.0000005)
+            np.save(self.save_path + '/returns.npy', returns)  # np.save writes NumPy's .npy format
+
+    def evaluate(self):
+        returns = []
+        for episode in range(self.args.evaluate_episodes):
+            # reset the environment
+            s = self.env.reset()
+            rewards = 0
+            for time_step in range(self.args.evaluate_episode_len):
+                # if episode > self.args.evaluate_episode_len - 50:
+                #     self.env.render()
+                actions = []
+                with torch.no_grad():
+                    for agent_id, agent in enumerate(self.agents):
+                        action = agent.select_action(s[agent_id], 0, 0)
+                        actions.append(action)
+                for i in range(self.args.n_agents, self.args.n_players):
+                    actions.append([0, np.random.rand() * 2 - 1, 0, np.random.rand() * 2 - 1, 0])
+                s_next, r, done, info = self.env.step(actions)
+                rewards += r[0]
+                s = s_next
+            returns.append(rewards)
+            if episode % 1000 == 0:
+                print('Return is', rewards)
+        return sum(returns) / self.args.evaluate_episodes
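
For readers wiring these pieces together, the snippet below is a minimal, illustrative entry point, not a file in this patch. It assumes an environment helper (called make_env here, hypothetically living in common/utils.py) that builds the particle environment for args.scenario_name and attaches the fields Runner and Agent rely on (n_agents, n_players, obs_shape, action_shape, high_action, evaluate_rate); those fields are referenced in the code above but defined outside the lines shown, so treat the wiring as a sketch under those assumptions.

# main.py -- illustrative sketch only; make_env is an assumed helper, not part of this diff
from common.arguments import get_args   # argparse settings from common/arguments.py
from runner import Runner
# assumed helper that creates the environment and fills in n_agents, n_players,
# obs_shape, action_shape, high_action and evaluate_rate on args
from common.utils import make_env

if __name__ == '__main__':
    args = get_args()
    env, args = make_env(args)
    runner = Runner(args, env)
    if args.evaluate:
        # greedy rollouts: evaluate() passes noise_rate=0 and epsilon=0 to select_action
        print('Average returns:', runner.evaluate())
    else:
        # off-policy training loop; the return curve and returns array are written
        # under args.save_dir/args.scenario_name by Runner.run()
        runner.run()

One design point worth noting: run() and evaluate() only query the learned policies for the first n_agents players and feed hand-rolled 5-dimensional random actions to the remaining players, so whatever helper sets n_agents, n_players and high_action must keep them consistent with the scenario's action space, since select_action clips its output to the ±high_action range.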