default actor=adam3-64, critic=adam3-64, b=256, ep=245k

Ivan I. Ovchinnikov 2023-01-13 17:16:51 +03:00
parent 5c11322e05
commit 0a78c5a7d9
24 changed files with 1917 additions and 1 deletion

3
.gitignore vendored

@@ -1 +1,4 @@
.ipynb*
/model/*/*/*
!/model/*/*/actor*
!/model/*/*/critic*

28
agent.py Normal file

@@ -0,0 +1,28 @@
import numpy as np
import torch
import os
from maddpg.maddpg import MADDPG
class Agent:
def __init__(self, agent_id, args):
self.args = args
self.agent_id = agent_id
self.policy = MADDPG(args, agent_id)
def select_action(self, o, noise_rate, epsilon):
if np.random.uniform() < epsilon:
u = np.random.uniform(-self.args.high_action, self.args.high_action, self.args.action_shape[self.agent_id])
else:
inputs = torch.tensor(o, dtype=torch.float32).unsqueeze(0)
pi = self.policy.actor_network(inputs).squeeze(0)
# print('{} : {}'.format(self.name, pi))
u = pi.cpu().numpy()
noise = noise_rate * self.args.high_action * np.random.randn(*u.shape) # gaussian noise
u += noise
u = np.clip(u, -self.args.high_action, self.args.high_action)
return u.copy()
def learn(self, transitions, other_agents):
self.policy.train(transitions, other_agents)
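The exploration in select_action is epsilon-greedy over a noisy deterministic policy: with probability epsilon a uniform random action is drawn; otherwise the actor output is perturbed with Gaussian noise of scale noise_rate * high_action and clipped to the action bounds. A minimal standalone sketch of just that rule (high_action=1, the 5-dimensional action and the zero stand-in for the actor output are illustrative assumptions, not part of the commit):

import numpy as np

high_action, noise_rate, epsilon = 1.0, 0.1, 0.1
pi = np.zeros(5)                            # stand-in for actor_network(o)
if np.random.uniform() < epsilon:           # explore: uniform random action
    u = np.random.uniform(-high_action, high_action, 5)
else:                                       # exploit: actor output + Gaussian noise, clipped
    u = pi + noise_rate * high_action * np.random.randn(5)
    u = np.clip(u, -high_action, high_action)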

38
common/arguments.py Normal file

@@ -0,0 +1,38 @@
import argparse
"""
Command-line parameters for training and evaluation.
"""
def get_args():
parser = argparse.ArgumentParser("Reinforcement Learning experiments for multiagent environments")
# Environment
parser.add_argument("--scenario-name", type=str, default="simple_tag", help="name of the scenario script")
parser.add_argument("--max-episode-len", type=int, default=100, help="maximum episode length")
parser.add_argument("--time-steps", type=int, default=2000000, help="number of time steps")
# agents
parser.add_argument("--num-adversaries", type=int, default=1, help="number of adversaries")
# Core training parameters
parser.add_argument("--lr-actor", type=float, default=1e-4, help="learning rate of actor")
parser.add_argument("--lr-critic", type=float, default=1e-3, help="learning rate of critic")
parser.add_argument("--epsilon", type=float, default=0.1, help="epsilon greedy")
parser.add_argument("--noise_rate", type=float, default=0.1, help="noise rate for sampling from a standard normal distribution ")
parser.add_argument("--gamma", type=float, default=0.95, help="discount factor")
parser.add_argument("--tau", type=float, default=0.01, help="parameter for updating the target network")
parser.add_argument("--buffer-size", type=int, default=int(5e5), help="number of transitions can be stored in buffer")
parser.add_argument("--batch-size", type=int, default=256, help="number of episodes to optimize at the same time")
# Checkpointing
parser.add_argument("--save-dir", type=str, default="./model", help="directory in which training state and model should be saved")
parser.add_argument("--save-rate", type=int, default=2000, help="save model once every time this many episodes are completed")
parser.add_argument("--model-dir", type=str, default="", help="directory in which training state and model are loaded")
# Evaluate
parser.add_argument("--evaluate-episodes", type=int, default=10, help="number of episodes for evaluating")
parser.add_argument("--evaluate-episode-len", type=int, default=100, help="length of episodes for evaluating")
parser.add_argument("--evaluate", type=bool, default=False, help="whether to evaluate the model")
parser.add_argument("--evaluate-rate", type=int, default=1000, help="how often to evaluate model")
args = parser.parse_args()
return args
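Any of these defaults can be overridden from the command line, which is how the notebook further below launches training. A small sketch of parsing overridden arguments (the sys.argv contents are just an example run, not a prescribed configuration):

import sys
from common.arguments import get_args

sys.argv = ['main.py', '--scenario-name=simple_adversary',
            '--evaluate-episodes=10000', '--save-rate=100']
args = get_args()
print(args.scenario_name, args.batch_size, args.lr_actor)   # simple_adversary 256 0.0001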

53
common/replay_buffer.py Normal file

@@ -0,0 +1,53 @@
import threading
import numpy as np
class Buffer:
def __init__(self, args):
self.size = args.buffer_size
self.args = args
# memory management
self.current_size = 0
# create the buffer to store info
self.buffer = dict()
for i in range(self.args.n_agents):
self.buffer['o_%d' % i] = np.empty([self.size, self.args.obs_shape[i]])
self.buffer['u_%d' % i] = np.empty([self.size, self.args.action_shape[i]])
self.buffer['r_%d' % i] = np.empty([self.size])
self.buffer['o_next_%d' % i] = np.empty([self.size, self.args.obs_shape[i]])
# thread lock
self.lock = threading.Lock()
# store the episode
def store_episode(self, o, u, r, o_next):
idxs = self._get_storage_idx(inc=1)
for i in range(self.args.n_agents):
with self.lock:
self.buffer['o_%d' % i][idxs] = o[i]
self.buffer['u_%d' % i][idxs] = u[i]
self.buffer['r_%d' % i][idxs] = r[i]
self.buffer['o_next_%d' % i][idxs] = o_next[i]
# sample the data from the replay buffer
def sample(self, batch_size):
temp_buffer = {}
idx = np.random.randint(0, self.current_size, batch_size)
for key in self.buffer.keys():
temp_buffer[key] = self.buffer[key][idx]
return temp_buffer
def _get_storage_idx(self, inc=None):
inc = inc or 1
if self.current_size+inc <= self.size:
idx = np.arange(self.current_size, self.current_size+inc)
elif self.current_size < self.size:
overflow = inc - (self.size - self.current_size)
idx_a = np.arange(self.current_size, self.size)
idx_b = np.random.randint(0, self.current_size, overflow)
idx = np.concatenate([idx_a, idx_b])
else:
idx = np.random.randint(0, self.size, inc)
self.current_size = min(self.size, self.current_size+inc)
if inc == 1:
idx = idx[0]
return idx
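Despite the name, store_episode writes a single time step per call (inc=1), and sample draws indices uniformly from the filled part of the buffer. A usage sketch with a stub args object (the two-agent shapes are illustrative, not the real scenario dimensions):

from types import SimpleNamespace
import numpy as np
from common.replay_buffer import Buffer

args = SimpleNamespace(buffer_size=1000, n_agents=2, obs_shape=[8, 8], action_shape=[5, 5])
buf = Buffer(args)
for _ in range(300):
    o = [np.random.randn(8) for _ in range(2)]       # per-agent observations
    u = [np.random.randn(5) for _ in range(2)]       # per-agent actions
    buf.store_episode(o, u, [0.0, 0.0], o)           # o reused as o_next for brevity
batch = buf.sample(256)
print(batch['o_0'].shape, batch['u_1'].shape)        # (256, 8) (256, 5)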

54
common/utils.py Normal file

@@ -0,0 +1,54 @@
import numpy as np
import inspect
import functools
def store_args(method):
"""Stores provided method args as instance attributes.
"""
argspec = inspect.getfullargspec(method)
defaults = {}
if argspec.defaults is not None:
defaults = dict(
zip(argspec.args[-len(argspec.defaults):], argspec.defaults))
if argspec.kwonlydefaults is not None:
defaults.update(argspec.kwonlydefaults)
arg_names = argspec.args[1:]
@functools.wraps(method)
def wrapper(*positional_args, **keyword_args):
self = positional_args[0]
# Get default arg values
args = defaults.copy()
# Add provided arg values
for name, value in zip(arg_names, positional_args[1:]):
args[name] = value
args.update(keyword_args)
self.__dict__.update(args)
return method(*positional_args, **keyword_args)
return wrapper
def make_env(args):
from multiagent.environment import MultiAgentEnv
import multiagent.scenarios as scenarios
# load scenario from script
scenario = scenarios.load(args.scenario_name + ".py").Scenario()
# create world
world = scenario.make_world()
# create multiagent environment
env = MultiAgentEnv(world, scenario.reset_world, scenario.reward, scenario.observation)
# env = MultiAgentEnv(world)
args.n_players = env.n
args.n_agents = env.n - args.num_adversaries
args.obs_shape = [env.observation_space[i].shape[0] for i in range(args.n_agents)]
action_shape = []
for content in env.action_space:
action_shape.append(content.n)
args.action_shape = action_shape[:args.n_agents]
args.high_action = 1
args.low_action = -1
return env, args
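make_env both builds the particle environment and fills in the agent-dependent fields of args (n_players, n_agents, obs_shape, action_shape, high_action). A sketch of the same call main.py makes, assuming the multiagent package from this commit and the pinned dependencies installed in the notebook below (gym==0.10.5, pyglet==1.5.27):

from common.arguments import get_args
from common.utils import make_env

args = get_args()                  # defaults to the simple_tag scenario
env, args = make_env(args)
print(args.n_agents, args.obs_shape, args.action_shape, args.high_action)
o = env.reset()                    # list with one observation per policy agent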

44
maddpg/actor_critic.py Normal file

@@ -0,0 +1,44 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
# define the actor network
class Actor(nn.Module):
def __init__(self, args, agent_id):
super(Actor, self).__init__()
self.max_action = args.high_action
self.fc1 = nn.Linear(args.obs_shape[agent_id], 64)
self.fc2 = nn.Linear(64, 64)
self.fc3 = nn.Linear(64, 64)
self.action_out = nn.Linear(64, args.action_shape[agent_id])
def forward(self, x):
x = F.relu(self.fc1(x))
x = F.relu(self.fc2(x))
x = F.relu(self.fc3(x))
actions = self.max_action * torch.tanh(self.action_out(x))
return actions
class Critic(nn.Module):
def __init__(self, args):
super(Critic, self).__init__()
self.max_action = args.high_action
self.fc1 = nn.Linear(sum(args.obs_shape) + sum(args.action_shape), 64)
self.fc2 = nn.Linear(64, 64)
self.fc3 = nn.Linear(64, 64)
self.q_out = nn.Linear(64, 1)
def forward(self, state, action):
state = torch.cat(state, dim=1)
for i in range(len(action)):
action[i] /= self.max_action
action = torch.cat(action, dim=1)
x = torch.cat([state, action], dim=1)
x = F.relu(self.fc1(x))
x = F.relu(self.fc2(x))
x = F.relu(self.fc3(x))
q_value = self.q_out(x)
return q_value
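Both networks use three 64-unit hidden layers, matching the adam3-64 note in the commit message; the actor sees only its own observation, while the critic is centralised over every agent's observation and action. A quick shape check with made-up dimensions (the obs/action sizes below are illustrative only):

import torch
from types import SimpleNamespace
from maddpg.actor_critic import Actor, Critic

args = SimpleNamespace(high_action=1, obs_shape=[8, 10], action_shape=[5, 5])
actor = Actor(args, agent_id=0)
critic = Critic(args)
o = [torch.randn(4, 8), torch.randn(4, 10)]          # batch of 4 for each of 2 agents
u = [torch.randn(4, 5), torch.randn(4, 5)]
print(actor(o[0]).shape)                             # torch.Size([4, 5])
print(critic(o, u).shape)                            # torch.Size([4, 1])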

114
maddpg/maddpg.py Normal file

@@ -0,0 +1,114 @@
import torch
import os
from maddpg.actor_critic import Actor, Critic
class MADDPG:
def __init__(self, args, agent_id):
self.args = args
self.agent_id = agent_id
self.train_step = 0
# create the network
self.actor_network = Actor(args, agent_id)
self.critic_network = Critic(args)
# build up the target network
self.actor_target_network = Actor(args, agent_id)
self.critic_target_network = Critic(args)
# load the weights into the target networks
self.actor_target_network.load_state_dict(self.actor_network.state_dict())
self.critic_target_network.load_state_dict(self.critic_network.state_dict())
# create the optimizer
self.actor_optim = torch.optim.Adam(self.actor_network.parameters(), lr=self.args.lr_actor)
self.critic_optim = torch.optim.Adam(self.critic_network.parameters(), lr=self.args.lr_critic)
# create the dict for store the model
if not os.path.exists(self.args.save_dir):
os.mkdir(self.args.save_dir)
# path to save the model
self.model_path = self.args.save_dir + '/' + self.args.scenario_name
if not os.path.exists(self.model_path):
os.mkdir(self.model_path)
self.model_path = self.model_path + '/' + 'agent_%d' % agent_id
if not os.path.exists(self.model_path):
os.mkdir(self.model_path)
if os.path.exists(self.model_path + '/actor_params.pkl'):
self.actor_network.load_state_dict(torch.load(self.model_path + '/actor_params.pkl'))
self.critic_network.load_state_dict(torch.load(self.model_path + '/critic_params.pkl'))
print('Agent {} successfully loaded actor_network: {}'.format(self.agent_id,
self.model_path + '/actor_params.pkl'))
print('Agent {} successfully loaded critic_network: {}'.format(self.agent_id,
self.model_path + '/critic_params.pkl'))
# soft update
def _soft_update_target_network(self):
for target_param, param in zip(self.actor_target_network.parameters(), self.actor_network.parameters()):
target_param.data.copy_((1 - self.args.tau) * target_param.data + self.args.tau * param.data)
for target_param, param in zip(self.critic_target_network.parameters(), self.critic_network.parameters()):
target_param.data.copy_((1 - self.args.tau) * target_param.data + self.args.tau * param.data)
# update the network
def train(self, transitions, other_agents):
for key in transitions.keys():
transitions[key] = torch.tensor(transitions[key], dtype=torch.float32)
r = transitions['r_%d' % self.agent_id] # reward
o, u, o_next = [], [], [] # agent
for agent_id in range(self.args.n_agents):
o.append(transitions['o_%d' % agent_id])
u.append(transitions['u_%d' % agent_id])
o_next.append(transitions['o_next_%d' % agent_id])
# calculate the target Q value function
u_next = []
with torch.no_grad():
index = 0
for agent_id in range(self.args.n_agents):
if agent_id == self.agent_id:
u_next.append(self.actor_target_network(o_next[agent_id]))
else:
# other_agents
u_next.append(other_agents[index].policy.actor_target_network(o_next[agent_id]))
index += 1
q_next = self.critic_target_network(o_next, u_next).detach()
target_q = (r.unsqueeze(1) + self.args.gamma * q_next).detach()
# the q loss
q_value = self.critic_network(o, u)
critic_loss = (target_q - q_value).pow(2).mean()
# the actor loss
u[self.agent_id] = self.actor_network(o[self.agent_id])
actor_loss = - self.critic_network(o, u).mean()
#if self.agent_id == 0:
# print('critic_loss is {}, actor_loss is {}'.format(critic_loss, actor_loss))
# update the network
self.actor_optim.zero_grad()
actor_loss.backward()
self.actor_optim.step()
self.critic_optim.zero_grad()
critic_loss.backward()
self.critic_optim.step()
self._soft_update_target_network()
if self.train_step > 0 and self.train_step % self.args.save_rate == 0:
self.save_model(self.train_step)
self.train_step += 1
def save_model(self, train_step):
num = str(train_step // self.args.save_rate)
model_path = os.path.join(self.args.save_dir, self.args.scenario_name)
if not os.path.exists(model_path):
os.makedirs(model_path)
model_path = os.path.join(model_path, 'agent_%d' % self.agent_id)
if not os.path.exists(model_path):
os.makedirs(model_path)
torch.save(self.actor_network.state_dict(), model_path + '/' + num + '_actor_params.pkl')
torch.save(self.critic_network.state_dict(), model_path + '/' + num + '_critic_params.pkl')
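The update is the standard MADDPG recipe: a centralised target y = r + gamma * Q'(o', a') built with every agent's target actor, an MSE critic loss, an actor loss of -Q(o, a) with this agent's action replaced by its current actor output, and Polyak averaging of the target networks with rate tau. A toy illustration of the soft update alone (tau=0.01 as in the defaults; the scalar "parameters" are made up):

tau = 0.01
target_param, param = 0.0, 1.0
for _ in range(3):
    target_param = (1 - tau) * target_param + tau * param
print(round(target_param, 6))      # 0.029701 -- targets drift slowly toward the online nets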

331
main.ipynb Normal file

@@ -0,0 +1,331 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "11c84981",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Looking in indexes: https://nexus.c68.spacecorp.ru/repository/pypi_group/simple/\n",
"Requirement already satisfied: torch==1.7.1 in /home/ovchinnikov_ii@RISDE.ru/Software/Jupyter/venv/lib/python3.9/site-packages (1.7.1)\n",
"Requirement already satisfied: typing-extensions in /home/ovchinnikov_ii@RISDE.ru/Software/Jupyter/venv/lib/python3.9/site-packages (from torch==1.7.1) (4.4.0)\n",
"Requirement already satisfied: numpy in /home/ovchinnikov_ii@RISDE.ru/Software/Jupyter/venv/lib/python3.9/site-packages (from torch==1.7.1) (1.24.1)\n",
"Note: you may need to restart the kernel to use updated packages.\n"
]
}
],
"source": [
"pip install torch==1.7.1"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "735f4c82",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Looking in indexes: https://nexus.c68.spacecorp.ru/repository/pypi_group/simple/\n",
"Collecting supersuit==2.6.5\n",
" Using cached SuperSuit-2.6.5-py3-none-any.whl\n",
"Requirement already satisfied: cloudpickle in /home/ovchinnikov_ii@RISDE.ru/Software/Jupyter/venv/lib/python3.9/site-packages (from supersuit==2.6.5) (2.2.0)\n",
"Collecting pettingzoo>=1.6.0\n",
" Using cached https://nexus.c68.spacecorp.ru/repository/pypi_group/packages/pettingzoo/1.22.3/PettingZoo-1.22.3-py3-none-any.whl (816 kB)\n",
"Collecting opencv-python~=3.4.0\n",
" Using cached https://nexus.c68.spacecorp.ru/repository/pypi_group/packages/opencv-python/3.4.18.65/opencv_python-3.4.18.65-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (58.4 MB)\n",
"Requirement already satisfied: numpy>=1.19.3 in /home/ovchinnikov_ii@RISDE.ru/Software/Jupyter/venv/lib/python3.9/site-packages (from opencv-python~=3.4.0->supersuit==2.6.5) (1.24.1)\n",
"Requirement already satisfied: gymnasium>=0.26.0 in /home/ovchinnikov_ii@RISDE.ru/Software/Jupyter/venv/lib/python3.9/site-packages (from pettingzoo>=1.6.0->supersuit==2.6.5) (0.27.0)\n",
"Requirement already satisfied: typing-extensions>=4.3.0 in /home/ovchinnikov_ii@RISDE.ru/Software/Jupyter/venv/lib/python3.9/site-packages (from gymnasium>=0.26.0->pettingzoo>=1.6.0->supersuit==2.6.5) (4.4.0)\n",
"Requirement already satisfied: importlib-metadata>=4.8.0 in /home/ovchinnikov_ii@RISDE.ru/Software/Jupyter/venv/lib/python3.9/site-packages (from gymnasium>=0.26.0->pettingzoo>=1.6.0->supersuit==2.6.5) (6.0.0)\n",
"Requirement already satisfied: shimmy<1.0,>=0.1.0 in /home/ovchinnikov_ii@RISDE.ru/Software/Jupyter/venv/lib/python3.9/site-packages (from gymnasium>=0.26.0->pettingzoo>=1.6.0->supersuit==2.6.5) (0.2.0)\n",
"Requirement already satisfied: jax-jumpy>=0.2.0 in /home/ovchinnikov_ii@RISDE.ru/Software/Jupyter/venv/lib/python3.9/site-packages (from gymnasium>=0.26.0->pettingzoo>=1.6.0->supersuit==2.6.5) (0.2.0)\n",
"Requirement already satisfied: gymnasium-notices>=0.0.1 in /home/ovchinnikov_ii@RISDE.ru/Software/Jupyter/venv/lib/python3.9/site-packages (from gymnasium>=0.26.0->pettingzoo>=1.6.0->supersuit==2.6.5) (0.0.1)\n",
"Requirement already satisfied: zipp>=0.5 in /home/ovchinnikov_ii@RISDE.ru/Software/Jupyter/venv/lib/python3.9/site-packages (from importlib-metadata>=4.8.0->gymnasium>=0.26.0->pettingzoo>=1.6.0->supersuit==2.6.5) (3.11.0)\n",
"Installing collected packages: pettingzoo, opencv-python, supersuit\n",
" Attempting uninstall: pettingzoo\n",
" Found existing installation: PettingZoo 1.3.3\n",
" Uninstalling PettingZoo-1.3.3:\n",
" Successfully uninstalled PettingZoo-1.3.3\n",
" Attempting uninstall: opencv-python\n",
" Found existing installation: opencv-python 4.7.0.68\n",
" Uninstalling opencv-python-4.7.0.68:\n",
" Successfully uninstalled opencv-python-4.7.0.68\n",
" Attempting uninstall: supersuit\n",
" Found existing installation: SuperSuit 3.7.1\n",
" Uninstalling SuperSuit-3.7.1:\n",
" Successfully uninstalled SuperSuit-3.7.1\n",
"Successfully installed opencv-python-3.4.18.65 pettingzoo-1.22.3 supersuit-2.6.5\n",
"Note: you may need to restart the kernel to use updated packages.\n"
]
}
],
"source": [
"pip install supersuit==2.6.5"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "3b8272be",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Looking in indexes: https://nexus.c68.spacecorp.ru/repository/pypi_group/simple/\n",
"Collecting tqdm\n",
" Using cached https://nexus.c68.spacecorp.ru/repository/pypi_group/packages/tqdm/4.64.1/tqdm-4.64.1-py2.py3-none-any.whl (78 kB)\n",
"Installing collected packages: tqdm\n",
"Successfully installed tqdm-4.64.1\n",
"Note: you may need to restart the kernel to use updated packages.\n"
]
}
],
"source": [
"pip install tqdm"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "acc570b8",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Looking in indexes: https://nexus.c68.spacecorp.ru/repository/pypi_group/simple/\n",
"Requirement already satisfied: matplotlib in /home/ovchinnikov_ii@RISDE.ru/Software/Jupyter/venv/lib/python3.9/site-packages (3.6.3)\n",
"Requirement already satisfied: contourpy>=1.0.1 in /home/ovchinnikov_ii@RISDE.ru/Software/Jupyter/venv/lib/python3.9/site-packages (from matplotlib) (1.0.6)\n",
"Requirement already satisfied: numpy>=1.19 in /home/ovchinnikov_ii@RISDE.ru/Software/Jupyter/venv/lib/python3.9/site-packages (from matplotlib) (1.24.1)\n",
"Requirement already satisfied: pyparsing>=2.2.1 in /home/ovchinnikov_ii@RISDE.ru/Software/Jupyter/venv/lib/python3.9/site-packages (from matplotlib) (3.0.9)\n",
"Requirement already satisfied: packaging>=20.0 in /home/ovchinnikov_ii@RISDE.ru/Software/Jupyter/venv/lib/python3.9/site-packages (from matplotlib) (23.0)\n",
"Requirement already satisfied: fonttools>=4.22.0 in /home/ovchinnikov_ii@RISDE.ru/Software/Jupyter/venv/lib/python3.9/site-packages (from matplotlib) (4.38.0)\n",
"Requirement already satisfied: cycler>=0.10 in /home/ovchinnikov_ii@RISDE.ru/Software/Jupyter/venv/lib/python3.9/site-packages (from matplotlib) (0.11.0)\n",
"Requirement already satisfied: kiwisolver>=1.0.1 in /home/ovchinnikov_ii@RISDE.ru/Software/Jupyter/venv/lib/python3.9/site-packages (from matplotlib) (1.4.4)\n",
"Requirement already satisfied: pillow>=6.2.0 in /home/ovchinnikov_ii@RISDE.ru/Software/Jupyter/venv/lib/python3.9/site-packages (from matplotlib) (9.4.0)\n",
"Requirement already satisfied: python-dateutil>=2.7 in /home/ovchinnikov_ii@RISDE.ru/Software/Jupyter/venv/lib/python3.9/site-packages (from matplotlib) (2.8.2)\n",
"Requirement already satisfied: six>=1.5 in /home/ovchinnikov_ii@RISDE.ru/Software/Jupyter/venv/lib/python3.9/site-packages (from python-dateutil>=2.7->matplotlib) (1.16.0)\n",
"Note: you may need to restart the kernel to use updated packages.\n"
]
}
],
"source": [
"pip install matplotlib"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "262ae5d6",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Looking in indexes: https://nexus.c68.spacecorp.ru/repository/pypi_group/simple/\n",
"Collecting gym==0.10.5\n",
" Using cached gym-0.10.5-py3-none-any.whl\n",
"Requirement already satisfied: six in /home/ovchinnikov_ii@RISDE.ru/Software/Jupyter/venv/lib/python3.9/site-packages (from gym==0.10.5) (1.16.0)\n",
"Collecting pyglet>=1.2.0\n",
" Using cached https://nexus.c68.spacecorp.ru/repository/pypi_group/packages/pyglet/2.0.3/pyglet-2.0.3-py3-none-any.whl (968 kB)\n",
"Requirement already satisfied: numpy>=1.10.4 in /home/ovchinnikov_ii@RISDE.ru/Software/Jupyter/venv/lib/python3.9/site-packages (from gym==0.10.5) (1.24.1)\n",
"Requirement already satisfied: requests>=2.0 in /home/ovchinnikov_ii@RISDE.ru/Software/Jupyter/venv/lib/python3.9/site-packages (from gym==0.10.5) (2.28.1)\n",
"Requirement already satisfied: idna<4,>=2.5 in /home/ovchinnikov_ii@RISDE.ru/Software/Jupyter/venv/lib/python3.9/site-packages (from requests>=2.0->gym==0.10.5) (3.4)\n",
"Requirement already satisfied: charset-normalizer<3,>=2 in /home/ovchinnikov_ii@RISDE.ru/Software/Jupyter/venv/lib/python3.9/site-packages (from requests>=2.0->gym==0.10.5) (2.1.1)\n",
"Requirement already satisfied: urllib3<1.27,>=1.21.1 in /home/ovchinnikov_ii@RISDE.ru/Software/Jupyter/venv/lib/python3.9/site-packages (from requests>=2.0->gym==0.10.5) (1.26.14)\n",
"Requirement already satisfied: certifi>=2017.4.17 in /home/ovchinnikov_ii@RISDE.ru/Software/Jupyter/venv/lib/python3.9/site-packages (from requests>=2.0->gym==0.10.5) (2022.12.7)\n",
"Installing collected packages: pyglet, gym\n",
" Attempting uninstall: gym\n",
" Found existing installation: gym 0.22.0\n",
" Uninstalling gym-0.22.0:\n",
" Successfully uninstalled gym-0.22.0\n",
"Successfully installed gym-0.10.5 pyglet-2.0.3\n",
"Note: you may need to restart the kernel to use updated packages.\n"
]
}
],
"source": [
"pip install gym==0.10.5"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "9c651810",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Looking in indexes: https://nexus.c68.spacecorp.ru/repository/pypi_group/simple/\n",
"Collecting pyglet==1.5.27\n",
" Using cached https://nexus.c68.spacecorp.ru/repository/pypi_group/packages/pyglet/1.5.27/pyglet-1.5.27-py3-none-any.whl (1.1 MB)\n",
"Installing collected packages: pyglet\n",
" Attempting uninstall: pyglet\n",
" Found existing installation: pyglet 2.0.3\n",
" Uninstalling pyglet-2.0.3:\n",
" Successfully uninstalled pyglet-2.0.3\n",
"Successfully installed pyglet-1.5.27\n",
"Note: you may need to restart the kernel to use updated packages.\n"
]
}
],
"source": [
"pip install pyglet==1.5.27"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "cb877007",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Пытаемся загрузить данные!\n",
"Пытаемся загрузить данные!\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 0%| | 203/2000000 [00:00<32:10, 1036.07it/s]/home/ovchinnikov_ii@RISDE.ru/Software/Jupyter/MADDPG/maddpg/maddpg.py:60: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
" transitions[key] = torch.tensor(transitions[key], dtype=torch.float32)\n",
" 0%| | 307/2000000 [00:01<2:18:56, 239.88it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Пытаемся сохранить данные по пути = ./model/simple_adversary/agent_0/1_actor_params.pkl\n",
"Пытаемся сохранить данные по пути = ./model/simple_adversary/agent_1/1_actor_params.pkl\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 0%| | 459/2000000 [00:03<6:02:10, 92.02it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Пытаемся сохранить данные по пути = ./model/simple_adversary/agent_0/2_actor_params.pkl\n",
"Пытаемся сохранить данные по пути = ./model/simple_adversary/agent_1/2_actor_params.pkl\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 0%| | 566/2000000 [00:04<7:30:23, 73.99it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Пытаемся сохранить данные по пути = ./model/simple_adversary/agent_0/3_actor_params.pkl\n",
"Пытаемся сохранить данные по пути = ./model/simple_adversary/agent_1/3_actor_params.pkl\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 0%| | 667/2000000 [00:06<8:44:54, 63.48it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Пытаемся сохранить данные по пути = ./model/simple_adversary/agent_0/4_actor_params.pkl\n",
"Пытаемся сохранить данные по пути = ./model/simple_adversary/agent_1/4_actor_params.pkl\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 0%| | 717/2000000 [00:07<5:36:33, 99.01it/s]\n"
]
},
{
"ename": "KeyboardInterrupt",
"evalue": "",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
"File \u001b[0;32m~/Software/Jupyter/MADDPG/main.py:18\u001b[0m\n\u001b[1;32m 16\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mAverage returns is\u001b[39m\u001b[38;5;124m'\u001b[39m, returns)\n\u001b[1;32m 17\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m---> 18\u001b[0m \u001b[43mrunner\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[0;32m~/Software/Jupyter/MADDPG/runner.py:52\u001b[0m, in \u001b[0;36mRunner.run\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 50\u001b[0m other_agents \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39magents\u001b[38;5;241m.\u001b[39mcopy()\n\u001b[1;32m 51\u001b[0m other_agents\u001b[38;5;241m.\u001b[39mremove(agent)\n\u001b[0;32m---> 52\u001b[0m \u001b[43magent\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mlearn\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtransitions\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mother_agents\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 53\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m time_step \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m0\u001b[39m \u001b[38;5;129;01mand\u001b[39;00m time_step \u001b[38;5;241m%\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39margs\u001b[38;5;241m.\u001b[39mevaluate_rate \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m0\u001b[39m:\n\u001b[1;32m 54\u001b[0m returns\u001b[38;5;241m.\u001b[39mappend(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mevaluate())\n",
"File \u001b[0;32m~/Software/Jupyter/MADDPG/agent.py:27\u001b[0m, in \u001b[0;36mAgent.learn\u001b[0;34m(self, transitions, other_agents)\u001b[0m\n\u001b[1;32m 26\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mlearn\u001b[39m(\u001b[38;5;28mself\u001b[39m, transitions, other_agents):\n\u001b[0;32m---> 27\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpolicy\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtrain\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtransitions\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mother_agents\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[0;32m~/Software/Jupyter/MADDPG/maddpg/maddpg.py:95\u001b[0m, in \u001b[0;36mMADDPG.train\u001b[0;34m(self, transitions, other_agents)\u001b[0m\n\u001b[1;32m 93\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mactor_optim\u001b[38;5;241m.\u001b[39mzero_grad()\n\u001b[1;32m 94\u001b[0m actor_loss\u001b[38;5;241m.\u001b[39mbackward()\n\u001b[0;32m---> 95\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mactor_optim\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mstep\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 96\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcritic_optim\u001b[38;5;241m.\u001b[39mzero_grad()\n\u001b[1;32m 97\u001b[0m critic_loss\u001b[38;5;241m.\u001b[39mbackward()\n",
"File \u001b[0;32m~/Software/Jupyter/venv/lib/python3.9/site-packages/torch/autograd/grad_mode.py:26\u001b[0m, in \u001b[0;36m_DecoratorContextManager.__call__.<locals>.decorate_context\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 23\u001b[0m \u001b[38;5;129m@functools\u001b[39m\u001b[38;5;241m.\u001b[39mwraps(func)\n\u001b[1;32m 24\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mdecorate_context\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m 25\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__class__\u001b[39m():\n\u001b[0;32m---> 26\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[0;32m~/Software/Jupyter/venv/lib/python3.9/site-packages/torch/optim/adamw.py:116\u001b[0m, in \u001b[0;36mAdamW.step\u001b[0;34m(self, closure)\u001b[0m\n\u001b[1;32m 112\u001b[0m denom \u001b[38;5;241m=\u001b[39m (exp_avg_sq\u001b[38;5;241m.\u001b[39msqrt() \u001b[38;5;241m/\u001b[39m math\u001b[38;5;241m.\u001b[39msqrt(bias_correction2))\u001b[38;5;241m.\u001b[39madd_(group[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124meps\u001b[39m\u001b[38;5;124m'\u001b[39m])\n\u001b[1;32m 114\u001b[0m step_size \u001b[38;5;241m=\u001b[39m group[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mlr\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m/\u001b[39m bias_correction1\n\u001b[0;32m--> 116\u001b[0m p\u001b[38;5;241m.\u001b[39maddcdiv_(exp_avg, denom, value\u001b[38;5;241m=\u001b[39m\u001b[38;5;241;43m-\u001b[39;49m\u001b[43mstep_size\u001b[49m)\n\u001b[1;32m 118\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m loss\n",
"\u001b[0;31mKeyboardInterrupt\u001b[0m: "
]
}
],
"source": [
"%run ./main.py --scenario-name=simple_adversary --evaluate-episodes=10000 --save-rate=100"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d71a2b22",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "d079aff2",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "venv",
"language": "python",
"name": "venv"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

18
main.py Normal file

@@ -0,0 +1,18 @@
from runner import Runner
from common.arguments import get_args
from common.utils import make_env
import numpy as np
import random
import torch
if __name__ == '__main__':
# get the params
args = get_args()
env, args = make_env(args)
runner = Runner(args, env)
if args.evaluate:
returns = runner.evaluate()
print('Average returns is', returns)
else:
runner.run()

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

18
multiagent/__init__.py Normal file

@@ -0,0 +1,18 @@
from gym.envs.registration import register
# Multiagent envs
# ----------------------------------------
register(
id='MultiagentSimple-v0',
entry_point='multiagent.envs:SimpleEnv',
# FIXME(cathywu) currently has to be exactly max_path_length parameters in
# rllab run script
max_episode_steps=100,
)
register(
id='MultiagentSimpleSpeakerListener-v0',
entry_point='multiagent.envs:SimpleSpeakerListenerEnv',
max_episode_steps=100,
)

196
multiagent/core.py Normal file

@@ -0,0 +1,196 @@
import numpy as np
# physical/external base state of all entities
class EntityState(object):
def __init__(self):
# physical position
self.p_pos = None
# physical velocity
self.p_vel = None
# state of agents (including communication and internal/mental state)
class AgentState(EntityState):
def __init__(self):
super(AgentState, self).__init__()
# communication utterance
self.c = None
# action of the agent
class Action(object):
def __init__(self):
# physical action
self.u = None
# communication action
self.c = None
# properties and state of physical world entity
class Entity(object):
def __init__(self):
# name
self.name = ''
# properties:
self.size = 0.050
# entity can move / be pushed
self.movable = False
# entity collides with others
self.collide = True
# material density (affects mass)
self.density = 25.0
# color
self.color = None
# max speed and accel
self.max_speed = None
self.accel = None
# state
self.state = EntityState()
# mass
self.initial_mass = 1.0
@property
def mass(self):
return self.initial_mass
# properties of landmark entities
class Landmark(Entity):
def __init__(self):
super(Landmark, self).__init__()
# properties of agent entities
class Agent(Entity):
def __init__(self):
super(Agent, self).__init__()
# agents are movable by default
self.movable = True
# can send communication signals by default (silent=True disables this)
self.silent = False
# can observe the world by default (blind=True disables this)
self.blind = False
# physical motor noise amount
self.u_noise = None
# communication noise amount
self.c_noise = None
# control range
self.u_range = 1.0
# state
self.state = AgentState()
# action
self.action = Action()
# script behavior to execute
self.action_callback = None
# multi-agent world
class World(object):
def __init__(self):
# list of agents and entities (can change at execution-time!)
self.agents = []
self.landmarks = []
# communication channel dimensionality
self.dim_c = 0
# position dimensionality
self.dim_p = 2
# color dimensionality
self.dim_color = 3
# simulation timestep
self.dt = 0.1
# physical damping
self.damping = 0.25
# contact response parameters
self.contact_force = 1e+2
self.contact_margin = 1e-3
# return all entities in the world
@property
def entities(self):
return self.agents + self.landmarks
# return all agents controllable by external policies
@property
def policy_agents(self):
return [agent for agent in self.agents if agent.action_callback is None]
# return all agents controlled by world scripts
@property
def scripted_agents(self):
return [agent for agent in self.agents if agent.action_callback is not None]
# update state of the world
def step(self):
# set actions for scripted agents
for agent in self.scripted_agents:
agent.action = agent.action_callback(agent, self)
# gather forces applied to entities
p_force = [None] * len(self.entities)
# apply agent physical controls
p_force = self.apply_action_force(p_force)
# apply environment forces
p_force = self.apply_environment_force(p_force)
# integrate physical state
self.integrate_state(p_force)
# update agent state
for agent in self.agents:
self.update_agent_state(agent)
# gather agent action forces
def apply_action_force(self, p_force):
# set applied forces
for i,agent in enumerate(self.agents):
if agent.movable:
noise = np.random.randn(*agent.action.u.shape) * agent.u_noise if agent.u_noise else 0.0
p_force[i] = agent.action.u + noise
return p_force
# gather physical forces acting on entities
def apply_environment_force(self, p_force):
# simple (but inefficient) collision response
for a,entity_a in enumerate(self.entities):
for b,entity_b in enumerate(self.entities):
if(b <= a): continue
[f_a, f_b] = self.get_collision_force(entity_a, entity_b)
if(f_a is not None):
if(p_force[a] is None): p_force[a] = 0.0
p_force[a] = f_a + p_force[a]
if(f_b is not None):
if(p_force[b] is None): p_force[b] = 0.0
p_force[b] = f_b + p_force[b]
return p_force
# integrate physical state
def integrate_state(self, p_force):
for i,entity in enumerate(self.entities):
if not entity.movable: continue
entity.state.p_vel = entity.state.p_vel * (1 - self.damping)
if (p_force[i] is not None):
entity.state.p_vel += (p_force[i] / entity.mass) * self.dt
if entity.max_speed is not None:
speed = np.sqrt(np.square(entity.state.p_vel[0]) + np.square(entity.state.p_vel[1]))
if speed > entity.max_speed:
entity.state.p_vel = entity.state.p_vel / np.sqrt(np.square(entity.state.p_vel[0]) +
np.square(entity.state.p_vel[1])) * entity.max_speed
entity.state.p_pos += entity.state.p_vel * self.dt
def update_agent_state(self, agent):
# set communication state (directly for now)
if agent.silent:
agent.state.c = np.zeros(self.dim_c)
else:
noise = np.random.randn(*agent.action.c.shape) * agent.c_noise if agent.c_noise else 0.0
agent.state.c = agent.action.c + noise
# get collision forces for any contact between two entities
def get_collision_force(self, entity_a, entity_b):
if (not entity_a.collide) or (not entity_b.collide):
return [None, None] # not a collider
if (entity_a is entity_b):
return [None, None] # don't collide against itself
# compute actual distance between entities
delta_pos = entity_a.state.p_pos - entity_b.state.p_pos
dist = np.sqrt(np.sum(np.square(delta_pos)))
# minimum allowable distance
dist_min = entity_a.size + entity_b.size
# softmax penetration
k = self.contact_margin
penetration = np.logaddexp(0, -(dist - dist_min)/k)*k
force = self.contact_force * delta_pos / dist * penetration
force_a = +force if entity_a.movable else None
force_b = -force if entity_b.movable else None
return [force_a, force_b]
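get_collision_force implements a soft contact model: the penetration term k * log(1 + exp((dist_min - dist) / k)) is essentially zero while the entities are separated and grows roughly linearly once they overlap, and the force acts along the line between the two centres. A small numeric illustration of that term (the distances are made up; dist_min assumes two default-size entities):

import numpy as np

k, contact_force = 1e-3, 1e+2
dist_min = 0.050 + 0.050                               # sum of the two entity sizes
for dist in (0.15, 0.10, 0.05):
    penetration = np.logaddexp(0, -(dist - dist_min) / k) * k
    print(dist, contact_force * penetration)           # ~0 when apart, ~5 when fully overlapping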

336
multiagent/environment.py Normal file

@@ -0,0 +1,336 @@
import gym
from gym import spaces
from gym.envs.registration import EnvSpec
import numpy as np
from multiagent.multi_discrete import MultiDiscrete
# environment for all agents in the multiagent world
# currently code assumes that no agents will be created/destroyed at runtime!
class MultiAgentEnv(gym.Env):
metadata = {
'render.modes' : ['human', 'rgb_array']
}
def __init__(self, world, reset_callback=None, reward_callback=None,
observation_callback=None, info_callback=None,
done_callback=None, shared_viewer=True):
self.world = world
self.agents = self.world.policy_agents
# set required vectorized gym env property
self.n = len(world.policy_agents)
# scenario callbacks
self.reset_callback = reset_callback
self.reward_callback = reward_callback
self.observation_callback = observation_callback
self.info_callback = info_callback
self.done_callback = done_callback
# environment parameters
self.discrete_action_space = True
# if true, action is a number 0...N, otherwise action is a one-hot N-dimensional vector
self.discrete_action_input = False
# if true, even if the action space is continuous, actions will be performed discretely
self.force_discrete_action = world.discrete_action if hasattr(world, 'discrete_action') else False
# if true, every agent has the same reward
self.shared_reward = world.collaborative if hasattr(world, 'collaborative') else False
self.time = 0
# configure spaces
self.action_space = []
self.observation_space = []
for agent in self.agents:
total_action_space = []
# physical action space
if self.discrete_action_space:
u_action_space = spaces.Discrete(world.dim_p * 2 + 1)
else:
u_action_space = spaces.Box(low=-agent.u_range, high=+agent.u_range, shape=(world.dim_p,), dtype=np.float32)
if agent.movable:
total_action_space.append(u_action_space)
# communication action space
if self.discrete_action_space:
c_action_space = spaces.Discrete(world.dim_c)
else:
c_action_space = spaces.Box(low=0.0, high=1.0, shape=(world.dim_c,), dtype=np.float32)
if not agent.silent:
total_action_space.append(c_action_space)
# total action space
if len(total_action_space) > 1:
# all action spaces are discrete, so simplify to MultiDiscrete action space
if all([isinstance(act_space, spaces.Discrete) for act_space in total_action_space]):
act_space = MultiDiscrete([[0, act_space.n - 1] for act_space in total_action_space])
#print(act_space.n)
else:
act_space = spaces.Tuple(total_action_space)
self.action_space.append(act_space)
else:
self.action_space.append(total_action_space[0])
# observation space
obs_dim = len(observation_callback(agent, self.world))
self.observation_space.append(spaces.Box(low=-np.inf, high=+np.inf, shape=(obs_dim,), dtype=np.float32))
agent.action.c = np.zeros(self.world.dim_c)
# rendering
self.shared_viewer = shared_viewer
if self.shared_viewer:
self.viewers = [None]
else:
self.viewers = [None] * self.n
self._reset_render()
def step(self, action_n):
obs_n = []
reward_n = []
done_n = []
info_n = {'n': []}
self.agents = self.world.policy_agents
# set action for each agent
for i, agent in enumerate(self.agents):
self._set_action(action_n[i], agent, self.action_space[i])
# advance world state
self.world.step()
# record observation for each agent
for agent in self.agents:
obs_n.append(self._get_obs(agent))
reward_n.append(self._get_reward(agent))
done_n.append(self._get_done(agent))
info_n['n'].append(self._get_info(agent))
# all agents get total reward in cooperative case
reward = np.sum(reward_n)
if self.shared_reward:
reward_n = [reward] * self.n
return obs_n, reward_n, done_n, info_n
def reset(self):
# reset world
self.reset_callback(self.world)
# reset renderer
self._reset_render()
# record observations for each agent
obs_n = []
self.agents = self.world.policy_agents
for agent in self.agents:
obs_n.append(self._get_obs(agent))
return obs_n
# get info used for benchmarking
def _get_info(self, agent):
if self.info_callback is None:
return {}
return self.info_callback(agent, self.world)
# get observation for a particular agent
def _get_obs(self, agent):
if self.observation_callback is None:
return np.zeros(0)
return self.observation_callback(agent, self.world)
# get dones for a particular agent
# unused right now -- agents are allowed to go beyond the viewing screen
def _get_done(self, agent):
if self.done_callback is None:
return False
return self.done_callback(agent, self.world)
# get reward for a particular agent
def _get_reward(self, agent):
if self.reward_callback is None:
return 0.0
return self.reward_callback(agent, self.world)
# set env action for a particular agent
def _set_action(self, action, agent, action_space, time=None):
agent.action.u = np.zeros(self.world.dim_p)
agent.action.c = np.zeros(self.world.dim_c)
# process action
if isinstance(action_space, MultiDiscrete):
act = []
size = action_space.high - action_space.low + 1
index = 0
for s in size:
act.append(action[index:(index+s)])
index += s
action = act
else:
action = [action]
if agent.movable:
# physical action
if self.discrete_action_input:
agent.action.u = np.zeros(self.world.dim_p)
# process discrete action
if action[0] == 1: agent.action.u[0] = -1.0
if action[0] == 2: agent.action.u[0] = +1.0
if action[0] == 3: agent.action.u[1] = -1.0
if action[0] == 4: agent.action.u[1] = +1.0
else:
if self.force_discrete_action:
d = np.argmax(action[0])
action[0][:] = 0.0
action[0][d] = 1.0
if self.discrete_action_space:
agent.action.u[0] += action[0][1] - action[0][2]
agent.action.u[1] += action[0][3] - action[0][4]
else:
agent.action.u = action[0]
sensitivity = 5.0
if agent.accel is not None:
sensitivity = agent.accel
agent.action.u *= sensitivity
action = action[1:]
if not agent.silent:
# communication action
if self.discrete_action_input:
agent.action.c = np.zeros(self.world.dim_c)
agent.action.c[action[0]] = 1.0
else:
agent.action.c = action[0]
action = action[1:]
# make sure we used all elements of action
assert len(action) == 0
# reset rendering assets
def _reset_render(self):
self.render_geoms = None
self.render_geoms_xform = None
# render environment
def render(self, mode='human'):
if mode == 'human':
alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
message = ''
for agent in self.world.agents:
comm = []
for other in self.world.agents:
if other is agent: continue
if np.all(other.state.c == 0):
word = '_'
else:
word = alphabet[np.argmax(other.state.c)]
message += (other.name + ' to ' + agent.name + ': ' + word + ' ')
print(message)
for i in range(len(self.viewers)):
# create viewers (if necessary)
if self.viewers[i] is None:
# import rendering only if we need it (and don't import for headless machines)
#from gym.envs.classic_control import rendering
from multiagent import rendering
self.viewers[i] = rendering.Viewer(700,700)
# create rendering geometry
if self.render_geoms is None:
# import rendering only if we need it (and don't import for headless machines)
#from gym.envs.classic_control import rendering
from multiagent import rendering
self.render_geoms = []
self.render_geoms_xform = []
for entity in self.world.entities:
geom = rendering.make_circle(entity.size)
xform = rendering.Transform()
if 'agent' in entity.name:
geom.set_color(*entity.color, alpha=0.5)
else:
geom.set_color(*entity.color)
geom.add_attr(xform)
self.render_geoms.append(geom)
self.render_geoms_xform.append(xform)
# add geoms to viewer
for viewer in self.viewers:
viewer.geoms = []
for geom in self.render_geoms:
viewer.add_geom(geom)
results = []
for i in range(len(self.viewers)):
from multiagent import rendering
# update bounds to center around agent
cam_range = 1
if self.shared_viewer:
pos = np.zeros(self.world.dim_p)
else:
pos = self.agents[i].state.p_pos
self.viewers[i].set_bounds(pos[0]-cam_range,pos[0]+cam_range,pos[1]-cam_range,pos[1]+cam_range)
# update geometry positions
for e, entity in enumerate(self.world.entities):
self.render_geoms_xform[e].set_translation(*entity.state.p_pos)
# render to display or array
results.append(self.viewers[i].render(return_rgb_array = mode=='rgb_array'))
return results
# create receptor field locations in local coordinate frame
def _make_receptor_locations(self, agent):
receptor_type = 'polar'
range_min = 0.05 * 2.0
range_max = 1.00
dx = []
# circular receptive field
if receptor_type == 'polar':
for angle in np.linspace(-np.pi, +np.pi, 8, endpoint=False):
for distance in np.linspace(range_min, range_max, 3):
dx.append(distance * np.array([np.cos(angle), np.sin(angle)]))
# add origin
dx.append(np.array([0.0, 0.0]))
# grid receptive field
if receptor_type == 'grid':
for x in np.linspace(-range_max, +range_max, 5):
for y in np.linspace(-range_max, +range_max, 5):
dx.append(np.array([x,y]))
return dx
# vectorized wrapper for a batch of multi-agent environments
# assumes all environments have the same observation and action space
class BatchMultiAgentEnv(gym.Env):
metadata = {
'runtime.vectorized': True,
'render.modes' : ['human', 'rgb_array']
}
def __init__(self, env_batch):
self.env_batch = env_batch
@property
def n(self):
return np.sum([env.n for env in self.env_batch])
@property
def action_space(self):
return self.env_batch[0].action_space
@property
def observation_space(self):
return self.env_batch[0].observation_space
def step(self, action_n, time):
obs_n = []
reward_n = []
done_n = []
info_n = {'n': []}
i = 0
for env in self.env_batch:
obs, reward, done, _ = env.step(action_n[i:(i+env.n)], time)
i += env.n
obs_n += obs
# reward = [r / len(self.env_batch) for r in reward]
reward_n += reward
done_n += done
return obs_n, reward_n, done_n, info_n
def reset(self):
obs_n = []
for env in self.env_batch:
obs_n += env.reset()
return obs_n
# render environment
def render(self, mode='human', close=True):
results_n = []
for env in self.env_batch:
results_n += env.render(mode, close)
return results_n
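In the configuration used here (discrete_action_space=True, discrete_action_input=False), _set_action receives a 5-dimensional vector for the physical part of each action: slot 0 is the no-move slot, slots 1 and 2 push along x in opposite directions, slots 3 and 4 do the same along y, and the net control is scaled by sensitivity (agent.accel, or 5.0 by default). A tiny numeric sketch of just that mapping (the action values are made up):

import numpy as np

a = np.array([0.0, 0.7, 0.1, 0.0, 0.2])     # one agent's 5-dim physical action
u = np.array([a[1] - a[2], a[3] - a[4]])    # net push along x and y
u *= 5.0                                    # default sensitivity
print(u)                                    # [ 3. -1.]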

44
multiagent/multi_discrete.py Normal file

@@ -0,0 +1,44 @@
# An old version of OpenAI Gym's multi_discrete.py. (Was getting affected by Gym updates)
# (https://github.com/openai/gym/blob/1fb81d4e3fb780ccf77fec731287ba07da35eb84/gym/spaces/multi_discrete.py)
import numpy as np
import gym
from gym.spaces import prng
class MultiDiscrete(gym.Space):
"""
- The multi-discrete action space consists of a series of discrete action spaces with different parameters
- It can be adapted to both a Discrete action space or a continuous (Box) action space
- It is useful to represent game controllers or keyboards where each key can be represented as a discrete action space
- It is parametrized by passing an array of arrays containing [min, max] for each discrete action space
where the discrete action space can take any integers from `min` to `max` (both inclusive)
Note: A value of 0 always needs to represent the NOOP action.
e.g. Nintendo Game Controller
- Can be conceptualized as 3 discrete action spaces:
1) Arrow Keys: Discrete 5 - NOOP[0], UP[1], RIGHT[2], DOWN[3], LEFT[4] - params: min: 0, max: 4
2) Button A: Discrete 2 - NOOP[0], Pressed[1] - params: min: 0, max: 1
3) Button B: Discrete 2 - NOOP[0], Pressed[1] - params: min: 0, max: 1
- Can be initialized as
MultiDiscrete([ [0,4], [0,1], [0,1] ])
"""
def __init__(self, array_of_param_array):
self.low = np.array([x[0] for x in array_of_param_array])
self.high = np.array([x[1] for x in array_of_param_array])
self.num_discrete_space = self.low.shape[0]
def sample(self):
""" Returns a array with one sample from each discrete action space """
# For each row: round(random .* (max - min) + min, 0)
random_array = prng.np_random.rand(self.num_discrete_space)
return [int(x) for x in np.floor(np.multiply((self.high - self.low + 1.), random_array) + self.low)]
def contains(self, x):
return len(x) == self.num_discrete_space and (np.array(x) >= self.low).all() and (np.array(x) <= self.high).all()
@property
def shape(self):
return self.num_discrete_space
def __repr__(self):
return "MultiDiscrete" + str(self.num_discrete_space)
def __eq__(self, other):
return np.array_equal(self.low, other.low) and np.array_equal(self.high, other.high)
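For the scenarios in this commit, an agent that both moves and communicates gets a MultiDiscrete over [[0, 4], [0, dim_c - 1]] (five movement choices plus the communication choices). A short sketch, assuming the pinned gym==0.10.5 from the notebook so that gym.spaces.prng is still importable:

from multiagent.multi_discrete import MultiDiscrete

space = MultiDiscrete([[0, 4], [0, 1]])     # movement [0..4], communication [0..1]
print(space.shape)                          # 2 sub-spaces
print(space.contains([3, 1]))               # True
print(space.contains([5, 0]))               # False -- 5 exceeds the movement range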

52
multiagent/policy.py Normal file

@@ -0,0 +1,52 @@
import numpy as np
from pyglet.window import key
# individual agent policy
class Policy(object):
def __init__(self):
pass
def action(self, obs):
raise NotImplementedError()
# interactive policy based on keyboard input
# hard-coded to deal only with movement, not communication
class InteractivePolicy(Policy):
def __init__(self, env, agent_index):
super(InteractivePolicy, self).__init__()
self.env = env
# hard-coded keyboard events
self.move = [False for i in range(4)]
self.comm = [False for i in range(env.world.dim_c)]
# register keyboard events with this environment's window
env.viewers[agent_index].window.on_key_press = self.key_press
env.viewers[agent_index].window.on_key_release = self.key_release
def action(self, obs):
# ignore observation and just act based on keyboard events
if self.env.discrete_action_input:
u = 0
if self.move[0]: u = 1
if self.move[1]: u = 2
if self.move[2]: u = 4
if self.move[3]: u = 3
else:
u = np.zeros(5) # 5-d because of no-move action
if self.move[0]: u[1] += 1.0
if self.move[1]: u[2] += 1.0
if self.move[3]: u[3] += 1.0
if self.move[2]: u[4] += 1.0
if True not in self.move:
u[0] += 1.0
return np.concatenate([u, np.zeros(self.env.world.dim_c)])
# keyboard event callbacks
def key_press(self, k, mod):
if k==key.LEFT: self.move[0] = True
if k==key.RIGHT: self.move[1] = True
if k==key.UP: self.move[2] = True
if k==key.DOWN: self.move[3] = True
def key_release(self, k, mod):
if k==key.LEFT: self.move[0] = False
if k==key.RIGHT: self.move[1] = False
if k==key.UP: self.move[2] = False
if k==key.DOWN: self.move[3] = False

345
multiagent/rendering.py Normal file

@@ -0,0 +1,345 @@
"""
2D rendering framework
"""
from __future__ import division
import os
import six
import sys
if "Apple" in sys.version:
if 'DYLD_FALLBACK_LIBRARY_PATH' in os.environ:
os.environ['DYLD_FALLBACK_LIBRARY_PATH'] += ':/usr/lib'
# (JDS 2016/04/15): avoid bug on Anaconda 2.3.0 / Yosemite
from gym.utils import reraise
from gym import error
try:
import pyglet
except ImportError as e:
reraise(suffix="HINT: you can install pyglet directly via 'pip install pyglet'. But if you really just want to install all Gym dependencies and not have to think about it, 'pip install -e .[all]' or 'pip install gym[all]' will do it.")
try:
from pyglet.gl import *
except ImportError as e:
reraise(prefix="Error occurred while running `from pyglet.gl import *`",suffix="HINT: make sure you have OpenGL installed. On Ubuntu, you can run 'apt-get install python-opengl'. If you're running on a server, you may need a virtual frame buffer; something like this should work: 'xvfb-run -s \"-screen 0 1400x900x24\" python <your_script.py>'")
import math
import numpy as np
RAD2DEG = 57.29577951308232
def get_display(spec):
"""Convert a display specification (such as :0) into an actual Display
object.
Pyglet only supports multiple Displays on Linux.
"""
if spec is None:
return None
elif isinstance(spec, six.string_types):
return pyglet.canvas.Display(spec)
else:
raise error.Error('Invalid display specification: {}. (Must be a string like :0 or None.)'.format(spec))
class Viewer(object):
def __init__(self, width, height, display=None):
display = get_display(display)
self.width = width
self.height = height
self.window = pyglet.window.Window(width=width, height=height, display=display)
self.window.on_close = self.window_closed_by_user
self.geoms = []
self.onetime_geoms = []
self.transform = Transform()
glEnable(GL_BLEND)
# glEnable(GL_MULTISAMPLE)
glEnable(GL_LINE_SMOOTH)
# glHint(GL_LINE_SMOOTH_HINT, GL_DONT_CARE)
glHint(GL_LINE_SMOOTH_HINT, GL_NICEST)
glLineWidth(2.0)
glBlendFunc(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA)
def close(self):
self.window.close()
def window_closed_by_user(self):
self.close()
def set_bounds(self, left, right, bottom, top):
assert right > left and top > bottom
scalex = self.width/(right-left)
scaley = self.height/(top-bottom)
self.transform = Transform(
translation=(-left*scalex, -bottom*scaley),
scale=(scalex, scaley))
def add_geom(self, geom):
self.geoms.append(geom)
def add_onetime(self, geom):
self.onetime_geoms.append(geom)
def render(self, return_rgb_array=False):
glClearColor(1,1,1,1)
self.window.clear()
self.window.switch_to()
self.window.dispatch_events()
self.transform.enable()
for geom in self.geoms:
geom.render()
for geom in self.onetime_geoms:
geom.render()
self.transform.disable()
arr = None
if return_rgb_array:
buffer = pyglet.image.get_buffer_manager().get_color_buffer()
image_data = buffer.get_image_data()
arr = np.fromstring(image_data.data, dtype=np.uint8, sep='')
# In https://github.com/openai/gym-http-api/issues/2, we
# discovered that someone using Xmonad on Arch was having
# a window of size 598 x 398, though a 600 x 400 window
# was requested. (Guess Xmonad was preserving a pixel for
# the boundary.) So we use the buffer height/width rather
# than the requested one.
arr = arr.reshape(buffer.height, buffer.width, 4)
arr = arr[::-1,:,0:3]
self.window.flip()
self.onetime_geoms = []
return arr
# Convenience
def draw_circle(self, radius=10, res=30, filled=True, **attrs):
geom = make_circle(radius=radius, res=res, filled=filled)
_add_attrs(geom, attrs)
self.add_onetime(geom)
return geom
def draw_polygon(self, v, filled=True, **attrs):
geom = make_polygon(v=v, filled=filled)
_add_attrs(geom, attrs)
self.add_onetime(geom)
return geom
def draw_polyline(self, v, **attrs):
geom = make_polyline(v=v)
_add_attrs(geom, attrs)
self.add_onetime(geom)
return geom
def draw_line(self, start, end, **attrs):
geom = Line(start, end)
_add_attrs(geom, attrs)
self.add_onetime(geom)
return geom
def get_array(self):
self.window.flip()
image_data = pyglet.image.get_buffer_manager().get_color_buffer().get_image_data()
self.window.flip()
arr = np.fromstring(image_data.data, dtype=np.uint8, sep='')
arr = arr.reshape(self.height, self.width, 4)
return arr[::-1,:,0:3]
def _add_attrs(geom, attrs):
if "color" in attrs:
geom.set_color(*attrs["color"])
if "linewidth" in attrs:
geom.set_linewidth(attrs["linewidth"])
class Geom(object):
def __init__(self):
self._color=Color((0, 0, 0, 1.0))
self.attrs = [self._color]
def render(self):
for attr in reversed(self.attrs):
attr.enable()
self.render1()
for attr in self.attrs:
attr.disable()
def render1(self):
raise NotImplementedError
def add_attr(self, attr):
self.attrs.append(attr)
def set_color(self, r, g, b, alpha=1):
self._color.vec4 = (r, g, b, alpha)
class Attr(object):
def enable(self):
raise NotImplementedError
def disable(self):
pass
class Transform(Attr):
def __init__(self, translation=(0.0, 0.0), rotation=0.0, scale=(1,1)):
self.set_translation(*translation)
self.set_rotation(rotation)
self.set_scale(*scale)
def enable(self):
glPushMatrix()
glTranslatef(self.translation[0], self.translation[1], 0) # translate to GL loc point
glRotatef(RAD2DEG * self.rotation, 0, 0, 1.0)
glScalef(self.scale[0], self.scale[1], 1)
def disable(self):
glPopMatrix()
def set_translation(self, newx, newy):
self.translation = (float(newx), float(newy))
def set_rotation(self, new):
self.rotation = float(new)
def set_scale(self, newx, newy):
self.scale = (float(newx), float(newy))
class Color(Attr):
def __init__(self, vec4):
self.vec4 = vec4
def enable(self):
glColor4f(*self.vec4)
class LineStyle(Attr):
def __init__(self, style):
self.style = style
def enable(self):
glEnable(GL_LINE_STIPPLE)
glLineStipple(1, self.style)
def disable(self):
glDisable(GL_LINE_STIPPLE)
class LineWidth(Attr):
def __init__(self, stroke):
self.stroke = stroke
def enable(self):
glLineWidth(self.stroke)
class Point(Geom):
def __init__(self):
Geom.__init__(self)
def render1(self):
glBegin(GL_POINTS) # draw point
glVertex3f(0.0, 0.0, 0.0)
glEnd()
class FilledPolygon(Geom):
def __init__(self, v):
Geom.__init__(self)
self.v = v
def render1(self):
if len(self.v) == 4 : glBegin(GL_QUADS)
elif len(self.v) > 4 : glBegin(GL_POLYGON)
else: glBegin(GL_TRIANGLES)
for p in self.v:
glVertex3f(p[0], p[1],0) # draw each vertex
glEnd()
color = (self._color.vec4[0] * 0.5, self._color.vec4[1] * 0.5, self._color.vec4[2] * 0.5, self._color.vec4[3] * 0.5)
glColor4f(*color)
glBegin(GL_LINE_LOOP)
for p in self.v:
glVertex3f(p[0], p[1],0) # draw each vertex
glEnd()
def make_circle(radius=10, res=30, filled=True):
points = []
for i in range(res):
ang = 2*math.pi*i / res
points.append((math.cos(ang)*radius, math.sin(ang)*radius))
if filled:
return FilledPolygon(points)
else:
return PolyLine(points, True)
def make_polygon(v, filled=True):
if filled: return FilledPolygon(v)
else: return PolyLine(v, True)
def make_polyline(v):
return PolyLine(v, False)
def make_capsule(length, width):
l, r, t, b = 0, length, width/2, -width/2
box = make_polygon([(l,b), (l,t), (r,t), (r,b)])
circ0 = make_circle(width/2)
circ1 = make_circle(width/2)
circ1.add_attr(Transform(translation=(length, 0)))
geom = Compound([box, circ0, circ1])
return geom
class Compound(Geom):
def __init__(self, gs):
Geom.__init__(self)
self.gs = gs
for g in self.gs:
g.attrs = [a for a in g.attrs if not isinstance(a, Color)]
def render1(self):
for g in self.gs:
g.render()
class PolyLine(Geom):
def __init__(self, v, close):
Geom.__init__(self)
self.v = v
self.close = close
self.linewidth = LineWidth(1)
self.add_attr(self.linewidth)
def render1(self):
glBegin(GL_LINE_LOOP if self.close else GL_LINE_STRIP)
for p in self.v:
glVertex3f(p[0], p[1],0) # draw each vertex
glEnd()
def set_linewidth(self, x):
self.linewidth.stroke = x
class Line(Geom):
def __init__(self, start=(0.0, 0.0), end=(0.0, 0.0)):
Geom.__init__(self)
self.start = start
self.end = end
self.linewidth = LineWidth(1)
self.add_attr(self.linewidth)
def render1(self):
glBegin(GL_LINES)
glVertex2f(*self.start)
glVertex2f(*self.end)
glEnd()
class Image(Geom):
def __init__(self, fname, width, height):
Geom.__init__(self)
self.width = width
self.height = height
img = pyglet.image.load(fname)
self.img = img
self.flip = False
def render1(self):
self.img.blit(-self.width/2, -self.height/2, width=self.width, height=self.height)
# ================================================================
class SimpleImageViewer(object):
def __init__(self, display=None):
self.window = None
self.isopen = False
self.display = display
def imshow(self, arr):
if self.window is None:
height, width, channels = arr.shape
self.window = pyglet.window.Window(width=width, height=height, display=self.display)
self.width = width
self.height = height
self.isopen = True
assert arr.shape == (self.height, self.width, 3), "You passed in an image with the wrong shape"
image = pyglet.image.ImageData(self.width, self.height, 'RGB', arr.tobytes(), pitch=self.width * -3)
self.window.clear()
self.window.switch_to()
self.window.dispatch_events()
image.blit(0,0)
self.window.flip()
def close(self):
if self.isopen:
self.window.close()
self.isopen = False
def __del__(self):
self.close()
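Taken together, the classes above form a small drawing layer on top of pyglet/OpenGL: each Geom owns a list of Attr objects (Color, Transform, LineWidth, LineStyle) that are enabled around its render1() call, the make_* helpers build common shapes, and the draw_* convenience methods register one-frame geoms that are cleared after every render. The sketch below shows the intended call pattern; it assumes the Viewer constructor, add_geom() and render(return_rgb_array=...) from the earlier, not-shown part of this file, and that the module is importable as multiagent.rendering (an assumption about the file path).

# Usage sketch only; Viewer, add_geom and render are defined earlier in this file.
from multiagent.rendering import Viewer, Transform, make_circle  # assumed module path

viewer = Viewer(600, 400)

# Persistent geom: a circle whose position is driven through a Transform attribute.
circle_transform = Transform(translation=(300, 200))
circle = make_circle(radius=30)
circle.set_color(0.8, 0.3, 0.3)
circle.add_attr(circle_transform)
viewer.add_geom(circle)

# One-frame geom: added to onetime_geoms and dropped after the next render().
viewer.draw_line((0, 0), (600, 400), color=(0, 0, 0))

frame = viewer.render(return_rgb_array=True)  # (height, width, 3) uint8 RGB array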

10
multiagent/scenario.py Normal file
View File

@ -0,0 +1,10 @@
import numpy as np
# defines scenario upon which the world is built
class BaseScenario(object):
# create elements of the world
def make_world(self):
raise NotImplementedError()
# create initial conditions of the world
def reset_world(self, world):
raise NotImplementedError()
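BaseScenario is the two-method contract every scenario module has to satisfy: make_world() builds and returns a populated World, and reset_world(world) re-randomizes its state in place (it is also called once from make_world to set the initial conditions). A toy sketch of the contract follows; the full, real implementation is the Scenario class in a later file of this commit.

# Toy sketch of the contract only; see the concrete Scenario class further below.
import numpy as np
from multiagent.core import World, Agent

class MinimalScenario(BaseScenario):
    def make_world(self):
        world = World()
        world.agents = [Agent()]
        self.reset_world(world)
        return world

    def reset_world(self, world):
        for agent in world.agents:
            agent.state.p_pos = np.random.uniform(-1, +1, world.dim_p)
            agent.state.p_vel = np.zeros(world.dim_p)
            agent.state.c = np.zeros(world.dim_c)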

View File

@ -0,0 +1,7 @@
import importlib.util
import os.path as osp
def load(name):
    # imp.load_source is deprecated (imp was removed in Python 3.12); use the importlib equivalent
    pathname = osp.join(osp.dirname(__file__), name)
    spec = importlib.util.spec_from_file_location(osp.splitext(name)[0], pathname)
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module
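load() executes a scenario file that lives next to this module and hands back the resulting module object, from which the caller picks out the Scenario class. A hedged usage sketch follows; the multiagent.scenarios import path and the MultiAgentEnv wiring are assumptions, since neither the path of this file nor the environment class appears in this part of the diff.

# Sketch of typical use; module path and MultiAgentEnv signature are assumptions.
import multiagent.scenarios as scenarios
from multiagent.environment import MultiAgentEnv

scenario = scenarios.load("simple_tag.py").Scenario()   # example scenario file name
world = scenario.make_world()
env = MultiAgentEnv(world, scenario.reset_world, scenario.reward, scenario.observation)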

View File

@ -0,0 +1,139 @@
import numpy as np
from multiagent.core import World, Agent, Landmark
from multiagent.scenario import BaseScenario
class Scenario(BaseScenario):
def make_world(self):
world = World()
# set any world properties first
world.dim_c = 2
num_agents = 3
world.num_agents = num_agents
num_adversaries = 1
num_landmarks = num_agents - 1
# add agents
world.agents = [Agent() for i in range(num_agents)]
for i, agent in enumerate(world.agents):
agent.name = 'agent %d' % i
agent.collide = False
agent.silent = True
agent.adversary = True if i < num_adversaries else False
agent.size = 0.15
# add landmarks
world.landmarks = [Landmark() for i in range(num_landmarks)]
for i, landmark in enumerate(world.landmarks):
landmark.name = 'landmark %d' % i
landmark.collide = False
landmark.movable = False
landmark.size = 0.08
# make initial conditions
self.reset_world(world)
return world
def reset_world(self, world):
# random properties for agents
world.agents[0].color = np.array([0.85, 0.35, 0.35])
for i in range(1, world.num_agents):
world.agents[i].color = np.array([0.35, 0.35, 0.85])
# random properties for landmarks
for i, landmark in enumerate(world.landmarks):
landmark.color = np.array([0.15, 0.15, 0.15])
# set goal landmark
goal = np.random.choice(world.landmarks)
goal.color = np.array([0.15, 0.65, 0.15])
for agent in world.agents:
agent.goal_a = goal
# set random initial states
for agent in world.agents:
agent.state.p_pos = np.random.uniform(-1, +1, world.dim_p)
agent.state.p_vel = np.zeros(world.dim_p)
agent.state.c = np.zeros(world.dim_c)
for i, landmark in enumerate(world.landmarks):
landmark.state.p_pos = np.random.uniform(-1, +1, world.dim_p)
landmark.state.p_vel = np.zeros(world.dim_p)
def benchmark_data(self, agent, world):
# returns data for benchmarking purposes
if agent.adversary:
return np.sum(np.square(agent.state.p_pos - agent.goal_a.state.p_pos))
else:
dists = []
for l in world.landmarks:
dists.append(np.sum(np.square(agent.state.p_pos - l.state.p_pos)))
dists.append(np.sum(np.square(agent.state.p_pos - agent.goal_a.state.p_pos)))
return tuple(dists)
# return all agents that are not adversaries
def good_agents(self, world):
return [agent for agent in world.agents if not agent.adversary]
# return all adversarial agents
def adversaries(self, world):
return [agent for agent in world.agents if agent.adversary]
def reward(self, agent, world):
# Agents are rewarded based on minimum agent distance to each landmark
return self.adversary_reward(agent, world) if agent.adversary else self.agent_reward(agent, world)
def agent_reward(self, agent, world):
# Rewarded based on how close any good agent is to the goal landmark, and how far the adversary is from it
shaped_reward = True
shaped_adv_reward = True
# Calculate negative reward for adversary
adversary_agents = self.adversaries(world)
if shaped_adv_reward: # distance-based adversary reward
adv_rew = sum([np.sqrt(np.sum(np.square(a.state.p_pos - a.goal_a.state.p_pos))) for a in adversary_agents])
else: # proximity-based adversary reward (binary)
adv_rew = 0
for a in adversary_agents:
if np.sqrt(np.sum(np.square(a.state.p_pos - a.goal_a.state.p_pos))) < 2 * a.goal_a.size:
adv_rew -= 5
# Calculate positive reward for agents
good_agents = self.good_agents(world)
if shaped_reward: # distance-based agent reward
pos_rew = -min(
[np.sqrt(np.sum(np.square(a.state.p_pos - a.goal_a.state.p_pos))) for a in good_agents])
else: # proximity-based agent reward (binary)
pos_rew = 0
if min([np.sqrt(np.sum(np.square(a.state.p_pos - a.goal_a.state.p_pos))) for a in good_agents]) \
< 2 * agent.goal_a.size:
pos_rew += 5
pos_rew -= min(
[np.sqrt(np.sum(np.square(a.state.p_pos - a.goal_a.state.p_pos))) for a in good_agents])
return pos_rew + adv_rew
def adversary_reward(self, agent, world):
# Rewarded based on proximity to the goal landmark
shaped_reward = True
if shaped_reward: # distance-based reward
return -np.sum(np.square(agent.state.p_pos - agent.goal_a.state.p_pos))
else: # proximity-based reward (binary)
adv_rew = 0
if np.sqrt(np.sum(np.square(agent.state.p_pos - agent.goal_a.state.p_pos))) < 2 * agent.goal_a.size:
adv_rew += 5
return adv_rew
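For intuition about the shaped rewards: if the adversary sits at distance 0.5 from the goal landmark and the closest good agent sits at distance 0.2, each good agent gets pos_rew + adv_rew = -0.2 + 0.5 = 0.3, while the adversary gets the negative squared distance -(0.5^2) = -0.25; moving a good agent toward the goal or pushing the adversary away from it both increase the good agents' reward.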
def observation(self, agent, world):
# get positions of all entities in this agent's reference frame
entity_pos = []
for entity in world.landmarks:
entity_pos.append(entity.state.p_pos - agent.state.p_pos)
# entity colors
entity_color = []
for entity in world.landmarks:
entity_color.append(entity.color)
# communication of all other agents
other_pos = []
for other in world.agents:
if other is agent: continue
other_pos.append(other.state.p_pos - agent.state.p_pos)
if not agent.adversary:
return np.concatenate([agent.goal_a.state.p_pos - agent.state.p_pos] + entity_pos + other_pos)
else:
return np.concatenate(entity_pos + other_pos)
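Because the observation is just a flat concatenation of relative positions, its length is what fixes the per-agent obs_shape for the networks: with 3 agents, 2 landmarks and dim_p = 2, a good agent observes the goal offset (2) + landmark offsets (2 × 2) + other-agent offsets (2 × 2) = 10 values, while the adversary, which is not told the goal, observes 4 + 4 = 8. A quick sanity-check sketch (it assumes this scenario module is importable; its file name is not visible in this part of the diff):

# Quick shape check for the observation layout defined above.
scenario = Scenario()
world = scenario.make_world()
for agent in world.agents:
    obs = scenario.observation(agent, world)
    role = 'adversary' if agent.adversary else 'good'
    print(agent.name, role, obs.shape)   # expect (8,) for the adversary, (10,) for good agents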

86
runner.py Normal file
View File

@ -0,0 +1,86 @@
from tqdm import tqdm
from agent import Agent
from common.replay_buffer import Buffer
import torch
import os
import numpy as np
import matplotlib.pyplot as plt
class Runner:
def __init__(self, args, env):
self.args = args
self.noise = args.noise_rate
self.epsilon = args.epsilon
self.episode_limit = args.max_episode_len
self.env = env
self.agents = self._init_agents()
self.buffer = Buffer(args)
self.save_path = self.args.save_dir + '/' + self.args.scenario_name
if not os.path.exists(self.save_path):
os.makedirs(self.save_path)
def _init_agents(self):
agents = []
for i in range(self.args.n_agents):
agent = Agent(i, self.args)
agents.append(agent)
return agents
def run(self):
returns = []
for time_step in tqdm(range(self.args.time_steps)):
# reset the environment
if time_step % self.episode_limit == 0:
s = self.env.reset()
u = []
actions = []
with torch.no_grad():
for agent_id, agent in enumerate(self.agents):
action = agent.select_action(s[agent_id], self.noise, self.epsilon)
u.append(action)
actions.append(action)
for i in range(self.args.n_agents, self.args.n_players):
actions.append([0, np.random.rand() * 2 - 1, 0, np.random.rand() * 2 - 1, 0])
s_next, r, done, info = self.env.step(actions)
self.buffer.store_episode(s[:self.args.n_agents], u, r[:self.args.n_agents], s_next[:self.args.n_agents])
s = s_next
if self.buffer.current_size >= self.args.batch_size:
transitions = self.buffer.sample(self.args.batch_size)
for agent in self.agents:
other_agents = self.agents.copy()
other_agents.remove(agent)
agent.learn(transitions, other_agents)
if time_step > 0 and time_step % self.args.evaluate_rate == 0:
returns.append(self.evaluate())
plt.figure()
plt.plot(range(len(returns)), returns)
plt.xlabel('episode * ' + str(self.args.evaluate_rate / self.episode_limit))
plt.ylabel('average returns')
plt.savefig(self.save_path + '/plt.png', format='png')
plt.close()  # close the figure so repeated evaluations do not accumulate open figures
self.noise = max(0.05, self.noise - 0.0000005)
self.epsilon = max(0.05, self.epsilon - 0.0000005)
np.save(self.save_path + '/returns', returns)  # np.save writes NumPy's .npy format, so the file is returns.npy
def evaluate(self):
returns = []
for episode in range(self.args.evaluate_episodes):
# reset the environment
s = self.env.reset()
rewards = 0
for time_step in range(self.args.evaluate_episode_len):
# if episode > self.args.evaluate_episode_len - 50:
#     self.env.render()
actions = []
with torch.no_grad():
for agent_id, agent in enumerate(self.agents):
action = agent.select_action(s[agent_id], 0, 0)
actions.append(action)
for i in range(self.args.n_agents, self.args.n_players):
actions.append([0, np.random.rand() * 2 - 1, 0, np.random.rand() * 2 - 1, 0])
s_next, r, done, info = self.env.step(actions)
rewards += r[0]
s = s_next
returns.append(rewards)
if episode % 1000 == 0:
    print('Return is', rewards)
return sum(returns) / self.args.evaluate_episodes
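Runner ties the pieces together: it resets the environment every max_episode_len steps, lets the learning agents (indices below n_agents) act with exploration noise while the remaining n_players - n_agents players act randomly, stores transitions, trains every agent against the others once the buffer holds a batch, and periodically evaluates and plots average returns. A hedged sketch of the entry point that would drive it is below; make_env and the n_agents/n_players bookkeeping live in files not shown in this part of the diff, so they are assumptions here.

# Hypothetical main.py-style driver; common.utils.make_env is assumed to build the
# particle env and fill args.n_agents / args.n_players / obs and action shapes.
from common.arguments import get_args
from common.utils import make_env   # assumption: not shown in this section
from runner import Runner

if __name__ == '__main__':
    args = get_args()
    env, args = make_env(args)
    runner = Runner(args, env)
    if args.evaluate:
        print('Average returns:', runner.evaluate())
    else:
        runner.run()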