Commit 6a5cfb13 authored by dennismalmgren

Updated to run eval

parent 9258ccd2
@@ -290,7 +290,7 @@ def get_config():
     parser.add_argument("--log_interval", type=int,
                         default=1, help="time duration between contiunous twice log printing.")
     parser.add_argument("--model_dir", type=str,
-                        default=None, help="by default None. set the path to pretrained model.")
+                        default="/mnt/f/repos/results/scontrol/happo/check/9967/run1/models", help="by default None. set the path to pretrained model.")
     # eval parameters
     parser.add_argument("--use_eval", action='store_true',
@@ -298,7 +298,7 @@ def get_config():
     parser.add_argument("--eval_interval", type=int,
                         default=10, help="time duration between contiunous twice evaluation progress.")
     parser.add_argument("--eval_episodes", type=int,
-                        default=1, help="number of episodes of a single evaluation.")
+                        default=2, help="number of episodes of a single evaluation.")
     # render parameters
     parser.add_argument("--save_gifs", action='store_true',
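
The two overridden defaults are what the evaluation-only entry point below relies on. A minimal sketch of how they are consumed, assuming configs.config.get_config is importable from the repo root (the output comments are illustrative, not captured output):

from configs.config import get_config

parser = get_config()
all_args = parser.parse_known_args([])[0]   # same parse_known_args pattern as parse_args() below
print(all_args.model_dir)       # directory the Runner restores pretrained weights from
print(all_args.eval_episodes)   # episodes rolled out by each runner.eval() call
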
import argparse
import datetime
import logging
import sys
import os
sys.path.append("../")
from pathlib import Path
import random
import time
from configs.config import get_config
import gym
from gym.wrappers.time_limit import TimeLimit
from gym.wrappers.flatten_observation import FlattenObservation
import gym.spaces as spaces
import jax
import numpy as np
import torch
import torch.optim as optim
import torch.nn as nn
from observation_normalization_wrapper import ObservationNormalizationWrapper
from torch.utils.tensorboard import SummaryWriter
import setproctitle
from runners.separated.sensor_runner import SensorRunner as Runner
from envs.env_wrappers import ShareDummyVecEnv, ShareSubprocVecEnv
from sensor_control_env import SensorControlEnv
from discretized_action_wrapper import DiscretizedActionWrapper
from flatten_observation_wrapper import FlattenObservationWrapper
from red_team_behaviour_wrapper import RedTeamBehaviourWrapper
from scalarized_reward_wrapper import ScalarizedRewardWrapper
from multiagent_wrapper import MultiagentWrapper
from behaviour import behaviour_choices
import constants
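
# Evaluation entry point: main() builds the scontrol environment stack, constructs a
# Runner that restores pretrained weights from all_args.model_dir, and rolls out
# all_args.eval_episodes evaluation episodes via runner.eval() (see the comments in main()).
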
def make_train_env(all_args):
def get_env_fn(rank):
def init_env():
if all_args.env_name == "scontrol":
max_team_members = 7
env_config = {
"max_team_size": max_team_members,
"scenario_id": 0,
'visualise_delay': 0,
}
#Weighting of objectives
reward_wrapper_config = {
"share_detected": 1.0,
"share_delay_detected": 1.0,
"share_radar_actions": 0.0,
"position_error": 0.0,
"position_delay_error": 0.0
}
red_behaviour_name = all_args.red
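                # Wrapper stack (roles inferred from the wrapper names; see the corresponding modules):
                #   ScalarizedRewardWrapper         - collapses the multi-objective reward with the weights above
                #   ObservationNormalizationWrapper - normalizes observations
                #   FlattenObservationWrapper       - flattens the structured observation
                #   RedTeamBehaviourWrapper         - drives the red team with the scripted behaviour all_args.red
                #   DiscretizedActionWrapper        - exposes a discretized action space
                #   TimeLimit                       - truncates episodes after constants.SCENARIO_LENGTH steps
                #   MultiagentWrapper               - adapts the env to the per-agent interface the Runner expects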
env = SensorControlEnv(env_config)
env = ScalarizedRewardWrapper(env, reward_wrapper_config)
env = ObservationNormalizationWrapper(env)
env = FlattenObservationWrapper(env)
env = RedTeamBehaviourWrapper(env, red_behaviour_name)
env = DiscretizedActionWrapper(env)
env = TimeLimit(env, max_episode_steps=constants.SCENARIO_LENGTH)
env = MultiagentWrapper(env)
else:
print("Can not support the " + all_args.env_name + "environment.")
raise NotImplementedError
return env
return init_env
if all_args.n_rollout_threads == 1:
return ShareDummyVecEnv([get_env_fn(0)])
else:
return ShareSubprocVecEnv([get_env_fn(i) for i in range(all_args.n_rollout_threads)])
def make_eval_env(all_args):
def get_env_fn(rank):
def init_env():
if all_args.env_name == "scontrol":
max_team_members = 7
env_config = {
"max_team_size": max_team_members,
"scenario_id": 0,
'visualise_delay': 0,
}
reward_wrapper_config = {
"share_detected": 1.0,
"share_delay_detected": 1.0,
"share_radar_actions": 0.0,
"position_error": 0.0,
"position_delay_error": 0.0
}
red_behaviour_name = all_args.red
env = SensorControlEnv(env_config)
env = ScalarizedRewardWrapper(env, reward_wrapper_config)
env = ObservationNormalizationWrapper(env)
env = FlattenObservationWrapper(env)
env = RedTeamBehaviourWrapper(env, red_behaviour_name)
env = DiscretizedActionWrapper(env)
env = TimeLimit(env, max_episode_steps=constants.SCENARIO_LENGTH)
env = MultiagentWrapper(env)
else:
print("Can not support the " + all_args.env_name + "environment.")
raise NotImplementedError
return env
return init_env
if all_args.n_eval_rollout_threads == 1:
return ShareDummyVecEnv([get_env_fn(0)])
else:
return ShareSubprocVecEnv([get_env_fn(i) for i in range(all_args.n_eval_rollout_threads)])
def parse_args(args, parser):
parser.add_argument("--red", default="no", choices=behaviour_choices, type=str.lower)
parser.add_argument("--use_single_network", action='store_true', default=False)
all_args = parser.parse_known_args(args)[0]
return all_args
def main(args) -> None:
    # Create argument parser
parser = get_config()
all_args = parse_args(args, parser)
print("all config: ", all_args)
if all_args.seed_specify:
all_args.seed=all_args.runing_id #I really dislike misspelling but I don't want to change the original code.
else:
all_args.seed=np.random.randint(1000,10000)
print("seed is :",all_args.seed)
if all_args.cuda and torch.cuda.is_available():
print("choose to use gpu...")
device = torch.device("cuda:0")
torch.set_num_threads(all_args.n_training_threads)
if all_args.cuda_deterministic:
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True
else:
print("choose to use cpu...")
device = torch.device("cpu")
torch.set_num_threads(all_args.n_training_threads)
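    # Results are written under <parent of this script's directory>/results/<env_name>/
    # <algorithm_name>/<experiment_name>/<seed>/run<N>, where N is one more than the
    # highest existing run directory (run1 if none exist yet).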
run_dir = Path(os.path.split(os.path.dirname(os.path.abspath(__file__)))[
0] + "/results") / all_args.env_name / all_args.algorithm_name / all_args.experiment_name / str(all_args.seed)
if not run_dir.exists():
os.makedirs(str(run_dir))
if not run_dir.exists():
curr_run = 'run1'
else:
exst_run_nums = [int(str(folder.name).split('run')[1]) for folder in run_dir.iterdir() if
str(folder.name).startswith('run')]
if len(exst_run_nums) == 0:
curr_run = 'run1'
else:
curr_run = 'run%i' % (max(exst_run_nums) + 1)
run_dir = run_dir / curr_run
if not run_dir.exists():
os.makedirs(str(run_dir))
setproctitle.setproctitle(
str(all_args.algorithm_name) + "-" + str(all_args.env_name) + "-" + str(all_args.experiment_name) + "@" + str(
all_args.user_name))
# seed
torch.manual_seed(all_args.seed)
torch.cuda.manual_seed_all(all_args.seed)
np.random.seed(all_args.seed)
envs = make_train_env(all_args)
eval_envs = make_eval_env(all_args) if all_args.use_eval else None
num_agents = envs.n_agents
config = {
"all_args": all_args,
"envs": envs,
"eval_envs": eval_envs,
"num_agents": num_agents,
"device": device,
"run_dir": run_dir,
}
runner = Runner(config) #uses config.py/model_dir to load parameters into the model
runner.eval(total_num_steps=1) #uses config.py/eval_episodes to identify how many episodes of eval to run.
# post process
envs.close()
if all_args.use_eval and eval_envs is not envs:
eval_envs.close()
runner.writter.export_scalars_to_json(str(runner.log_dir + '/summary.json'))
runner.writter.close()
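
# Hypothetical invocation (script name and paths are placeholders; flag names are
# inferred from the all_args attributes used above):
#   python eval_scontrol.py --env_name scontrol --algorithm_name happo \
#       --experiment_name check --use_eval --red no --n_rollout_threads 1 \
#       --model_dir <path/to/run1/models> --eval_episodes 2
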
if __name__ == '__main__':
main(sys.argv[1:])
from rl_behaviour import rl_behaviour_choices
from behaviour import behaviour_choices
import constants
from scenario import eval_to_csv
from scenario_io import write_scenario_to_file
from single_agent_wrapper import SingleAgentWrapper
from single_agent_agent import Agent
from runners.separated.base_runner import Runner
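
# The two helpers below build a zero-valued structure with the same pytree layout as a
# sampled reward (jax.tree_util.tree_map over reward_space.sample()), presumably used to
# accumulate per-component rewards during evaluation.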
def prepare_cumulative_evaluation(env: gym.Env):
base_env = env.unwrapped
reward = base_env.reward_space.sample()
zero_reward = jax.tree_util.tree_map(lambda e: 0.0, reward)
return zero_reward
def prepare_cumulative_scalarized_evaluation_dict(env: gym.Env):
reward = env.reward_space.sample()
zero_reward = jax.tree_util.tree_map(lambda e: 0.0, reward)
return zero_reward
def train_on_scenario(red_behaviour_name: str,
config: dict,
logger: logging.Logger,
rng: np.random.Generator):
max_team_members = 7
scenario_id = rng.integers(0, 1000000)
env_config = {
"scenario_id": scenario_id, #ignore this for training.
"use_static": False, #no_greens have to be 4 and no_reds 2 if use_static is set to True.
"no_greens": 7, #rng.integers(2, max_team_members),
"no_reds": 7, #rng.integers(2, max_team_members),
'visualise_delay': 0,
}
reward_wrapper_config = {
"share_detected": 1.0,
"share_delay_detected": 1.0,
"share_radar_actions": 0.0,
"position_error": 0.0,
"position_delay_error": 0.0
}
logger.info("Generate scenario")
env = SensorControlEnv(env_config)
env = ScalarizedRewardWrapper(env, reward_wrapper_config)
env = FlattenObservationWrapper(env, max_sa_size=max_team_members)
env = RedTeamBehaviourWrapper(env, red_behaviour_name) #This one is incredibly slow.
env = ObservationNormalizationWrapper(env, max_sa_size=max_team_members)
env = DiscretizedActionWrapper(env)
#env = SingleAgentWrapper(env, max_team_members=max_team_members, max_sa_size=max_team_members)
env = FlattenObservation(env)
env = TimeLimit(env, max_episode_steps=constants.SCENARIO_LENGTH)
run_name = f"run_{int(time.time())}"
config["env"] = env
config['num_agents'] = max_team_members
runner = Runner(config)
runner.run()
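
    # The commented-out block below appears to be a retained single-agent PPO loop
    # (rollout storage, GAE advantage estimation, clipped surrogate + value loss,
    # periodic checkpointing), kept for reference and superseded by the Runner call above.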
# #Start training
# agent = Agent(env)
# optimizer = optim.Adam(agent.parameters(), lr=learning_rate, eps=1e-5)
# if load_checkpoint_path:
# agent.load_state_dict(torch.load(os.path.join(load_checkpoint_path, "agent.pt")))
# optimizer.load_state_dict(torch.load(os.path.join(load_checkpoint_path, "optimizer.pt")))
# num_steps = constants.SCENARIO_LENGTH #Number of steps in a rollout
# batch_size = num_steps
# num_minibatches = 4
# minibatch_size = batch_size // num_minibatches
# # Storage setup
# obs = torch.zeros((num_steps,) + env.observation_space.shape)
# actions = torch.zeros((num_steps,) + env.action_space.shape)
# logprobs = torch.zeros((num_steps,))
# rewards = torch.zeros((num_steps,))
# terminateds = torch.zeros((num_steps,))
# truncateds = torch.zeros((num_steps,))
# values = torch.zeros((num_steps,))
# num_updates = total_timesteps // batch_size
# # cumulative_evaluation_dict = prepare_cumulative_evaluation(env)
# # cumulative_scalarized_evaluation_dict = prepare_cumulative_scalarized_evaluation_dict(env)
# logger.info("Start training")
# run_name = f"run_{int(time.time())}"
# writer = SummaryWriter(f"runs/{run_name}")
# start = time.time()
# global_step = 0
# for update in range(1, num_updates + 1):
# next_obs, info = env.reset(seed=rng.integers(2**32))
# next_obs = torch.tensor(next_obs)
# next_terminated = torch.zeros(1, dtype=torch.float)
# next_truncated = torch.zeros(1, dtype=torch.float)
# for step in range(0, num_steps):
# global_step += 1
# obs[step] = next_obs
# terminateds[step] = next_terminated
# truncateds[step] = next_truncated
# with torch.no_grad():
# action, logprob, _, value = agent.get_action_and_value(next_obs)
# values[step] = value.flatten()
# actions[step] = action
# logprobs[step] = logprob
# next_obs, reward, terminated, truncated, info = env.step(action)
# rewards[step] = torch.tensor(reward).view(-1)
# next_obs, next_terminated, next_truncated = torch.tensor(next_obs), torch.tensor(float(terminated)), torch.tensor(float(truncated))
# writer.add_scalar("charts/episodic_return", sum(rewards), global_step)
# logger.info(f"global step: {global_step}, episodic return: {sum(rewards)}")
# with torch.no_grad():
# next_value = agent.get_value(next_obs).reshape(1, -1)
# advantages = torch.zeros_like(rewards)
# lastgaelam = 0
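    # # GAE recursion (implemented by the loop below):
    # #   delta_t = r_t + gamma * V_{t+1} * (1 - terminated_{t+1}) - V_t
    # #   A_t     = delta_t + gamma * lambda * (1 - truncated_{t+1}) * A_{t+1}, with lambda hard-coded to 0.95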
# for t in reversed(range(num_steps)):
# if t == num_steps - 1:
# nextnonterminal = 1.0 - next_terminated
# nextnontruncated = 1.0 - next_truncated
# nextvalues = next_value
# else:
# nextnonterminal = 1.0 - terminateds[t + 1]
# nextnontruncated = 1.0 - truncateds[t + 1]
# nextvalues = values[t + 1]
# delta = rewards[t] + gamma * nextvalues * nextnonterminal - values[t]
# advantages[t] = lastgaelam = delta + gamma * 0.95 * nextnontruncated * lastgaelam
# returns = advantages + values
# # flattening
# b_obs = obs.reshape((-1,) + env.observation_space.shape)
# b_logprobs = logprobs.reshape(-1)
# b_actions = actions.reshape((-1,) + env.action_space.shape)
# b_advantages = advantages.reshape(-1)
# b_returns = returns.reshape(-1)
# b_values = values.reshape(-1)
# b_inds = np.arange(batch_size)
# clipfracs = []
# for epoch in range(update_epochs):
# np.random.shuffle(b_inds)
# for start in range(0, batch_size, minibatch_size):
# end = start + minibatch_size
# mbinds = b_inds[start:end]
# _, newlogprob, entropy, newvalue = agent.get_action_and_value(b_obs[mbinds], b_actions.long()[mbinds])
# logratio = newlogprob - b_logprobs[mbinds]
# ratio = logratio.exp()
# mb_advantages = b_advantages[mbinds]
# mb_advantages = (mb_advantages - mb_advantages.mean()) /(mb_advantages.std() + 1e-8)
# # Policy loss
# pg_loss1 = -mb_advantages * ratio
# pg_loss2 = -mb_advantages * torch.clamp(ratio, 1 - 0.2, 1 + 0.2)
# pg_loss = torch.max(pg_loss1, pg_loss2).mean()
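    # # Clipped PPO surrogate: pg_loss = -E[ min(r_t * A_t, clip(r_t, 1 - 0.2, 1 + 0.2) * A_t) ],
    # # computed as the mean of the element-wise max of the two negated terms above.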
# # Value loss
# v_loss = 0.5 * ((newvalue - b_returns[mbinds]) ** 2).mean()
# entropy_loss = entropy.mean()
# loss = pg_loss - 0.01 * entropy_loss + vf_coef * v_loss
# # Do the update
# optimizer.zero_grad()
# loss.backward()
# nn.utils.clip_grad_norm_(agent.parameters(), 0.5) #Might want to tune the 0.5. maybe.
# optimizer.step()
# #y_pred, y_true = b_values, b_returns
# #var_y = np.var(y_true)
# #explained_var = np.nan if var_y == 0 else 1 - np.var(y_pred - y_true) / var_y
# writer.add_scalar("losses/value_loss", v_loss.item(), global_step)
# writer.add_scalar("losses/policy_loss", pg_loss.item(), global_step)
# writer.add_scalar("losses/entropy", entropy_loss.item(), global_step)
# logger.info(f"global step: {global_step}, value_loss: {v_loss.item()}")
# logger.info(f"global step: {global_step}, policy_loss: {pg_loss.item()}")
# logger.info(f"global step: {global_step}, entropy: {entropy_loss.item()}")
# #Checkpoint
# if update % checkpoint_every_k_updates == 0:
# logger.info("Saving checkpoint")
# folder_path = f"checkpoints/{run_name}/{global_step}/"
# os.makedirs(folder_path, exist_ok = True)
# torch.save(agent.state_dict(), os.path.join(folder_path, "agent.pt"))
# torch.save(optimizer.state_dict(), os.path.join(folder_path, "optimizer.pt"))
\ No newline at end of file