From 6a5cfb1332791f7149e8976b43bce009c1df2a75 Mon Sep 17 00:00:00 2001
From: dennismalmgren <dennis@dennismalmgren.se>
Date: Tue, 15 Nov 2022 21:57:32 +0100
Subject: [PATCH] Updated to run eval

---
 configs/config.py     |   4 +-
 multiagent_rl_eval.py | 408 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 410 insertions(+), 2 deletions(-)
 create mode 100644 multiagent_rl_eval.py

diff --git a/configs/config.py b/configs/config.py
index 3143e21e..49a64b81 100644
--- a/configs/config.py
+++ b/configs/config.py
@@ -290,7 +290,7 @@ def get_config():
     parser.add_argument("--log_interval", type=int, default=1,
                         help="time duration between contiunous twice log printing.")
     parser.add_argument("--model_dir", type=str,
-                        default=None, help="by default None. set the path to pretrained model.")
+                        default="/mnt/f/repos/results/scontrol/happo/check/9967/run1/models", help="by default None. set the path to pretrained model.")
 
     # eval parameters
     parser.add_argument("--use_eval", action='store_true',
@@ -298,7 +298,7 @@ def get_config():
     parser.add_argument("--eval_interval", type=int, default=10,
                         help="time duration between contiunous twice evaluation progress.")
     parser.add_argument("--eval_episodes", type=int,
-                        default=1, help="number of episodes of a single evaluation.")
+                        default=2, help="number of episodes of a single evaluation.")
 
     # render parameters
     parser.add_argument("--save_gifs", action='store_true',
diff --git a/multiagent_rl_eval.py b/multiagent_rl_eval.py
new file mode 100644
index 00000000..44730cfc
--- /dev/null
+++ b/multiagent_rl_eval.py
@@ -0,0 +1,408 @@
+import argparse
+import datetime
+import logging
+import sys
+import os
+sys.path.append("../")
+from pathlib import Path
+import random
+import time
+import os
+
+from configs.config import get_config
+import gym
+from gym.wrappers.time_limit import TimeLimit
+from gym.wrappers.flatten_observation import FlattenObservation
+import gym.spaces as spaces
+import jax
+import numpy as np
+import torch
+import torch.optim as optim
+import torch.nn as nn
+from observation_normalization_wrapper import ObservationNormalizationWrapper
+from torch.utils.tensorboard import SummaryWriter
+import setproctitle
+
+from configs.config import get_config
+from runners.separated.sensor_runner import SensorRunner as Runner
+from envs.env_wrappers import ShareDummyVecEnv, ShareSubprocVecEnv
+from sensor_control_env import SensorControlEnv
+from discretized_action_wrapper import DiscretizedActionWrapper
+from flatten_observation_wrapper import FlattenObservationWrapper
+from red_team_behaviour_wrapper import RedTeamBehaviourWrapper
+from scalarized_reward_wrapper import ScalarizedRewardWrapper
+from multiagent_wrapper import MultiagentWrapper
+from behaviour import behaviour_choices
+import constants
+
+def make_train_env(all_args):
+    def get_env_fn(rank):
+        def init_env():
+            if all_args.env_name == "scontrol":
+                max_team_members = 7
+                env_config = {
+                    "max_team_size": max_team_members,
+                    "scenario_id": 0,
+                    'visualise_delay': 0,
+                }
+
+                #Weighting of objectives
+                reward_wrapper_config = {
+                    "share_detected": 1.0,
+                    "share_delay_detected": 1.0,
+                    "share_radar_actions": 0.0,
+                    "position_error": 0.0,
+                    "position_delay_error": 0.0
+                }
+                red_behaviour_name = all_args.red
+                env = SensorControlEnv(env_config)
+                env = ScalarizedRewardWrapper(env, reward_wrapper_config)
+                env = ObservationNormalizationWrapper(env)
+                env = FlattenObservationWrapper(env)
+                env = RedTeamBehaviourWrapper(env, red_behaviour_name)
+                env = DiscretizedActionWrapper(env)
+                env = TimeLimit(env, max_episode_steps=constants.SCENARIO_LENGTH)
+                env = MultiagentWrapper(env)
+            else:
+                print("Cannot support the " + all_args.env_name + " environment.")
+                raise NotImplementedError
+            return env
+        return init_env
+
+    if all_args.n_rollout_threads == 1:
+        return ShareDummyVecEnv([get_env_fn(0)])
+    else:
+        return ShareSubprocVecEnv([get_env_fn(i) for i in range(all_args.n_rollout_threads)])
+
+def make_eval_env(all_args):
+    def get_env_fn(rank):
+        def init_env():
+            if all_args.env_name == "scontrol":
+                max_team_members = 7
+                env_config = {
+                    "max_team_size": max_team_members,
+                    "scenario_id": 0,
+                    'visualise_delay': 0,
+                }
+
+                reward_wrapper_config = {
+                    "share_detected": 1.0,
+                    "share_delay_detected": 1.0,
+                    "share_radar_actions": 0.0,
+                    "position_error": 0.0,
+                    "position_delay_error": 0.0
+                }
+                red_behaviour_name = all_args.red
+                env = SensorControlEnv(env_config)
+                env = ScalarizedRewardWrapper(env, reward_wrapper_config)
+                env = ObservationNormalizationWrapper(env)
+                env = FlattenObservationWrapper(env)
+                env = RedTeamBehaviourWrapper(env, red_behaviour_name)
+                env = DiscretizedActionWrapper(env)
+                env = TimeLimit(env, max_episode_steps=constants.SCENARIO_LENGTH)
+                env = MultiagentWrapper(env)
+            else:
+                print("Cannot support the " + all_args.env_name + " environment.")
+                raise NotImplementedError
+            return env
+        return init_env
+
+    if all_args.n_eval_rollout_threads == 1:
+        return ShareDummyVecEnv([get_env_fn(0)])
+    else:
+        return ShareSubprocVecEnv([get_env_fn(i) for i in range(all_args.n_eval_rollout_threads)])
+
+def parse_args(args, parser):
+    parser.add_argument("--red", default="no", choices=behaviour_choices, type=str.lower)
+    parser.add_argument("--use_single_network", action='store_true', default=False)
+
+    all_args = parser.parse_known_args(args)[0]
+
+    return all_args
+
+
+def main(args) -> None:
+    # Create argument parser
+    parser = get_config()
+    all_args = parse_args(args, parser)
+
+    print("all config: ", all_args)
+
+    if all_args.seed_specify:
+        all_args.seed=all_args.runing_id #I really dislike misspelling but I don't want to change the original code.
+ else: + all_args.seed=np.random.randint(1000,10000) + + print("seed is :",all_args.seed) + + if all_args.cuda and torch.cuda.is_available(): + print("choose to use gpu...") + device = torch.device("cuda:0") + torch.set_num_threads(all_args.n_training_threads) + if all_args.cuda_deterministic: + torch.backends.cudnn.benchmark = False + torch.backends.cudnn.deterministic = True + + else: + print("choose to use cpu...") + device = torch.device("cpu") + torch.set_num_threads(all_args.n_training_threads) + + run_dir = Path(os.path.split(os.path.dirname(os.path.abspath(__file__)))[ + 0] + "/results") / all_args.env_name / all_args.algorithm_name / all_args.experiment_name / str(all_args.seed) + + if not run_dir.exists(): + os.makedirs(str(run_dir)) + + if not run_dir.exists(): + curr_run = 'run1' + else: + exst_run_nums = [int(str(folder.name).split('run')[1]) for folder in run_dir.iterdir() if + str(folder.name).startswith('run')] + if len(exst_run_nums) == 0: + curr_run = 'run1' + else: + curr_run = 'run%i' % (max(exst_run_nums) + 1) + + run_dir = run_dir / curr_run + if not run_dir.exists(): + os.makedirs(str(run_dir)) + + setproctitle.setproctitle( + str(all_args.algorithm_name) + "-" + str(all_args.env_name) + "-" + str(all_args.experiment_name) + "@" + str( + all_args.user_name)) + + # seed + torch.manual_seed(all_args.seed) + torch.cuda.manual_seed_all(all_args.seed) + np.random.seed(all_args.seed) + + envs = make_train_env(all_args) + eval_envs = make_eval_env(all_args) if all_args.use_eval else None + num_agents = envs.n_agents + + config = { + "all_args": all_args, + "envs": envs, + "eval_envs": eval_envs, + "num_agents": num_agents, + "device": device, + "run_dir": run_dir, + } + + runner = Runner(config) #uses config.py/model_dir to load parameters into the model + runner.eval(total_num_steps=1) #uses config.py/eval_episodes to identify how many episodes of eval to run. + + # post process + envs.close() + if all_args.use_eval and eval_envs is not envs: + eval_envs.close() + + runner.writter.export_scalars_to_json(str(runner.log_dir + '/summary.json')) + runner.writter.close() + +if __name__ == '__main__': + main(sys.argv[1:]) + + + + + + + + + + + + + +from rl_behaviour import rl_behaviour_choices +from behaviour import behaviour_choices +import constants +from scenario import eval_to_csv +from scenario_io import write_scenario_to_file + +from single_agent_wrapper import SingleAgentWrapper +from single_agent_agent import Agent +from runners.separated.base_runner import Runner + +def prepare_cumulative_evaluation(env: gym.Env): + base_env = env.unwrapped + reward = base_env.reward_space.sample() + zero_reward = jax.tree_util.tree_map(lambda e: 0.0, reward) + return zero_reward + +def prepare_cumulative_scalarized_evaluation_dict(env: gym.Env): + reward = env.reward_space.sample() + zero_reward = jax.tree_util.tree_map(lambda e: 0.0, reward) + return zero_reward + +def train_on_scenario(red_behaviour_name: str, + config: dict, + logger: logging.Logger, + rng: np.random.Generator): + max_team_members = 7 + scenario_id = rng.integers(0, 1000000) + env_config = { + "scenario_id": scenario_id, #ignore this for training. + "use_static": False, #no_greens have to be 4 and no_reds 2 if use_static is set to True. 
+        "no_greens": 7, #rng.integers(2, max_team_members),
+        "no_reds": 7, #rng.integers(2, max_team_members),
+        'visualise_delay': 0,
+    }
+
+    reward_wrapper_config = {
+        "share_detected": 1.0,
+        "share_delay_detected": 1.0,
+        "share_radar_actions": 0.0,
+        "position_error": 0.0,
+        "position_delay_error": 0.0
+    }
+
+    logger.info("Generate scenario")
+
+    env = SensorControlEnv(env_config)
+    env = ScalarizedRewardWrapper(env, reward_wrapper_config)
+    env = FlattenObservationWrapper(env, max_sa_size=max_team_members)
+    env = RedTeamBehaviourWrapper(env, red_behaviour_name) #This one is incredibly slow.
+    env = ObservationNormalizationWrapper(env, max_sa_size=max_team_members)
+    env = DiscretizedActionWrapper(env)
+    #env = SingleAgentWrapper(env, max_team_members=max_team_members, max_sa_size=max_team_members)
+    env = FlattenObservation(env)
+    env = TimeLimit(env, max_episode_steps=constants.SCENARIO_LENGTH)
+    run_name = f"run_{int(time.time())}"
+
+    config["env"] = env
+    config['num_agents'] = max_team_members
+    runner = Runner(config)
+    runner.run()
+
+#     #Start training
+#     agent = Agent(env)
+#     optimizer = optim.Adam(agent.parameters(), lr=learning_rate, eps=1e-5)
+
+#     if load_checkpoint_path:
+#         agent.load_state_dict(torch.load(os.path.join(load_checkpoint_path, "agent.pt")))
+#         optimizer.load_state_dict(torch.load(os.path.join(load_checkpoint_path, "optimizer.pt")))
+
+#     num_steps = constants.SCENARIO_LENGTH #Number of steps in a rollout
+#     batch_size = num_steps
+#     num_minibatches = 4
+#     minibatch_size = batch_size // num_minibatches
+
+#     # Storage setup
+#     obs = torch.zeros((num_steps,) + env.observation_space.shape)
+#     actions = torch.zeros((num_steps,) + env.action_space.shape)
+#     logprobs = torch.zeros((num_steps,))
+#     rewards = torch.zeros((num_steps,))
+#     terminateds = torch.zeros((num_steps,))
+#     truncateds = torch.zeros((num_steps,))
+#     values = torch.zeros((num_steps,))
+
+#     num_updates = total_timesteps // batch_size
+#     # cumulative_evaluation_dict = prepare_cumulative_evaluation(env)
+#     # cumulative_scalarized_evaluation_dict = prepare_cumulative_scalarized_evaluation_dict(env)
+#     logger.info("Start training")
+#     run_name = f"run_{int(time.time())}"
+#     writer = SummaryWriter(f"runs/{run_name}")
+
+#     start = time.time()
+#     global_step = 0
+#     for update in range(1, num_updates + 1):
+#         next_obs, info = env.reset(seed=rng.integers(2**32))
+#         next_obs = torch.tensor(next_obs)
+#         next_terminated = torch.zeros(1, dtype=torch.float)
+#         next_truncated = torch.zeros(1, dtype=torch.float)
+#         for step in range(0, num_steps):
+#             global_step += 1
+#             obs[step] = next_obs
+#             terminateds[step] = next_terminated
+#             truncateds[step] = next_truncated
+
+#             with torch.no_grad():
+#                 action, logprob, _, value = agent.get_action_and_value(next_obs)
+#                 values[step] = value.flatten()
+#             actions[step] = action
+#             logprobs[step] = logprob
+
+#             next_obs, reward, terminated, truncated, info = env.step(action)
+#             rewards[step] = torch.tensor(reward).view(-1)
+#             next_obs, next_terminated, next_truncated = torch.tensor(next_obs), torch.tensor(float(terminated)), torch.tensor(float(truncated))
+#         writer.add_scalar("charts/episodic_return", sum(rewards), global_step)
+#         logger.info(f"global step: {global_step}, episodic return: {sum(rewards)}")
+
+#         with torch.no_grad():
+#             next_value = agent.get_value(next_obs).reshape(1, -1)
+#             advantages = torch.zeros_like(rewards)
+#             lastgaelam = 0
+#             for t in reversed(range(num_steps)):
+#                 if t == num_steps - 1:
+#                     nextnonterminal = 1.0 - next_terminated
+#                     nextnontruncated = 1.0 - next_truncated
+#                     nextvalues = next_value
+#                 else:
+#                     nextnonterminal = 1.0 - terminateds[t + 1]
+#                     nextnontruncated = 1.0 - truncateds[t + 1]
+#                     nextvalues = values[t + 1]
+#                 delta = rewards[t] + gamma * nextvalues * nextnonterminal - values[t]
+#                 advantages[t] = lastgaelam = delta + gamma * 0.95 * nextnontruncated * lastgaelam
+#             returns = advantages + values
+
+#         # flattening
+#         b_obs = obs.reshape((-1,) + env.observation_space.shape)
+#         b_logprobs = logprobs.reshape(-1)
+#         b_actions = actions.reshape((-1,) + env.action_space.shape)
+#         b_advantages = advantages.reshape(-1)
+#         b_returns = returns.reshape(-1)
+#         b_values = values.reshape(-1)
+
+#         b_inds = np.arange(batch_size)
+#         clipfracs = []
+#         for epoch in range(update_epochs):
+#             np.random.shuffle(b_inds)
+#             for start in range(0, batch_size, minibatch_size):
+#                 end = start + minibatch_size
+#                 mbinds = b_inds[start:end]
+#                 _, newlogprob, entropy, newvalue = agent.get_action_and_value(b_obs[mbinds], b_actions.long()[mbinds])
+#                 logratio = newlogprob - b_logprobs[mbinds]
+#                 ratio = logratio.exp()
+
+#                 mb_advantages = b_advantages[mbinds]
+#                 mb_advantages = (mb_advantages - mb_advantages.mean()) / (mb_advantages.std() + 1e-8)
+
+#                 # Policy loss
+#                 pg_loss1 = -mb_advantages * ratio
+#                 pg_loss2 = -mb_advantages * torch.clamp(ratio, 1 - 0.2, 1 + 0.2)
+#                 pg_loss = torch.max(pg_loss1, pg_loss2).mean()
+
+#                 # Value loss
+#                 v_loss = 0.5 * ((newvalue - b_returns[mbinds]) ** 2).mean()
+
+#                 entropy_loss = entropy.mean()
+#                 loss = pg_loss - 0.01 * entropy_loss + vf_coef * v_loss
+
+#                 # Do the update
+#                 optimizer.zero_grad()
+#                 loss.backward()
+#                 nn.utils.clip_grad_norm_(agent.parameters(), 0.5) #Might want to tune the 0.5. maybe.
+#                 optimizer.step()
+
+#         #y_pred, y_true = b_values, b_returns
+#         #var_y = np.var(y_true)
+#         #explained_var = np.nan if var_y == 0 else 1 - np.var(y_pred - y_true) / var_y
+#         writer.add_scalar("losses/value_loss", v_loss.item(), global_step)
+#         writer.add_scalar("losses/policy_loss", pg_loss.item(), global_step)
+#         writer.add_scalar("losses/entropy", entropy_loss.item(), global_step)
+#         logger.info(f"global step: {global_step}, value_loss: {v_loss.item()}")
+#         logger.info(f"global step: {global_step}, policy_loss: {pg_loss.item()}")
+#         logger.info(f"global step: {global_step}, entropy: {entropy_loss.item()}")
+
+#         #Checkpoint
+#         if update % checkpoint_every_k_updates == 0:
+#             logger.info("Saving checkpoint")
+#             folder_path = f"checkpoints/{run_name}/{global_step}/"
+#             os.makedirs(folder_path, exist_ok = True)
+
+#             torch.save(agent.state_dict(), os.path.join(folder_path, "agent.pt"))
+#             torch.save(optimizer.state_dict(), os.path.join(folder_path, "optimizer.pt"))
+ 
\ No newline at end of file
-- 
GitLab
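Usage note: a minimal sketch of how the new evaluation entry point could be driven, assuming the usual flags exposed by get_config() (--env_name, --algorithm_name, --experiment_name) in addition to the ones visible in this patch (--model_dir, --use_eval, --eval_episodes, --red); every value below is an illustrative placeholder, not part of the commit.

# Hypothetical driver for multiagent_rl_eval.main(); flag names not shown in this
# patch (--env_name, --algorithm_name, --experiment_name) are assumed to exist in
# configs/config.py, and all values here are placeholders.
from multiagent_rl_eval import main

if __name__ == "__main__":
    main([
        "--env_name", "scontrol",          # only "scontrol" is handled by make_train_env/make_eval_env
        "--algorithm_name", "happo",
        "--experiment_name", "check",
        "--use_eval",                      # build eval_envs so runner.eval() has environments to roll out
        "--red", "no",                     # red-team behaviour; must be one of behaviour_choices
        "--model_dir", "/path/to/models",  # overrides the hard-coded default added in configs/config.py
    ])

Since parse_args() uses parse_known_args(), unrecognized flags are ignored rather than raising, so the same list can be reused against older config versions.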