From 6a5cfb1332791f7149e8976b43bce009c1df2a75 Mon Sep 17 00:00:00 2001
From: dennismalmgren <dennis@dennismalmgren.se>
Date: Tue, 15 Nov 2022 21:57:32 +0100
Subject: [PATCH] Updated to run eval
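
multiagent_rl_eval.py builds the "scontrol" environment stack (reward
scalarization, observation normalization, observation flattening, red-team
behaviour, discretized actions, episode time limit), loads a pretrained
policy from --model_dir, and runs Runner.eval() for --eval_episodes
episodes. A sketch of an invocation, with illustrative flag values that are
not part of this patch:

    python multiagent_rl_eval.py --env_name scontrol --algorithm_name happo \
        --experiment_name check --red no --use_eval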

---
 configs/config.py     |   4 +-
 multiagent_rl_eval.py | 408 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 410 insertions(+), 2 deletions(-)
 create mode 100644 multiagent_rl_eval.py

diff --git a/configs/config.py b/configs/config.py
index 3143e21e..49a64b81 100644
--- a/configs/config.py
+++ b/configs/config.py
@@ -290,7 +290,7 @@ def get_config():
     parser.add_argument("--log_interval", type=int, 
                         default=1, help="time duration between contiunous twice log printing.")
     parser.add_argument("--model_dir", type=str, 
-                        default=None, help="by default None. set the path to pretrained model.")
+                        default="/mnt/f/repos/results/scontrol/happo/check/9967/run1/models", help="by default None. set the path to pretrained model.")
 
     # eval parameters
     parser.add_argument("--use_eval", action='store_true', 
@@ -298,7 +298,7 @@ def get_config():
     parser.add_argument("--eval_interval", type=int, 
                         default=10, help="time duration between contiunous twice evaluation progress.")
     parser.add_argument("--eval_episodes", type=int, 
-                        default=1, help="number of episodes of a single evaluation.")
+                        default=2, help="number of episodes of a single evaluation.")
 
     # render parameters
     parser.add_argument("--save_gifs", action='store_true', 
diff --git a/multiagent_rl_eval.py b/multiagent_rl_eval.py
new file mode 100644
index 00000000..44730cfc
--- /dev/null
+++ b/multiagent_rl_eval.py
@@ -0,0 +1,408 @@
+import argparse
+import datetime
+import logging
+import sys
+import os
+sys.path.append("../")
+from pathlib import Path
+import random
+import time
+
+import gym
+from gym.wrappers.time_limit import TimeLimit
+from gym.wrappers.flatten_observation import FlattenObservation
+import gym.spaces as spaces
+import jax
+import numpy as np
+import torch
+import torch.optim as optim
+import torch.nn as nn
+from observation_normalization_wrapper import ObservationNormalizationWrapper
+from torch.utils.tensorboard import SummaryWriter
+import setproctitle
+
+from configs.config import get_config
+from runners.separated.sensor_runner import SensorRunner as Runner
+from envs.env_wrappers import ShareDummyVecEnv, ShareSubprocVecEnv
+from sensor_control_env import SensorControlEnv
+from discretized_action_wrapper import DiscretizedActionWrapper
+from flatten_observation_wrapper import FlattenObservationWrapper
+from red_team_behaviour_wrapper import RedTeamBehaviourWrapper
+from scalarized_reward_wrapper import ScalarizedRewardWrapper
+from multiagent_wrapper import MultiagentWrapper
+from behaviour import behaviour_choices
+import constants
+
+def make_train_env(all_args):
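+    """Build the vectorized training environments for the scontrol task."""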
+    def get_env_fn(rank):
+        def init_env():
+            if all_args.env_name == "scontrol":
+                max_team_members = 7
+                env_config = {
+                    "max_team_size": max_team_members,
+                    "scenario_id": 0, 
+                    'visualise_delay': 0,
+                }
+
+                # Weighting of objectives
+                reward_wrapper_config = {
+                    "share_detected": 1.0,
+                    "share_delay_detected": 1.0,
+                    "share_radar_actions": 0.0,
+                    "position_error": 0.0,
+                    "position_delay_error": 0.0
+                }
+                red_behaviour_name = all_args.red
+                env = SensorControlEnv(env_config)
+                env = ScalarizedRewardWrapper(env, reward_wrapper_config)
+                env = ObservationNormalizationWrapper(env)
+                env = FlattenObservationWrapper(env)
+                env = RedTeamBehaviourWrapper(env, red_behaviour_name) 
+                env = DiscretizedActionWrapper(env)
+                env = TimeLimit(env, max_episode_steps=constants.SCENARIO_LENGTH)
+                env = MultiagentWrapper(env)
+            else:
+                print("Can not support the " + all_args.env_name + "environment.")
+                raise NotImplementedError
+            return env
+        return init_env
+
+    if all_args.n_rollout_threads == 1:
+        return ShareDummyVecEnv([get_env_fn(0)])
+    else:
+        return ShareSubprocVecEnv([get_env_fn(i) for i in range(all_args.n_rollout_threads)])
+
+def make_eval_env(all_args):
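+    """Build the vectorized evaluation environments (same wrapper stack as training)."""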
+    def get_env_fn(rank):
+        def init_env():
+            if all_args.env_name == "scontrol":
+                max_team_members = 7
+                env_config = {
+                    "max_team_size": max_team_members,
+                    "scenario_id": 0, 
+                    'visualise_delay': 0,
+                }
+
+                reward_wrapper_config = {
+                    "share_detected": 1.0,
+                    "share_delay_detected": 1.0,
+                    "share_radar_actions": 0.0,
+                    "position_error": 0.0,
+                    "position_delay_error": 0.0
+                }
+                red_behaviour_name = all_args.red
+                env = SensorControlEnv(env_config)
+                env = ScalarizedRewardWrapper(env, reward_wrapper_config)
+                env = ObservationNormalizationWrapper(env)
+                env = FlattenObservationWrapper(env)
+                env = RedTeamBehaviourWrapper(env, red_behaviour_name) 
+                env = DiscretizedActionWrapper(env)
+                env = TimeLimit(env, max_episode_steps=constants.SCENARIO_LENGTH)
+                env = MultiagentWrapper(env)
+            else:
+                print("Can not support the " + all_args.env_name + "environment.")
+                raise NotImplementedError
+            return env
+        return init_env
+
+    if all_args.n_eval_rollout_threads == 1:
+        return ShareDummyVecEnv([get_env_fn(0)])
+    else:
+        return ShareSubprocVecEnv([get_env_fn(i) for i in range(all_args.n_eval_rollout_threads)])
+
+def parse_args(args, parser):
+    parser.add_argument("--red", default="no", choices=behaviour_choices, type=str.lower)
+    parser.add_argument("--use_single_network", action='store_true', default=False)
+
+    all_args = parser.parse_known_args(args)[0]
+
+    return all_args
+
+
+def main(args) -> None:
+    # Create the argument parser
+    parser = get_config()
+    all_args = parse_args(args, parser)
+
+    print("all config: ", all_args)
+
+    if all_args.seed_specify:
+        all_args.seed = all_args.runing_id  # 'runing_id' is the (misspelled) attribute name defined by the original config; kept as-is.
+    else:
+        all_args.seed = np.random.randint(1000, 10000)
+
+    print("seed is:", all_args.seed)
+
+    if all_args.cuda and torch.cuda.is_available():
+        print("choose to use gpu...")
+        device = torch.device("cuda:0")
+        torch.set_num_threads(all_args.n_training_threads)
+        if all_args.cuda_deterministic:
+            torch.backends.cudnn.benchmark = False
+            torch.backends.cudnn.deterministic = True
+
+    else:
+        print("choose to use cpu...")
+        device = torch.device("cpu")
+        torch.set_num_threads(all_args.n_training_threads)
+
+    run_dir = Path(os.path.split(os.path.dirname(os.path.abspath(__file__)))[0]
+                   + "/results") / all_args.env_name / all_args.algorithm_name / all_args.experiment_name / str(all_args.seed)
+
+    if not run_dir.exists():
+        curr_run = 'run1'
+    else:
+        exst_run_nums = [int(str(folder.name).split('run')[1]) for folder in run_dir.iterdir()
+                         if str(folder.name).startswith('run')]
+        if len(exst_run_nums) == 0:
+            curr_run = 'run1'
+        else:
+            curr_run = 'run%i' % (max(exst_run_nums) + 1)
+
+    run_dir = run_dir / curr_run
+    if not run_dir.exists():
+        os.makedirs(str(run_dir))
+
+    setproctitle.setproctitle(
+        str(all_args.algorithm_name) + "-" + str(all_args.env_name) + "-" + str(all_args.experiment_name) + "@" + str(
+            all_args.user_name))
+
+    # seed
+    torch.manual_seed(all_args.seed)
+    torch.cuda.manual_seed_all(all_args.seed)
+    np.random.seed(all_args.seed)
+
+    envs = make_train_env(all_args)
+    eval_envs = make_eval_env(all_args) if all_args.use_eval else None
+    num_agents = envs.n_agents
+
+    config = {
+        "all_args": all_args,
+        "envs": envs,
+        "eval_envs": eval_envs,
+        "num_agents": num_agents,
+        "device": device,
+        "run_dir": run_dir,
+    }
+
+    runner = Runner(config)  # loads pretrained parameters from model_dir (configs/config.py)
+    runner.eval(total_num_steps=1)  # runs eval_episodes (configs/config.py) evaluation episodes
+
+    # post process
+    envs.close()
+    if all_args.use_eval and eval_envs is not envs:
+        eval_envs.close()
+
+    runner.writter.export_scalars_to_json(str(runner.log_dir + '/summary.json'))
+    runner.writter.close()
+
+if __name__ == '__main__':
+    main(sys.argv[1:])
+
+
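+# ---------------------------------------------------------------------------
+# Leftover experimentation code: the imports and helpers below are defined at
+# module level but are never called by the eval entry point above. Note that
+# the base_runner.Runner import shadows the SensorRunner alias used by main().
+# ---------------------------------------------------------------------------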
+from rl_behaviour import rl_behaviour_choices
+from behaviour import behaviour_choices
+import constants
+from scenario import eval_to_csv
+from scenario_io import write_scenario_to_file
+
+from single_agent_wrapper import SingleAgentWrapper
+from single_agent_agent import Agent
+from runners.separated.base_runner import Runner
+
+def prepare_cumulative_evaluation(env: gym.Env):
+    base_env = env.unwrapped
+    reward = base_env.reward_space.sample()
+    zero_reward = jax.tree_util.tree_map(lambda e: 0.0, reward)
+    return zero_reward
+
+def prepare_cumulative_scalarized_evaluation_dict(env: gym.Env):
+    reward = env.reward_space.sample()
+    zero_reward = jax.tree_util.tree_map(lambda e: 0.0, reward)
+    return zero_reward
+
+def train_on_scenario(red_behaviour_name: str, 
+                    config: dict,
+                    logger: logging.Logger, 
+                    rng: np.random.Generator):
+    max_team_members = 7
+    scenario_id = rng.integers(0, 1000000)
+    env_config = {
+        "scenario_id": scenario_id,  # ignored during training.
+        "use_static": False,  # if True, no_greens must be 4 and no_reds must be 2.
+        "no_greens": 7,  # alternatively: rng.integers(2, max_team_members)
+        "no_reds": 7,  # alternatively: rng.integers(2, max_team_members)
+        'visualise_delay': 0,
+    }
+
+    reward_wrapper_config = {
+        "share_detected": 1.0,
+        "share_delay_detected": 1.0,
+        "share_radar_actions": 0.0,
+        "position_error": 0.0,
+        "position_delay_error": 0.0
+    }
+
+    logger.info("Generate scenario")
+
+    env = SensorControlEnv(env_config)
+    env = ScalarizedRewardWrapper(env, reward_wrapper_config)
+    env = FlattenObservationWrapper(env, max_sa_size=max_team_members)
+    env = RedTeamBehaviourWrapper(env, red_behaviour_name)  # Note: this wrapper is a significant performance bottleneck.
+    env = ObservationNormalizationWrapper(env, max_sa_size=max_team_members)
+    env = DiscretizedActionWrapper(env)
+    #env = SingleAgentWrapper(env, max_team_members=max_team_members, max_sa_size=max_team_members)
+    env = FlattenObservation(env)
+    env = TimeLimit(env, max_episode_steps=constants.SCENARIO_LENGTH)
+    run_name = f"run_{int(time.time())}"
+
+    config["env"] = env
+    config['num_agents'] = max_team_members
+    runner = Runner(config)
+    runner.run()
+
+#     #Start training
+#     agent = Agent(env)
+#     optimizer = optim.Adam(agent.parameters(), lr=learning_rate, eps=1e-5)
+
+#     if load_checkpoint_path:
+#         agent.load_state_dict(torch.load(os.path.join(load_checkpoint_path, "agent.pt")))
+#         optimizer.load_state_dict(torch.load(os.path.join(load_checkpoint_path, "optimizer.pt")))
+
+#     num_steps = constants.SCENARIO_LENGTH #Number of steps in a rollout
+#     batch_size = num_steps
+#     num_minibatches = 4
+#     minibatch_size = batch_size // num_minibatches
+
+#     # Storage setup
+#     obs = torch.zeros((num_steps,) + env.observation_space.shape)
+#     actions = torch.zeros((num_steps,) + env.action_space.shape)
+#     logprobs = torch.zeros((num_steps,))
+#     rewards = torch.zeros((num_steps,))
+#     terminateds = torch.zeros((num_steps,))
+#     truncateds = torch.zeros((num_steps,))
+#     values = torch.zeros((num_steps,))
+
+#     num_updates = total_timesteps // batch_size
+# #    cumulative_evaluation_dict = prepare_cumulative_evaluation(env)
+# #    cumulative_scalarized_evaluation_dict = prepare_cumulative_scalarized_evaluation_dict(env)
+#     logger.info("Start training")
+#     run_name = f"run_{int(time.time())}"
+#     writer = SummaryWriter(f"runs/{run_name}")
+
+#     start = time.time()
+#     global_step = 0
+#     for update in range(1, num_updates + 1):
+#         next_obs, info = env.reset(seed=rng.integers(2**32))
+#         next_obs = torch.tensor(next_obs)
+#         next_terminated = torch.zeros(1, dtype=torch.float)
+#         next_truncated = torch.zeros(1, dtype=torch.float)
+#         for step in range(0, num_steps):
+#             global_step += 1
+#             obs[step] = next_obs
+#             terminateds[step] = next_terminated
+#             truncateds[step] = next_truncated
+
+#             with torch.no_grad():
+#                 action, logprob, _, value = agent.get_action_and_value(next_obs)
+#                 values[step] = value.flatten()
+#             actions[step] = action
+#             logprobs[step] = logprob
+
+#             next_obs, reward, terminated, truncated, info = env.step(action)
+#             rewards[step] = torch.tensor(reward).view(-1)
+#             next_obs, next_terminated, next_truncated = torch.tensor(next_obs), torch.tensor(float(terminated)), torch.tensor(float(truncated))
+#         writer.add_scalar("charts/episodic_return", sum(rewards), global_step)
+#         logger.info(f"global step: {global_step}, episodic return: {sum(rewards)}")
+
+#         with torch.no_grad():
+#             next_value = agent.get_value(next_obs).reshape(1, -1)
+#             advantages = torch.zeros_like(rewards)
+#             lastgaelam = 0
+#             for t in reversed(range(num_steps)):
+#                 if t == num_steps - 1:
+#                     nextnonterminal = 1.0 - next_terminated
+#                     nextnontruncated = 1.0 - next_truncated
+#                     nextvalues = next_value
+#                 else:
+#                     nextnonterminal = 1.0 - terminateds[t + 1]
+#                     nextnontruncated = 1.0 - truncateds[t + 1]
+#                     nextvalues = values[t + 1]
+#                 delta = rewards[t] + gamma * nextvalues * nextnonterminal  - values[t]
+#                 advantages[t] = lastgaelam = delta + gamma * 0.95 * nextnontruncated * lastgaelam
+#             returns = advantages + values
+        
+#         # flattening
+#         b_obs = obs.reshape((-1,) + env.observation_space.shape)
+#         b_logprobs = logprobs.reshape(-1)
+#         b_actions = actions.reshape((-1,) + env.action_space.shape)
+#         b_advantages = advantages.reshape(-1)
+#         b_returns = returns.reshape(-1)
+#         b_values = values.reshape(-1)
+
+#         b_inds = np.arange(batch_size)
+#         clipfracs = []
+#         for epoch in range(update_epochs):
+#             np.random.shuffle(b_inds)
+#             for start in range(0, batch_size, minibatch_size):
+#                 end = start + minibatch_size
+#                 mbinds = b_inds[start:end]
+#                 _, newlogprob, entropy, newvalue = agent.get_action_and_value(b_obs[mbinds], b_actions.long()[mbinds])
+#                 logratio = newlogprob - b_logprobs[mbinds]
+#                 ratio = logratio.exp()
+
+#                 mb_advantages = b_advantages[mbinds]
+#                 mb_advantages = (mb_advantages - mb_advantages.mean()) /(mb_advantages.std() + 1e-8)
+            
+#                 # Policy loss
+#                 pg_loss1 = -mb_advantages * ratio
+#                 pg_loss2 = -mb_advantages * torch.clamp(ratio, 1 - 0.2, 1 + 0.2)
+#                 pg_loss = torch.max(pg_loss1, pg_loss2).mean()
+
+#                 # Value loss
+#                 v_loss = 0.5 * ((newvalue - b_returns[mbinds]) ** 2).mean() 
+
+#                 entropy_loss = entropy.mean()
+#                 loss = pg_loss - 0.01 * entropy_loss + vf_coef * v_loss
+                
+#                 # Do the update
+#                 optimizer.zero_grad()
+#                 loss.backward()
+#                 nn.utils.clip_grad_norm_(agent.parameters(), 0.5) #Might want to tune the 0.5. maybe.
+#                 optimizer.step()
+
+#         #y_pred, y_true = b_values, b_returns
+#         #var_y = np.var(y_true)
+#         #explained_var = np.nan if var_y == 0 else 1 - np.var(y_pred - y_true) / var_y
+#         writer.add_scalar("losses/value_loss", v_loss.item(), global_step)
+#         writer.add_scalar("losses/policy_loss", pg_loss.item(), global_step)
+#         writer.add_scalar("losses/entropy", entropy_loss.item(), global_step)       
+#         logger.info(f"global step: {global_step}, value_loss: {v_loss.item()}")
+#         logger.info(f"global step: {global_step}, policy_loss: {pg_loss.item()}")
+#         logger.info(f"global step: {global_step}, entropy: {entropy_loss.item()}")
+
+#         #Checkpoint
+#         if update % checkpoint_every_k_updates == 0:
+#             logger.info("Saving checkpoint")
+#             folder_path = f"checkpoints/{run_name}/{global_step}/"
+#             os.makedirs(folder_path, exist_ok = True) 
+
+#             torch.save(agent.state_dict(), os.path.join(folder_path, "agent.pt"))
+#             torch.save(optimizer.state_dict(), os.path.join(folder_path, "optimizer.pt"))
+            
\ No newline at end of file
-- 
GitLab