From 4978b3ba309cb7d570cd92cc364fa5dce045268a Mon Sep 17 00:00:00 2001
From: Dennis Malmgren <dennis@dennismalmgren.se>
Date: Wed, 30 Nov 2022 00:06:46 +0100
Subject: [PATCH] Working on it

---
 configs/config.py            |  4 ++--
 multiagent_rl_train.py       | 40 ++++++++++++++++++------------------
 scalarized_reward_wrapper.py | 22 +++++++++++++-------
 scenario.py                  |  4 ++--
 4 files changed, 39 insertions(+), 31 deletions(-)

diff --git a/configs/config.py b/configs/config.py
index 8b6141b1..3c855d9d 100644
--- a/configs/config.py
+++ b/configs/config.py
@@ -184,7 +184,7 @@ def get_config():
     parser.add_argument("--n_training_threads", type=int,
                         default=1, help="Number of torch threads for training")
     parser.add_argument("--n_rollout_threads", type=int,
-                        default=20, help="Number of parallel envs for training rollouts")
+                        default=30, help="Number of parallel envs for training rollouts")
     parser.add_argument("--n_eval_rollout_threads", type=int,
                         default=1, help="Number of parallel envs for evaluating rollouts")
     parser.add_argument("--n_render_rollout_threads", type=int,
@@ -267,7 +267,7 @@ def get_config():
     parser.add_argument("--clip_param", type=float, default=0.2,
                         help='ppo clip parameter (default: 0.2)')
     parser.add_argument("--num_mini_batch", type=int,
-                        default=20, help='number of batches for ppo (default: 1)')
+                        default=30, help='number of batches for ppo (default: 1)')
     parser.add_argument("--entropy_coef", type=float, default=0.01,
                         help='entropy term coefficient (default: 0.01)')
    parser.add_argument("--value_loss_coef", type=float,
diff --git a/multiagent_rl_train.py b/multiagent_rl_train.py
index f6c3052b..45fced60 100644
--- a/multiagent_rl_train.py
+++ b/multiagent_rl_train.py
@@ -70,17 +70,17 @@ def make_train_env(all_args):
     # }
     reward_wrapper_config = {
         # values= weights, only non zero weights will count to reward
-        "aircraft_share_detected": 0.01, # green detect red without
-        "share_delay_detected": 0.01, # After 300 timesteps, how many red agents detected
-        "share_radar_actions": 0.01, # Number of times radar used
-        "radar_share_detected":0.01, # green detect red with radar
+        "aircraft_share_detected": 0.0, # green detect red without
+        "share_delay_detected": 0.0, # After 300 timesteps, how many red agents detected
+        "share_radar_actions": 0.0, # Number of times radar used
+        "radar_share_detected":0.0, # green detect red with radar
         "position_error": 0.0001, #
         "position_delay_error": 0.0001, #After 300 timesteps,
 
-        'enemy_aircraft_share_detected': 0.01,
-        "enemy_share_delay_detected": 0.01,
-        "enemy_share_radar_actions": 0.01,
-        "enemy_radar_share_detected":0.01,
+        'enemy_aircraft_share_detected': 0.0,
+        "enemy_share_delay_detected": 0.0,
+        "enemy_share_radar_actions": 0.0,
+        "enemy_radar_share_detected":0.0,
         "enemy_position_error": 0.0001,
         "enemy_position_delay_error": 0.0001
     }
@@ -124,19 +124,19 @@ def make_eval_env(all_args):
 
     reward_wrapper_config = {
         # values= weights, only non zero weights will count to reward
-        "aircraft_share_detected": 1, # green detect red without
-        "share_delay_detected": 1, # After 300 timesteps, how many red agents detected
-        "share_radar_actions": 1, # Number of times radar used
-        "radar_share_detected":1, # green detect red with radar
-        "position_error": 1, #
-        "position_delay_error": 1, #After 300 timesteps,
+        "aircraft_share_detected": 0.0, # green detect red without
+        "share_delay_detected": 0.0, # After 300 timesteps, how many red agents detected
+        "share_radar_actions": 0.0, # Number of times radar used
+        "radar_share_detected":0.0, # green detect red with radar
+        "position_error": 0.0001, #
+        "position_delay_error": 0.0001, #After 300 timesteps,
 
-        'enemy_aircraft_share_detected': 1,
-        "enemy_share_delay_detected": 1,
-        "enemy_share_radar_actions": 1,
-        "enemy_radar_share_detected":1,
-        "enemy_position_error": 1,
-        "enemy_position_delay_error": 1
+        'enemy_aircraft_share_detected': 0.0,
+        "enemy_share_delay_detected": 0.0,
+        "enemy_share_radar_actions": 0.0,
+        "enemy_radar_share_detected":0.0,
+        "enemy_position_error": 0.0001,
+        "enemy_position_delay_error": 0.0001
     }
     red_behaviour_name = all_args.red_behaviour
     rand_red_behaviour = all_args.rand_red_behaviour
diff --git a/scalarized_reward_wrapper.py b/scalarized_reward_wrapper.py
index 13c44a2b..bc258855 100644
--- a/scalarized_reward_wrapper.py
+++ b/scalarized_reward_wrapper.py
@@ -44,17 +44,25 @@ class ScalarizedRewardWrapper(gym.RewardWrapper):
 
 
     def reward(self, reward: MultiAgentMultiObjectiveReward) -> MultiAgentReward:
-        # if self.eval_==True:
+        #position error is a negative number that increases in magnitude with the error.
+        #at the 'worst' it is -600.0
         new_reward = dict({
-            agent_id: (0.01*reward[agent_id]['enemy_position_error']-0.01*reward[agent_id]['position_error'])
-            + (0.01*reward[agent_id]['enemy_position_delay_error']-0.01*reward[agent_id]['position_delay_error'])
-            + (reward[agent_id]['aircraft_share_detected']-reward[agent_id]['enemy_aircraft_share_detected'])
-            + (reward[agent_id]['share_delay_detected']-reward[agent_id]['enemy_share_delay_detected'])
-            + (0.1*reward[agent_id]['radar_share_detected']-0.1*reward[agent_id]['enemy_radar_share_detected'])
-            + (0.1*reward[agent_id]['enemy_share_radar_actions']-0.1*reward[agent_id]['radar_share_detected'])
+            agent_id: (0.00001*(reward[agent_id]['position_error'] - reward[agent_id]['enemy_position_error']))
+            + (0.00001*(reward[agent_id]['position_delay_error'] - reward[agent_id]['enemy_position_delay_error']))
             for agent_id in reward
         })
 
+        # if self.eval_==True:
+        #     new_reward = dict({
+        #         agent_id: (0.01*reward[agent_id]['enemy_position_error']-0.01*reward[agent_id]['position_error'])
+        #         + (0.01*reward[agent_id]['enemy_position_delay_error']-0.01*reward[agent_id]['position_delay_error'])
+        #         + (reward[agent_id]['aircraft_share_detected']-reward[agent_id]['enemy_aircraft_share_detected'])
+        #         + (reward[agent_id]['share_delay_detected']-reward[agent_id]['enemy_share_delay_detected'])
+        #         + (0.1*reward[agent_id]['radar_share_detected']-0.1*reward[agent_id]['enemy_radar_share_detected'])
+        #         + (0.1*reward[agent_id]['enemy_share_radar_actions']-0.1*reward[agent_id]['radar_share_detected'])
+        #         for agent_id in reward
+        #     })
+        # else:
 
         # new_reward = dict({
         # # agent_id: (0.01*reward[agent_id]['enemy_position_error']-0.01*reward[agent_id]['position_error'])
diff --git a/scenario.py b/scenario.py
index 54a35a9c..a713ad88 100644
--- a/scenario.py
+++ b/scenario.py
@@ -203,7 +203,7 @@ def evaluate_position(state: State) -> tuple[float, float]:
     not_detected = [a_o.aircraft for a_o in state.green if a_o.aircraft.name not in detected_name]
 
     for not_detected_aircraft in not_detected:
-        dist = 141
+        dist = 600
         dist_red += dist
 
     dist_green = 0
@@ -219,7 +219,7 @@ def evaluate_position(state: State) -> tuple[float, float]:
     not_detected = [a_o.aircraft for a_o in state.red if a_o.aircraft.name not in detected_name]
 
     for not_detected_aircraft in not_detected:
-        dist = 141
+        dist = 600
         dist_green += dist
 
     dist_red = dist_red / (len(state.green) * len(state.red))
-- 
GitLab