Commit ca195120 authored by Gunnar Arctaedius

lmao still adding models maybe should stop at some point

parent 2ec7cfa2
Showing changed files with 346 additions and 10 deletions
@@ -302,7 +302,7 @@ def get_config():
     parser.add_argument("--log_interval", type=int,
                         default=1, help="time duration between contiunous twice log printing.")
     parser.add_argument("--model_dir", type=str,
-                        default='./trained_models/8800_six_nights/run1/models', help="by default None. set the path to pretrained model.")
+                        default='./trained_models/7010_after_christmas/run1/models', help="by default None. set the path to pretrained model.")
     # eval parameters
     parser.add_argument("--use_eval", action='store_true',
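The change above only repoints --model_dir at a newer checkpoint directory. As a rough, hedged sketch of how such a directory might be consumed (the per-agent file names, e.g. actor_agent0.pt, are hypothetical and depend on what the runner actually saves):

import os
import torch

def load_actor_state_dicts(model_dir, num_agents):
    # Hypothetical checkpoint layout: one actor file per agent inside model_dir.
    state_dicts = []
    for agent_id in range(num_agents):
        path = os.path.join(model_dir, f"actor_agent{agent_id}.pt")
        # map_location="cpu" lets the checkpoint load even on a machine without a GPU
        state_dicts.append(torch.load(path, map_location="cpu"))
    return state_dicts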
@@ -55,15 +55,15 @@ class ScalarizedRewardWrapper(gym.RewardWrapper):
             - 0.5*reward[agent_id]['enemy_time_step_radars_detected']
             for agent_id in reward
         })
-        agent_id = 'g0'
-        print("position_error: ", 0.001*reward[agent_id]['position_error'])
-        print("enemy_position_error: ", -0.001*reward[agent_id]['enemy_position_error'])
-        #print("aircraft_share_detected: ", reward[agent_id]['aircraft_share_detected'])
-        print("share_radar_actions: ", 0.5*reward[agent_id]['share_radar_actions'])
-        #print("enemy_radar_share_detected: ", -reward[agent_id]['enemy_radar_share_detected'])
-        print("time_step_aircrafts_detected: ", reward[agent_id]['time_step_aircrafts_detected'])
-        print("enemy_time_step_radars_detected: ", -0.5*reward[agent_id]['enemy_time_step_radars_detected'])
-        print()
+        # agent_id = 'g0'
+        # print("position_error: ", 0.001*reward[agent_id]['position_error'])
+        # print("enemy_position_error: ", -0.001*reward[agent_id]['enemy_position_error'])
+        # print("aircraft_share_detected: ", reward[agent_id]['aircraft_share_detected'])
+        # print("share_radar_actions: ", 0.5*reward[agent_id]['share_radar_actions'])
+        # print("enemy_radar_share_detected: ", -reward[agent_id]['enemy_radar_share_detected'])
+        # print("time_step_aircrafts_detected: ", reward[agent_id]['time_step_aircrafts_detected'])
+        # print("enemy_time_step_radars_detected: ", -0.5*reward[agent_id]['enemy_time_step_radars_detected'])
+        # print()
         # if self.eval_==True:
         #     new_reward = dict({
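For context, this hunk only comments out debug printing inside ScalarizedRewardWrapper; the scalarization itself is the weighted sum visible in the prints. A minimal sketch of that idea, with the component names and weights taken from the diff and the surrounding class structure assumed:

import gym

class ScalarizedRewardWrapper(gym.RewardWrapper):
    # Collapse each agent's reward-component dict into a single scalar.
    def reward(self, reward):
        return {
            agent_id: (0.001*reward[agent_id]['position_error']
                       - 0.001*reward[agent_id]['enemy_position_error']
                       + 0.5*reward[agent_id]['share_radar_actions']
                       + reward[agent_id]['time_step_aircrafts_detected']
                       - 0.5*reward[agent_id]['enemy_time_step_radars_detected'])
            for agent_id in reward
        }

The full configuration parser follows below.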
import argparse
from behaviour import behaviour_choices
def get_config():
"""
The configuration parser for common hyperparameters of all environments.
Please refer to each `scripts/train/<env>_runner.py` file for the private hyperparameters
used only in <env>.
Prepare parameters:
--algorithm_name <algorithm_name>
specify the algorithm, one of `["happo", "hatrpo"]`
--experiment_name <str>
an identifier to distinguish different experiments.
--seed <int>
set seed for numpy and torch
--seed_specify
by default False; if set, use the specified seed for numpy/torch instead of a random one.
--runing_id <int>
the running index of the experiment (default: 1)
--cuda
by default True, use GPU to train; otherwise use CPU.
--cuda_deterministic
by default, make CUDA deterministic so the random seed is effective; if set, bypass this.
--n_training_threads <int>
number of training threads working in parallel. by default 1
--n_rollout_threads <int>
number of parallel envs for training rollout. by default 32
--n_eval_rollout_threads <int>
number of parallel envs for evaluating rollout. by default 1
--n_render_rollout_threads <int>
number of parallel envs for rendering; can only be set to 1 for some environments.
--num_env_steps <int>
number of env steps to train (default: 10e6)
Env parameters:
--env_name <str>
specify the name of environment
--use_obs_instead_of_state
[only for some envs] by default False, use the global state; if set, use concatenated local obs instead.
Replay Buffer parameters:
--episode_length <int>
the max length of episode in the buffer.
Network parameters:
--share_policy
by default True, all agents will share the same network; set to make training agents use different policies.
--use_centralized_V
by default True, use centralized training; otherwise use decentralized training.
--stacked_frames <int>
Number of input frames to stack together.
--hidden_size <int>
Dimension of hidden layers for actor/critic networks
--layer_N <int>
Number of layers for actor/critic networks
--use_ReLU
by default True, use ReLU; otherwise use Tanh.
--use_popart
by default True, use running mean and std to normalize rewards.
--use_feature_normalization
by default True, apply layernorm to normalize inputs.
--use_orthogonal
by default True, use orthogonal initialization for weights and 0 initialization for biases; otherwise use Xavier uniform initialization.
--gain
gain of the last action layer (default: 0.01).
--use_naive_recurrent_policy
by default False, use the whole trajectory to calculate hidden states.
--use_recurrent_policy
by default False; if set, use a recurrent policy.
--recurrent_N <int>
The number of recurrent layers (default: 1).
--data_chunk_length <int>
Time length of chunks used to train a recurrent_policy, default 10.
Optimizer parameters:
--lr <float>
learning rate parameter, (default: 5e-4, fixed).
--critic_lr <float>
learning rate of critic (default: 5e-4, fixed)
--opti_eps <float>
RMSprop optimizer epsilon (default: 1e-5)
--weight_decay <float>
coefficient of weight decay (default: 0)
TRPO parameters:
--kl_threshold <float>
the threshold of kl-divergence (default: 0.01)
--ls_step <int>
the number of line search steps (default: 10)
--accept_ratio <float>
accept ratio of loss improve (default: 0.5)
PPO parameters:
--ppo_epoch <int>
number of ppo epochs (default: 15)
--use_clipped_value_loss
by default, clip loss value. If set, do not clip loss value.
--clip_param <float>
ppo clip parameter (default: 0.2)
--num_mini_batch <int>
number of batches for ppo (default: 1)
--entropy_coef <float>
entropy term coefficient (default: 0.01)
--use_max_grad_norm
by default, use max norm of gradients. If set, do not use.
--max_grad_norm <float>
max norm of gradients (default: 0.5)
--use_gae
by default, use generalized advantage estimation. If set, do not use gae.
--gamma <float>
discount factor for rewards (default: 0.99)
--gae_lambda <float>
gae lambda parameter (default: 0.95)
--use_proper_time_limits
whether to compute returns taking episode time limits into account.
--use_huber_loss
by default, use huber loss. If set, do not use huber loss.
--use_value_active_masks
by default True, mask useless data in the value loss.
--huber_delta <float>
coefficient of huber loss.
Run parameters:
--use_linear_lr_decay
by default, do not apply linear decay to learning rate. If set, use a linear schedule on the learning rate
--save_interval <int>
time interval between consecutive model saves.
--log_interval <int>
time interval between consecutive log prints.
--model_dir <str>
by default None; set to the path of a pretrained model.
Eval parameters:
--use_eval
by default, do not start evaluation. If set, run evaluation alongside training.
--eval_interval <int>
time interval between consecutive evaluations.
--eval_episodes <int>
number of episodes of a single evaluation.
Render parameters:
--save_gifs
by default, do not save render video. If set, save video.
--use_render
by default, do not render the env during training. If set, start rendering. Note: sometimes the environment has an internal render process which is not controlled by this hyperparameter.
--render_episodes <int>
the number of episodes to render a given env
--ifi <float>
the play interval of each rendered image in saved video.
Pretrained parameters:
"""
parser = argparse.ArgumentParser(description='onpolicy_algorithm', formatter_class=argparse.RawDescriptionHelpFormatter)
# prepare parameters
parser.add_argument("--algorithm_name", type=str,
default='happo', choices=["happo","hatrpo"])
parser.add_argument("--experiment_name", type=str,
default="check", help="an identifier to distinguish different experiment.")
parser.add_argument("--seed", type=int,
default=1, help="Random seed for numpy/torch")
parser.add_argument("--seed_specify", action="store_true",
default=False, help="Random or specify seed for numpy/torch")
parser.add_argument("--red_behaviour", choices=behaviour_choices, type=str.lower,
default="simpleone", help="name of red behaviour to use for training. Ignored if rand_red_behaviour is True.")
parser.add_argument("--rand_red_behaviour", type=bool,
default=True, help="Randomizes opponent behaviour in each simulation for training. Uses red_behaviour if False.")
parser.add_argument("--rand_size", type=bool,
default=True, help="Randomizes both teams, teamsize in each simulation for training.")
parser.add_argument("--red_size", type=int,
default=2, help="2<=Size of red team<=7, works only with rand_size==false")
parser.add_argument("--green_size", type=int,
default=2, help="2<=Size of green team<=7, works only with rand_size==false")
parser.add_argument("--runing_id", type=int,
default=1, help="the runing index of experiment")
parser.add_argument("--cuda", action='store_false',
default=True, help="by default True, will use GPU to train; or else will use CPU;")
parser.add_argument("--cuda_deterministic", action='store_false',
default=False, help="by default, make sure random seed effective. if set, bypass such function.")
parser.add_argument("--n_training_threads", type=int,
default=1, help="Number of torch threads for training")
parser.add_argument("--n_rollout_threads", type=int,
default=20, help="Number of parallel envs for training rollouts")
parser.add_argument("--n_eval_rollout_threads", type=int,
default=1, help="Number of parallel envs for evaluating rollouts")
parser.add_argument("--n_render_rollout_threads", type=int,
default=1, help="Number of parallel envs for rendering rollouts")
parser.add_argument("--num_env_steps", type=int,
default=600*10*10000, help='Number of environment steps to train (default: 6000)') #600 ts x 20 rollout threads x 10 episode count
#time = 600*20*1000 / fps (250)
parser.add_argument("--user_name", type=str,
default='marl', help="[for wandb usage] specify the user name for collecting training data.")
# env parameters
parser.add_argument("--env_name", type=str,
default='scontrol', help="specify the name of environment")
parser.add_argument("--use_obs_instead_of_state", action='store_true',
default=False, help="Whether to use global state or concatenated obs")
# replay buffer parameters
parser.add_argument("--episode_length", type=int,
default=600, help="Max length for any episode")
# network parameters
parser.add_argument("--share_policy", action='store_false',
default=True, help='Whether agent share the same policy')
parser.add_argument("--use_centralized_V", action='store_false',
default=True, help="Whether to use centralized V function")
parser.add_argument("--stacked_frames", type=int,
default=100, help="Dimension of hidden layers for actor/critic networks")
parser.add_argument("--use_stacked_frames", action='store_true',
default=True, help="Whether to use stacked_frames")
parser.add_argument("--hidden_size", type=int,
default=64, help="Dimension of hidden layers for actor/critic networks")
parser.add_argument("--layer_N", type=int,
default=2, help="Number of layers for actor/critic networks")
parser.add_argument("--use_ReLU", action='store_false',
default=True, help="Whether to use ReLU")
parser.add_argument("--use_popart", action='store_false',
default=False, help="by default True, use running mean and std to normalize rewards.")
parser.add_argument("--use_valuenorm", action='store_false',
default=False, help="by default True, use running mean and std to normalize rewards.")
parser.add_argument("--use_feature_normalization", action='store_false',
default=True, help="Whether to apply layernorm to the inputs")
parser.add_argument("--use_orthogonal", action='store_false',
default=True, help="Whether to use Orthogonal initialization for weights and 0 initialization for biases")
parser.add_argument("--gain", type=float,
default=0.01, help="The gain # of last action layer")
# recurrent parameters
parser.add_argument("--use_naive_recurrent_policy", action='store_true',
default=False, help='Whether to use a naive recurrent policy')
parser.add_argument("--use_recurrent_policy", action='store_true',
default=False, help='use a recurrent policy')
parser.add_argument("--recurrent_N", type=int,
default=1, help="The number of recurrent layers.")
parser.add_argument("--data_chunk_length", type=int,
default=10, help="Time length of chunks used to train a recurrent_policy")
# optimizer parameters
parser.add_argument("--lr", type=float,
default=5e-4, help='learning rate (default: 5e-4)')
parser.add_argument("--critic_lr", type=float,
default=5e-4, help='critic learning rate (default: 5e-4)')
parser.add_argument("--opti_eps", type=float,
default=1e-5, help='RMSprop optimizer epsilon (default: 1e-5)')
parser.add_argument("--weight_decay", type=float, default=0)
parser.add_argument("--std_x_coef", type=float, default=1)
parser.add_argument("--std_y_coef", type=float, default=0.5)
# trpo parameters
parser.add_argument("--kl_threshold", type=float,
default=0.01, help='the threshold of kl-divergence (default: 0.01)')
parser.add_argument("--ls_step", type=int,
default=10, help='number of line search (default: 10)')
parser.add_argument("--accept_ratio", type=float,
default=0.5, help='accept ratio of loss improve (default: 0.5)')
# ppo parameters
parser.add_argument("--ppo_epoch", type=int,
default=10, help='number of ppo epochs (default: 15)')
parser.add_argument("--use_clipped_value_loss", action='store_false',
default=False, help="by default, clip loss value. If set, do not clip loss value.")
parser.add_argument("--clip_param", type=float,
default=0.2, help='ppo clip parameter (default: 0.2)')
parser.add_argument("--num_mini_batch", type=int,
default=1, help='number of batches for ppo (default: 1)')
parser.add_argument("--entropy_coef", type=float,
default=0.01, help='entropy term coefficient (default: 0.01)')
parser.add_argument("--value_loss_coef", type=float,
default=0.01, help='value loss coefficient (default: 0.5)')
parser.add_argument("--use_max_grad_norm", action='store_false',
default=True, help="by default, use max norm of gradients. If set, do not use.")
parser.add_argument("--max_grad_norm", type=float,
default=0.5, help='max norm of gradients (default: 0.5)')
parser.add_argument("--use_gae", action='store_false',
default=True, help='use generalized advantage estimation')
parser.add_argument("--gamma", type=float, default=0.99,
help='discount factor for rewards (default: 0.99)')
parser.add_argument("--gae_lambda", type=float, default=0.95,
help='gae lambda parameter (default: 0.95)')
parser.add_argument("--use_proper_time_limits", action='store_true',
default=True, help='compute returns taking into account time limits')
parser.add_argument("--use_huber_loss", action='store_false',
default=True, help="by default, use huber loss. If set, do not use huber loss.")
parser.add_argument("--use_value_active_masks", action='store_false',
default=True, help="by default True, whether to mask useless data in value loss.")
parser.add_argument("--use_policy_active_masks", action='store_false',
default=True, help="by default True, whether to mask useless data in policy loss.")
parser.add_argument("--huber_delta", type=float,
default=10.0, help="coefficient of huber loss.")
# run parameters
parser.add_argument("--use_linear_lr_decay", action='store_true',
default=False, help='use a linear schedule on the learning rate')
parser.add_argument("--save_interval", type=int,
default=10, help="time duration between contiunous twice models saving.")
parser.add_argument("--log_interval", type=int,
default=1, help="time duration between contiunous twice log printing.")
parser.add_argument("--model_dir", type=str,
default='./trained_models/9191_fournights/run1/models', help="by default None. set the path to pretrained model.")
# eval parameters
parser.add_argument("--use_eval", action='store_true',
default=True, help="by default, do not start evaluation. If set`, start evaluation alongside with training.")
parser.add_argument("--eval_interval", type=int,
default=10, help="time duration between contiunous twice evaluation progress.")
parser.add_argument("--eval_episodes", type=int,
default=2, help="number of episodes of a single evaluation.")
parser.add_argument("--eval_render", type=bool,
default=False, help="visualizes the model every eval. works on top of the 'eval' setting in multiagent_rl_train.")
parser.add_argument("--visualise_delay", type=int,
default=1, help="numer of milliseconds to wait between steps when drawing eval render.")
parser.add_argument("--only_delay_middle", type=bool,
default=True, help="uses 0 delay for time < 150 and time > 350, and visualise delay for the middle")
# render parameters
parser.add_argument("--save_gifs", action='store_true',
default=False, help="by default, do not save render video. If set, save video.")
parser.add_argument("--use_render", action='store_true',
default=False, help="by default, do not render the env during training. If set, start render. Note: something, the environment has internal render process which is not controlled by this hyperparam.")
parser.add_argument("--render_episodes", type=int,
default=5, help="the number of episodes to render a given env")
parser.add_argument("--ifi", type=float,
default=0.1, help="the play interval of each rendered image in saved video.")
return parser
\ No newline at end of file
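For reference, a minimal usage sketch of this parser (the module name `config` and the CLI values below are assumptions, not part of the commit):

from config import get_config  # assumes the file above is saved as config.py

if __name__ == "__main__":
    parser = get_config()
    # parse_known_args lets env-specific runners append their own flags on top
    all_args = parser.parse_known_args(["--seed", "3", "--n_rollout_threads", "4"])[0]
    print(all_args.algorithm_name, all_args.seed, all_args.n_rollout_threads)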
agent_id:
    + 0.001*reward[agent_id]['position_error']
    - 0.001*reward[agent_id]['enemy_position_error']
    + 0.5*reward[agent_id]['share_radar_actions']
    + reward[agent_id]['time_step_aircrafts_detected']
    - 0.5*reward[agent_id]['enemy_time_step_radars_detected']
eval: same
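For illustration only, with made-up component values (position_error = 100, enemy_position_error = 50, share_radar_actions = 1, time_step_aircrafts_detected = 2, enemy_time_step_radars_detected = 1), the scalarized reward for that agent would be 0.001*100 - 0.001*50 + 0.5*1 + 2 - 0.5*1 = 2.05.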