diff --git a/src/agent.py b/src/agent.py
index 804733518c99544de59836fea65b8c9853db4d82..fa21642664b8d5765847251683b7d19a67cd2a24 100644
--- a/src/agent.py
+++ b/src/agent.py
@@ -40,6 +40,7 @@ class Memory():
 
 
     def add(self, transition):
+        """ Add a transition to the memory buffer; if full, overwrite the oldest transition """
         self.memory[self.curr_index % self.max_capacity] = transition
         self.curr_index = self.curr_index + 1
         self.size = min(self.curr_index, self.max_capacity)
@@ -215,6 +216,7 @@ class Agent():
 
 
     def get_velocity_state(self):
+        """ Return the robot's linear and angular velocities """
         odom = rospy.wait_for_message("/odometry/filtered", Odometry, timeout=5)
         linear = odom.twist.twist.linear.x
         angular = odom.twist.twist.angular.z
@@ -223,7 +225,7 @@ class Agent():
 
     def get_goal_state(self):
         """ Get goal state
-            Currently assumes flat ground (2D) """
+            Assumes flat ground (2D) """
         position, orientation = self.get_position()
         diff_x = self.goal_x - position.x
         diff_y = self.goal_y - position.y
@@ -275,20 +277,20 @@ class Agent():
             self.has_arrived = True
             print("GOAL REACHED!!!")
 
-        # Perform some scaling
+        # Normalize the values and set the state
         self.state = np.concatenate((laser_scan / self.max_distance, velocity,
                                      goal_state / np.array([self.max_goal_dist, np.pi])))
 
 
-    # TODO: Add OU noise and clip to velocity intervals
     def choose_action(self, state):
+        """ Pick an action for the given state """
+        # Pick the action from the actor; used during evaluation
         tf_action = self.network.actor(tf.expand_dims(tf.convert_to_tensor(state), 0))
         action = tf.squeeze(tf_action).numpy()
 
-        #action[0] = (action[0] + np.random.uniform(0.0, 1.0)) * 0.25
-        #action[1] = (action[1] + np.random.uniform(-0.5, 0.5)) * 0.335
-
-        #action = np.array([np.random.uniform(0.0, 0.25), np.random.uniform(-0.5, 0.5)])
+        # Add random noise for exploration; used during training
+        #action[0] = (action[0] + np.random.uniform(0.0, 1.0))
+        #action[1] = (action[1] + np.random.uniform(-0.5, 0.5))
 
         return action * np.array([0.35, 0.35])
 
@@ -309,6 +311,8 @@ class Agent():
 
 
     def train_step(self):
+        """ Run one update iteration of the networks """
+
         batch = self.memory.sample()
         states = batch[:,:15]
         actions = batch[:,15:17]
@@ -347,7 +351,7 @@ class Agent():
         done = self.has_arrived or self.has_crashed
         self.store_transition(state, action, reward, copy.deepcopy(self.state), done)
 
-        # Update Networks
+        # Update the networks; comment out during evaluation
         #self.train_step()
 
         return reward, done
diff --git a/src/networks.py b/src/networks.py
index 060f49287a71fece9097144b414bc925aa143f02..afb42879a59eee2fd4980cdcc389e2c15f0518f2 100644
--- a/src/networks.py
+++ b/src/networks.py
@@ -64,6 +64,7 @@ class ActorCritic():
 
         return model
 
+    # Update the critic and actor networks
     # Source: https://keras.io/examples/rl/ddpg_pendulum/
     def update_networks(self, states, actions, rewards, new_states, dones):
 
@@ -94,6 +95,8 @@ class ActorCritic():
 
         self.actor_optimizer.apply_gradients(zip(actor_grad, self.actor.trainable_variables))
 
+    # Update the target networks
+    # Source: https://github.com/philtabor/Youtube-Code-Repository/tree/master/ReinforcementLearning/PolicyGradient/DDPG/tensorflow2/pendulum
     def update_target_networks(self):
         """ Updates target actor and critic """
 
@@ -124,35 +127,3 @@ class ActorCritic():
         self.target_actor.load_weights(TARGET_ACTOR_FILE_PATH)
         self.target_critic.load_weights(TARGET_CRITIC_FILE_PATH)
         print("Weights loaded.")
-
-
-if __name__ == "__main__":
-
-    # Test code
-    ac = ActorCritic(12, 0.005, 1)
-
-    # Load weights
-    ac.load_weights_from_file()
-
-    # Toy state
-    states = np.array([[0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0],
-                       [12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0],
-                       [24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0]])
-
-    actions = np.array([[0.1, 0.2], [0.3, 0.4], [0.5, 0.6]])
-
-    # Forward propagate actor and critic networks
-    actions = ac.actor(states)
-    y = ac.critic([states, actions])
-
-    # Test target updates
-    ac.update_target_networks()
-
-    # Save weights
-    ac.save_weights_to_file()
-
-    print(y)
-    print("PASS")
-
-
-
diff --git a/src/run.py b/src/run.py
index 8aef7892622b2043e7754dd3529ca78257254421..0f9c9993c06a06f9838056a6a78159b8bdeac99b 100644
--- a/src/run.py
+++ b/src/run.py
@@ -12,7 +12,7 @@ episode_rewards = []
 
 
 """
-# Pre training
+# Pre-training; comment out when finished
 for i in range(1000):
     agent.train_step()
     print("Step: {}".format(i))
@@ -21,6 +21,7 @@ agent.save_weights()
 
 """
 
+# Main episode loop
 for episode in range(1, NUM_EPISODES + 1):
 
     episode_reward = 0.0
@@ -36,12 +37,15 @@ for episode in range(1, NUM_EPISODES + 1):
             break
 
     episode_rewards.append(episode_reward)
+
+    # Moving average of the rewards over the past 40 episodes
    avg_reward = np.mean(episode_rewards[-min(40, episode):])
    print("End of episode:", episode)
    print("Average reward:", avg_reward)
    print()
 
    """
+    # Save memory and weights every 3 episodes; comment out during evaluation
    if episode % 3 == 0:
        agent.memory.save_memory_to_file()
        agent.save_weights()