ConfigFiles

Intro

Config files are used to store all of the settings desired when running a learning simulation.
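
Since a config file is just a JSON dictionary of settings, it can be read with any standard JSON parser and the resulting dictionary handed to the training code. The following is a minimal, hypothetical sketch of loading such a file; the path and the way the settings are consumed are placeholders rather than part of the framework's documented interface:

import json

def load_settings(config_path):
    # Load a learning-simulation config file into a plain dict.
    # The repeated "comment__" keys in the example below are documentation
    # only; json.load keeps just the last duplicate, so they do not
    # interfere with the actual settings.
    with open(config_path) as f:
        return json.load(f)

# Hypothetical usage; the file name is a placeholder.
settings = load_settings("path/to/config.json")
print(settings["agent_name"], settings["learning_rate"])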

Example:

{
    "comment__": "Type of model/network to use for the actor and critic",
"model_type": "Deep_NN",
    "comment__": "Learning algorithm to use",
"agent_name": "CACLA",
    "comment__": "Folder to store the training data in.",
"data_folder": "Simple_Walk_Flat/",
"comment": "initial probability of selecting a random action",
"epsilon": 0.75, 
"comment": "initial probability of selecting a discrete random action",
"omega": 0.3,
    "comment__": "Batch size used for learning",
"batch_size": 32,
    "comment__": "Learning rate for the actor/policy",
"learning_rate": 0.0001,
    "comment__": "Stupid file that should not be needed anymore...",
"anchor_file": "./learn/data/paperGibbonAnchors.json",
    "comment__": "Config file for the simulator",
"sim_config_file": "./args/biped2D/test_biped_args_flat.txt",
    "comment__": "A different config file for the forward dynamics simulator",
"forwardDynamics_config_file": "./args/biped2D/test_biped_args_flat.txt",
    "comment__": "Exploration rate use when randomly generating new actions",
"exploration_rate": 0.15,
    "comment__": "Number of rounds to perform before termination",
"rounds": 5000,
    "comment__": "Number of epochs to perform per round",
"epochs": 10,
    "comment__": "Number of epoch/episode to evaluate the policy over",
"eval_epochs": 10,
    "comment__": "Discount factor used during learning",
"discount_factor": 0.95,
    "comment__": "Should the training be plotted during learning",
"visualize_learning": true,
    "comment__": "Whether or not to save the plotted data while learning",
"save_trainData": true,
    "comment__": "Whether or not to train a forward dynamics model as well",
"train_forward_dynamics": false,
    "comment__": "Whether or not to plot the training curve while learning for the forward dynamics model",
"visulaize_forward_dynamics": false,
    "comment__": "Bounds used for scaling rewards for networks",
"reward_bounds": [[0.0],[1.0]],
    "comment__": "Max length of the Experience memory",
"expereince_length": 10000,
    "comment__": "Possible state bounds to be used for scaling states for networks",
"state_bounds": [[-3.14, 1.0,-8.05,-1.0,-1.0, -8.5,-5.0, -5.5, -5.0, -2.5],
				 [ 3.14, 8.5, 5.00, 1.0, 1.0, 10.0, 5.0, 15.0,  5.0, 20.0]],
    "comment__": "Action scalling values to be used to scale values for the network",
"action_bounds":    [[0.000000, 0.000000, -0.497252, -0.37, -3.0, -0.57, -0.37, -3.0, -0.57, -0.497252, -0.37, -3.0, -0.57, -0.37, -3.0, -0.57, -0.37, -3.0, -0.57, -0.37, -3.0, -0.57, -0.37, -3.0, -0.57, -0.37, -3.0, -0.57],
				     [1.251474, 2.500000,  0.497252,  2.87,  0.0,   1.0,  2.87,  0.0,  1.0,   0.497252,  2.87,  0.0,   1.0,  2.87,  0.0,  1.0,  2.87,  0.0,   1.0,  2.87,  0.0,  1.0, 2.87,  0.0,   1.0,  2.87,  0.0,  1.0]],
    "comment__": "Set of discrete actions that can be sampled from",				     
"discrete_actions": [[0.251474, 2.099796, -0.097252, -0.993935, 0.273527, 0.221481, 1.100288, -3.076833, 0.180141, -0.176967, 0.310372, -1.642646,   -0.406771, 1.240827, -1.773369, -0.508333, -0.063421, -2.091676, -1.418455, -1.242994, -0.262842, 0.453321, -0.366870, -1.494344, 0.794701, -1.408623, 0.655703, 0.634434],
		             [0.316504, 1.963316, -0.325309, -1.802222, 1.668542, 2.453011, 2.139391, -3.636978, -0.855670, -0.350402, -1.342939, 0.337384, -3.272340, 2.048047, -0.938193, -1.799840,   0.384958, -2.357088,    0.076791, -1.513792, -1.855033, 0.340609, -0.400898, -0.903512, 0.144128, 0.524855, -1.534278, 1.856180 ],
		             [0.289427, 1.994835, -0.254906, -1.167773, 0.275834, -0.197735, 2.298369, -3.272985, -0.179217, -0.530879, -0.362771, -0.690035, -0.558768, 1.806804, -1.341557, -0.688935, 0.337809, -1.773089,   -1.025832, -0.472297, -0.766728, 0.358616, -0.481754, -1.328176, 0.498767, -0.236538, -0.471444, 1.154247]],
    "comment__": "Is action space continuous or discrete?",
"action_space_continuous":true,
    "comment__": "Should the method train on the validation set only",
"train_on_validation_set":true,
    "comment__": "Name of the type of simulator to use",
"environment_type": "terrainRLFlatBiped2D",
    "comment__": "Model type to use for the forward dynamics model",
"forward_dynamics_predictor": "network",
    "comment__": "Method to be used for the forward dynamics model is the model types uses a simulator",
"sampling_method": "SequentialMC",
    "comment__": "Use the action suggested by the policy to start the sampling method.",
"use_actor_policy_action_suggestion": true,
    "comment__": "If selecting from a uniform distribution the number of regularly distant samples to take / action dimension",
"num_uniform_action_samples": 3,
    "comment__": "Number of steps ahead the actions should be sampled",
"look_ahead_planning_steps": 2,
    "comment__": "How often to update the training data and plots wrt # of rounds",
"plotting_update_freq_num_rounds": 2,
    "comment__": "How often to save the training data and plotting data",
"saving_update_freq_num_rounds": 2,
    "comment__": "Number of treads that can be run in parallel during training",
"num_available_threads": 5,
    "comment__": "Length of the queues used to pass simulation data between the simulation workers and the learning agent(s).",
"queue_size_limit": 100,
    "comment__": "Number of actions performed between training updates",
"sim_action_per_training_update": 8,
    "comment__": "Number of rounds of adaptive sampling",
"adaptive_samples": 5,
    "comment__": "Number of elite adaptive samples to keep between adaptive sampling rounds",
"num_adaptive_samples_to_keep": 50,
    "comment__": "Use the variance calculated from the policy network (calculated using dropout)",
"use_actor_policy_action_variance_suggestion": false,
    "comment__": "Method used for action exploration",
"exploration_method": "gaussian_random",
    "comment__": "Amount of dropout to use in the networks (if using a dropout network)",
"dropout_p": 0.1,
    "comment__": "Regularization weight for the policy network",
"regularization_weight": 0.0001,
    "comment__": "Some parameter for rmsprop stocastic gradient optimization method.",
"rho": 0.95,
    "comment__": "Some parameter for rmsprop stocastic gradient optimization method.",
"rms_epsilon": 0.001,
    "comment__": "Number of training updates before the target network is updated",
"steps_until_target_network_update": 1000,
    "comment__": "Initial factor epsilone in multiplied by (This value will slowly be reduced during training)",
"epsilon_annealing": 0.8,
    "comment__": "Different ways of calculating the scaling method used normalize the input and outputs of the network from the bootstrapping samples. minmax, input and output are -mean/max-min. variance, input and output are -mean/(std*2), given, use the bounds provided in this file",
"state_normalization": "variance",
"load_saved_model": false,
"critic_updates_per_actor_update": 5,
    "comment__": "weather or not to clamp actions to stay inside the action boundaries",
"clamp_actions_to_stay_inside_bounds": false,
    "comment__": "Number of initial actions to sample before calculating input/output scaling and starting to train.",
"bootsrap_samples": 500,
    "comment__": "What method to use to select actions during bootstrapping",
"bootsrap_with_discrete_policy": true,
    "comment__": "That max number of action that can be take before the end of an episode/epoch",
"max_epoch_length": 50,
    "comment__": "If reward is below this bound it will not be put in the Experience Buffer",
"reward_lower_bound": -0.5,
    "comment__": "Enable guided policy search. Uses MCMC sampling ahead in time to select the best action to keep",
"use_guided_policy_search" : false,
    "comment__": "The number of training updates to perform for every action that is simulated",
"training_updates_per_sim_action": 1,
    "comment__": "Use The forward dynamics simulator as a way of sampling suggested actions for exploration",
"use_sampling_exploration": false,
    "comment__": "Use the forward dyanmics model to perform action exploration wrt to V -> fd > delta Action gradients",
"use_model_based_action_optimization": false,
    "comment__": "Flag for policy evaluation to swap in the task network from one model and the character/robot network from another",
"use_transfer_task_network": false,
    "comment__": "Add a large cost to actions that are suggested to be outside the action bounds.",
"penalize_actions_outside_bounds": false,
    "comment__": "Network type to use for the forward dynamics model",
"forward_dynamics_model_type": "Deep_NN",
    "comment__": "Whether or not to save the Experience memory after bootstrapping",
"save_experience_memory": false,
    "comment__": "Whether or not to train the policy and critic?",
"train_rl_learning": true,
    "comment__": "Force the character to start each new action in a good state, close to a good solution",
"use_back_on_track_forcing": false,
    "comment__": "draw/render the next state suggested by the forward dynamics model",
"visualize_forward_dynamics": false,
    "comment__": "Learning rate for the forward dynamics model",
"fd_learning_rate": 0.01,
    "comment__": "Whether or not to train the policy. Used for debugging",
"train_actor": true,
    "comment__": "Plot the terms for the critic as well (regularization and td error)",
"debug_critic": false,
    "comment__": "critic regularization weight",
"critic_regularization_weight": 0.0001,
    "comment__": "Critic learning rate",
"critic_learning_rate": 0.001,
    "comment__": "During evaluation plot of value function",
"visualize_expected_value": true,
    "comment__": "exponential decay value for use in reward function",
"target_velocity_decay":-0.75,
    "comment__": "Target velocity for controller",
"target_velocity":3.0,
    "comment__": "NUmmber of terrain features for which convolutinoal filters should be used",
"num_terrain_features": 0,
    "comment__": "Initial tempurature for annealing of e-greedy exploration",
"initial_temperature": 20.0,
    "comment__": "epsilon lower limit",
"min_epsilon": 0.15,
    "comment__": "Whether or not to draw/render the simulation",
"shouldRender": false,
    "comment__": "Learning rate use for model based action exploration",
"action_learning_rate": 10.0,
    "comment__": "During model-based action exploration, Probability of random policy action",
"model_based_action_omega": 0.0
}
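
The "state_normalization" setting above controls how the scaling used to normalize the network inputs and outputs is computed from the bootstrapping samples (or taken from the bounds in this file). The sketch below only illustrates the three documented options, assuming normalization of the form (x - mean) / scale applied per dimension; the framework's actual implementation may differ in its details:

import numpy as np

def compute_scaling(samples, bounds, mode):
    # samples: bootstrapping states, shape (N, state_dim)
    # bounds:  the "state_bounds" pair [[low, ...], [high, ...]] from the config
    # mode:    the "state_normalization" setting: "minmax", "variance" or "given"
    samples = np.asarray(samples, dtype=np.float64)
    low, high = np.asarray(bounds[0]), np.asarray(bounds[1])
    if mode == "minmax":
        # Scale as (x - mean) / (max - min), computed from the samples
        mean = samples.mean(axis=0)
        scale = samples.max(axis=0) - samples.min(axis=0)
    elif mode == "variance":
        # Scale as (x - mean) / (2 * std), computed from the samples
        mean = samples.mean(axis=0)
        scale = 2.0 * samples.std(axis=0)
    elif mode == "given":
        # Use the bounds provided in the config file
        mean = (high + low) / 2.0
        scale = (high - low) / 2.0
    else:
        raise ValueError("Unknown state_normalization: " + str(mode))
    return mean, scale

def normalize(x, mean, scale):
    # Scale a state (or reward/action) before passing it to the network.
    return (np.asarray(x) - mean) / scale

With the "given" option, values that lie inside the configured bounds map roughly into [-1, 1], which is a common input range for neural networks.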
