diff --git a/crowbar_framework/app/controllers/barclamp_controller.rb b/crowbar_framework/app/controllers/barclamp_controller.rb
index 8646d99100..59e75a9678 100644
--- a/crowbar_framework/app/controllers/barclamp_controller.rb
+++ b/crowbar_framework/app/controllers/barclamp_controller.rb
@@ -569,6 +569,8 @@ def proposal_update
           "crowbar-deep-merge-template"
         )
       )
+      # See FIXME in ServiceObject.apply_role
+      message = "" if code == 202

       respond_to do |format|
         case code
@@ -623,7 +625,10 @@ def proposal_update
         @proposal["attributes"][params[:barclamp]] = JSON.parse(params[:proposal_attributes])
         @proposal["deployment"][params[:barclamp]] = JSON.parse(params[:proposal_deployment])
         @service_object.save_proposal!(@proposal)
-        answer = @service_object.proposal_commit(params[:name])
+        # validation already happened on save
+        answer = @service_object.proposal_commit(params[:name],
+          validate: false,
+          validate_after_save: false)
         flash[:alert] = answer[1] if answer[0] >= 400
         flash[:notice] = answer[1] if answer[0] >= 300 and answer[0] < 400
         flash[:notice] = t("barclamp.proposal_show.commit_proposal_success") if answer[0] == 200
diff --git a/crowbar_framework/app/models/crowbar_service.rb b/crowbar_framework/app/models/crowbar_service.rb
index 786ebd9b5b..8b23b9efb8 100644
--- a/crowbar_framework/app/models/crowbar_service.rb
+++ b/crowbar_framework/app/models/crowbar_service.rb
@@ -45,7 +45,12 @@ def role_constraints
   # Unfortunatelly we need to explicitely look at crowbar-status of the proposal
   # because apply_role from this model ignores errors from superclass's apply_role.
   def commit_and_check_proposal
-    answer = proposal_commit("default", false, false)
+    answer = proposal_commit(
+      "default",
+      in_queue: false,
+      validate: false,
+      validate_after_save: false
+    )
     # check if error message is saved in one of the nodes
     if answer.first != 200
       found_errors = []
@@ -390,7 +395,7 @@ def prepare_nodes_for_crowbar_upgrade
     proposal.raw_data["deployment"]["crowbar"]["elements"]["crowbar-upgrade"] = nodes_to_upgrade
     proposal.save
     # commit the proposal so chef recipe get executed
-    proposal_commit("default", false, false)
+    proposal_commit("default", in_queue: false, validate_after_save: false)
   end

   def disable_non_core_proposals
@@ -492,7 +497,7 @@ def revert_nodes_from_crowbar_upgrade

     # commit current proposal (with the crowbar-upgrade role still assigned to nodes),
     # so the recipe is executed when nodes have 'ready' state
-    proposal_commit("default", false, false)
+    proposal_commit("default", in_queue: false, validate: false, validate_after_save: false)
     # now remove the nodes from upgrade role
     proposal["deployment"]["crowbar"]["elements"]["crowbar-upgrade"] = []
     proposal.save
@@ -569,7 +574,7 @@ def apply_role (role, inst, in_queue)
         else
           unless answer[1].include?(id)
             @logger.debug("Crowbar apply_role: #{k}.#{id} wasn't active: Activating")
-            answer = service.proposal_commit(id, false, false)
+            answer = service.proposal_commit(id, in_queue: false, validate_after_save: false)
             if answer[0] != 200
               answer[1] = "Failed to commit proposal '#{id}' for '#{k}' " +
                 "(The error message was: #{answer[1].strip})"
diff --git a/crowbar_framework/app/models/network_service.rb b/crowbar_framework/app/models/network_service.rb
index 94c79a633a..d3a466f65d 100644
--- a/crowbar_framework/app/models/network_service.rb
+++ b/crowbar_framework/app/models/network_service.rb
@@ -122,8 +122,10 @@ def allocate_ip_by_type(bc_instance, network, range, object, type, suggestion =

     if type == :node
       # Save the information.
-      node.crowbar["crowbar"]["network"][network] = net_info
-      node.save
+      if node.crowbar["crowbar"]["network"][network] != net_info
+        node.crowbar["crowbar"]["network"][network] = net_info
+        node.save
+      end
     end

     @logger.info("Network allocate ip for #{type}: Assigned: #{name} #{network} #{range} #{net_info["address"]}")
diff --git a/crowbar_framework/app/models/service_object.rb b/crowbar_framework/app/models/service_object.rb
index 99d30c513a..1b973d8196 100644
--- a/crowbar_framework/app/models/service_object.rb
+++ b/crowbar_framework/app/models/service_object.rb
@@ -539,10 +539,11 @@ def proposal_delete(inst)
   # FIXME: most of these can be validations on the model itself,
   # preferrably refactored into Validator classes.
   def save_proposal!(prop, options = {})
-    options.reverse_merge!(validate_after_save: true)
+    options.reverse_merge!(validate: true, validate_after_save: true)
     clean_proposal(prop.raw_data)
-    validate_proposal(prop.raw_data)
-    validate_proposal_elements(prop.elements)
+    validate_proposal(prop.raw_data) if options[:validate]
+    validate_proposal_elements(prop.elements) if options[:validate]
+    prop.latest_applied = false
     prop.save
     validate_proposal_after_save(prop.raw_data) if options[:validate_after_save]
   end
@@ -550,7 +551,13 @@ def save_proposal!(prop, options = {})
   # XXX: this is where proposal gets copied into a role, scheduling / ops order
   # is computed (in apply_role) and chef client gets called on the nodes.
   # Hopefully, this will get moved into a background job.
-  def proposal_commit(inst, in_queue = false, validate_after_save = true)
+  def proposal_commit(inst, options = {})
+    options.reverse_merge!(
+      in_queue: false,
+      validate: true,
+      validate_after_save: true
+    )
+
     prop = Proposal.where(barclamp: @bc_name, name: inst).first

     if prop.nil?
@@ -562,8 +569,10 @@ def proposal_commit(inst, in_queue = false, validate_after_save = true)
     begin
       # Put mark on the wall
       prop["deployment"][@bc_name]["crowbar-committing"] = true
-      save_proposal!(prop, validate_after_save: validate_after_save)
-      response = active_update prop.raw_data, inst, in_queue
+      save_proposal!(prop,
+        validate: options[:validate],
+        validate_after_save: options[:validate_after_save])
+      response = active_update(prop.raw_data, inst, options[:in_queue])
     rescue Chef::Exceptions::ValidationFailed => e
       @logger.error ([e.message] + e.backtrace).join("\n")
       response = [400, "Failed to validate proposal: #{e.message}"]
@@ -923,6 +932,13 @@ def self.proposal_to_role(proposal, bc_name)
   def apply_role(role, inst, in_queue)
     @logger.debug "apply_role(#{role.name}, #{inst}, #{in_queue})"

+    # Variables used in the global ensure
+    apply_locks = []
+    applying_nodes = []
+
+    # Cache some node attributes to avoid useless node reloads
+    node_attr_cache = {}
+
     # Part I: Looking up data & checks
     #
     # we look up the role in the database (if there is one), the new one is
@@ -953,7 +969,13 @@
     #
     deps = proposal_dependencies(role)
     delay, pre_cached_nodes = queue_proposal(inst, element_order, new_elements, deps)
-    return [202, delay] unless delay.empty?
+    unless delay.empty?
+      # force not processing the queue further
+      in_queue = true
+      # FIXME: this breaks the convention that we return a string; but really,
+      # we should return a hash everywhere, to avoid this...
+      return [202, delay]
+    end

     @logger.debug "delay empty - running proposal"

@@ -965,7 +987,6 @@
         @logger.fatal("apply_role: Failed to expand items #{failures.inspect} for role \"#{role_name}\"")
         message = "Failed to apply the proposal: cannot expand list of nodes for role \"#{role_name}\", following items do not exist: #{failures.join(", ")}"
         update_proposal_status(inst, "failed", message)
-        process_queue unless in_queue
         return [405, message]
       end
     end
@@ -1005,6 +1026,9 @@
     role_map = new_deployment["element_states"]
     role_map = {} unless role_map

+    # List of all *new* nodes which will be changed (sans deleted ones)
+    all_nodes = new_elements.values.flatten
+
     # deployment["element_order"] tells us which order the various
     # roles should be applied, and deployment["elements"] tells us
     # which nodes each role should be applied to. We need to "join
@@ -1022,16 +1046,9 @@
     # }
     pending_node_actions = {}

-    # List of all *new* nodes which will be changed (sans deleted ones)
-    all_nodes = []
-
     # We'll build an Array where each item represents a batch of work,
     # and the batches must be performed sequentially in this order.
-    # This will mirror the ordering specified by element_order below,
-    # but the sub-arrays of run_order will be Arrays of names of the
-    # involved nodes, whereas the sub-arrays of element_order are Arrays
-    # of names of Chef roles.
-    run_order = []
+    batches = []

     # get proposal to remember potential removal of a role
     proposal = Proposal.where(barclamp: @bc_name, name: inst).first
@@ -1042,7 +1059,7 @@
     element_order.each do |roles|
       # roles is an Array of names of Chef roles which can all be
       # applied in parallel.
-      @logger.debug "elems #{roles.inspect}"
+      @logger.debug "Preparing batch with following roles: #{roles.inspect}"

       # A list of nodes changed when applying roles from this batch
       nodes_in_batch = []
@@ -1055,9 +1072,9 @@
         old_nodes = old_elements[role_name] || []
         new_nodes = new_elements[role_name] || []

-        @logger.debug "role_name #{role_name.inspect}"
-        @logger.debug "old_nodes #{old_nodes.inspect}"
-        @logger.debug "new_nodes #{new_nodes.inspect}"
+        @logger.debug "Preparing role #{role_name} for batch:"
+        @logger.debug "  Nodes in old applied proposal for role: #{old_nodes.inspect}"
+        @logger.debug "  Nodes in new applied proposal for role: #{new_nodes.inspect}"

         remove_role_name = "#{role_name}_remove"

@@ -1072,17 +1089,15 @@
         use_remove_role = !tmprole.nil?

         old_nodes.each do |node_name|
-          node = NodeObject.find_node_by_name(node_name)
+          pre_cached_nodes[node_name] ||= NodeObject.find_node_by_name(node_name)

           # Don't add deleted nodes to the run order, they clearly won't have
           # the old role
-          if node.nil?
+          if pre_cached_nodes[node_name].nil?
             @logger.debug "skipping deleted node #{node_name}"
             next
           end

-          pre_cached_nodes[node_name] = node
-
           # An old node that is not in the new deployment, drop it
           unless new_nodes.include?(node_name)
             @logger.debug "remove node #{node_name}"
@@ -1113,7 +1128,7 @@
           # If new_nodes is empty, we are just removing the proposal.
           unless new_nodes.empty?
             new_nodes.each do |node_name|
-              node = NodeObject.find_node_by_name(node_name)
+              pre_cached_nodes[node_name] ||= NodeObject.find_node_by_name(node_name)

               # Don't add deleted nodes to the run order
               #
@@ -1126,15 +1141,11 @@
               # have some alias that be used to assign all existing nodes to a
               # role (which would be an improvement over the requirement to
               # explicitly list all nodes).
-              if node.nil?
+              if pre_cached_nodes[node_name].nil?
                 @logger.debug "skipping deleted node #{node_name}"
                 next
               end

-              pre_cached_nodes[node_name] = node
-
-              all_nodes << node_name unless all_nodes.include?(node_name)
-
               # A new node that we did not know before
               unless old_nodes.include?(node_name)
                 @logger.debug "add node #{node_name}"
@@ -1146,15 +1157,23 @@
         end
       end # roles.each

-      @logger.debug "nodes_in_batch #{nodes_in_batch.inspect}"
-      run_order << nodes_in_batch unless nodes_in_batch.empty?
-      @logger.debug "run_order #{run_order.inspect}"
+      batches << nodes_in_batch unless nodes_in_batch.empty?
+    end
+    @logger.debug "batches: #{batches.inspect}"
+
+    # Cache attributes that are useful later on
+    pre_cached_nodes.each do |node_name, node|
+      node_attr_cache[node_name] = {
+        "alias" => node.alias,
+        "windows" => node[:platform_family] == "windows",
+        "admin" => node.admin?
+      }
     end

     # save databag with the role removal intention
     proposal.save if save_proposal

-    applying_nodes = run_order.flatten.uniq.sort
+    applying_nodes = batches.flatten.uniq.sort

     # Mark nodes as applying; beware that all_nodes do not contain nodes that
     # are actually removed.
@@ -1168,9 +1187,7 @@
     # pause-file.lock exists which the daemons will honour due to a
     # custom patch:
     nodes_to_lock = applying_nodes.reject do |node_name|
-      pre_cached_nodes[node_name] ||= NodeObject.find_node_by_name(node_name)
-      node = pre_cached_nodes[node_name]
-      node[:platform_family] == "windows" || node.admin?
+      node_attr_cache[node_name]["windows"] || node_attr_cache[node_name]["admin"]
     end

     begin
@@ -1191,8 +1208,6 @@
     rescue Crowbar::Error::LockingFailure => e
       message = "Failed to apply the proposal: #{e.message}"
       update_proposal_status(inst, "failed", message)
-      restore_to_ready(applying_nodes)
-      process_queue unless in_queue
       return [409, message] # 409 is 'Conflict'
     end

@@ -1221,7 +1236,7 @@

     # Part III: Update run lists of nodes to reflect new deployment. I.e. write
     # through the deployment schedule in pending node actions into run lists.
-    @logger.debug "Clean the run_lists for #{pending_node_actions.inspect}"
+    @logger.debug "Update the run_lists for #{pending_node_actions.inspect}"

     admin_nodes = []

@@ -1239,9 +1254,6 @@
       rlist = lists[:remove]
       alist = lists[:add]

-      @logger.debug "rlist #{rlist.pretty_inspect}"
-      @logger.debug "alist #{alist.pretty_inspect}"
-
       # Remove the roles being lost
       rlist.each do |item|
         next unless node.role? item
@@ -1280,7 +1292,6 @@
         end
       end

-      @logger.debug("AR: Saving node #{node.name}") if save_it
       node.save if save_it
     end

@@ -1298,26 +1309,24 @@
       @logger.fatal("apply_role: Exception #{e.message} #{e.backtrace.join("\n")}")
       message = "Failed to apply the proposal: exception before calling chef (#{e.message})"
       update_proposal_status(inst, "failed", message)
-      restore_to_ready(applying_nodes)
-      process_queue unless in_queue
       return [405, message]
     end

+    ran_admin = false
+
     # Invalidate cache as apply_role_pre_chef_call can save nodes
     pre_cached_nodes = {}

     # Each batch is a list of nodes that can be done in parallel.
-    ran_admin = false
-    run_order.each do |batch|
-      next if batch.empty?
-      @logger.debug "batch #{batch.inspect}"
+    batches.each do |nodes|
+      next if nodes.empty?
+      @logger.debug "Applying batch: #{nodes.inspect}"

       non_admin_nodes = []
       admin_list = []
-      batch.each do |node_name|
+      nodes.each do |node_name|
         # Run admin nodes a different way.
         if admin_nodes.include?(node_name)
-          @logger.debug "#{node_name} is in admin_nodes #{admin_nodes.inspect}"
           admin_list << node_name
           ran_admin = true
           next
@@ -1325,9 +1334,6 @@
         non_admin_nodes << node_name
       end

-      @logger.debug("AR: Calling chef-client for #{role.name} on non-admin nodes #{non_admin_nodes.join(" ")}")
-      @logger.debug("AR: Calling chef-client for #{role.name} on admin nodes #{admin_list.join(" ")}")
-
       #
       # XXX: We used to do this twice - do we really need twice???
       # Yes! We do! The system has some transient issues that are hidden
@@ -1336,14 +1342,14 @@
       #
       pids = {}
       unless non_admin_nodes.empty?
+        @logger.debug(
+          "AR: Calling chef-client for #{role.name} on non-admin nodes #{non_admin_nodes.join(" ")}"
+        )
         non_admin_nodes.each do |node|
-          pre_cached_nodes[node] ||= NodeObject.find_node_by_name(node)
-          nobj = pre_cached_nodes[node]
-          unless nobj[:platform_family] == "windows"
-            filename = "#{ENV['CROWBAR_LOG_DIR']}/chef-client/#{node}.log"
-            pid = run_remote_chef_client(node, "chef-client", filename)
-            pids[pid] = node
-          end
+          next if node_attr_cache[node]["windows"]
+          filename = "#{ENV['CROWBAR_LOG_DIR']}/chef-client/#{node}.log"
+          pid = run_remote_chef_client(node, "chef-client", filename)
+          pids[pid] = node
         end
         status = Process.waitall
         badones = status.select { |x| x[1].exitstatus != 0 }
@@ -1367,14 +1373,15 @@
             message = message + "#{pids[baddie[0]]} \n"+ get_log_lines("#{pids[baddie[0]]}")
           end
           update_proposal_status(inst, "failed", message)
-          restore_to_ready(applying_nodes)
-          process_queue unless in_queue
           return [405, message]
         end
       end
     end

     unless admin_list.empty?
+      @logger.debug(
+        "AR: Calling chef-client for #{role.name} on admin nodes #{admin_list.join(" ")}"
+      )
       admin_list.each do |node|
         filename = "#{ENV['CROWBAR_LOG_DIR']}/chef-client/#{node}.log"
         pid = run_remote_chef_client(node, Rails.root.join("..", "bin", "single_chef_client.sh").expand_path.to_s, filename)
@@ -1402,8 +1409,6 @@
           message = message + "#{pids[baddie[0]]} \n "+ get_log_lines("#{pids[baddie[0]]}")
         end
         update_proposal_status(inst, "failed", message)
-        restore_to_ready(applying_nodes)
-        process_queue unless in_queue
         return [405, message]
       end
     end
@@ -1420,8 +1425,6 @@
       @logger.fatal("apply_role: Exception #{e.message} #{e.backtrace.join("\n")}")
       message = "Failed to apply the proposal: exception after calling chef (#{e.message})"
       update_proposal_status(inst, "failed", message)
-      restore_to_ready(applying_nodes)
-      process_queue unless in_queue
       return [405, message]
     end

@@ -1456,11 +1459,16 @@
     proposal.save unless roles_to_remove.empty?

     update_proposal_status(inst, "success", "")
-    restore_to_ready(applying_nodes)
-    process_queue unless in_queue
     [200, {}]
+  rescue StandardError => e
+    @logger.fatal("apply_role: Uncaught exception #{e.message} #{e.backtrace.join("\n")}")
+    message = "Failed to apply the proposal: uncaught exception (#{e.message})"
+    update_proposal_status(inst, "failed", message)
+    [405, message]
   ensure
-    apply_locks.each(&:release) if apply_locks
+    apply_locks.each(&:release) if apply_locks.any?
+    restore_to_ready(applying_nodes) if applying_nodes.any?
+    process_queue unless in_queue
   end

   def apply_role_pre_chef_call(old_role, role, all_nodes)
diff --git a/crowbar_framework/lib/crowbar/deployment_queue.rb b/crowbar_framework/lib/crowbar/deployment_queue.rb
index 149c5443d4..029438c60b 100644
--- a/crowbar_framework/lib/crowbar/deployment_queue.rb
+++ b/crowbar_framework/lib/crowbar/deployment_queue.rb
@@ -202,8 +202,12 @@ def commit_proposal(bc, inst)
       service = eval("#{bc.camelize}Service.new logger")

       # This will call apply_role and chef-client.
-      # Params: (inst, in_queue, validate_after_save)
-      status, message = service.proposal_commit(inst, true, false)
+      status, message = service.proposal_commit(
+        inst,
+        in_queue: true,
+        validate: false,
+        validate_after_save: false
+      )

       logger.debug("process queue: committed item #{bc}:#{inst}: results = #{message.inspect}")
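For reference, a minimal sketch of how callers interact with the reworked options-hash signature of `proposal_commit`. The `FakeService` class, its return value, and the standalone `require` are illustrative assumptions only; inside the Rails app the real defaults and `reverse_merge!` behave as shown in the patch above.

```ruby
require "active_support/core_ext/hash/reverse_merge"

# Hypothetical stand-in for ServiceObject, shown only to illustrate the new
# calling convention; the real method also loads, saves and applies the proposal.
class FakeService
  def proposal_commit(inst, options = {})
    # Same defaults as in the patch: callers override only what they need.
    options.reverse_merge!(
      in_queue: false,
      validate: true,
      validate_after_save: true
    )
    [200, "would commit '#{inst}' with #{options.inspect}"]
  end
end

service = FakeService.new
# One of the old positional calls, proposal_commit("default", false, false),
# becomes, for example:
service.proposal_commit("default", in_queue: false, validate_after_save: false)
# The defaults still apply when only the instance name is given:
service.proposal_commit("default")
```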