diff --git a/hack/setup-resourcemgm-server.sh b/hack/setup-resourcemgm-server.sh new file mode 100644 index 00000000..fb2a462f --- /dev/null +++ b/hack/setup-resourcemgm-server.sh @@ -0,0 +1,28 @@ +#!/usr/bin/env bash + +# Convenience script to set up a fresh Linux installation for the resource management service. + +set -o errexit +set -o nounset +set -o pipefail + +echo "This script installs the prerequisites of the resource management service" +echo "on a fresh Linux installation." + +GOLANG_VERSION=${GOLANG_VERSION:-"1.17.11"} + +echo "Update apt." +sudo apt-get -y update + +echo "Install jq." +sudo apt-get -y install jq + +echo "Install golang." +wget https://dl.google.com/go/go${GOLANG_VERSION}.linux-amd64.tar.gz -P /tmp +sudo tar -C /usr/local -xzf /tmp/go${GOLANG_VERSION}.linux-amd64.tar.gz +echo 'export PATH=/usr/local/go/bin:$PATH' >>~/.bash_profile +source ~/.bash_profile + +echo "Done." +echo "Please run 'export PATH=\$PATH:/usr/local/go/bin' or add it to your shell profile." +echo "You can proceed to run ./setup/grs-up.sh if you want to start the resource management service." diff --git a/resource-management/setup/gce/config-default.sh b/resource-management/setup/gce/config-default.sh new file mode 100644 index 00000000..1489bb6a --- /dev/null +++ b/resource-management/setup/gce/config-default.sh @@ -0,0 +1,63 @@ +#!/usr/bin/env bash + +# gcloud multiplexing for shared GCE/GKE tests. GRS_ROOT=$(dirname "${BASH_SOURCE[0]}")/../.. + +# Default GCE config GCLOUD=gcloud ZONE=${GRS_GCE_ZONE:-us-central1-b} REGION=${ZONE%-*} RELEASE_REGION_FALLBACK=${RELEASE_REGION_FALLBACK:-false} + +NETWORK=${GRS_GCE_NETWORK:-default} CREATE_CUSTOM_NETWORK=${CREATE_CUSTOM_NETWORK:-false} +# Enable network deletion by default, unless we're using 'default' network.
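+# (The shared 'default' network is left in place by default so that tearing the service down does not remove a network other resources in the project may still depend on; export GRS_DELETE_NETWORK=true to override.)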
+if [[ "${NETWORK}" == "default" ]]; then + GRS_DELETE_NETWORK=${GRS_DELETE_NETWORK:-false} +else + GRS_DELETE_NETWORK=${GRS_DELETE_NETWORK:-true} +fi +if [[ "${CREATE_CUSTOM_NETWORK}" == true ]]; then + SUBNETWORK="${SUBNETWORK:-${NETWORK}-custom-subnet}" +fi + + +#common config +GOLANG_VERSION=${GOLANG_VERSION:-"1.17.11"} +REDIS_VERSION=${REDIS_VERSION:-"6:7.0.0-1rl1~focal1"} +INSTANCE_PREFIX="${GRS_INSTANCE_PREFIX:-grs}" +SERVER_NAME="${INSTANCE_PREFIX}-server" +SIM_INSTANCE_PREFIX="${INSTANCE_PREFIX}-sim" +GCI_VERSION="ubuntu-2004-focal-v20220701" +GCE_PROJECT="ubuntu-os-cloud" +GCE_IMAGE="ubuntu-2004-focal-v20220701" +ENABLE_IP_ALIASES=${ENABLE_IP_ALIASES:-false} + +#Region simulator config +SIM_SIZE=${SIM_SIZE:-n1-standard-8} +NUM_SIMS=${NUM_SIMS:-5} +SIM_DISK_TYPE=pd-standard +SIM_DISK_SIZE=${SIM_DISK_SIZE:-"100GB"} +SIM_ROOT_DISK_SIZE=${SIM_ROOT_DISK_SIZE:-"20GB"} +SIM_OS_DISTRIBUTION=${SIM_OS_DISTRIBUTION:-gci} +SIM_LOG_LEVEL=${SIM_LOG_LEVEL:-"--v=4"} +SIM_REGION_NAME=${SIM_REGION_NAME:-"Beijing"} +SIM_RP_NUM=${SIM_RP_NUM:-10} +SIM_NODES_PER_RP=${SIM_NODES_PER_RP:-20000} +GCE_SIM_PROJECT=${GCE_PROJECT:-"ubuntu-os-cloud"} +GCE_SIM_IMAGE=${GCE_IMAGE:-"ubuntu-2004-focal-v20220701"} +SIM_TAG="${INSTANCE_PREFIX}-sim" + +#Resource manager server config +SERVER_SIZE=${SERVER_SIZE:-n1-standard-32} +SERVER_DISK_TYPE=pd-ssd +SERVER_DISK_SIZE=${SERVER_DISK_SIZE:-"200GB"} +SERVER_ROOT_DISK_SIZE=${SERVER_ROOT_DISK_SIZE:-"20GB"} +SERVER_OS_DISTRIBUTION=${SERVER_OS_DISTRIBUTION:-gci} +SERVER_LOG_LEVEL=${SERVER_LOG_LEVEL:-"--v=4"} +GCE_SERVER_PROJECT=${GCE_PROJECT:-"ubuntu-os-cloud"} +GCE_SERVER_IMAGE=${GCE_IMAGE:-"ubuntu-2004-focal-v20220701"} +SERVER_TAG="${INSTANCE_PREFIX}-server" +RESOURCE_URLS=${RESOURCE_URLS:-} + + \ No newline at end of file diff --git a/resource-management/setup/gce/configure.sh b/resource-management/setup/gce/configure.sh new file mode 100644 index 00000000..43386f7d --- /dev/null +++ b/resource-management/setup/gce/configure.sh @@ -0,0 +1,252 @@ +#!/usr/bin/env bash + +set -o errexit +set -o nounset +set -o pipefail + + +# Use --retry-connrefused opt only if it's supported by curl. +CURL_RETRY_CONNREFUSED="" +if curl --help | grep -q -- '--retry-connrefused'; then + CURL_RETRY_CONNREFUSED='--retry-connrefused' +fi + + + +function validate-python { + local ver=$(python3 -c"import sys; print(sys.version_info.major)") + echo "python3 version: $ver" + if [[ $ver -ne 3 ]]; then + apt-get -y update + apt-get install -y python3 + apt-get install -y python3-pip + pip install pyyaml + else + echo "python3: $ver is running.." + fi +} + +function download-server-env { + # Fetch server-env from GCE metadata server. + ( + umask 077 + local -r tmp_server_env="/tmp/server-env.yaml" + curl --fail --retry 5 --retry-delay 3 ${CURL_RETRY_CONNREFUSED} --silent --show-error \ + -H "X-Google-Metadata-Request: True" \ + -o "${tmp_server_env}" \ + http://metadata.google.internal/computeMetadata/v1/instance/attributes/server-env + # Convert the yaml format file into a shell-style file. + eval $(python3 -c ''' +import pipes,sys,yaml +items = yaml.load(sys.stdin, Loader=yaml.BaseLoader).items() +for k, v in items: + print("readonly {var}={value}".format(var = k, value = pipes.quote(str(v)))) +''' < "${tmp_server_env}" > "${SERVER_HOME}/server-env") + rm -f "${tmp_server_env}" + ) +} + +# Get default service account credentials of the VM. 
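+# The metadata server returns an OAuth2 access token for the VM's default service account.
+# A caller could use it as a bearer token, e.g. (hypothetical usage, not invoked elsewhere in this script):
+#   curl -H "Authorization: Bearer $(get-credentials)" <Google Cloud API URL>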
+GCE_METADATA_INTERNAL="http://metadata.google.internal/computeMetadata/v1/instance" +function get-credentials { + curl --fail --retry 5 --retry-delay 3 ${CURL_RETRY_CONNREFUSED} --silent --show-error "${GCE_METADATA_INTERNAL}/service-accounts/default/token" -H "Metadata-Flavor: Google" -s | python3 -c \ + 'import sys; import json; print(json.loads(sys.stdin.read())["access_token"])' +} + +# intall-redis +function install-redis { + local -r version="$1" + if [ `uname -s` == "Linux" ]; then + LINUX_OS=`uname -v |awk -F'-' '{print $2}' |awk '{print $1}'` + if [ "$LINUX_OS" == "Ubuntu" ]; then + UBUNTU_VERSION_ID=`grep VERSION_ID /etc/os-release |awk -F'"' '{print $2}'` + + echo "1. Install Redis on Ubuntu ......" + REDIS_GPG_FILE=/usr/share/keyrings/redis-archive-keyring.gpg + if [ -f $REDIS_GPG_FILE ]; then + rm -f $REDIS_GPG_FILE + fi + curl -fsSL https://packages.redis.io/gpg | gpg --dearmor -o $REDIS_GPG_FILE + + echo "deb [signed-by=/usr/share/keyrings/redis-archive-keyring.gpg] https://packages.redis.io/deb $(lsb_release -cs) main" | tee /etc/apt/sources.list.d/redis.list + + apt-get update + if [ "$UBUNTU_VERSION_ID" != "20.04" ]; then + echo "The Ubuntu $UBUNTU_VERSION_ID is not currently supported and exit" + return + fi + + echo "Purge existing version of Redis ......" + apt-get purge redis -y + apt-get purge redis-server -y + apt-get purge redis-tools -y + + echo "Install Redis 7.0.0 ......" + apt-get install redis-tools=$version + apt-get install redis-server=$version + apt-get install redis=$version + echo "End to install on Ubuntu ......" + + echo "" + echo "2. Enable and Run Redis ......" + echo "===============================" + REDIS_CONF_Ubuntu=/etc/redis/redis.conf + ls -alg $REDIS_CONF_Ubuntu + + sed -i -e "s/^supervised auto$/supervised systemd/g" $REDIS_CONF_Ubuntu + egrep -v "(^#|^$)" $REDIS_CONF_Ubuntu |grep "supervised " + + sed -i -e "s/^appendonly no$/appendonly yes/g" $REDIS_CONF_Ubuntu + egrep -v "(^#|^$)" $REDIS_CONF_Ubuntu |egrep "(appendonly |appendfsync )" + + ls -al /lib/systemd/system/ |grep redis + + systemctl restart redis-server.service + systemctl status redis-server.service + else + echo "" + echo "This Linux OS ($LinuxOS) is currently not supported and exit" + return + fi + else + echo "" + echo "only ubuntu is currently supported" + return + fi + + echo "" + echo "Sleeping for 5 seconds after Redis installation ......" + sleep 5 + + echo "" + echo "3. Simply Test Redis ......" + echo "===============================" + which redis-cli + echo "3.1) Test ping ......" + redis-cli ping + + echo "" + echo "3.2) Test write key and value ......" + redis-cli << EOF +SET server:name "fido" +GET server:name +EOF + + echo "" + echo "3.3) Test write queue ......" + redis-cli << EOF +lpush demos redis-macOS-demo +rpop demos +EOF + + echo "" + echo "Sleep 5 seconds after Redis tests ..." + sleep 5 + + # Redis Persistence Options: + # + # 1.Redis Database File (RDB) persistence takes snapshots of the database at intervals corresponding to the save directives in the redis.conf file. The redis.conf file contains three default intervals. RDB persistence generates a compact file for data recovery. However, any writes since the last snapshot is lost. + + # 2. Append Only File (AOF) persistence appends every write operation to a log. Redis replays these transactions at startup to restore the database state. You can configure AOF persistence in the redis.conf file with the appendonly and appendfsync directives. This method is more durable and results in less data loss. 
Redis frequently rewrites the file so it is more concise, but AOF persistence results in larger files, and it is typically slower than the RDB approach. + + echo "" + echo "************************************************************" + echo "* *" + echo "* Redis Server has been installed and configured successfully *" + echo "* *" + echo "************************************************************" +} + +function setup-server-env { + golang_version=${GOLANG_VERSION:-"1.17.11"} + redis_version=${REDIS_VERSION:-"6:7.0.0-1rl1~focal1"} + echo "Update apt." + apt-get -y update + + echo "Install jq." + apt-get -y install jq + + install-golang + + + install-redis ${redis_version} +} + +function install-golang { + echo "Installing golang." + GOROOT="/usr/local/go" + GOPATH="${SERVER_HOME}/go" + + wget https://dl.google.com/go/go${GOLANG_VERSION}.linux-amd64.tar.gz -P /tmp + tar -C /usr/local -xzf /tmp/go${GOLANG_VERSION}.linux-amd64.tar.gz + + #export GOROOT=${GOROOT} + #export GOPATH=${GOPATH} + #export PATH=/usr/local/go/bin:$PATH +} + + +function gitclone-project { + # Use 'if !' so errexit does not abort the script when git is missing. + if ! git --version &>/dev/null; then + echo "git is not installed, installing" + apt-get -y update + apt-get -y install git + fi + + echo "Cloning the global resource service repo" + if [ -d "${GIT_REPO}/global-resource-service" ]; then + rm -r ${GIT_REPO}/global-resource-service + fi + mkdir -p ${GIT_REPO} + cd ${GIT_REPO} + git clone https://github.com/CentaurusInfra/global-resource-service.git + cd ${GIT_REPO}/global-resource-service +} + +function set-broken-motd { + cat > /etc/motd <> /var/log/server-init.log 2>&1 +echo "Starting to set up the resource management service" +# If installation fails, the message of the day (motd) will warn at the login shell. +set-broken-motd + +SERVER_HOME="/home/grs" +SERVER_BIN="${SERVER_HOME}/bin" +GIT_REPO="${SERVER_HOME}/go/src" + + + +#ensure-container-runtime +# validate or install python +validate-python +# download and source server-env +download-server-env +source "${SERVER_HOME}/server-env" + +# set up the server environment +setup-server-env + +#gitclone-project + +##TODO: add a build step that builds the cmd binaries, to avoid go run and git clone. +##TODO: add "too many open files" configuration + +echo "Done installing resource management server files. Please run 'export PATH=\$PATH:/usr/local/go/bin' or add it to your shell profile." diff --git a/resource-management/setup/gce/server-helper.sh b/resource-management/setup/gce/server-helper.sh new file mode 100644 index 00000000..2d15ff8e --- /dev/null +++ b/resource-management/setup/gce/server-helper.sh @@ -0,0 +1,75 @@ +#!/usr/bin/env bash + +# create-server-instance creates the server instance. If called with +# an argument, the argument is used as the name of a reserved IP +# address for the server. (In the case of upgrade/repair, we re-use +# the same IP.)
+# +# variables are set: +# ensure-temp-dir +# detect-project +# get-bearer-token +function create-server-instance { + local address="" + [[ -n ${1:-} ]] && address="${1}" + local internal_address="" + [[ -n ${2:-} ]] && internal_address="${2}" + + write-server-env + #ensure-gci-metadata-files + create-server-instance-internal "${SERVER_NAME}" "${address}" "${internal_address}" +} + +function create-server-instance-internal() { + local gcloud="gcloud" + local retries=5 + local sleep_sec=10 + + local -r server_name="${1}" + local -r address="${2:-}" + local -r internal_address="${3:-}" + + local network=$(make-gcloud-network-argument \ + "${NETWORK_PROJECT}" "${REGION}" "${NETWORK}" "${SUBNETWORK:-}" \ + "${address:-}" "${ENABLE_IP_ALIASES:-}" "${IP_ALIAS_SIZE:-}") + + local metadata="server-env=${SERVICE_TEMP}/server-env.yaml" + metadata="${metadata},user-data=${GRS_ROOT}/setup/gce/server.yaml" + metadata="${metadata},configure-sh=${GRS_ROOT}/setup/gce/configure.sh" + + local disk="name=${server_name}-pd" + disk="${disk},device-name=server-pd" + disk="${disk},mode=rw" + disk="${disk},boot=no" + disk="${disk},auto-delete=no" + + for attempt in $(seq 1 ${retries}); do + if result=$(${gcloud} compute instances create "${server_name}" \ + --project "${PROJECT}" \ + --zone "${ZONE}" \ + --machine-type "${SERVER_SIZE}" \ + --image-project="${GCE_SERVER_PROJECT}" \ + --image "${GCE_SERVER_IMAGE}" \ + --tags "${SERVER_TAG}" \ + --scopes "storage-ro,compute-rw,monitoring,logging-write" \ + --metadata-from-file "${metadata}" \ + --disk "${disk}" \ + --boot-disk-size "${SERVER_ROOT_DISK_SIZE}" \ + ${network} \ + 2>&1); then + echo "${result}" >&2 + + return 0 + else + echo "${result}" >&2 + if [[ ! "${result}" =~ "try again later" ]]; then + echo "Failed to create server instance due to non-retryable error" >&2 + return 1 + fi + sleep $sleep_sec + fi + done + + echo "Failed to create server instance despite ${retries} attempts" >&2 + return 1 +} diff --git a/resource-management/setup/gce/server.yaml b/resource-management/setup/gce/server.yaml new file mode 100644 index 00000000..331d3000 --- /dev/null +++ b/resource-management/setup/gce/server.yaml @@ -0,0 +1,40 @@ +#cloud-config + +write_files: + - path: /etc/systemd/system/server-installation.service + permissions: 0644 + owner: root + content: | + [Unit] + Description=Download and install service binaries and configurations + After=network-online.target + + [Service] + Type=oneshot + RemainAfterExit=yes + ExecStartPre=/bin/mkdir -p /home/grs/bin + ExecStartPre=/bin/mount --bind /home/grs/bin /home/grs/bin + ExecStartPre=/bin/mount -o remount,exec /home/grs/bin + # Use --retry-connrefused opt only if it's supported by curl. 
+ ExecStartPre=/bin/bash -c 'OPT=""; if curl --help | grep -q -- "--retry-connrefused"; then OPT="--retry-connrefused"; fi; /usr/bin/curl --fail --retry 5 --retry-delay 3 $OPT --silent --show-error -H "X-Google-Metadata-Request: True" -o /home/grs/bin/configure.sh http://metadata.google.internal/computeMetadata/v1/instance/attributes/configure-sh' + ExecStartPre=/bin/chmod 544 /home/grs/bin/configure.sh + ExecStart=/home/grs/bin/configure.sh + + [Install] + WantedBy=grs.target + + - path: /etc/systemd/system/grs.target + permissions: 0644 + owner: root + content: | + [Unit] + Description=Global resource service + + [Install] + WantedBy=multi-user.target + +runcmd: + - systemctl daemon-reload + - systemctl enable server-installation.service + - systemctl enable grs.target + - systemctl start grs.target diff --git a/resource-management/setup/gce/util.sh b/resource-management/setup/gce/util.sh new file mode 100644 index 00000000..2c98b593 --- /dev/null +++ b/resource-management/setup/gce/util.sh @@ -0,0 +1,552 @@ +#!/usr/bin/env bash + +# A library of helper functions and constant for the local config. + +# Use the config file specified in $SERVICE_CONFIG_FILE, or default to +# config-default.sh. + +GRS_ROOT=$(dirname "${BASH_SOURCE[0]}")/../.. + + +source "${GRS_ROOT}/setup/gce/${GRS_CONFIG_FILE-"config-default.sh"}" + +source "${GRS_ROOT}/setup/gce/server-helper.sh" + +# These prefixes must not be prefixes of each other, so that they can be used to +# detect mutually exclusive sets of nodes. +SIMULATOR_INSTANCE_PREFIX=${NODE_INSTANCE_PREFIX:-"${INSTANCE_PREFIX}-sim"} +PROMPT_FOR_UPDATE=${PROMPT_FOR_UPDATE:-"n"} + +function join_csv() { + local IFS=','; echo "$*"; +} + +# This function returns the first string before the comma +function split_csv() { + echo "$*" | cut -d',' -f1 +} + +# Verify prereqs +function verify-prereqs() { + local cmd + + # we use gcloud to create the server, gsutil to stage binaries and data + for cmd in gcloud gsutil; do + if ! which "${cmd}" >/dev/null; then + local resp="n" + if [[ "${PROMPT_FOR_UPDATE}" == "y" ]]; then + echo "Can't find ${cmd} in PATH. Do you wish to install the Google Cloud SDK? [Y/n]" + read resp + fi + if [[ "${resp}" != "n" && "${resp}" != "N" ]]; then + curl https://sdk.cloud.google.com | bash + fi + if ! which "${cmd}" >/dev/null; then + echo "Can't find ${cmd} in PATH, please fix and retry. The Google Cloud " >&2 + echo "SDK can be downloaded from https://cloud.google.com/sdk/." >&2 + exit 1 + fi + fi + done + update-or-verify-gcloud +} + +# Update or verify required gcloud components are installed +# at minimum required version. +# Assumed vars +# PROMPT_FOR_UPDATE +function update-or-verify-gcloud() { + local sudo_prefix="" + if [ ! 
-w $(dirname `which gcloud`) ]; then + sudo_prefix="sudo" + fi + # update and install components as needed + if [[ "${PROMPT_FOR_UPDATE}" == "y" ]]; then + ${sudo_prefix} gcloud ${gcloud_prompt:-} components install alpha + ${sudo_prefix} gcloud ${gcloud_prompt:-} components install beta + ${sudo_prefix} gcloud ${gcloud_prompt:-} components update + else + local version=$(gcloud version --format=json) + python -c' +import json,sys +from distutils import version + +minVersion = version.LooseVersion("1.3.0") +required = [ "alpha", "beta", "core" ] +data = json.loads(sys.argv[1]) +rel = data.get("Google Cloud SDK") +if "CL @" in rel: + print("Using dev version of gcloud: %s" %rel) + exit(0) +if rel != "HEAD" and version.LooseVersion(rel) < minVersion: + print("gcloud version out of date ( < %s )" % minVersion) + exit(1) +missing = [] +for c in required: + if not data.get(c): + missing += [c] +if missing: + for c in missing: + print ("missing required gcloud component \"{0}\"".format(c)) + print ("Try running `gcloud components install {0}`".format(c)) + exit(1) + ' """${version}""" + fi +} + +# Use the gcloud defaults to find the project. If it is already set in the +# environment then go with that. +# +# Vars set: +# PROJECT +# NETWORK_PROJECT +# PROJECT_REPORTED +function detect-project() { + if [[ -z "${PROJECT-}" ]]; then + PROJECT=$(gcloud config list project --format 'value(core.project)') + fi + + NETWORK_PROJECT=${NETWORK_PROJECT:-${PROJECT}} + + if [[ -z "${PROJECT-}" ]]; then + echo "Could not detect Google Cloud Platform project. Set the default project using " >&2 + echo "'gcloud config set project '" >&2 + exit 1 + fi + if [[ -z "${PROJECT_REPORTED-}" ]]; then + echo "Project: ${PROJECT}" >&2 + echo "Network Project: ${NETWORK_PROJECT}" >&2 + echo "Zone: ${ZONE}" >&2 + PROJECT_REPORTED=true + fi +} + +# Example: trap_add 'echo "in trap DEBUG"' DEBUG +# See: http://stackoverflow.com/questions/3338030/multiple-bash-traps-for-the-same-signal +function trap_add() { + local trap_add_cmd + trap_add_cmd=$1 + shift + + for trap_add_name in "$@"; do + local existing_cmd + local new_cmd + + # Grab the currently defined trap commands for this trap + existing_cmd=$(trap -p "${trap_add_name}" | awk -F"'" '{print $2}') + + if [[ -z "${existing_cmd}" ]]; then + new_cmd="${trap_add_cmd}" + else + new_cmd="${trap_add_cmd};${existing_cmd}" + fi + + # Assign the test. Disable the shellcheck warning telling that trap + # commands should be single quoted to avoid evaluating them at this + # point instead evaluating them at run time. The logic of adding new + # commands to a single trap requires them to be evaluated right away. + # shellcheck disable=SC2064 + trap "${new_cmd}" "${trap_add_name}" + done +} + +# Opposite of ensure-temp-dir() +cleanup-temp-dir() { + rm -rf "${SERVICE_TEMP}" +} + +# Create a temp dir that'll be deleted at the end of this bash session. +# +# Vars set: +# SERVICE_TEMP +function ensure-temp-dir() { + if [[ -z ${SERVICE_TEMP-} ]]; then + SERVICE_TEMP=$(mktemp -d 2>/dev/null || mktemp -d -t grs.XXXXXX) + trap_add cleanup-temp-dir EXIT + fi +} + +# Detect region simulators created in the instance group. 
+# +# Assumed vars: +# SIM_INSTANCE_PREFIX + +# Vars set: +# SIM_NAMES +# INSTANCE_GROUPS +function detect-sim-names() { + detect-project + INSTANCE_GROUPS=() + INSTANCE_GROUPS+=($(gcloud compute instance-groups managed list \ + --project "${PROJECT}" \ + --filter "name ~ '${SIM_INSTANCE_PREFIX}-.+' AND zone:(${ZONE})" \ + --format='value(name)' || true)) + SIM_NAMES=() + if [[ -n "${INSTANCE_GROUPS[@]:-}" ]]; then + for group in "${INSTANCE_GROUPS[@]}"; do + SIM_NAMES+=($(gcloud compute instance-groups managed list-instances \ + "${group}" --zone "${ZONE}" --project "${PROJECT}" \ + --format='value(instance)')) + done + fi + + echo "INSTANCE_GROUPS=${INSTANCE_GROUPS[*]:-}" >&2 + echo "SIM_NAMES=${SIM_NAMES[*]:-}" >&2 +} + +function check-network-mode() { + local mode="$(gcloud compute networks list --filter="name=('${NETWORK}')" --project ${NETWORK_PROJECT} --format='value(x_gcloud_subnet_mode)' || true)" + # The deprecated field uses lower case. Convert to upper case for consistency. + echo "$(echo $mode | tr [a-z] [A-Z])" +} + +function create-network() { + if ! gcloud compute networks --project "${NETWORK_PROJECT}" describe "${NETWORK}" &>/dev/null; then + # The network needs to be created synchronously or we have a race. The + # firewalls can be added concurrent with instance creation. + local network_mode="auto" + if [[ "${CREATE_CUSTOM_NETWORK:-}" == "true" ]]; then + network_mode="custom" + fi + echo "Creating new ${network_mode} network: ${NETWORK}" + gcloud compute networks create --project "${NETWORK_PROJECT}" "${NETWORK}" --subnet-mode="${network_mode}" + else + PREEXISTING_NETWORK=true + PREEXISTING_NETWORK_MODE="$(check-network-mode)" + echo "Found existing network ${NETWORK} in ${PREEXISTING_NETWORK_MODE} mode." + fi +} + +function create-subnetworks() { + case ${ENABLE_IP_ALIASES} in + true) echo "IP aliases are enabled. Creating subnetworks.";; + false) + echo "IP aliases are disabled." + if [[ "${ENABLE_BIG_CLUSTER_SUBNETS}" = "true" ]]; then + if [[ "${PREEXISTING_NETWORK}" != "true" ]]; then + expand-default-subnetwork + else + echo "${color_yellow}Using pre-existing network ${NETWORK}, subnets won't be expanded to /19!${color_norm}" + fi + elif [[ "${CREATE_CUSTOM_NETWORK:-}" == "true" && "${PREEXISTING_NETWORK}" != "true" ]]; then + gcloud compute networks subnets create "${SUBNETWORK}" --project "${NETWORK_PROJECT}" --region "${REGION}" --network "${NETWORK}" --range "${NODE_IP_RANGE}" + fi + return;; + *) echo "${color_red}Invalid argument to ENABLE_IP_ALIASES${color_norm}" + exit 1;; + esac + + # Look for the alias subnet, it must exist and have a secondary + # range configured. + local subnet=$(gcloud compute networks subnets describe \ + --project "${NETWORK_PROJECT}" \ + --region ${REGION} \ + ${IP_ALIAS_SUBNETWORK} 2>/dev/null) + if [[ -z ${subnet} ]]; then + echo "Creating subnet ${NETWORK}:${IP_ALIAS_SUBNETWORK}" + gcloud compute networks subnets create \ + ${IP_ALIAS_SUBNETWORK} \ + --description "Automatically generated subnet for ${INSTANCE_PREFIX} cluster. This will be removed on cluster teardown." \ + --project "${NETWORK_PROJECT}" \ + --network ${NETWORK} \ + --region ${REGION} \ + --range ${NODE_IP_RANGE} \ + --secondary-range "pods-default=${CLUSTER_IP_RANGE}" \ + --secondary-range "services-default=${SERVICE_CLUSTER_IP_RANGE}" + echo "Created subnetwork ${IP_ALIAS_SUBNETWORK}" + else + if ! 
echo ${subnet} | grep --quiet secondaryIpRanges; then + echo "${color_red}Subnet ${IP_ALIAS_SUBNETWORK} does not have a secondary range${color_norm}" + exit 1 + fi + fi +} + +# Robustly try to create a static ip. +# $1: The name of the ip to create +# $2: The name of the region to create the ip in. +function create-static-ip() { + detect-project + local attempt=0 + local REGION="$2" + while true; do + if gcloud compute addresses create "$1" \ + --project "${PROJECT}" \ + --region "${REGION}" -q > /dev/null; then + # successful operation - wait until it's visible + start="$(date +%s)" + while true; do + now="$(date +%s)" + # Timeout set to 15 minutes + if [[ $((now - start)) -gt 900 ]]; then + echo "Timeout while waiting for server IP visibility" + exit 2 + fi + if gcloud compute addresses describe "$1" --project "${PROJECT}" --region "${REGION}" >/dev/null 2>&1; then + break + fi + echo "server IP not visible yet. Waiting..." + sleep 5 + done + break + fi + + if gcloud compute addresses describe "$1" \ + --project "${PROJECT}" \ + --region "${REGION}" >/dev/null 2>&1; then + # it exists - postcondition satisfied + break + fi + + if (( attempt > 4 )); then + echo -e "${color_red}Failed to create static ip $1 ${color_norm}" >&2 + exit 2 + fi + attempt=$(($attempt+1)) + echo -e "${color_yellow}Attempt $attempt failed to create static ip $1. Retrying.${color_norm}" >&2 + sleep $(($attempt * 5)) + done +} + +# Instantiate resource management service +# + +function grs-up() { + ensure-temp-dir + detect-project + create-network + create-resourcemanagement-server + create-region-simulator +} + +# tear done resource management service + +function grs-down() { + detect-project + + echo "Bringing down resource management service" + set +e # Do not stop on error + + # Get the name of the managed instance group template and delete + local templates=$(get-template "${PROJECT}") + + local all_instance_groups=(${INSTANCE_GROUPS[@]:-}) + for group in ${all_instance_groups[@]:-}; do + { + if gcloud compute instance-groups managed describe "${group}" --project "${PROJECT}" --zone "${ZONE}" &>/dev/null; then + gcloud compute instance-groups managed delete \ + --project "${PROJECT}" \ + --quiet \ + --zone "${ZONE}" \ + "${group}" + fi + } & + done + + wait-for-jobs || { + echo -e "Failed to delete instance template(s)." >&2 + } + + + # Check if this are any remaining server replicas. + local REMAINING_SERVER_COUNT=0 +#: <<'EOF' + REMAINING_SERVER_COUNT=$(gcloud compute instances list \ + --project "${PROJECT}" \ + --filter="name ~ '$(get-replica-name-regexp)'" \ + --format "value(zone)" | wc -l) + + if [[ "${REMAINING_SERVER_COUNT}" -ge 1 ]]; then + local instance_names=$(get-all-replica-names) + + for instance_name in ${instance_names[@]:-}; do + { + if gcloud compute instances describe "${instance_name}" --zone "${ZONE}" --project "${PROJECT}" &>/dev/null; then + gcloud compute instances delete \ + --project "${PROJECT}" \ + --zone "${ZONE}" \ + --quiet \ + "${instance_name}" + fi + } + done + + wait-for-jobs || { + echo -e "Failed to delete server(s)." 
>&2 + } + fi +#EOF + + REMAINING_SERVER_COUNT=$(gcloud compute instances list \ + --project "${PROJECT}" \ + --filter="name ~ '$(get-replica-name-regexp)'" \ + --format "value(zone)" | wc -l) + + if [[ "${REMAINING_SERVER_COUNT}" -eq 0 ]]; then + # Delete the server's reserved IP + if gcloud compute addresses describe "${SERVER_NAME}-ip" --region "${REGION}" --project "${PROJECT}" &>/dev/null; then + echo "Deleting the server's reserved IP" + gcloud compute addresses delete \ + --project "${PROJECT}" \ + --region "${REGION}" \ + --quiet \ + "${SERVER_NAME}-ip" + fi + + # Delete the server's pd + if gcloud compute disks describe "${SERVER_NAME}-pd" --zone "${ZONE}" --project "${PROJECT}" &>/dev/null; then + echo "Deleting the server's pd" + gcloud compute disks delete \ + --project "${PROJECT}" \ + --zone "${ZONE}" \ + --quiet \ + "${SERVER_NAME}-pd" + fi + fi + + set -e +} + +function get-replica-name-regexp() { + echo "^${SERVER_NAME}(-...)?" +} + +function get-all-replica-names() { + echo $(gcloud compute instances list \ + --project "${PROJECT}" \ + --filter="name ~ '$(get-replica-name-regexp)'" \ + --format "value(name)" | tr "\n" "," | sed 's/,$//') +} + +# Gets the instance templates in use by the service. It echos the template names +# so that the function output can be used. + +function get-template() { + local linux_filter="${SIM_INSTANCE_PREFIX}-(extra-)?template(-)?" + + gcloud compute instance-templates list \ + --filter="name ~ '${linux_filter}'" \ + --project="${1}" --format='value(name)' +} + +function create-resourcemanagement-server() { + echo "Starting rersource management server" + + # We have to make sure the disk is created before creating the server VM, so + # run this in the foreground. + gcloud compute disks create "${SERVER_NAME}-pd" \ + --project "${PROJECT}" \ + --zone "${ZONE}" \ + --type "${SERVER_DISK_TYPE}" \ + --size "${SERVER_DISK_SIZE}" + + # Reserve the server's IP so that it can later be transferred to another VM + create-static-ip "${SERVER_NAME}-ip" "${REGION}" + SERVER_RESERVED_IP=$(gcloud compute addresses describe "${SERVER_NAME}-ip" \ + --project "${PROJECT}" --region "${REGION}" -q --format='value(address)') + + create-server-instance "${SERVER_RESERVED_IP}" + + +} + +function create-region-simulator() { + echo "Starting region simulatotrs" + #create-nodes-template + #create-linux-nodes +} + +# Quote something appropriate for a yaml string. +# +# TODO(zmerlynn): Note that this function doesn't so much "quote" as +# "strip out quotes", and we really should be using a YAML library for +# this, but PyYAML isn't shipped by default, and *rant rant rant ... SIGH* +function yaml-quote { + echo "'$(echo "${@:-}" | sed -e "s/'/''/g")'" +} + +function write-server-env { + build-server-env "server" "${SERVICE_TEMP}/server-env.yaml" +} + +function write-sim-env { + build-server-env "sim" "${SERVICE_TEMP}/server-env.yaml" +} + +function build-server-env { + local server="$1" + local file="$2" + + rm -f ${file} + cat >$file <>$file <>$file <&2 + +echo "... calling verify-prereqs" >&2 +verify-prereqs + +echo "... calling grs-down" >&2 +grs-down + +echo "Done" + diff --git a/resource-management/setup/grs-up.sh b/resource-management/setup/grs-up.sh new file mode 100755 index 00000000..4825c6dc --- /dev/null +++ b/resource-management/setup/grs-up.sh @@ -0,0 +1,32 @@ +#!/usr/bin/env bash + + +set -o errexit +set -o nounset +set -o pipefail + +GRS_ROOT=$(dirname "${BASH_SOURCE[0]}")/.. 
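+# Optional overrides (for example GRS_GCE_ZONE or PROJECT) can be placed in setup/env.sh, which is sourced below if it exists.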
+ +if [ -f "${GRS_ROOT}/setup/env.sh" ]; then + source "${GRS_ROOT}/setup/env.sh" +fi + +source "${GRS_ROOT}/setup/grs-util.sh" + +if [ -z "${ZONE-}" ]; then + echo "... Starting cluster using provider: ${CLOUD_PROVIDER}" >&2 +else + echo "... Starting cluster in ${ZONE} using provider ${CLOUD_PROVIDER}" >&2 +fi + +echo "... calling verify-prereqs" >&2 +verify-prereqs + +echo "... calling grs-up" >&2 +grs-up + +echo -e "Done, resource management service is running!\n" >&2 + +echo + +exit 0 diff --git a/resource-management/setup/grs-util.sh b/resource-management/setup/grs-util.sh new file mode 100644 index 00000000..e298e84f --- /dev/null +++ b/resource-management/setup/grs-util.sh @@ -0,0 +1,14 @@ +#!/usr/bin/env bash + +GRS_ROOT=$(dirname "${BASH_SOURCE[0]}")/.. + +CLOUD_PROVIDER="${CLOUD_PROVIDER:-gce}" + +# PROVIDER_VARS is a list of cloud provider specific variables. Note: +# this is a list of the _names_ of the variables, not the value of the +# variables. + +PROVIDER_UTILS="${GRS_ROOT}/setup/${CLOUD_PROVIDER}/util.sh" +if [ -f "${PROVIDER_UTILS}" ]; then + source "${PROVIDER_UTILS}" +fi
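+# Example usage (paths as in this change, invocation hypothetical): from the resource-management directory,
+# run ./setup/grs-up.sh to bring the service up on the default provider (gce) and ./setup/grs-down.sh to tear it down;
+# set CLOUD_PROVIDER to select a different provider directory under setup/.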