diff --git a/roles/kepler/README.md b/roles/kepler/README.md
new file mode 100644
index 000000000..3b9634cd5
--- /dev/null
+++ b/roles/kepler/README.md
@@ -0,0 +1,9 @@
+Kepler
+
+Kepler (Kubernetes-based Efficient Power Level Exporter) is a Prometheus exporter that measures energy consumption at the container, pod, VM, and process level by reading hardware sensors and attributing power based on resource utilization.
+
+Kepler uses Intel RAPL (Running Average Power Limit) sensors to collect energy data from CPU packages, cores, and memory subsystems, then distributes this energy proportionally to workloads based on their CPU time consumption.
+
+ * GIT: https://github.com/sustainable-computing-io/kepler
+ * IMAGES: https://quay.io/repository/sustainable_computing_io/kepler
+ * WWW: https://sustainable-computing.io/
diff --git a/roles/kepler/defaults/main.yml b/roles/kepler/defaults/main.yml
new file mode 100644
index 000000000..52b0ce934
--- /dev/null
+++ b/roles/kepler/defaults/main.yml
@@ -0,0 +1,38 @@
+---
+##########################
+# operator
+
+operator_user: "dragon"
+operator_group: "{{ operator_user }}"
+
+##########################
+# docker
+
+docker_network_mtu: 1500
+docker_registry_kepler: "quay.io"
+
+##########################
+# kepler
+
+kepler_configuration_directory: "/opt/kepler/configuration"
+kepler_container_name: "kepler"
+kepler_docker_compose_directory: "/opt/kepler"
+kepler_exporter: "prometheus"
+# Command-line flags passed to the kepler binary. Both parts are lists so
+# that they can be concatenated here and joined with spaces in the
+# docker-compose template.
+kepler_flags: "{{ kepler_flags_defaults + kepler_flags_extras }}"
+kepler_flags_defaults:
+  - "--config.file=/etc/kepler/config.yaml"
+# Additional flags appended after the defaults (list of strings).
+kepler_flags_extras: []
+kepler_host: "0.0.0.0"
+kepler_image: "{{ kepler_repository }}:{{ kepler_tag }}"
+kepler_kubeconfig_directory: "/opt/kepler/kubeconfig"
+kepler_network: "172.31.101.80/28"
+kepler_port: 28282
+kepler_port_container: "{{ kepler_port }}"
+kepler_repository: "{{ docker_registry_kepler }}/sustainable_computing_io/kepler"
+kepler_service_name: "docker-compose@kepler"
+kepler_share_pids_with_host: true
+kepler_tag: "v0.11.2"
diff --git a/roles/kepler/handlers/main.yml b/roles/kepler/handlers/main.yml
new file mode 100644
index 000000000..dc192063a
--- /dev/null
+++ b/roles/kepler/handlers/main.yml
@@ -0,0 +1,11 @@
+---
+- name: Restart kepler service
+  become: true
+  ansible.builtin.service:
+    name: "{{ kepler_service_name }}"
+    state: restarted
+  # Retry until the service reports ActiveState == active (10 tries, 20s apart).
+  register: result
+  until: result["status"]["ActiveState"] == "active"
+  retries: 10
+  delay: 20
diff --git a/roles/kepler/meta/main.yml b/roles/kepler/meta/main.yml
new file mode 100644
index 000000000..1d3cde852
--- /dev/null
+++ b/roles/kepler/meta/main.yml
@@ -0,0 +1,22 @@
+---
+galaxy_info:
+  author: Vinícius Zavam
+  description: Role osism.services.kepler
+  company: OSBA ECO:DIGIT
+  license: Apache License 2.0
+  min_ansible_version: 2.16.0
+  platforms:
+    - name: Ubuntu
+      versions:
+        - jammy
+        - noble
+    - name: Debian
+      versions:
+        - bookworm
+    - name: EL
+      versions:
+        - "9"
+  galaxy_tags:
+    - osism
+    - system
+dependencies: []
diff --git a/roles/kepler/tasks/config.yml b/roles/kepler/tasks/config.yml
new file mode 100644
index 000000000..6a9814433
--- /dev/null
+++ b/roles/kepler/tasks/config.yml
@@ -0,0 +1,23 @@
+---
+- name: Create required directories
+  become: true
+  ansible.builtin.file:
+    path: "{{ item }}"
+    state: directory
+    owner: "{{ operator_user }}"
+    group: "{{ operator_group }}"
+    # Quoted so the mode is passed as an octal string, not a YAML integer.
+    mode: "0750"
+  loop:
+    - "{{ kepler_configuration_directory }}"
+    - "{{ kepler_docker_compose_directory }}"
+    - "{{ kepler_kubeconfig_directory }}"
+
+- name: Copy configuration file
+  ansible.builtin.template:
+    src: config.yaml.j2
+    dest: "{{ kepler_configuration_directory }}/config.yaml"
+    owner: "{{ operator_user }}"
+    group: "{{ operator_group }}"
+    mode: "0640"
+  notify: Restart kepler service
diff --git a/roles/kepler/tasks/main.yml b/roles/kepler/tasks/main.yml
new file mode 100644
index 000000000..c906e0e1f
--- /dev/null
+++ b/roles/kepler/tasks/main.yml
@@ -0,0 +1,8 @@
+---
+- name: Include config tasks
+  ansible.builtin.include_tasks: config.yml
+  tags: config
+
+- name: Include service tasks
+  ansible.builtin.include_tasks: service.yml
+  tags: service
diff --git a/roles/kepler/tasks/service.yml b/roles/kepler/tasks/service.yml
new file mode 100644
index 000000000..d8757ad1a
--- /dev/null
+++ b/roles/kepler/tasks/service.yml
@@ -0,0 +1,21 @@
+---
+- name: Copy docker-compose.yml file
+  ansible.builtin.template:
+    src: docker-compose.yml.j2
+    dest: "{{ kepler_docker_compose_directory }}/docker-compose.yml"
+    owner: "{{ operator_user }}"
+    group: "{{ operator_group }}"
+    mode: "0640"
+  notify: Restart kepler service
+
+- name: Manage kepler service
+  become: true
+  ansible.builtin.service:
+    name: "{{ kepler_service_name }}"
+    state: started
+    enabled: true
+  # Retry until the service reports ActiveState == active (10 tries, 20s apart).
+  register: result
+  until: result["status"]["ActiveState"] == "active"
+  retries: 10
+  delay: 20
diff --git a/roles/kepler/templates/config.yaml.j2 b/roles/kepler/templates/config.yaml.j2
new file mode 100644
index 000000000..037e5df9f
--- /dev/null
+++ b/roles/kepler/templates/config.yaml.j2
@@ -0,0 +1,85 @@
+---
+log:
+  level: info  # debug, info, warn, error (default: info)
+  format: text  # text or json (default: text)
+
+monitor:
+  # Interval is the monitor's refresh interval. All processes that
+  # have a lifetime shorter than the interval (i.e. that are spawned and
+  # killed within one monitor interval) will be ignored.
+  #
+  # NOTE: setting the interval to 0s will disable the monitor refreshes
+  interval: 5s
+
+  # Staleness is the duration after which the data computed by the monitor
+  # in each refresh is considered stale and recomputed when requested again.
+  #
+  # This is especially useful if you have multiple prometheus instances
+  # (in an HA setup) scraping kepler: the data received by both instances
+  # will be the same as long as the scrapes happen within the staleness duration.
+  #
+  # NOTE: Keep staleness shorter than the monitor interval.
+  staleness: 1000ms
+
+  # maximum number of terminated workloads (process, container, VM, pods)
+  # to be kept in memory until the data is exported; 0 disables the limit
+  maxTerminated: 500
+
+  # minimum energy threshold (in joules) for terminated workloads
+  # terminated workloads with energy consumption below this threshold will be filtered out
+  minTerminatedEnergyThreshold: 10
+
+host:
+  # These point at the host's /sys and /proc as bind-mounted into the
+  # container (targets /host/sys and /host/proc in docker-compose.yml.j2).
+  sysfs: /host/sys  # Path to sysfs filesystem (default: /sys)
+  procfs: /host/proc  # Path to procfs filesystem (default: /proc)
+
+rapl:
+  zones: []  # zones to be enabled, empty enables all default zones (core, dram, package)
+
+exporter:
+  stdout:  # stdout exporter related config
+    enabled: false  # disabled by default
+
+  prometheus:  # prometheus exporter related config
+    enabled: true
+    # debugCollectors:
+    #   - go
+    #   - process
+    metricsLevel:
+      - node
+      - process
+      - container
+      - vm
+      - pod
+
+debug:  # debug related config
+  pprof:  # pprof related config
+    enabled: false
+
+web:
+  configFile: ""  # Path to TLS server config file
+  listenAddresses:  # Web server listen addresses
+    - "{{ kepler_host | ansible.utils.ipwrap }}:{{ kepler_port_container }}"
+
+kube:  # kubernetes related config
+  enabled: false  # enable kubernetes monitoring (default: false)
+  config: ""  # path to kubeconfig file (optional if running in-cluster)
+  nodeName: ""  # name of the kubernetes node (required when enabled)
+
+# WARN DO NOT ENABLE THIS IN PRODUCTION - for development / testing only
+dev:
+  fake-cpu-meter:
+    enabled: false
+    zones: []  # zones to be enabled, empty enables all default zones
+
+# EXPERIMENTAL FEATURES - These features are experimental and may be unstable
+# and are disabled by default
+experimental:
+  platform:
+    redfish:
+      enabled: false  # Enable experimental Redfish BMC power monitoring
+      configFile: hack/redfish.yaml  # Path to Redfish BMC configuration file
+      nodeName: ""  # Node name to use (overrides Kubernetes node name and hostname fallback)
+      httpTimeout: 5s  # HTTP client timeout for BMC requests (default: 5s)
diff --git a/roles/kepler/templates/docker-compose.yml.j2 b/roles/kepler/templates/docker-compose.yml.j2
new file mode 100644
index 000000000..f78cada24
--- /dev/null
+++ b/roles/kepler/templates/docker-compose.yml.j2
@@ -0,0 +1,42 @@
+---
+services:
+  kepler:
+    container_name: "{{ kepler_container_name }}"
+    # All flags come from kepler_flags (the role default includes
+    # --config.file), so no separate "command" is set here.
+    entrypoint: "kepler {{ kepler_flags | join(' ') }}"
+    image: "{{ kepler_image }}"
+    privileged: true
+    restart: unless-stopped
+{% if kepler_share_pids_with_host %}
+    pid: host
+{% endif %}
+    ports:
+      - "{{ kepler_host | ansible.utils.ipwrap }}:{{ kepler_port }}:{{ kepler_port_container }}/tcp"
+    volumes:
+      - type: bind
+        source: /proc
+        target: /host/proc
+        read_only: true
+      - type: bind
+        source: /sys
+        target: /host/sys
+        read_only: true
+      - type: bind
+        source: "{{ kepler_configuration_directory }}"
+        target: /etc/kepler
+        read_only: true
+      - type: bind
+        source: "{{ kepler_kubeconfig_directory }}"
+        target: /host/kube
+        read_only: true
+
+networks:
+  default:
+    driver: bridge
+    driver_opts:
+      com.docker.network.driver.mtu: {{ docker_network_mtu }}
+    ipam:
+      driver: default
+      config:
+        - subnet: "{{ kepler_network }}"