diff --git a/historyserver/Dockerfile b/historyserver/Dockerfile
deleted file mode 100644
index ed999b6148a..00000000000
--- a/historyserver/Dockerfile
+++ /dev/null
@@ -1,33 +0,0 @@
-ARG TARGETOS
-ARG TARGETARCH
-
-FROM --platform=$BUILDPLATFORM golang:1.25.1 as builder
-ENV GOPROXY=https://goproxy.cn,direct
-ARG BUILD_RAYSERVER_DASHBOARD
-
-RUN if [ "$BUILD_RAYSERVER_DASHBOARD" = "yes" ] ; then \
-    curl -o install.sh https://raw.githubusercontent.com/nvm-sh/nvm/v0.39.0/install.sh && chmod +x install.sh && ./install.sh && /bin/bash -c "source $HOME/.nvm/nvm.sh && nvm install 14 && nvm use 14" ;\
-else \
-    echo "$BUILD_RAYSERVER_DASHBOARD not yes, no need install nvm"; \
-fi
-
-WORKDIR /historyserver
-COPY . .
-
-RUN if [ "$BUILD_RAYSERVER_DASHBOARD" = "yes" ] ; then \
-    /bin/bash -c "source $HOME/.nvm/nvm.sh && cd dashboard/v2.51.0/client && npm ci && npm run build" ;\
-else \
-    mkdir -p dashboard/v2.51.0/client/build ;\
-    echo "do not npm run build"; \
-fi
-
-RUN make build GOOS=${TARGETOS} GOARCH=${TARGETARCH}
-
-FROM ubuntu:22.04
-
-RUN apt-get update && apt-get upgrade -y && rm -rf /var/cache/apt/ && apt-get install -y ca-certificates
-
-COPY --from=builder /historyserver/output/bin/historyserver /usr/local/bin/historyserver
-COPY --from=builder /historyserver/output/bin/collector /usr/local/bin/collector
-COPY --from=builder /historyserver/dashboard/v2.51.0/client/build /dashboard/v2.51.0/client/build
-COPY --from=builder /historyserver/dashboard/homepage /dashboard/homepage
diff --git a/historyserver/Dockerfile.historyserver b/historyserver/Dockerfile.historyserver
new file mode 100644
index 00000000000..9a8a39251b4
--- /dev/null
+++ b/historyserver/Dockerfile.historyserver
@@ -0,0 +1,29 @@
+FROM golang:1.25.1 AS builder
+
+ENV GOPROXY=https://proxy.golang.org,direct
+WORKDIR /historyserver
+
+# Copy the go modules and manifests.
+COPY go.mod go.mod
+COPY go.sum go.sum
+# Cache dependencies to avoid re-downloading when only sources change.
+RUN go mod download
+
+# Copy the go source.
+COPY cmd/historyserver/main.go cmd/historyserver/main.go
+# The collector package is needed because the storage interface currently
+# lives there; this will change once the following PR is merged:
+# https://github.com/ray-project/kuberay/pull/4302
+COPY pkg/collector/ pkg/collector/
+COPY pkg/historyserver/ pkg/historyserver/
+COPY pkg/storage/ pkg/storage/
+COPY pkg/utils/ pkg/utils/
+COPY pkg/eventserver/ pkg/eventserver/
+
+# Build the historyserver binary.
+COPY Makefile Makefile
+RUN make buildhistoryserver GOOS=linux GOARCH=amd64
+
+FROM ubuntu:22.04
+RUN apt-get update && apt-get upgrade -y && rm -rf /var/cache/apt/ && apt-get install -y ca-certificates
+COPY --from=builder /historyserver/output/bin/historyserver /usr/local/bin/historyserver
diff --git a/historyserver/Makefile b/historyserver/Makefile
index dc92d2583a6..31a5a5c68ce 100644
--- a/historyserver/Makefile
+++ b/historyserver/Makefile
@@ -9,6 +9,7 @@ BIN_DIR=$(OUT_DIR)/bin
 BINARY_NAME=historyserver
 BINARY_NAME_COLLECTOR=collector
 COLLECTOR_IMG ?= collector:v0.1.0
+HISTORYSERVER_IMG ?= historyserver:v0.1.0
 
 # Setting SHELL to bash allows bash commands to be executed by recipes.
 # Options are set to exit when a recipe line exits non-zero or a piped command fails.
@@ -18,7 +19,7 @@ COMMIT_SHORT ?= $(shell git rev-parse --short HEAD)
 BRANCH ?= $(shell git branch --show-current)
 VERSION ?= $(shell git describe --tags --long|awk -F '-' '{print $$1"."$$2"-"$$3""}')
 
-PACKAGE = gitlab.alibaba-inc.com/eml/historyserver
+PACKAGE = github.com/ray-project/kuberay/historyserver
 
 GO_LDFLAGS := -extldflags "-static"
 # GO_LDFLAGS += -w -s # Drop debugging symbols.
@@ -66,13 +67,16 @@ simplebuild:
 mod:
 	go mod tidy
 
+.PHONY: localimage-build
+localimage-build: localimage-collector localimage-historyserver
+
 .PHONY: localimage-collector
 localimage-collector:
 	docker build -t $(COLLECTOR_IMG) -f Dockerfile.collector .
 
-.PHONY: localimage
-localimage: dockerbuilder_instance
-	docker buildx build -t historyserver:laster --platform linux/amd64 . --load
+.PHONY: localimage-historyserver
+localimage-historyserver: dockerbuilder_instance
+	docker build -t $(HISTORYSERVER_IMG) -f Dockerfile.historyserver .
 
 .PHONY: dockerbuilder_instance
 dockerbuilder_instance:
diff --git a/historyserver/cmd/historyserver/main.go b/historyserver/cmd/historyserver/main.go
index 06ab7d0f9a3..fb898ab924f 100644
--- a/historyserver/cmd/historyserver/main.go
+++ b/historyserver/cmd/historyserver/main.go
@@ -1 +1,101 @@
 package main
+
+import (
+	"encoding/json"
+	"flag"
+	"os"
+	"os/signal"
+	"sync"
+	"syscall"
+
+	"github.com/ray-project/kuberay/historyserver/pkg/collector"
+	"github.com/ray-project/kuberay/historyserver/pkg/collector/types"
+	"github.com/ray-project/kuberay/historyserver/pkg/eventserver"
+	"github.com/ray-project/kuberay/historyserver/pkg/historyserver"
+	"github.com/sirupsen/logrus"
+)
+
+func main() {
+	runtimeClassName := ""
+	rayRootDir := ""
+	kubeconfigs := ""
+	runtimeClassConfigPath := "/var/collector-config/data"
+	dashboardDir := ""
+	flag.StringVar(&runtimeClassName, "runtime-class-name", "", "storage backend to read from (e.g. s3, aliyunoss)")
+	flag.StringVar(&rayRootDir, "ray-root-dir", "", "root directory of the collected Ray history data in storage")
+	flag.StringVar(&kubeconfigs, "kubeconfigs", "", "kubeconfig files used to access live Ray clusters")
+	flag.StringVar(&dashboardDir, "dashboard-dir", "/dashboard", "directory containing the dashboard static files")
+	flag.StringVar(&runtimeClassConfigPath, "runtime-class-config-path", runtimeClassConfigPath, "path to the runtime class config file (JSON)")
+	flag.Parse()
+
+	cliMgr := historyserver.NewClientManager(kubeconfigs)
+
+	jsonData := make(map[string]interface{})
+	if runtimeClassConfigPath != "" {
+		data, err := os.ReadFile(runtimeClassConfigPath)
+		if err != nil {
+			panic("Failed to read runtime class config: " + err.Error())
+		}
+		err = json.Unmarshal(data, &jsonData)
+		if err != nil {
+			panic("Failed to parse runtime class config: " + err.Error())
+		}
+	}
+
+	registry := collector.GetReaderRegistry()
+	factory, ok := registry[runtimeClassName]
+	if !ok {
+		panic("Unsupported runtime class name: " + runtimeClassName)
+	}
+
+	globalConfig := types.RayHistoryServerConfig{
+		RootDir: rayRootDir,
+	}
+
+	reader, err := factory(&globalConfig, jsonData)
+	if err != nil {
+		panic("Failed to create reader for runtime class " + runtimeClassName + ": " + err.Error())
+	}
+
+	// Create EventHandler with storage reader
+	eventHandler := eventserver.NewEventHandler(reader)
+
+	// WaitGroup to track goroutine completion
+	var wg sync.WaitGroup
+
+	// Start EventHandler in background goroutine
+	eventStop := make(chan struct{}, 1)
+	wg.Add(1)
+	go func() {
+		defer wg.Done()
+		logrus.Info("Starting EventHandler in background...")
+		if err := eventHandler.Run(eventStop, 2); err != nil {
+			logrus.Errorf("EventHandler stopped with error: %v", err)
+		}
+		logrus.Info("EventHandler shutdown complete")
+	}()
+
+	handler := historyserver.NewServerHandler(&globalConfig, dashboardDir, reader, cliMgr, eventHandler)
+
+	sigChan := make(chan os.Signal, 1)
+	stop := make(chan struct{}, 1)
+	signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM)
+
+	wg.Add(1)
+	go func() {
+		defer wg.Done()
+		handler.Run(stop)
+		logrus.Info("HTTP server shutdown complete")
+	}()
+
+	<-sigChan
+	logrus.Info("Received shutdown signal, initiating graceful shutdown...")
+
+	// Stop both the server and the event handler
+	stop <- struct{}{}
+	eventStop <- struct{}{}
+
+	// Wait for both goroutines to complete
+	wg.Wait()
+	logrus.Info("Graceful shutdown complete")
+}
diff --git a/historyserver/config/historyserver.yaml b/historyserver/config/historyserver.yaml
new file mode 100644
index 00000000000..821609978a5
--- /dev/null
+++ b/historyserver/config/historyserver.yaml
@@ -0,0 +1,63 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: historyserver # TODO: specify your service name
+  labels:
+    app: historyserver
+spec:
+  selector:
+    app: historyserver
+  ports:
+  - protocol: TCP
+    name: http
+    port: 30080
+    targetPort: 8080
+  type: ClusterIP
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: historyserver-demo
+  labels:
+    app: historyserver
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: historyserver
+  template:
+    metadata:
+      labels:
+        app: historyserver
+    spec:
+      serviceAccountName: historyserver
+      containers:
+      - name: historyserver
+        env:
+        - name: S3DISABLE_SSL
+          value: "true"
+        - name: AWS_S3ID
+          value: minioadmin
+        - name: AWS_S3SECRET
+          value: minioadmin
+        - name: AWS_S3TOKEN
+          value: ""
+        - name: S3_BUCKET
+          value: "ray-historyserver"
+        - name: S3_ENDPOINT
+          value: "minio-service.minio-dev:9000"
+        - name: S3_REGION
+          value: "test"
+        - name: S3FORCE_PATH_STYLE
+          value: "true"
+        image: historyserver:v0.1.0
+        imagePullPolicy: IfNotPresent
+        command:
+        - historyserver
+        - --runtime-class-name=s3
+        - --ray-root-dir=log
+        ports:
+        - containerPort: 8080
+        resources:
+          limits:
+            cpu: "500m"
diff --git a/historyserver/config/rayjob.yaml b/historyserver/config/rayjob.yaml
index dc5c5dbf041..6741b0ee898 100644
--- a/historyserver/config/rayjob.yaml
+++ b/historyserver/config/rayjob.yaml
@@ -8,11 +8,11 @@ spec:
     import ray
     ray.init()
 
-    @ray.remote
+    @ray.remote(num_cpus=0.5)
     def my_task(x):
         return x * 2
 
-    @ray.remote
+    @ray.remote(num_cpus=0.5)
     class Counter:
         def __init__(self):
             self.count = 0
diff --git a/historyserver/config/service_account.yaml b/historyserver/config/service_account.yaml
new file mode 100644
index 00000000000..42148d9a948
--- /dev/null
+++ b/historyserver/config/service_account.yaml
@@ -0,0 +1,27 @@
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: historyserver
+automountServiceAccountToken: true
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: raycluster-reader
+rules:
+- apiGroups: ["ray.io"]
+  resources: ["rayclusters"]
+  verbs: ["list", "get"]
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: historyserver
+subjects:
+- kind: ServiceAccount
+  name: historyserver
+  namespace: default
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: raycluster-reader
diff --git a/historyserver/docs/set_up_historyserver.md b/historyserver/docs/set_up_historyserver.md
new file mode 100644
index 00000000000..6b5f6780951
--- /dev/null
+++ b/historyserver/docs/set_up_historyserver.md
@@ -0,0 +1,153 @@
+# History Server Quick Start Guide
+
+## Prerequisites
+
+- Kind
+- Docker
+- kubectl
+- Go 1.24+
+
+## Setup Steps
+
+### 1. Create Kind Cluster
+
+```bash
+kind create cluster --image=kindest/node:v1.27.0
+```
+
+### 2. Build and Run Ray Operator
+
+Build and deploy the KubeRay operator (binary or deployment).
+
+### 3. Deploy MinIO
+
+```bash
+kubectl apply -f historyserver/config/minio.yaml
+```
+
+### 4. Build and Load Collector & History Server Images
+
+```bash
+cd historyserver
+make localimage-historyserver
+kind load docker-image historyserver:v0.1.0
+make localimage-collector
+kind load docker-image collector:v0.1.0
+```
+
+### 5. Deploy Ray Cluster
+
+```bash
+kubectl apply -f historyserver/config/raycluster.yaml
+```
+
+### 6. Submit Ray Job
+
+```bash
+kubectl apply -f historyserver/config/rayjob.yaml
+```
+
+### 7. Delete Ray Cluster (Trigger Log Upload)
+
+```bash
+kubectl delete -f historyserver/config/raycluster.yaml
+```
+
+### 8. Deploy History Server
+
+```bash
+kubectl apply -f historyserver/config/historyserver.yaml
+```
+
+### 9. Access History Server
+
+```bash
+kubectl port-forward svc/historyserver 8080:30080
+```
+
+> **Note**: Get the correct session directory from the MinIO console.
+> Login: `minioadmin` / `minioadmin`
+> See: [MinIO Setup Guide](./set_up_collector.md#deploy-minio-for-log-and-event-storage)
+
+---
+
+## API Endpoints
+
+### Health Check
+
+```bash
+curl "http://localhost:8080/readz"
+curl "http://localhost:8080/livez"
+```
+
+### List Clusters
+
+```bash
+curl "http://localhost:8080/clusters"
+```
+
+### Enter a Session (Dead Cluster)
+
+```bash
+SESSION="session_2026-01-11_19-38-40_146706_1" # Replace with actual session
+curl -c ~/cookies.txt "http://localhost:8080/enter_cluster/default/raycluster-historyserver/$SESSION"
+```
+
+### Dead Cluster Endpoints
+
+```bash
+# All Tasks
+curl -b ~/cookies.txt "http://localhost:8080/api/v0/tasks"
+
+# Tasks by job_id
+curl -b ~/cookies.txt "http://localhost:8080/api/v0/tasks?filter_keys=job_id&filter_predicates==&filter_values=AgAAAA=="
+
+# Task by task_id
+curl -b ~/cookies.txt "http://localhost:8080/api/v0/tasks?filter_keys=task_id&filter_predicates==&filter_values=Z6Loz6WgbbP///////////////8CAAAA"
+
+# All Actors
+curl -b ~/cookies.txt "http://localhost:8080/logical/actors"
+
+# Single Actor
+curl -b ~/cookies.txt "http://localhost:8080/logical/actors/"
+
+# Nodes
+curl -b ~/cookies.txt "http://localhost:8080/nodes?view=summary"
+```
+
+### Enter a Session (Live Cluster)
+
+```bash
+SESSION="live"
+curl -c ~/cookies.txt "http://localhost:8080/enter_cluster/default/raycluster-historyserver/$SESSION"
+```
+
+### Live Cluster Endpoints
+
+Switch to the live session first, then:
+
+```bash
+# All Tasks
+curl -b ~/cookies.txt "http://localhost:8080/api/v0/tasks"
+
+# Tasks by job_id
+curl -b ~/cookies.txt "http://localhost:8080/api/v0/tasks?filter_keys=job_id&filter_predicates==&filter_values=04000000"
+
+# Task Summarize
+curl -b ~/cookies.txt "http://localhost:8080/api/v0/tasks/summarize"
+
+# All Actors
+curl -b ~/cookies.txt "http://localhost:8080/logical/actors"
+
+# Single Actor
+curl -b ~/cookies.txt "http://localhost:8080/logical/actors/"
+
+# Nodes Summary
+curl -b ~/cookies.txt "http://localhost:8080/nodes?view=summary"
+
+# Jobs
+curl -b ~/cookies.txt "http://localhost:8080/api/jobs/"
+
+# Cluster Status
+curl -b ~/cookies.txt "http://localhost:8080/api/cluster_status"
+```
diff --git a/historyserver/go.mod b/historyserver/go.mod
index 07e944dc1ef..96b2eb62a66 100644
--- a/historyserver/go.mod
+++ b/historyserver/go.mod
@@ -9,15 +9,16 @@ require (
 	github.com/aws/aws-sdk-go v1.55.8
 	github.com/emicklei/go-restful/v3 v3.13.0
github.com/fsnotify/fsnotify v1.9.0 - github.com/onsi/gomega v1.37.0 + github.com/onsi/gomega v1.38.2 github.com/ray-project/kuberay/ray-operator v1.5.1 github.com/sirupsen/logrus v1.9.3 - k8s.io/api v0.34.3 - k8s.io/apimachinery v0.34.3 + k8s.io/apimachinery v0.35.0 + k8s.io/client-go v0.35.0 + sigs.k8s.io/controller-runtime v0.22.4 ) require ( - github.com/Masterminds/semver/v3 v3.3.1 // indirect + github.com/Masterminds/semver/v3 v3.4.0 // indirect github.com/alibabacloud-go/debug v1.0.1 // indirect github.com/beorn7/perks v1.0.1 // indirect github.com/blang/semver/v4 v4.0.0 // indirect @@ -33,7 +34,7 @@ require ( github.com/gogo/protobuf v1.3.2 // indirect github.com/google/btree v1.1.3 // indirect github.com/google/gnostic-models v0.7.0 // indirect - github.com/google/go-cmp v0.7.0 // indirect + github.com/google/go-cmp v0.7.0 github.com/google/pprof v0.0.0-20250403155104-27863c87afa6 // indirect github.com/google/uuid v1.6.0 // indirect github.com/gorilla/websocket v1.5.4-0.20250319132907-e064f32e3674 // indirect @@ -41,52 +42,53 @@ require ( github.com/josharian/intern v1.0.0 // indirect github.com/json-iterator/go v1.1.12 // indirect github.com/mailru/easyjson v0.9.0 // indirect - github.com/moby/spdystream v0.5.0 // indirect github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f // indirect - github.com/onsi/ginkgo/v2 v2.23.4 // indirect + github.com/onsi/ginkgo/v2 v2.27.2 // indirect github.com/openshift/api v0.0.0-20250602203052-b29811a290c7 // indirect - github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect github.com/prometheus/client_golang v1.23.0 // indirect github.com/prometheus/client_model v0.6.2 // indirect github.com/prometheus/common v0.65.0 // indirect github.com/prometheus/procfs v0.17.0 // indirect - github.com/rogpeppe/go-internal v1.14.1 // indirect - github.com/spf13/pflag v1.0.7 // indirect - github.com/stretchr/testify v1.11.1 // indirect + github.com/spf13/pflag v1.0.9 // indirect github.com/x448/float16 v0.8.4 // indirect go.opentelemetry.io/otel v1.35.0 // indirect go.opentelemetry.io/otel/trace v1.35.0 // indirect - go.uber.org/automaxprocs v1.6.0 // indirect - go.yaml.in/yaml/v2 v2.4.2 // indirect + go.yaml.in/yaml/v2 v2.4.3 // indirect go.yaml.in/yaml/v3 v3.0.4 // indirect - golang.org/x/net v0.46.0 // indirect + golang.org/x/net v0.47.0 // indirect golang.org/x/oauth2 v0.30.0 // indirect - golang.org/x/sync v0.17.0 // indirect - golang.org/x/sys v0.37.0 // indirect - golang.org/x/term v0.36.0 // indirect - golang.org/x/text v0.30.0 // indirect + golang.org/x/sync v0.18.0 // indirect + golang.org/x/sys v0.38.0 // indirect + golang.org/x/term v0.37.0 // indirect + golang.org/x/text v0.31.0 // indirect golang.org/x/time v0.14.0 // indirect - golang.org/x/tools v0.37.0 // indirect - gomodules.xyz/jsonpatch/v2 v2.4.0 // indirect google.golang.org/protobuf v1.36.8 // indirect - gopkg.in/evanphx/json-patch.v4 v4.13.0 // indirect gopkg.in/inf.v0 v0.9.1 // indirect gopkg.in/ini.v1 v1.67.0 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect + k8s.io/api v0.35.0 k8s.io/apiextensions-apiserver v0.34.1 // indirect k8s.io/apiserver v0.34.1 // indirect - k8s.io/client-go v0.34.3 // indirect k8s.io/component-base v0.34.1 // indirect k8s.io/klog/v2 v2.130.1 // indirect - k8s.io/kube-openapi 
v0.0.0-20250814151709-d7b6acb124c3 // indirect - k8s.io/utils v0.0.0-20250820121507-0af2bda4dd1d // indirect - sigs.k8s.io/controller-runtime v0.22.1 // indirect + k8s.io/kube-openapi v0.0.0-20250910181357-589584f1c912 // indirect + k8s.io/utils v0.0.0-20251002143259-bc988d571ff4 // indirect sigs.k8s.io/gateway-api v1.4.0 // indirect sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730 // indirect sigs.k8s.io/randfill v1.0.0 // indirect sigs.k8s.io/structured-merge-diff/v6 v6.3.0 // indirect sigs.k8s.io/yaml v1.6.0 // indirect ) + +require ( + github.com/moby/spdystream v0.5.0 // indirect + github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect + github.com/stretchr/testify v1.11.1 // indirect + golang.org/x/mod v0.29.0 // indirect + golang.org/x/tools v0.38.0 // indirect + gomodules.xyz/jsonpatch/v2 v2.4.0 // indirect + gopkg.in/evanphx/json-patch.v4 v4.13.0 // indirect +) diff --git a/historyserver/go.sum b/historyserver/go.sum index 7c4908b7613..2b623ecc52f 100644 --- a/historyserver/go.sum +++ b/historyserver/go.sum @@ -1,5 +1,5 @@ -github.com/Masterminds/semver/v3 v3.3.1 h1:QtNSWtVZ3nBfk8mAOu/B6v7FMJ+NHTIgUPi7rj+4nv4= -github.com/Masterminds/semver/v3 v3.3.1/go.mod h1:4V+yj/TJE1HU9XfppCwVMZq3I84lprf4nC11bSS5beM= +github.com/Masterminds/semver/v3 v3.4.0 h1:Zog+i5UMtVoCU8oKka5P7i9q9HgrJeGzI9SA1Xbatp0= +github.com/Masterminds/semver/v3 v3.4.0/go.mod h1:4V+yj/TJE1HU9XfppCwVMZq3I84lprf4nC11bSS5beM= github.com/alibabacloud-go/debug v1.0.0/go.mod h1:8gfgZCCAC3+SCzjWtY053FrOcd4/qlH6IHTI4QyICOc= github.com/alibabacloud-go/debug v1.0.1 h1:MsW9SmUtbb1Fnt3ieC6NNZi6aEwrXfDksD4QA6GSbPg= github.com/alibabacloud-go/debug v1.0.1/go.mod h1:8gfgZCCAC3+SCzjWtY053FrOcd4/qlH6IHTI4QyICOc= @@ -36,6 +36,12 @@ github.com/fsnotify/fsnotify v1.9.0 h1:2Ml+OJNzbYCTzsxtv8vKSFD9PbJjmhYF14k/jKC7S github.com/fsnotify/fsnotify v1.9.0/go.mod h1:8jBTzvmWwFyi3Pb8djgCCO5IBqzKJ/Jwo8TRcHyHii0= github.com/fxamacker/cbor/v2 v2.9.0 h1:NpKPmjDBgUfBms6tr6JZkTHtfFGcMKsw3eGcmD/sapM= github.com/fxamacker/cbor/v2 v2.9.0/go.mod h1:vM4b+DJCtHn+zz7h3FFp/hDAI9WNWCsZj23V5ytsSxQ= +github.com/gkampitakis/ciinfo v0.3.2 h1:JcuOPk8ZU7nZQjdUhctuhQofk7BGHuIy0c9Ez8BNhXs= +github.com/gkampitakis/ciinfo v0.3.2/go.mod h1:1NIwaOcFChN4fa/B0hEBdAb6npDlFL8Bwx4dfRLRqAo= +github.com/gkampitakis/go-diff v1.3.2 h1:Qyn0J9XJSDTgnsgHRdz9Zp24RaJeKMUHg2+PDZZdC4M= +github.com/gkampitakis/go-diff v1.3.2/go.mod h1:LLgOrpqleQe26cte8s36HTWcTmMEur6OPYerdAAS9tk= +github.com/gkampitakis/go-snaps v0.5.15 h1:amyJrvM1D33cPHwVrjo9jQxX8g/7E2wYdZ+01KS3zGE= +github.com/gkampitakis/go-snaps v0.5.15/go.mod h1:HNpx/9GoKisdhw9AFOBT1N7DBs9DiHo/hGheFGBZ+mc= github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI= github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= github.com/go-logr/zapr v1.3.0 h1:XGdV8XW8zdwFiwOA2Dryh1gj2KRQyOOoNmBy4EplIcQ= @@ -48,6 +54,8 @@ github.com/go-openapi/swag v0.23.1 h1:lpsStH0n2ittzTnbaSloVZLuB5+fvSY/+hnagBjSNZ github.com/go-openapi/swag v0.23.1/go.mod h1:STZs8TbRvEQQKUA+JZNAm3EWlgaOBGpyFDqQnDHMef0= github.com/go-task/slim-sprig/v3 v3.0.0 h1:sUs3vkvUymDpBKi3qH1YSqBQk9+9D/8M2mN1vB6EwHI= github.com/go-task/slim-sprig/v3 v3.0.0/go.mod h1:W848ghGpv3Qj3dhTPRyJypKRiqCdHZiAzKg9hl15HA8= +github.com/goccy/go-yaml v1.18.0 h1:8W7wMFS12Pcas7KU+VVkaiCng+kG8QiFeFwzFb+rwuw= +github.com/goccy/go-yaml v1.18.0/go.mod h1:XBurs7gK8ATbW4ZPGKgcbrY1Br56PdM69F7LkFRi1kA= github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= github.com/gogo/protobuf v1.3.2/go.mod 
h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= github.com/google/btree v1.1.3 h1:CVpQJjYgC4VbzxeGVHfvZrv1ctoYCAI8vbl07Fcxlyg= @@ -74,6 +82,8 @@ github.com/jmespath/go-jmespath/internal/testify v1.5.1 h1:shLQSRRSCCPj3f2gpwzGw github.com/jmespath/go-jmespath/internal/testify v1.5.1/go.mod h1:L3OGu8Wl2/fWfCI6z80xFu9LTZmf1ZRjMHUOPmWr69U= github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY= github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y= +github.com/joshdk/go-junit v1.0.0 h1:S86cUKIdwBHWwA6xCmFlf3RTLfVXYQfvanM5Uh+K6GE= +github.com/joshdk/go-junit v1.0.0/go.mod h1:TiiV0PqkaNfFXjEiyjWM3XXrhVyCa1K4Zfga6W52ung= github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8= @@ -90,6 +100,10 @@ github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0 github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw= github.com/mailru/easyjson v0.9.0 h1:PrnmzHw7262yW8sTBwxi1PdJA3Iw/EKBa8psRf7d9a4= github.com/mailru/easyjson v0.9.0/go.mod h1:1+xMtQp2MRNVL/V1bOzuP3aP8VNwRW55fQUto+XFtTU= +github.com/maruel/natural v1.1.1 h1:Hja7XhhmvEFhcByqDoHz9QZbkWey+COd9xWfCfn1ioo= +github.com/maruel/natural v1.1.1/go.mod h1:v+Rfd79xlw1AgVBjbO0BEQmptqb5HvL/k9GRHB7ZKEg= +github.com/mfridman/tparse v0.18.0 h1:wh6dzOKaIwkUGyKgOntDW4liXSo37qg5AXbIhkMV3vE= +github.com/mfridman/tparse v0.18.0/go.mod h1:gEvqZTuCgEhPbYk/2lS3Kcxg1GmTxxU7kTC8DvP0i/A= github.com/moby/spdystream v0.5.0 h1:7r0J1Si3QO/kjRitvSLVVFUjxMEb/YLj6S9FF62JBCU= github.com/moby/spdystream v0.5.0/go.mod h1:xBAYlnt/ay+11ShkdFKNAG7LsyK/tmNBVvVOwrfMgdI= github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= @@ -103,10 +117,10 @@ github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8m github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f h1:y5//uYreIhSUg3J1GEMiLbxo1LJaP8RfCpH6pymGZus= github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f/go.mod h1:ZdcZmHo+o7JKHSa8/e818NopupXU1YMK5fe1lsApnBw= github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e/go.mod h1:zD1mROLANZcx1PVRCS0qkT7pwLkGfwJo4zjcN/Tysno= -github.com/onsi/ginkgo/v2 v2.23.4 h1:ktYTpKJAVZnDT4VjxSbiBenUjmlL/5QkBEocaWXiQus= -github.com/onsi/ginkgo/v2 v2.23.4/go.mod h1:Bt66ApGPBFzHyR+JO10Zbt0Gsp4uWxu5mIOTusL46e8= -github.com/onsi/gomega v1.37.0 h1:CdEG8g0S133B4OswTDC/5XPSzE1OeP29QOioj2PID2Y= -github.com/onsi/gomega v1.37.0/go.mod h1:8D9+Txp43QWKhM24yyOBEdpkzN8FvJyAwecBgsU4KU0= +github.com/onsi/ginkgo/v2 v2.27.2 h1:LzwLj0b89qtIy6SSASkzlNvX6WktqurSHwkk2ipF/Ns= +github.com/onsi/ginkgo/v2 v2.27.2/go.mod h1:ArE1D/XhNXBXCBkKOLkbsb2c81dQHCRcF5zwn/ykDRo= +github.com/onsi/gomega v1.38.2 h1:eZCjf2xjZAqe+LeWvKb5weQ+NcPwX84kqJ0cZNxok2A= +github.com/onsi/gomega v1.38.2/go.mod h1:W2MJcYxRGV63b418Ai34Ud0hEdTVXq9NW9+Sx6uXf3k= github.com/openshift/api v0.0.0-20250602203052-b29811a290c7 h1:dZ9uBd0Cw3+l1RGpYRkWdrRjM9yvfxrjW/uPHKUwtIQ= github.com/openshift/api v0.0.0-20250602203052-b29811a290c7/go.mod h1:yk60tHAmHhtVpJQo3TwVYq2zpuP70iJIFDCmeKMIzPw= github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= @@ -114,8 +128,6 @@ github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINE github.com/pmezard/go-difflib v1.0.0/go.mod 
h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= -github.com/prashantv/gostub v1.1.0 h1:BTyx3RfQjRHnUWaGF9oQos79AlQ5k8WNktv7VGvVH4g= -github.com/prashantv/gostub v1.1.0/go.mod h1:A5zLQHz7ieHGG7is6LLXLz7I8+3LZzsrV0P1IAHhP5U= github.com/prometheus/client_golang v1.23.0 h1:ust4zpdl9r4trLY/gSjlm07PuiBq2ynaXXlptpfy8Uc= github.com/prometheus/client_golang v1.23.0/go.mod h1:i/o0R9ByOnHX0McrTMTyhYvKE4haaf2mW08I+jGAjEE= github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNwqPLxwZyk= @@ -130,8 +142,8 @@ github.com/rogpeppe/go-internal v1.14.1 h1:UQB4HGPB6osV0SQTLymcB4TgvyWu6ZyliaW0t github.com/rogpeppe/go-internal v1.14.1/go.mod h1:MaRKkUm5W0goXpeCfT7UZI6fk/L7L7so1lCWt35ZSgc= github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ= github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ= -github.com/spf13/pflag v1.0.7 h1:vN6T9TfwStFPFM5XzjsvmzZkLuaLX+HS+0SeFLRgU6M= -github.com/spf13/pflag v1.0.7/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= +github.com/spf13/pflag v1.0.9 h1:9exaQaMOCwffKiiiYk6/BndUBv+iRViNW+4lEMi0PvY= +github.com/spf13/pflag v1.0.9/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.5.2 h1:xuMeJ0Sdp5ZMRXx/aWO6RZxdr3beISkG5/G/aIRr3pY= github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA= @@ -140,6 +152,14 @@ github.com/stretchr/testify v1.5.1/go.mod h1:5W2xD1RspED5o8YsWQXVCued0rvSQ+mT+I5 github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= +github.com/tidwall/gjson v1.18.0 h1:FIDeeyB800efLX89e5a8Y0BNH+LOngJyGrIWxG2FKQY= +github.com/tidwall/gjson v1.18.0/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk= +github.com/tidwall/match v1.1.1 h1:+Ho715JplO36QYgwN9PGYNhgZvoUSc9X2c80KVTi+GA= +github.com/tidwall/match v1.1.1/go.mod h1:eRSPERbgtNPcGhD8UCthc6PmLEQXEWd3PRB5JTxsfmM= +github.com/tidwall/pretty v1.2.1 h1:qjsOFOWWQl+N3RsoF5/ssm1pHmJJwhjlSbZ51I6wMl4= +github.com/tidwall/pretty v1.2.1/go.mod h1:ITEVvHYasfjBbM0u2Pg8T2nJnzm8xPwvNhhsoaGGjNU= +github.com/tidwall/sjson v1.2.5 h1:kLy8mja+1c9jlljvWTlSazM7cKDRfJuR/bOJhcY5NcY= +github.com/tidwall/sjson v1.2.5/go.mod h1:Fvgq9kS/6ociJEDnK0Fk1cpYF4FIW6ZF7LAe+6jwd28= github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM= github.com/x448/float16 v0.8.4/go.mod h1:14CWIYCyZA/cWjXOioeEpHeN/83MdbZDRQHoFcYsOfg= github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= @@ -149,16 +169,14 @@ go.opentelemetry.io/otel v1.35.0 h1:xKWKPxrxB6OtMCbmMY021CqC45J+3Onta9MqjhnusiQ= go.opentelemetry.io/otel v1.35.0/go.mod h1:UEqy8Zp11hpkUrL73gSlELM0DupHoiq72dR+Zqel/+Y= go.opentelemetry.io/otel/trace v1.35.0 h1:dPpEfJu1sDIqruz7BHFG3c7528f6ddfSWfFDVt/xgMs= go.opentelemetry.io/otel/trace v1.35.0/go.mod h1:WUk7DtFp1Aw2MkvqGdwiXYDZZNvA/1J8o6xRXLrIkyc= -go.uber.org/automaxprocs v1.6.0 h1:O3y2/QNTOdbF+e/dpXNNW7Rx2hZ4sTIPyybbxyNqTUs= -go.uber.org/automaxprocs v1.6.0/go.mod h1:ifeIMSnPZuznNm6jmdzmU3/bfk01Fe2fotchwEFJ8r8= go.uber.org/goleak 
v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0= go.uber.org/multierr v1.11.0/go.mod h1:20+QtiLqy0Nd6FdQB9TLXag12DsQkrbs3htMFfDN80Y= go.uber.org/zap v1.27.0 h1:aJMhYGrd5QSmlpLMr2MftRKl7t8J8PTZPA732ud/XR8= go.uber.org/zap v1.27.0/go.mod h1:GB2qFLM7cTU87MWRP2mPIjqfIDnGu+VIO4V/SdhGo2E= -go.yaml.in/yaml/v2 v2.4.2 h1:DzmwEr2rDGHl7lsFgAHxmNz/1NlQ7xLIrlN2h5d1eGI= -go.yaml.in/yaml/v2 v2.4.2/go.mod h1:081UH+NErpNdqlCXm3TtEran0rJZGxAYx9hb/ELlsPU= +go.yaml.in/yaml/v2 v2.4.3 h1:6gvOSjQoTB3vt1l+CU+tSyi/HOjfOjRLJ4YwYZGwRO0= +go.yaml.in/yaml/v2 v2.4.3/go.mod h1:zSxWcmIDjOzPXpjlTTbAsKokqkDNAVtZO0WOMiT90s8= go.yaml.in/yaml/v3 v3.0.4 h1:tfq32ie2Jv2UxXFdLJdh3jXuOzWiL1fo0bu/FbuKpbc= go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= @@ -177,6 +195,8 @@ golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= golang.org/x/mod v0.12.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= golang.org/x/mod v0.15.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= golang.org/x/mod v0.17.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= +golang.org/x/mod v0.29.0 h1:HV8lRxZC4l2cr3Zq1LvtOsi/ThTgWnUk/y64QSs8GwA= +golang.org/x/mod v0.29.0/go.mod h1:NyhrlYXJ2H4eJiRy/WDBO6HMqZQ6q9nk4JzS3NuCK+w= golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= @@ -190,8 +210,8 @@ golang.org/x/net v0.20.0/go.mod h1:z8BVo6PvndSri0LbOE3hAn0apkU+1YvI6E70E9jsnvY= golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44= golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM= golang.org/x/net v0.26.0/go.mod h1:5YKkiSynbBIh3p6iOc/vibscux0x38BZDkn8sCUPxHE= -golang.org/x/net v0.46.0 h1:giFlY12I07fugqwPuWJi68oOnpfqFnJIJzaIIm2JVV4= -golang.org/x/net v0.46.0/go.mod h1:Q9BGdFy1y4nkUwiLvT5qtyhAnEHgnQ/zd8PfU6nc210= +golang.org/x/net v0.47.0 h1:Mx+4dIFzqraBXUugkia1OOvlD6LemFo1ALMHjrXDOhY= +golang.org/x/net v0.47.0/go.mod h1:/jNxtkgq5yWUGYkaZGqo27cfGZ1c5Nen03aYrrKpVRU= golang.org/x/oauth2 v0.30.0 h1:dnDm7JmhM45NNpd8FDDeLhK6FwqbOf4MLCM9zb1BOHI= golang.org/x/oauth2 v0.30.0/go.mod h1:B++QgG3ZKulg6sRPGD/mqlHQs5rB3Ml9erfeDY7xKlU= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= @@ -202,8 +222,8 @@ golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.3.0/go.mod h1:FU7BRWz2tNW+3quACPkgCx/L+uEAv1htQ0V83Z9Rj+Y= golang.org/x/sync v0.6.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= golang.org/x/sync v0.7.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= -golang.org/x/sync v0.17.0 h1:l60nONMj9l5drqw6jlhIELNv9I0A4OFgRsG9k2oT9Ug= -golang.org/x/sync v0.17.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI= +golang.org/x/sync v0.18.0 h1:kr88TuHDroi+UVf+0hZnirlk8o8T+4MrK6mr60WkH/I= +golang.org/x/sync v0.18.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys 
v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= @@ -219,8 +239,8 @@ golang.org/x/sys v0.16.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/sys v0.20.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/sys v0.21.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= -golang.org/x/sys v0.37.0 h1:fdNQudmxPjkdUTPnLn5mdQv7Zwvbvpaxqs831goi9kQ= -golang.org/x/sys v0.37.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= +golang.org/x/sys v0.38.0 h1:3yZWxaJjBmCWXqhN1qh02AkOnCQ1poK6oF+a7xWL6Gc= +golang.org/x/sys v0.38.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= golang.org/x/telemetry v0.0.0-20240228155512-f48c80bd79b2/go.mod h1:TeRTkGYfJXctD9OcfyVLyj2J3IxLnKwHJR8f4D8a3YE= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= @@ -231,8 +251,8 @@ golang.org/x/term v0.16.0/go.mod h1:yn7UURbUtPyrVJPGPq404EukNFxcm/foM+bV/bfcDsY= golang.org/x/term v0.17.0/go.mod h1:lLRBjIVuehSbZlaOtGMbcMncT+aqLLLmKrsjNrUguwk= golang.org/x/term v0.20.0/go.mod h1:8UkIAJTvZgivsXaD6/pH6U9ecQzZ45awqEOzuCvwpFY= golang.org/x/term v0.21.0/go.mod h1:ooXLefLobQVslOqselCNF4SxFAaoS6KujMbsGzSDmX0= -golang.org/x/term v0.36.0 h1:zMPR+aF8gfksFprF/Nc/rd1wRS1EI6nDBGyWAvDzx2Q= -golang.org/x/term v0.36.0/go.mod h1:Qu394IJq6V6dCBRgwqshf3mPF85AqzYEzofzRdZkWss= +golang.org/x/term v0.37.0 h1:8EGAD0qCmHYZg6J17DvsMy9/wJ7/D/4pV/wfnld5lTU= +golang.org/x/term v0.37.0/go.mod h1:5pB4lxRNYYVZuTLmy8oR2BH8dflOR+IbTYFD8fi3254= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= @@ -242,8 +262,8 @@ golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE= golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= golang.org/x/text v0.15.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= golang.org/x/text v0.16.0/go.mod h1:GhwF1Be+LQoKShO3cGOHzqOgRrGaYc9AvblQOmPVHnI= -golang.org/x/text v0.30.0 h1:yznKA/E9zq54KzlzBEAWn1NXSQ8DIp/NYMy88xJjl4k= -golang.org/x/text v0.30.0/go.mod h1:yDdHFIX9t+tORqspjENWgzaCVXgk0yYnYuSZ8UzzBVM= +golang.org/x/text v0.31.0 h1:aC8ghyu4JhP8VojJ2lEHBnochRno1sgL6nEi9WGFGMM= +golang.org/x/text v0.31.0/go.mod h1:tKRAlv61yKIjGGHX/4tP1LTbc13YSec1pxVEWXzfoeM= golang.org/x/time v0.14.0 h1:MRx4UaLrDotUKUdCIqzPC48t1Y9hANFKIRpNx+Te8PI= golang.org/x/time v0.14.0/go.mod h1:eL/Oa2bBBK0TkX57Fyni+NgnyQQN4LitPmob2Hjnqw4= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= @@ -254,8 +274,8 @@ golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU= golang.org/x/tools v0.13.0/go.mod h1:HvlwmtVNQAhOuCjW7xxvovg8wbNq7LwfXh/k7wXUl58= golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d/go.mod h1:aiJjzUbINMkxbQROHiO6hDPo2LHcIPhhQsa9DLh0yGk= -golang.org/x/tools v0.37.0 h1:DVSRzp7FwePZW356yEAChSdNcQo6Nsp+fex1SUW09lE= -golang.org/x/tools v0.37.0/go.mod h1:MBN5QPQtLMHVdvsbtarmTNukZDdgwdwlO5qGacAzF0w= +golang.org/x/tools 
v0.38.0 h1:Hx2Xv8hISq8Lm16jvBZ2VQf+RLmbd7wVUsALibYI/IQ= +golang.org/x/tools v0.38.0/go.mod h1:yEsQ/d/YK8cjh0L6rZlY8tgtlKiBNTL14pGDJPJpYQs= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= @@ -281,26 +301,26 @@ gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= -k8s.io/api v0.34.3 h1:D12sTP257/jSH2vHV2EDYrb16bS7ULlHpdNdNhEw2S4= -k8s.io/api v0.34.3/go.mod h1:PyVQBF886Q5RSQZOim7DybQjAbVs8g7gwJNhGtY5MBk= +k8s.io/api v0.35.0 h1:iBAU5LTyBI9vw3L5glmat1njFK34srdLmktWwLTprlY= +k8s.io/api v0.35.0/go.mod h1:AQ0SNTzm4ZAczM03QH42c7l3bih1TbAXYo0DkF8ktnA= k8s.io/apiextensions-apiserver v0.34.1 h1:NNPBva8FNAPt1iSVwIE0FsdrVriRXMsaWFMqJbII2CI= k8s.io/apiextensions-apiserver v0.34.1/go.mod h1:hP9Rld3zF5Ay2Of3BeEpLAToP+l4s5UlxiHfqRaRcMc= -k8s.io/apimachinery v0.34.3 h1:/TB+SFEiQvN9HPldtlWOTp0hWbJ+fjU+wkxysf/aQnE= -k8s.io/apimachinery v0.34.3/go.mod h1:/GwIlEcWuTX9zKIg2mbw0LRFIsXwrfoVxn+ef0X13lw= +k8s.io/apimachinery v0.35.0 h1:Z2L3IHvPVv/MJ7xRxHEtk6GoJElaAqDCCU0S6ncYok8= +k8s.io/apimachinery v0.35.0/go.mod h1:jQCgFZFR1F4Ik7hvr2g84RTJSZegBc8yHgFWKn//hns= k8s.io/apiserver v0.34.1 h1:U3JBGdgANK3dfFcyknWde1G6X1F4bg7PXuvlqt8lITA= k8s.io/apiserver v0.34.1/go.mod h1:eOOc9nrVqlBI1AFCvVzsob0OxtPZUCPiUJL45JOTBG0= -k8s.io/client-go v0.34.3 h1:wtYtpzy/OPNYf7WyNBTj3iUA0XaBHVqhv4Iv3tbrF5A= -k8s.io/client-go v0.34.3/go.mod h1:OxxeYagaP9Kdf78UrKLa3YZixMCfP6bgPwPwNBQBzpM= +k8s.io/client-go v0.35.0 h1:IAW0ifFbfQQwQmga0UdoH0yvdqrbwMdq9vIFEhRpxBE= +k8s.io/client-go v0.35.0/go.mod h1:q2E5AAyqcbeLGPdoRB+Nxe3KYTfPce1Dnu1myQdqz9o= k8s.io/component-base v0.34.1 h1:v7xFgG+ONhytZNFpIz5/kecwD+sUhVE6HU7qQUiRM4A= k8s.io/component-base v0.34.1/go.mod h1:mknCpLlTSKHzAQJJnnHVKqjxR7gBeHRv0rPXA7gdtQ0= k8s.io/klog/v2 v2.130.1 h1:n9Xl7H1Xvksem4KFG4PYbdQCQxqc/tTUyrgXaOhHSzk= k8s.io/klog/v2 v2.130.1/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE= -k8s.io/kube-openapi v0.0.0-20250814151709-d7b6acb124c3 h1:liMHz39T5dJO1aOKHLvwaCjDbf07wVh6yaUlTpunnkE= -k8s.io/kube-openapi v0.0.0-20250814151709-d7b6acb124c3/go.mod h1:UZ2yyWbFTpuhSbFhv24aGNOdoRdJZgsIObGBUaYVsts= -k8s.io/utils v0.0.0-20250820121507-0af2bda4dd1d h1:wAhiDyZ4Tdtt7e46e9M5ZSAJ/MnPGPs+Ki1gHw4w1R0= -k8s.io/utils v0.0.0-20250820121507-0af2bda4dd1d/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= -sigs.k8s.io/controller-runtime v0.22.1 h1:Ah1T7I+0A7ize291nJZdS1CabF/lB4E++WizgV24Eqg= -sigs.k8s.io/controller-runtime v0.22.1/go.mod h1:FwiwRjkRPbiN+zp2QRp7wlTCzbUXxZ/D4OzuQUDwBHY= +k8s.io/kube-openapi v0.0.0-20250910181357-589584f1c912 h1:Y3gxNAuB0OBLImH611+UDZcmKS3g6CthxToOb37KgwE= +k8s.io/kube-openapi v0.0.0-20250910181357-589584f1c912/go.mod h1:kdmbQkyfwUagLfXIad1y2TdrjPFWp2Q89B3qkRwf/pQ= +k8s.io/utils v0.0.0-20251002143259-bc988d571ff4 h1:SjGebBtkBqHFOli+05xYbK8YF1Dzkbzn+gDM4X9T4Ck= +k8s.io/utils v0.0.0-20251002143259-bc988d571ff4/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= +sigs.k8s.io/controller-runtime v0.22.4 h1:GEjV7KV3TY8e+tJ2LCTxUTanW4z/FmNB7l327UfMq9A= +sigs.k8s.io/controller-runtime v0.22.4/go.mod 
h1:+QX1XUpTXN4mLoblf4tqr5CQcyHPAki2HLXqQMY6vh8= sigs.k8s.io/gateway-api v1.4.0 h1:ZwlNM6zOHq0h3WUX2gfByPs2yAEsy/EenYJB78jpQfQ= sigs.k8s.io/gateway-api v1.4.0/go.mod h1:AR5RSqciWP98OPckEjOjh2XJhAe2Na4LHyXD2FUY7Qk= sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730 h1:IpInykpT6ceI+QxKBbEflcR5EXP7sU1kvOlxwZh5txg= diff --git a/historyserver/pkg/collector/logcollector/runtime/runtime.go b/historyserver/pkg/collector/logcollector/runtime/runtime.go index 8189b80e332..7e7b1ae7b84 100644 --- a/historyserver/pkg/collector/logcollector/runtime/runtime.go +++ b/historyserver/pkg/collector/logcollector/runtime/runtime.go @@ -40,6 +40,8 @@ func NewCollector(config *types.RayCollectorConfig, writer storage.StorageWriter } logDir := strings.TrimSpace(path.Join(config.SessionDir, utils.RAY_SESSIONDIR_LOGDIR_NAME)) handler.LogDir = logDir + // rootMetaDir uses flat key format (name_id) for S3/OSS performance optimization. + // See utils.connector for the design rationale. rootMetaDir := fmt.Sprintf("%s/", path.Clean(path.Join(handler.RootDir, handler.RayClusterName+"_"+handler.RayClusterID, "meta"))) handler.MetaDir = rootMetaDir diff --git a/historyserver/pkg/collector/registry.go b/historyserver/pkg/collector/registry.go index 423b9be14ef..037e1327409 100644 --- a/historyserver/pkg/collector/registry.go +++ b/historyserver/pkg/collector/registry.go @@ -15,8 +15,8 @@ func GetWriterRegistry() WriterRegistry { } var writerRegistry = WriterRegistry{ - "aliyunoss": ray.NewWritter, - "s3": s3.NewWritter, + "aliyunoss": ray.NewWriter, + "s3": s3.NewWriter, } type ReaderRegistry map[string]func(globalData *types.RayHistoryServerConfig, data map[string]interface{}) (storage.StorageReader, error) diff --git a/historyserver/pkg/eventserver/eventprocessor_interface.go b/historyserver/pkg/eventserver/eventprocessor_interface.go new file mode 100644 index 00000000000..37f9a17d6e7 --- /dev/null +++ b/historyserver/pkg/eventserver/eventprocessor_interface.go @@ -0,0 +1,7 @@ +package eventserver + +import "context" + +type EventProcessor[T any] interface { + ProcessEvents(ctx context.Context, ch <-chan T) error +} diff --git a/historyserver/pkg/eventserver/eventserver.go b/historyserver/pkg/eventserver/eventserver.go new file mode 100644 index 00000000000..f6b7488201f --- /dev/null +++ b/historyserver/pkg/eventserver/eventserver.go @@ -0,0 +1,710 @@ +package eventserver + +import ( + "context" + "encoding/json" + "fmt" + "io" + "regexp" + "sort" + "strings" + "sync" + "time" + + "github.com/ray-project/kuberay/historyserver/pkg/eventserver/types" + "github.com/ray-project/kuberay/historyserver/pkg/storage" + "github.com/ray-project/kuberay/historyserver/pkg/utils" + "github.com/sirupsen/logrus" +) + +type EventHandler struct { + reader storage.StorageReader + + ClusterTaskMap *types.ClusterTaskMap + ClusterActorMap *types.ClusterActorMap +} + +var eventFilePattern = regexp.MustCompile(`-\d{4}-\d{2}-\d{2}-\d{2}$`) + +func isValidEventFile(fileName string) bool { + // Skip directories + if strings.HasSuffix(fileName, "/") { + return false + } + // Only files matching {nodeId}-{YYYY-MM-DD-HH} format are valid event files + return eventFilePattern.MatchString(fileName) +} + +func NewEventHandler(reader storage.StorageReader) *EventHandler { + return &EventHandler{ + reader: reader, + ClusterTaskMap: &types.ClusterTaskMap{ + ClusterTaskMap: make(map[string]*types.TaskMap), + }, + ClusterActorMap: &types.ClusterActorMap{ + ClusterActorMap: make(map[string]*types.ActorMap), + }, + } +} + +// ProcessEvents func reads the 
channel and then processes each event received.
+func (h *EventHandler) ProcessEvents(ctx context.Context, ch <-chan map[string]any) error {
+	logrus.Infof("Starting an event processor")
+	for {
+		select {
+		case <-ctx.Done():
+			// TODO: The context was cancelled; either stop here or process the rest of the events and return.
+			// Currently, it just stops.
+			logrus.Warnf("Event processor context was cancelled")
+			return ctx.Err()
+		case currEventData, ok := <-ch:
+			if !ok {
+				logrus.Warnf("Channel was closed")
+				return nil
+			}
+			if err := h.storeEvent(currEventData); err != nil {
+				logrus.Errorf("Failed to store event: %v", err)
+				continue
+			}
+		}
+	}
+}
+
+// Run starts numOfEventProcessors (defaulting to 5) processing goroutines and the event reader. The event reader runs once an hour,
+// which is currently how often the collector flushes.
+func (h *EventHandler) Run(stop chan struct{}, numOfEventProcessors int) error {
+	var wg sync.WaitGroup
+
+	if numOfEventProcessors == 0 {
+		numOfEventProcessors = 5
+	}
+	eventProcessorChannels := make([]chan map[string]any, numOfEventProcessors)
+	cctx := make([]context.CancelFunc, numOfEventProcessors)
+
+	for i := range numOfEventProcessors {
+		eventProcessorChannels[i] = make(chan map[string]any, 100)
+	}
+
+	for i, currEventChannel := range eventProcessorChannels {
+		wg.Add(1)
+		ctx, cancel := context.WithCancel(context.Background())
+		cctx[i] = cancel
+		go func() {
+			defer wg.Done()
+			var processor EventProcessor[map[string]any] = h
+			err := processor.ProcessEvents(ctx, currEventChannel)
+			if err == ctx.Err() {
+				logrus.Warnf("Event processor goroutine %d is now closed", i)
+				return
+			}
+			if err != nil {
+				logrus.Errorf("event processor goroutine %d failed: %v", i, err)
+				return
+			}
+		}()
+	}
+
+	// Start reading files and sending events for processing
+	wg.Add(1)
+	go func() {
+		defer wg.Done()
+		logrus.Info("Starting event file reader loop")
+
+		// Helper function to process all events
+		processAllEvents := func() {
+			clusterList := h.reader.List()
+			for _, clusterInfo := range clusterList {
+				clusterNameNamespace := clusterInfo.Name + "_" + clusterInfo.Namespace
+				eventFileList := append(h.getAllJobEventFiles(clusterInfo), h.getAllNodeEventFiles(clusterInfo)...)
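+				// The paths gathered above are assumed to follow the collector's
+				// layout (see getAllJobEventFiles / getAllNodeEventFiles below):
+				//   <sessionName>/job_events/<job-dir>/<nodeId>-<YYYY-MM-DD-HH>
+				//   <sessionName>/node_events/<nodeId>-<YYYY-MM-DD-HH>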
+
+				logrus.Infof("current eventFileList for cluster %s is: %v", clusterInfo.Name, eventFileList)
+				for _, eventFile := range eventFileList {
+					// TODO: Filter out ones that have already been read
+					logrus.Infof("Reading event file: %s", eventFile)
+
+					eventioReader := h.reader.GetContent(clusterNameNamespace, eventFile)
+					if eventioReader == nil {
+						logrus.Errorf("Failed to get content for event file: %s, skipping", eventFile)
+						continue
+					}
+					eventbytes, err := io.ReadAll(eventioReader)
+					if err != nil {
+						logrus.Errorf("Failed to read event file: %v", err)
+						continue
+					}
+
+					var eventList []map[string]any
+					if err := json.Unmarshal(eventbytes, &eventList); err != nil {
+						logrus.Errorf("Failed to unmarshal event: %v", err)
+						continue
+					}
+
+					// Distribute events round-robin across the processor channels
+					for i, curr := range eventList {
+						// Skip nil events (can occur with corrupted event files containing null elements)
+						if curr == nil {
+							continue
+						}
+						curr["clusterName"] = clusterInfo.Name + "_" + clusterInfo.Namespace
+						eventProcessorChannels[i%numOfEventProcessors] <- curr
+					}
+				}
+			}
+		}
+
+		// Process events immediately on startup
+		processAllEvents()
+
+		// Create a ticker for hourly processing
+		ticker := time.NewTicker(1 * time.Hour)
+		defer ticker.Stop()
+
+		for {
+			logrus.Info("Finished reading files, waiting for next cycle...")
+			select {
+			case <-stop:
+				// Received stop signal, clean up and exit
+				for i, currChan := range eventProcessorChannels {
+					close(currChan)
+					cctx[i]()
+				}
+				logrus.Info("Event processor received stop signal, exiting.")
+				return
+			case <-ticker.C:
+				// Process events every hour
+				processAllEvents()
+			}
+		}
+	}()
+
+	wg.Wait()
+	return nil
+}
+
+// storeEvent unmarshals the event map into the correct actor/task struct and then stores it into the corresponding list
+func (h *EventHandler) storeEvent(eventMap map[string]any) error {
+	eventTypeVal, ok := eventMap["eventType"]
+	if !ok {
+		return fmt.Errorf("event missing 'eventType' field")
+	}
+	eventTypeStr, ok := eventTypeVal.(string)
+	if !ok {
+		return fmt.Errorf("eventType is not a string, got %T", eventTypeVal)
+	}
+	eventType := types.EventType(eventTypeStr)
+
+	clusterNameVal, ok := eventMap["clusterName"]
+	if !ok {
+		return fmt.Errorf("event missing 'clusterName' field")
+	}
+	currentClusterName, ok := clusterNameVal.(string)
+	if !ok {
+		return fmt.Errorf("clusterName is not a string, got %T", clusterNameVal)
+	}
+
+	logrus.Infof("current eventType: %v", eventType)
+	switch eventType {
+	case types.TASK_DEFINITION_EVENT:
+		taskDef, ok := eventMap["taskDefinitionEvent"]
+		if !ok {
+			return fmt.Errorf("event does not have 'taskDefinitionEvent'")
+		}
+		jsonTaskDefinition, err := json.Marshal(taskDef)
+		if err != nil {
+			return err
+		}
+
+		var currTask types.Task
+		if err := json.Unmarshal(jsonTaskDefinition, &currTask); err != nil {
+			return err
+		}
+
+		taskMap := h.ClusterTaskMap.GetOrCreateTaskMap(currentClusterName)
+		taskMap.CreateOrMergeAttempt(currTask.TaskID, currTask.AttemptNumber, func(t *types.Task) {
+			// Merge definition fields (preserve existing Events if any)
+			existingEvents := t.Events
+			*t = currTask
+			if len(existingEvents) > 0 {
+				t.Events = existingEvents
+				t.State = existingEvents[len(existingEvents)-1].State
+			}
+		})
+
+	case types.TASK_LIFECYCLE_EVENT:
+		lifecycleEvent, ok := eventMap["taskLifecycleEvent"].(map[string]any)
+		if !ok {
+			return fmt.Errorf("invalid taskLifecycleEvent format")
+		}
+
+		taskId, _ := lifecycleEvent["taskId"].(string)
+		taskAttempt, _ := 
lifecycleEvent["taskAttempt"].(float64) + transitions, _ := lifecycleEvent["stateTransitions"].([]any) + + nodeId, _ := lifecycleEvent["nodeId"].(string) + workerId, _ := lifecycleEvent["workerId"].(string) + + if len(transitions) == 0 || taskId == "" { + return nil + } + + // Parse state transitions + var stateEvents []types.StateEvent + for _, transition := range transitions { + tr, ok := transition.(map[string]any) + if !ok { + continue + } + state, _ := tr["state"].(string) + timestampStr, _ := tr["timestamp"].(string) + + var timestamp time.Time + if timestampStr != "" { + timestamp, _ = time.Parse(time.RFC3339Nano, timestampStr) + } + + stateEvents = append(stateEvents, types.StateEvent{ + State: types.TaskStatus(state), + Timestamp: timestamp, + }) + } + + if len(stateEvents) == 0 { + return nil + } + + taskMap := h.ClusterTaskMap.GetOrCreateTaskMap(currentClusterName) + taskMap.CreateOrMergeAttempt(taskId, int(taskAttempt), func(t *types.Task) { + // --- DEDUPLICATION using (State + Timestamp) as unique key --- + // Build a set of existing event keys to detect duplicates + type eventKey struct { + State string + Timestamp int64 + } + existingKeys := make(map[eventKey]bool) + for _, e := range t.Events { + existingKeys[eventKey{string(e.State), e.Timestamp.UnixNano()}] = true + } + + // Only append events that haven't been seen before + for _, e := range stateEvents { + key := eventKey{string(e.State), e.Timestamp.UnixNano()} + if !existingKeys[key] { + t.Events = append(t.Events, e) + existingKeys[key] = true + } + } + + // Sort events by timestamp to ensure correct order + sort.Slice(t.Events, func(i, j int) bool { + return t.Events[i].Timestamp.Before(t.Events[j].Timestamp) + }) + + if len(t.Events) == 0 { + return + } + + t.State = t.Events[len(t.Events)-1].State + + if nodeId != "" { + t.NodeID = nodeId + } + if workerId != "" { + t.WorkerID = workerId + } + if t.StartTime.IsZero() { + for _, e := range t.Events { + if e.State == types.RUNNING { + t.StartTime = e.Timestamp + break + } + } + } + lastEvent := t.Events[len(t.Events)-1] + if lastEvent.State == types.FINISHED || lastEvent.State == types.FAILED { + t.EndTime = lastEvent.Timestamp + } + }) + + case types.ACTOR_DEFINITION_EVENT: + actorDef, ok := eventMap["actorDefinitionEvent"] + if !ok { + return fmt.Errorf("event does not have 'actorDefinitionEvent'") + } + jsonActorDefinition, err := json.Marshal(actorDef) + if err != nil { + return err + } + + var currActor types.Actor + if err := json.Unmarshal(jsonActorDefinition, &currActor); err != nil { + return err + } + + // Use CreateOrMergeActor pattern (same as Task) + actorMap := h.ClusterActorMap.GetOrCreateActorMap(currentClusterName) + actorMap.CreateOrMergeActor(currActor.ActorID, func(a *types.Actor) { + // Preserve lifecycle-derived fields that may have arrived first + existingEvents := a.Events + existingState := a.State + existingStartTime := a.StartTime + existingEndTime := a.EndTime + existingNumRestarts := a.NumRestarts + existingPID := a.PID + existingExitDetails := a.ExitDetails + existingAddress := a.Address + + // Overwrite with definition fields + *a = currActor + + // Restore lifecycle-derived fields if they existed + if len(existingEvents) > 0 { + a.Events = existingEvents + a.State = existingState + a.StartTime = existingStartTime + a.EndTime = existingEndTime + a.NumRestarts = existingNumRestarts + a.PID = existingPID + a.ExitDetails = existingExitDetails + a.Address = existingAddress + } + }) + case types.ACTOR_LIFECYCLE_EVENT: + lifecycleEvent, ok 
:= eventMap["actorLifecycleEvent"].(map[string]any) + if !ok { + return fmt.Errorf("invalid actorLifecycleEvent format") + } + + actorId, _ := lifecycleEvent["actorId"].(string) + transitions, _ := lifecycleEvent["stateTransitions"].([]any) + + if len(transitions) == 0 || actorId == "" { + return nil + } + + // Parse state transitions into ActorStateEvent slice + var stateEvents []types.ActorStateEvent + for _, transition := range transitions { + tr, ok := transition.(map[string]any) + if !ok { + continue + } + state, _ := tr["state"].(string) + timestampStr, _ := tr["timestamp"].(string) + nodeId, _ := tr["nodeId"].(string) + workerId, _ := tr["workerId"].(string) + reprName, _ := tr["reprName"].(string) + + var timestamp time.Time + if timestampStr != "" { + timestamp, _ = time.Parse(time.RFC3339Nano, timestampStr) + } + + // DeathCause is a complex object, store as JSON string + var deathCause string + if dc, ok := tr["deathCause"]; ok { + if dcBytes, err := json.Marshal(dc); err == nil { + deathCause = string(dcBytes) + } + } + + stateEvents = append(stateEvents, types.ActorStateEvent{ + State: types.StateType(state), + Timestamp: timestamp, + NodeID: nodeId, + WorkerID: workerId, + ReprName: reprName, + DeathCause: deathCause, + }) + } + + if len(stateEvents) == 0 { + return nil + } + + actorMap := h.ClusterActorMap.GetOrCreateActorMap(currentClusterName) + actorMap.CreateOrMergeActor(actorId, func(a *types.Actor) { + // Ensure ActorID is set (in case LIFECYCLE arrives before DEFINITION) + a.ActorID = actorId + + // --- DEDUPLICATION using (State + Timestamp) as unique key --- + // Build a set of existing event keys to detect duplicates + type eventKey struct { + State string + Timestamp int64 + } + existingKeys := make(map[eventKey]bool) + for _, e := range a.Events { + existingKeys[eventKey{string(e.State), e.Timestamp.UnixNano()}] = true + } + + // Only append events that haven't been seen before + for _, e := range stateEvents { + key := eventKey{string(e.State), e.Timestamp.UnixNano()} + if !existingKeys[key] { + a.Events = append(a.Events, e) + existingKeys[key] = true + } + } + + // Sort events by timestamp to ensure correct order + sort.Slice(a.Events, func(i, j int) bool { + return a.Events[i].Timestamp.Before(a.Events[j].Timestamp) + }) + + if len(a.Events) == 0 { + return + } + + lastEvent := a.Events[len(a.Events)-1] + + // --- UPDATE STATE --- + a.State = lastEvent.State + + // --- UPDATE ADDRESS from ALIVE state --- + // NodeID and WorkerID are only populated in ALIVE state + for i := len(a.Events) - 1; i >= 0; i-- { + if a.Events[i].State == types.ALIVE && a.Events[i].NodeID != "" { + a.Address.NodeID = a.Events[i].NodeID + a.Address.WorkerID = a.Events[i].WorkerID + break + } + } + + // --- UPDATE ReprName from latest --- + if lastEvent.ReprName != "" { + a.ReprName = lastEvent.ReprName + } + + // --- CALCULATE StartTime (first ALIVE timestamp) --- + if a.StartTime.IsZero() { + for _, e := range a.Events { + if e.State == types.ALIVE { + a.StartTime = e.Timestamp + break + } + } + } + + // --- HANDLE DEAD state --- + if lastEvent.State == types.DEAD { + a.EndTime = lastEvent.Timestamp + + // Parse deathCause to extract PID, IP, errorMessage + if lastEvent.DeathCause != "" { + var deathCauseMap map[string]any + if err := json.Unmarshal([]byte(lastEvent.DeathCause), &deathCauseMap); err == nil { + if ctx, ok := deathCauseMap["actorDiedErrorContext"].(map[string]any); ok { + // Extract PID + if pid, ok := ctx["pid"].(float64); ok { + a.PID = int(pid) + } + // Extract IP 
address
+					if ip, ok := ctx["nodeIpAddress"].(string); ok {
+						a.Address.IPAddress = ip
+					}
+					// Extract error message as ExitDetails
+					if errMsg, ok := ctx["errorMessage"].(string); ok {
+						a.ExitDetails = errMsg
+					}
+					}
+				}
+			}
+			}
+
+			// --- COUNT RESTARTS ---
+			restartCount := 0
+			for _, e := range a.Events {
+				if e.State == types.RESTARTING {
+					restartCount++
+				}
+			}
+			a.NumRestarts = restartCount
+		})
+
+	case types.ACTOR_TASK_DEFINITION_EVENT:
+		// TODO: Handle actor task definition event
+		// This is related to GET /api/v0/tasks (type=ACTOR_TASK)
+		logrus.Debugf("ACTOR_TASK_DEFINITION_EVENT received, not yet implemented")
+	default:
+		logrus.Infof("Event not supported, skipping: %v", eventMap)
+	}
+
+	return nil
+}
+
+// getAllJobEventFiles gets all job event files for the given cluster.
+// It assumes event file objects follow the format root/clustername/sessionid/job_events/{job-*}/*
+func (h *EventHandler) getAllJobEventFiles(clusterInfo utils.ClusterInfo) []string {
+	var allJobFiles []string
+	clusterNameID := clusterInfo.Name + "_" + clusterInfo.Namespace
+	jobEventDirPrefix := clusterInfo.SessionName + "/job_events/"
+	jobDirList := h.reader.ListFiles(clusterNameID, jobEventDirPrefix)
+
+	for _, jobDir := range jobDirList {
+		// Skip non-directory entries
+		if !strings.HasSuffix(jobDir, "/") {
+			continue
+		}
+		jobDirPath := jobEventDirPrefix + jobDir
+		jobFiles := h.reader.ListFiles(clusterNameID, jobDirPath)
+		for _, jobFile := range jobFiles {
+			if isValidEventFile(jobFile) {
+				allJobFiles = append(allJobFiles, jobDirPath+jobFile)
+			}
+		}
+	}
+	return allJobFiles
+}
+
+// getAllNodeEventFiles retrieves all node event files for the given cluster
+func (h *EventHandler) getAllNodeEventFiles(clusterInfo utils.ClusterInfo) []string {
+	clusterNameID := clusterInfo.Name + "_" + clusterInfo.Namespace
+	nodeEventDirPrefix := clusterInfo.SessionName + "/node_events/"
+	nodeEventFileNames := h.reader.ListFiles(clusterNameID, nodeEventDirPrefix)
+
+	// Filter out directories (items ending with /) and build full paths
+	var nodeEventFiles []string
+	for _, fileName := range nodeEventFileNames {
+		// Skip directories
+		if isValidEventFile(fileName) {
+			fullPath := nodeEventDirPrefix + fileName
+			nodeEventFiles = append(nodeEventFiles, fullPath)
+		}
+	}
+	return nodeEventFiles
+}
+
+// GetTasks returns a thread-safe deep copy of all tasks (including all attempts) for a given cluster.
+// Each task attempt is returned as a separate element in the slice.
+// Deep copy ensures the returned data is safe to use after the lock is released.
+func (h *EventHandler) GetTasks(clusterName string) []types.Task {
+	h.ClusterTaskMap.RLock()
+	defer h.ClusterTaskMap.RUnlock()
+
+	taskMap, ok := h.ClusterTaskMap.ClusterTaskMap[clusterName]
+	if !ok {
+		return []types.Task{}
+	}
+
+	taskMap.Lock()
+	defer taskMap.Unlock()
+
+	// Flatten all attempts into a single slice with deep copy
+	var tasks []types.Task
+	for _, attempts := range taskMap.TaskMap {
+		for _, task := range attempts {
+			tasks = append(tasks, task.DeepCopy())
+		}
+	}
+	return tasks
+}
+
+// GetTaskByID returns all attempts for a specific task ID in a given cluster.
+// Returns a slice of tasks representing all attempts; ordering by attempt number is not guaranteed.
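+//
+// A minimal usage sketch (the cluster key is "<name>_<namespace>", as stored by
+// the event reader; the IDs are illustrative):
+//
+//	if attempts, ok := h.GetTaskByID("raycluster-historyserver_default", taskID); ok {
+//		for _, t := range attempts {
+//			fmt.Println(t.AttemptNumber, t.State) // order not guaranteed
+//		}
+//	}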
+func (h *EventHandler) GetTaskByID(clusterName, taskID string) ([]types.Task, bool) { + h.ClusterTaskMap.RLock() + defer h.ClusterTaskMap.RUnlock() + + taskMap, ok := h.ClusterTaskMap.ClusterTaskMap[clusterName] + if !ok { + return nil, false + } + + taskMap.Lock() + defer taskMap.Unlock() + + attempts, ok := taskMap.TaskMap[taskID] + if !ok || len(attempts) == 0 { + return nil, false + } + // Return a deep copy to avoid data race + result := make([]types.Task, len(attempts)) + for i, task := range attempts { + result[i] = task.DeepCopy() + } + return result, true +} + +// GetTasksByJobID returns all tasks (including all attempts) for a given job ID in a cluster. +func (h *EventHandler) GetTasksByJobID(clusterName, jobID string) []types.Task { + h.ClusterTaskMap.RLock() + defer h.ClusterTaskMap.RUnlock() + + taskMap, ok := h.ClusterTaskMap.ClusterTaskMap[clusterName] + if !ok { + return []types.Task{} + } + + taskMap.Lock() + defer taskMap.Unlock() + + var tasks []types.Task + for _, attempts := range taskMap.TaskMap { + for _, task := range attempts { + if task.JobID == jobID { + tasks = append(tasks, task.DeepCopy()) + } + } + } + return tasks +} + +// GetActors returns a thread-safe deep copy of all actors for a given cluster +func (h *EventHandler) GetActors(clusterName string) []types.Actor { + h.ClusterActorMap.RLock() + defer h.ClusterActorMap.RUnlock() + + actorMap, ok := h.ClusterActorMap.ClusterActorMap[clusterName] + if !ok { + return []types.Actor{} + } + + actorMap.Lock() + defer actorMap.Unlock() + + actors := make([]types.Actor, 0, len(actorMap.ActorMap)) + for _, actor := range actorMap.ActorMap { + actors = append(actors, actor.DeepCopy()) + } + return actors +} + +// GetActorByID returns a specific actor by ID for a given cluster +func (h *EventHandler) GetActorByID(clusterName, actorID string) (types.Actor, bool) { + h.ClusterActorMap.RLock() + defer h.ClusterActorMap.RUnlock() + + actorMap, ok := h.ClusterActorMap.ClusterActorMap[clusterName] + if !ok { + return types.Actor{}, false + } + + actorMap.Lock() + defer actorMap.Unlock() + + actor, ok := actorMap.ActorMap[actorID] + if !ok { + return types.Actor{}, false + } + return actor.DeepCopy(), true +} + +// GetActorsMap returns a thread-safe deep copy of all actors as a map for a given cluster +func (h *EventHandler) GetActorsMap(clusterName string) map[string]types.Actor { + h.ClusterActorMap.RLock() + defer h.ClusterActorMap.RUnlock() + + actorMap, ok := h.ClusterActorMap.ClusterActorMap[clusterName] + if !ok { + return map[string]types.Actor{} + } + + actorMap.Lock() + defer actorMap.Unlock() + + actors := make(map[string]types.Actor, len(actorMap.ActorMap)) + for id, actor := range actorMap.ActorMap { + actors[id] = actor.DeepCopy() + } + return actors +} diff --git a/historyserver/pkg/eventserver/eventserver_test.go b/historyserver/pkg/eventserver/eventserver_test.go new file mode 100644 index 00000000000..39f89bc1bad --- /dev/null +++ b/historyserver/pkg/eventserver/eventserver_test.go @@ -0,0 +1,770 @@ +package eventserver + +import ( + "context" + "errors" + "testing" + "time" + + "github.com/google/go-cmp/cmp" + "github.com/ray-project/kuberay/historyserver/pkg/eventserver/types" +) + +func makeTaskEventMap(taskName, nodeId, taskID, cluster string, attempt int) map[string]any { + return map[string]any{ + "eventType": string(types.TASK_DEFINITION_EVENT), + "clusterName": cluster, + "taskDefinitionEvent": map[string]any{ + "taskId": taskID, + "taskName": taskName, + "nodeId": nodeId, + "taskAttempt": attempt, 
+		},
+	}
+}
+
+func TestEventProcessor(t *testing.T) {
+	tests := []struct {
+		name string
+		// Setup
+		eventsToSend []map[string]any
+		cancelAfter  time.Duration // Time after which to cancel context (0 for no cancel)
+		closeChan    bool          // Whether to close the channel after sending events
+
+		// Expectations
+		wantErr          bool
+		expectedErrType  error // Specific error type to check (e.g., context.Canceled)
+		wantStoredEvents map[string][]types.Task
+	}{
+		{
+			name: "process multiple events then close channel",
+			eventsToSend: []map[string]any{
+				{
+					"clusterName": "cluster1",
+					"eventType":   "TASK_DEFINITION_EVENT",
+					"taskDefinitionEvent": map[string]any{
+						"taskId":      "ID_12345",
+						"taskName":    "Name_12345",
+						"nodeId":      "Nodeid_12345",
+						"taskAttempt": 2,
+					},
+				},
+				{
+					"clusterName": "cluster1",
+					"eventType":   "TASK_DEFINITION_EVENT",
+					"taskDefinitionEvent": map[string]any{
+						"taskId":      "ID_54321",
+						"taskName":    "Name_54321",
+						"nodeId":      "Nodeid_54321",
+						"taskAttempt": 1,
+					},
+				},
+			},
+			closeChan: true,
+			wantStoredEvents: map[string][]types.Task{
+				"ID_12345": {
+					{
+						TaskID:        "ID_12345",
+						Name:          "Name_12345",
+						NodeID:        "Nodeid_12345",
+						AttemptNumber: 2,
+					},
+				},
+				"ID_54321": {
+					{
+						TaskID:        "ID_54321",
+						Name:          "Name_54321",
+						NodeID:        "Nodeid_54321",
+						AttemptNumber: 1,
+					},
+				},
+			},
+		},
+		{
+			name:      "channel closed immediately",
+			closeChan: true,
+			wantErr:   false,
+		},
+		{
+			name: "context canceled",
+			eventsToSend: []map[string]any{
+				{
+					"clusterName": "cluster1",
+					"eventType":   "TASK_DEFINITION_EVENT",
+					"taskDefinitionEvent": map[string]any{
+						"taskId":      "ID_12345",
+						"taskName":    "Name_12345",
+						"nodeId":      "Nodeid_12345",
+						"taskAttempt": 2,
+					},
+				},
+			},
+			cancelAfter:     50 * time.Millisecond,
+			wantErr:         true,
+			expectedErrType: context.Canceled,
+			// Event might be processed before cancellation is detected
+			wantStoredEvents: map[string][]types.Task{
+				"ID_12345": {
+					{
+						TaskID:        "ID_12345",
+						Name:          "Name_12345",
+						NodeID:        "Nodeid_12345",
+						AttemptNumber: 2,
+					},
+				},
+			},
+		},
+		{
+			name:            "no events, context canceled",
+			cancelAfter:     10 * time.Millisecond,
+			wantErr:         true,
+			expectedErrType: context.Canceled,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			// Sending nil for reader since it won't be used anyway
+			h := NewEventHandler(nil)
+
+			// Channel buffer is slightly larger than the number of events so the sender never blocks during test setup
+			ch := make(chan map[string]any, len(tt.eventsToSend)+2)
+			ctx, cancel := context.WithCancel(context.Background())
+			defer cancel()
+
+			// Send events into the channel
+			go func() {
+				for _, event := range tt.eventsToSend {
+					select {
+					case ch <- event:
+					case <-ctx.Done(): // Stop sending if context is cancelled
+						return
+					}
+				}
+				if tt.closeChan {
+					close(ch)
+				}
+			}()
+
+			// Handle context cancellation if specified
+			if tt.cancelAfter > 0 {
+				go func() {
+					time.Sleep(tt.cancelAfter)
+					cancel()
+				}()
+			}
+
+			// Run ProcessEvents
+			err := h.ProcessEvents(ctx, ch)
+
+			// Check error expectations
+			if (err != nil) != tt.wantErr {
+				t.Errorf("ProcessEvents() error = %v, wantErr %v", err, tt.wantErr)
+			}
+			if tt.expectedErrType != nil {
+				if !errors.Is(err, tt.expectedErrType) {
+					t.Errorf("ProcessEvents() error type = %T, want type %T (err: %v)", err, tt.expectedErrType, err)
+				}
+			}
+
+			// Check stored events
+			if tt.wantStoredEvents != nil {
+				if diff := cmp.Diff(tt.wantStoredEvents, h.ClusterTaskMap.ClusterTaskMap["cluster1"].TaskMap); diff != "" {
+					t.Errorf("TaskMap diff (-want +got):\n%s", diff)
+				}
+			}
+		})
+
} +} + +func TestStoreEvent(t *testing.T) { + initialTask := types.Task{ + TaskID: "taskid1", + Name: "taskName123", + NodeID: "nodeid123", + AttemptNumber: 0, + } + tests := []struct { + name string + initialState *types.ClusterTaskMap + eventMap map[string]any + wantErr bool + wantClusterCount int + wantTaskInCluster string // Cluster to check for the task + wantTaskID string // TaskID to check + wantTasks []types.Task // Expected tasks (all attempts), nil if not applicable + }{ + { + name: "unsupported event type", + initialState: &types.ClusterTaskMap{ + ClusterTaskMap: make(map[string]*types.TaskMap), + }, + eventMap: map[string]any{ + "eventType": "UNKNOWN_TYPE", + "clusterName": "c1", + }, + wantErr: false, + wantClusterCount: 0, + }, + { + name: "task event - new cluster and new task", + initialState: &types.ClusterTaskMap{ + ClusterTaskMap: make(map[string]*types.TaskMap), + }, + eventMap: makeTaskEventMap("taskName123", "nodeid1234", "taskid1", "cluster1", 0), + wantErr: false, + wantClusterCount: 1, + wantTaskInCluster: "cluster1", + wantTaskID: "taskid1", + wantTasks: []types.Task{ + { + TaskID: "taskid1", + Name: "taskName123", + NodeID: "nodeid1234", + AttemptNumber: 0, + }, + }, + }, + { + name: "task event - existing cluster, new task", + initialState: &types.ClusterTaskMap{ + ClusterTaskMap: map[string]*types.TaskMap{ + "cluster1": types.NewTaskMap(), + }, + }, + eventMap: makeTaskEventMap("taskName123", "nodeid1234", "taskid2", "cluster1", 1), + wantErr: false, + wantClusterCount: 1, + wantTaskInCluster: "cluster1", + wantTaskID: "taskid2", + wantTasks: []types.Task{ + { + TaskID: "taskid2", + Name: "taskName123", + NodeID: "nodeid1234", + AttemptNumber: 1, + }, + }, + }, + { + name: "task event - existing cluster and existing task with new attempt", + initialState: &types.ClusterTaskMap{ + ClusterTaskMap: map[string]*types.TaskMap{ + "cluster1": { + TaskMap: map[string][]types.Task{ + "taskid1": {initialTask}, + }, + }, + }, + }, + eventMap: makeTaskEventMap("taskName123", "nodeid123", "taskid1", "cluster1", 2), + wantErr: false, + wantClusterCount: 1, + wantTaskInCluster: "cluster1", + wantTaskID: "taskid1", + // Now expects BOTH attempts to be stored + wantTasks: []types.Task{ + { + TaskID: "taskid1", + Name: "taskName123", + NodeID: "nodeid123", + AttemptNumber: 0, + }, + { + TaskID: "taskid1", + Name: "taskName123", + NodeID: "nodeid123", + AttemptNumber: 2, + }, + }, + }, + { + name: "task event - missing taskDefinitionEvent", + initialState: &types.ClusterTaskMap{ + ClusterTaskMap: make(map[string]*types.TaskMap), + }, + eventMap: map[string]any{ + "eventType": string(types.TASK_DEFINITION_EVENT), + "clusterName": "c1", + }, + wantErr: true, + }, + { + name: "task event - taskDefinitionEvent wrong type", + initialState: &types.ClusterTaskMap{ + ClusterTaskMap: make(map[string]*types.TaskMap), + }, + eventMap: map[string]any{ + "eventType": string(types.TASK_DEFINITION_EVENT), + "clusterName": "c1", + "taskDefinitionEvent": "not a map", + }, + wantErr: true, // Marshal will fail + }, + { + name: "task event - invalid task structure", + initialState: &types.ClusterTaskMap{ + ClusterTaskMap: make(map[string]*types.TaskMap), + }, + eventMap: map[string]any{ + "eventType": string(types.TASK_DEFINITION_EVENT), + "clusterName": "c1", + "taskDefinitionEvent": map[string]any{ + "taskId": 123, // Should be string + "taskAttempt": 0, + }, + }, + wantErr: true, // Unmarshal will fail + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + h := 
&EventHandler{ + ClusterTaskMap: tt.initialState, + } + if h.ClusterTaskMap == nil { + h.ClusterTaskMap = &types.ClusterTaskMap{ + ClusterTaskMap: make(map[string]*types.TaskMap), + } + } + + err := h.storeEvent(tt.eventMap) + + if (err != nil) != tt.wantErr { + t.Fatalf("storeEvent() error = %v, wantErr %v", err, tt.wantErr) + } + if err != nil { + return + } + + gotClusterCount := len(h.ClusterTaskMap.ClusterTaskMap) + + if gotClusterCount != tt.wantClusterCount { + t.Errorf("storeEvent() resulted in %d clusters, want %d", gotClusterCount, tt.wantClusterCount) + } + + if tt.wantTasks != nil { + clusterObj, clusterExists := h.ClusterTaskMap.ClusterTaskMap[tt.wantTaskInCluster] + + if !clusterExists { + t.Fatalf("storeEvent() cluster %s not found", tt.wantTaskInCluster) + } + + clusterObj.Lock() + defer clusterObj.Unlock() + gotTasks, taskExists := clusterObj.TaskMap[tt.wantTaskID] + if !taskExists { + t.Fatalf("storeEvent() task %s not found in cluster %s", tt.wantTaskID, tt.wantTaskInCluster) + } + + if diff := cmp.Diff(tt.wantTasks, gotTasks); diff != "" { + t.Errorf("storeEvent() tasks mismatch (-want +got):\n%s", diff) + } + } + }) + } +} + +// TestTaskLifecycleEventDeduplication verifies that duplicate events are correctly filtered +// and out-of-order events are properly sorted +func TestTaskLifecycleEventDeduplication(t *testing.T) { + // Helper to create a StateEvent + makeStateEvent := func(state types.TaskStatus, timestampNano int64) types.StateEvent { + return types.StateEvent{ + State: state, + Timestamp: time.Unix(0, timestampNano), + } + } + + // Helper to create a TASK_LIFECYCLE_EVENT map + makeLifecycleEvent := func(taskID string, attempt int, transitions []map[string]any) map[string]any { + // Convert []map[string]any to []any for proper type assertion in storeEvent + transitionsAny := make([]any, len(transitions)) + for i, t := range transitions { + transitionsAny[i] = t + } + return map[string]any{ + "eventType": string(types.TASK_LIFECYCLE_EVENT), + "clusterName": "test-cluster", + "taskLifecycleEvent": map[string]any{ + "taskId": taskID, + "taskAttempt": float64(attempt), + "stateTransitions": transitionsAny, + "nodeId": "node-1", + "workerId": "worker-1", + }, + } + } + + // Helper to create state transition + makeTransition := func(state string, timestampNano int64) map[string]any { + return map[string]any{ + "state": state, + "timestamp": time.Unix(0, timestampNano).Format(time.RFC3339Nano), + } + } + + tests := []struct { + name string + existingEvents []types.StateEvent // Events already in the task + newTransitions []map[string]any // New transitions to process + wantEvents []types.StateEvent // Expected final events (sorted by timestamp) + wantState types.TaskStatus // Expected final state + }{ + { + name: "Scenario 1: Normal deduplication - same events processed twice", + existingEvents: []types.StateEvent{ + makeStateEvent(types.PENDING_NODE_ASSIGNMENT, 1000), + makeStateEvent(types.RUNNING, 2000), + }, + newTransitions: []map[string]any{ + makeTransition("PENDING_NODE_ASSIGNMENT", 1000), // Duplicate + makeTransition("RUNNING", 2000), // Duplicate + }, + wantEvents: []types.StateEvent{ + makeStateEvent(types.PENDING_NODE_ASSIGNMENT, 1000), + makeStateEvent(types.RUNNING, 2000), + }, + wantState: types.RUNNING, + }, + { + name: "Scenario 2: Out-of-order events - B(t=2) arrives after A(t=1), C(t=3)", + existingEvents: []types.StateEvent{ + makeStateEvent(types.PENDING_NODE_ASSIGNMENT, 1000), // A + makeStateEvent(types.FINISHED, 3000), // C + }, + 
newTransitions: []map[string]any{ + makeTransition("RUNNING", 2000), // B - should be inserted in the middle + }, + wantEvents: []types.StateEvent{ + makeStateEvent(types.PENDING_NODE_ASSIGNMENT, 1000), // A + makeStateEvent(types.RUNNING, 2000), // B - now in correct position + makeStateEvent(types.FINISHED, 3000), // C + }, + wantState: types.FINISHED, + }, + { + name: "Scenario 3: Same timestamp, different states - both should be kept", + existingEvents: []types.StateEvent{ + makeStateEvent(types.PENDING_NODE_ASSIGNMENT, 1000), + }, + newTransitions: []map[string]any{ + makeTransition("RUNNING", 1000), // Same timestamp, different state + }, + wantEvents: []types.StateEvent{ + makeStateEvent(types.PENDING_NODE_ASSIGNMENT, 1000), + makeStateEvent(types.RUNNING, 1000), + }, + wantState: types.RUNNING, // Last after sort (order of same timestamp is stable) + }, + { + name: "Scenario 4: Exact duplicate event - only one should remain", + existingEvents: []types.StateEvent{ + makeStateEvent(types.RUNNING, 1000), + }, + newTransitions: []map[string]any{ + makeTransition("RUNNING", 1000), // Exact duplicate + makeTransition("RUNNING", 1000), // Another duplicate + }, + wantEvents: []types.StateEvent{ + makeStateEvent(types.RUNNING, 1000), // Only one + }, + wantState: types.RUNNING, + }, + { + name: "Scenario 5: Partial overlap - existing [A,B], new [B,C] -> result [A,B,C]", + existingEvents: []types.StateEvent{ + makeStateEvent(types.PENDING_NODE_ASSIGNMENT, 1000), // A + makeStateEvent(types.RUNNING, 2000), // B + }, + newTransitions: []map[string]any{ + makeTransition("RUNNING", 2000), // B - duplicate + makeTransition("FINISHED", 3000), // C - new + }, + wantEvents: []types.StateEvent{ + makeStateEvent(types.PENDING_NODE_ASSIGNMENT, 1000), // A + makeStateEvent(types.RUNNING, 2000), // B + makeStateEvent(types.FINISHED, 3000), // C + }, + wantState: types.FINISHED, + }, + { + name: "Scenario 6: Empty initial events - add new events", + existingEvents: []types.StateEvent{}, + newTransitions: []map[string]any{ + makeTransition("PENDING_NODE_ASSIGNMENT", 1000), + makeTransition("RUNNING", 2000), + }, + wantEvents: []types.StateEvent{ + makeStateEvent(types.PENDING_NODE_ASSIGNMENT, 1000), + makeStateEvent(types.RUNNING, 2000), + }, + wantState: types.RUNNING, + }, + { + name: "Scenario 7: Multiple reprocessing cycles - events should not grow", + existingEvents: []types.StateEvent{ + makeStateEvent(types.PENDING_NODE_ASSIGNMENT, 1000), + makeStateEvent(types.RUNNING, 2000), + makeStateEvent(types.FINISHED, 3000), + }, + newTransitions: []map[string]any{ + // Simulating reprocess of same file + makeTransition("PENDING_NODE_ASSIGNMENT", 1000), + makeTransition("RUNNING", 2000), + makeTransition("FINISHED", 3000), + }, + wantEvents: []types.StateEvent{ + makeStateEvent(types.PENDING_NODE_ASSIGNMENT, 1000), + makeStateEvent(types.RUNNING, 2000), + makeStateEvent(types.FINISHED, 3000), + }, + wantState: types.FINISHED, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + h := NewEventHandler(nil) + + // Pre-populate existing events if any + if len(tt.existingEvents) > 0 { + taskMap := h.ClusterTaskMap.GetOrCreateTaskMap("test-cluster") + taskMap.CreateOrMergeAttempt("task-1", 0, func(task *types.Task) { + task.TaskID = "task-1" + task.Events = tt.existingEvents + if len(tt.existingEvents) > 0 { + task.State = tt.existingEvents[len(tt.existingEvents)-1].State + } + }) + } + + // Process the lifecycle event + eventMap := makeLifecycleEvent("task-1", 0, tt.newTransitions) + 
err := h.storeEvent(eventMap) + if err != nil { + t.Fatalf("storeEvent() unexpected error: %v", err) + } + + // Get the task and verify + taskMap := h.ClusterTaskMap.GetOrCreateTaskMap("test-cluster") + taskMap.Lock() + defer taskMap.Unlock() + + tasks, exists := taskMap.TaskMap["task-1"] + if !exists || len(tasks) == 0 { + t.Fatal("Task not found after processing") + } + + task := tasks[0] + + // Verify events + if diff := cmp.Diff(tt.wantEvents, task.Events); diff != "" { + t.Errorf("Events mismatch (-want +got):\n%s", diff) + } + + // Verify final state + if task.State != tt.wantState { + t.Errorf("State = %v, want %v", task.State, tt.wantState) + } + + // Verify event count (important for deduplication) + if len(task.Events) != len(tt.wantEvents) { + t.Errorf("Event count = %d, want %d", len(task.Events), len(tt.wantEvents)) + } + }) + } +} + +// TestActorLifecycleEventDeduplication verifies that duplicate actor events are correctly filtered +func TestActorLifecycleEventDeduplication(t *testing.T) { + // Helper to create an ActorStateEvent + makeActorStateEvent := func(state types.StateType, timestampNano int64) types.ActorStateEvent { + return types.ActorStateEvent{ + State: state, + Timestamp: time.Unix(0, timestampNano), + } + } + + // Helper to create an ACTOR_LIFECYCLE_EVENT map + makeActorLifecycleEvent := func(actorID string, transitions []map[string]any) map[string]any { + // Convert []map[string]any to []any for proper type assertion in storeEvent + transitionsAny := make([]any, len(transitions)) + for i, t := range transitions { + transitionsAny[i] = t + } + return map[string]any{ + "eventType": string(types.ACTOR_LIFECYCLE_EVENT), + "clusterName": "test-cluster", + "actorLifecycleEvent": map[string]any{ + "actorId": actorID, + "stateTransitions": transitionsAny, + }, + } + } + + // Helper to create state transition + makeTransition := func(state string, timestampNano int64) map[string]any { + return map[string]any{ + "state": state, + "timestamp": time.Unix(0, timestampNano).Format(time.RFC3339Nano), + } + } + + tests := []struct { + name string + existingEvents []types.ActorStateEvent + newTransitions []map[string]any + wantEvents []types.ActorStateEvent + wantState types.StateType + }{ + { + name: "Actor: Normal deduplication", + existingEvents: []types.ActorStateEvent{ + makeActorStateEvent(types.PENDING_CREATION, 1000), + makeActorStateEvent(types.ALIVE, 2000), + }, + newTransitions: []map[string]any{ + makeTransition("PENDING_CREATION", 1000), + makeTransition("ALIVE", 2000), + }, + wantEvents: []types.ActorStateEvent{ + makeActorStateEvent(types.PENDING_CREATION, 1000), + makeActorStateEvent(types.ALIVE, 2000), + }, + wantState: types.ALIVE, + }, + { + name: "Actor: Out-of-order with sort", + existingEvents: []types.ActorStateEvent{ + makeActorStateEvent(types.PENDING_CREATION, 1000), + makeActorStateEvent(types.DEAD, 3000), + }, + newTransitions: []map[string]any{ + makeTransition("ALIVE", 2000), // Should be inserted between + }, + wantEvents: []types.ActorStateEvent{ + makeActorStateEvent(types.PENDING_CREATION, 1000), + makeActorStateEvent(types.ALIVE, 2000), + makeActorStateEvent(types.DEAD, 3000), + }, + wantState: types.DEAD, + }, + { + name: "Actor: Exact duplicate should not increase count", + existingEvents: []types.ActorStateEvent{ + makeActorStateEvent(types.ALIVE, 1000), + }, + newTransitions: []map[string]any{ + makeTransition("ALIVE", 1000), + makeTransition("ALIVE", 1000), + makeTransition("ALIVE", 1000), + }, + wantEvents: []types.ActorStateEvent{ + 
makeActorStateEvent(types.ALIVE, 1000), + }, + wantState: types.ALIVE, + }, + { + name: "Actor: Same timestamp different states should both be kept", + existingEvents: []types.ActorStateEvent{ + makeActorStateEvent(types.PENDING_CREATION, 1000), + }, + newTransitions: []map[string]any{ + makeTransition("ALIVE", 1000), // Same timestamp, different state + }, + wantEvents: []types.ActorStateEvent{ + makeActorStateEvent(types.PENDING_CREATION, 1000), + makeActorStateEvent(types.ALIVE, 1000), + }, + wantState: types.ALIVE, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + h := NewEventHandler(nil) + + // Pre-populate existing events + if len(tt.existingEvents) > 0 { + actorMap := h.ClusterActorMap.GetOrCreateActorMap("test-cluster") + actorMap.CreateOrMergeActor("actor-1", func(a *types.Actor) { + a.ActorID = "actor-1" + a.Events = tt.existingEvents + if len(tt.existingEvents) > 0 { + a.State = tt.existingEvents[len(tt.existingEvents)-1].State + } + }) + } + + // Process the lifecycle event + eventMap := makeActorLifecycleEvent("actor-1", tt.newTransitions) + err := h.storeEvent(eventMap) + if err != nil { + t.Fatalf("storeEvent() unexpected error: %v", err) + } + + // Get the actor and verify + actor, found := h.GetActorByID("test-cluster", "actor-1") + if !found { + t.Fatal("Actor not found after processing") + } + + // Verify events count (most important for dedup testing) + if len(actor.Events) != len(tt.wantEvents) { + t.Errorf("Event count = %d, want %d", len(actor.Events), len(tt.wantEvents)) + } + + // Verify final state + if actor.State != tt.wantState { + t.Errorf("State = %v, want %v", actor.State, tt.wantState) + } + }) + } +} + +// TestMultipleReprocessingCycles simulates hourly reprocessing and verifies no memory growth +func TestMultipleReprocessingCycles(t *testing.T) { + h := NewEventHandler(nil) + + // The same events that would be in an event file + // Use []any to match what storeEvent expects from JSON parsing + transitions := []any{ + map[string]any{"state": "PENDING_NODE_ASSIGNMENT", "timestamp": time.Unix(0, 1000).Format(time.RFC3339Nano)}, + map[string]any{"state": "RUNNING", "timestamp": time.Unix(0, 2000).Format(time.RFC3339Nano)}, + map[string]any{"state": "FINISHED", "timestamp": time.Unix(0, 3000).Format(time.RFC3339Nano)}, + } + + eventMap := map[string]any{ + "eventType": string(types.TASK_LIFECYCLE_EVENT), + "clusterName": "test-cluster", + "taskLifecycleEvent": map[string]any{ + "taskId": "task-1", + "taskAttempt": float64(0), + "stateTransitions": transitions, + "nodeId": "node-1", + "workerId": "worker-1", + }, + } + + // Simulate 10 hourly reprocessing cycles + for cycle := 0; cycle < 10; cycle++ { + err := h.storeEvent(eventMap) + if err != nil { + t.Fatalf("Cycle %d: storeEvent() error = %v", cycle, err) + } + + // Check event count after each cycle + taskMap := h.ClusterTaskMap.GetOrCreateTaskMap("test-cluster") + taskMap.Lock() + tasks := taskMap.TaskMap["task-1"] + eventCount := len(tasks[0].Events) + taskMap.Unlock() + + // Should always be exactly 3 events, never growing + if eventCount != 3 { + t.Errorf("Cycle %d: Event count = %d, want 3 (events are duplicating!)", cycle, eventCount) + } + } +} diff --git a/historyserver/pkg/eventserver/types/actor.go b/historyserver/pkg/eventserver/types/actor.go new file mode 100644 index 00000000000..6a8552cd90d --- /dev/null +++ b/historyserver/pkg/eventserver/types/actor.go @@ -0,0 +1,217 @@ +package types + +import ( + "strconv" + "sync" + "time" +) + +type StateType string + 
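+// Actor lifecycle states, mirroring the states in Ray's actor lifecycle events.
+// A typical progression is DEPENDENCIES_UNREADY -> PENDING_CREATION -> ALIVE,
+// possibly cycling through RESTARTING -> ALIVE, and ending in DEAD.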
+const ( + DEPENDENCIES_UNREADY StateType = "DEPENDENCIES_UNREADY" + PENDING_CREATION StateType = "PENDING_CREATION" + ALIVE StateType = "ALIVE" + RESTARTING StateType = "RESTARTING" + DEAD StateType = "DEAD" +) + +type Address struct { + NodeID string + IPAddress string + Port string + WorkerID string +} + +// ActorStateEvent represents a single state transition event with its timestamp. +// This mirrors the stateTransitions format from Ray's actor lifecycle event. +// Fields: +// - State: the actor state (DEPENDENCIES_UNREADY, PENDING_CREATION, ALIVE, RESTARTING, DEAD) +// - Timestamp: when the state transition occurred +// - NodeID: node where actor runs (only populated in ALIVE state) +// - WorkerID: worker running the actor (only populated in ALIVE state) +// - ReprName: actor repr name (may change during lifecycle) +// - DeathCause: JSON string containing death details (only in DEAD state) +type ActorStateEvent struct { + State StateType `json:"state"` + Timestamp time.Time `json:"timestamp"` + NodeID string `json:"nodeId,omitempty"` + WorkerID string `json:"workerId,omitempty"` + ReprName string `json:"reprName,omitempty"` + DeathCause string `json:"deathCause,omitempty"` +} + +type Actor struct { + ActorID string `json:"actorId"` + JobID string `json:"jobId"` + PlacementGroupID string `json:"placementGroupId,omitempty"` + State StateType + + // PID is extracted from deathCause.actorDiedErrorContext.pid (not from definition event) + PID int `json:"pid,omitempty"` + + // Address contains node/worker info, populated from ALIVE state transitions + Address Address `json:"address"` + + Name string `json:"name,omitempty"` + ActorClass string `json:"className"` + + // NumRestarts is calculated by counting RESTARTING states in Events + NumRestarts int `json:"numRestarts"` + + // RequiredResources type changed from int to float64 to match Ray protobuf + RequiredResources map[string]float64 `json:"requiredResources,omitempty"` + + // ExitDetails is extracted from deathCause.actorDiedErrorContext.errorMessage + ExitDetails string `json:"exitDetails,omitempty"` + + ReprName string `json:"reprName,omitempty"` + CallSite string `json:"callSite,omitempty"` + LabelSelector map[string]string `json:"labelSelector,omitempty"` + + // IsDetached indicates if actor is detached (survives driver exit) + IsDetached bool `json:"isDetached"` + + // RayNamespace is the Ray namespace this actor belongs to + RayNamespace string `json:"rayNamespace,omitempty"` + + // SerializedRuntimeEnv contains the serialized runtime environment + SerializedRuntimeEnv string `json:"serializedRuntimeEnv,omitempty"` + + // --- LIFECYCLE FIELDS --- + + // Events stores the complete state transition history + // Deduplication is applied based on timestamp to avoid duplicate events + Events []ActorStateEvent `json:"events,omitempty"` + + // StartTime is the timestamp of first ALIVE state (computed from Events) + StartTime time.Time `json:"startTime,omitempty"` + + // EndTime is the timestamp of DEAD state (computed from Events) + EndTime time.Time `json:"endTime,omitempty"` +} + +// ActorMap is a struct that uses ActorID as key and the Actor struct as value +type ActorMap struct { + ActorMap map[string]Actor + Mu sync.Mutex +} + +func (a *ActorMap) Lock() { + a.Mu.Lock() +} + +func (a *ActorMap) Unlock() { + a.Mu.Unlock() +} + +func NewActorMap() *ActorMap { + return &ActorMap{ + ActorMap: make(map[string]Actor), + } +} + +// ClusterActorMap uses the cluster name as the key +type ClusterActorMap struct { + ClusterActorMap 
map[string]*ActorMap + Mu sync.RWMutex +} + +func (c *ClusterActorMap) RLock() { + c.Mu.RLock() +} + +func (c *ClusterActorMap) RUnlock() { + c.Mu.RUnlock() +} + +func (c *ClusterActorMap) Lock() { + c.Mu.Lock() +} + +func (c *ClusterActorMap) Unlock() { + c.Mu.Unlock() +} + +// GetOrCreateActorMap returns the ActorMap for the given cluster, creating it if it doesn't exist. +// This is the actor equivalent of ClusterTaskMap.GetOrCreateTaskMap +func (c *ClusterActorMap) GetOrCreateActorMap(clusterName string) *ActorMap { + c.Lock() + defer c.Unlock() + + actorMap, exists := c.ClusterActorMap[clusterName] + if !exists { + actorMap = NewActorMap() + c.ClusterActorMap[clusterName] = actorMap + } + return actorMap +} + +// CreateOrMergeActor finds or creates an actor and applies the merge function. +// Unlike Task which has AttemptNumber requiring binary search, +// Actor uses simple map lookup since ActorID is unique. +// This handles the case where LIFECYCLE events arrive before DEFINITION events. +func (a *ActorMap) CreateOrMergeActor(actorId string, mergeFn func(*Actor)) { + a.Lock() + defer a.Unlock() + + actor, exists := a.ActorMap[actorId] + if !exists { + // Actor doesn't exist, create new with ActorID initialized + newActor := Actor{ActorID: actorId} + mergeFn(&newActor) + a.ActorMap[actorId] = newActor + return + } + + // Actor exists: apply merge function and write back to map + // NOTE: Must write back because Go map returns a copy, not a reference + mergeFn(&actor) + a.ActorMap[actorId] = actor +} + +func GetActorFieldValue(actor Actor, filterKey string) string { + switch filterKey { + case "actor_id": + return actor.ActorID + case "job_id": + return actor.JobID + case "state": + return string(actor.State) + case "name", "actor_name": + return actor.Name + case "class_name", "actor_class": + return actor.ActorClass + case "node_id": + return actor.Address.NodeID + case "pid": + return strconv.Itoa(actor.PID) + case "placement_group_id": + return actor.PlacementGroupID + default: + return "" + } +} + +// DeepCopy returns a deep copy of the Actor, including slices and maps. +// This prevents race conditions when the returned Actor is used after locks are released. 
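+//
+// A minimal sketch of the intended pattern (names are illustrative):
+//
+//	actorMap.Lock()
+//	snapshot := actorMap.ActorMap["actor-1"].DeepCopy()
+//	actorMap.Unlock()
+//	// snapshot can now be read safely without holding the lock.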
+func (a Actor) DeepCopy() Actor { + cp := a + if len(a.Events) > 0 { + cp.Events = make([]ActorStateEvent, len(a.Events)) + copy(cp.Events, a.Events) + } + if len(a.RequiredResources) > 0 { + cp.RequiredResources = make(map[string]float64, len(a.RequiredResources)) + for k, v := range a.RequiredResources { + cp.RequiredResources[k] = v + } + } + if len(a.LabelSelector) > 0 { + cp.LabelSelector = make(map[string]string, len(a.LabelSelector)) + for k, v := range a.LabelSelector { + cp.LabelSelector[k] = v + } + } + return cp +} diff --git a/historyserver/pkg/eventserver/types/event.go b/historyserver/pkg/eventserver/types/event.go new file mode 100644 index 00000000000..257ed3373cf --- /dev/null +++ b/historyserver/pkg/eventserver/types/event.go @@ -0,0 +1,17 @@ +package types + +type EventType string + +const ( + EVENT_TYPE_UNSPECIFIED EventType = "EVENT_TYPE_UNSPECIFIED" + TASK_DEFINITION_EVENT EventType = "TASK_DEFINITION_EVENT" + TASK_LIFECYCLE_EVENT EventType = "TASK_LIFECYCLE_EVENT" + ACTOR_TASK_DEFINITION_EVENT EventType = "ACTOR_TASK_DEFINITION_EVENT" + TASK_PROFILE_EVENT EventType = "TASK_PROFILE_EVENT" + DRIVER_JOB_DEFINITION_EVENT EventType = "DRIVER_JOB_DEFINITION_EVENT" + DRIVER_JOB_LIFECYCLE_EVENT EventType = "DRIVER_JOB_LIFECYCLE_EVENT" + NODE_DEFINITION_EVENT EventType = "NODE_DEFINITION_EVENT" + NODE_LIFECYCLE_EVENT EventType = "NODE_LIFECYCLE_EVENT" + ACTOR_DEFINITION_EVENT EventType = "ACTOR_DEFINITION_EVENT" + ACTOR_LIFECYCLE_EVENT EventType = "ACTOR_LIFECYCLE_EVENT" +) diff --git a/historyserver/pkg/eventserver/types/task.go b/historyserver/pkg/eventserver/types/task.go new file mode 100644 index 00000000000..9fad7dabf05 --- /dev/null +++ b/historyserver/pkg/eventserver/types/task.go @@ -0,0 +1,224 @@ +package types + +import ( + "sort" + "sync" + "time" +) + +type TaskStatus string + +const ( + NIL TaskStatus = "NIL" + PENDING_ARGS_AVAIL TaskStatus = "PENDING_ARGS_AVAIL" + PENDING_NODE_ASSIGNMENT TaskStatus = "PENDING_NODE_ASSIGNMENT" + PENDING_OBJ_STORE_MEM_AVAIL TaskStatus = "PENDING_OBJ_STORE_MEM_AVAIL" + PENDING_ARGS_FETCH TaskStatus = "PENDING_ARGS_FETCH" + SUBMITTED_TO_WORKER TaskStatus = "SUBMITTED_TO_WORKER" + PENDING_ACTOR_TASK_ARGS_FETCH TaskStatus = "PENDING_ACTOR_TASK_ARGS_FETCH" + PENDING_ACTOR_TASK_ORDERING_OR_CONCURRENCY TaskStatus = "PENDING_ACTOR_TASK_ORDERING_OR_CONCURRENCY" + RUNNING TaskStatus = "RUNNING" + RUNNING_IN_RAY_GET TaskStatus = "RUNNING_IN_RAY_GET" + RUNNING_IN_RAY_WAIT TaskStatus = "RUNNING_IN_RAY_WAIT" + FINISHED TaskStatus = "FINISHED" + FAILED TaskStatus = "FAILED" + GETTING_AND_PINNING_ARGS TaskStatus = "GETTING_AND_PINNING_ARGS" +) + +type TaskType string + +const ( + NORMAL_TASK TaskType = "NORMAL_TASK" + ACTOR_CREATION_TASK TaskType = "ACTOR_CREATION_TASK" + ACTOR_TASK TaskType = "ACTOR_TASK" + DRIVER_TASK TaskType = "DRIVER_TASK" +) + +// StateEvent represents a single state transition event with its timestamp. +// This mirrors the stateTransitions format from Ray's event export API. 
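+// A single transition as emitted by Ray looks roughly like (values illustrative):
+//
+//	{"state": "RUNNING", "timestamp": "2024-01-01T00:00:00.000000001Z"}
+//
+// and unmarshals into a StateEvent with the matching TaskStatus and an RFC3339Nano timestamp.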
+type StateEvent struct { + State TaskStatus `json:"state"` + Timestamp time.Time `json:"timestamp"` +} + +type Task struct { + TaskID string `json:"taskId"` + Name string `json:"taskName"` + AttemptNumber int `json:"taskAttempt"` + State TaskStatus + JobID string `json:"jobId"` + NodeID string `json:"nodeId"` + ActorID string + PlacementGroupID string `json:"placementGroupId"` + Type TaskType `json:"taskType"` + FuncOrClassName string `json:"functionName"` + Language string `json:"language"` + RequiredResources map[string]float64 `json:"requiredResources"` // float64 to match Ray protobuf (e.g., {"CPU": 0.5}) + StartTime time.Time + EndTime time.Time + // Events stores the complete state transition history. + // Each element represents a state change with its timestamp. + Events []StateEvent `json:"events,omitempty"` + // ProfilingData ProfilingData + WorkerID string `json:"workerId"` + ErrorType string `json:"errorType"` + ErrorMessage string `json:"errorMessage"` + TaskLogInfo map[string]string + CallSite string + LabelSelector map[string]string +} + +// TaskMap is a struct that uses TaskID as key and stores a list of Task attempts. +// Each TaskID maps to a slice of Tasks, where each element represents a different attempt. +type TaskMap struct { + TaskMap map[string][]Task + Mu sync.Mutex +} + +func (t *TaskMap) Lock() { + t.Mu.Lock() +} + +func (t *TaskMap) Unlock() { + t.Mu.Unlock() +} + +func NewTaskMap() *TaskMap { + return &TaskMap{ + TaskMap: make(map[string][]Task), + } +} + +// ClusterTaskMap uses the cluster name as the key +type ClusterTaskMap struct { + ClusterTaskMap map[string]*TaskMap + Mu sync.RWMutex +} + +func (c *ClusterTaskMap) RLock() { + c.Mu.RLock() +} + +func (c *ClusterTaskMap) RUnlock() { + c.Mu.RUnlock() +} + +func (c *ClusterTaskMap) Lock() { + c.Mu.Lock() +} + +func (c *ClusterTaskMap) Unlock() { + c.Mu.Unlock() +} + +// GetOrCreateTaskMap returns the TaskMap for the given cluster, creating it if it doesn't exist. +func (c *ClusterTaskMap) GetOrCreateTaskMap(clusterName string) *TaskMap { + c.Lock() + defer c.Unlock() + + taskMap, exists := c.ClusterTaskMap[clusterName] + if !exists { + taskMap = NewTaskMap() + c.ClusterTaskMap[clusterName] = taskMap + } + return taskMap +} + +// CreateOrMergeAttempt finds or creates a task attempt and applies the merge function. +// Uses binary search for O(log n) lookup. Maintains sorted order by AttemptNumber. +// This handles the case where LIFECYCLE events arrive before DEFINITION events. 
+// - If the attempt doesn't exist, creates a new one at the correct position +// - If the attempt exists, applies mergeFn to merge new data into existing +func (t *TaskMap) CreateOrMergeAttempt(taskId string, attemptNum int, mergeFn func(*Task)) { + t.Lock() + defer t.Unlock() + + attempts, exists := t.TaskMap[taskId] + if !exists { + // Task doesn't exist, create new + newTask := Task{TaskID: taskId, AttemptNumber: attemptNum} + mergeFn(&newTask) + t.TaskMap[taskId] = []Task{newTask} + return + } + + // Binary search: find the first index where AttemptNumber >= attemptNum + idx := sort.Search(len(attempts), func(i int) bool { + return attempts[i].AttemptNumber >= attemptNum + }) + + // Check if attempt already exists at this position + if idx < len(attempts) && attempts[idx].AttemptNumber == attemptNum { + // Exists: merge into existing + mergeFn(&attempts[idx]) + return + } + + // Doesn't exist: insert at correct position to maintain sorted order + newTask := Task{TaskID: taskId, AttemptNumber: attemptNum} + mergeFn(&newTask) + + // Insert at idx position + attempts = append(attempts, Task{}) // Extend slice by 1 + copy(attempts[idx+1:], attempts[idx:]) // Shift elements right + attempts[idx] = newTask // Insert at correct position + t.TaskMap[taskId] = attempts +} + +func GetTaskFieldValue(task Task, filterKey string) string { + switch filterKey { + case "task_id": + return task.TaskID + case "job_id": + return task.JobID + case "state": + return string(task.State) + case "name", "task_name": + return task.Name + case "func_name", "function_name": + return task.FuncOrClassName + case "node_id": + return task.NodeID + case "actor_id": + return task.ActorID + case "type", "task_type": + return string(task.Type) + case "worker_id": + return task.WorkerID + case "language": + return task.Language + case "error_type": + return task.ErrorType + default: + return "" + } +} + +// DeepCopy returns a deep copy of the Task, including slices and maps. +// This prevents race conditions when the returned Task is used after locks are released. 
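+//
+// A plain struct copy (cp := t) would still alias t.Events and the map fields,
+// which is why those are re-allocated below.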
+func (t Task) DeepCopy() Task {
+	cp := t
+	if len(t.Events) > 0 {
+		cp.Events = make([]StateEvent, len(t.Events))
+		copy(cp.Events, t.Events)
+	}
+	if len(t.RequiredResources) > 0 {
+		cp.RequiredResources = make(map[string]float64, len(t.RequiredResources))
+		for k, v := range t.RequiredResources {
+			cp.RequiredResources[k] = v
+		}
+	}
+	if len(t.TaskLogInfo) > 0 {
+		cp.TaskLogInfo = make(map[string]string, len(t.TaskLogInfo))
+		for k, v := range t.TaskLogInfo {
+			cp.TaskLogInfo[k] = v
+		}
+	}
+	if len(t.LabelSelector) > 0 {
+		cp.LabelSelector = make(map[string]string, len(t.LabelSelector))
+		for k, v := range t.LabelSelector {
+			cp.LabelSelector[k] = v
+		}
+	}
+	return cp
+}
diff --git a/historyserver/pkg/historyserver/clientmanager.go b/historyserver/pkg/historyserver/clientmanager.go
new file mode 100644
index 00000000000..337658e3f31
--- /dev/null
+++ b/historyserver/pkg/historyserver/clientmanager.go
@@ -0,0 +1,91 @@
+package historyserver
+
+import (
+	"context"
+	"strings"
+
+	rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1"
+	"github.com/sirupsen/logrus"
+
+	"k8s.io/apimachinery/pkg/runtime"
+	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
+	"k8s.io/client-go/rest"
+	"k8s.io/client-go/tools/clientcmd"
+
+	"sigs.k8s.io/controller-runtime/pkg/client"
+)
+
+type ClientManager struct {
+	configs []*rest.Config
+	clients []client.Client
+}
+
+func (c *ClientManager) ListRayClusters(ctx context.Context) ([]*rayv1.RayCluster, error) {
+	list := []*rayv1.RayCluster{}
+	for _, c := range c.clients {
+		listOfRayCluster := rayv1.RayClusterList{}
+		err := c.List(ctx, &listOfRayCluster)
+		if err != nil {
+			logrus.Errorf("Failed to list RayClusters: %v", err)
+			continue
+		}
+		for _, rayCluster := range listOfRayCluster.Items {
+			list = append(list, &rayCluster)
+		}
+	}
+	return list, nil
+}
+
+func NewClientManager(kubeconfigs string) *ClientManager {
+	kubeconfigList := []*rest.Config{}
+	if len(kubeconfigs) > 0 {
+		stringList := strings.Split(kubeconfigs, ",")
+		if len(stringList) > 1 {
+			// The history server can also answer queries from a live GCS, which is not safe.
+			// We hope to replace these live-query APIs with event-based ones.
+			logrus.Errorf("Only one kubeconfig is supported.")
+		}
+		for _, kubeconfig := range stringList {
+			if kubeconfig != "" {
+				c, err := clientcmd.BuildConfigFromFlags("", kubeconfig)
+				if err != nil {
+					logrus.Errorf("Failed to build config from kubeconfig: %v", err)
+					continue
+				}
+				c.QPS = 50
+				c.Burst = 100
+				kubeconfigList = append(kubeconfigList, c)
+				logrus.Infof("add config from path: %v", kubeconfig)
+				break
+			}
+		}
+	} else {
+		c, err := rest.InClusterConfig()
+		if err != nil {
+			logrus.Errorf("Failed to build in-cluster config: %v", err)
+		} else {
+			c.QPS = 50
+			c.Burst = 100
+			kubeconfigList = append(kubeconfigList, c)
+			logrus.Infof("add config from in cluster config")
+		}
+	}
+	scheme := runtime.NewScheme()
+	utilruntime.Must(rayv1.AddToScheme(scheme))
+	clientList := []client.Client{}
+	for _, config := range kubeconfigList {
+		c, err := client.New(config, client.Options{
+			Scheme: scheme,
+		})
+		if err != nil {
+			logrus.Errorf("Failed to create client: %v", err)
+			continue
+		}
+		clientList = append(clientList, c)
+	}
+	logrus.Infof("created client manager successfully, clients: %d", len(clientList))
+	return &ClientManager{
+		configs: kubeconfigList,
+		clients: clientList,
+	}
+}
diff --git a/historyserver/pkg/historyserver/reader.go b/historyserver/pkg/historyserver/reader.go
new file mode 100644
index 00000000000..3a8a573102f
--- /dev/null
+++ b/historyserver/pkg/historyserver/reader.go
@@ -0,0 +1,86 @@
+package historyserver
+
+import (
+	"context"
+	"encoding/json"
+	"net/http"
+	"path"
+	"sort"
+
+	"github.com/emicklei/go-restful/v3"
+	"github.com/ray-project/kuberay/historyserver/pkg/utils"
+	"github.com/sirupsen/logrus"
+)
+
+func (s *ServerHandler) listClusters(limit int) []utils.ClusterInfo {
+	logrus.Debugf("Preparing to list clusters ...")
+	ctx := context.Background()
+	liveClusters, _ := s.clientManager.ListRayClusters(ctx)
+	liveClusterNames := []string{}
+	liveClusterInfos := []utils.ClusterInfo{}
+	for _, liveCluster := range liveClusters {
+		liveClusterInfo := utils.ClusterInfo{
+			Name:            liveCluster.Name,
+			Namespace:       liveCluster.Namespace,
+			CreateTime:      liveCluster.CreationTimestamp.String(),
+			CreateTimeStamp: liveCluster.CreationTimestamp.Unix(),
+			SessionName:     "live",
+		}
+		liveClusterInfos = append(liveClusterInfos, liveClusterInfo)
+		liveClusterNames = append(liveClusterNames, liveCluster.Name)
+	}
+	logrus.Infof("live clusters: %v", liveClusterNames)
+	clusters := s.reader.List()
+	sort.Sort(utils.ClusterInfoList(clusters))
+	if limit > 0 && limit < len(clusters) {
+		clusters = clusters[:limit]
+	}
+	clusters = append(liveClusterInfos, clusters...)
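+	// Live clusters are prepended so they appear ahead of archived sessions in the result.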
+	return clusters
+}
+
+func (s *ServerHandler) _getNodeLogs(rayClusterNameID, sessionId, nodeId, dir string) ([]byte, error) {
+	logPath := path.Join(sessionId, "logs", nodeId)
+	if dir != "" {
+		logPath = path.Join(logPath, dir)
+	}
+	files := s.reader.ListFiles(rayClusterNameID, logPath)
+	ret := map[string]interface{}{
+		"data": map[string]interface{}{
+			"result": map[string]interface{}{
+				"padding": files,
+			},
+		},
+	}
+	return json.Marshal(ret)
+}
+
+func (s *ServerHandler) GetNodes(rayClusterNameID, sessionId string) ([]byte, error) {
+	logPath := path.Join(sessionId, "logs")
+	nodes := s.reader.ListFiles(rayClusterNameID, logPath)
+	templ := map[string]interface{}{
+		"result": true,
+		"msg":    "Node summary fetched.",
+		"data": map[string]interface{}{
+			"summary": []map[string]interface{}{},
+		},
+	}
+	nodeSummary := []map[string]interface{}{}
+	for _, node := range nodes {
+		nodeSummary = append(nodeSummary, map[string]interface{}{
+			"raylet": map[string]interface{}{
+				"nodeId": path.Clean(node),
+				"state":  "ALIVE",
+			},
+			"ip": "UNKNOWN",
+		})
+	}
+	templ["data"].(map[string]interface{})["summary"] = nodeSummary
+	return json.Marshal(templ)
+}
+
+// TODO: implement this
+func (s *ServerHandler) getGrafanaHealth(req *restful.Request, resp *restful.Response) {
+	resp.WriteErrorString(http.StatusNotImplemented, "Grafana health not yet supported")
+}
diff --git a/historyserver/pkg/historyserver/router.go b/historyserver/pkg/historyserver/router.go
new file mode 100644
index 00000000000..d78a12ffb82
--- /dev/null
+++ b/historyserver/pkg/historyserver/router.go
@@ -0,0 +1,824 @@
+package historyserver
+
+import (
+	"context"
+	"encoding/json"
+	"errors"
+	"io"
+	"net/http"
+	"strings"
+
+	"github.com/emicklei/go-restful/v3"
+	eventtypes "github.com/ray-project/kuberay/historyserver/pkg/eventserver/types"
+	"github.com/ray-project/kuberay/historyserver/pkg/utils"
+	rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1"
+	"github.com/sirupsen/logrus"
+	"k8s.io/apimachinery/pkg/types"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+)
+
+const (
+	COOKIE_CLUSTER_NAME_KEY      = "cluster_name"
+	COOKIE_CLUSTER_NAMESPACE_KEY = "cluster_namespace"
+	COOKIE_SESSION_NAME_KEY      = "session_name"
+	COOKIE_DASHBOARD_VERSION_KEY = "dashboard_version"
+
+	ATTRIBUTE_SERVICE_NAME = "cluster_service_name"
+)
+
+func RequestLogFilter(req *restful.Request, resp *restful.Response, chain *restful.FilterChain) {
+	logrus.Infof("Received request: %s %s", req.Request.Method, req.Request.URL.String())
+	chain.ProcessFilter(req, resp)
+}
+
+func routerClusters(s *ServerHandler) {
+	ws := new(restful.WebService)
+	defer restful.Add(ws)
+
+	ws.Path("/clusters").Consumes(restful.MIME_JSON).Produces(restful.MIME_JSON) //.Filter(s.loginWrapper)
+	ws.Route(ws.GET("/").To(s.getClusters).
+		Doc("get all clusters").
+		Writes([]string{}))
+}
+
+func routerNodes(s *ServerHandler) {
+	ws := new(restful.WebService)
+	defer restful.Add(ws)
+	ws.Path("/nodes").Consumes(restful.MIME_JSON).Produces(restful.MIME_JSON) //.Filter(s.loginWrapper)
+	ws.Route(ws.GET("/").To(s.getNodes).Filter(s.CookieHandle).
+		Doc("get nodes for a given cluster").Param(ws.QueryParameter("view", "such as summary")).
+		Writes(""))
+	ws.Route(ws.GET("/{node_id}").To(s.getNode).Filter(s.CookieHandle).
+		Doc("get a specific node").
+		Param(ws.PathParameter("node_id", "node_id")).
+		Writes(""))
+}
+
+func routerEvents(s *ServerHandler) {
+	ws := new(restful.WebService)
+	defer restful.Add(ws)
+	ws.Path("/events").Consumes(restful.MIME_JSON).Produces(restful.MIME_JSON) //.Filter(s.loginWrapper)
+	ws.Route(ws.GET("/").To(s.getEvents).Filter(s.CookieHandle).
+		Doc("get events").
+		Writes(""))
+}
+
+func routerAPI(s *ServerHandler) {
+	ws := new(restful.WebService)
+	defer restful.Add(ws)
+	ws.Path("/api").Consumes(restful.MIME_JSON).Produces(restful.MIME_JSON).Filter(RequestLogFilter) //.Filter(s.loginWrapper)
+	ws.Route(ws.GET("/cluster_status").To(s.getClusterStatus).Filter(s.CookieHandle).
+		Doc("get clusters status").Param(ws.QueryParameter("format", "such as 1")).
+		Writes("")) // Placeholder for specific return type
+	ws.Route(ws.GET("/grafana_health").To(s.getGrafanaHealth).Filter(s.CookieHandle).
+		Doc("get grafana_health").
+		Writes("")) // Placeholder for specific return type
+	ws.Route(ws.GET("/prometheus_health").To(s.getPrometheusHealth).Filter(s.CookieHandle).
+		Doc("get prometheus_health").
+		Writes("")) // Placeholder for specific return type
+
+	ws.Route(ws.GET("/jobs").To(s.getJobs).Filter(s.CookieHandle).
+		Doc("get jobs").
+		Writes("")) // Placeholder for specific return type
+
+	ws.Route(ws.GET("/jobs/{job_id}").To(s.getJob).Filter(s.CookieHandle).
+		Doc("get single job").
+		Param(ws.PathParameter("job_id", "job_id")).
+		Writes("")) // Placeholder for specific return type
+
+	ws.Route(ws.GET("/data/datasets/{job_id}").To(s.getDatasets).Filter(s.CookieHandle).
+		Doc("get datasets").
+		Param(ws.PathParameter("job_id", "job_id")).
+		Writes("")) // Placeholder for specific return type
+
+	ws.Route(ws.GET("/serve/applications/").To(s.getServeApplications).Filter(s.CookieHandle).
+		Doc("get applications").
+		Writes("")) // Placeholder for specific return type
+
+	ws.Route(ws.GET("/v0/placement_groups/").To(s.getPlacementGroups).Filter(s.CookieHandle).
+		Doc("get placement_groups").
+		Writes("")) // Placeholder for specific return type
+
+	ws.Route(ws.GET("/v0/logs").To(s.getNodeLogs).Filter(s.CookieHandle).
+		Doc("get node logs").Param(ws.QueryParameter("node_id", "node_id")).
+		Writes("")) // Placeholder for specific return type
+	ws.Route(ws.GET("/v0/logs/file").To(s.getNodeLogFile).Filter(s.CookieHandle).
+		Doc("get logfile").Param(ws.QueryParameter("node_id", "node_id")).
+		Param(ws.QueryParameter("filename", "filename")).
+		Param(ws.QueryParameter("lines", "lines")).
+		Param(ws.QueryParameter("format", "format")).
+		Writes("")) // Placeholder for specific return type
+
+	ws.Route(ws.GET("/v0/tasks").To(s.getTaskDetail).Filter(s.CookieHandle).
+		Doc("get task detail").
+		// TODO: support limit
+		// Param(ws.QueryParameter("limit", "limit")).
+		Param(ws.QueryParameter("filter_keys", "filter_keys")).
+		Param(ws.QueryParameter("filter_predicates", "filter_predicates")).
+		Param(ws.QueryParameter("filter_values", "filter_values")).
+		Writes("")) // Placeholder for specific return type
+
+	ws.Route(ws.GET("/v0/tasks/summarize").To(s.getTaskSummarize).Filter(s.CookieHandle).
+		Doc("get summarize").
+		Param(ws.QueryParameter("filter_keys", "filter_keys")).
+		Param(ws.QueryParameter("filter_predicates", "filter_predicates")).
+		Param(ws.QueryParameter("filter_values", "filter_values")).
+		Param(ws.QueryParameter("summary_by", "summary_by")).
+ Writes("")) // Placeholder for specific return type +} + +// func routerRoot(s *ServerHandler) { +// ws := new(restful.WebService) +// defer restful.Add(ws) +// ws.Filter(RequestLogFilter) +// ws.Route(ws.GET("/").To(func(req *restful.Request, w *restful.Response) { +// isHomePage := true +// _, err := req.Request.Cookie(COOKIE_CLUSTER_NAME_KEY) +// isHomePage = err != nil +// prefix := "" +// if isHomePage { +// prefix = "homepage" +// } else { +// version := "v2.51.0" +// if versionCookie, err := req.Request.Cookie(COOKIE_DASHBOARD_VERSION_KEY); err == nil { +// version = versionCookie.Value +// } +// prefix = version + "/client/build" +// } +// // Check if homepage file exists; if so use it, otherwise use default index.html +// homepagePath := path.Join(s.dashboardDir, prefix, "index.html") + +// var data []byte + +// if _, statErr := os.Stat(homepagePath); !os.IsNotExist(statErr) { +// data, err = os.ReadFile(homepagePath) +// } else { +// http.Error(w, "could not read HTML file", http.StatusInternalServerError) +// logrus.Errorf("could not read HTML file: %v", statErr) +// return +// } + +// if err != nil { +// http.Error(w, "could not read HTML file", http.StatusInternalServerError) +// logrus.Errorf("could not read HTML file: %v", err) +// return +// } +// w.Header().Set("Content-Type", "text/html") +// w.Write(data) +// }).Writes("")) +// } + +// TODO: this is the frontend's entry. +// func routerHomepage(s *ServerHandler) { +// ws := new(restful.WebService) +// defer restful.Add(ws) +// ws.Path("/homepage").Consumes("*/*").Produces("*/*").Filter(RequestLogFilter) +// ws.Route(ws.GET("/").To(func(_ *restful.Request, w *restful.Response) { +// data, err := os.ReadFile(path.Join(s.dashboardDir, "homepage/index.html")) +// if err != nil { +// // Fallback to root path +// routerRoot(s) +// return +// } +// w.Header().Set("Content-Type", "text/html") +// w.Write(data) +// }).Writes("")) +// } + +func routerHealthz(s *ServerHandler) { + + http.HandleFunc("/readz", func(w http.ResponseWriter, r *http.Request) { + logrus.Infof("Received request: %s %s", r.Method, r.URL.String()) + w.Header().Set("Content-Type", "text/plain") + w.Write([]byte("ok")) + logrus.Debugf("request /readz") + }) + http.HandleFunc("/livez", func(w http.ResponseWriter, r *http.Request) { + logrus.Infof("Received request: %s %s", r.Method, r.URL.String()) + w.Header().Set("Content-Type", "text/plain") + w.Write([]byte("ok")) + logrus.Debugf("request /livez") + }) + +} + +func routerLogical(s *ServerHandler) { + ws := new(restful.WebService) + defer restful.Add(ws) + ws.Path("/logical").Consumes(restful.MIME_JSON).Produces(restful.MIME_JSON).Filter(RequestLogFilter) //.Filter(s.loginWrapper) + ws.Route(ws.GET("/actors").To(s.getLogicalActors).Filter(s.CookieHandle). + Doc("get logical actors"). + Param(ws.QueryParameter("filter_keys", "filter_keys")). + Param(ws.QueryParameter("filter_predicates", "filter_predicates")). + Param(ws.QueryParameter("filter_values", "filter_values")). + Writes("")) // Placeholder for specific return type + + // TODO: discuss with Ray Core team about this + // I noticed that IDs (`actor_id`, `job_id`, `node_id`, etc.) in Ray Base Events + // are encoded as Base64, while the Dashboard/State APIs use Hex. + // Problem: Base64 can contain `/` characters, which breaks URL routing: + ws.Route(ws.GET("/actors/{single_actor:*}").To(s.getLogicalActor).Filter(s.CookieHandle). + Doc("get logical single actor"). + Param(ws.PathParameter("single_actor", "single_actor")). 
+ Writes("")) // Placeholder for specific return type + +} + +func routerRayClusterSet(s *ServerHandler) { + ws := new(restful.WebService) + defer restful.Add(ws) + + ws.Path("/enter_cluster").Consumes(restful.MIME_JSON).Produces(restful.MIME_JSON).Filter(RequestLogFilter) + ws.Route(ws.GET("/{namespace}/{name}/{session}").To(func(r1 *restful.Request, r2 *restful.Response) { + name := r1.PathParameter("name") + namespace := r1.PathParameter("namespace") + session := r1.PathParameter("session") + http.SetCookie(r2, &http.Cookie{MaxAge: 600, Path: "/", Name: COOKIE_CLUSTER_NAME_KEY, Value: name}) + http.SetCookie(r2, &http.Cookie{MaxAge: 600, Path: "/", Name: COOKIE_CLUSTER_NAMESPACE_KEY, Value: namespace}) + http.SetCookie(r2, &http.Cookie{MaxAge: 600, Path: "/", Name: COOKIE_SESSION_NAME_KEY, Value: session}) + r2.WriteJson(map[string]interface{}{ + "result": "success", + "name": name, + "namespace": namespace, + "session": session, + }, "application/json") + }). + Doc("set cookie for cluster"). + Param(ws.PathParameter("namespace", "namespace")). + Param(ws.PathParameter("name", "name")). + Param(ws.PathParameter("session", "session")). + Writes("")) // Placeholder for specific return type +} + +func (s *ServerHandler) RegisterRouter() { + routerRayClusterSet(s) + routerClusters(s) + routerNodes(s) + routerEvents(s) + routerAPI(s) + // routerRoot(s) + // routerHomepage(s) + routerHealthz(s) + routerLogical(s) +} + +func (s *ServerHandler) redirectRequest(req *restful.Request, resp *restful.Response) { + svcName := req.Attribute(ATTRIBUTE_SERVICE_NAME).(string) + remoteResp, err := s.httpClient.Get("http://" + svcName + req.Request.URL.String()) + if err != nil { + logrus.Errorf("Error: %v", err) + resp.WriteError(http.StatusBadGateway, err) + return + } + defer remoteResp.Body.Close() + + // Copy headers from remote response + for key, values := range remoteResp.Header { + for _, value := range values { + resp.Header().Add(key, value) + } + } + + // Set status code + resp.WriteHeader(remoteResp.StatusCode) + + // Copy response body + _, err = io.Copy(resp, remoteResp.Body) + if err != nil { + logrus.Errorf("Failed to copy response body: %v", err) + } +} + +func (s *ServerHandler) getClusters(req *restful.Request, resp *restful.Response) { + clusters := s.listClusters(s.maxClusters) + resp.WriteAsJson(clusters) +} + +// getNodes returns nodes for the specified cluster +func (s *ServerHandler) getNodes(req *restful.Request, resp *restful.Response) { + sessionName := req.Attribute(COOKIE_SESSION_NAME_KEY).(string) + if sessionName == "live" { + s.redirectRequest(req, resp) + return + } + clusterNameID := req.Attribute(COOKIE_CLUSTER_NAME_KEY).(string) + clusterNamespace := req.Attribute(COOKIE_CLUSTER_NAMESPACE_KEY).(string) + data, err := s.GetNodes(clusterNameID+"_"+clusterNamespace, sessionName) + if data == nil { + logrus.Errorf("Failed to get nodes for cluster %s", clusterNameID+"_"+clusterNamespace) + resp.WriteError(http.StatusInternalServerError, errors.New("failed to get nodes")) + return + } + if err != nil { + logrus.Errorf("Error: %v", err) + resp.WriteError(400, err) + return + } + resp.Write(data) +} + +func (s *ServerHandler) getEvents(req *restful.Request, resp *restful.Response) { + sessionName := req.Attribute(COOKIE_SESSION_NAME_KEY).(string) + if sessionName == "live" { + s.redirectRequest(req, resp) + return + } + // Return "not yet supported" for historical data + resp.WriteErrorString(http.StatusNotImplemented, "Historical events not yet supported") +} + +func (s 
*ServerHandler) getPrometheusHealth(req *restful.Request, resp *restful.Response) { + sessionName := req.Attribute(COOKIE_SESSION_NAME_KEY).(string) + if sessionName == "live" { + s.redirectRequest(req, resp) + return + } + // Return "not yet supported" for prometheus health + resp.WriteErrorString(http.StatusNotImplemented, "Prometheus health not yet supported") +} + +func (s *ServerHandler) getJobs(req *restful.Request, resp *restful.Response) { + sessionName := req.Attribute(COOKIE_SESSION_NAME_KEY).(string) + if sessionName == "live" { + s.redirectRequest(req, resp) + return + } + // Return "not yet supported" for jobs + resp.WriteErrorString(http.StatusNotImplemented, "Jobs not yet supported") +} + +func (s *ServerHandler) getNode(req *restful.Request, resp *restful.Response) { + sessionName := req.Attribute(COOKIE_SESSION_NAME_KEY).(string) + if sessionName == "live" { + s.redirectRequest(req, resp) + return + } + // Return "not yet supported" for node + resp.WriteErrorString(http.StatusNotImplemented, "Node not yet supported") +} + +func (s *ServerHandler) getJob(req *restful.Request, resp *restful.Response) { + sessionName := req.Attribute(COOKIE_SESSION_NAME_KEY).(string) + if sessionName == "live" { + s.redirectRequest(req, resp) + return + } + + // Return "not yet supported" for job + resp.WriteErrorString(http.StatusNotImplemented, "Job not yet supported") +} + +func (s *ServerHandler) getDatasets(req *restful.Request, resp *restful.Response) { + sessionName := req.Attribute(COOKIE_SESSION_NAME_KEY).(string) + if sessionName == "live" { + s.redirectRequest(req, resp) + return + } + + // Return "not yet supported" for datasets + resp.WriteErrorString(http.StatusNotImplemented, "Datasets not yet supported") +} + +func (s *ServerHandler) getServeApplications(req *restful.Request, resp *restful.Response) { + sessionName := req.Attribute(COOKIE_SESSION_NAME_KEY).(string) + if sessionName == "live" { + s.redirectRequest(req, resp) + return + } + + // Return "not yet supported" for serve applications + resp.WriteErrorString(http.StatusNotImplemented, "Serve applications not yet supported") +} + +func (s *ServerHandler) getPlacementGroups(req *restful.Request, resp *restful.Response) { + sessionName := req.Attribute(COOKIE_SESSION_NAME_KEY).(string) + if sessionName == "live" { + s.redirectRequest(req, resp) + return + } + + // Return "not yet supported" for placement groups + resp.WriteErrorString(http.StatusNotImplemented, "Placement groups not yet supported") +} + +func (s *ServerHandler) getClusterStatus(req *restful.Request, resp *restful.Response) { + sessionName := req.Attribute(COOKIE_SESSION_NAME_KEY).(string) + if sessionName == "live" { + s.redirectRequest(req, resp) + return + } + + // Return "not yet supported" for cluster status + resp.WriteErrorString(http.StatusNotImplemented, "Cluster status not yet supported") +} + +func (s *ServerHandler) getNodeLogs(req *restful.Request, resp *restful.Response) { + clusterNameID := req.Attribute(COOKIE_CLUSTER_NAME_KEY).(string) + clusterNamespace := req.Attribute(COOKIE_CLUSTER_NAMESPACE_KEY).(string) + sessionName := req.Attribute(COOKIE_SESSION_NAME_KEY).(string) + if sessionName == "live" { + s.redirectRequest(req, resp) + return + } + folder := "" + if req.QueryParameter("folder") != "" { + folder = req.QueryParameter("folder") + } + if req.QueryParameter("glob") != "" { + folder = req.QueryParameter("glob") + folder = strings.TrimSuffix(folder, "*") + } + data, err := s._getNodeLogs(clusterNameID+"_"+clusterNamespace, 
+		sessionName, req.QueryParameter("node_id"), folder)
+	if err != nil {
+		logrus.Errorf("Error: %v", err)
+		resp.WriteError(http.StatusBadRequest, err)
+		return
+	}
+	resp.Write(data)
+}
+
+func (s *ServerHandler) getLogicalActors(req *restful.Request, resp *restful.Response) {
+	clusterName := req.Attribute(COOKIE_CLUSTER_NAME_KEY).(string)
+	clusterNamespace := req.Attribute(COOKIE_CLUSTER_NAMESPACE_KEY).(string)
+	clusterNameID := clusterName + "_" + clusterNamespace
+	sessionName := req.Attribute(COOKIE_SESSION_NAME_KEY).(string)
+
+	if sessionName == "live" {
+		s.redirectRequest(req, resp)
+		return
+	}
+
+	filterKey := req.QueryParameter("filter_keys")
+	filterValue := req.QueryParameter("filter_values")
+	filterPredicate := req.QueryParameter("filter_predicates")
+
+	// Get actors from EventHandler's in-memory map
+	actorsMap := s.eventHandler.GetActorsMap(clusterNameID)
+
+	// Convert map to slice for filtering
+	actors := make([]eventtypes.Actor, 0, len(actorsMap))
+	for _, actor := range actorsMap {
+		actors = append(actors, actor)
+	}
+
+	// Apply generic filtering
+	actors = utils.ApplyFilter(actors, filterKey, filterPredicate, filterValue,
+		func(a eventtypes.Actor, key string) string {
+			return eventtypes.GetActorFieldValue(a, key)
+		})
+
+	// Format response to match Ray Dashboard API format
+	formattedActors := make(map[string]interface{})
+	for _, actor := range actors {
+		formattedActors[actor.ActorID] = formatActorForResponse(actor)
+	}
+
+	response := map[string]interface{}{
+		"result": true,
+		"msg":    "All actors fetched.",
+		"data": map[string]interface{}{
+			"actors": formattedActors,
+		},
+	}
+
+	respData, err := json.Marshal(response)
+	if err != nil {
+		logrus.Errorf("Failed to marshal actors response: %v", err)
+		resp.WriteErrorString(http.StatusInternalServerError, err.Error())
+		return
+	}
+	resp.Write(respData)
+}
+
+// formatActorForResponse converts an eventtypes.Actor to the format expected by Ray Dashboard
+func formatActorForResponse(actor eventtypes.Actor) map[string]interface{} {
+	result := map[string]interface{}{
+		"actor_id":           actor.ActorID,
+		"job_id":             actor.JobID,
+		"placement_group_id": actor.PlacementGroupID,
+		"state":              string(actor.State),
+		"pid":                actor.PID,
+		"address": map[string]interface{}{
+			"node_id":    actor.Address.NodeID,
+			"ip_address": actor.Address.IPAddress,
+			"port":       actor.Address.Port,
+			"worker_id":  actor.Address.WorkerID,
+		},
+		"name":               actor.Name,
+		"num_restarts":       actor.NumRestarts,
+		"actor_class":        actor.ActorClass,
+		"required_resources": actor.RequiredResources,
+		"exit_details":       actor.ExitDetails,
+		"repr_name":          actor.ReprName,
+		"call_site":          actor.CallSite,
+		"is_detached":        actor.IsDetached,
+		"ray_namespace":      actor.RayNamespace,
+	}
+
+	// Only include start_time if it's set (non-zero)
+	if !actor.StartTime.IsZero() {
+		result["start_time"] = actor.StartTime.UnixMilli()
+	}
+
+	// Only include end_time if it's set (non-zero)
+	if !actor.EndTime.IsZero() {
+		result["end_time"] = actor.EndTime.UnixMilli()
+	}
+
+	return result
+}
+
+func (s *ServerHandler) getLogicalActor(req *restful.Request, resp *restful.Response) {
+	clusterName := req.Attribute(COOKIE_CLUSTER_NAME_KEY).(string)
+	clusterNamespace := req.Attribute(COOKIE_CLUSTER_NAMESPACE_KEY).(string)
+	clusterNameID := clusterName + "_" + clusterNamespace
+	sessionName := req.Attribute(COOKIE_SESSION_NAME_KEY).(string)
+	if sessionName == "live" {
+		s.redirectRequest(req, resp)
+		return
+	}
+
+	actorID := req.PathParameter("single_actor")
+
+	// Get actor from EventHandler's in-memory map
+	actor, found := s.eventHandler.GetActorByID(clusterNameID, actorID)
+
+	replyActorInfo := ReplyActorInfo{
+		Data: ActorInfoData{},
+	}
+
+	if found {
+		replyActorInfo.Result = true
+		replyActorInfo.Msg = "Actor fetched."
+		replyActorInfo.Data.Detail = formatActorForResponse(actor)
+	} else {
+		replyActorInfo.Result = false
+		replyActorInfo.Msg = "Actor not found."
+	}
+
+	actData, err := json.MarshalIndent(&replyActorInfo, "", " ")
+	if err != nil {
+		logrus.Errorf("Failed to marshal actor response: %v", err)
+		resp.WriteErrorString(http.StatusInternalServerError, err.Error())
+		return
+	}
+
+	resp.Write(actData)
+}
+
+func (s *ServerHandler) getNodeLogFile(req *restful.Request, resp *restful.Response) {
+	sessionName := req.Attribute(COOKIE_SESSION_NAME_KEY).(string)
+	if sessionName == "live" {
+		s.redirectRequest(req, resp)
+		return
+	}
+
+	// Not yet supported
+	resp.WriteErrorString(http.StatusNotImplemented, "Node log file not yet supported")
+}
+
+func (s *ServerHandler) getTaskSummarize(req *restful.Request, resp *restful.Response) {
+	clusterName := req.Attribute(COOKIE_CLUSTER_NAME_KEY).(string)
+	clusterNamespace := req.Attribute(COOKIE_CLUSTER_NAMESPACE_KEY).(string)
+	clusterNameID := clusterName + "_" + clusterNamespace
+	sessionName := req.Attribute(COOKIE_SESSION_NAME_KEY).(string)
+
+	if sessionName == "live" {
+		s.redirectRequest(req, resp)
+		return
+	}
+
+	// Parse filter parameters
+	filterKey := req.QueryParameter("filter_keys")
+	filterValue := req.QueryParameter("filter_values")
+	filterPredicate := req.QueryParameter("filter_predicates")
+	summaryBy := req.QueryParameter("summary_by")
+
+	// Get all tasks
+	tasks := s.eventHandler.GetTasks(clusterNameID)
+
+	// Apply generic filtering using utils.ApplyFilter
+	tasks = utils.ApplyFilter(tasks, filterKey, filterPredicate, filterValue,
+		func(t eventtypes.Task, key string) string {
+			return eventtypes.GetTaskFieldValue(t, key)
+		})
+
+	// Summarize tasks based on summary_by parameter
+	var summary map[string]interface{}
+	if summaryBy == "lineage" {
+		summary = summarizeTasksByLineage(tasks)
+	} else {
+		// Default to func_name
+		summary = summarizeTasksByFuncName(tasks)
+	}
+
+	response := map[string]interface{}{
+		"result": true,
+		"msg":    "Tasks summarized.",
+		"data": map[string]interface{}{
+			"result": summary,
+		},
+	}
+
+	respData, err := json.Marshal(response)
+	if err != nil {
+		logrus.Errorf("Failed to marshal task summarize response: %v", err)
+		resp.WriteErrorString(http.StatusInternalServerError, err.Error())
+		return
+	}
+	resp.Write(respData)
+}
+
+// summarizeTasksByFuncName groups tasks by function name and counts by state
+func summarizeTasksByFuncName(tasks []eventtypes.Task) map[string]interface{} {
+	summary := make(map[string]map[string]int)
+
+	for _, task := range tasks {
+		funcName := task.FuncOrClassName
+		if funcName == "" {
+			funcName = "unknown"
+		}
+		if _, ok := summary[funcName]; !ok {
+			summary[funcName] = make(map[string]int)
+		}
+		state := string(task.State)
+		if state == "" {
+			state = "UNKNOWN"
+		}
+		summary[funcName][state]++
+	}
+
+	return map[string]interface{}{
+		"summary": summary,
+		"total":   len(tasks),
+	}
+}
+
+// TODO(Han-Ju Chen): This function has a bug: it groups by JobID instead of actual lineage.
+// Real lineage requires:
+// 1. Add ParentTaskID field to Task struct (types/task.go)
+// 2. Parse parent_task_id from Ray events (eventserver.go)
+// 3. Build task tree structure based on ParentTaskID
+// 4. Update rayjob example to generate nested tasks for testing
+func summarizeTasksByLineage(tasks []eventtypes.Task) map[string]interface{} {
+	summary := make(map[string]map[string]int)
+
+	for _, task := range tasks {
+		// Use JobID as a simple lineage grouping for now
+		lineageKey := task.JobID
+		if lineageKey == "" {
+			lineageKey = "unknown"
+		}
+		if _, ok := summary[lineageKey]; !ok {
+			summary[lineageKey] = make(map[string]int)
+		}
+		state := string(task.State)
+		if state == "" {
+			state = "UNKNOWN"
+		}
+		summary[lineageKey][state]++
+	}
+
+	return map[string]interface{}{
+		"summary": summary,
+		"total":   len(tasks),
+	}
+}
+
+func (s *ServerHandler) getTaskDetail(req *restful.Request, resp *restful.Response) {
+	clusterName := req.Attribute(COOKIE_CLUSTER_NAME_KEY).(string)
+	clusterNamespace := req.Attribute(COOKIE_CLUSTER_NAMESPACE_KEY).(string)
+	sessionName := req.Attribute(COOKIE_SESSION_NAME_KEY).(string)
+
+	// Combine into internal key format
+	clusterNameID := clusterName + "_" + clusterNamespace
+
+	if sessionName == "live" {
+		s.redirectRequest(req, resp)
+		return
+	}
+
+	filterKey := req.QueryParameter("filter_keys")
+	filterValue := req.QueryParameter("filter_values")
+	filterPredicate := req.QueryParameter("filter_predicates")
+
+	tasks := s.eventHandler.GetTasks(clusterNameID)
+	tasks = utils.ApplyFilter(tasks, filterKey, filterPredicate, filterValue,
+		func(t eventtypes.Task, key string) string {
+			return eventtypes.GetTaskFieldValue(t, key)
+		})
+
+	taskResults := make([]interface{}, 0, len(tasks))
+	for _, task := range tasks {
+		taskResults = append(taskResults, formatTaskForResponse(task))
+	}
+
+	response := ReplyTaskInfo{
+		Result: true,
+		Msg:    "Tasks fetched.",
+		Data: TaskInfoData{
+			Result: TaskInfoDataResult{
+				Result:             taskResults,
+				Total:              len(taskResults),
+				NumFiltered:        len(taskResults),
+				NumAfterTruncation: len(taskResults),
+			},
+		},
+	}
+
+	respData, err := json.Marshal(response)
+	if err != nil {
+		logrus.Errorf("Failed to marshal task response: %v", err)
+		resp.WriteErrorString(http.StatusInternalServerError, err.Error())
+		return
+	}
+	resp.Write(respData)
+}
+
+// formatTaskForResponse converts an eventtypes.Task to the format expected by Ray Dashboard
+func formatTaskForResponse(task eventtypes.Task) map[string]interface{} {
+	result := map[string]interface{}{
+		"task_id":            task.TaskID,
+		"name":               task.Name,
+		"attempt_number":     task.AttemptNumber,
+		"state":              string(task.State),
+		"job_id":             task.JobID,
+		"node_id":            task.NodeID,
+		"actor_id":           task.ActorID,
+		"placement_group_id": task.PlacementGroupID,
+		"type":               string(task.Type),
+		"func_or_class_name": task.FuncOrClassName,
+		"language":           task.Language,
+		"required_resources": task.RequiredResources,
+		"worker_id":          task.WorkerID,
+		"error_type":         task.ErrorType,
+		"error_message":      task.ErrorMessage,
+		"call_site":          task.CallSite,
+	}
+
+	if !task.StartTime.IsZero() {
+		result["start_time"] = task.StartTime.UnixMilli()
+	}
+
+	if !task.EndTime.IsZero() {
+		result["end_time"] = task.EndTime.UnixMilli()
+	}
+
+	return result
+}
+
+// CookieHandle is a preprocessing filter function
+func (s *ServerHandler) CookieHandle(req *restful.Request, resp *restful.Response, chain *restful.FilterChain) {
+	// Get cookie from request
+	clusterName, err := req.Request.Cookie(COOKIE_CLUSTER_NAME_KEY)
+	if err != nil {
+		resp.WriteHeaderAndEntity(http.StatusBadRequest, "Cluster Cookie not found")
+		return
+	}
+	sessionName, err := req.Request.Cookie(COOKIE_SESSION_NAME_KEY)
+	if err != nil {
+		resp.WriteHeaderAndEntity(http.StatusBadRequest, "RayCluster Session Name Cookie not found")
+		return
+	}
+	clusterNamespace, err := req.Request.Cookie(COOKIE_CLUSTER_NAMESPACE_KEY)
+	if err != nil {
+		resp.WriteHeaderAndEntity(http.StatusBadRequest, "Cluster Namespace Cookie not found")
+		return
+	}
+	http.SetCookie(resp, &http.Cookie{MaxAge: 600, Path: "/", Name: COOKIE_CLUSTER_NAME_KEY, Value: clusterName.Value})
+	http.SetCookie(resp, &http.Cookie{MaxAge: 600, Path: "/", Name: COOKIE_CLUSTER_NAMESPACE_KEY, Value: clusterNamespace.Value})
+	http.SetCookie(resp, &http.Cookie{MaxAge: 600, Path: "/", Name: COOKIE_SESSION_NAME_KEY, Value: sessionName.Value})
+
+	if sessionName.Value == "live" {
+		// Always query K8s to get the service name to prevent SSRF attacks.
+		// Do not trust user-provided cookies for the service name.
+		// TODO: this might become a bottleneck if there are many requests in the future.
+		svcName, err := getClusterSvcName(s.clientManager.clients, clusterName.Value, clusterNamespace.Value)
+		if err != nil {
+			resp.WriteHeaderAndEntity(http.StatusBadRequest, err.Error())
+			return
+		}
+		req.SetAttribute(ATTRIBUTE_SERVICE_NAME, svcName)
+	}
+	req.SetAttribute(COOKIE_CLUSTER_NAME_KEY, clusterName.Value)
+	req.SetAttribute(COOKIE_SESSION_NAME_KEY, sessionName.Value)
+	req.SetAttribute(COOKIE_CLUSTER_NAMESPACE_KEY, clusterNamespace.Value)
+	logrus.Infof("Request URL %s", req.Request.URL.String())
+	chain.ProcessFilter(req, resp)
+}
+
+func getClusterSvcName(clis []client.Client, name, namespace string) (string, error) {
+	if len(clis) == 0 {
+		return "", errors.New("no available kubernetes config found")
+	}
+	cli := clis[0]
+	rc := rayv1.RayCluster{}
+	err := cli.Get(context.Background(), types.NamespacedName{Namespace: namespace, Name: name}, &rc)
+	if err != nil {
+		return "", errors.New("RayCluster not found")
+	}
+	svcName := rc.Status.Head.ServiceName
+	if svcName == "" {
+		return "", errors.New("RayCluster head service not ready")
+	}
+	return svcName + ":8265", nil
+}
diff --git a/historyserver/pkg/historyserver/server.go b/historyserver/pkg/historyserver/server.go
new file mode 100644
index 00000000000..c3c8fe90ac4
--- /dev/null
+++ b/historyserver/pkg/historyserver/server.go
@@ -0,0 +1,71 @@
+package historyserver
+
+import (
+	"context"
+	"log"
+	"net/http"
+	"time"
+
+	"github.com/ray-project/kuberay/historyserver/pkg/collector/types"
+	"github.com/ray-project/kuberay/historyserver/pkg/eventserver"
+	"github.com/ray-project/kuberay/historyserver/pkg/storage"
+	"github.com/sirupsen/logrus"
+)
+
+type ServerHandler struct {
+	maxClusters  int
+	rootDir      string
+	dashboardDir string
+
+	reader        storage.StorageReader
+	clientManager *ClientManager
+	eventHandler  *eventserver.EventHandler
+	httpClient    *http.Client
+}
+
+func NewServerHandler(c *types.RayHistoryServerConfig, dashboardDir string, reader storage.StorageReader, clientManager *ClientManager, eventHandler *eventserver.EventHandler) *ServerHandler {
+	return &ServerHandler{
+		reader:        reader,
+		clientManager: clientManager,
+		eventHandler:  eventHandler,
+
+		rootDir:      c.RootDir,
+		dashboardDir: dashboardDir,
+		// TODO: make this configurable
+		maxClusters: 100,
+		httpClient: &http.Client{
+			Timeout: 30 * time.Second,
+		},
+	}
+}
+
+func (s *ServerHandler) Run(stop chan struct{}) error {
+	s.RegisterRouter()
+	port := ":8080"
+	server := &http.Server{
+		Addr:         port,             // Listen address
+		ReadTimeout:  5 * time.Second,  // Read timeout
+		WriteTimeout: 35 * time.Second, // Write response timeout (must be >= httpClient.Timeout for proxy requests)
+		IdleTimeout:  60 * time.Second, // Idle timeout
+	}
+	go func() {
+		logrus.Infof("Starting server on %s", port)
+		err := server.ListenAndServe()
+		if err != nil && err != http.ErrServerClosed {
+			logrus.Fatalf("Error starting server: %v", err)
+		}
+		logrus.Infof("Server stopped gracefully")
+	}()
+
+	<-stop
+	logrus.Warnf("Received stop signal, shutting down Ray history server")
+	// Create a context with a 1-second timeout
+	ctx, cancel := context.WithTimeout(context.Background(), 1*time.Second)
+	defer cancel()
+
+	// Shutdown the server
+	if err := server.Shutdown(ctx); err != nil {
+		log.Fatalf("Ray HistoryServer forced to shutdown: %v", err)
+	}
+	return nil
+}
diff --git a/historyserver/pkg/historyserver/types.go b/historyserver/pkg/historyserver/types.go
new file mode 100644
index 00000000000..0c9831adfbb
--- /dev/null
+++ b/historyserver/pkg/historyserver/types.go
@@ -0,0 +1,28 @@
+package historyserver
+
+type ReplyTaskInfo struct {
+	Data   TaskInfoData `json:"data"`
+	Msg    string       `json:"msg"`
+	Result bool         `json:"result"`
+}
+type TaskInfoData struct {
+	Result TaskInfoDataResult `json:"result"`
+}
+type TaskInfoDataResult struct {
+	NumAfterTruncation    int           `json:"num_after_truncation"`
+	NumFiltered           int           `json:"num_filtered"`
+	PartialFailureWarning string        `json:"partial_failure_warning"`
+	Result                []interface{} `json:"result"`
+	Total                 int           `json:"total"`
+	Warnings              interface{}   `json:"warnings"`
+}
+
+type ReplyActorInfo struct {
+	Result bool          `json:"result"`
+	Msg    string        `json:"msg"`
+	Data   ActorInfoData `json:"data"`
+}
+
+type ActorInfoData struct {
+	Detail map[string]interface{} `json:"detail"`
+}
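These reply structs define the JSON envelope returned to the Ray Dashboard frontend. A quick sketch of the shape they serialize to; the field values below are made up for illustration, only the struct definitions are from this patch:

```go
package main

import (
	"encoding/json"
	"fmt"

	"github.com/ray-project/kuberay/historyserver/pkg/historyserver"
)

func main() {
	reply := historyserver.ReplyTaskInfo{
		Result: true,
		Msg:    "Tasks fetched.",
		Data: historyserver.TaskInfoData{
			Result: historyserver.TaskInfoDataResult{
				// One task entry, in the map form produced by formatTaskForResponse.
				Result:             []interface{}{map[string]interface{}{"task_id": "abc123", "state": "FINISHED"}},
				Total:              1,
				NumFiltered:        1,
				NumAfterTruncation: 1,
			},
		},
	}
	out, _ := json.MarshalIndent(reply, "", "  ")
	fmt.Println(string(out))
	// Prints the nested envelope:
	// {"data": {"result": {"num_after_truncation": 1, "num_filtered": 1, ...,
	//  "result": [{"state": "FINISHED", "task_id": "abc123"}], "total": 1, ...}},
	//  "msg": "Tasks fetched.", "result": true}
}
```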
diff --git a/historyserver/pkg/storage/aliyunoss/config/types.go b/historyserver/pkg/storage/aliyunoss/config/types.go
index 0e0575db631..5a3b3de7555 100644
--- a/historyserver/pkg/storage/aliyunoss/config/types.go
+++ b/historyserver/pkg/storage/aliyunoss/config/types.go
@@ -1,20 +1,3 @@
-// Package config is
-/*
-Copyright 2024 by the bingyu bingyu.zj@alibaba-inc.com Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
 package config
 
 import (
diff --git a/historyserver/pkg/storage/aliyunoss/config/validate.go b/historyserver/pkg/storage/aliyunoss/config/validate.go
index 70c0236e994..85190a2ad4f 100644
--- a/historyserver/pkg/storage/aliyunoss/config/validate.go
+++ b/historyserver/pkg/storage/aliyunoss/config/validate.go
@@ -1,19 +1,3 @@
-// Package config is
-/*
-Copyright 2024 by the zhangjie bingyu.zj@alibaba-inc.com Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
 package config
 
 import (
diff --git a/historyserver/pkg/storage/aliyunoss/ray/ray.go b/historyserver/pkg/storage/aliyunoss/ray/ray.go
index a685c60c081..91edb018be9 100644
--- a/historyserver/pkg/storage/aliyunoss/ray/ray.go
+++ b/historyserver/pkg/storage/aliyunoss/ray/ray.go
@@ -1,19 +1,3 @@
-// Package ray is
-/*
-Copyright 2024 by the zhangjie bingyu.zj@alibaba-inc.com Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
 package ray
 
 import (
@@ -240,7 +224,7 @@ func NewReader(c *types.RayHistoryServerConfig, jd map[string]interface{}) (stor
 	return New(config)
 }
 
-func NewWritter(c *types.RayCollectorConfig, jd map[string]interface{}) (storage.StorageWriter, error) {
+func NewWriter(c *types.RayCollectorConfig, jd map[string]interface{}) (storage.StorageWriter, error) {
 	config := &config{}
 	config.complete(c, jd)
 
diff --git a/historyserver/pkg/storage/aliyunoss/ray/ray_test.go b/historyserver/pkg/storage/aliyunoss/ray/ray_test.go
index eff04340be5..ef01bcd6915 100644
--- a/historyserver/pkg/storage/aliyunoss/ray/ray_test.go
+++ b/historyserver/pkg/storage/aliyunoss/ray/ray_test.go
@@ -1,19 +1,3 @@
-// Package logs is
-/*
-Copyright 2024 by the zhangjie bingyu.zj@alibaba-inc.com Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
 package ray
 
 import (
diff --git a/historyserver/pkg/storage/s3/s3.go b/historyserver/pkg/storage/s3/s3.go
index 760e12f7123..d234602cd3a 100644
--- a/historyserver/pkg/storage/s3/s3.go
+++ b/historyserver/pkg/storage/s3/s3.go
@@ -203,24 +203,33 @@ func (r *RayLogsHandler) List() (res []utils.ClusterInfo) {
 }
 
 func (r *RayLogsHandler) GetContent(clusterId string, fileName string) io.Reader {
-	logrus.Infof("Prepare to get object %s info ...", fileName)
+	fullPath := path.Join(r.S3RootDir, clusterId, fileName)
+	logrus.Infof("Prepare to get object %s info ...", fullPath)
 	result, err := r.S3Client.GetObject(&s3.GetObjectInput{
 		Bucket: aws.String(r.S3Bucket),
-		Key:    aws.String(fileName),
+		Key:    aws.String(fullPath),
 	})
 	if err != nil {
-		logrus.Errorf("Failed to get object %s: %v", fileName, err)
-		allFiles := r._listFiles(clusterId+"/"+path.Dir(fileName), "", false)
+		// Close the first result's Body if it exists to prevent connection leak
+		if result != nil && result.Body != nil {
+			result.Body.Close()
+		}
+		logrus.Errorf("Failed to get object %s: %v", fullPath, err)
+		dirPath := path.Dir(fullPath)
+		allFiles := r._listFiles(dirPath, "", false)
 		found := false
 		for _, f := range allFiles {
-			if path.Base(f) == fileName {
+			if path.Base(f) == path.Base(fullPath) {
 				logrus.Infof("Get object %s info success", f)
 				result, err = r.S3Client.GetObject(&s3.GetObjectInput{
 					Bucket: aws.String(r.S3Bucket),
 					Key:    aws.String(f),
 				})
 				if err != nil {
+					if result != nil && result.Body != nil {
+						result.Body.Close()
+					}
 					logrus.Errorf("Failed to get object %s: %v", f, err)
 					return nil
 				}
@@ -251,7 +260,7 @@ func NewReader(c *types.RayHistoryServerConfig, jd map[string]interface{}) (stor
 	return New(config)
 }
 
-func NewWritter(c *types.RayCollectorConfig, jd map[string]interface{}) (storage.StorageWriter, error) {
+func NewWriter(c *types.RayCollectorConfig, jd map[string]interface{}) (storage.StorageWriter, error) {
 	config := &config{}
 	config.complete(c, jd)
 
diff --git a/historyserver/pkg/utils/filter.go b/historyserver/pkg/utils/filter.go
new file mode 100644
index 00000000000..4b634d042aa
--- /dev/null
+++ b/historyserver/pkg/utils/filter.go
@@ -0,0 +1,45 @@
+package utils
+
+type PredicateType string
+
+const (
+	PredicateEqual    PredicateType = "="
+	PredicateNotEqual PredicateType = "!="
+)
+
+type PredicateFunc func(fieldValue, filterValue string) bool
+
+var PredicateMap = map[PredicateType]PredicateFunc{
+	PredicateEqual:    func(field, value string) bool { return field == value },
+	PredicateNotEqual: func(field, value string) bool { return field != value },
+}
+
+func ParsePredicate(predicate string) PredicateType {
+	switch predicate {
+	case "!=":
+		return PredicateNotEqual
+	default:
+		return PredicateEqual
+	}
+}
+
+func ApplyFilter[T any](items []T, filterKey, filterPredicate, filterValue string, fieldGetter func(T, string) string) []T {
+	if filterKey == "" || filterValue == "" {
+		return items
+	}
+
+	predicate := ParsePredicate(filterPredicate)
+	predicateFunc, ok := PredicateMap[predicate]
+	if !ok {
+		predicateFunc = PredicateMap[PredicateEqual]
+	}
+
+	var result []T
+	for _, item := range items {
+		fieldValue := fieldGetter(item, filterKey)
+		if predicateFunc(fieldValue, filterValue) {
+			result = append(result, item)
+		}
+	}
+	return result
+}
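The generic `ApplyFilter` above is what the task and actor handlers use to honor the `filter_keys`/`filter_predicates`/`filter_values` query parameters. A minimal sketch of how it behaves in isolation; the `record` type and `fieldGetter` here are made up for illustration:

```go
package main

import (
	"fmt"

	"github.com/ray-project/kuberay/historyserver/pkg/utils"
)

type record struct {
	Name  string
	State string
}

// fieldGetter maps a filter key to the corresponding field value.
func fieldGetter(r record, key string) string {
	if key == "state" {
		return r.State
	}
	return ""
}

func main() {
	items := []record{
		{Name: "f", State: "RUNNING"},
		{Name: "g", State: "FINISHED"},
	}

	// Keep only records whose state equals RUNNING.
	fmt.Println(utils.ApplyFilter(items, "state", "=", "RUNNING", fieldGetter))
	// "!=" inverts the match; any other predicate string falls back to equality.
	fmt.Println(utils.ApplyFilter(items, "state", "!=", "RUNNING", fieldGetter))
	// An empty key or value disables filtering and returns the input unchanged.
	fmt.Println(utils.ApplyFilter(items, "", "=", "", fieldGetter))
}
```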
diff --git a/historyserver/pkg/utils/types.go b/historyserver/pkg/utils/types.go
index 683e00c3bde..c61ee1c3991 100644
--- a/historyserver/pkg/utils/types.go
+++ b/historyserver/pkg/utils/types.go
@@ -1,19 +1,3 @@
-// Package utils is
-/*
-Copyright 2024 by the zhangjie bingyu.zj@alibaba-inc.com Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
 package utils
 
 type ClusterInfo struct {
diff --git a/historyserver/pkg/utils/utils.go b/historyserver/pkg/utils/utils.go
index 53040cd6008..9c2ab4381c7 100644
--- a/historyserver/pkg/utils/utils.go
+++ b/historyserver/pkg/utils/utils.go
@@ -1,19 +1,3 @@
-// Package utils is
-/*
-Copyright 2024 by the zhangjie bingyu.zj@alibaba-inc.com Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
 package utils
 
 import (
@@ -75,9 +59,9 @@ func RecreateObjectDir(bucket *oss.Bucket, dir string, options ...oss.Option) er
 	}
 	logrus.Infof("ObjectDir %s has delete success...", objectDir)
 
-	// List and delete all files with specified prefix
+	// List all files with the specified prefix and delete them
 	marker := oss.Marker("")
-	// To delete only src/ and its contents, set prefix to src/
+	// To delete only the src directory and all files within it, set prefix to "src/"
 	prefix := oss.Prefix(objectDir)
 
 	var totalDeleted int
@@ -171,7 +155,7 @@ func DeleteObject(bucket *oss.Bucket, objectName string) error {
 	}
 
 	if isExist {
-		// Delete single file
+		// Delete a single file
 		err = bucket.DeleteObject(objectName)
 		if err != nil {
 			logrus.Warnf("Failed to delete object '%s': %v", objectName, err)
@@ -194,7 +178,27 @@ func GetLogDir(ossHistorySeverDir, rayClusterName, rayClusterID, sessionId, rayN
 }
 
 const (
-	// do not change
+	// connector is the separator for creating flat storage keys.
+	//
+	// Design Philosophy:
+	//   - Format: "{clusterName}_{namespace}" for router/historyserver
+	//             "{clusterName}_{clusterID}" for collector
+	//
+	// Why "_" instead of "/"?
+	//   Using "/" would create a hierarchical path like "namespace/cluster/session/..."
+	//   which requires multiple ListObjects API calls to traverse:
+	//     1. First list all clusters under a namespace
+	//     2. Then list contents of the target cluster
+	//   Using "_" creates a flat path like "namespace_cluster/session/..."
+	//   which allows direct access with a single ListObjects call.
+	//
+	// Why this is SAFE for parsing:
+	//   - Kubernetes namespaces follow the DNS-1123 label spec
+	//   - DNS-1123 only allows lowercase letters, digits, and hyphens (-)
+	//   - A namespace CANNOT contain "_", so we can unambiguously split on the LAST "_"
+	//
+	// DO NOT CHANGE: changing this would break existing stored data paths
 	connector = "_"
 )
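Because both cluster names and namespaces are DNS-1123 labels, splitting a flat key on its last underscore is unambiguous, as the comment above argues. A small sketch of that parsing; the helper name `splitFlatKey` is illustrative and not part of this patch:

```go
package main

import (
	"fmt"
	"strings"
)

// splitFlatKey splits a "{clusterName}_{namespace}" key on the LAST "_".
// This is safe because DNS-1123 labels cannot contain underscores.
func splitFlatKey(key string) (clusterName, namespace string, ok bool) {
	i := strings.LastIndex(key, "_")
	if i < 0 {
		return "", "", false
	}
	return key[:i], key[i+1:], true
}

func main() {
	// Even a hyphenated cluster name parses cleanly.
	name, ns, ok := splitFlatKey("raycluster-sample_default")
	fmt.Println(name, ns, ok) // raycluster-sample default true
}
```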
diff --git a/historyserver/pkg/utils/utils_test.go b/historyserver/pkg/utils/utils_test.go
index d9df6f52c02..8cac453ca36 100644
--- a/historyserver/pkg/utils/utils_test.go
+++ b/historyserver/pkg/utils/utils_test.go
@@ -1,19 +1,3 @@
-// Package utils is
-/*
-Copyright 2024 by the zhangjie bingyu.zj@alibaba-inc.com Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
 package utils
 
 import (