
Commit ca7a5e6

Add healthcheck
1 parent 20bf8b2 commit ca7a5e6

5 files changed: +279 −0 lines changed


images/job-manager/Dockerfile

Lines changed: 11 additions & 0 deletions
@@ -10,6 +10,14 @@
 # Use lowercase to specify the release, for example: ARG MATLAB_RELEASE=r2024a
 ARG MATLAB_RELEASE=r2024a
 
+# Stage 1: Build the mjshealthcheck executable
+FROM golang:1.22.4 AS builder
+WORKDIR /app
+COPY healthcheck/ /app
+RUN go version
+RUN go mod tidy
+RUN CGO_ENABLED=0 GOOS=linux go build -a -installsuffix cgo -o mjshealthcheck /app/main.go
+
 # When you start the build stage, by default this Dockerfile uses the Ubuntu-based matlab-deps image.
 # To check the available matlab-deps images, see: https://hub.docker.com/r/mathworks/matlab-deps
 FROM mathworks/matlab-deps:${MATLAB_RELEASE}
@@ -47,3 +55,6 @@ RUN wget -q https://www.mathworks.com/mpm/glnxa64/mpm \
     --products MATLAB_Parallel_Server \
     || (echo "MPM Installation Failure. See below for more information:" && cat /tmp/mathworks_root.log && false) \
     && sudo rm -rf mpm /tmp/mathworks_root.log
+
+# Add the mjshealthcheck binary
+COPY --from=builder /app/mjshealthcheck /opt/matlab/toolbox/parallel/bin/glnxa64/mjshealthcheck

images/job-manager/healthcheck/go.mod

Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
+module mjshealthcheck
+
+go 1.21.4
images/job-manager/healthcheck/internal/healthcheck/

Lines changed: 82 additions & 0 deletions
@@ -0,0 +1,82 @@
+// Copyright 2023-2024 The MathWorks, Inc.
+package healthcheck
+
+import (
+    "errors"
+    "fmt"
+    "mjshealthcheck/internal/status"
+)
+
+type HealthChecker struct {
+    statusGetter status.StatusGetter
+}
+
+func NewHealthChecker(statusGetter status.StatusGetter) *HealthChecker {
+    return &HealthChecker{
+        statusGetter: statusGetter,
+    }
+}
+
+// Return true if a job manager is found and healthy. If the healthcheck fails, return a diagnostic message.
+func (h *HealthChecker) DoJobManagerHealthcheck(jobManagerName string) (bool, string, error) {
+    jobManagers, err := h.statusGetter.GetJobManagers()
+    if err != nil {
+        return false, "", err
+    }
+    if len(jobManagers) == 0 {
+        return false, "No job managers found", nil
+    }
+    status := ""
+    if jobManagerName == "" {
+        if len(jobManagers) > 1 {
+            return false, "", errors.New("error: multiple job managers were found; a job manager name must be specified in order to perform a healthcheck")
+        }
+        // If a job manager name was not specified, use the only job manager
+        status = jobManagers[0].Status
+    } else {
+        // If a specific job manager name was specified, check that specific job manager
+        found := false
+        status, found = findJobManagerStatus(jobManagerName, jobManagers)
+        if !found {
+            return false, fmt.Sprintf("Job manager \"%s\" not found", jobManagerName), nil
+        }
+    }
+    isHealthy, msg := isHealthyStatus(status)
+    return isHealthy, msg, nil
+}
+
+// Return true if a worker group is running. If the healthcheck fails, return a diagnostic message.
+func (h *HealthChecker) DoWorkerGroupHealthcheck() (bool, string, error) {
+    status, err := h.statusGetter.GetWorkerGroupStatus()
+    if err != nil {
+        return false, "", err
+    }
+    isHealthy := status == "Running"
+    msg := ""
+    if !isHealthy {
+        msg = fmt.Sprintf("Worker group status: %s", status)
+    }
+    return isHealthy, msg, nil
+}
+
+// Find the status of a job manager with a given name
+func findJobManagerStatus(name string, jobManagers []status.JobManagerStatus) (string, bool) {
+    for _, jm := range jobManagers {
+        if jm.Name == name {
+            return jm.Status, true
+        }
+    }
+    return "", false
+}
+
+const statusRunning = "running"
+const statusPaused = "paused"
+
+func isHealthyStatus(status string) (bool, string) {
+    isHealthy := status == statusRunning || status == statusPaused
+    msg := ""
+    if !isHealthy {
+        msg = fmt.Sprintf("Job manager status: %s", status)
+    }
+    return isHealthy, msg
+}
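Because HealthChecker depends only on the status.StatusGetter interface, it can be exercised without a real MJS installation. Below is a minimal sketch of how that might look in a test inside this module; the stub type and test function are hypothetical and are not part of this commit:

```go
package healthcheck_test

import (
    "testing"

    "mjshealthcheck/internal/healthcheck"
    "mjshealthcheck/internal/status"
)

// stubStatusGetter is a hypothetical fake that satisfies status.StatusGetter.
type stubStatusGetter struct {
    jobManagers []status.JobManagerStatus
    workerGroup string
}

func (s *stubStatusGetter) GetJobManagers() ([]status.JobManagerStatus, error) {
    return s.jobManagers, nil
}

func (s *stubStatusGetter) GetWorkerGroupStatus() (string, error) {
    return s.workerGroup, nil
}

func TestDoJobManagerHealthcheck(t *testing.T) {
    stub := &stubStatusGetter{
        jobManagers: []status.JobManagerStatus{{Name: "myJobManager", Status: "running"}},
    }
    checker := healthcheck.NewHealthChecker(stub)

    // A "running" job manager should report as healthy with no diagnostic message.
    healthy, msg, err := checker.DoJobManagerHealthcheck("myJobManager")
    if err != nil {
        t.Fatal(err)
    }
    if !healthy {
        t.Fatalf("expected healthy job manager, got: %q", msg)
    }
}
```

The unhealthy, not-found, and multiple-job-manager branches can be covered the same way by varying the stub's data.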
images/job-manager/healthcheck/internal/status/

Lines changed: 114 additions & 0 deletions
@@ -0,0 +1,114 @@
+// Copyright 2023-2024 The MathWorks, Inc.
+package status
+
+import (
+    "context"
+    "encoding/json"
+    "fmt"
+    "os/exec"
+    "path/filepath"
+    "time"
+)
+
+type StatusGetter interface {
+    GetJobManagers() ([]JobManagerStatus, error)
+    GetWorkerGroupStatus() (string, error)
+}
+
+type JobManagerStatus struct {
+    Name   string
+    Status string
+}
+
+type WorkerGroupStatus struct {
+    Status string
+}
+
+type nodeStatus struct {
+    JobManagers []JobManagerStatus
+    WorkerGroup WorkerGroupStatus
+}
+
+type nodeStatusFunc = func(context.Context, string, ...string) ([]byte, error)
+
+type NodeStatusRunner struct {
+    nodeStatusPath string
+    timeout        time.Duration
+    basePort       int
+
+    // Allow the nodestatus function call to be replaced with a mock
+    runNodeStatusFunc nodeStatusFunc
+}
+
+func NewNodeStatusRunner(matlabRoot string, timeoutSeconds, basePort int) *NodeStatusRunner {
+    n := NodeStatusRunner{
+        nodeStatusPath: filepath.Join(filepath.FromSlash(matlabRoot), "toolbox", "parallel", "bin", "nodestatus"),
+        timeout:        time.Duration(timeoutSeconds * int(time.Second)),
+        basePort:       basePort,
+        runNodeStatusFunc: func(ctx context.Context, path string, arg ...string) ([]byte, error) {
+            cmd := exec.CommandContext(ctx, path, arg...)
+            return cmd.Output()
+        },
+    }
+    return &n
+}
+
+// Get a list of job managers and their statuses
+func (n *NodeStatusRunner) GetJobManagers() ([]JobManagerStatus, error) {
+    status, err := n.getNodeStatus()
+    if err != nil {
+        return []JobManagerStatus{}, err
+    }
+    return status.JobManagers, nil
+}
+
+// Get worker group status
+func (n *NodeStatusRunner) GetWorkerGroupStatus() (string, error) {
+    status, err := n.getNodeStatus()
+    if err != nil {
+        return "", err
+    }
+    return status.WorkerGroup.Status, nil
+}
+
+func (n *NodeStatusRunner) getNodeStatus() (*nodeStatus, error) {
+    ctx, cancel := context.WithTimeout(context.Background(), n.timeout)
+    defer cancel()
+
+    // Pass the -baseport argument to nodestatus if basePort is set
+    args := []string{"-json"}
+    if n.basePort != -1 {
+        args = append(args, "-baseport", fmt.Sprintf("%d", n.basePort))
+    }
+    output, err := n.runNodeStatusFunc(ctx, n.nodeStatusPath, args...)
+
+    // Check if the command timed out
+    if ctx.Err() == context.DeadlineExceeded {
+        return nil, fmt.Errorf("error: nodestatus command failed to complete within %.0f seconds", n.timeout.Seconds())
+    }
+
+    // Check if nodestatus errored
+    if err != nil {
+        errMsg := fmt.Sprintf("error executing nodestatus: %v", err)
+
+        // Try to get stderr from nodestatus
+        cmdOut := cmdOutput{}
+        unmarshalOutputErr := json.Unmarshal(output, &cmdOut)
+        if unmarshalOutputErr == nil {
+            errMsg = errMsg + "\n" + cmdOut.Error
+        }
+        return nil, fmt.Errorf(errMsg)
+    }
+
+    // Parse the raw output
+    status := nodeStatus{}
+    err = json.Unmarshal(output, &status)
+    if err != nil {
+        return nil, fmt.Errorf("error parsing the output of nodestatus: %v", err)
+    }
+    return &status, nil
+}
+
+type cmdOutput struct {
+    Error string
+}
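Because runNodeStatusFunc is a struct field, the nodestatus invocation can be swapped for canned output in an in-package test. The sketch below is hypothetical and not part of this commit; in particular, the JSON shape is only an assumption inferred from the nodeStatus struct (encoding/json matches field names case-insensitively), since the real output of `nodestatus -json` is not shown here:

```go
package status

import (
    "context"
    "testing"
)

func TestGetJobManagersParsesJSON(t *testing.T) {
    runner := NewNodeStatusRunner("/opt/matlab", 60, -1)

    // Replace the real nodestatus invocation with canned JSON (assumed shape).
    runner.runNodeStatusFunc = func(ctx context.Context, path string, arg ...string) ([]byte, error) {
        return []byte(`{"jobManagers":[{"name":"myJobManager","status":"running"}],"workerGroup":{"status":"Running"}}`), nil
    }

    jobManagers, err := runner.GetJobManagers()
    if err != nil {
        t.Fatal(err)
    }
    if len(jobManagers) != 1 || jobManagers[0].Status != "running" {
        t.Fatalf("unexpected job managers: %+v", jobManagers)
    }
}
```

The same pattern (returning an error from the injected function, or letting the context expire) covers the timeout and error-reporting paths in getNodeStatus.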
images/job-manager/healthcheck/main.go

Lines changed: 69 additions & 0 deletions
@@ -0,0 +1,69 @@
+// Copyright 2023 The MathWorks, Inc.
+//nolint
+package main
+
+import (
+    "flag"
+    "fmt"
+    "mjshealthcheck/internal/healthcheck"
+    "mjshealthcheck/internal/status"
+    "os"
+)
+
+const (
+    exitUnhealthy = 1
+    exitError     = 2
+)
+
+// Tool for performing an MJS job manager healthcheck.
+// Exit code 0 = the job manager is healthy
+// Exit code 1 = the job manager is unhealthy or was not found
+// Exit code 2 = an error occurred while performing the healthcheck
+func main() {
+    inputOpts := parseFlags()
+    healthchecker := healthcheck.NewHealthChecker(status.NewNodeStatusRunner(inputOpts.matlabRoot, inputOpts.timeout, inputOpts.basePort))
+
+    var healthy bool
+    var msg string
+    var err error
+    if inputOpts.isWorkerCheck {
+        healthy, msg, err = healthchecker.DoWorkerGroupHealthcheck()
+    } else {
+        healthy, msg, err = healthchecker.DoJobManagerHealthcheck(inputOpts.jobManagerName)
+    }
+    if err != nil {
+        fmt.Println(err)
+        os.Exit(exitError)
+    }
+    if !healthy {
+        fmt.Println(msg)
+        os.Exit(exitUnhealthy)
+    }
+}
+
+type opts struct {
+    matlabRoot     string
+    jobManagerName string
+    timeout        int
+    basePort       int
+    isWorkerCheck  bool
+}
+
+func parseFlags() *opts {
+    inputOpts := opts{}
+    // By default, assume we are running from the directory of the executable (matlab/toolbox/parallel/bin/${ARCH})
+    flag.StringVar(&inputOpts.matlabRoot, "matlabroot", "../../../..", "Path to MATLAB root")
+    flag.StringVar(&inputOpts.jobManagerName, "jobmanager", "", "Name of the job manager on which to perform the healthcheck if multiple job managers are running")
+    flag.BoolVar(&inputOpts.isWorkerCheck, "worker", false, "Flag to perform a healthcheck on the worker group instead of a job manager")
+    flag.IntVar(&inputOpts.timeout, "timeout", 60, "Timeout in seconds for running the nodestatus command")
+    flag.IntVar(&inputOpts.basePort, "baseport", -1, "The base port that the MJS service is using")
+    flag.Parse()
+
+    // We cannot do both a worker healthcheck and a job manager check
+    if inputOpts.isWorkerCheck && inputOpts.jobManagerName != "" {
+        fmt.Println("error: healthcheck can only be performed on a job manager or a worker, not both. Provide only the -jobmanager flag or the -worker flag.")
+        os.Exit(exitError)
+    }
+
+    return &inputOpts
+}
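In the image, the Dockerfile places the binary at /opt/matlab/toolbox/parallel/bin/glnxa64/mjshealthcheck, so a container health probe can run it and act on the documented exit codes. As a rough illustration only (a hypothetical caller, not part of this commit; the flag names come from parseFlags above and the job manager name is an example value), a Go wrapper might interpret the result like this:

```go
package main

import (
    "errors"
    "fmt"
    "os/exec"
)

func main() {
    cmd := exec.Command("/opt/matlab/toolbox/parallel/bin/glnxa64/mjshealthcheck",
        "-jobmanager", "myJobManager", "-timeout", "30")
    err := cmd.Run()

    var exitErr *exec.ExitError
    switch {
    case err == nil:
        // Exit code 0: the job manager is healthy.
        fmt.Println("healthy")
    case errors.As(err, &exitErr) && exitErr.ExitCode() == 1:
        // Exit code 1: the job manager is unhealthy or was not found.
        fmt.Println("unhealthy or job manager not found")
    default:
        // Exit code 2 (healthcheck error) or failure to start the binary at all.
        fmt.Println("healthcheck error:", err)
    }
}
```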
