From 3c107e3c79d34cdfb03bc836b39d2f3fc0d6502c Mon Sep 17 00:00:00 2001 From: Elwin Cheng Date: Wed, 18 Feb 2026 14:13:46 -0800 Subject: [PATCH] Add querying container logs API with authentication and tests - Implemented `getJobLogs` endpoint requiring Bearer token authentication. - Added `requireAuth` middleware to enforce authentication for protected routes. - Created comprehensive tests for job logs retrieval, including scenarios for valid and invalid authentication, missing job logs, and query parameters. - Updated `DockerMgr` to support container naming for log retrieval. - Enhanced error handling and logging for better traceability. --- src/api.go | 61 +++++++++++ src/api_test.go | 216 +++++++++++++++++++++++++++++++++++++ src/container_job_test.go | 2 +- src/docker/docker.go | 78 +++++++++++++- src/docker/docker_test.go | 63 +++++++++-- src/int_test.go | 27 +++-- src/job_scheduling_test.go | 2 +- src/supervisor.go | 13 ++- 8 files changed, 435 insertions(+), 27 deletions(-) create mode 100644 src/api_test.go diff --git a/src/api.go b/src/api.go index 15b29d1..219d4d0 100644 --- a/src/api.go +++ b/src/api.go @@ -26,6 +26,7 @@ type App struct { wg sync.WaitGroup log *slog.Logger statusRegistry *StatusRegistry + authToken string // if set, Bearer token required for protected endpoints } func NewApp(redisAddr, gpuType string, log *slog.Logger) *App { @@ -36,6 +37,8 @@ func NewApp(redisAddr, gpuType string, log *slog.Logger) *App { consumerID := fmt.Sprintf("worker_%d", os.Getpid()) supervisor := NewSupervisor(redisAddr, consumerID, gpuType, log) + authToken := os.Getenv("AUTH_TOKEN") + mux := http.NewServeMux() a := &App{ redisClient: client, @@ -44,12 +47,14 @@ func NewApp(redisAddr, gpuType string, log *slog.Logger) *App { httpServer: &http.Server{Addr: ":3000", Handler: mux}, log: log, statusRegistry: statusRegistry, + authToken: authToken, } mux.HandleFunc("/auth/login", a.login) mux.HandleFunc("/auth/refresh", a.refresh) mux.HandleFunc("/jobs", a.handleJobs) 
mux.HandleFunc("/jobs/status", a.getJobStatus) + mux.HandleFunc("/jobs/logs/", a.requireAuth(a.getJobLogs)) mux.HandleFunc("/supervisors/status", a.getSupervisorStatus) mux.HandleFunc("/supervisors/status/", a.getSupervisorStatusByID) mux.HandleFunc("/supervisors", a.getAllSupervisors) @@ -297,6 +302,62 @@ func (a *App) getSupervisorStatusByID(w http.ResponseWriter, r *http.Request) { } } +// requireAuth wraps a handler and enforces Bearer token authentication. +// If AUTH_TOKEN is not set, returns 503 (logs feature not configured). +// If Authorization header is missing or invalid, returns 401. +func (a *App) requireAuth(next http.HandlerFunc) http.HandlerFunc { + return func(w http.ResponseWriter, r *http.Request) { + if a.authToken == "" { + a.log.Warn("job logs requested but AUTH_TOKEN not configured") + http.Error(w, "Logs require authentication to be configured", http.StatusServiceUnavailable) + return + } + auth := r.Header.Get("Authorization") + if auth == "" { + http.Error(w, "Authorization header required", http.StatusUnauthorized) + return + } + const prefix = "Bearer " + if !strings.HasPrefix(auth, prefix) || strings.TrimSpace(strings.TrimPrefix(auth, prefix)) != a.authToken { + http.Error(w, "Invalid or expired token", http.StatusUnauthorized) + return + } + next(w, r) + } +} + +func (a *App) getJobLogs(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodGet { + http.Error(w, "Method not allowed", http.StatusMethodNotAllowed) + return + } + + path := strings.TrimPrefix(r.URL.Path, "/jobs/logs/") + jobID := strings.Trim(path, "/") + if jobID == "" { + jobID = r.URL.Query().Get("id") + } + if jobID == "" { + http.Error(w, "Job ID is required", http.StatusBadRequest) + return + } + + a.log.Info("getJobLogs handler accessed", "job_id", jobID, "remote_address", r.RemoteAddr) + + logs, err := a.supervisor.GetContainerLogsForJob(jobID) + if err != nil { + a.log.Error("failed to get job logs", "job_id", jobID, "error", err) + http.Error(w, 
fmt.Sprintf("Logs not available for job: %s (container must be running)", jobID), http.StatusNotFound) + return + } + + w.Header().Set("Content-Type", "text/plain; charset=utf-8") + w.WriteHeader(http.StatusOK) + if _, err := w.Write(logs); err != nil { + a.log.Error("failed to write job logs response", "job_id", jobID, "error", err) + } +} + func (a *App) getAllSupervisors(w http.ResponseWriter, r *http.Request) { activeOnly := r.URL.Query().Get("active") == "true" diff --git a/src/api_test.go b/src/api_test.go new file mode 100644 index 0000000..7eb8d69 --- /dev/null +++ b/src/api_test.go @@ -0,0 +1,216 @@ +package main + +import ( + "context" + "io" + "log/slog" + "net/http" + "net/http/httptest" + "strings" + "testing" + "time" + + "mist/docker" + + "github.com/docker/docker/client" + "github.com/redis/go-redis/v9" +) + +func TestGetJobLogs_RequiresAuth(t *testing.T) { + redisAddr := "localhost:6379" + client := redis.NewClient(&redis.Options{Addr: redisAddr}) + if err := client.Ping(context.Background()).Err(); err != nil { + t.Skipf("Redis not running, skipping: %v", err) + } + client.FlushDB(context.Background()) + + log := slog.New(slog.NewJSONHandler(io.Discard, nil)) + app := NewApp(redisAddr, "AMD", log) + app.authToken = "secret-token" + + req := httptest.NewRequest(http.MethodGet, "/jobs/logs/job_123", nil) + rr := httptest.NewRecorder() + + app.requireAuth(app.getJobLogs)(rr, req) + + if rr.Code != http.StatusUnauthorized { + t.Errorf("expected 401 without auth, got %d", rr.Code) + } +} + +func TestGetJobLogs_ValidAuth(t *testing.T) { + redisAddr := "localhost:6379" + redisClient := redis.NewClient(&redis.Options{Addr: redisAddr}) + if err := redisClient.Ping(context.Background()).Err(); err != nil { + t.Skipf("Redis not running, skipping: %v", err) + } + redisClient.FlushDB(context.Background()) + + dockerCli, err := client.NewClientWithOpts(client.FromEnv) + if err != nil { + t.Skipf("Docker not available: %v", err) + } + defer dockerCli.Close() + 
if _, err := dockerCli.Ping(context.Background()); err != nil { + t.Skipf("Docker daemon not reachable: %v", err) + } + _, _, err = dockerCli.ImageInspectWithRaw(context.Background(), "pytorch-cpu") + if err != nil { + t.Skipf("pytorch-cpu image not found: %v", err) + } + + // Start a running container named with job ID (simulating supervisor) + mgr := docker.NewDockerMgr(dockerCli, 10, 100) + volName := "test_logs_vol" + _, _ = mgr.CreateVolume(volName) + defer mgr.RemoveVolume(volName, true) + + containerID, err := mgr.RunContainer("pytorch-cpu", "runc", volName, "job_123") + if err != nil { + t.Fatalf("failed to run container: %v", err) + } + defer func() { + _ = mgr.StopContainer(containerID) + _ = mgr.RemoveContainer(containerID) + }() + + time.Sleep(500 * time.Millisecond) // let container produce output + + log := slog.New(slog.NewJSONHandler(io.Discard, nil)) + app := NewApp(redisAddr, "AMD", log) + app.authToken = "secret-token" + + req := httptest.NewRequest(http.MethodGet, "/jobs/logs/job_123", nil) + req.Header.Set("Authorization", "Bearer secret-token") + rr := httptest.NewRecorder() + + app.requireAuth(app.getJobLogs)(rr, req) + + if rr.Code != http.StatusOK { + t.Errorf("expected 200 with valid auth, got %d: %s", rr.Code, rr.Body.String()) + } + if !strings.Contains(rr.Body.String(), "hello-from-container") { + t.Errorf("expected logs to contain 'hello-from-container', got %q", rr.Body.String()) + } +} + +func TestGetJobLogs_NotFound(t *testing.T) { + redisAddr := "localhost:6379" + client := redis.NewClient(&redis.Options{Addr: redisAddr}) + if err := client.Ping(context.Background()).Err(); err != nil { + t.Skipf("Redis not running, skipping: %v", err) + } + client.FlushDB(context.Background()) + + log := slog.New(slog.NewJSONHandler(io.Discard, nil)) + app := NewApp(redisAddr, "AMD", log) + app.authToken = "secret-token" + + req := httptest.NewRequest(http.MethodGet, "/jobs/logs/nonexistent_job", nil) + req.Header.Set("Authorization", "Bearer 
secret-token") + rr := httptest.NewRecorder() + + app.requireAuth(app.getJobLogs)(rr, req) + + if rr.Code != http.StatusNotFound { + t.Errorf("expected 404 for missing logs, got %d", rr.Code) + } +} + +func TestGetJobLogs_NoAuthConfigured(t *testing.T) { + redisAddr := "localhost:6379" + client := redis.NewClient(&redis.Options{Addr: redisAddr}) + if err := client.Ping(context.Background()).Err(); err != nil { + t.Skipf("Redis not running, skipping: %v", err) + } + + log := slog.New(slog.NewJSONHandler(io.Discard, nil)) + app := NewApp(redisAddr, "AMD", log) + app.authToken = "" // no auth configured + + req := httptest.NewRequest(http.MethodGet, "/jobs/logs/job_123", nil) + rr := httptest.NewRecorder() + + app.requireAuth(app.getJobLogs)(rr, req) + + if rr.Code != http.StatusServiceUnavailable { + t.Errorf("expected 503 when auth not configured, got %d", rr.Code) + } +} + +func TestGetJobLogs_InvalidToken(t *testing.T) { + redisAddr := "localhost:6379" + client := redis.NewClient(&redis.Options{Addr: redisAddr}) + if err := client.Ping(context.Background()).Err(); err != nil { + t.Skipf("Redis not running, skipping: %v", err) + } + + log := slog.New(slog.NewJSONHandler(io.Discard, nil)) + app := NewApp(redisAddr, "AMD", log) + app.authToken = "correct-token" + + req := httptest.NewRequest(http.MethodGet, "/jobs/logs/job_123", nil) + req.Header.Set("Authorization", "Bearer wrong-token") + rr := httptest.NewRecorder() + + app.requireAuth(app.getJobLogs)(rr, req) + + if rr.Code != http.StatusUnauthorized { + t.Errorf("expected 401 for invalid token, got %d", rr.Code) + } +} + +func TestGetJobLogs_QueryParam(t *testing.T) { + redisAddr := "localhost:6379" + redisClient := redis.NewClient(&redis.Options{Addr: redisAddr}) + if err := redisClient.Ping(context.Background()).Err(); err != nil { + t.Skipf("Redis not running, skipping: %v", err) + } + redisClient.FlushDB(context.Background()) + + dockerCli, err := client.NewClientWithOpts(client.FromEnv) + if err != nil { + 
t.Skipf("Docker not available: %v", err) + } + defer dockerCli.Close() + if _, err := dockerCli.Ping(context.Background()); err != nil { + t.Skipf("Docker daemon not reachable: %v", err) + } + _, _, err = dockerCli.ImageInspectWithRaw(context.Background(), "pytorch-cpu") + if err != nil { + t.Skipf("pytorch-cpu image not found: %v", err) + } + + mgr := docker.NewDockerMgr(dockerCli, 10, 100) + volName := "test_logs_query_vol" + _, _ = mgr.CreateVolume(volName) + defer mgr.RemoveVolume(volName, true) + + containerID, err := mgr.RunContainer("pytorch-cpu", "runc", volName, "job_456") + if err != nil { + t.Fatalf("failed to run container: %v", err) + } + defer func() { + _ = mgr.StopContainer(containerID) + _ = mgr.RemoveContainer(containerID) + }() + + time.Sleep(500 * time.Millisecond) + + log := slog.New(slog.NewJSONHandler(io.Discard, nil)) + app := NewApp(redisAddr, "AMD", log) + app.authToken = "token" + + req := httptest.NewRequest(http.MethodGet, "/jobs/logs/?id=job_456", nil) + req.Header.Set("Authorization", "Bearer token") + rr := httptest.NewRecorder() + + app.requireAuth(app.getJobLogs)(rr, req) + + if rr.Code != http.StatusOK { + t.Errorf("expected 200, got %d", rr.Code) + } + if !strings.Contains(rr.Body.String(), "hello-from-container") { + t.Errorf("expected logs to contain 'hello-from-container', got %q", rr.Body.String()) + } +} diff --git a/src/container_job_test.go b/src/container_job_test.go index cd3ed1e..d86c7a5 100644 --- a/src/container_job_test.go +++ b/src/container_job_test.go @@ -136,7 +136,7 @@ func TestRunContainerCPUIntegration(t *testing.T) { } defer mgr.RemoveVolume(volName, true) - containerID, err := mgr.RunContainer("pytorch-cpu", "runc", volName) + containerID, err := mgr.RunContainer("pytorch-cpu", "runc", volName, "test_run_cpu_integration") if err != nil { t.Fatalf("run container: %v", err) } diff --git a/src/docker/docker.go b/src/docker/docker.go index b8c19b9..1fdada5 100644 --- a/src/docker/docker.go +++ 
b/src/docker/docker.go @@ -1,8 +1,10 @@ package docker import ( + "bytes" "context" "fmt" + "io" "log/slog" "sync" @@ -10,6 +12,7 @@ import ( "github.com/docker/docker/api/types/mount" "github.com/docker/docker/api/types/volume" "github.com/docker/docker/client" + "github.com/docker/docker/pkg/stdcopy" ) // DockerMgr manages Docker containers and volumes, enforces resource limits, and tracks active resources. @@ -116,10 +119,11 @@ func (mgr *DockerMgr) RemoveVolume(volumeName string, force bool) error { return nil } -// RunContainer creates and starts a container with the specified image, runtime, and volume attached at /data. +// RunContainer creates and starts a container with the specified image, runtime, volume, and name. +// containerName is used as the Docker container name (e.g. job ID for log lookup by name). // Enforces the container limit and checks that the volume exists. // Returns the container ID or an error. -func (mgr *DockerMgr) RunContainer(imageName string, runtimeName string, volumeName string) (string, error) { +func (mgr *DockerMgr) RunContainer(imageName string, runtimeName string, volumeName string, containerName string) (string, error) { mgr.mu.Lock() defer mgr.mu.Unlock() if len(mgr.containers) >= mgr.containerLimit { @@ -146,7 +150,7 @@ func (mgr *DockerMgr) RunContainer(imageName string, runtimeName string, volumeN ctx, &container.Config{ Image: imageName, - Cmd: []string{"sleep", "1000"}, + Cmd: []string{"sh", "-c", "echo hello-from-container && sleep 1000"}, }, &container.HostConfig{ Runtime: runtimeName, @@ -160,7 +164,7 @@ func (mgr *DockerMgr) RunContainer(imageName string, runtimeName string, volumeN }, nil, nil, - "", + containerName, ) if err != nil { @@ -176,3 +180,69 @@ func (mgr *DockerMgr) RunContainer(imageName string, runtimeName string, volumeN return resp.ID, nil } + +// LogOptions configures how container logs are fetched. +type LogOptions struct { + ShowStdout bool + ShowStderr bool + Tail string // e.g. 
"100" for last 100 lines, or "all" + Since string // RFC3339 timestamp + Until string // RFC3339 timestamp + Timestamps bool +} + +// IsContainerRunning returns true if the container exists and is running. +func (mgr *DockerMgr) IsContainerRunning(ctx context.Context, containerID string) (bool, error) { + inspect, err := mgr.cli.ContainerInspect(ctx, containerID) + if err != nil { + return false, err + } + return inspect.State.Running, nil +} + +// GetContainerLogs fetches logs from a running Docker container. +// Returns an error if the container does not exist, is not running, or logs cannot be read. +func (mgr *DockerMgr) GetContainerLogs(ctx context.Context, containerID string, opts LogOptions) ([]byte, error) { + running, err := mgr.IsContainerRunning(ctx, containerID) + if err != nil { + return nil, fmt.Errorf("container not found: %w", err) + } + if !running { + return nil, fmt.Errorf("container is not running; logs only available for running containers") + } + if !opts.ShowStdout && !opts.ShowStderr { + opts.ShowStdout = true + opts.ShowStderr = true + } + + options := container.LogsOptions{ + ShowStdout: opts.ShowStdout, + ShowStderr: opts.ShowStderr, + Timestamps: opts.Timestamps, + } + if opts.Tail != "" { + options.Tail = opts.Tail + } + if opts.Since != "" { + options.Since = opts.Since + } + if opts.Until != "" { + options.Until = opts.Until + } + + reader, err := mgr.cli.ContainerLogs(ctx, containerID, options) + if err != nil { + slog.Error("Failed to get container logs", "containerID", containerID, "error", err) + return nil, fmt.Errorf("failed to get container logs: %w", err) + } + defer reader.Close() + + var buf bytes.Buffer + _, err = stdcopy.StdCopy(&buf, &buf, reader) + if err != nil && err != io.EOF { + slog.Error("Failed to demultiplex container logs", "containerID", containerID, "error", err) + return nil, fmt.Errorf("failed to read container logs: %w", err) + } + + return buf.Bytes(), nil +} diff --git 
a/src/docker/docker_test.go b/src/docker/docker_test.go index 63f897d..eebc9ba 100644 --- a/src/docker/docker_test.go +++ b/src/docker/docker_test.go @@ -1,8 +1,11 @@ package docker import ( + "context" "fmt" + "strings" "testing" + "time" "github.com/docker/docker/api/types/volume" "github.com/docker/docker/client" @@ -11,7 +14,11 @@ import ( func setupMgr(t *testing.T) *DockerMgr { cli, err := client.NewClientWithOpts(client.FromEnv) if err != nil { - t.Fatalf("Failed to create Docker client: %v", err) + t.Skipf("Docker not available: %v", err) + } + ctx := context.Background() + if _, err := cli.Ping(ctx); err != nil { + t.Skipf("Docker daemon not reachable: %v", err) } return NewDockerMgr(cli, 10, 100) } @@ -93,7 +100,7 @@ func TestRunContainerCPU(t *testing.T) { t.Fatalf("Failed to create volume %s: %v", volName, err) } defer mgr.RemoveVolume(volName, true) - containerID, err := mgr.RunContainer(imageName, runtimeName, volName) + containerID, err := mgr.RunContainer(imageName, runtimeName, volName, "test_run_cpu") if err != nil { t.Fatalf("Failed to start CPU container: %v", err) } @@ -113,7 +120,7 @@ func TestRemoveVolumeInUse(t *testing.T) { if err != nil { t.Fatalf("Failed to create volume %s: %v", volName, err) } - containerID, err := mgr.RunContainer(imageName, runtimeName, volName) + containerID, err := mgr.RunContainer(imageName, runtimeName, volName, "test_remove_vol_in_use") if err != nil { t.Fatalf("Failed to start container: %v", err) } @@ -148,7 +155,7 @@ func TestAttachNonexistentVolume(t *testing.T) { mgr := setupMgr(t) imageName, runtimeName := cpuImageAndRuntime(t, mgr) volName := "nonexistent_volume_t6" - id, err := mgr.RunContainer(imageName, runtimeName, volName) + id, err := mgr.RunContainer(imageName, runtimeName, volName, "test_attach_nonexistent") // If Docker auto-creates the volume, this may not error; check your policy if id != "" && err != nil { t.Errorf("Expected error when attaching nonexistent volume, but got id=%v, err=%v", id, 
err) @@ -166,11 +173,11 @@ func TestTwoContainersSameVolume(t *testing.T) { if err != nil { t.Fatalf("Failed to create volume %s: %v", volName, err) } - id1, err := mgr.RunContainer(imageName, runtimeName, volName) + id1, err := mgr.RunContainer(imageName, runtimeName, volName, "test_two_vol_1") if err != nil { t.Fatalf("Failed to start first container: %v", err) } - id2, err := mgr.RunContainer(imageName, runtimeName, volName) + id2, err := mgr.RunContainer(imageName, runtimeName, volName, "test_two_vol_2") if err != nil { t.Fatalf("Failed to start second container: %v", err) } @@ -201,11 +208,11 @@ func TestTwoContainersSameVolumeConcurrent(t *testing.T) { if err != nil { t.Fatalf("Failed to create volume %s: %v", volName, err) } - id1, err := mgr.RunContainer(imageName, runtimeName, volName) + id1, err := mgr.RunContainer(imageName, runtimeName, volName, "test_two_concurrent_1") if err != nil { t.Fatalf("Failed to start first container: %v", err) } - id2, err2 := mgr.RunContainer(imageName, runtimeName, volName) + id2, err2 := mgr.RunContainer(imageName, runtimeName, volName, "test_two_concurrent_2") if err2 != nil { t.Fatalf("Failed to start second container: %v", err2) } @@ -259,6 +266,41 @@ func TestVolumeLimit(t *testing.T) { // If your implementation doesn't enforce a limit, this test will fail } +// TestGetContainerLogs verifies that container logs can be fetched. 
+func TestGetContainerLogs(t *testing.T) { + mgr := setupMgr(t) + imageName, runtimeName := cpuImageAndRuntime(t, mgr) + volName := "test_volume_logs" + _, err := mgr.CreateVolume(volName) + if err != nil { + t.Fatalf("Failed to create volume %s: %v", volName, err) + } + defer mgr.RemoveVolume(volName, true) + + containerName := "test_get_logs" + containerID, err := mgr.RunContainer(imageName, runtimeName, volName, containerName) + if err != nil { + t.Fatalf("Failed to start container: %v", err) + } + defer func() { + _ = mgr.StopContainer(containerID) + _ = mgr.RemoveContainer(containerID) + }() + + // Give container time to start and produce output + time.Sleep(500 * time.Millisecond) + + ctx := context.Background() + logs, err := mgr.GetContainerLogs(ctx, containerName, LogOptions{}) + if err != nil { + t.Fatalf("GetContainerLogs failed: %v", err) + } + if !strings.Contains(string(logs), "hello-from-container") { + t.Errorf("expected logs to contain 'hello-from-container', got: %q", string(logs)) + } + t.Logf("container logs:\n%s", string(logs)) +} + // Set a limit of 10 containers (should fail on 11th if you enforce a limit) func TestContainerLimit(t *testing.T) { mgr := setupMgr(t) @@ -271,13 +313,14 @@ func TestContainerLimit(t *testing.T) { ids := []string{} limit := 10 for i := 0; i < limit; i++ { - id, err := mgr.RunContainer(imageName, runtimeName, volName) + name := "test_limit_" + fmt.Sprint(i) + id, err := mgr.RunContainer(imageName, runtimeName, volName, name) if err != nil { t.Fatalf("Failed to start container %d: %v", i, err) } ids = append(ids, id) } - _, err = mgr.RunContainer(imageName, runtimeName, volName) + _, err = mgr.RunContainer(imageName, runtimeName, volName, "test_limit_overflow") if err == nil { t.Errorf("Container limit not enforced") } else { diff --git a/src/int_test.go b/src/int_test.go index 3552145..95740de 100644 --- a/src/int_test.go +++ b/src/int_test.go @@ -56,17 +56,18 @@ func TestIntegration(t *testing.T) { defer 
os.Unsetenv("ENV") redisAddr := "localhost:6379" + client := redis.NewClient(&redis.Options{Addr: redisAddr}) + defer client.Close() + if err := client.Ping(context.Background()).Err(); err != nil { + t.Skipf("Redis not running at %s, skipping: %v (run: docker-compose up -d)", redisAddr, err) + } + config, _ := multilogger.GetLogConfig() schedulerLog, err := multilogger.CreateLogger("scheduler", &config) if err != nil { fmt.Fprintf(os.Stderr, "failed to create logger: %v\n", err) os.Exit(1) } - client := redis.NewClient(&redis.Options{Addr: redisAddr}) - defer client.Close() - if err := client.Ping(context.Background()).Err(); err != nil { - t.Errorf("Failed to connect to Redis: %v", err) - } scheduler := NewScheduler(redisAddr, schedulerLog) defer scheduler.Close() @@ -118,9 +119,11 @@ func TestDummySupervisors(t *testing.T) { os.Setenv("ENV", "test") defer os.Unsetenv("ENV") - // Clean up Redis data before test client := redis.NewClient(&redis.Options{Addr: redisAddr}) defer client.Close() + if err := client.Ping(context.Background()).Err(); err != nil { + t.Skipf("Redis not running at %s, skipping: %v (run: docker-compose up -d)", redisAddr, err) + } client.FlushDB(context.Background()) app := NewApp(redisAddr, "AMD", log) @@ -131,7 +134,7 @@ func TestDummySupervisors(t *testing.T) { supervisors, err := app.statusRegistry.GetAllSupervisors() if err != nil { - t.Errorf("Failed to get supervisors: %v", err) + t.Fatalf("Failed to get supervisors: %v", err) } // Verify dummy supervisor IDs exist dummyIDs := []string{"worker_amd_001", "worker_nvidia_002", "worker_tt_003"} @@ -156,6 +159,9 @@ func TestStatusRegistry_BasicOperations(t *testing.T) { client := redis.NewClient(&redis.Options{Addr: redisAddr}) defer client.Close() + if err := client.Ping(context.Background()).Err(); err != nil { + t.Skipf("Redis not running at %s, skipping: %v (run: docker-compose up -d)", redisAddr, err) + } client.FlushDB(context.Background()) registry := NewStatusRegistry(client, log) 
@@ -174,13 +180,13 @@ func TestStatusRegistry_BasicOperations(t *testing.T) { // Add status err := registry.UpdateStatus(status.ConsumerID, status) if err != nil { - t.Errorf("UpdateStatus failed: %v", err) + t.Fatalf("UpdateStatus failed: %v", err) } // Retrieve status retrievedStatus, err := registry.GetSupervisor(status.ConsumerID) if err != nil { - t.Errorf("GetSupervisor failed: %v", err) + t.Fatalf("GetSupervisor failed: %v", err) } if retrievedStatus.Status != status.Status { @@ -200,10 +206,11 @@ func TestStatusRegistry_BasicOperations(t *testing.T) { // Test getting active supervisors activeSupervisors, err := registry.GetActiveSupervisors() if err != nil { - t.Errorf("GetActiveSupervisors failed: %v", err) + t.Fatalf("GetActiveSupervisors failed: %v", err) } if len(activeSupervisors) != 1 { t.Errorf("Expected 1 active supervisor, got %d", len(activeSupervisors)) } } + diff --git a/src/job_scheduling_test.go b/src/job_scheduling_test.go index b7271f1..5136c65 100644 --- a/src/job_scheduling_test.go +++ b/src/job_scheduling_test.go @@ -17,7 +17,7 @@ func TestJobEnqueueAndSupervisor(t *testing.T) { client := redis.NewClient(&redis.Options{Addr: redisAddr}) defer client.Close() if err := client.Ping(context.Background()).Err(); err != nil { - t.Fatalf("Failed to connect to Redis: %v", err) + t.Skipf("Redis not running at %s, skipping: %v (run: docker-compose up -d)", redisAddr, err) } client.FlushDB(context.Background()) diff --git a/src/supervisor.go b/src/supervisor.go index 96da66a..236631c 100644 --- a/src/supervisor.go +++ b/src/supervisor.go @@ -231,7 +231,7 @@ func (s *Supervisor) processJob(job Job) bool { return false } - containerID, err := s.dockerMgr.RunContainer(CPUImage, CPURuntime, volumeName) + containerID, err := s.dockerMgr.RunContainer(CPUImage, CPURuntime, volumeName, job.ID) if err != nil { s.log.Error("failed to run container for job", "job_id", job.ID, "error", err) _ = s.dockerMgr.RemoveVolume(volumeName, true) @@ -293,6 +293,17 @@ 
func (s *Supervisor) updateJobState(jobID string, state JobState) { } } +// GetContainerLogsForJob fetches logs from Docker for a job whose container is currently running. +// The container is named with the job ID, so we use jobID directly to fetch logs. +// Returns an error if the container does not exist or is not running. +// Logs are not persisted; they are only available while the container is running. +func (s *Supervisor) GetContainerLogsForJob(jobID string) ([]byte, error) { + if s.dockerMgr == nil { + return nil, fmt.Errorf("Docker not available") + } + return s.dockerMgr.GetContainerLogs(s.ctx, jobID, docker.LogOptions{}) +} + func (s *Supervisor) ackMessage(messageID string) { result := s.redisClient.XAck(s.ctx, StreamName, ConsumerGroup, messageID) if result.Err() != nil {