From 9026f00d98b87fdba31a27f2dd2790d63c08c675 Mon Sep 17 00:00:00 2001 From: Benji Rewis Date: Tue, 10 Dec 2024 15:56:26 -0500 Subject: [PATCH 01/32] initial impl --- robot/client/client.go | 10 +++++ robot/client/client_test.go | 23 ++++++++++++ robot/impl/local_robot.go | 38 ++++++++++++++++--- robot/impl/local_robot_test.go | 69 ++++++++++++++++++++++++++++++++++ robot/robot.go | 14 +++++++ robot/server/server.go | 10 +++++ robot/server/server_test.go | 66 ++++++++++++++++++++++++++++---- web/server/entrypoint.go | 36 ++++++++++++++---- 8 files changed, 245 insertions(+), 21 deletions(-) diff --git a/robot/client/client.go b/robot/client/client.go index 32a641052a2..c8a379bf9f7 100644 --- a/robot/client/client.go +++ b/robot/client/client.go @@ -1115,6 +1115,16 @@ func (rc *RobotClient) MachineStatus(ctx context.Context) (robot.MachineStatus, mStatus.Resources = append(mStatus.Resources, resStatus) } + switch resp.State { + case pb.GetMachineStatusResponse_STATE_UNSPECIFIED: + rc.logger.CError(ctx, "received unspecified machine state") + mStatus.State = robot.StateUnknown + case pb.GetMachineStatusResponse_STATE_INITIALIZING: + mStatus.State = robot.StateInitializing + case pb.GetMachineStatusResponse_STATE_RUNNING: + mStatus.State = robot.StateRunning + } + return mStatus, nil } diff --git a/robot/client/client_test.go b/robot/client/client_test.go index e9a850e7549..5024e5fe551 100644 --- a/robot/client/client_test.go +++ b/robot/client/client_test.go @@ -1975,6 +1975,7 @@ func TestMachineStatus(t *testing.T) { robot.MachineStatus{ Config: config.Revision{Revision: "rev1"}, Resources: []resource.Status{}, + State: robot.StateRunning, }, 0, }, @@ -1990,6 +1991,7 @@ func TestMachineStatus(t *testing.T) { }, }, }, + State: robot.StateRunning, }, 1, }, @@ -2006,6 +2008,7 @@ func TestMachineStatus(t *testing.T) { }, }, }, + State: robot.StateRunning, }, 0, }, @@ -2034,6 +2037,7 @@ func TestMachineStatus(t *testing.T) { }, }, }, + State: robot.StateRunning, }, 2, }, @@ -2073,6 +2077,25 @@ func TestMachineStatus(t *testing.T) { }, }, }, + State: robot.StateRunning, + }, + 0, + }, + { + "initializing machine state", + robot.MachineStatus{ + Config: config.Revision{Revision: "rev1"}, + Resources: []resource.Status{}, + State: robot.StateInitializing, + }, + 0, + }, + { + "unknown machine state", + robot.MachineStatus{ + Config: config.Revision{Revision: "rev1"}, + Resources: []resource.Status{}, + State: robot.StateUnknown, }, 0, }, diff --git a/robot/impl/local_robot.go b/robot/impl/local_robot.go index 0822cfcdef7..be22f4cc7a1 100644 --- a/robot/impl/local_robot.go +++ b/robot/impl/local_robot.go @@ -91,6 +91,10 @@ type localRobot struct { // whether the robot is actively reconfiguring reconfiguring atomic.Bool + + // whether the robot is still initializing (first reconfigure after initial + // construction has not yet completed.) + initializing bool } // ExportResourcesAsDot exports the resource graph as a DOT representation for @@ -511,6 +515,8 @@ func newWithResources( } successful = true + // Robot is "initializing" until first reconfigure after initial creation completes. + r.initializing = true return r, nil } @@ -1097,6 +1103,14 @@ func dialRobotClient( // possibly leak resources. The given config may be modified by Reconfigure. func (r *localRobot) Reconfigure(ctx context.Context, newConfig *config.Config) { r.reconfigure(ctx, newConfig, false) + + // Robot is no longer initializing after a reconfigure completes. It does not + // matter if the call above resulted in errors, we only care that all + // initially specified components, servies, modules, remotes, and processes + // got a chance to attempt configuration. + // + // On initial construction, this value will get set back to true. + r.initializing = false } // set Module.LocalVersion on Type=local modules. Call this before localPackages.Sync and in RestartModule. @@ -1289,18 +1303,26 @@ func (r *localRobot) reconfigure(ctx context.Context, newConfig *config.Config, allErrs = multierr.Combine(allErrs, err) } - // Cleanup unused packages after all old resources have been closed above. This ensures - // processes are shutdown before any files are deleted they are using. - allErrs = multierr.Combine(allErrs, r.packageManager.Cleanup(ctx)) - allErrs = multierr.Combine(allErrs, r.localPackages.Cleanup(ctx)) - // Cleanup extra dirs from previous modules or rogue scripts. - allErrs = multierr.Combine(allErrs, r.manager.moduleManager.CleanModuleDataDirectory()) + // If not initializing, cleanup unused packages after all old resources have + // been closed above. This ensures processes are shutdown before any files + // are deleted they are using. + // + // If initializing, machine will be starting with no modules, but may + // immediately reconfigure to start modules that have already been + // downloaded. Do not cleanup packages/module dirs in that case. + if !r.initializing { + allErrs = multierr.Combine(allErrs, r.packageManager.Cleanup(ctx)) + allErrs = multierr.Combine(allErrs, r.localPackages.Cleanup(ctx)) + // Cleanup extra dirs from previous modules or rogue scripts. + allErrs = multierr.Combine(allErrs, r.manager.moduleManager.CleanModuleDataDirectory()) + } if allErrs != nil { r.logger.CErrorw(ctx, "The following errors were gathered during reconfiguration", "errors", allErrs) } else { r.logger.CInfow(ctx, "Robot (re)configured") } + } // checkMaxInstance checks to see if the local robot has reached the maximum number of a specific resource type that are local. @@ -1431,6 +1453,10 @@ func (r *localRobot) MachineStatus(ctx context.Context) (robot.MachineStatus, er result.Config = r.configRevision r.configRevisionMu.RUnlock() + result.State = robot.StateRunning + if r.initializing { + result.State = robot.StateInitializing + } return result, nil } diff --git a/robot/impl/local_robot_test.go b/robot/impl/local_robot_test.go index 0c505c281b9..c71c8dbf614 100644 --- a/robot/impl/local_robot_test.go +++ b/robot/impl/local_robot_test.go @@ -4504,3 +4504,72 @@ func TestRemovingOfflineRemotes(t *testing.T) { cancelReconfig() wg.Wait() } + +// Tests that machine state properly reports initializing and running. +//func TestMachineState(t *testing.T) { +//logger := logging.NewTestLogger(t) +//ctx := context.Background() + +//completeComponentConstruction := make(chan struct{}, 1) + +//// Register a `foo` component whose construction completion can be delayed, +//// and defer its deregistration. +//resource.RegisterComponent(generic.API, fooModel, resource.Registration[resource.Resource, +//resource.NoNativeConfig]{ +//Constructor: func( +//ctx context.Context, +//deps resource.Dependencies, +//conf resource.Config, +//logger logging.Logger, +//) (resource.Resource, error) { +//// Delay completion of constructor until `completeComponentConstruction` is closed. +//<-completeComponentConstruction + +//return &fooComponent{ +//Named: conf.ResourceName().AsNamed(), +//logger: logger, +//}, nil +//}, +//}) +//defer func() { +//resource.Deregister(generic.API, fooModel) +//}() + +//cfg := &config.Config{ +//Components: []resource.Config{ +//{ +//Name: "foo", +//API: generic.API, +//Model: fooModel, +//}, +//}, +//} +//r := setupLocalRobot(t, ctx, cfg, logger) + +//// Assert that robot reports a state of "initializing" until `foo` completes construction. +//machineStatus, err := r.MachineStatus(ctx) +//test.That(t, err, test.ShouldBeNil) +//test.That(t, machineStatus, test.ShouldNotBeNil) +//test.That(t, machineStatus.State, test.ShouldEqual, robot.StateInitializing) + +//close(completeComponentConstruction) + +//// Assert that robot reports a state of "running" after `foo` completes +//// construction. +//machineStatus, err = r.MachineStatus(ctx) +//test.That(t, err, test.ShouldBeNil) +//test.That(t, machineStatus, test.ShouldNotBeNil) +//test.That(t, machineStatus.State, test.ShouldEqual, robot.StateRunning) + +//// Reconfigure robot to replace `foo` with idential `bar` component (should build +//// immediately, as `completeComponentConstruction` has already been closed.) +//cfg.Components[0].Name = "bar" +//r.Reconfigure(ctx, cfg) + +//// Assert that robot continues to report a state of "running" even after further +//// reconfiguration. +//machineStatus, err = r.MachineStatus(ctx) +//test.That(t, err, test.ShouldBeNil) +//test.That(t, machineStatus, test.ShouldNotBeNil) +//test.That(t, machineStatus.State, test.ShouldEqual, robot.StateRunning) +//} diff --git a/robot/robot.go b/robot/robot.go index f57330d0852..a08fda88009 100644 --- a/robot/robot.go +++ b/robot/robot.go @@ -310,10 +310,24 @@ func (rmr *RestartModuleRequest) MatchesModule(mod config.Module) bool { return mod.Name == rmr.ModuleName } +type MachineState uint8 + +const ( + // StateUnknown represents an unknown state. + StateUnknown MachineState = iota + // StateInitializing denotes a currently initializing machine. The first + // reconfigure after initial creation has not completed. + StateInitializing + // StateRunning denotes a running machine. The first reconfigure after + // initial creation has completed. + StateRunning +) + // MachineStatus encapsulates the current status of the robot. type MachineStatus struct { Resources []resource.Status Config config.Revision + State MachineState } // VersionResponse encapsulates the version info of the robot. diff --git a/robot/server/server.go b/robot/server/server.go index 86c3df4a8ee..cdcd8a7988f 100644 --- a/robot/server/server.go +++ b/robot/server/server.go @@ -476,6 +476,16 @@ func (s *Server) GetMachineStatus(ctx context.Context, _ *pb.GetMachineStatusReq result.Resources = append(result.Resources, pbResStatus) } + switch mStatus.State { + case robot.StateUnknown: + s.robot.Logger().CError(ctx, "machine in an unknown state") + result.State = pb.GetMachineStatusResponse_STATE_UNSPECIFIED + case robot.StateInitializing: + result.State = pb.GetMachineStatusResponse_STATE_INITIALIZING + case robot.StateRunning: + result.State = pb.GetMachineStatusResponse_STATE_RUNNING + } + return &result, nil } diff --git a/robot/server/server_test.go b/robot/server/server_test.go index 79437d8f2fe..24838b461a5 100644 --- a/robot/server/server_test.go +++ b/robot/server/server_test.go @@ -73,20 +73,25 @@ func TestServer(t *testing.T) { t.Run("GetMachineStatus", func(t *testing.T) { for _, tc := range []struct { - name string - injectMachineStatus robot.MachineStatus - expConfig *pb.ConfigStatus - expResources []*pb.ResourceStatus - expBadStateCount int + name string + injectMachineStatus robot.MachineStatus + expConfig *pb.ConfigStatus + expResources []*pb.ResourceStatus + expState pb.GetMachineStatusResponse_State + expBadResourceStateCount int + expBadMachineStateCount int }{ { "no resources", robot.MachineStatus{ Config: config.Revision{Revision: "rev1"}, Resources: []resource.Status{}, + State: robot.StateRunning, }, &pb.ConfigStatus{Revision: "rev1"}, []*pb.ResourceStatus{}, + pb.GetMachineStatusResponse_STATE_RUNNING, + 0, 0, }, { @@ -101,6 +106,7 @@ func TestServer(t *testing.T) { }, }, }, + State: robot.StateRunning, }, &pb.ConfigStatus{Revision: "rev1"}, []*pb.ResourceStatus{ @@ -110,7 +116,9 @@ func TestServer(t *testing.T) { Revision: "rev0", }, }, + pb.GetMachineStatusResponse_STATE_RUNNING, 1, + 0, }, { "resource with valid status", @@ -125,6 +133,7 @@ func TestServer(t *testing.T) { }, }, }, + State: robot.StateRunning, }, &pb.ConfigStatus{Revision: "rev1"}, []*pb.ResourceStatus{ @@ -134,6 +143,8 @@ func TestServer(t *testing.T) { Revision: "rev1", }, }, + pb.GetMachineStatusResponse_STATE_RUNNING, + 0, 0, }, { @@ -249,6 +260,7 @@ func TestServer(t *testing.T) { }, }, }, + State: robot.StateRunning, }, &pb.ConfigStatus{Revision: "rev1"}, []*pb.ResourceStatus{ @@ -268,7 +280,9 @@ func TestServer(t *testing.T) { Revision: "rev-1", }, }, + pb.GetMachineStatusResponse_STATE_RUNNING, 2, + 0, }, { "unhealthy status", @@ -284,6 +298,7 @@ func TestServer(t *testing.T) { }, }, }, + State: robot.StateRunning, }, &pb.ConfigStatus{Revision: "rev1"}, []*pb.ResourceStatus{ @@ -294,7 +309,35 @@ func TestServer(t *testing.T) { Error: "bad configuration", }, }, + pb.GetMachineStatusResponse_STATE_RUNNING, + 0, + 0, + }, + { + "initializing machine state", + robot.MachineStatus{ + Config: config.Revision{Revision: "rev1"}, + Resources: []resource.Status{}, + State: robot.StateInitializing, + }, + &pb.ConfigStatus{Revision: "rev1"}, + []*pb.ResourceStatus{}, + pb.GetMachineStatusResponse_STATE_INITIALIZING, + 0, + 0, + }, + { + "unknown machine state", + robot.MachineStatus{ + Config: config.Revision{Revision: "rev1"}, + Resources: []resource.Status{}, + State: robot.StateUnknown, + }, + &pb.ConfigStatus{Revision: "rev1"}, + []*pb.ResourceStatus{}, + pb.GetMachineStatusResponse_STATE_UNSPECIFIED, 0, + 1, }, } { logger, logs := logging.NewObservedTestLogger(t) @@ -315,9 +358,16 @@ func TestServer(t *testing.T) { test.That(t, res.GetState(), test.ShouldResemble, tc.expResources[i].State) test.That(t, res.GetRevision(), test.ShouldEqual, tc.expResources[i].Revision) } - const badStateMsg = "resource in an unknown state" - badStateCount := logs.FilterLevelExact(zapcore.ErrorLevel).FilterMessageSnippet(badStateMsg).Len() - test.That(t, badStateCount, test.ShouldEqual, tc.expBadStateCount) + + test.That(t, resp.GetState(), test.ShouldEqual, tc.expState) + + const badResourceStateMsg = "resource in an unknown state" + badResourceStateCount := logs.FilterLevelExact(zapcore.ErrorLevel).FilterMessageSnippet(badResourceStateMsg).Len() + test.That(t, badResourceStateCount, test.ShouldEqual, tc.expBadResourceStateCount) + + const badMachineStateMsg = "machine in an unknown state" + badMachineStateCount := logs.FilterLevelExact(zapcore.ErrorLevel).FilterMessageSnippet(badMachineStateMsg).Len() + test.That(t, badMachineStateCount, test.ShouldEqual, tc.expBadMachineStateCount) } }) diff --git a/web/server/entrypoint.go b/web/server/entrypoint.go index 759e7f5f4fb..2765e97ad79 100644 --- a/web/server/entrypoint.go +++ b/web/server/entrypoint.go @@ -338,7 +338,7 @@ func (s *robotServer) serveWeb(ctx context.Context, cfg *config.Config) (err err return out, nil } - processedConfig, err := processConfig(cfg) + fullProcessedConfig, err := processConfig(cfg) if err != nil { return err } @@ -347,9 +347,9 @@ func (s *robotServer) serveWeb(ctx context.Context, cfg *config.Config) (err err // updates to the registry will be handled by the config watcher goroutine. // // This functionality is tested in `TestLogPropagation` in `local_robot_test.go`. - config.UpdateLoggerRegistryFromConfig(s.registry, processedConfig, s.logger) + config.UpdateLoggerRegistryFromConfig(s.registry, fullProcessedConfig, s.logger) - if processedConfig.Cloud != nil { + if fullProcessedConfig.Cloud != nil { cloudRestartCheckerActive = make(chan struct{}) utils.PanicCapturingGo(func() { defer close(cloudRestartCheckerActive) @@ -398,7 +398,20 @@ func (s *robotServer) serveWeb(ctx context.Context, cfg *config.Config) (err err robotOptions = append(robotOptions, robotimpl.WithFTDC()) } - myRobot, err := robotimpl.New(ctx, processedConfig, s.logger, robotOptions...) + // Create `minimalProcessedConfig`, a copy of `fullProcessedConfig`. Remove + // all components, services, remotes, modules, and processes from + // `minimalProcessedConfig`. Create new robot with `minimalProcessedConfig` + // and immediately start web service. We need the machine to be reachable + // through the web service ASAP, even if some resources take a long time to + // initially configure. + minimalProcessedConfig := &(*fullProcessedConfig) + minimalProcessedConfig.Components = nil + minimalProcessedConfig.Services = nil + minimalProcessedConfig.Remotes = nil + minimalProcessedConfig.Modules = nil + minimalProcessedConfig.Processes = nil + + myRobot, err := robotimpl.New(ctx, minimalProcessedConfig, s.logger, robotOptions...) if err != nil { cancel() return err @@ -417,7 +430,10 @@ func (s *robotServer) serveWeb(ctx context.Context, cfg *config.Config) (err err err = multierr.Combine(err, watcher.Close()) }() onWatchDone := make(chan struct{}) - oldCfg := processedConfig + // Use `fullProcessedConfig` as the initial `oldCfg` for the config watcher + // goroutine, as we want incoming config changes to be compared to the full + // config. + oldCfg := fullProcessedConfig utils.ManagedGo(func() { for { select { @@ -479,11 +495,17 @@ func (s *robotServer) serveWeb(ctx context.Context, cfg *config.Config) (err err }() defer cancel() - options, err := s.createWebOptions(processedConfig) + // As mentioned above, start web service with `minimalProcessedConfig`. Then, + // `Reconfigure` robot to have `fullProcessedConfig`. + options, err := s.createWebOptions(minimalProcessedConfig) if err != nil { return err } - return web.RunWeb(ctx, myRobot, options, s.logger) + if err := web.RunWeb(ctx, myRobot, options, s.logger); err != nil { + return err + } + myRobot.Reconfigure(ctx, fullProcessedConfig) + return nil } // dumpResourceRegistrations prints all builtin resource registrations as a json array From 53b9d29664ebfcb4d850c12402fa0a18ee38ab83 Mon Sep 17 00:00:00 2001 From: Benji Rewis Date: Tue, 10 Dec 2024 16:07:12 -0500 Subject: [PATCH 02/32] atomic --- robot/impl/local_robot.go | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/robot/impl/local_robot.go b/robot/impl/local_robot.go index be22f4cc7a1..ed4abd99e2e 100644 --- a/robot/impl/local_robot.go +++ b/robot/impl/local_robot.go @@ -94,7 +94,7 @@ type localRobot struct { // whether the robot is still initializing (first reconfigure after initial // construction has not yet completed.) - initializing bool + initializing atomic.Bool } // ExportResourcesAsDot exports the resource graph as a DOT representation for @@ -516,7 +516,7 @@ func newWithResources( successful = true // Robot is "initializing" until first reconfigure after initial creation completes. - r.initializing = true + r.initializing.Store(true) return r, nil } @@ -1110,7 +1110,7 @@ func (r *localRobot) Reconfigure(ctx context.Context, newConfig *config.Config) // got a chance to attempt configuration. // // On initial construction, this value will get set back to true. - r.initializing = false + r.initializing.Store(false) } // set Module.LocalVersion on Type=local modules. Call this before localPackages.Sync and in RestartModule. @@ -1310,7 +1310,7 @@ func (r *localRobot) reconfigure(ctx context.Context, newConfig *config.Config, // If initializing, machine will be starting with no modules, but may // immediately reconfigure to start modules that have already been // downloaded. Do not cleanup packages/module dirs in that case. - if !r.initializing { + if !r.initializing.Load() { allErrs = multierr.Combine(allErrs, r.packageManager.Cleanup(ctx)) allErrs = multierr.Combine(allErrs, r.localPackages.Cleanup(ctx)) // Cleanup extra dirs from previous modules or rogue scripts. @@ -1322,7 +1322,6 @@ func (r *localRobot) reconfigure(ctx context.Context, newConfig *config.Config, } else { r.logger.CInfow(ctx, "Robot (re)configured") } - } // checkMaxInstance checks to see if the local robot has reached the maximum number of a specific resource type that are local. @@ -1454,7 +1453,7 @@ func (r *localRobot) MachineStatus(ctx context.Context) (robot.MachineStatus, er r.configRevisionMu.RUnlock() result.State = robot.StateRunning - if r.initializing { + if r.initializing.Load() { result.State = robot.StateInitializing } return result, nil From 4802b8d8384ea9500f3d3a1770944b8d470e0028 Mon Sep 17 00:00:00 2001 From: Benji Rewis Date: Wed, 11 Dec 2024 13:38:36 -0500 Subject: [PATCH 03/32] start config watcher goroutine last --- web/server/entrypoint.go | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/web/server/entrypoint.go b/web/server/entrypoint.go index 2765e97ad79..2367bb3363d 100644 --- a/web/server/entrypoint.go +++ b/web/server/entrypoint.go @@ -420,6 +420,17 @@ func (s *robotServer) serveWeb(ctx context.Context, cfg *config.Config) (err err err = multierr.Combine(err, myRobot.Close(context.Background())) }() + // Start web service with `minimalProcessedConfig`, then `Reconfigure` robot + // to have `fullProcessedConfig`. + options, err := s.createWebOptions(minimalProcessedConfig) + if err != nil { + return err + } + if err := web.RunWeb(ctx, myRobot, options, s.logger); err != nil { + return err + } + myRobot.Reconfigure(ctx, fullProcessedConfig) + // watch for and deliver changes to the robot watcher, err := config.NewWatcher(ctx, cfg, s.logger.Sublogger("config")) if err != nil { @@ -494,17 +505,6 @@ func (s *robotServer) serveWeb(ctx context.Context, cfg *config.Config) (err err <-onWatchDone }() defer cancel() - - // As mentioned above, start web service with `minimalProcessedConfig`. Then, - // `Reconfigure` robot to have `fullProcessedConfig`. - options, err := s.createWebOptions(minimalProcessedConfig) - if err != nil { - return err - } - if err := web.RunWeb(ctx, myRobot, options, s.logger); err != nil { - return err - } - myRobot.Reconfigure(ctx, fullProcessedConfig) return nil } From 8fa983092926b012f3e7d3b5ab2d667c4e4175e5 Mon Sep 17 00:00:00 2001 From: Benji Rewis Date: Wed, 11 Dec 2024 13:57:17 -0500 Subject: [PATCH 04/32] addrderef -> CopyOnlyPublicFields --- web/server/entrypoint.go | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/web/server/entrypoint.go b/web/server/entrypoint.go index 2367bb3363d..d500070afcb 100644 --- a/web/server/entrypoint.go +++ b/web/server/entrypoint.go @@ -404,7 +404,10 @@ func (s *robotServer) serveWeb(ctx context.Context, cfg *config.Config) (err err // and immediately start web service. We need the machine to be reachable // through the web service ASAP, even if some resources take a long time to // initially configure. - minimalProcessedConfig := &(*fullProcessedConfig) + minimalProcessedConfig, err := fullProcessedConfig.CopyOnlyPublicFields() + if err != nil { + return err + } minimalProcessedConfig.Components = nil minimalProcessedConfig.Services = nil minimalProcessedConfig.Remotes = nil From c78e16e66760af46484e5ce041c58fe54a30cca5 Mon Sep 17 00:00:00 2001 From: Benji Rewis Date: Wed, 11 Dec 2024 14:18:19 -0500 Subject: [PATCH 05/32] update api dependency and lint --- .../replay/replay_utils_test.go | 2 +- robot/impl/local_robot_test.go | 20 +++++++++---------- robot/robot.go | 1 + 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/components/movementsensor/replay/replay_utils_test.go b/components/movementsensor/replay/replay_utils_test.go index 73ccee66397..3681d36821d 100644 --- a/components/movementsensor/replay/replay_utils_test.go +++ b/components/movementsensor/replay/replay_utils_test.go @@ -46,8 +46,8 @@ type mockDataServiceServer struct { // //nolint:deprecated,staticcheck func (mDServer *mockDataServiceServer) TabularDataByFilter(ctx context.Context, req *datapb.TabularDataByFilterRequest, - //nolint:deprecated,staticcheck +) (*datapb.TabularDataByFilterResponse, error) { ) (*datapb.TabularDataByFilterResponse, error) { filter := req.DataRequest.GetFilter() last := req.DataRequest.GetLast() diff --git a/robot/impl/local_robot_test.go b/robot/impl/local_robot_test.go index c71c8dbf614..9a37c9e460e 100644 --- a/robot/impl/local_robot_test.go +++ b/robot/impl/local_robot_test.go @@ -4506,15 +4506,15 @@ func TestRemovingOfflineRemotes(t *testing.T) { } // Tests that machine state properly reports initializing and running. -//func TestMachineState(t *testing.T) { + //logger := logging.NewTestLogger(t) //ctx := context.Background() -//completeComponentConstruction := make(chan struct{}, 1) +// completeComponentConstruction := make(chan struct{}, 1) //// Register a `foo` component whose construction completion can be delayed, //// and defer its deregistration. -//resource.RegisterComponent(generic.API, fooModel, resource.Registration[resource.Resource, +// resource.RegisterComponent(generic.API, fooModel, resource.Registration[resource.Resource, //resource.NoNativeConfig]{ //Constructor: func( //ctx context.Context, @@ -4525,7 +4525,7 @@ func TestRemovingOfflineRemotes(t *testing.T) { //// Delay completion of constructor until `completeComponentConstruction` is closed. //<-completeComponentConstruction -//return &fooComponent{ +// return &fooComponent{ //Named: conf.ResourceName().AsNamed(), //logger: logger, //}, nil @@ -4535,7 +4535,7 @@ func TestRemovingOfflineRemotes(t *testing.T) { //resource.Deregister(generic.API, fooModel) //}() -//cfg := &config.Config{ +// cfg := &config.Config{ //Components: []resource.Config{ //{ //Name: "foo", @@ -4547,28 +4547,28 @@ func TestRemovingOfflineRemotes(t *testing.T) { //r := setupLocalRobot(t, ctx, cfg, logger) //// Assert that robot reports a state of "initializing" until `foo` completes construction. -//machineStatus, err := r.MachineStatus(ctx) +// machineStatus, err := r.MachineStatus(ctx) //test.That(t, err, test.ShouldBeNil) //test.That(t, machineStatus, test.ShouldNotBeNil) //test.That(t, machineStatus.State, test.ShouldEqual, robot.StateInitializing) -//close(completeComponentConstruction) +// close(completeComponentConstruction) //// Assert that robot reports a state of "running" after `foo` completes //// construction. -//machineStatus, err = r.MachineStatus(ctx) +// machineStatus, err = r.MachineStatus(ctx) //test.That(t, err, test.ShouldBeNil) //test.That(t, machineStatus, test.ShouldNotBeNil) //test.That(t, machineStatus.State, test.ShouldEqual, robot.StateRunning) //// Reconfigure robot to replace `foo` with idential `bar` component (should build //// immediately, as `completeComponentConstruction` has already been closed.) -//cfg.Components[0].Name = "bar" +// cfg.Components[0].Name = "bar" //r.Reconfigure(ctx, cfg) //// Assert that robot continues to report a state of "running" even after further //// reconfiguration. -//machineStatus, err = r.MachineStatus(ctx) +// machineStatus, err = r.MachineStatus(ctx) //test.That(t, err, test.ShouldBeNil) //test.That(t, machineStatus, test.ShouldNotBeNil) //test.That(t, machineStatus.State, test.ShouldEqual, robot.StateRunning) diff --git a/robot/robot.go b/robot/robot.go index a08fda88009..e3a8c7538df 100644 --- a/robot/robot.go +++ b/robot/robot.go @@ -310,6 +310,7 @@ func (rmr *RestartModuleRequest) MatchesModule(mod config.Module) bool { return mod.Name == rmr.ModuleName } +// MachineState captures the state of a machine. type MachineState uint8 const ( From 62c3f5edb1b97359d10157a83a10338109b11c84 Mon Sep 17 00:00:00 2001 From: Benji Rewis Date: Wed, 11 Dec 2024 15:34:09 -0500 Subject: [PATCH 06/32] more lint sigh --- robot/impl/local_robot_test.go | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/robot/impl/local_robot_test.go b/robot/impl/local_robot_test.go index 9a37c9e460e..62935a86381 100644 --- a/robot/impl/local_robot_test.go +++ b/robot/impl/local_robot_test.go @@ -4507,7 +4507,7 @@ func TestRemovingOfflineRemotes(t *testing.T) { // Tests that machine state properly reports initializing and running. -//logger := logging.NewTestLogger(t) + //ctx := context.Background() // completeComponentConstruction := make(chan struct{}, 1) @@ -4515,7 +4515,7 @@ func TestRemovingOfflineRemotes(t *testing.T) { //// Register a `foo` component whose construction completion can be delayed, //// and defer its deregistration. // resource.RegisterComponent(generic.API, fooModel, resource.Registration[resource.Resource, -//resource.NoNativeConfig]{ +// resource.NoNativeConfig]{ //Constructor: func( //ctx context.Context, //deps resource.Dependencies, @@ -4528,7 +4528,7 @@ func TestRemovingOfflineRemotes(t *testing.T) { // return &fooComponent{ //Named: conf.ResourceName().AsNamed(), //logger: logger, -//}, nil +// }, nil //}, //}) //defer func() { @@ -4541,14 +4541,14 @@ func TestRemovingOfflineRemotes(t *testing.T) { //Name: "foo", //API: generic.API, //Model: fooModel, -//}, +// }, //}, //} //r := setupLocalRobot(t, ctx, cfg, logger) //// Assert that robot reports a state of "initializing" until `foo` completes construction. // machineStatus, err := r.MachineStatus(ctx) -//test.That(t, err, test.ShouldBeNil) +// test.That(t, err, test.ShouldBeNil) //test.That(t, machineStatus, test.ShouldNotBeNil) //test.That(t, machineStatus.State, test.ShouldEqual, robot.StateInitializing) @@ -4557,19 +4557,19 @@ func TestRemovingOfflineRemotes(t *testing.T) { //// Assert that robot reports a state of "running" after `foo` completes //// construction. // machineStatus, err = r.MachineStatus(ctx) -//test.That(t, err, test.ShouldBeNil) +// test.That(t, err, test.ShouldBeNil) //test.That(t, machineStatus, test.ShouldNotBeNil) //test.That(t, machineStatus.State, test.ShouldEqual, robot.StateRunning) //// Reconfigure robot to replace `foo` with idential `bar` component (should build //// immediately, as `completeComponentConstruction` has already been closed.) // cfg.Components[0].Name = "bar" -//r.Reconfigure(ctx, cfg) +// r.Reconfigure(ctx, cfg) //// Assert that robot continues to report a state of "running" even after further //// reconfiguration. // machineStatus, err = r.MachineStatus(ctx) -//test.That(t, err, test.ShouldBeNil) +// test.That(t, err, test.ShouldBeNil) //test.That(t, machineStatus, test.ShouldNotBeNil) //test.That(t, machineStatus.State, test.ShouldEqual, robot.StateRunning) //} From 3b9d6b9491f1e49e7a1e5a9905f4a2d7589d9cf9 Mon Sep 17 00:00:00 2001 From: Benji Rewis Date: Wed, 11 Dec 2024 16:04:27 -0500 Subject: [PATCH 07/32] basic entrypoint test --- robot/impl/local_robot_test.go | 69 ---------------------------------- web/server/entrypoint_test.go | 51 +++++++++++++++++++++++++ 2 files changed, 51 insertions(+), 69 deletions(-) diff --git a/robot/impl/local_robot_test.go b/robot/impl/local_robot_test.go index 62935a86381..0c505c281b9 100644 --- a/robot/impl/local_robot_test.go +++ b/robot/impl/local_robot_test.go @@ -4504,72 +4504,3 @@ func TestRemovingOfflineRemotes(t *testing.T) { cancelReconfig() wg.Wait() } - -// Tests that machine state properly reports initializing and running. - - -//ctx := context.Background() - -// completeComponentConstruction := make(chan struct{}, 1) - -//// Register a `foo` component whose construction completion can be delayed, -//// and defer its deregistration. -// resource.RegisterComponent(generic.API, fooModel, resource.Registration[resource.Resource, -// resource.NoNativeConfig]{ -//Constructor: func( -//ctx context.Context, -//deps resource.Dependencies, -//conf resource.Config, -//logger logging.Logger, -//) (resource.Resource, error) { -//// Delay completion of constructor until `completeComponentConstruction` is closed. -//<-completeComponentConstruction - -// return &fooComponent{ -//Named: conf.ResourceName().AsNamed(), -//logger: logger, -// }, nil -//}, -//}) -//defer func() { -//resource.Deregister(generic.API, fooModel) -//}() - -// cfg := &config.Config{ -//Components: []resource.Config{ -//{ -//Name: "foo", -//API: generic.API, -//Model: fooModel, -// }, -//}, -//} -//r := setupLocalRobot(t, ctx, cfg, logger) - -//// Assert that robot reports a state of "initializing" until `foo` completes construction. -// machineStatus, err := r.MachineStatus(ctx) -// test.That(t, err, test.ShouldBeNil) -//test.That(t, machineStatus, test.ShouldNotBeNil) -//test.That(t, machineStatus.State, test.ShouldEqual, robot.StateInitializing) - -// close(completeComponentConstruction) - -//// Assert that robot reports a state of "running" after `foo` completes -//// construction. -// machineStatus, err = r.MachineStatus(ctx) -// test.That(t, err, test.ShouldBeNil) -//test.That(t, machineStatus, test.ShouldNotBeNil) -//test.That(t, machineStatus.State, test.ShouldEqual, robot.StateRunning) - -//// Reconfigure robot to replace `foo` with idential `bar` component (should build -//// immediately, as `completeComponentConstruction` has already been closed.) -// cfg.Components[0].Name = "bar" -// r.Reconfigure(ctx, cfg) - -//// Assert that robot continues to report a state of "running" even after further -//// reconfiguration. -// machineStatus, err = r.MachineStatus(ctx) -// test.That(t, err, test.ShouldBeNil) -//test.That(t, machineStatus, test.ShouldNotBeNil) -//test.That(t, machineStatus.State, test.ShouldEqual, robot.StateRunning) -//} diff --git a/web/server/entrypoint_test.go b/web/server/entrypoint_test.go index 7b8cf98771c..8256e069ee8 100644 --- a/web/server/entrypoint_test.go +++ b/web/server/entrypoint_test.go @@ -27,6 +27,7 @@ import ( "go.viam.com/rdk/config" "go.viam.com/rdk/logging" "go.viam.com/rdk/resource" + "go.viam.com/rdk/robot" _ "go.viam.com/rdk/services/register" "go.viam.com/rdk/testutils" "go.viam.com/rdk/testutils/robottestutils" @@ -192,3 +193,53 @@ func isExpectedShutdownError(err error, testLogger logging.Logger) bool { testLogger.Errorw("Unexpected shutdown error", "err", err) return false } + +// Tests that machine state properly reports initializing or running. +func TestMachineState(t *testing.T) { + if runtime.GOARCH == "arm" { + t.Skip("skipping on 32-bit ARM, subprocess build warnings cause failure") + } + + logger, logObserver := logging.NewObservedTestLogger(t) + + cfgFilename := utils.ResolveFile("/etc/configs/fake.json") + cfg, err := config.Read(context.Background(), cfgFilename, logger) + test.That(t, err, test.ShouldBeNil) + + var port int + var success bool + var server pexec.ManagedProcess + for portTryNum := 0; portTryNum < 10; portTryNum++ { + p, err := goutils.TryReserveRandomPort() + port = p + test.That(t, err, test.ShouldBeNil) + + cfg.Network.BindAddress = fmt.Sprintf(":%d", port) + cfgFilename, err = robottestutils.MakeTempConfig(t, cfg, logger) + test.That(t, err, test.ShouldBeNil) + + server = robottestutils.ServerAsSeparateProcess(t, cfgFilename, logger) + err = server.Start(context.Background()) + test.That(t, err, test.ShouldBeNil) + + if success = robottestutils.WaitForServing(logObserver, port); success { + defer func() { + test.That(t, server.Stop(), test.ShouldBeNil) + }() + break + } + logger.Infow("Port in use. Restarting on new port.", "port", port, "err", err) + server.Stop() + continue + } + test.That(t, success, test.ShouldBeTrue) + + addr := "localhost:" + strconv.Itoa(port) + rc := robottestutils.NewRobotClient(t, logger, addr, time.Second) + + machineStatus, err := rc.MachineStatus(context.Background()) + test.That(t, err, test.ShouldBeNil) + test.That(t, machineStatus, test.ShouldNotBeNil) + test.That(t, machineStatus.State, test.ShouldBeIn, + []robot.MachineState{robot.StateInitializing, robot.StateRunning}) +} From d97e5835963d6091eae01068a2ebf170c65c2f93 Mon Sep 17 00:00:00 2001 From: Benji Rewis Date: Fri, 13 Dec 2024 12:28:36 -0500 Subject: [PATCH 08/32] reconfigure in goroutine and check for state running in client new --- .../replay/replay_utils_test.go | 2 +- robot/client/client.go | 25 +++++++++++++++++++ web/server/entrypoint.go | 14 +++++------ web/server/entrypoint_test.go | 5 ++-- 4 files changed, 36 insertions(+), 10 deletions(-) diff --git a/components/movementsensor/replay/replay_utils_test.go b/components/movementsensor/replay/replay_utils_test.go index 3681d36821d..73ccee66397 100644 --- a/components/movementsensor/replay/replay_utils_test.go +++ b/components/movementsensor/replay/replay_utils_test.go @@ -46,8 +46,8 @@ type mockDataServiceServer struct { // //nolint:deprecated,staticcheck func (mDServer *mockDataServiceServer) TabularDataByFilter(ctx context.Context, req *datapb.TabularDataByFilterRequest, + //nolint:deprecated,staticcheck -) (*datapb.TabularDataByFilterResponse, error) { ) (*datapb.TabularDataByFilterResponse, error) { filter := req.DataRequest.GetFilter() last := req.DataRequest.GetLast() diff --git a/robot/client/client.go b/robot/client/client.go index c8a379bf9f7..292f71edea8 100644 --- a/robot/client/client.go +++ b/robot/client/client.go @@ -333,6 +333,31 @@ func New(ctx context.Context, address string, clientLogger logging.ZapCompatible }, rc.activeBackgroundWorkers.Done) } + // If running in a testing environment, wait for machine to report a state of + // running. We often establish connections in tests and expected resources to + // be immediately available once the web service has started; resources will + // not be available when the machine is still initializing. + // + // It is expected that golang SDK users will handle lack of resource + // availability due to the machine being in an initializing state themselves. + if testing.Testing() { + for { + if ctx.Err() != nil { + return nil, multierr.Combine(ctx.Err(), rc.conn.Close()) + } + + mStatus, err := rc.MachineStatus(ctx) + if err != nil { + return nil, multierr.Combine(err, rc.conn.Close()) + } + + if mStatus.State == robot.StateRunning { + break + } + time.Sleep(50 * time.Millisecond) + } + } + return rc, nil } diff --git a/web/server/entrypoint.go b/web/server/entrypoint.go index d500070afcb..efafd317b5b 100644 --- a/web/server/entrypoint.go +++ b/web/server/entrypoint.go @@ -423,16 +423,11 @@ func (s *robotServer) serveWeb(ctx context.Context, cfg *config.Config) (err err err = multierr.Combine(err, myRobot.Close(context.Background())) }() - // Start web service with `minimalProcessedConfig`, then `Reconfigure` robot - // to have `fullProcessedConfig`. + // Create initial web options with `minimalProcessedConfig`. options, err := s.createWebOptions(minimalProcessedConfig) if err != nil { return err } - if err := web.RunWeb(ctx, myRobot, options, s.logger); err != nil { - return err - } - myRobot.Reconfigure(ctx, fullProcessedConfig) // watch for and deliver changes to the robot watcher, err := config.NewWatcher(ctx, cfg, s.logger.Sublogger("config")) @@ -449,6 +444,10 @@ func (s *robotServer) serveWeb(ctx context.Context, cfg *config.Config) (err err // config. oldCfg := fullProcessedConfig utils.ManagedGo(func() { + // Reconfigure robot to have full processed config before listening for any + // config changes. + myRobot.Reconfigure(ctx, fullProcessedConfig) + for { select { case <-ctx.Done(): @@ -508,7 +507,8 @@ func (s *robotServer) serveWeb(ctx context.Context, cfg *config.Config) (err err <-onWatchDone }() defer cancel() - return nil + + return web.RunWeb(ctx, myRobot, options, s.logger) } // dumpResourceRegistrations prints all builtin resource registrations as a json array diff --git a/web/server/entrypoint_test.go b/web/server/entrypoint_test.go index 8256e069ee8..945e41e36c6 100644 --- a/web/server/entrypoint_test.go +++ b/web/server/entrypoint_test.go @@ -237,9 +237,10 @@ func TestMachineState(t *testing.T) { addr := "localhost:" + strconv.Itoa(port) rc := robottestutils.NewRobotClient(t, logger, addr, time.Second) + // NewRobotClient will wait for machine state to be running. Assert that this + // is still the case. machineStatus, err := rc.MachineStatus(context.Background()) test.That(t, err, test.ShouldBeNil) test.That(t, machineStatus, test.ShouldNotBeNil) - test.That(t, machineStatus.State, test.ShouldBeIn, - []robot.MachineState{robot.StateInitializing, robot.StateRunning}) + test.That(t, machineStatus.State, test.ShouldEqual, robot.StateRunning) } From 58b4b94705e6c8339e4bb99999fd10dea8854a28 Mon Sep 17 00:00:00 2001 From: Benji Rewis Date: Fri, 13 Dec 2024 12:43:40 -0500 Subject: [PATCH 09/32] typos --- components/movementsensor/replay/replay_utils_test.go | 2 -- web/server/entrypoint.go | 11 +++++------ 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/components/movementsensor/replay/replay_utils_test.go b/components/movementsensor/replay/replay_utils_test.go index 73ccee66397..ee35c74dd0d 100644 --- a/components/movementsensor/replay/replay_utils_test.go +++ b/components/movementsensor/replay/replay_utils_test.go @@ -53,7 +53,6 @@ func (mDServer *mockDataServiceServer) TabularDataByFilter(ctx context.Context, last := req.DataRequest.GetLast() limit := req.DataRequest.GetLimit() - //nolint:deprecated,staticcheck var dataset []*datapb.TabularData var dataIndex int var err error @@ -80,7 +79,6 @@ func (mDServer *mockDataServiceServer) TabularDataByFilter(ctx context.Context, last = fmt.Sprint(dataIndex) - //nolint:deprecated,staticcheck tabularData := &datapb.TabularData{ Data: data, TimeRequested: timeReq, diff --git a/web/server/entrypoint.go b/web/server/entrypoint.go index efafd317b5b..f70b84307cb 100644 --- a/web/server/entrypoint.go +++ b/web/server/entrypoint.go @@ -423,12 +423,6 @@ func (s *robotServer) serveWeb(ctx context.Context, cfg *config.Config) (err err err = multierr.Combine(err, myRobot.Close(context.Background())) }() - // Create initial web options with `minimalProcessedConfig`. - options, err := s.createWebOptions(minimalProcessedConfig) - if err != nil { - return err - } - // watch for and deliver changes to the robot watcher, err := config.NewWatcher(ctx, cfg, s.logger.Sublogger("config")) if err != nil { @@ -508,6 +502,11 @@ func (s *robotServer) serveWeb(ctx context.Context, cfg *config.Config) (err err }() defer cancel() + // Create initial web options with `minimalProcessedConfig`. + options, err := s.createWebOptions(minimalProcessedConfig) + if err != nil { + return err + } return web.RunWeb(ctx, myRobot, options, s.logger) } From eacd5617016445b2d1820225211e2771423c4cb1 Mon Sep 17 00:00:00 2001 From: Benji Rewis Date: Fri, 13 Dec 2024 15:07:39 -0500 Subject: [PATCH 10/32] client test inject fixes --- robot/client/client_test.go | 63 +++++++++++++++++++++++++++++-------- 1 file changed, 50 insertions(+), 13 deletions(-) diff --git a/robot/client/client_test.go b/robot/client/client_test.go index 5024e5fe551..236bfddc913 100644 --- a/robot/client/client_test.go +++ b/robot/client/client_test.go @@ -669,6 +669,9 @@ func TestClientRefresh(t *testing.T) { callCountNames++ return emptyResources } + injectRobot.MachineStatusFunc = func(context.Context) (robot.MachineStatus, error) { + return robot.MachineStatus{State: robot.StateRunning}, nil + } mu.Unlock() start := time.Now() @@ -806,6 +809,9 @@ func TestClientDisconnect(t *testing.T) { injectRobot.ResourceNamesFunc = func() []resource.Name { return []resource.Name{arm.Named("arm1")} } + injectRobot.MachineStatusFunc = func(context.Context) (robot.MachineStatus, error) { + return robot.MachineStatus{State: robot.StateRunning}, nil + } // TODO(RSDK-882): will update this so that this is not necessary injectRobot.FrameSystemConfigFunc = func(ctx context.Context) (*framesystem.Config, error) { @@ -859,21 +865,24 @@ func TestClientUnaryDisconnectHandler(t *testing.T) { info *grpc.UnaryServerInfo, handler grpc.UnaryHandler, ) (interface{}, error) { + // Allow a single GetMachineStatus through; return `io.ErrClosedPipe` + // after that. if strings.HasSuffix(info.FullMethod, "RobotService/GetMachineStatus") { if unaryStatusCallReceived { return nil, status.Error(codes.Unknown, io.ErrClosedPipe.Error()) } unaryStatusCallReceived = true } - var resp interface{} - return resp, nil + return handler(ctx, req) }, ) gServer := grpc.NewServer(justOneUnaryStatusCall) injectRobot := &inject.Robot{} + injectRobot.ResourceRPCAPIsFunc = func() []resource.RPCAPI { return nil } + injectRobot.ResourceNamesFunc = func() []resource.Name { return nil } injectRobot.MachineStatusFunc = func(ctx context.Context) (robot.MachineStatus, error) { - return robot.MachineStatus{}, nil + return robot.MachineStatus{State: robot.StateRunning}, nil } pb.RegisterRobotServiceServer(gServer, server.New(injectRobot)) @@ -888,10 +897,11 @@ func TestClientUnaryDisconnectHandler(t *testing.T) { WithReconnectEvery(never), ) test.That(t, err, test.ShouldBeNil) + // Reset unaryStatusCallReceived to false, as `New` call above set it to + // true. + unaryStatusCallReceived = false t.Run("unary call to connected remote", func(t *testing.T) { - t.Helper() - client.connected.Store(false) _, err = client.MachineStatus(context.Background()) test.That(t, status.Code(err), test.ShouldEqual, codes.Unavailable) @@ -901,8 +911,6 @@ func TestClientUnaryDisconnectHandler(t *testing.T) { }) t.Run("unary call to disconnected remote", func(t *testing.T) { - t.Helper() - _, err = client.MachineStatus(context.Background()) test.That(t, err, test.ShouldBeNil) test.That(t, unaryStatusCallReceived, test.ShouldBeTrue) @@ -947,7 +955,7 @@ func TestClientStreamDisconnectHandler(t *testing.T) { injectRobot.ResourceRPCAPIsFunc = func() []resource.RPCAPI { return nil } injectRobot.ResourceNamesFunc = func() []resource.Name { return nil } injectRobot.MachineStatusFunc = func(ctx context.Context) (robot.MachineStatus, error) { - return robot.MachineStatus{}, nil + return robot.MachineStatus{State: robot.StateRunning}, nil } pb.RegisterRobotServiceServer(gServer, server.New(injectRobot)) @@ -1041,7 +1049,9 @@ func TestClientReconnect(t *testing.T) { injectRobot.ResourceNamesFunc = func() []resource.Name { return []resource.Name{arm.Named("arm1"), thing1Name} } - + injectRobot.MachineStatusFunc = func(ctx context.Context) (robot.MachineStatus, error) { + return robot.MachineStatus{State: robot.StateRunning}, nil + } // TODO(RSDK-882): will update this so that this is not necessary injectRobot.FrameSystemConfigFunc = func(ctx context.Context) (*framesystem.Config, error) { return &framesystem.Config{}, nil @@ -1137,6 +1147,9 @@ func TestClientRefreshNoReconfigure(t *testing.T) { pb.RegisterRobotServiceServer(gServer, server.New(injectRobot)) injectRobot.ResourceRPCAPIsFunc = func() []resource.RPCAPI { return nil } thing1Name := resource.NewName(someAPI, "thing1") + injectRobot.MachineStatusFunc = func(context.Context) (robot.MachineStatus, error) { + return robot.MachineStatus{State: robot.StateRunning}, nil + } var callCount int calledEnough := make(chan struct{}) @@ -1555,6 +1568,9 @@ func TestNewRobotClientRefresh(t *testing.T) { callCount++ return emptyResources } + injectRobot.MachineStatusFunc = func(context.Context) (robot.MachineStatus, error) { + return robot.MachineStatus{State: robot.StateRunning}, nil + } pb.RegisterRobotServiceServer(gServer, server.New(injectRobot)) @@ -1638,6 +1654,9 @@ func TestRemoteClientMatch(t *testing.T) { injectRobot1 := &inject.Robot{ ResourceNamesFunc: func() []resource.Name { return validResources }, ResourceRPCAPIsFunc: func() []resource.RPCAPI { return nil }, + MachineStatusFunc: func(ctx context.Context) (robot.MachineStatus, error) { + return robot.MachineStatus{State: robot.StateRunning}, nil + }, } // TODO(RSDK-882): will update this so that this is not necessary @@ -1688,6 +1707,9 @@ func TestRemoteClientDuplicate(t *testing.T) { injectRobot1 := &inject.Robot{ ResourceNamesFunc: func() []resource.Name { return validResources }, ResourceRPCAPIsFunc: func() []resource.RPCAPI { return nil }, + MachineStatusFunc: func(ctx context.Context) (robot.MachineStatus, error) { + return robot.MachineStatus{State: robot.StateRunning}, nil + }, } pb.RegisterRobotServiceServer(gServer1, server.New(injectRobot1)) @@ -1756,7 +1778,7 @@ func TestClientOperationIntercept(t *testing.T) { receivedOpID, err := operation.GetOrCreateFromMetadata(meta) test.That(t, err, test.ShouldBeNil) test.That(t, receivedOpID.String(), test.ShouldEqual, fakeOp.ID.String()) - return robot.MachineStatus{}, nil + return robot.MachineStatus{State: robot.StateRunning}, nil } resp, err := client.MachineStatus(ctx) @@ -1775,6 +1797,9 @@ func TestGetUnknownResource(t *testing.T) { injectRobot := &inject.Robot{ ResourceNamesFunc: func() []resource.Name { return []resource.Name{arm.Named("myArm")} }, ResourceRPCAPIsFunc: func() []resource.RPCAPI { return nil }, + MachineStatusFunc: func(ctx context.Context) (robot.MachineStatus, error) { + return robot.MachineStatus{State: robot.StateRunning}, nil + }, } // TODO(RSDK-882): will update this so that this is not necessary @@ -1821,15 +1846,15 @@ func TestLoggingInterceptor(t *testing.T) { MachineStatusFunc: func(ctx context.Context) (robot.MachineStatus, error) { // If there is no debug information with the context, return no revision if !logging.IsDebugMode(ctx) && logging.GetName(ctx) == "" { - return robot.MachineStatus{}, nil + return robot.MachineStatus{State: robot.StateRunning}, nil } // If there is debug information with `oliver` with the context, return a revision of `oliver` if logging.IsDebugMode(ctx) && logging.GetName(ctx) == "oliver" { - return robot.MachineStatus{Config: config.Revision{Revision: "oliver"}}, nil + return robot.MachineStatus{Config: config.Revision{Revision: "oliver"}, State: robot.StateRunning}, nil } - return robot.MachineStatus{}, errors.New("shouldn't happen") + return robot.MachineStatus{State: robot.StateRunning}, errors.New("shouldn't happen") }, } pb.RegisterRobotServiceServer(gServer, server.New(injectRobot)) @@ -1872,6 +1897,9 @@ func TestCloudMetadata(t *testing.T) { CloudMetadataFunc: func(ctx context.Context) (cloud.Metadata, error) { return injectCloudMD, nil }, + MachineStatusFunc: func(ctx context.Context) (robot.MachineStatus, error) { + return robot.MachineStatus{State: robot.StateRunning}, nil + }, } // TODO(RSDK-882): will update this so that this is not necessary injectRobot.FrameSystemConfigFunc = func(ctx context.Context) (*framesystem.Config, error) { @@ -1906,6 +1934,9 @@ func TestShutDown(t *testing.T) { shutdownCalled = true return nil }, + MachineStatusFunc: func(ctx context.Context) (robot.MachineStatus, error) { + return robot.MachineStatus{State: robot.StateRunning}, nil + }, } gServer := grpc.NewServer() @@ -1942,6 +1973,9 @@ func TestUnregisteredResourceByName(t *testing.T) { injectRobot := &inject.Robot{ ResourceNamesFunc: func() []resource.Name { return resourceList }, ResourceRPCAPIsFunc: func() []resource.RPCAPI { return nil }, + MachineStatusFunc: func(ctx context.Context) (robot.MachineStatus, error) { + return robot.MachineStatus{State: robot.StateRunning}, nil + }, } gServer := grpc.NewServer() @@ -2149,6 +2183,9 @@ func TestVersion(t *testing.T) { injectRobot := &inject.Robot{ ResourceNamesFunc: func() []resource.Name { return nil }, ResourceRPCAPIsFunc: func() []resource.RPCAPI { return nil }, + MachineStatusFunc: func(ctx context.Context) (robot.MachineStatus, error) { + return robot.MachineStatus{State: robot.StateRunning}, nil + }, } pb.RegisterRobotServiceServer(gServer, server.New(injectRobot)) From 4948b130b671714deef2e8352dea24ce4b6bfdd8 Mon Sep 17 00:00:00 2001 From: Benji Rewis Date: Mon, 16 Dec 2024 13:35:20 -0500 Subject: [PATCH 11/32] move out of initializing in setupLocalRobot --- robot/impl/utils.go | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/robot/impl/utils.go b/robot/impl/utils.go index f5031757bb8..25f24996881 100644 --- a/robot/impl/utils.go +++ b/robot/impl/utils.go @@ -36,6 +36,12 @@ func setupLocalRobot( test.That(t, ok, test.ShouldBeTrue) lRobot.reconfigureWorkers.Wait() }) + + // Manually move testing robot out of initializing mode. Normally this is + // done as part of web server entrypoint code. + lRobot, ok := r.(*localRobot) + test.That(t, ok, test.ShouldBeTrue) + lRobot.initializing.Store(false) return r } From d75639687a481b947db6a314a8f08eaf06e65916 Mon Sep 17 00:00:00 2001 From: Benji Rewis Date: Mon, 16 Dec 2024 14:35:30 -0500 Subject: [PATCH 12/32] add log lines for debugging 32 bit tests --- .../demos/complexmodule/moduletest/module_test.go | 2 ++ robot/client/client.go | 3 +++ 2 files changed, 5 insertions(+) diff --git a/examples/customresources/demos/complexmodule/moduletest/module_test.go b/examples/customresources/demos/complexmodule/moduletest/module_test.go index 1f510da8dde..7376a88fbc5 100644 --- a/examples/customresources/demos/complexmodule/moduletest/module_test.go +++ b/examples/customresources/demos/complexmodule/moduletest/module_test.go @@ -296,6 +296,7 @@ func TestComplexModule(t *testing.T) { } func connect(port int, logger logging.Logger) (robot.Robot, error) { + logger.Info("BENJI-DBG Calling complex module connect") connectCtx, cancelConn := context.WithTimeout(context.Background(), time.Second*30) defer cancelConn() for { @@ -306,6 +307,7 @@ func connect(port int, logger logging.Logger) (robot.Robot, error) { ) dialCancel() if !errors.Is(err, context.DeadlineExceeded) { + logger.Info("BENJI-DBG I successfully created a client in complex module connect") return rc, err } select { diff --git a/robot/client/client.go b/robot/client/client.go index 292f71edea8..f4fea9d461f 100644 --- a/robot/client/client.go +++ b/robot/client/client.go @@ -341,6 +341,7 @@ func New(ctx context.Context, address string, clientLogger logging.ZapCompatible // It is expected that golang SDK users will handle lack of resource // availability due to the machine being in an initializing state themselves. if testing.Testing() { + logger.Info("BENJI-DBG I AM checking for a machine status of Running due to presence of testing flag") for { if ctx.Err() != nil { return nil, multierr.Combine(ctx.Err(), rc.conn.Close()) @@ -356,6 +357,8 @@ func New(ctx context.Context, address string, clientLogger logging.ZapCompatible } time.Sleep(50 * time.Millisecond) } + } else { + logger.Info("BENJI-DBG I am NOT checking for a machine status of Running due to lack of testing flag") } return rc, nil From fcf03b5aa6401b2e7515dda7f6409858d184649b Mon Sep 17 00:00:00 2001 From: Benji Rewis Date: Mon, 16 Dec 2024 15:09:00 -0500 Subject: [PATCH 13/32] move running check higher --- .../complexmodule/moduletest/module_test.go | 2 - robot/client/client.go | 53 +++++++++---------- 2 files changed, 25 insertions(+), 30 deletions(-) diff --git a/examples/customresources/demos/complexmodule/moduletest/module_test.go b/examples/customresources/demos/complexmodule/moduletest/module_test.go index 7376a88fbc5..1f510da8dde 100644 --- a/examples/customresources/demos/complexmodule/moduletest/module_test.go +++ b/examples/customresources/demos/complexmodule/moduletest/module_test.go @@ -296,7 +296,6 @@ func TestComplexModule(t *testing.T) { } func connect(port int, logger logging.Logger) (robot.Robot, error) { - logger.Info("BENJI-DBG Calling complex module connect") connectCtx, cancelConn := context.WithTimeout(context.Background(), time.Second*30) defer cancelConn() for { @@ -307,7 +306,6 @@ func connect(port int, logger logging.Logger) (robot.Robot, error) { ) dialCancel() if !errors.Is(err, context.DeadlineExceeded) { - logger.Info("BENJI-DBG I successfully created a client in complex module connect") return rc, err } select { diff --git a/robot/client/client.go b/robot/client/client.go index f4fea9d461f..d9cdc8b0f43 100644 --- a/robot/client/client.go +++ b/robot/client/client.go @@ -288,6 +288,31 @@ func New(ctx context.Context, address string, clientLogger logging.ZapCompatible return nil, err } + // If running in a testing environment, wait for machine to report a state of + // running. We often establish connections in tests and expected resources to + // be immediately available once the web service has started; resources will + // not be available when the machine is still initializing. + // + // It is expected that golang SDK users will handle lack of resource + // availability due to the machine being in an initializing state themselves. + if testing.Testing() { + for { + if ctx.Err() != nil { + return nil, multierr.Combine(ctx.Err(), rc.conn.Close()) + } + + mStatus, err := rc.MachineStatus(ctx) + if err != nil { + return nil, multierr.Combine(err, rc.conn.Close()) + } + + if mStatus.State == robot.StateRunning { + break + } + time.Sleep(50 * time.Millisecond) + } + } + // refresh once to hydrate the robot. if err := rc.Refresh(ctx); err != nil { return nil, multierr.Combine(err, rc.conn.Close()) @@ -333,34 +358,6 @@ func New(ctx context.Context, address string, clientLogger logging.ZapCompatible }, rc.activeBackgroundWorkers.Done) } - // If running in a testing environment, wait for machine to report a state of - // running. We often establish connections in tests and expected resources to - // be immediately available once the web service has started; resources will - // not be available when the machine is still initializing. - // - // It is expected that golang SDK users will handle lack of resource - // availability due to the machine being in an initializing state themselves. - if testing.Testing() { - logger.Info("BENJI-DBG I AM checking for a machine status of Running due to presence of testing flag") - for { - if ctx.Err() != nil { - return nil, multierr.Combine(ctx.Err(), rc.conn.Close()) - } - - mStatus, err := rc.MachineStatus(ctx) - if err != nil { - return nil, multierr.Combine(err, rc.conn.Close()) - } - - if mStatus.State == robot.StateRunning { - break - } - time.Sleep(50 * time.Millisecond) - } - } else { - logger.Info("BENJI-DBG I am NOT checking for a machine status of Running due to lack of testing flag") - } - return rc, nil } From 09b76fdce099e926ac0d628e94e3f65018450524 Mon Sep 17 00:00:00 2001 From: Benji Rewis Date: Wed, 18 Dec 2024 11:23:52 -0500 Subject: [PATCH 14/32] potentially simpler API --- robot/impl/local_robot.go | 22 ++++++++++------------ robot/impl/robot_options.go | 11 +++++++++++ robot/impl/utils.go | 6 ------ robot/robot.go | 3 +++ web/server/entrypoint.go | 7 +++++++ 5 files changed, 31 insertions(+), 18 deletions(-) diff --git a/robot/impl/local_robot.go b/robot/impl/local_robot.go index ed4abd99e2e..9d8fea19eec 100644 --- a/robot/impl/local_robot.go +++ b/robot/impl/local_robot.go @@ -92,8 +92,9 @@ type localRobot struct { // whether the robot is actively reconfiguring reconfiguring atomic.Bool - // whether the robot is still initializing (first reconfigure after initial - // construction has not yet completed.) + // whether the robot is still initializing; can be set with SetInitializing + // and defaults to false. this value controls what state will be returned by + // the MachineStatus endpoint (initializing if true, running if false.) initializing atomic.Bool } @@ -515,8 +516,6 @@ func newWithResources( } successful = true - // Robot is "initializing" until first reconfigure after initial creation completes. - r.initializing.Store(true) return r, nil } @@ -1103,14 +1102,6 @@ func dialRobotClient( // possibly leak resources. The given config may be modified by Reconfigure. func (r *localRobot) Reconfigure(ctx context.Context, newConfig *config.Config) { r.reconfigure(ctx, newConfig, false) - - // Robot is no longer initializing after a reconfigure completes. It does not - // matter if the call above resulted in errors, we only care that all - // initially specified components, servies, modules, remotes, and processes - // got a chance to attempt configuration. - // - // On initial construction, this value will get set back to true. - r.initializing.Store(false) } // set Module.LocalVersion on Type=local modules. Call this before localPackages.Sync and in RestartModule. @@ -1557,3 +1548,10 @@ func (r *localRobot) RestartAllowed() bool { } return false } + +// SetInitializing sets the initializing state of the robot. This method can be +// used after initial configuration to indicate that the robot should return a +// state of running from the MachineStatus endpoint. +func (r *localRobot) SetInitializing(initializing bool) { + r.initializing.Store(initializing) +} diff --git a/robot/impl/robot_options.go b/robot/impl/robot_options.go index c59aded407b..81391c285a2 100644 --- a/robot/impl/robot_options.go +++ b/robot/impl/robot_options.go @@ -19,10 +19,14 @@ type options struct { // shutdownCallback provides a callback for the robot to be able to shut itself down. shutdownCallback func() + // whether FTDC is enabled enableFTDC bool // disableCompleteConfigWorker starts the robot without the complete config worker - should only be used for tests. disableCompleteConfigWorker bool + + // whether robot should be in an initializing state + initializing bool } // Option configures how we set up the web service. @@ -92,3 +96,10 @@ func withDisableCompleteConfigWorker() Option { o.disableCompleteConfigWorker = true }) } + +// WithInitializing returns an Option which sets initializing to true. +func WithInitializing() Option { + return newFuncOption(func(o *options) { + o.initializing = true + }) +} diff --git a/robot/impl/utils.go b/robot/impl/utils.go index 25f24996881..f5031757bb8 100644 --- a/robot/impl/utils.go +++ b/robot/impl/utils.go @@ -36,12 +36,6 @@ func setupLocalRobot( test.That(t, ok, test.ShouldBeTrue) lRobot.reconfigureWorkers.Wait() }) - - // Manually move testing robot out of initializing mode. Normally this is - // done as part of web server entrypoint code. - lRobot, ok := r.(*localRobot) - test.That(t, ok, test.ShouldBeTrue) - lRobot.initializing.Store(false) return r } diff --git a/robot/robot.go b/robot/robot.go index e3a8c7538df..258a7fc5d05 100644 --- a/robot/robot.go +++ b/robot/robot.go @@ -186,6 +186,9 @@ type LocalRobot interface { // RestartAllowed returns whether the robot can safely be restarted. RestartAllowed() bool + + // SetInitializing sets the initializing state of the robot. + SetInitializing(initializing bool) } // A RemoteRobot is a Robot that was created through a connection. diff --git a/web/server/entrypoint.go b/web/server/entrypoint.go index f70b84307cb..1656d89db23 100644 --- a/web/server/entrypoint.go +++ b/web/server/entrypoint.go @@ -414,6 +414,8 @@ func (s *robotServer) serveWeb(ctx context.Context, cfg *config.Config) (err err minimalProcessedConfig.Modules = nil minimalProcessedConfig.Processes = nil + // Start robot in an initializing state with minimal config. + robotOptions = append(robotOptions, robotimpl.WithInitializing()) myRobot, err := robotimpl.New(ctx, minimalProcessedConfig, s.logger, robotOptions...) if err != nil { cancel() @@ -442,6 +444,11 @@ func (s *robotServer) serveWeb(ctx context.Context, cfg *config.Config) (err err // config changes. myRobot.Reconfigure(ctx, fullProcessedConfig) + // Once reconfigure with full processed config is complete; set initializing + // to false. Robot is now fully running and can indicate this through the + // MachineStatus endpoint. + myRobot.SetInitializing(false) + for { select { case <-ctx.Done(): From 35f8f3ddbdfbfbdef361939a48f44c7844243e68 Mon Sep 17 00:00:00 2001 From: Benji Rewis Date: Wed, 18 Dec 2024 11:34:45 -0500 Subject: [PATCH 15/32] more missing injections --- robot/client/client_test.go | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/robot/client/client_test.go b/robot/client/client_test.go index 236bfddc913..b43f86ed72d 100644 --- a/robot/client/client_test.go +++ b/robot/client/client_test.go @@ -108,6 +108,12 @@ func (ms *mockRPCSubtypesUnimplemented) ResourceNames( return ms.ResourceNamesFunc(req) } +func (ms *mockRPCSubtypesUnimplemented) GetMachineStatus( + ctx context.Context, req *pb.GetMachineStatusRequest, +) (*pb.GetMachineStatusResponse, error) { + return &pb.GetMachineStatusResponse{State: pb.GetMachineStatusResponse_STATE_RUNNING}, nil +} + type mockRPCSubtypesImplemented struct { mockRPCSubtypesUnimplemented ResourceNamesFunc func(*pb.ResourceNamesRequest) (*pb.ResourceNamesResponse, error) @@ -125,6 +131,12 @@ func (ms *mockRPCSubtypesImplemented) ResourceNames( return ms.ResourceNamesFunc(req) } +func (ms *mockRPCSubtypesImplemented) GetMachineStatus( + ctx context.Context, req *pb.GetMachineStatusRequest, +) (*pb.GetMachineStatusResponse, error) { + return &pb.GetMachineStatusResponse{State: pb.GetMachineStatusResponse_STATE_RUNNING}, nil +} + var resourceFunc1 = func(*pb.ResourceNamesRequest) (*pb.ResourceNamesResponse, error) { board1 := board.Named("board1") rNames := []*commonpb.ResourceName{ @@ -309,11 +321,17 @@ func TestStatusClient(t *testing.T) { FrameSystemConfigFunc: frameSystemConfigFunc, ResourceNamesFunc: resourcesFunc, ResourceRPCAPIsFunc: func() []resource.RPCAPI { return nil }, + MachineStatusFunc: func(_ context.Context) (robot.MachineStatus, error) { + return robot.MachineStatus{State: robot.StateRunning}, nil + }, } injectRobot2 := &inject.Robot{ FrameSystemConfigFunc: frameSystemConfigFunc, ResourceNamesFunc: resourcesFunc, ResourceRPCAPIsFunc: func() []resource.RPCAPI { return nil }, + MachineStatusFunc: func(_ context.Context) (robot.MachineStatus, error) { + return robot.MachineStatus{State: robot.StateRunning}, nil + }, } pb.RegisterRobotServiceServer(gServer1, server.New(injectRobot1)) pb.RegisterRobotServiceServer(gServer2, server.New(injectRobot2)) From e299ab4122f430f3108364c8f11c8ff9a31852f7 Mon Sep 17 00:00:00 2001 From: Benji Rewis Date: Wed, 18 Dec 2024 13:11:38 -0500 Subject: [PATCH 16/32] fix more tests sigh --- robot/client/client.go | 4 ++++ robot/client/client_test.go | 3 +++ 2 files changed, 7 insertions(+) diff --git a/robot/client/client.go b/robot/client/client.go index d9cdc8b0f43..f32e39c9fce 100644 --- a/robot/client/client.go +++ b/robot/client/client.go @@ -303,6 +303,10 @@ func New(ctx context.Context, address string, clientLogger logging.ZapCompatible mStatus, err := rc.MachineStatus(ctx) if err != nil { + // Allow for MachineStatus to not be injected/implemented in some tests. + if status.Code(err) == codes.Unimplemented { + break + } return nil, multierr.Combine(err, rc.conn.Close()) } diff --git a/robot/client/client_test.go b/robot/client/client_test.go index b43f86ed72d..fbaedad90a7 100644 --- a/robot/client/client_test.go +++ b/robot/client/client_test.go @@ -1258,6 +1258,9 @@ func TestClientResources(t *testing.T) { injectRobot.ResourceRPCAPIsFunc = func() []resource.RPCAPI { return respWith } injectRobot.ResourceNamesFunc = func() []resource.Name { return finalResources } + injectRobot.MachineStatusFunc = func(_ context.Context) (robot.MachineStatus, error) { + return robot.MachineStatus{State: robot.StateRunning}, nil + } gServer := grpc.NewServer() pb.RegisterRobotServiceServer(gServer, server.New(injectRobot)) From 19f7940d436df5fc67110e0fb9d49a2f68b43f59 Mon Sep 17 00:00:00 2001 From: Benji Rewis Date: Wed, 18 Dec 2024 15:34:01 -0500 Subject: [PATCH 17/32] maybe this time --- robot/client/client_session_test.go | 35 +++++++++++++++--------- robot/client/client_test.go | 41 +++++++++++++++-------------- 2 files changed, 43 insertions(+), 33 deletions(-) diff --git a/robot/client/client_session_test.go b/robot/client/client_session_test.go index 8475f2201da..abae64269df 100644 --- a/robot/client/client_session_test.go +++ b/robot/client/client_session_test.go @@ -107,8 +107,11 @@ func TestClientSessionOptions(t *testing.T) { return &dummyEcho{Named: arbName.AsNamed()}, nil }, ResourceRPCAPIsFunc: func() []resource.RPCAPI { return nil }, - LoggerFunc: func() logging.Logger { return logger }, - SessMgr: sessMgr, + MachineStatusFunc: func(_ context.Context) (robot.MachineStatus, error) { + return robot.MachineStatus{State: robot.StateRunning}, nil + }, + LoggerFunc: func() logging.Logger { return logger }, + SessMgr: sessMgr, } svc := web.New(injectRobot, logger) @@ -129,13 +132,11 @@ func TestClientSessionOptions(t *testing.T) { Disable: true, }))) } - roboClient, err := client.New(ctx, addr, logger, opts...) - test.That(t, err, test.ShouldBeNil) injectRobot.Mu.Lock() injectRobot.MachineStatusFunc = func(ctx context.Context) (robot.MachineStatus, error) { session.SafetyMonitorResourceName(ctx, someTargetName1) - return robot.MachineStatus{}, nil + return robot.MachineStatus{State: robot.StateRunning}, nil } injectRobot.Mu.Unlock() @@ -179,6 +180,8 @@ func TestClientSessionOptions(t *testing.T) { } sessMgr.mu.Unlock() + roboClient, err := client.New(ctx, addr, logger, opts...) + test.That(t, err, test.ShouldBeNil) resp, err := roboClient.MachineStatus(nextCtx) test.That(t, err, test.ShouldBeNil) test.That(t, resp, test.ShouldNotBeNil) @@ -289,6 +292,9 @@ func TestClientSessionExpiration(t *testing.T) { ResourceByNameFunc: func(name resource.Name) (resource.Resource, error) { return &dummyEcho1, nil }, + MachineStatusFunc: func(_ context.Context) (robot.MachineStatus, error) { + return robot.MachineStatus{State: robot.StateRunning}, nil + }, ResourceRPCAPIsFunc: func() []resource.RPCAPI { return nil }, LoggerFunc: func() logging.Logger { return logger }, SessMgr: sessMgr, @@ -306,8 +312,6 @@ func TestClientSessionExpiration(t *testing.T) { Disable: true, }))) } - roboClient, err := client.New(ctx, addr, logger, opts...) - test.That(t, err, test.ShouldBeNil) injectRobot.Mu.Lock() var capSessID uuid.UUID @@ -317,7 +321,7 @@ func TestClientSessionExpiration(t *testing.T) { panic("expected session") } capSessID = sess.ID() - return robot.MachineStatus{}, nil + return robot.MachineStatus{State: robot.StateRunning}, nil } injectRobot.Mu.Unlock() @@ -372,6 +376,8 @@ func TestClientSessionExpiration(t *testing.T) { } sessMgr.mu.Unlock() + roboClient, err := client.New(ctx, addr, logger, opts...) + test.That(t, err, test.ShouldBeNil) resp, err := roboClient.MachineStatus(nextCtx) test.That(t, err, test.ShouldBeNil) test.That(t, resp, test.ShouldNotBeNil) @@ -480,8 +486,11 @@ func TestClientSessionResume(t *testing.T) { injectRobot := &inject.Robot{ ResourceNamesFunc: func() []resource.Name { return []resource.Name{} }, ResourceRPCAPIsFunc: func() []resource.RPCAPI { return nil }, - LoggerFunc: func() logging.Logger { return logger }, - SessMgr: sessMgr, + MachineStatusFunc: func(_ context.Context) (robot.MachineStatus, error) { + return robot.MachineStatus{State: robot.StateRunning}, nil + }, + LoggerFunc: func() logging.Logger { return logger }, + SessMgr: sessMgr, } svc := web.New(injectRobot, logger) @@ -496,8 +505,6 @@ func TestClientSessionResume(t *testing.T) { Disable: true, }))) } - roboClient, err := client.New(ctx, addr, logger, opts...) - test.That(t, err, test.ShouldBeNil) var capMu sync.Mutex var startCalled int @@ -536,10 +543,12 @@ func TestClientSessionResume(t *testing.T) { panic("expected session") } capSessID = sess.ID() - return robot.MachineStatus{}, nil + return robot.MachineStatus{State: robot.StateRunning}, nil } injectRobot.Mu.Unlock() + roboClient, err := client.New(ctx, addr, logger, opts...) + test.That(t, err, test.ShouldBeNil) resp, err := roboClient.MachineStatus(nextCtx) test.That(t, err, test.ShouldBeNil) test.That(t, resp, test.ShouldNotBeNil) diff --git a/robot/client/client_test.go b/robot/client/client_test.go index fbaedad90a7..241ac58fece 100644 --- a/robot/client/client_test.go +++ b/robot/client/client_test.go @@ -1317,6 +1317,9 @@ func TestClientDiscovery(t *testing.T) { injectRobot.ResourceNamesFunc = func() []resource.Name { return finalResources } + injectRobot.MachineStatusFunc = func(_ context.Context) (robot.MachineStatus, error) { + return robot.MachineStatus{State: robot.StateRunning}, nil + } q := resource.DiscoveryQuery{ API: movementsensor.Named("foo").API, Model: resource.DefaultModelFamily.WithModel("bar"), @@ -1393,10 +1396,17 @@ func TestClientConfig(t *testing.T) { workingRobot := &inject.Robot{ ResourceNamesFunc: resourcesFunc, ResourceRPCAPIsFunc: func() []resource.RPCAPI { return nil }, + MachineStatusFunc: func(_ context.Context) (robot.MachineStatus, error) { + return robot.MachineStatus{State: robot.StateRunning}, nil + }, } + failingRobot := &inject.Robot{ ResourceNamesFunc: resourcesFunc, ResourceRPCAPIsFunc: func() []resource.RPCAPI { return nil }, + MachineStatusFunc: func(_ context.Context) (robot.MachineStatus, error) { + return robot.MachineStatus{State: robot.StateRunning}, nil + }, } o1 := &spatialmath.R4AA{Theta: math.Pi / 2, RZ: 1} @@ -1540,6 +1550,9 @@ func TestForeignResource(t *testing.T) { injectRobot.ResourceRPCAPIsFunc = func() []resource.RPCAPI { return respWith } injectRobot.ResourceNamesFunc = func() []resource.Name { return respWithResources } + injectRobot.MachineStatusFunc = func(_ context.Context) (robot.MachineStatus, error) { + return robot.MachineStatus{State: robot.StateRunning}, nil + } // TODO(RSDK-882): will update this so that this is not necessary injectRobot.FrameSystemConfigFunc = func(ctx context.Context) (*framesystem.Config, error) { return &framesystem.Config{}, nil @@ -1645,6 +1658,9 @@ func TestClientStopAll(t *testing.T) { injectRobot1 := &inject.Robot{ ResourceNamesFunc: resourcesFunc, ResourceRPCAPIsFunc: func() []resource.RPCAPI { return nil }, + MachineStatusFunc: func(_ context.Context) (robot.MachineStatus, error) { + return robot.MachineStatus{State: robot.StateRunning}, nil + }, StopAllFunc: func(ctx context.Context, extra map[resource.Name]map[string]interface{}) error { stopAllCalled = true return nil @@ -1774,6 +1790,9 @@ func TestClientOperationIntercept(t *testing.T) { injectRobot := &inject.Robot{ ResourceNamesFunc: func() []resource.Name { return []resource.Name{} }, ResourceRPCAPIsFunc: func() []resource.RPCAPI { return nil }, + MachineStatusFunc: func(_ context.Context) (robot.MachineStatus, error) { + return robot.MachineStatus{State: robot.StateRunning}, nil + }, } gServer := grpc.NewServer() @@ -2048,7 +2067,7 @@ func TestMachineStatus(t *testing.T) { }, State: robot.StateRunning, }, - 1, + 2, // once for client.New call and once for MachineStatus call }, { "resource with valid status", @@ -2094,7 +2113,7 @@ func TestMachineStatus(t *testing.T) { }, State: robot.StateRunning, }, - 2, + 4, // twice for client.New call and twice for MachineStatus call }, { "unhealthy status", @@ -2136,24 +2155,6 @@ func TestMachineStatus(t *testing.T) { }, 0, }, - { - "initializing machine state", - robot.MachineStatus{ - Config: config.Revision{Revision: "rev1"}, - Resources: []resource.Status{}, - State: robot.StateInitializing, - }, - 0, - }, - { - "unknown machine state", - robot.MachineStatus{ - Config: config.Revision{Revision: "rev1"}, - Resources: []resource.Status{}, - State: robot.StateUnknown, - }, - 0, - }, } { t.Run(tc.name, func(t *testing.T) { logger, logs := logging.NewObservedTestLogger(t) From d68e6f404d191756865aaa906697198019d7b4b5 Mon Sep 17 00:00:00 2001 From: Benji Rewis Date: Wed, 18 Dec 2024 16:28:13 -0500 Subject: [PATCH 18/32] actually use value from options --- robot/impl/local_robot.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/robot/impl/local_robot.go b/robot/impl/local_robot.go index 9d8fea19eec..e7cc7f84160 100644 --- a/robot/impl/local_robot.go +++ b/robot/impl/local_robot.go @@ -393,6 +393,10 @@ func newWithResources( localModuleVersions: make(map[string]semver.Version), ftdc: ftdcWorker, } + + // Use value of initializing from robot options. + r.initializing.Store(rOpts.initializing) + r.mostRecentCfg.Store(config.Config{}) var heartbeatWindow time.Duration if cfg.Network.Sessions.HeartbeatWindow == 0 { From 2d8bc1b3df61e067b9a78a25abbba263bc7eabac Mon Sep 17 00:00:00 2001 From: Benji Rewis Date: Thu, 19 Dec 2024 13:05:04 -0500 Subject: [PATCH 19/32] make configWatcher a method instead of an anonymous function --- web/server/entrypoint.go | 187 +++++++++++++++++++++------------------ 1 file changed, 102 insertions(+), 85 deletions(-) diff --git a/web/server/entrypoint.go b/web/server/entrypoint.go index 1656d89db23..69de5cc8ed8 100644 --- a/web/server/entrypoint.go +++ b/web/server/entrypoint.go @@ -25,6 +25,7 @@ import ( "go.viam.com/rdk/config" "go.viam.com/rdk/logging" "go.viam.com/rdk/resource" + "go.viam.com/rdk/robot" robotimpl "go.viam.com/rdk/robot/impl" "go.viam.com/rdk/robot/web" weboptions "go.viam.com/rdk/robot/web/options" @@ -249,6 +250,92 @@ func (s *robotServer) createWebOptions(cfg *config.Config) (weboptions.Options, return options, nil } +// A wrapper around actual config processing that also applies options from the +// robot server. +func (s *robotServer) processConfig(in *config.Config) (*config.Config, error) { + out, err := config.ProcessConfig(in) + if err != nil { + return nil, err + } + out.Debug = s.args.Debug || in.Debug + out.EnableWebProfile = s.args.WebProfile || in.EnableWebProfile + out.FromCommand = true + out.AllowInsecureCreds = s.args.AllowInsecureCreds + out.UntrustedEnv = s.args.UntrustedEnv + out.PackagePath = path.Join(viamDotDir, "packages") + return out, nil +} + +// A function to be started as a goroutine that watches for changes, either +// from disk or from cloud, to the robot's config. Starts comparisons based on +// `currCfg`. Reconfigures the robot when config changes are received from the +// watcher. +func (s *robotServer) configWatcher(ctx context.Context, currCfg *config.Config, r robot.LocalRobot, + watcher config.Watcher, +) { + // Reconfigure robot to have passed-in config before listening for any config + // changes. + r.Reconfigure(ctx, currCfg) + + // Once reconfigure with initial config is complete; set initializing to + // false. Robot is now fully running and can indicate this through the + // MachineStatus endpoint. + r.SetInitializing(false) + + for { + select { + case <-ctx.Done(): + return + default: + } + select { + case <-ctx.Done(): + return + case cfg := <-watcher.Config(): + processedConfig, err := s.processConfig(cfg) + if err != nil { + s.logger.Errorw("reconfiguration aborted: error processing config", "error", err) + continue + } + + // flag to restart web service if necessary + diff, err := config.DiffConfigs(*currCfg, *processedConfig, s.args.RevealSensitiveConfigDiffs) + if err != nil { + s.logger.Errorw("reconfiguration aborted: error diffing config", "error", err) + continue + } + var options weboptions.Options + + if !diff.NetworkEqual { + // TODO(RSDK-2694): use internal web service reconfiguration instead + r.StopWeb() + options, err = s.createWebOptions(processedConfig) + if err != nil { + s.logger.Errorw("reconfiguration aborted: error creating weboptions", "error", err) + continue + } + } + + // Update logger registry if log patterns may have changed. + // + // This functionality is tested in `TestLogPropagation` in `local_robot_test.go`. + if !diff.LogEqual { + s.logger.Debug("Detected potential changes to log patterns; updating logger levels") + config.UpdateLoggerRegistryFromConfig(s.registry, processedConfig, s.logger) + } + + r.Reconfigure(ctx, processedConfig) + + if !diff.NetworkEqual { + if err := r.StartWeb(ctx, options); err != nil { + s.logger.Errorw("reconfiguration failed: error starting web service while reconfiguring", "error", err) + } + } + currCfg = processedConfig + } + } +} + func (s *robotServer) serveWeb(ctx context.Context, cfg *config.Config) (err error) { ctx, cancel := context.WithCancel(ctx) @@ -324,21 +411,7 @@ func (s *robotServer) serveWeb(ctx context.Context, cfg *config.Config) (err err defer cancel() ctx = rpc.ContextWithDialer(ctx, rpcDialer) - processConfig := func(in *config.Config) (*config.Config, error) { - out, err := config.ProcessConfig(in) - if err != nil { - return nil, err - } - out.Debug = s.args.Debug || in.Debug - out.EnableWebProfile = s.args.WebProfile || in.EnableWebProfile - out.FromCommand = true - out.AllowInsecureCreds = s.args.AllowInsecureCreds - out.UntrustedEnv = s.args.UntrustedEnv - out.PackagePath = path.Join(viamDotDir, "packages") - return out, nil - } - - fullProcessedConfig, err := processConfig(cfg) + fullProcessedConfig, err := s.processConfig(cfg) if err != nil { return err } @@ -434,80 +507,24 @@ func (s *robotServer) serveWeb(ctx context.Context, cfg *config.Config) (err err defer func() { err = multierr.Combine(err, watcher.Close()) }() - onWatchDone := make(chan struct{}) - // Use `fullProcessedConfig` as the initial `oldCfg` for the config watcher - // goroutine, as we want incoming config changes to be compared to the full - // config. - oldCfg := fullProcessedConfig - utils.ManagedGo(func() { - // Reconfigure robot to have full processed config before listening for any - // config changes. - myRobot.Reconfigure(ctx, fullProcessedConfig) - - // Once reconfigure with full processed config is complete; set initializing - // to false. Robot is now fully running and can indicate this through the - // MachineStatus endpoint. - myRobot.SetInitializing(false) - - for { - select { - case <-ctx.Done(): - return - default: - } - select { - case <-ctx.Done(): - return - case cfg := <-watcher.Config(): - processedConfig, err := processConfig(cfg) - if err != nil { - s.logger.Errorw("reconfiguration aborted: error processing config", "error", err) - continue - } - - // flag to restart web service if necessary - diff, err := config.DiffConfigs(*oldCfg, *processedConfig, s.args.RevealSensitiveConfigDiffs) - if err != nil { - s.logger.Errorw("reconfiguration aborted: error diffing config", "error", err) - continue - } - var options weboptions.Options - - if !diff.NetworkEqual { - // TODO(RSDK-2694): use internal web service reconfiguration instead - myRobot.StopWeb() - options, err = s.createWebOptions(processedConfig) - if err != nil { - s.logger.Errorw("reconfiguration aborted: error creating weboptions", "error", err) - continue - } - } - // Update logger registry if log patterns may have changed. - // - // This functionality is tested in `TestLogPropagation` in `local_robot_test.go`. - if !diff.LogEqual { - s.logger.Debug("Detected potential changes to log patterns; updating logger levels") - config.UpdateLoggerRegistryFromConfig(s.registry, processedConfig, s.logger) - } - - myRobot.Reconfigure(ctx, processedConfig) - - if !diff.NetworkEqual { - if err := myRobot.StartWeb(ctx, options); err != nil { - s.logger.Errorw("reconfiguration failed: error starting web service while reconfiguring", "error", err) - } - } - oldCfg = processedConfig - } - } - }, func() { - close(onWatchDone) - }) + onWatchDone := make(chan struct{}) + go func() { + defer func() { + close(onWatchDone) + }() + + // Use `fullProcessedConfig` as the initial config for the config watcher + // goroutine, as we want incoming config changes to be compared to the full + // config. + s.configWatcher(ctx, fullProcessedConfig, myRobot, watcher) + }() + // At end of this function, cancel context and wait for watcher goroutine + // to complete. defer func() { + cancel() <-onWatchDone }() - defer cancel() // Create initial web options with `minimalProcessedConfig`. options, err := s.createWebOptions(minimalProcessedConfig) From f997bbbfafd8fab36e61439cfef69c4c3fa77a56 Mon Sep 17 00:00:00 2001 From: Benji Rewis Date: Thu, 19 Dec 2024 13:09:20 -0500 Subject: [PATCH 20/32] put back lints --- components/movementsensor/replay/replay_utils_test.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/components/movementsensor/replay/replay_utils_test.go b/components/movementsensor/replay/replay_utils_test.go index ee35c74dd0d..73ccee66397 100644 --- a/components/movementsensor/replay/replay_utils_test.go +++ b/components/movementsensor/replay/replay_utils_test.go @@ -53,6 +53,7 @@ func (mDServer *mockDataServiceServer) TabularDataByFilter(ctx context.Context, last := req.DataRequest.GetLast() limit := req.DataRequest.GetLimit() + //nolint:deprecated,staticcheck var dataset []*datapb.TabularData var dataIndex int var err error @@ -79,6 +80,7 @@ func (mDServer *mockDataServiceServer) TabularDataByFilter(ctx context.Context, last = fmt.Sprint(dataIndex) + //nolint:deprecated,staticcheck tabularData := &datapb.TabularData{ Data: data, TimeRequested: timeReq, From f97feefb6d9ebad3ba502b2eca95500953521b27 Mon Sep 17 00:00:00 2001 From: Benji Rewis Date: Sat, 21 Dec 2024 21:24:29 -0500 Subject: [PATCH 21/32] try to simplify updateWeakDependents check in completeConfig --- robot/impl/local_robot.go | 14 -------------- robot/impl/resource_manager.go | 21 ++++----------------- 2 files changed, 4 insertions(+), 31 deletions(-) diff --git a/robot/impl/local_robot.go b/robot/impl/local_robot.go index e7cc7f84160..2251ffd70a6 100644 --- a/robot/impl/local_robot.go +++ b/robot/impl/local_robot.go @@ -546,20 +546,6 @@ func (r *localRobot) removeOrphanedResources(ctx context.Context, r.updateWeakDependents(ctx) } -// resourceHasWeakDependencies will return whether a given resource has weak dependencies. -// Internal services that depend on other resources are also included in the check. -func (r *localRobot) resourceHasWeakDependencies(rName resource.Name, node *resource.GraphNode) bool { - if len(r.getWeakDependencyMatchers(node.Config().API, node.Config().Model)) > 0 { - return true - } - - // also return true for internal services that depends on other resources (web, framesystem). - if rName == web.InternalServiceName || rName == framesystem.InternalServiceName { - return true - } - return false -} - // getDependencies derives a collection of dependencies from a robot for a given // component's name. We don't use the resource manager for this information since // it is not be constructed at this point. diff --git a/robot/impl/resource_manager.go b/robot/impl/resource_manager.go index 902679c5521..b7f0f11e784 100644 --- a/robot/impl/resource_manager.go +++ b/robot/impl/resource_manager.go @@ -632,11 +632,8 @@ func (manager *resourceManager) completeConfig( levels := manager.resources.ReverseTopologicalSortInLevels() timeout := rutils.GetResourceConfigurationTimeout(manager.logger) for _, resourceNames := range levels { - // At the start of every reconfiguration level, check if updateWeakDependents should be run. - // Both conditions below should be met for `updateWeakDependents` to be called: - // - At least one resource that needs to reconfigure in this level - // depends on at least one resource with weak dependencies (weak dependents) - // - The logical clock is higher than the `lastWeakDependentsRound` value + // At the start of every reconfiguration level, check if updateWeakDependents should be run + // by checking if the logical clock is higher than the `lastWeakDependentsRound` value. // // This will make sure that weak dependents are updated before they are passed into constructors // or reconfigure methods. @@ -644,7 +641,6 @@ func (manager *resourceManager) completeConfig( // Resources that depend on weak dependents should expect that the weak dependents pass into the // constructor or reconfigure method will only have been reconfigured with all resources constructed // before their level. - var weakDependentsUpdated bool for _, resName := range resourceNames { select { case <-ctx.Done(): @@ -659,17 +655,8 @@ func (manager *resourceManager) completeConfig( continue } - for _, dep := range manager.resources.GetAllParentsOf(resName) { - if node, ok := manager.resources.Node(dep); ok { - if lr.resourceHasWeakDependencies(dep, node) && lr.lastWeakDependentsRound.Load() < manager.resources.CurrLogicalClockValue() { - lr.updateWeakDependents(ctx) - weakDependentsUpdated = true - break - } - } - } - if weakDependentsUpdated { - break + if lr.lastWeakDependentsRound.Load() < manager.resources.CurrLogicalClockValue() { + lr.updateWeakDependents(ctx) } } // we use an errgroup here instead of a normal waitgroup to conveniently bubble From 72bada67f01ebca40fc8628efbbfb26cc73ed30d Mon Sep 17 00:00:00 2001 From: Benji Rewis Date: Thu, 2 Jan 2025 16:06:19 -0500 Subject: [PATCH 22/32] avoid CopyOnlyPublicFields --- web/server/entrypoint.go | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/web/server/entrypoint.go b/web/server/entrypoint.go index 69de5cc8ed8..535c5b4fc0f 100644 --- a/web/server/entrypoint.go +++ b/web/server/entrypoint.go @@ -477,10 +477,7 @@ func (s *robotServer) serveWeb(ctx context.Context, cfg *config.Config) (err err // and immediately start web service. We need the machine to be reachable // through the web service ASAP, even if some resources take a long time to // initially configure. - minimalProcessedConfig, err := fullProcessedConfig.CopyOnlyPublicFields() - if err != nil { - return err - } + minimalProcessedConfig := *fullProcessedConfig minimalProcessedConfig.Components = nil minimalProcessedConfig.Services = nil minimalProcessedConfig.Remotes = nil @@ -489,7 +486,7 @@ func (s *robotServer) serveWeb(ctx context.Context, cfg *config.Config) (err err // Start robot in an initializing state with minimal config. robotOptions = append(robotOptions, robotimpl.WithInitializing()) - myRobot, err := robotimpl.New(ctx, minimalProcessedConfig, s.logger, robotOptions...) + myRobot, err := robotimpl.New(ctx, &minimalProcessedConfig, s.logger, robotOptions...) if err != nil { cancel() return err @@ -527,7 +524,7 @@ func (s *robotServer) serveWeb(ctx context.Context, cfg *config.Config) (err err }() // Create initial web options with `minimalProcessedConfig`. - options, err := s.createWebOptions(minimalProcessedConfig) + options, err := s.createWebOptions(&minimalProcessedConfig) if err != nil { return err } From fc67364f7b4d927e02cde9489a2a46c8e888f23d Mon Sep 17 00:00:00 2001 From: Benji Rewis Date: Thu, 2 Jan 2025 18:02:09 -0500 Subject: [PATCH 23/32] initializing -> running test --- robot/client/client.go | 23 ++++++- web/server/entrypoint_test.go | 125 ++++++++++++++++++++++++---------- 2 files changed, 110 insertions(+), 38 deletions(-) diff --git a/robot/client/client.go b/robot/client/client.go index f32e39c9fce..10b52924603 100644 --- a/robot/client/client.go +++ b/robot/client/client.go @@ -6,6 +6,7 @@ import ( "errors" "fmt" "io" + "os" "strings" "sync" "sync/atomic" @@ -60,6 +61,12 @@ var ( // defaultResourcesTimeout is the default timeout for getting resources. defaultResourcesTimeout = 5 * time.Second + + // DoNotWaitForRunningEnvVar is the name of an environment variable to set + // only in tests to allow connecting to still-initializing machines. Note + // that robot clients in production (not in a testing environment) will + // already allow connecting to still-initializing machines. + DoNotWaitForRunningEnvVar = "VIAM_CLIENT_DO_NOT_WAIT_FOR_RUNNING" ) // RobotClient satisfies the robot.Robot interface through a gRPC based @@ -289,13 +296,17 @@ func New(ctx context.Context, address string, clientLogger logging.ZapCompatible } // If running in a testing environment, wait for machine to report a state of - // running. We often establish connections in tests and expected resources to + // running. We often establish connections in tests and expect resources to // be immediately available once the web service has started; resources will // not be available when the machine is still initializing. // // It is expected that golang SDK users will handle lack of resource // availability due to the machine being in an initializing state themselves. - if testing.Testing() { + // + // Allow this behavior to be turned off in some tests that specifically want + // to examine the behavior of a machine in an initializing state through the + // use of an environment variable. + if testing.Testing() && os.Getenv(DoNotWaitForRunningEnvVar) == "" { for { if ctx.Err() != nil { return nil, multierr.Combine(ctx.Err(), rc.conn.Close()) @@ -417,7 +428,15 @@ func (rc *RobotClient) connectWithLock(ctx context.Context) error { dialOptionsWebRTCOnly[0] = rpc.WithDisableDirectGRPC() dialLogger := rc.logger.Sublogger("networking") + + // This dial seems to hang for 10 seconds, and it's not exactly clear why. + // Doesn't seem to have to do with reconfiguring, just some inner timeout + // probably related to checking for something? But what? + + dialLogger.Info("DBG-BENJI Start of gRPC dial") conn, err := grpc.Dial(ctx, rc.address, dialLogger, dialOptionsWebRTCOnly...) + dialLogger.Info("DBG-BENJI End of gRPC dial") + if err == nil { // If we succeed with a webrtc connection, flip the `serverIsWebrtcEnabled` to force all future // connections to use webrtc. diff --git a/web/server/entrypoint_test.go b/web/server/entrypoint_test.go index 945e41e36c6..8cbe30b8478 100644 --- a/web/server/entrypoint_test.go +++ b/web/server/entrypoint_test.go @@ -10,6 +10,7 @@ import ( "path/filepath" "runtime" "strconv" + "sync" "testing" "time" @@ -23,15 +24,18 @@ import ( "google.golang.org/grpc/codes" "google.golang.org/grpc/status" + "go.viam.com/rdk/components/generic" _ "go.viam.com/rdk/components/register" "go.viam.com/rdk/config" "go.viam.com/rdk/logging" "go.viam.com/rdk/resource" "go.viam.com/rdk/robot" + "go.viam.com/rdk/robot/client" _ "go.viam.com/rdk/services/register" "go.viam.com/rdk/testutils" "go.viam.com/rdk/testutils/robottestutils" "go.viam.com/rdk/utils" + "go.viam.com/rdk/web/server" ) // numResources is the # of resources in /etc/configs/fake.json + the 2 @@ -196,51 +200,100 @@ func isExpectedShutdownError(err error, testLogger logging.Logger) bool { // Tests that machine state properly reports initializing or running. func TestMachineState(t *testing.T) { - if runtime.GOARCH == "arm" { - t.Skip("skipping on 32-bit ARM, subprocess build warnings cause failure") - } - - logger, logObserver := logging.NewObservedTestLogger(t) + logger := logging.NewTestLogger(t) + ctx, cancel := context.WithCancel(context.Background()) - cfgFilename := utils.ResolveFile("/etc/configs/fake.json") - cfg, err := config.Read(context.Background(), cfgFilename, logger) - test.That(t, err, test.ShouldBeNil) + machineAddress := "localhost:23654" - var port int - var success bool - var server pexec.ManagedProcess - for portTryNum := 0; portTryNum < 10; portTryNum++ { - p, err := goutils.TryReserveRandomPort() - port = p + // Register a slow-constructing generic resource and defer its + // deregistration. + type slow struct { + resource.Named + resource.AlwaysRebuild + resource.TriviallyCloseable + } + completeConstruction := make(chan struct{}, 1) + slowModel := resource.NewModel("slow", "to", "build") + resource.RegisterComponent(generic.API, slowModel, resource.Registration[resource.Resource, resource.NoNativeConfig]{ + Constructor: func( + ctx context.Context, + deps resource.Dependencies, + conf resource.Config, + logger logging.Logger, + ) (resource.Resource, error) { + // Wait for `completeConstruction` to close before returning from + // constructor. + <-completeConstruction + + return &slow{ + Named: conf.ResourceName().AsNamed(), + }, nil + }, + }) + defer func() { + resource.Deregister(generic.API, slowModel) + }() + + // Run entrypoint code (RunServer) in a goroutine, as it is blocking. + var wg sync.WaitGroup + wg.Add(1) + go func() { + defer wg.Done() + + // Create a temporary config file with a single + tempConfigFile, err := os.CreateTemp(t.TempDir(), "temp_config.json") test.That(t, err, test.ShouldBeNil) - cfg.Network.BindAddress = fmt.Sprintf(":%d", port) - cfgFilename, err = robottestutils.MakeTempConfig(t, cfg, logger) - test.That(t, err, test.ShouldBeNil) + cfg := &config.Config{ + Components: []resource.Config{ + { + Name: "slowpoke", + API: generic.API, + Model: slowModel, + }, + }, + Network: config.NetworkConfig{ + config.NetworkConfigData{ + BindAddress: machineAddress, + }, + }, + } - server = robottestutils.ServerAsSeparateProcess(t, cfgFilename, logger) - err = server.Start(context.Background()) + cfgBytes, err := json.Marshal(&cfg) test.That(t, err, test.ShouldBeNil) + test.That(t, os.WriteFile(tempConfigFile.Name(), cfgBytes, 0o755), test.ShouldBeNil) - if success = robottestutils.WaitForServing(logObserver, port); success { - defer func() { - test.That(t, server.Stop(), test.ShouldBeNil) - }() - break - } - logger.Infow("Port in use. Restarting on new port.", "port", port, "err", err) - server.Stop() - continue - } - test.That(t, success, test.ShouldBeTrue) + args := []string{"viam-server", "-config", tempConfigFile.Name()} + test.That(t, server.RunServer(ctx, args, logger), test.ShouldBeNil) + }() + + // Set value for `DoNotWaitForRunningEnvVar` to allow connecting to a + // still-initializing machine. + test.That(t, os.Setenv(client.DoNotWaitForRunningEnvVar, "true"), test.ShouldBeNil) + defer func() { + test.That(t, os.Unsetenv(client.DoNotWaitForRunningEnvVar), test.ShouldBeNil) + }() - addr := "localhost:" + strconv.Itoa(port) - rc := robottestutils.NewRobotClient(t, logger, addr, time.Second) + rc := robottestutils.NewRobotClient(t, logger, machineAddress, time.Second) - // NewRobotClient will wait for machine state to be running. Assert that this - // is still the case. - machineStatus, err := rc.MachineStatus(context.Background()) + // Assert that, from client's perspective, robot is in an initializing state + // until `slowpoke` completes construction. + machineStatus, err := rc.MachineStatus(ctx) test.That(t, err, test.ShouldBeNil) test.That(t, machineStatus, test.ShouldNotBeNil) - test.That(t, machineStatus.State, test.ShouldEqual, robot.StateRunning) + test.That(t, machineStatus.State, test.ShouldEqual, robot.StateInitializing) + + // Allow `slowpoke` to complete construction. + close(completeConstruction) + + gtestutils.WaitForAssertion(t, func(tb testing.TB) { + machineStatus, err := rc.MachineStatus(ctx) + test.That(tb, err, test.ShouldBeNil) + test.That(tb, machineStatus, test.ShouldNotBeNil) + test.That(tb, machineStatus.State, test.ShouldEqual, robot.StateRunning) + }) + + // Cancel context and wait for server goroutine to stop running. + cancel() + wg.Wait() } From 18555b3a0abebf6fedc06745474a8e14e680a811 Mon Sep 17 00:00:00 2001 From: Benji Rewis Date: Fri, 3 Jan 2025 10:24:41 -0500 Subject: [PATCH 24/32] move redefinition of context above slow shutdown goroutine --- web/server/entrypoint.go | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/web/server/entrypoint.go b/web/server/entrypoint.go index 535c5b4fc0f..9ed2cb8b690 100644 --- a/web/server/entrypoint.go +++ b/web/server/entrypoint.go @@ -348,6 +348,17 @@ func (s *robotServer) serveWeb(ctx context.Context, cfg *config.Config) (err err forceShutdown := make(chan struct{}) defer func() { <-forceShutdown }() + var cloudRestartCheckerActive chan struct{} + rpcDialer := rpc.NewCachedDialer() + defer func() { + if cloudRestartCheckerActive != nil { + <-cloudRestartCheckerActive + } + err = multierr.Combine(err, rpcDialer.Close()) + }() + defer cancel() + ctx = rpc.ContextWithDialer(ctx, rpcDialer) + utils.PanicCapturingGo(func() { defer close(forceShutdown) @@ -400,17 +411,6 @@ func (s *robotServer) serveWeb(ctx context.Context, cfg *config.Config) (err err <-slowWatcher }() - var cloudRestartCheckerActive chan struct{} - rpcDialer := rpc.NewCachedDialer() - defer func() { - if cloudRestartCheckerActive != nil { - <-cloudRestartCheckerActive - } - err = multierr.Combine(err, rpcDialer.Close()) - }() - defer cancel() - ctx = rpc.ContextWithDialer(ctx, rpcDialer) - fullProcessedConfig, err := s.processConfig(cfg) if err != nil { return err From e46aa9d85a4a12d5ccd9e43ca38e236fd50b5f21 Mon Sep 17 00:00:00 2001 From: Benji Rewis Date: Fri, 3 Jan 2025 11:22:36 -0500 Subject: [PATCH 25/32] remove debug logs --- robot/client/client.go | 8 -------- 1 file changed, 8 deletions(-) diff --git a/robot/client/client.go b/robot/client/client.go index 10b52924603..1c078ce5812 100644 --- a/robot/client/client.go +++ b/robot/client/client.go @@ -428,15 +428,7 @@ func (rc *RobotClient) connectWithLock(ctx context.Context) error { dialOptionsWebRTCOnly[0] = rpc.WithDisableDirectGRPC() dialLogger := rc.logger.Sublogger("networking") - - // This dial seems to hang for 10 seconds, and it's not exactly clear why. - // Doesn't seem to have to do with reconfiguring, just some inner timeout - // probably related to checking for something? But what? - - dialLogger.Info("DBG-BENJI Start of gRPC dial") conn, err := grpc.Dial(ctx, rc.address, dialLogger, dialOptionsWebRTCOnly...) - dialLogger.Info("DBG-BENJI End of gRPC dial") - if err == nil { // If we succeed with a webrtc connection, flip the `serverIsWebrtcEnabled` to force all future // connections to use webrtc. From 7f0fffdd3c0b21d2c05807a8205c8413015420f8 Mon Sep 17 00:00:00 2001 From: Benji Rewis Date: Fri, 3 Jan 2025 11:32:57 -0500 Subject: [PATCH 26/32] table drive and fix server GetMachineStatus test --- robot/server/server_test.go | 58 +++++++++++++++++++++---------------- 1 file changed, 33 insertions(+), 25 deletions(-) diff --git a/robot/server/server_test.go b/robot/server/server_test.go index 24838b461a5..326c223766a 100644 --- a/robot/server/server_test.go +++ b/robot/server/server_test.go @@ -161,6 +161,7 @@ func TestServer(t *testing.T) { CloudMetadata: cloud.Metadata{}, }, }, + State: robot.StateRunning, }, &pb.ConfigStatus{Revision: "rev1"}, []*pb.ResourceStatus{ @@ -171,6 +172,8 @@ func TestServer(t *testing.T) { CloudMetadata: &pb.GetCloudMetadataResponse{}, }, }, + pb.GetMachineStatusResponse_STATE_RUNNING, + 0, 0, }, { @@ -204,6 +207,7 @@ func TestServer(t *testing.T) { }, }, }, + State: robot.StateRunning, }, &pb.ConfigStatus{Revision: "rev1"}, []*pb.ResourceStatus{ @@ -233,6 +237,8 @@ func TestServer(t *testing.T) { ), }, }, + pb.GetMachineStatusResponse_STATE_RUNNING, + 0, 0, }, { @@ -340,34 +346,36 @@ func TestServer(t *testing.T) { 1, }, } { - logger, logs := logging.NewObservedTestLogger(t) - injectRobot := &inject.Robot{} - server := server.New(injectRobot) - req := pb.GetMachineStatusRequest{} - injectRobot.LoggerFunc = func() logging.Logger { - return logger - } - injectRobot.MachineStatusFunc = func(ctx context.Context) (robot.MachineStatus, error) { - return tc.injectMachineStatus, nil - } - resp, err := server.GetMachineStatus(context.Background(), &req) - test.That(t, err, test.ShouldBeNil) - test.That(t, resp.GetConfig().GetRevision(), test.ShouldEqual, tc.expConfig.Revision) - for i, res := range resp.GetResources() { - test.That(t, res.GetName(), test.ShouldResemble, tc.expResources[i].Name) - test.That(t, res.GetState(), test.ShouldResemble, tc.expResources[i].State) - test.That(t, res.GetRevision(), test.ShouldEqual, tc.expResources[i].Revision) - } + t.Run(tc.name, func(t *testing.T) { + logger, logs := logging.NewObservedTestLogger(t) + injectRobot := &inject.Robot{} + server := server.New(injectRobot) + req := pb.GetMachineStatusRequest{} + injectRobot.LoggerFunc = func() logging.Logger { + return logger + } + injectRobot.MachineStatusFunc = func(ctx context.Context) (robot.MachineStatus, error) { + return tc.injectMachineStatus, nil + } + resp, err := server.GetMachineStatus(context.Background(), &req) + test.That(t, err, test.ShouldBeNil) + test.That(t, resp.GetConfig().GetRevision(), test.ShouldEqual, tc.expConfig.Revision) + for i, res := range resp.GetResources() { + test.That(t, res.GetName(), test.ShouldResemble, tc.expResources[i].Name) + test.That(t, res.GetState(), test.ShouldResemble, tc.expResources[i].State) + test.That(t, res.GetRevision(), test.ShouldEqual, tc.expResources[i].Revision) + } - test.That(t, resp.GetState(), test.ShouldEqual, tc.expState) + test.That(t, resp.GetState(), test.ShouldEqual, tc.expState) - const badResourceStateMsg = "resource in an unknown state" - badResourceStateCount := logs.FilterLevelExact(zapcore.ErrorLevel).FilterMessageSnippet(badResourceStateMsg).Len() - test.That(t, badResourceStateCount, test.ShouldEqual, tc.expBadResourceStateCount) + const badResourceStateMsg = "resource in an unknown state" + badResourceStateCount := logs.FilterLevelExact(zapcore.ErrorLevel).FilterMessageSnippet(badResourceStateMsg).Len() + test.That(t, badResourceStateCount, test.ShouldEqual, tc.expBadResourceStateCount) - const badMachineStateMsg = "machine in an unknown state" - badMachineStateCount := logs.FilterLevelExact(zapcore.ErrorLevel).FilterMessageSnippet(badMachineStateMsg).Len() - test.That(t, badMachineStateCount, test.ShouldEqual, tc.expBadMachineStateCount) + const badMachineStateMsg = "machine in an unknown state" + badMachineStateCount := logs.FilterLevelExact(zapcore.ErrorLevel).FilterMessageSnippet(badMachineStateMsg).Len() + test.That(t, badMachineStateCount, test.ShouldEqual, tc.expBadMachineStateCount) + }) } }) From da02e548df7ec59649db946991f96ae1d682c807 Mon Sep 17 00:00:00 2001 From: Benji Rewis Date: Fri, 3 Jan 2025 13:11:20 -0500 Subject: [PATCH 27/32] better table driving ? --- robot/server/server_test.go | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/robot/server/server_test.go b/robot/server/server_test.go index 326c223766a..40425a8976b 100644 --- a/robot/server/server_test.go +++ b/robot/server/server_test.go @@ -72,7 +72,7 @@ func TestServer(t *testing.T) { }) t.Run("GetMachineStatus", func(t *testing.T) { - for _, tc := range []struct { + testCases := []struct { name string injectMachineStatus robot.MachineStatus expConfig *pb.ConfigStatus @@ -345,7 +345,9 @@ func TestServer(t *testing.T) { 0, 1, }, - } { + } + + for _, tc := range testCases { t.Run(tc.name, func(t *testing.T) { logger, logs := logging.NewObservedTestLogger(t) injectRobot := &inject.Robot{} From 2c591ad8b7f803d0646f53a20996336c4e0e4b70 Mon Sep 17 00:00:00 2001 From: Benji Rewis Date: Fri, 3 Jan 2025 14:10:19 -0500 Subject: [PATCH 28/32] dan comments --- robot/client/client.go | 17 +++++++++-------- web/server/entrypoint_test.go | 6 +++--- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/robot/client/client.go b/robot/client/client.go index 1c078ce5812..f227612af9f 100644 --- a/robot/client/client.go +++ b/robot/client/client.go @@ -6,7 +6,6 @@ import ( "errors" "fmt" "io" - "os" "strings" "sync" "sync/atomic" @@ -62,11 +61,11 @@ var ( // defaultResourcesTimeout is the default timeout for getting resources. defaultResourcesTimeout = 5 * time.Second - // DoNotWaitForRunningEnvVar is the name of an environment variable to set - // only in tests to allow connecting to still-initializing machines. Note - // that robot clients in production (not in a testing environment) will - // already allow connecting to still-initializing machines. - DoNotWaitForRunningEnvVar = "VIAM_CLIENT_DO_NOT_WAIT_FOR_RUNNING" + // DoNotWaitForRunning should be set only in tests to allow connecting to + // still-initializing machines. Note that robot clients in production (not in + // a testing environment) will already allow connecting to still-initializing + // machines. + DoNotWaitForRunning = atomic.Bool{} ) // RobotClient satisfies the robot.Robot interface through a gRPC based @@ -306,7 +305,7 @@ func New(ctx context.Context, address string, clientLogger logging.ZapCompatible // Allow this behavior to be turned off in some tests that specifically want // to examine the behavior of a machine in an initializing state through the // use of an environment variable. - if testing.Testing() && os.Getenv(DoNotWaitForRunningEnvVar) == "" { + if testing.Testing() && !DoNotWaitForRunning.Load() { for { if ctx.Err() != nil { return nil, multierr.Combine(ctx.Err(), rc.conn.Close()) @@ -318,7 +317,9 @@ func New(ctx context.Context, address string, clientLogger logging.ZapCompatible if status.Code(err) == codes.Unimplemented { break } - return nil, multierr.Combine(err, rc.conn.Close()) + // Ignore error from Close and just return original machine status error. + utils.UncheckedError(rc.conn.Close()) + return nil, err } if mStatus.State == robot.StateRunning { diff --git a/web/server/entrypoint_test.go b/web/server/entrypoint_test.go index 8cbe30b8478..1f6f3473d3d 100644 --- a/web/server/entrypoint_test.go +++ b/web/server/entrypoint_test.go @@ -267,11 +267,11 @@ func TestMachineState(t *testing.T) { test.That(t, server.RunServer(ctx, args, logger), test.ShouldBeNil) }() - // Set value for `DoNotWaitForRunningEnvVar` to allow connecting to a + // Set `DoNotWaitForRunning` to true to allow connecting to a // still-initializing machine. - test.That(t, os.Setenv(client.DoNotWaitForRunningEnvVar, "true"), test.ShouldBeNil) + client.DoNotWaitForRunning.Store(true) defer func() { - test.That(t, os.Unsetenv(client.DoNotWaitForRunningEnvVar), test.ShouldBeNil) + client.DoNotWaitForRunning.Store(false) }() rc := robottestutils.NewRobotClient(t, logger, machineAddress, time.Second) From d5e484f9523c54d9e1d491eb915db9f1026d5228 Mon Sep 17 00:00:00 2001 From: Benji Rewis Date: Mon, 6 Jan 2025 17:09:05 -0500 Subject: [PATCH 29/32] fix client test --- robot/client/client_test.go | 1 + 1 file changed, 1 insertion(+) diff --git a/robot/client/client_test.go b/robot/client/client_test.go index 241ac58fece..5258dcb2e7c 100644 --- a/robot/client/client_test.go +++ b/robot/client/client_test.go @@ -2129,6 +2129,7 @@ func TestMachineStatus(t *testing.T) { }, }, }, + State: robot.StateRunning, }, 0, }, From 5bed7f2992e0e9fd49810332d73edcee5b2a8c10 Mon Sep 17 00:00:00 2001 From: Benji Rewis Date: Tue, 7 Jan 2025 14:50:33 -0500 Subject: [PATCH 30/32] Initial field on Config in place of SetInitializing --- config/config.go | 6 ++++++ robot/impl/local_robot.go | 34 ++++++++++++++-------------------- robot/impl/robot_options.go | 10 ---------- robot/robot.go | 3 --- web/server/entrypoint.go | 11 ++++------- 5 files changed, 24 insertions(+), 40 deletions(-) diff --git a/config/config.go b/config/config.go index 5ac34ad9d27..43286595f34 100644 --- a/config/config.go +++ b/config/config.go @@ -71,6 +71,12 @@ type Config struct { // Revision contains the current revision of the config. Revision string + // Initial represents whether this is an "initial" config passed in by web + // server entrypoint code. If true, the robot will continue to report a state + // of initializing after applying this config. If false, the robot will + // report a state of reconfiguring after applying this config. + Initial bool + // toCache stores the JSON marshalled version of the config to be cached. It should be a copy of // the config pulled from cloud with minor changes. // This version is kept because the config is changed as it moves through the system. diff --git a/robot/impl/local_robot.go b/robot/impl/local_robot.go index 2251ffd70a6..0e91fe1f5c9 100644 --- a/robot/impl/local_robot.go +++ b/robot/impl/local_robot.go @@ -92,9 +92,9 @@ type localRobot struct { // whether the robot is actively reconfiguring reconfiguring atomic.Bool - // whether the robot is still initializing; can be set with SetInitializing - // and defaults to false. this value controls what state will be returned by - // the MachineStatus endpoint (initializing if true, running if false.) + // whether the robot is still initializing. this value controls what state will be + // returned by the MachineStatus endpoint (initializing if true, running if false.) + // configured based on the `Initial` value of applied `config.Config`s. initializing atomic.Bool } @@ -394,9 +394,6 @@ func newWithResources( ftdc: ftdcWorker, } - // Use value of initializing from robot options. - r.initializing.Store(rOpts.initializing) - r.mostRecentCfg.Store(config.Config{}) var heartbeatWindow time.Duration if cfg.Network.Sessions.HeartbeatWindow == 0 { @@ -1092,6 +1089,9 @@ func dialRobotClient( // possibly leak resources. The given config may be modified by Reconfigure. func (r *localRobot) Reconfigure(ctx context.Context, newConfig *config.Config) { r.reconfigure(ctx, newConfig, false) + + // Set initializing value based on `newConfig.Initial`. + r.initializing.Store(newConfig.Initial) } // set Module.LocalVersion on Type=local modules. Call this before localPackages.Sync and in RestartModule. @@ -1284,16 +1284,17 @@ func (r *localRobot) reconfigure(ctx context.Context, newConfig *config.Config, allErrs = multierr.Combine(allErrs, err) } - // If not initializing, cleanup unused packages after all old resources have - // been closed above. This ensures processes are shutdown before any files - // are deleted they are using. + // If new config is not marked as initial, cleanup unused packages after all + // old resources have been closed above. This ensures processes are shutdown + // before any files are deleted they are using. // - // If initializing, machine will be starting with no modules, but may - // immediately reconfigure to start modules that have already been - // downloaded. Do not cleanup packages/module dirs in that case. - if !r.initializing.Load() { + // If new config IS marked as initial, machine will be starting with no + // modules but may immediately reconfigure to start modules that have + // already been downloaded. Do not cleanup packages/module dirs in that case. + if !newConfig.Initial { allErrs = multierr.Combine(allErrs, r.packageManager.Cleanup(ctx)) allErrs = multierr.Combine(allErrs, r.localPackages.Cleanup(ctx)) + // Cleanup extra dirs from previous modules or rogue scripts. allErrs = multierr.Combine(allErrs, r.manager.moduleManager.CleanModuleDataDirectory()) } @@ -1538,10 +1539,3 @@ func (r *localRobot) RestartAllowed() bool { } return false } - -// SetInitializing sets the initializing state of the robot. This method can be -// used after initial configuration to indicate that the robot should return a -// state of running from the MachineStatus endpoint. -func (r *localRobot) SetInitializing(initializing bool) { - r.initializing.Store(initializing) -} diff --git a/robot/impl/robot_options.go b/robot/impl/robot_options.go index 81391c285a2..005d4cce55f 100644 --- a/robot/impl/robot_options.go +++ b/robot/impl/robot_options.go @@ -24,9 +24,6 @@ type options struct { // disableCompleteConfigWorker starts the robot without the complete config worker - should only be used for tests. disableCompleteConfigWorker bool - - // whether robot should be in an initializing state - initializing bool } // Option configures how we set up the web service. @@ -96,10 +93,3 @@ func withDisableCompleteConfigWorker() Option { o.disableCompleteConfigWorker = true }) } - -// WithInitializing returns an Option which sets initializing to true. -func WithInitializing() Option { - return newFuncOption(func(o *options) { - o.initializing = true - }) -} diff --git a/robot/robot.go b/robot/robot.go index 258a7fc5d05..e3a8c7538df 100644 --- a/robot/robot.go +++ b/robot/robot.go @@ -186,9 +186,6 @@ type LocalRobot interface { // RestartAllowed returns whether the robot can safely be restarted. RestartAllowed() bool - - // SetInitializing sets the initializing state of the robot. - SetInitializing(initializing bool) } // A RemoteRobot is a Robot that was created through a connection. diff --git a/web/server/entrypoint.go b/web/server/entrypoint.go index 9ed2cb8b690..38be5b84621 100644 --- a/web/server/entrypoint.go +++ b/web/server/entrypoint.go @@ -277,11 +277,6 @@ func (s *robotServer) configWatcher(ctx context.Context, currCfg *config.Config, // changes. r.Reconfigure(ctx, currCfg) - // Once reconfigure with initial config is complete; set initializing to - // false. Robot is now fully running and can indicate this through the - // MachineStatus endpoint. - r.SetInitializing(false) - for { select { case <-ctx.Done(): @@ -484,8 +479,10 @@ func (s *robotServer) serveWeb(ctx context.Context, cfg *config.Config) (err err minimalProcessedConfig.Modules = nil minimalProcessedConfig.Processes = nil - // Start robot in an initializing state with minimal config. - robotOptions = append(robotOptions, robotimpl.WithInitializing()) + // Mark minimalProcessedConfig as an initial config, so robot reports a + // state of initializing until reconfigured with full config. + minimalProcessedConfig.Initial = true + myRobot, err := robotimpl.New(ctx, &minimalProcessedConfig, s.logger, robotOptions...) if err != nil { cancel() From e00ad7d6f29713fbdf7d665707e7c3107b304951 Mon Sep 17 00:00:00 2001 From: Benji Rewis Date: Tue, 7 Jan 2025 16:44:23 -0500 Subject: [PATCH 31/32] test that local package file is handled properly --- config/config.go | 3 +++ web/server/entrypoint.go | 7 +++++- web/server/entrypoint_test.go | 41 +++++++++++++++++++++++++++-------- 3 files changed, 41 insertions(+), 10 deletions(-) diff --git a/config/config.go b/config/config.go index 43286595f34..402d174f084 100644 --- a/config/config.go +++ b/config/config.go @@ -107,6 +107,7 @@ type configData struct { LogConfig []logging.LoggerPatternConfig `json:"log,omitempty"` Revision string `json:"revision,omitempty"` MaintenanceConfig *MaintenanceConfig `json:"maintenance,omitempty"` + PackagePath string `json:"package_path,omitempty"` } // AppValidationStatus refers to the. @@ -314,6 +315,7 @@ func (c *Config) UnmarshalJSON(data []byte) error { c.LogConfig = conf.LogConfig c.Revision = conf.Revision c.MaintenanceConfig = conf.MaintenanceConfig + c.PackagePath = conf.PackagePath return nil } @@ -346,6 +348,7 @@ func (c Config) MarshalJSON() ([]byte, error) { LogConfig: c.LogConfig, Revision: c.Revision, MaintenanceConfig: c.MaintenanceConfig, + PackagePath: c.PackagePath, }) } diff --git a/web/server/entrypoint.go b/web/server/entrypoint.go index 38be5b84621..8af9edfe81a 100644 --- a/web/server/entrypoint.go +++ b/web/server/entrypoint.go @@ -262,7 +262,12 @@ func (s *robotServer) processConfig(in *config.Config) (*config.Config, error) { out.FromCommand = true out.AllowInsecureCreds = s.args.AllowInsecureCreds out.UntrustedEnv = s.args.UntrustedEnv - out.PackagePath = path.Join(viamDotDir, "packages") + + // Use ~/.viam/packages for package path if one was not specified. + if in.PackagePath == "" { + out.PackagePath = path.Join(viamDotDir, "packages") + } + return out, nil } diff --git a/web/server/entrypoint_test.go b/web/server/entrypoint_test.go index 1f6f3473d3d..a4865a111eb 100644 --- a/web/server/entrypoint_test.go +++ b/web/server/entrypoint_test.go @@ -10,6 +10,7 @@ import ( "path/filepath" "runtime" "strconv" + "strings" "sync" "testing" "time" @@ -205,8 +206,18 @@ func TestMachineState(t *testing.T) { machineAddress := "localhost:23654" - // Register a slow-constructing generic resource and defer its - // deregistration. + // Create a fake package directory using `t.TempDir`. Set it up to be identical to the + // expected file tree of the local package manager. Place a single file `foo` in a + // `fake-module` directory. + tempDir := t.TempDir() + fakePackagePath := filepath.Join(tempDir, fmt.Sprint("packages", config.LocalPackagesSuffix)) + fakeModuleDataPath := filepath.Join(fakePackagePath, "data", "fake-module") + err := os.MkdirAll(fakeModuleDataPath, 0o777) // should create all dirs along path + test.That(t, err, test.ShouldBeNil) + fakeModuleDataFile, err := os.Create(filepath.Join(fakeModuleDataPath, "foo")) + test.That(t, err, test.ShouldBeNil) + + // Register a slow-constructing generic resource and defer its deregistration. type slow struct { resource.Named resource.AlwaysRebuild @@ -221,8 +232,7 @@ func TestMachineState(t *testing.T) { conf resource.Config, logger logging.Logger, ) (resource.Resource, error) { - // Wait for `completeConstruction` to close before returning from - // constructor. + // Wait for `completeConstruction` to close before returning from constructor. <-completeConstruction return &slow{ @@ -245,6 +255,9 @@ func TestMachineState(t *testing.T) { test.That(t, err, test.ShouldBeNil) cfg := &config.Config{ + // Set PackagePath to temp dir created at top of test with the "-local" piece trimmed. Local + // package manager will automatically add that suffix. + PackagePath: strings.TrimSuffix(fakePackagePath, config.LocalPackagesSuffix), Components: []resource.Config{ { Name: "slowpoke", @@ -253,7 +266,7 @@ func TestMachineState(t *testing.T) { }, }, Network: config.NetworkConfig{ - config.NetworkConfigData{ + NetworkConfigData: config.NetworkConfigData{ BindAddress: machineAddress, }, }, @@ -267,8 +280,8 @@ func TestMachineState(t *testing.T) { test.That(t, server.RunServer(ctx, args, logger), test.ShouldBeNil) }() - // Set `DoNotWaitForRunning` to true to allow connecting to a - // still-initializing machine. + // Set `DoNotWaitForRunning` to true to allow connecting to a still-initializing + // machine. client.DoNotWaitForRunning.Store(true) defer func() { client.DoNotWaitForRunning.Store(false) @@ -276,13 +289,18 @@ func TestMachineState(t *testing.T) { rc := robottestutils.NewRobotClient(t, logger, machineAddress, time.Second) - // Assert that, from client's perspective, robot is in an initializing state - // until `slowpoke` completes construction. + // Assert that, from client's perspective, robot is in an initializing state until + // `slowpoke` completes construction. machineStatus, err := rc.MachineStatus(ctx) test.That(t, err, test.ShouldBeNil) test.That(t, machineStatus, test.ShouldNotBeNil) test.That(t, machineStatus.State, test.ShouldEqual, robot.StateInitializing) + // Assert that the `foo` package file exists during initialization, machine assumes + // package files may still be in use.) + _, err = os.Stat(fakeModuleDataFile.Name()) + test.That(t, err, test.ShouldBeNil) + // Allow `slowpoke` to complete construction. close(completeConstruction) @@ -293,6 +311,11 @@ func TestMachineState(t *testing.T) { test.That(tb, machineStatus.State, test.ShouldEqual, robot.StateRunning) }) + // Assert that the `foo` file was removed, as the non-initializing `Reconfigure` + // determined it was unnecessary (no associated package/module.) + _, err = os.Stat(fakeModuleDataFile.Name()) + test.That(t, os.IsNotExist(err), test.ShouldBeTrue) + // Cancel context and wait for server goroutine to stop running. cancel() wg.Wait() From 3e6e59eddd7461e10fd3d9c6e52027f5ed5aa9bb Mon Sep 17 00:00:00 2001 From: Benji Rewis Date: Thu, 9 Jan 2025 12:21:14 -0500 Subject: [PATCH 32/32] final comments --- config/config.go | 2 +- robot/client/client.go | 2 +- robot/impl/robot_options.go | 2 +- web/server/entrypoint.go | 4 +--- 4 files changed, 4 insertions(+), 6 deletions(-) diff --git a/config/config.go b/config/config.go index 402d174f084..88b100d749a 100644 --- a/config/config.go +++ b/config/config.go @@ -74,7 +74,7 @@ type Config struct { // Initial represents whether this is an "initial" config passed in by web // server entrypoint code. If true, the robot will continue to report a state // of initializing after applying this config. If false, the robot will - // report a state of reconfiguring after applying this config. + // report a state of running after applying this config. Initial bool // toCache stores the JSON marshalled version of the config to be cached. It should be a copy of diff --git a/robot/client/client.go b/robot/client/client.go index f227612af9f..a3967f954bc 100644 --- a/robot/client/client.go +++ b/robot/client/client.go @@ -304,7 +304,7 @@ func New(ctx context.Context, address string, clientLogger logging.ZapCompatible // // Allow this behavior to be turned off in some tests that specifically want // to examine the behavior of a machine in an initializing state through the - // use of an environment variable. + // use of a global variable. if testing.Testing() && !DoNotWaitForRunning.Load() { for { if ctx.Err() != nil { diff --git a/robot/impl/robot_options.go b/robot/impl/robot_options.go index 005d4cce55f..50ce478b30a 100644 --- a/robot/impl/robot_options.go +++ b/robot/impl/robot_options.go @@ -19,7 +19,7 @@ type options struct { // shutdownCallback provides a callback for the robot to be able to shut itself down. shutdownCallback func() - // whether FTDC is enabled + // whether or not to run FTDC enableFTDC bool // disableCompleteConfigWorker starts the robot without the complete config worker - should only be used for tests. diff --git a/web/server/entrypoint.go b/web/server/entrypoint.go index 8af9edfe81a..6c2a2bed01e 100644 --- a/web/server/entrypoint.go +++ b/web/server/entrypoint.go @@ -509,9 +509,7 @@ func (s *robotServer) serveWeb(ctx context.Context, cfg *config.Config) (err err onWatchDone := make(chan struct{}) go func() { - defer func() { - close(onWatchDone) - }() + defer close(onWatchDone) // Use `fullProcessedConfig` as the initial config for the config watcher // goroutine, as we want incoming config changes to be compared to the full