Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

RSDK-9440 Report machine state through GetMachineStatus #4616

Merged
merged 32 commits into from
Jan 9, 2025
Merged
Show file tree
Hide file tree
Changes from 31 commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
9026f00
initial impl
benjirewis Dec 10, 2024
53b9d29
atomic
benjirewis Dec 10, 2024
4802b8d
start config watcher goroutine last
benjirewis Dec 11, 2024
8fa9830
addrderef -> CopyOnlyPublicFields
benjirewis Dec 11, 2024
c78e16e
update api dependency and lint
benjirewis Dec 11, 2024
62c3f5e
more lint sigh
benjirewis Dec 11, 2024
3b9d6b9
basic entrypoint test
benjirewis Dec 11, 2024
d97e583
reconfigure in goroutine and check for state running in client new
benjirewis Dec 13, 2024
58b4b94
typos
benjirewis Dec 13, 2024
eacd561
client test inject fixes
benjirewis Dec 13, 2024
4948b13
move out of initializing in setupLocalRobot
benjirewis Dec 16, 2024
d756396
add log lines for debugging 32 bit tests
benjirewis Dec 16, 2024
fcf03b5
move running check higher
benjirewis Dec 16, 2024
09b76fd
potentially simpler API
benjirewis Dec 18, 2024
35f8f3d
more missing injections
benjirewis Dec 18, 2024
e299ab4
fix more tests sigh
benjirewis Dec 18, 2024
19f7940
maybe this time
benjirewis Dec 18, 2024
d68e6f4
actually use value from options
benjirewis Dec 18, 2024
2d8bc1b
make configWatcher a method instead of an anonymous function
benjirewis Dec 19, 2024
f997bbb
put back lints
benjirewis Dec 19, 2024
f97feef
try to simplify updateWeakDependents check in completeConfig
benjirewis Dec 22, 2024
72bada6
avoid CopyOnlyPublicFields
benjirewis Jan 2, 2025
fc67364
initializing -> running test
benjirewis Jan 2, 2025
18555b3
move redefinition of context above slow shutdown goroutine
benjirewis Jan 3, 2025
e46aa9d
remove debug logs
benjirewis Jan 3, 2025
7f0fffd
table drive and fix server GetMachineStatus test
benjirewis Jan 3, 2025
da02e54
better table driving ?
benjirewis Jan 3, 2025
2c591ad
dan comments
benjirewis Jan 3, 2025
d5e484f
fix client test
benjirewis Jan 6, 2025
5bed7f2
Initial field on Config in place of SetInitializing
benjirewis Jan 7, 2025
e00ad7d
test that local package file is handled properly
benjirewis Jan 7, 2025
3e6e59e
final comments
benjirewis Jan 9, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,12 @@ type Config struct {
// Revision contains the current revision of the config.
Revision string

// Initial represents whether this is an "initial" config passed in by web
// server entrypoint code. If true, the robot will continue to report a state
// of initializing after applying this config. If false, the robot will
// report a state of reconfiguring after applying this config.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
// report a state of reconfiguring after applying this config.
// report a state of running after applying this config.

?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

😮‍💨 good catch; applied locally.

Initial bool

// toCache stores the JSON marshalled version of the config to be cached. It should be a copy of
// the config pulled from cloud with minor changes.
// This version is kept because the config is changed as it moves through the system.
Expand Down Expand Up @@ -101,6 +107,7 @@ type configData struct {
LogConfig []logging.LoggerPatternConfig `json:"log,omitempty"`
Revision string `json:"revision,omitempty"`
MaintenanceConfig *MaintenanceConfig `json:"maintenance,omitempty"`
PackagePath string `json:"package_path,omitempty"`
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We were missing some code to marshal and unmarshal the PackagePath field. I needed that functionality for TestMachineState, so I just added it.

}

// AppValidationStatus refers to the.
Expand Down Expand Up @@ -308,6 +315,7 @@ func (c *Config) UnmarshalJSON(data []byte) error {
c.LogConfig = conf.LogConfig
c.Revision = conf.Revision
c.MaintenanceConfig = conf.MaintenanceConfig
c.PackagePath = conf.PackagePath

return nil
}
Expand Down Expand Up @@ -340,6 +348,7 @@ func (c Config) MarshalJSON() ([]byte, error) {
LogConfig: c.LogConfig,
Revision: c.Revision,
MaintenanceConfig: c.MaintenanceConfig,
PackagePath: c.PackagePath,
})
}

Expand Down
51 changes: 51 additions & 0 deletions robot/client/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,12 @@ var (

// defaultResourcesTimeout is the default timeout for getting resources.
defaultResourcesTimeout = 5 * time.Second

// DoNotWaitForRunning should be set only in tests to allow connecting to
// still-initializing machines. Note that robot clients in production (not in
// a testing environment) will already allow connecting to still-initializing
// machines.
DoNotWaitForRunning = atomic.Bool{}
)

// RobotClient satisfies the robot.Robot interface through a gRPC based
Expand Down Expand Up @@ -288,6 +294,41 @@ func New(ctx context.Context, address string, clientLogger logging.ZapCompatible
return nil, err
}

// If running in a testing environment, wait for machine to report a state of
// running. We often establish connections in tests and expect resources to
// be immediately available once the web service has started; resources will
// not be available when the machine is still initializing.
//
// It is expected that golang SDK users will handle lack of resource
// availability due to the machine being in an initializing state themselves.
//
// Allow this behavior to be turned off in some tests that specifically want
// to examine the behavior of a machine in an initializing state through the
// use of an environment variable.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
// use of an environment variable.
// use of a package variable.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good catch; applied something similar locally.

if testing.Testing() && !DoNotWaitForRunning.Load() {
for {
if ctx.Err() != nil {
return nil, multierr.Combine(ctx.Err(), rc.conn.Close())
}

mStatus, err := rc.MachineStatus(ctx)
if err != nil {
// Allow for MachineStatus to not be injected/implemented in some tests.
if status.Code(err) == codes.Unimplemented {
break
}
// Ignore error from Close and just return original machine status error.
utils.UncheckedError(rc.conn.Close())
return nil, err
}

if mStatus.State == robot.StateRunning {
break
}
time.Sleep(50 * time.Millisecond)
cheukt marked this conversation as resolved.
Show resolved Hide resolved
}
}

// refresh once to hydrate the robot.
if err := rc.Refresh(ctx); err != nil {
return nil, multierr.Combine(err, rc.conn.Close())
Expand Down Expand Up @@ -1115,6 +1156,16 @@ func (rc *RobotClient) MachineStatus(ctx context.Context) (robot.MachineStatus,
mStatus.Resources = append(mStatus.Resources, resStatus)
}

switch resp.State {
case pb.GetMachineStatusResponse_STATE_UNSPECIFIED:
rc.logger.CError(ctx, "received unspecified machine state")
mStatus.State = robot.StateUnknown
case pb.GetMachineStatusResponse_STATE_INITIALIZING:
mStatus.State = robot.StateInitializing
case pb.GetMachineStatusResponse_STATE_RUNNING:
mStatus.State = robot.StateRunning
}

return mStatus, nil
}

Expand Down
35 changes: 22 additions & 13 deletions robot/client/client_session_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -107,8 +107,11 @@ func TestClientSessionOptions(t *testing.T) {
return &dummyEcho{Named: arbName.AsNamed()}, nil
},
ResourceRPCAPIsFunc: func() []resource.RPCAPI { return nil },
LoggerFunc: func() logging.Logger { return logger },
SessMgr: sessMgr,
MachineStatusFunc: func(_ context.Context) (robot.MachineStatus, error) {
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I inject an almost identical MachineStatusFunc all over the place in client_session_test.go and client_test.go. client.New now calls MachineStatus when in a testing environment to make sure the robot is in a robot.StateRunning and therefore capable of receiving resource API calls.

I thought about placing this functionality in inject/robot.go, but opted to just put it in every robot inject, since we currently do that for ResourceNamesFunc and ResourceByNameFunc.

return robot.MachineStatus{State: robot.StateRunning}, nil
},
LoggerFunc: func() logging.Logger { return logger },
SessMgr: sessMgr,
}

svc := web.New(injectRobot, logger)
Expand All @@ -129,13 +132,11 @@ func TestClientSessionOptions(t *testing.T) {
Disable: true,
})))
}
roboClient, err := client.New(ctx, addr, logger, opts...)
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In three of the client session tests, I had to move the instantiation of the robot client to be below the injection of the session manager. client.New now calls MachineStatus in testing environments, which is not exempted from session creation. So, the injected robot will try to Start a session from its session manager, and end up panicking if StartFunc is not defined. I did not leave a comment in-line explaining this.

I'd like to meet offline at some point to walk through TestClientSessionOptions, in particular, and add some documentation to what it's testing.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

client.New now calls MachineStatus in testing environments, which is not exempted from session creation.

I'm fine not touching this in the current PR. But are we currently aware of a reason why MachineStatus should not be exempt?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

are we currently aware of a reason why MachineStatus should not be exempt?

As far as I can tell, we only rely on MachineStatus starting sessions for tests, and, while it's not currently, it should be exempt. I assume we'll handle this eventually as part of RSDK-9262.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Makes sense -- in hindsight i wish the scope listed out all of the actuating (or non-actuating, if that's easier) API calls as of the time of writing.

test.That(t, err, test.ShouldBeNil)

injectRobot.Mu.Lock()
injectRobot.MachineStatusFunc = func(ctx context.Context) (robot.MachineStatus, error) {
session.SafetyMonitorResourceName(ctx, someTargetName1)
return robot.MachineStatus{}, nil
return robot.MachineStatus{State: robot.StateRunning}, nil
}
injectRobot.Mu.Unlock()

Expand Down Expand Up @@ -179,6 +180,8 @@ func TestClientSessionOptions(t *testing.T) {
}
sessMgr.mu.Unlock()

roboClient, err := client.New(ctx, addr, logger, opts...)
test.That(t, err, test.ShouldBeNil)
resp, err := roboClient.MachineStatus(nextCtx)
test.That(t, err, test.ShouldBeNil)
test.That(t, resp, test.ShouldNotBeNil)
Expand Down Expand Up @@ -289,6 +292,9 @@ func TestClientSessionExpiration(t *testing.T) {
ResourceByNameFunc: func(name resource.Name) (resource.Resource, error) {
return &dummyEcho1, nil
},
MachineStatusFunc: func(_ context.Context) (robot.MachineStatus, error) {
return robot.MachineStatus{State: robot.StateRunning}, nil
},
ResourceRPCAPIsFunc: func() []resource.RPCAPI { return nil },
LoggerFunc: func() logging.Logger { return logger },
SessMgr: sessMgr,
Expand All @@ -306,8 +312,6 @@ func TestClientSessionExpiration(t *testing.T) {
Disable: true,
})))
}
roboClient, err := client.New(ctx, addr, logger, opts...)
test.That(t, err, test.ShouldBeNil)

injectRobot.Mu.Lock()
var capSessID uuid.UUID
Expand All @@ -317,7 +321,7 @@ func TestClientSessionExpiration(t *testing.T) {
panic("expected session")
}
capSessID = sess.ID()
return robot.MachineStatus{}, nil
return robot.MachineStatus{State: robot.StateRunning}, nil
}
injectRobot.Mu.Unlock()

Expand Down Expand Up @@ -372,6 +376,8 @@ func TestClientSessionExpiration(t *testing.T) {
}
sessMgr.mu.Unlock()

roboClient, err := client.New(ctx, addr, logger, opts...)
test.That(t, err, test.ShouldBeNil)
resp, err := roboClient.MachineStatus(nextCtx)
test.That(t, err, test.ShouldBeNil)
test.That(t, resp, test.ShouldNotBeNil)
Expand Down Expand Up @@ -480,8 +486,11 @@ func TestClientSessionResume(t *testing.T) {
injectRobot := &inject.Robot{
ResourceNamesFunc: func() []resource.Name { return []resource.Name{} },
ResourceRPCAPIsFunc: func() []resource.RPCAPI { return nil },
LoggerFunc: func() logging.Logger { return logger },
SessMgr: sessMgr,
MachineStatusFunc: func(_ context.Context) (robot.MachineStatus, error) {
return robot.MachineStatus{State: robot.StateRunning}, nil
},
LoggerFunc: func() logging.Logger { return logger },
SessMgr: sessMgr,
}

svc := web.New(injectRobot, logger)
Expand All @@ -496,8 +505,6 @@ func TestClientSessionResume(t *testing.T) {
Disable: true,
})))
}
roboClient, err := client.New(ctx, addr, logger, opts...)
test.That(t, err, test.ShouldBeNil)

var capMu sync.Mutex
var startCalled int
Expand Down Expand Up @@ -536,10 +543,12 @@ func TestClientSessionResume(t *testing.T) {
panic("expected session")
}
capSessID = sess.ID()
return robot.MachineStatus{}, nil
return robot.MachineStatus{State: robot.StateRunning}, nil
}
injectRobot.Mu.Unlock()

roboClient, err := client.New(ctx, addr, logger, opts...)
test.That(t, err, test.ShouldBeNil)
resp, err := roboClient.MachineStatus(nextCtx)
test.That(t, err, test.ShouldBeNil)
test.That(t, resp, test.ShouldNotBeNil)
Expand Down
Loading
Loading