diff --git a/apple/testing/default_runner/ios_xctestrun_runner.bzl b/apple/testing/default_runner/ios_xctestrun_runner.bzl index 342115b853..4633818f15 100644 --- a/apple/testing/default_runner/ios_xctestrun_runner.bzl +++ b/apple/testing/default_runner/ios_xctestrun_runner.bzl @@ -14,7 +14,7 @@ def _get_template_substitutions( create_xcresult_bundle, device_type, os_version, - simulator_creator, + simulator_manager_start, random, xcodebuild_args, command_line_args, @@ -27,12 +27,12 @@ def _get_template_substitutions( post_action_binary, post_action_determines_exit_code): substitutions = { - "device_type": device_type, + "url_encoded_device_type": device_type.replace(" ", "%20"), "os_version": os_version, "create_xcresult_bundle": create_xcresult_bundle, "xcodebuild_args": xcodebuild_args, "command_line_args": command_line_args, - "simulator_creator.py": simulator_creator, + "simulator_manager_start": simulator_manager_start, # "ordered" isn't a special string, but anything besides "random" for this field runs in order "test_order": "random" if random else "ordered", "xctestrun_template": xctestrun_template, @@ -69,7 +69,7 @@ def _ios_xctestrun_runner_impl(ctx): runfiles = ctx.runfiles(files = [ ctx.file._xctestrun_template, ctx.file._xctrunner_entitlements_template, - ]).merge(ctx.attr._simulator_creator[DefaultInfo].default_runfiles) + ]).merge(ctx.attr._simulator_manager_start[DefaultInfo].default_runfiles) default_action_binary = "/usr/bin/true" @@ -93,7 +93,7 @@ def _ios_xctestrun_runner_impl(ctx): create_xcresult_bundle = "true" if ctx.attr.create_xcresult_bundle else "false", device_type = device_type, os_version = os_version, - simulator_creator = ctx.executable._simulator_creator.short_path, + simulator_manager_start = ctx.executable._simulator_manager_start.short_path, random = ctx.attr.random, xcodebuild_args = " ".join(ctx.attr.xcodebuild_args) if ctx.attr.xcodebuild_args else "", command_line_args = " ".join(ctx.attr.command_line_args) if 
ctx.attr.command_line_args else "", @@ -204,9 +204,9 @@ A binary to run following test execution. Runs after testing but before test res When true, the exit code of the test run will be set to the exit code of the post action. This is useful for tests that need to fail the test run based on their own criteria. """, ), - "_simulator_creator": attr.label( + "_simulator_manager_start": attr.label( default = Label( - "//apple/testing/default_runner:simulator_creator", + "//apple/testing/simulator_manager:start", ), executable = True, cfg = "exec", diff --git a/apple/testing/default_runner/ios_xctestrun_runner.template.sh b/apple/testing/default_runner/ios_xctestrun_runner.template.sh index e44a94033d..24fde6389a 100755 --- a/apple/testing/default_runner/ios_xctestrun_runner.template.sh +++ b/apple/testing/default_runner/ios_xctestrun_runner.template.sh @@ -23,7 +23,6 @@ if [[ -n "${CREATE_XCRESULT_BUNDLE:-}" ]]; then fi custom_xcodebuild_args=(%(xcodebuild_args)s) -simulator_name="" device_id="" command_line_args=(%(command_line_args)s) attachment_lifetime="%(attachment_lifetime)s" @@ -31,9 +30,6 @@ destination_timeout="%(destination_timeout)s" while [[ $# -gt 0 ]]; do arg="$1" case $arg in - --simulator_name=*) - simulator_name="${arg##*=}" - ;; --xcodebuild_args=*) xcodebuild_arg="${arg#--xcodebuild_args=}" # Strip "--xcodebuild_args=" prefix custom_xcodebuild_args+=("$xcodebuild_arg") @@ -391,24 +387,77 @@ fi readonly profraw="$test_tmp_dir/coverage.profraw" -simulator_creator_args=( - "%(os_version)s" \ - "%(device_type)s" \ - --name "$simulator_name" -) +simulator_id="" +if [[ "$build_for_device" == false ]]; then + simulator_manager_command() { + local _http_method="$1" + local _command_path="$2" + + # Retry up to 10 times with a 1 second delay between each attempt, to allow + # for the server to be down during upgrades + for i in {1..10}; do + set +e + local _simulator_response + _simulator_response=$( + curl \ + "http:/-/$_command_path" \ + --request
"$_http_method" \ + --silent \ + --fail-with-body \ + --unix-socket "/tmp/simulator_manager.sock" + ) + local _curl_exit_code=$? + set -e + + if [[ $_curl_exit_code -eq 0 ]]; then + echo "$_simulator_response" + return 0 + fi -reuse_simulator=%(reuse_simulator)s -if [[ "$reuse_simulator" == true ]]; then - simulator_creator_args+=(--reuse-simulator) -else - simulator_creator_args+=(--no-reuse-simulator) -fi + # If the error was a connection issue (e.g., couldn't connect), then retry + # 6: couldn't resolve host + # 7: failed to connect + # 28: operation timeout + if [[ + $_curl_exit_code -eq 6 || + $_curl_exit_code -eq 7 || + $_curl_exit_code -eq 28 + ]]; then + echo >&2 "$(date '+[%H:%M:%S]') warning: simulator manager command" \ + "\"$_command_path\" failed with exit code $_curl_exit_code:" + echo >&2 "$(date '+[%H:%M:%S]') $_simulator_response" + echo >&2 "$(date '+[%H:%M:%S]') retrying in 1 second" + sleep 1 + else + echo >&2 "$(date '+[%H:%M:%S]') error: simulator manager command" \ + "\"$_command_path\" failed:" + echo >&2 "$(date '+[%H:%M:%S]') $_simulator_response" + return $_curl_exit_code + fi + done + + # Every attempt hit a retryable connection error; fail explicitly instead of + # falling through with the exit status of the final sleep (0) + echo >&2 "$(date '+[%H:%M:%S]') error: simulator manager command" \ + "\"$_command_path\" failed after 10 attempts" + return $_curl_exit_code + } -simulator_id="unused" -if [[ "$build_for_device" == false ]]; then - simulator_id="$("./%(simulator_creator.py)s" \ - "${simulator_creator_args[@]}" + reuse_simulator=%(reuse_simulator)s + if [[ "$reuse_simulator" == true ]]; then + exclusive_simulator=0 + else + exclusive_simulator=1 + fi + + "./%(simulator_manager_start)s" + + echo "$(date '+[%H:%M:%S]') Attempting to lease simulator" + simulator_id="$( + simulator_manager_command POST "simulator/$$?exclusive=$exclusive_simulator&deviceType=%(url_encoded_device_type)s&os=iOS&version=%(os_version)s" )" + + echo "$(date '+[%H:%M:%S]') ✅ Leased simulator $simulator_id" fi test_exit_code=0 @@ -585,9 +634,13 @@ if [[ rm -r "$result_bundle_path" fi -if [[ "$reuse_simulator" == false ]]; then - # Delete will shutdown down the simulator if it's still currently running.
- xcrun simctl delete "$simulator_id" +if [[ -n "$simulator_id" ]]; then + echo "$(date '+[%H:%M:%S]') Releasing simulator $simulator_id" + if response=$(simulator_manager_command DELETE "simulator/$$"); then + echo "$(date '+[%H:%M:%S]') ✅ Released simulator $simulator_id" + else + echo "$(date '+[%H:%M:%S]') ❌ Failed to release simulator $simulator_id: $response" >&2 + fi fi profdata="$test_tmp_dir/$simulator_id/Coverage.profdata" diff --git a/apple/testing/simulator_manager/BUILD.bazel b/apple/testing/simulator_manager/BUILD.bazel new file mode 100644 index 0000000000..3d0daee380 --- /dev/null +++ b/apple/testing/simulator_manager/BUILD.bazel @@ -0,0 +1,31 @@ +load("@rules_shell//shell:sh_binary.bzl", "sh_binary") + +# This is a macro in our repo, so this won't compile as is. But it's basically `macos_application` and `macos_unit_test` combined. +macos_swift_tool( + name = "simulator_manager", + deps = [ + "@swiftpkg_shellout//:ShellOut", + "@swiftpkg_swift_argument_parser//:ArgumentParser", + "@swiftpkg_swift_nio//:NIO", + "@swiftpkg_swift_nio//:NIOCore", + "@swiftpkg_swift_nio//:NIOHTTP1", + "@swiftpkg_swift_nio_extras//:NIOExtras", + ], +) + +sh_binary( + name = "start", + srcs = ["start.sh"], + data = [ + ":prepare_simulator", + ":simulator_manager_opt", + ":start_tunnel", + ], + visibility = ["//visibility:public"], + deps = ["@bazel_tools//tools/bash/runfiles"], +) + +sh_binary( + name = "prepare_simulator", + srcs = ["prepare_simulator.sh"], +) diff --git a/apple/testing/simulator_manager/README.md b/apple/testing/simulator_manager/README.md new file mode 100644 index 0000000000..90ad38ad85 --- /dev/null +++ b/apple/testing/simulator_manager/README.md @@ -0,0 +1,17 @@ +# `simulator_manager` + +The `simulator_manager` replaces the `simulator_creator.py` used in +**rules_apple**'s `ios_xctestrun_runner.template.sh`. + +It manages simulator "leases". 
Tests can lease and then release a simulator of a +given configuration (device type and OS version), and can also request that it's +an "exclusive" lease. The manager automatically releases a simulator if the +requesting process exits without releasing it first. + +Exclusive leases mean that the test has exclusive access to the simulator, which +is needed for App Host and UI tests. + +Base simulators are created for a given configuration; leases are on clones of +the base simulators. After a simulator has been released for 10 minutes the +clone is deleted. This allows us to free up disk space on remote executors, while +still allowing reuse within a short period of time. diff --git a/apple/testing/simulator_manager/Sources/AccumulatedHTTPHandler.swift b/apple/testing/simulator_manager/Sources/AccumulatedHTTPHandler.swift new file mode 100644 index 0000000000..0595e0937e --- /dev/null +++ b/apple/testing/simulator_manager/Sources/AccumulatedHTTPHandler.swift @@ -0,0 +1,80 @@ +import NIO +import NIOHTTP1 +import os.log + +extension Logger { + static let accumulatedHTTP = simulatorManager(category: "accumulated_http") +} + +struct FullHTTPRequest { + let head: HTTPRequestHead + var body: ByteBuffer +} + +struct FullHTTPResponse { + let head: HTTPResponseHead + var body: ByteBuffer +} + +final class AccumulatedHTTPHandler: ChannelInboundHandler, ChannelOutboundHandler { + typealias InboundIn = HTTPServerRequestPart + typealias InboundOut = FullHTTPRequest + + typealias OutboundIn = FullHTTPResponse + typealias OutboundOut = HTTPServerResponsePart + + private var requestHead: HTTPRequestHead? + private var bodyBuffer: ByteBuffer?
+ + func channelRead(context: ChannelHandlerContext, data: NIOAny) { + let part = self.unwrapInboundIn(data) + + switch part { + case .head(let head): + self.requestHead = head + self.bodyBuffer = context.channel.allocator.buffer(capacity: 0) + + case .body(var chunk): + self.bodyBuffer?.writeBuffer(&chunk) + + case .end: + if let head = requestHead, let body = bodyBuffer { + Logger.accumulatedHTTP.info( + """ + ▶️ Received \(head.method.rawValue, privacy: .public) request for \ + \(head.uri, privacy: .public) + """ + ) + + let fullRequest = FullHTTPRequest(head: head, body: body) + context.fireChannelRead(self.wrapInboundOut(fullRequest)) + } + + self.requestHead = nil + self.bodyBuffer = nil + } + } + + func write(context: ChannelHandlerContext, data: NIOAny, promise: EventLoopPromise?) { + let fullResponse = unwrapOutboundIn(data) + + Logger.accumulatedHTTP.info( + "◀️ Sending \(fullResponse.head.status, privacy: .public) response" + ) + + context.write(wrapOutboundOut(.head(fullResponse.head)), promise: nil) + + if fullResponse.body.readableBytes > 0 { + context.write(wrapOutboundOut(.body(.byteBuffer(fullResponse.body))), promise: nil) + } + + context.write(wrapOutboundOut(.end(nil)), promise: promise) + } + + func errorCaught(context: ChannelHandlerContext, error: Error) { + Logger.accumulatedHTTP.error( + "❌ \(error.localizedDescription, privacy: .public)" + ) + context.close(promise: nil) + } +} diff --git a/apple/testing/simulator_manager/Sources/HTTPServer.swift b/apple/testing/simulator_manager/Sources/HTTPServer.swift new file mode 100644 index 0000000000..e926fe2b42 --- /dev/null +++ b/apple/testing/simulator_manager/Sources/HTTPServer.swift @@ -0,0 +1,164 @@ +import Foundation +import NIO +import NIOExtras +import NIOHTTP1 +import NIOPosix +import os.log + +extension Logger { + static let httpServer = simulatorManager(category: "server") +} + +final class HTTPServer { + private let simulatorRequestHandler: SimulatorRequestHandler + + private let 
version: String + + private var serverShutdownHandler: (() -> Void)? + + init(simulatorRequestHandler: SimulatorRequestHandler, version: String) { + self.simulatorRequestHandler = simulatorRequestHandler + self.version = version + } + + func run(pidPath: String, unixSocketPath: String) async throws { + let socketURL = URL(fileURLWithPath: unixSocketPath) + let pidURL = URL(fileURLWithPath: pidPath) + + // Remove existing files if they exist + try? FileManager.default.removeItem(at: socketURL) + try? FileManager.default.removeItem(at: pidURL) + + try String(ProcessInfo.processInfo.processIdentifier).write( + to: pidURL, + atomically: true, + encoding: .utf8 + ) + + let eventLoopGroup = MultiThreadedEventLoopGroup(numberOfThreads: System.coreCount) + + do { + // This nested block is necessary to ensure that all the destructors for objects defined + // inside are called before the final call to `eventLoopGroup.syncShutdownGracefully()`. A + // possible side effect of not doing this is a run-time error "Cannot schedule tasks on an + // EventLoop that has already shut down". 
+ let quiesce = ServerQuiescingHelper(group: eventLoopGroup) + let fullyShutdownPromise: EventLoopPromise = eventLoopGroup.next().makePromise() + serverShutdownHandler = { + Logger.httpServer.info("⚠️ Shutting down server") + quiesce.initiateShutdown(promise: fullyShutdownPromise) + } + + do { + let serverChannel = try await ServerBootstrap(group: eventLoopGroup) + .serverChannelOption(ChannelOptions.backlog, value: 256) + .serverChannelInitializer { channel in + return channel.eventLoop.makeCompletedFuture { + try channel.pipeline.syncOperations.addHandler( + quiesce.makeServerChannelHandler(channel: channel) + ) + } + } + .bind(unixDomainSocketPath: unixSocketPath, childChannelInitializer: { childChannel in + return childChannel.eventLoop.makeCompletedFuture { + try childChannel.pipeline.syncOperations.addHandlers([ + HTTPResponseEncoder(), + ByteToMessageHandler(HTTPRequestDecoder()), + AccumulatedHTTPHandler(), + SimulatorManagerHTTPHandler(), + ]) + + return try NIOAsyncChannel( + wrappingChannelSynchronously: childChannel, + configuration: .init() + ) + } + }) + + Logger.httpServer.info("🔌 Server running on UDS at \(unixSocketPath, privacy: .public)") + + try await withThrowingDiscardingTaskGroup { group in + try await serverChannel.executeThenClose { inbound in + for try await connectionChannel in inbound { + group.addTask { + do { + try await self.handleConnection( + channel: connectionChannel + ) + } catch { + // We don't throw here, as it locks up the whole server + Logger.httpServer.error( + """ + ❌ Caught connection error: \(error, privacy: .public) + """ + ) + } + } + } + } + } + } catch { + Logger.httpServer.error("❌ Caught top-level error: \(error, privacy: .public)") + try await eventLoopGroup.shutdownGracefully() + throw error + } + + try await fullyShutdownPromise.futureResult.get() + } + + try await eventLoopGroup.shutdownGracefully() + Logger.httpServer.info("✅ Server shut down") + + // Cleanup files + try? 
FileManager.default.removeItem(at: socketURL) + try? FileManager.default.removeItem(at: pidURL) + } + + private func handleConnection( + channel: NIOAsyncChannel + ) async throws { + try await channel.executeThenClose { inbound, outbound in + for try await request in inbound { + try await outbound.write(handleRequest(request)) + } + } + } + + private func handleRequest( + _ request: SimulatorManagerRequest + ) async -> SimulatorManagerResponse { + switch request.path { + case "simulator": + do { + return try await simulatorRequestHandler.handleRequest( + method: request.method, + pathComponents: request.pathComponents, + queryParameters: request.queryParameters + ) + } catch { + Logger.httpServer.error( + "❌ simulatorRequestHandler.handleRequest error: \(error, privacy: .public)" + ) + + return .init( + status: .internalServerError, + message: "Internal server error: \(error)" + ) + } + + case "version": + return .init( + status: .ok, + message: version + ) + + case "shutdown": + Logger.httpServer.info("⚠️ Shutdown request received") + serverShutdownHandler?() + return .init(status: .ok, message: "Server shutting down") + + default: + return .init(status: .badRequest, message: "Unknown method: \(request.path)") + } + } +} diff --git a/apple/testing/simulator_manager/Sources/LRUSet.swift b/apple/testing/simulator_manager/Sources/LRUSet.swift new file mode 100644 index 0000000000..972c08c0a9 --- /dev/null +++ b/apple/testing/simulator_manager/Sources/LRUSet.swift @@ -0,0 +1,49 @@ +/// A `Set` that has a maximum capacity and evicts the least recently used item +// when full. +struct LRUSet { + private let capacity: Int + // An array to keep track of the order in which elements were inserted. + // The first element in the array is the least recently used. + private var order: [Element] = [] + + // A Set to enable fast O(1) membership tests. 
+ private var storage: Set = [] + + init(capacity: Int) { + precondition(capacity > 0, "Capacity must be greater than zero.") + self.capacity = capacity + } + + // Returns an element that was evicted from the set. + mutating func insert(_ element: Element) -> Element? { + let evicted: Element? + if storage.contains(element) { + if let index = order.firstIndex(of: element) { + evicted = order.remove(at: index) + } else { + evicted = nil + } + + order.append(element) + } else { + if order.count >= capacity, let oldest = order.first { + order.removeFirst() + evicted = storage.remove(oldest) + } else { + evicted = nil + } + + order.append(element) + storage.insert(element) + } + return evicted + } + + func contains(_ element: Element) -> Bool { + return storage.contains(element) + } + + var elements: [Element] { + return order + } +} diff --git a/apple/testing/simulator_manager/Sources/Logger.swift b/apple/testing/simulator_manager/Sources/Logger.swift new file mode 100644 index 0000000000..39507ba763 --- /dev/null +++ b/apple/testing/simulator_manager/Sources/Logger.swift @@ -0,0 +1,10 @@ +import os.log + +extension Logger { + static func simulatorManager(category: String) -> Logger { + Logger( + subsystem: "com.example.tools.simulator_manager", + category: category + ) + } +} diff --git a/apple/testing/simulator_manager/Sources/Main.swift b/apple/testing/simulator_manager/Sources/Main.swift new file mode 100644 index 0000000000..c9c12d574f --- /dev/null +++ b/apple/testing/simulator_manager/Sources/Main.swift @@ -0,0 +1,79 @@ +import ArgumentParser + +@main +struct Main: AsyncParsableCommand { + // This is set externally to prevent having to recompile the manager just for `start.sh` changes + @Option(help: "Version of the simulator manager") + var version: String + + @Option(help: "Path to where the pid should be written") + var pidPath: String + + @Option(help: "Path to where the unix domain socket should be created") + var unixSocketPath: String + + @Option(help:
"Number of seconds to wait before deleting a recently used idle simulator") + var deleteRecentlyUsedIdleAfter: UInt16 + + @Option(help: "Number of seconds to wait before deleting a non-recently used idle simulator") + var deleteIdleAfter: UInt16 + + @Option( + help: """ + The number of simulators to keep in the recently used list; affects whether \ + 'delete-recently-used-idle-after' or 'delete-idle-after' is used when determining when to \ + delete an unused simulator + """ + ) + var recentlyUsedCapacity = 1 + + @Option( + name: .customLong("startup-process"), + help: """ + The path to a startup process that will be run when the simulator manager is started. This \ + process will not be relaunched if it exits. + + To pass custom arguments to the process you should wrap it in a script. + + Setting this flag multiple times will result in multiple startup processes being launched. + """ + ) + var startupProcesses: [String] = [] + + @Option(help: "Path to an executable that will run after a simulator clone is booted") + var postBoot: String + + func validate() throws { + guard recentlyUsedCapacity > 0 else { + throw ValidationError( + """ + 'recently-used-capacity' must be greater than 0.
+ """ + ) + } + + guard Set(startupProcesses).count == startupProcesses.count else { + throw ValidationError("'startup-process' must be unique.") + } + } + + func run() async throws { + let simulatorManager = SimulatorManager( + simulatorControl: RealSimulatorControl(), + deleteRecentlyUsedIdleAfter: deleteRecentlyUsedIdleAfter, + deleteIdleAfter: deleteIdleAfter, + recentlyUsedCapacity: recentlyUsedCapacity, + deleteOnPIDExit: true, + startupProcesses: startupProcesses, + postBoot: postBoot + ) + try await simulatorManager.startChildProcesses() + + try await HTTPServer( + simulatorRequestHandler: SimulatorRequestHandler( + simulatorManager: simulatorManager + ), + version: version + ).run(pidPath: pidPath, unixSocketPath: unixSocketPath) + } +} diff --git a/apple/testing/simulator_manager/Sources/PTY.swift b/apple/testing/simulator_manager/Sources/PTY.swift new file mode 100644 index 0000000000..6f11ed47f8 --- /dev/null +++ b/apple/testing/simulator_manager/Sources/PTY.swift @@ -0,0 +1,38 @@ +import Darwin + +struct PTY { + let parent: Int32 + let child: Int32 + + init() throws { + var parentFd: Int32 = 0 + var childFd: Int32 = 0 + + // NULL for name/pw/termios/winsize = defaults + let result = openpty(&parentFd, &childFd, nil, nil, nil) + guard result == 0 else { + throw Errno(rawValue: errno) + } + + self.parent = parentFd + self.child = childFd + } +} + +/// Simple POSIX errno wrapper. +struct Errno: Error, RawRepresentable { + /// The raw POSIX error number. 
+ let rawValue: Int32 + + init(rawValue: Int32) { + self.rawValue = rawValue + } +} + +extension Errno: CustomStringConvertible { + var description: String { + var buf = [CChar](repeating: 0, count: 256) + strerror_r(rawValue, &buf, buf.count) + return String(cString: buf) + } +} diff --git a/apple/testing/simulator_manager/Sources/SimulatorControl.swift b/apple/testing/simulator_manager/Sources/SimulatorControl.swift new file mode 100644 index 0000000000..996fbad023 --- /dev/null +++ b/apple/testing/simulator_manager/Sources/SimulatorControl.swift @@ -0,0 +1,628 @@ +import Foundation +import os.log +import ShellOut + +typealias SimulatorUDID = String + +extension Logger { + static let simulatorControl = simulatorManager(category: "control") +} + +struct SimulatorConfig: Hashable, Equatable { + let deviceType: String + let os: String + let version: String +} + +extension SimulatorConfig: CustomStringConvertible { + var description: String { + return "\(deviceType) (\(os) \(version))" + } +} + +struct SimCtlDevices: Decodable { + let devices: [String: [SimCtlDevice]] +} + +struct SimCtlDevice: Decodable { + let name: String + let udid: String +} + +struct ProcessError: Error { + let command: String + let context: String? + let exitCode: Int32 + let stdOut: String + let stdErr: String +} + +extension ProcessError: CustomStringConvertible { + var description: String { + let contextStr: String + if let context { + contextStr = " (\(context))" + } else { + contextStr = "" + } + + return """ + "\(command)"\(contextStr) failed with exit code \(exitCode): + \(stdOut)\(stdErr) + """ + } +} + +extension ProcessError: LocalizedError { + var errorDescription: String? { + return description + } +} + +protocol SimulatorControl: Actor { + // Creates a base simulator with the given config. + // + // It also boots and shuts down the simulator, making it ready for cloning. 
+ // + // If an existing simulator with the same name already exists, that is returned instead of + // creating a new one. This is to support the simulator manager being restarted and losing state. + func createBase( + name: String, + with config: SimulatorConfig, + runtimeIdentifier: String + ) async throws -> SimulatorUDID + + // Clones a base simulator. + // + // It also boots the cloned simulator, making it ready for use. + // + // If an existing simulator with the same name already exists, that is returned instead of + // creating a new one. This is to support the simulator manager being restarted and losing state. + func clone( + _ baseSimulator: SimulatorUDID, + name: String, + deviceType: String, + runtimeIdentifier: String, + postBoot: String? + ) async throws -> SimulatorUDID + + func ensureBooted( + _ simulator: SimulatorUDID, + context: @escaping @autoclosure () -> String? + ) async throws + + func cleanTempFiles(in simulator: SimulatorUDID) + + func delete( + _ simulator: SimulatorUDID, + name: String, + context: @escaping @autoclosure () -> String? + ) async throws + + func getExisting( + name: String, + deviceType: String, + runtimeIdentifier: String, + context: @escaping @autoclosure () -> String? + ) async throws -> String? +} + +actor RealSimulatorControl: SimulatorControl { + private var createBaseTasks: [String: Task] = [:] + private var cloneTasks: [String: Task] = [:] + + private var deleteAndExistenceMutexes: [String: (SimulatorDeleteOrExistenceMutex, Int)] = [:] + + func createBase( + name: String, + with config: SimulatorConfig, + runtimeIdentifier: String + ) async throws -> SimulatorUDID { + if let existingTask = createBaseTasks[name] { + return try await existingTask.value + } + + // We use a task to prevent data races that can occur when the `await` on `simctl` blocks. This + // ensures that multiple callers trying create a base simulator will all wait for the same + // simulator to be returned. 
+ let task = Task { + defer { + createBaseTasks.removeValue(forKey: name) + } + + if let existingUDID = try await getExisting( + name: name, + deviceType: config.deviceType, + runtimeIdentifier: runtimeIdentifier, + context: "createBase" + ) { + Logger.simulatorControl.info( + """ + 📱 Base simulator "\(name, privacy: .public)" already exists, skipping creation: \ + \(existingUDID, privacy: .public) + """ + ) + + do { + // Under weird circumstances, the base simulator might be booted. This could happen if + // the simulator manager is killed in the process of creating a new base. Always call + // shutdown just in case. + try await shutdown(existingUDID, context: "createBase existing: \(name)") + } catch { + // If we fail to do what we need to, then we need to delete the faulty base simulator + Logger.simulatorControl.error( + """ + 📱 Failed to set up base simulator "\(name, privacy: .public)" \(existingUDID, privacy: .public); deleting + """ + ) + + // If we fail to delete, don't throw _that_ error, throw the original error + try?
await delete(existingUDID, name: name, context: "createBase existing: \(name)") + + throw error + } + + return existingUDID + } + + Logger.simulatorControl.info( + #"📱 Creating \#(config, privacy: .public) base simulator "\#(name, privacy: .public)""# + ) + + let udid = try await simctl( + ["create", name, config.deviceType, runtimeIdentifier] + ).trimmingCharacters(in: .whitespacesAndNewlines) + + do { + try await ensureBooted(udid, context: "createBase new: \(name)") + + // FIXME: Find a better way to know the simulator is ready + // Give the simulator some time to do some post-boot processing + try await Task.sleep(for: .seconds(5)) + + try await shutdown(udid, context: "createBase new: \(name)") + } catch { + // If we fail to do what we need to, then we need to delete the faulty base simulator. + // The simulator name is logged as public, like every other log in this file, so that it + // is not redacted in the unified log. + Logger.simulatorControl.error( + #"📱 Failed to set up base simulator "\#(name, privacy: .public)" \#(udid, privacy: .public); deleting"# + ) + + // If we fail to delete, don't throw _that_ error, throw the original error + try? await delete(udid, name: name, context: "createBase new: \(name)") + + throw error + } + + Logger.simulatorControl.info( + """ + 📱 Created \(config, privacy: .public) base simulator \ + "\(name, privacy: .public)": \(udid, privacy: .public) + """ + ) + + return udid + } + + createBaseTasks[name] = task + + return try await task.value + } + + func clone( + _ baseSimulator: SimulatorUDID, + name: String, + deviceType: String, + runtimeIdentifier: String, + postBoot: String? = nil + ) async throws -> SimulatorUDID { + if let existingTask = cloneTasks[name] { + return try await existingTask.value + } + + // We use a task to prevent data races that can occur when the `await` on `simctl` blocks. This + // ensures that multiple callers trying to clone the same simulator will all wait for the same + // simulator to be returned.
+ let task = Task { + defer { + cloneTasks.removeValue(forKey: name) + } + + let udid: String + let isExisting: Bool + if let existingUDID = try await getExisting( + name: name, + deviceType: deviceType, + runtimeIdentifier: runtimeIdentifier, + context: "clone" + ) { + udid = existingUDID + isExisting = true + + // An existing simulator can be found if a previous simulator manager was killed before the + // clone was deleted. No tests _should_ be actively leasing the simulator. + Logger.simulatorControl.info( + """ + 📱 Cloned simulator "\(name, privacy: .public)" already exists, skipping creation: \ + \(udid, privacy: .public) + """ + ) + + // Wait for it to boot. This shouldn't be necessary, but sometimes the simulator will + // reboot because of a migration. + try await ensureBooted(udid, context: "clone, existing: \(name)") + } else { + isExisting = false + + Logger.simulatorControl.info( + """ + 📱 Cloning base simulator \(baseSimulator, privacy: .public) as \ + "\(name, privacy: .public)" + """ + ) + + udid = try await simctl( + ["clone", baseSimulator, name] + ).trimmingCharacters(in: .whitespacesAndNewlines) + + Logger.simulatorControl.info( + """ + 📱 Cloned base simulator \(baseSimulator, privacy: .public) as \ + "\(name, privacy: .public)": \(udid, privacy: .public) + """ + ) + + try await ensureBooted(udid, context: "clone, new: \(name)") + } + + if let postBoot { + Logger.simulatorControl.info( + """ + 📱 Running post-boot script "\(postBoot, privacy: .public)" on \ + \(udid, privacy: .public) + """ + ) + + do { + _ = try await subprocess(postBoot, env: ["SIMULATOR_UDID": udid]) + } catch { + throw NSError( + domain: "SimulatorControl", + code: 1, + userInfo: + [NSLocalizedDescriptionKey: "postBoot failed (isExisting: \(isExisting)): \(error)"] + ) + } + } + + return udid + } + + cloneTasks[name] = task + + return try await task.value + } + + func shutdown(_ simulator: SimulatorUDID, context: @escaping @autoclosure () -> String?) 
async throws { + do { + _ = try await simctl(["shutdown", simulator], context: context()) + } catch let error as ProcessError { + // Exit code 149 is related to the simulator already being shut down + guard error.exitCode == 149 else { + throw error + } + + Logger.simulatorControl.warning( + """ + ⚠️ Shutdown failed, but probably \"already shut down\": \(error, privacy: .public) + """ + ) + } + } + + func cleanTempFiles(in simulator: SimulatorUDID) { + let fileManager = FileManager.default + + // Remove all files and directories under + // `data/Library/Caches/com.apple.containermanagerd/Dead/`, ignoring errors. There seems to be + // a bug where the simulator moves files here but never cleans them up. Maybe it's waiting for + // a reboot or something, which we never do. + let deadCachesPath = + "\(NSHomeDirectory())/Library/Developer/CoreSimulator/Devices/\(simulator)/data/Library/Caches/com.apple.containermanagerd/Dead" + guard let contents = try? fileManager.contentsOfDirectory(atPath: deadCachesPath) else { + return + } + for item in contents { + let itemPath = "\(deadCachesPath)/\(item)" + try? fileManager.removeItem(atPath: itemPath) + } + } + + func delete( + _ simulator: SimulatorUDID, + name: String, + context: @escaping @autoclosure () -> String? + ) async throws { + try await deleteAndExistenceMutex(name: name) { mutex in + try await mutex.unlockedDelete(simulator, context: context()) + } + } + + func getExisting( + name: String, + deviceType: String, + runtimeIdentifier: String, + context: @escaping @autoclosure () -> String? + ) async throws -> String? { + return try await deleteAndExistenceMutex(name: name) { mutex in + return try await mutex.unlockedGetExisting( + name: name, + deviceType: deviceType, + runtimeIdentifier: runtimeIdentifier, + context: context() + ) + } + } + + func ensureBooted(_ simulator: SimulatorUDID, context: @escaping @autoclosure () -> String?) 
async throws { + for retriesLeft in (0...1).reversed() { + do { + // This private command boots the simulator if it isn't already, and waits for the + // appropriate amount of time until we can actually run tests + _ = try await simctl(["bootstatus", simulator, "-b"], context: context()) + break + } catch let error as ProcessError { + // Exit code 149 is related to the simulator already being booted + guard error.exitCode == 149 && retriesLeft > 0 else { + throw error + } + + // This is a known error that happens when the simulator is already booted. A retry + // should succeed. + Logger.simulatorControl.warning( + """ + ⚠️ Boot of simulator \(simulator, privacy: .public) failed, but probably \"already \ + booted\": \(error, privacy: .public) + """ + ) + } + } + } + + private func deleteAndExistenceMutex( + name: String, + _ call: (_ mutex: SimulatorDeleteOrExistenceMutex) async throws -> T + ) async throws -> T { + var (mutex, referenceCount) = + deleteAndExistenceMutexes[name] ?? (SimulatorDeleteOrExistenceMutex(), 0) + referenceCount += 1 + deleteAndExistenceMutexes[name] = (mutex, referenceCount) + + defer { + guard let mutexAndRef = deleteAndExistenceMutexes[name] else { + preconditionFailure( + """ + State of `deleteAndExistenceMutexes` changed unexpectedly. Expected value for "\(name)". + """ + ) + } + let mutex = mutexAndRef.0 + var referenceCount = mutexAndRef.1 + referenceCount -= 1 + + if referenceCount == 0 { + deleteAndExistenceMutexes.removeValue(forKey: name) + } else { + deleteAndExistenceMutexes[name] = (mutex, referenceCount) + } + } + + return try await mutex.withLock { + try await call(mutex) + } + } +} + +// An instance of this actor is created for each simulator name that is being checked for existence +// or being deleted. The actor is only called through `withLock()`, which will suspend on multiple +// calls to ensure that these operations are serialized. 
+// Without this, someone could try to call
+// `clone()` while a deletion is pending, which will call `getExisting()`, and it can return the
+// simulator that is in the process of being deleted.
+actor SimulatorDeleteOrExistenceMutex {
+  /// `true` while some caller holds the lock (including while it is being handed to a waiter).
+  private var isLocked = false
+
+  /// Callers suspended in `lock()`, in arrival order.
+  private var waiters: [CheckedContinuation<Void, Never>] = []
+
+  /// Acquires, runs the work, and then releases the lock.
+  ///
+  /// The lock is released through `defer`, so it is also released when `work` throws.
+  func withLock<T>(_ work: () async throws -> T) async throws -> T {
+    await lock()
+    defer { unlock() }
+    return try await work()
+  }
+
+  /// Acquires the lock. If already locked, will suspend until unlocked.
+  private func lock() async {
+    if !isLocked {
+      isLocked = true
+    } else {
+      // Park the caller; `unlock()` resumes it when it becomes the owner.
+      await withCheckedContinuation { cont in
+        waiters.append(cont)
+      }
+    }
+  }
+
+  /// Releases the lock and wakes one waiter (if any).
+  ///
+  /// When a waiter is woken, `isLocked` is left `true`: ownership is handed directly to that
+  /// waiter instead of being released and re-acquired.
+  private func unlock() {
+    if !waiters.isEmpty {
+      let cont = waiters.removeFirst()
+      cont.resume()
+    } else {
+      isLocked = false
+    }
+  }
+
+  /// Looks up a simulator called `name` under `runtimeIdentifier` in the JSON output of
+  /// `simctl list devices -j <deviceType>`.
+  ///
+  /// - Returns: The UDID of the matching simulator, or `nil` when there is no match, or when the
+  ///   match has no directory on disk (in which case a best-effort delete is attempted first).
+  /// - Throws: Errors from `simctl`, or an `NSError` when the output cannot be decoded.
+  func unlockedGetExisting(
+    name: String,
+    deviceType: String,
+    runtimeIdentifier: String,
+    context: @escaping @autoclosure () -> String?
+  ) async throws -> String? {
+    Logger.simulatorControl.debug(
+      #"🔍 Trying to find existing simulator "\#(name, privacy: .public)""#
+    )
+
+    let output = try await simctl(["list", "devices", "-j", deviceType], context: context())
+
+    guard let jsonData = output.data(using: .utf8) else {
+      throw NSError(
+        domain: "SimulatorControl",
+        code: 1,
+        userInfo: [NSLocalizedDescriptionKey: "Failed to convert output to data"]
+      )
+    }
+
+    let jsonDecoder = JSONDecoder()
+
+    let devicesByRuntime: [String: [SimCtlDevice]]
+    do {
+      devicesByRuntime = try jsonDecoder.decode(SimCtlDevices.self, from: jsonData).devices
+    } catch {
+      // Include the raw output in both the log and the thrown error to make decoding failures
+      // diagnosable.
+      let json = String(data: jsonData, encoding: .utf8) ?? ""
+      Logger.simulatorControl.error(
+        """
+        ❌ Failed to decode 'simctl list devices -j': \(error, privacy: .public).
+        Output: \(json, privacy: .public)
+        """
+      )
+      throw NSError(
+        domain: "SimulatorControl",
+        code: 1,
+        userInfo: [NSLocalizedDescriptionKey: "Failed to decode output: \(error) - \(json)"]
+      )
+    }
+
+    if let devices = devicesByRuntime[runtimeIdentifier] {
+      for device in devices {
+        if device.name == name {
+          let udid = device.udid
+
+          Logger.simulatorControl.debug(
+            #"🔍 Found existing simulator "\#(name, privacy: .public)": \#(udid, privacy: .public)"#
+          )
+
+          // Sometimes the simulator is not actually on disk, but it is in the list. If this
+          // happens, "delete" it so simctl stops reporting it as existing.
+          if !FileManager.default.fileExists(
+            atPath:
+              "\(NSHomeDirectory())/Library/Developer/CoreSimulator/Devices/\(udid)"
+          ) {
+            Logger.simulatorControl.debug(
+              """
+              ⚠️ Simulator \(udid, privacy: .public) doesn't actually exist on disk; "deleting"
+              """
+            )
+
+            // If we fail to delete, don't throw an error
+            try? await unlockedDelete(udid, context: context())
+
+            return nil
+          }
+
+          return udid
+        }
+      }
+    }
+
+    Logger.simulatorControl.debug(
+      #"🔍 No existing simulator "\#(name, privacy: .public)" found"#
+    )
+
+    return nil
+  }
+
+  /// Deletes `simulator` with `simctl delete`, logging before and after.
+  ///
+  /// - Throws: Rethrows the `simctl` error after logging it.
+  func unlockedDelete(
+    _ simulator: SimulatorUDID,
+    context: @escaping @autoclosure () -> String?
+  ) async throws {
+    Logger.simulatorControl.info("🗑️ Deleting simulator \(simulator, privacy: .public)")
+
+    do {
+      _ = try await simctl(["delete", simulator], context: context())
+    } catch {
+      Logger.simulatorControl.error(
+        """
+        ❌ Failed to delete simulator \(simulator, privacy: .public): \
+        \(error, privacy: .public)
+        """
+      )
+
+      throw error
+    }
+
+    Logger.simulatorControl.info("🗑️ Deleted simulator \(simulator, privacy: .public)")
+  }
+}
+
+/// Runs `/usr/bin/xcrun simctl <args>` and returns its output.
+private func simctl(
+  _ args: [String],
+  context: @escaping @autoclosure () -> String? = nil
+) async throws -> String {
+  return try await subprocess("/usr/bin/xcrun", ["simctl"] + args, context: context())
+}
+
+/// Async wrapper around `syncSubprocess()`: runs it inside an unstructured `Task` and awaits the
+/// task's value.
+private func subprocess(
+  _ executable: String,
+  _ args: [String] = [],
+  env: [String: String] = [:],
+  context: @escaping @autoclosure () -> String? = nil
+) async throws -> String {
+  return try await Task { try syncSubprocess(executable, args, env: env, context: context()) }.value
+}
+
+/// Runs `executable` with `args` via `shellOut(to:arguments:process:)` and returns its output.
+///
+/// - Parameters:
+///   - executable: The command to run. It is passed through as-is (not quoted).
+///   - args: Arguments for the command. Each one is single-quoted before being passed on.
+///   - env: Extra environment variables; they override inherited variables of the same name.
+///   - context: Optional description of the call site, attached to a thrown `ProcessError`.
+/// - Throws: `ProcessError` when `shellOut` fails with a `ShellOutError`.
+private func syncSubprocess(
+  _ executable: String,
+  _ args: [String] = [],
+  env: [String: String] = [:],
+  context: @escaping @autoclosure () -> String? = nil
+) throws -> String {
+  // Single-quote every argument, escaping embedded single quotes (`'` becomes `'\''`) so that an
+  // argument can never terminate its own quoting. Some arguments (e.g. device types) originate
+  // from request parameters.
+  let quotedArgs = args.map { "'" + $0.replacingOccurrences(of: "'", with: "'\\''") + "'" }
+
+  // Layer `env` over the inherited environment and make `PWD` match the current working
+  // directory of this process.
+  var newEnv = ProcessInfo.processInfo.environment.merging(env) { _, new in new }
+  newEnv["PWD"] = FileManager.default.currentDirectoryPath
+
+  let process = Process()
+  // Use `newEnv` (not a fresh merge), otherwise the `PWD` override above is silently dropped.
+  process.environment = newEnv
+
+  let command = "\(executable) \(quotedArgs.joined(separator: " "))"
+
+  Logger.simulatorControl.debug(#"🛠️ Running "\#(command, privacy: .public)""#)
+
+  do {
+    return try shellOut(
+      to: executable,
+      arguments: quotedArgs,
+      process: process
+    )
+  } catch let error as ShellOutError {
+    throw ProcessError(
+      command: command,
+      context: context(),
+      exitCode: error.terminationStatus,
+      stdOut: error.output,
+      stdErr: error.message
+    )
+  }
+}
+
+extension SimulatorConfig {
+  /// Name of the base simulator for this config, built from `deviceType` and `version`.
+  func baseDeviceName() -> String {
+    return "EXAMPLE_BAZEL_BASE_\(deviceType)_\(version)"
+  }
+
+  /// Name of the clone simulator for this config at slot `index`.
+  func cloneDeviceName(index: Int) -> String {
+    return "EXAMPLE_BAZEL_CLONE_\(deviceType)_\(version)_\(index)"
+  }
+
+  /// CoreSimulator runtime identifier for this config, with the dots in `version` replaced by
+  /// dashes (e.g. `os` "iOS" and `version` "16.4" give
+  /// "com.apple.CoreSimulator.SimRuntime.iOS-16-4").
+  func runtimeIdentifier() -> String {
+    let runtimeVersion = version.replacingOccurrences(of: ".", with: "-")
+    return "com.apple.CoreSimulator.SimRuntime.\(os)-\(runtimeVersion)"
+  }
+}
diff --git a/apple/testing/simulator_manager/Sources/SimulatorManager.swift b/apple/testing/simulator_manager/Sources/SimulatorManager.swift
new file mode 100644
index 0000000000..7236fbe4c0
--- /dev/null
+++ b/apple/testing/simulator_manager/Sources/SimulatorManager.swift @@ -0,0 +1,679 @@ +import Foundation +import os +import ShellOut + +typealias PID = pid_t + +extension Logger { + static let simulatorManager = simulatorManager(category: "manager") + static let childProcess = simulatorManager(category: "manager.child-process") +} + +enum SimulatorManagerError: Error { + case alreadyLeased(udid: SimulatorUDID) + case noLease +} + +private struct SimulatorLease { + let udid: SimulatorUDID + let config: SimulatorConfig + let exclusive: Bool + let slotIndex: Int +} + +private enum SimulatorSlot { + case empty + case pendingCreation(Task, exclusive: Bool) + case active(SimulatorUDID, exclusive: Bool) + case pendingDeletion(SimulatorUDID, Task) + case deleting(SimulatorUDID) +} + +extension SimulatorSlot { + var sortOrder: Int { + switch self { + // Try to use active or pending creation simulators first (should only be + // one of either for non-exclusive) + case .active: + return 0 + case .pendingCreation: + return 1 + + // Use a pending deletion before any empty slots + case .pendingDeletion: + return 2 + + // Finally use empty slots + case .empty: + return 3 + + // Deleting simulator can't be used, so put it at the end + case .deleting: + return 4 + } + } +} + +private enum SimulatorSlotResult { + case active(SimulatorUDID, slotIndex: Int) + case pending(Task, slotIndex: Int) +} + +actor SimulatorManager { + private let simulatorControl: SimulatorControl + + private var simulatorSlots: [SimulatorConfig: [SimulatorSlot]] = [:] + private var referenceCount: [SimulatorUDID: Int] = [:] + private var leases: [PID: SimulatorLease] = [:] + + private var leaserExitListeners: [PID: DispatchSourceProcess] = [:] + + private var getBaseSimulatorTasks: [SimulatorConfig: Task] = [:] + + private let deleteIdleAfter: UInt16 + private let deleteRecentlyUsedIdleAfter: UInt16 + private let deleteOnPIDExit: Bool + + private var recentlyLeased: LRUSet + + private var 
startupProcessPaths: [String] + private var postBoot: String? + private var childProcessTasks: [Task] = [] + private var childProcesses: [String: (Process, DispatchSourceRead, DispatchSourceRead)] = [:] + + init( + simulatorControl: SimulatorControl, + deleteRecentlyUsedIdleAfter: UInt16, + deleteIdleAfter: UInt16, + recentlyUsedCapacity: Int, + deleteOnPIDExit: Bool, + startupProcesses: [String] = [], + postBoot: String? = nil + ) { + self.simulatorControl = simulatorControl + self.deleteIdleAfter = deleteIdleAfter + self.deleteRecentlyUsedIdleAfter = deleteRecentlyUsedIdleAfter + self.deleteOnPIDExit = deleteOnPIDExit + self.recentlyLeased = LRUSet(capacity: recentlyUsedCapacity) + self.startupProcessPaths = startupProcesses + self.postBoot = postBoot + + // Change the working directory to some place stable, since on RBE the runfiles directory can + // get cleaned up + FileManager.default.changeCurrentDirectoryPath("/tmp") + } + + deinit { + for task in childProcessTasks { + task.cancel() + } + + for (process, outWatcher, errWatcher) in childProcesses.values { + process.terminate() + outWatcher.cancel() + errWatcher.cancel() + } + } + + func startChildProcesses() throws { + for path in startupProcessPaths { + childProcessTasks.append(createStartChildProcessTask(path: path)) + } + } + + func lease( + to leaser: PID, + exclusive: Bool, + config: SimulatorConfig + ) async throws -> SimulatorUDID { + // Each process can only lease one simulator at a time + if let existingLease = leases[leaser] { + throw SimulatorManagerError.alreadyLeased(udid: existingLease.udid) + } + + Logger.simulatorManager.info( + """ + 🔒 Leasing \(exclusive ? 
"exclusive" : "non-exclusive", privacy: .public) \ + \(config, privacy: .public) simulator for PID \(leaser, privacy: .public) + """ + ) + + // `getSimulator()` will increment the reference count for the simulator + let (simulator, slotIndex) = try await getSimulator(for: config, exclusive: exclusive) + + _ = recentlyLeased.insert(config) + + leases[leaser] = .init( + udid: simulator, + config: config, + exclusive: exclusive, + slotIndex: slotIndex + ) + + Logger.simulatorManager.info( + "🔒 Leased simulator \(simulator, privacy: .public) to PID \(leaser, privacy: .public)" + ) + + if deleteOnPIDExit { + registerReleaseOnExit(for: leaser) + } + + return simulator + } + + func release(for leaser: PID) async throws { + guard let lease = leases.removeValue(forKey: leaser) else { + // If the manager recently restarted, we might not have the state of all leases. Since + // `SimulatorControl` will return existing simulators matching a given name, the dangling + // simulator will eventually get picked back up again and properly reference counted. So we + // will return an error here, and ignore it in the test runner. + throw SimulatorManagerError.noLease + } + + Logger.simulatorManager.info( + "🔓 Releasing simulator \(lease.udid, privacy: .public) for PID \(leaser, privacy: .public)" + ) + + removeReleaseOnExit(for: leaser) + + await simulatorControl.cleanTempFiles(in: lease.udid) + + try await decrementReferenceCount( + for: lease.udid, + config: lease.config, + slotIndex: lease.slotIndex + ) + } + + private func getBase( + for config: SimulatorConfig + ) async throws -> SimulatorUDID { + if let existingTask = getBaseSimulatorTasks[config] { + return try await existingTask.value + } + + // We use a task to prevent data races that can occur when the `await` on `simulatorControl` + // blocks. This ensures that multiple callers trying to get a base simulator will all wait + // for the same simulator to be returned. 
+ let task = Task { + defer { + getBaseSimulatorTasks.removeValue(forKey: config) + } + + Logger.simulatorManager.info("📱 Creating \(config, privacy: .public) base simulator") + + let baseSimulator = + try await simulatorControl + .createBase( + name: config.baseDeviceName(), + with: config, + runtimeIdentifier: config.runtimeIdentifier() + ) + + Logger.simulatorManager.info( + "📱 Created \(config, privacy: .public) base simulator: \(baseSimulator, privacy: .public)" + ) + + return baseSimulator + } + + getBaseSimulatorTasks[config] = task + + return try await task.value + } + + private func incrementReferenceCount(for simulator: SimulatorUDID) { + var count = referenceCount[simulator] ?? 0 + count += 1 + referenceCount[simulator] = count + + Logger.simulatorManager.debug( + """ + 🔼 Reference count for simulator \(simulator, privacy: .public) is now \ + \(count, privacy: .public) + """ + ) + } + + private func decrementReferenceCount( + for simulator: SimulatorUDID, + config: SimulatorConfig, + slotIndex: Int + ) async throws { + guard var count = referenceCount[simulator] else { + // Simulator was already deleted, nothing to do + return + } + + count -= 1 + referenceCount[simulator] = count + + Logger.simulatorManager.debug( + "🔽 Reference count for \(simulator, privacy: .public) is now \(count, privacy: .public)" + ) + + guard count == 0 else { + return + } + + // Wait a bit before deleting simulators, to allow them to be reused + await pendingDeletion(simulator, config: config, slotIndex: slotIndex) + } + + // Warning: We must update slots before we `await` on anything in this function (unless that + // method updates slots before `await`ing on anything). 
+ private func getSimulator( + for config: SimulatorConfig, + exclusive: Bool + ) async throws -> (simulator: SimulatorUDID, slotIndex: Int) { + if simulatorSlots.keys.contains(config) == false { + simulatorSlots[config] = [] + } + + // Need to sort so we reuse the the correct slots + let sortedSlots = simulatorSlots[config]!.enumerated().sorted { lhs, rhs in + let lhsSortOrder = lhs.element.sortOrder + let rhsSortOrder = rhs.element.sortOrder + + guard lhsSortOrder == rhsSortOrder else { + // Sort by sort order first + return lhsSortOrder < rhsSortOrder + } + + // If the sort order is the same, sort by index + return lhs.offset < rhs.offset + } + + for (index, slot) in sortedSlots { + switch slot { + case .active(let simulator, false) where exclusive != true: + // We have an active non-exclusive simulator, so reuse it + return try await ( + reuseSimulator(simulator, config: config, exclusive: exclusive, slotIndex: index), + slotIndex: index + ) + + case .pendingDeletion(let simulator, let task): + // We have a pending deletion, so we can reuse it + Logger.simulatorManager.info( + """ + ♻️ Turning a pending deletion of simulator \(simulator, privacy: .public) into an \ + active \(exclusive ? 
"exclusive" : "non-exclusive", privacy: .public) simulator + """ + ) + + simulatorSlots[config]![index] = .active(simulator, exclusive: exclusive) + + task.cancel() + + return try await ( + reuseSimulator(simulator, config: config, exclusive: exclusive, slotIndex: index), + slotIndex: index + ) + + case .empty: + let task = createCloneTask(config: config, exclusive: exclusive, slotIndex: index) + simulatorSlots[config]![index] = .pendingCreation(task, exclusive: exclusive) + return try await (task.value, slotIndex: index) + + case .pendingCreation(let task, false) where exclusive == false: + // We have a non-exclusive simulator pending creation, so reuse it + let simulator = try await task.value + + // We call `incrementReferenceCount()` instead of `reuseSimulator()` here, because the + // simulator is freshly created, so we can (hopefully) assume it is in a good state + incrementReferenceCount(for: simulator) + + return (simulator, slotIndex: index) + + default: + // Ignore incompatible slots + break + } + } + + // If we got here, we need to add a new slot + let index = simulatorSlots[config]!.count + let task = createCloneTask(config: config, exclusive: exclusive, slotIndex: index) + simulatorSlots[config]!.append(.pendingCreation(task, exclusive: exclusive)) + return try await (task.value, slotIndex: index) + } + + private func reuseSimulator( + _ simulator: SimulatorUDID, + config: SimulatorConfig, + exclusive: Bool, + slotIndex: Int + ) async throws -> SimulatorUDID { + incrementReferenceCount(for: simulator) + + do { + // Wait for it to boot. This shouldn't be necessary, but sometimes the simulator will + // reboot because of a migration. This also guards against a simulator being deleted out + // from under us, as it will error, and we can then "delete" it and return a new one. 
+ try await simulatorControl.ensureBooted( + simulator, + context: "getSimulator, reused: \(config.cloneDeviceName(index: slotIndex))" + ) + + return simulator + } catch let error as ProcessError { + // 148 happens for "Invalid device". So it either has already been deleted or it's corrupt + // in some way. Either way, we will "delete" it and return a new one. + guard error.exitCode == 148 else { + throw error + } + + Logger.simulatorManager.warning( + """ + ⚠️ Boot of existing simulator \(simulator, privacy: .public) failed; deleting and \ + returning a new simulator: \(error, privacy: .public) + """ + ) + + // If we fail to delete, don't throw an error + try? await delete( + simulator, + config: config, + slotIndex: slotIndex, + // We can't clean up slots, since we assign to it below + cleanUpSlots: false, + context: "getSimulator, reused: \(config.cloneDeviceName(index: slotIndex))" + ) + + let task = createCloneTask(config: config, exclusive: exclusive, slotIndex: slotIndex) + simulatorSlots[config]![slotIndex] = .pendingCreation(task, exclusive: exclusive) + return try await task.value + } + } + + private func createCloneTask( + config: SimulatorConfig, + exclusive: Bool, + slotIndex: Int + ) -> Task { + return Task { + do { + let simulator = try await simulatorControl.clone( + getBase(for: config), + name: config.cloneDeviceName(index: slotIndex), + deviceType: config.deviceType, + runtimeIdentifier: config.runtimeIdentifier(), + postBoot: postBoot + ) + + simulatorSlots[config]![slotIndex] = .active(simulator, exclusive: exclusive) + + // We want to increment the reference count as soon as we get back from `await`, to ensure + // that when we suspend and potentially decrement the reference count, we don't delete the + // simulator before we have a chance to use it. Also, since we created the simulator, we + // should be responsible for incrementing the reference count. Any functions that reuse + // this task need to increment the reference count as well. 
+ incrementReferenceCount(for: simulator) + + return simulator + } catch { + // If we fail to create the clone, we need to empty the slot, instead + // of leaving it in a pending state + simulatorSlots[config]![slotIndex] = .empty + + throw error + } + } + } + + private func registerReleaseOnExit(for leaser: PID) { + let processSource = + DispatchSource.makeProcessSource(identifier: leaser, eventMask: .exit, queue: .main) + + var handledExit = false + let onExitHandler: () -> Void = { [weak self] in + // Avoid double handling of exit in case the process exits between + // `processSource.resume()` and the check with `kill` + guard !handledExit else { return } + handledExit = true + + Task { + guard let self else { return } + + Logger.simulatorManager.debug("👋 PID \(leaser, privacy: .public) exited") + + try await self.release(for: leaser) + } + } + + processSource.setEventHandler { onExitHandler() } + processSource.resume() + + // Check to see if the process is already dead and cancel the source if it is, which will + // trigger `setCancelHandler`, which releases the simulator + guard kill(leaser, 0) == 0 else { + processSource.cancel() + onExitHandler() + return + } + + leaserExitListeners[leaser] = processSource + } + + private func removeReleaseOnExit(for leaser: PID) { + guard let leaserExitListener = leaserExitListeners[leaser] else { return } + leaserExitListeners.removeValue(forKey: leaser) + leaserExitListener.cancel() + } + + private func pendingDeletion( + _ simulator: SimulatorUDID, + config: SimulatorConfig, + slotIndex: Int + ) async { + guard deleteIdleAfter > 0 || deleteRecentlyUsedIdleAfter > 0 else { + // If we fail to delete, don't throw an error + try? 
await delete( + simulator, + config: config, + slotIndex: slotIndex, + cleanUpSlots: true, + context: "pendingDeletion immediate" + ) + return + } + + let task = Task { + Logger.simulatorManager.info( + """ + 💤 Scheduling delete of simulator \(simulator, privacy: .public) in \ + \(self.deleteIdleAfter, privacy: .public) to \ + \(self.deleteRecentlyUsedIdleAfter, privacy: .public) seconds + """ + ) + + let now = Date() + let shortDeadline = now.addingTimeInterval(TimeInterval(deleteIdleAfter)) + let recentlyUsedDeadline = now.addingTimeInterval(TimeInterval(deleteRecentlyUsedIdleAfter)) + + while true { + let remainingTime: TimeInterval + if recentlyLeased.contains(config) { + remainingTime = recentlyUsedDeadline.timeIntervalSinceNow + } else { + remainingTime = shortDeadline.timeIntervalSinceNow + } + + if remainingTime <= 0 { + break + } + + // Sleep for up-to 1 second before next check + try await Task.sleep(for: .seconds(min(remainingTime, 1))) + } + + guard case .pendingDeletion(let slotSimulator, _) = simulatorSlots[config]![slotIndex], + simulator == slotSimulator else { + // Simulator was reused, no need to delete + return + } + + // If we fail to delete, don't throw an error + try? await delete( + simulator, + config: config, + slotIndex: slotIndex, + cleanUpSlots: true, + context: "pendingDeletion delayed" + ) + } + + simulatorSlots[config]![slotIndex] = .pendingDeletion(simulator, task) + } + + private func delete( + _ simulator: SimulatorUDID, + config: SimulatorConfig, + slotIndex: Int, + cleanUpSlots: Bool, + context: @escaping @autoclosure () -> String? 
+ ) async throws { + let name = config.cloneDeviceName(index: slotIndex) + + Logger.simulatorManager.info( + "🗑️ Deleting simulator \(simulator, privacy: .public) (\(name, privacy: .public))" + ) + + simulatorSlots[config]![slotIndex] = .deleting(simulator) + + referenceCount.removeValue(forKey: simulator) + + defer { + // Even if we fail to delete, we need to set the slot to empty + simulatorSlots[config]![slotIndex] = .empty + + if cleanUpSlots { + // Shorten up the array by removing any empty slots at the end + while case .empty = simulatorSlots[config]!.last { + simulatorSlots[config]!.removeLast() + } + } + } + + try await simulatorControl.delete(simulator, name: name, context: context()) + + Logger.simulatorManager.info( + "🗑️ Deleted simulator \(simulator, privacy: .public) (\(name, privacy: .public)" + ) + } + + // MARK: Child Process Management + + private nonisolated func createStartChildProcessTask(path: String) -> Task { + return Task.detached { [weak self] in + let process: Process + do { + guard let self else { return } + process = try await self.createProcess(path: path) + // `self` drops out of scope here, so `SimulatorManager` can deinit + } catch { + Logger.simulatorManager.info( + """ + ❌ Failed to create child process at "\(path, privacy: .public)": \ + \(error, privacy: .public) + """ + ) + return + } + + await withCheckedContinuation { cont in + process.terminationHandler = { proc in + let exitCode = proc.terminationStatus + Logger.simulatorManager.warning( + """ + ⚠️ "\(path, privacy: .public)" exited with code: \(exitCode, privacy: .public) + """ + ) + cont.resume() + } + + do { + Logger.simulatorManager.info( + #"🧒 Starting "\#(path, privacy: .public)""# + ) + try process.run() + } catch { + Logger.simulatorManager.info( + """ + ❌ Failed to start "\(path, privacy: .public)": \ + \(error, privacy: .public) + """ + ) + cont.resume() + } + } + } + } + + private func createProcess(path: String) throws -> Process { + let process = Process() + + 
process.executableURL = URL(fileURLWithPath: path) + + let outPTY = try PTY() + process.standardOutput = FileHandle(fileDescriptor: outPTY.child, closeOnDealloc: true) + let outQueue = DispatchQueue(label: "com.example.simulator_manager.child_process.out") + let outWatcher = watch(fd: outPTY.parent, queue: outQueue) { line in + Logger.childProcess.info("[\(path, privacy: .public)] \(line, privacy: .public)") + } + + let errPTY = try PTY() + process.standardError = FileHandle(fileDescriptor: errPTY.child, closeOnDealloc: true) + let errQueue = DispatchQueue(label: "com.example.simulator_manager.child_process.err") + let errWatcher = watch(fd: errPTY.parent, queue: errQueue) { line in + Logger.childProcess.error("[\(path, privacy: .public)] \(line, privacy: .public)") + } + + childProcesses[path] = (process, outWatcher, errWatcher) + + return process + } +} + +/// Installs a DispatchSourceRead on `fd`. +private func watch( + fd: Int32, + queue: DispatchQueue, + onLine: @escaping (String) -> Void +) -> DispatchSourceRead { + let src = DispatchSource.makeReadSource(fileDescriptor: fd, queue: queue) + var buffer = Data() + + src.setEventHandler { + var tmp = [UInt8](repeating: 0, count: 4096) + let n = read(fd, &tmp, tmp.count) + guard n > 0 else { + src.cancel() + close(fd) + return + } + + buffer.append(contentsOf: tmp[0..?) 
{ + let response = unwrapOutboundIn(data) + + write(context: context, response: response) + } + + private func write(context: ChannelHandlerContext, response: SimulatorManagerResponse) { + let message = response.message + "\n" + + context.write( + wrapOutboundOut( + .init( + head: .init( + version: .http1_1, + status: response.status, + headers: .defaultHeaders(for: message) + ), + body: context.channel.allocator.buffer(string: message) + ) + ), + promise: nil + ) + } +} + +extension HTTPHeaders { + static func defaultHeaders(for message: String) -> HTTPHeaders { + var headers = HTTPHeaders() + headers.add(name: "Content-Length", value: "\(message.utf8.count)") + headers.add(name: "Content-Type", value: "text/plain") + return headers + } +} diff --git a/apple/testing/simulator_manager/Sources/SimulatorRequestHandler.swift b/apple/testing/simulator_manager/Sources/SimulatorRequestHandler.swift new file mode 100644 index 0000000000..48ecd5d737 --- /dev/null +++ b/apple/testing/simulator_manager/Sources/SimulatorRequestHandler.swift @@ -0,0 +1,92 @@ +import NIOHTTP1 + +final class SimulatorRequestHandler { + private let simulatorManager: SimulatorManager + + init(simulatorManager: SimulatorManager) { + self.simulatorManager = simulatorManager + } + + func handleRequest( + method: HTTPMethod, + pathComponents: [String], + queryParameters: [String: String] + ) async throws -> SimulatorManagerResponse { + switch method { + case .POST: + guard pathComponents.count >= 1 else { + return .init( + status: .badRequest, + message: "Must specify " + ) + } + + guard let leaser = PID(pathComponents[0]) else { + return .init(status: .badRequest, message: "Leaser PID must be an integer") + } + guard let exclusiveString = queryParameters["exclusive"] else { + return .init(status: .badRequest, message: "Must specify 'exclusive' query parameter") + } + let exclusive = exclusiveString == "1" + guard let deviceType = queryParameters["deviceType"] else { + return .init(status: 
.badRequest, message: "Must specify 'deviceType' query parameter") + } + guard let os = queryParameters["os"] else { + return .init(status: .badRequest, message: "Must specify 'os' query parameter") + } + guard let version = queryParameters["version"] else { + return .init(status: .badRequest, message: "Must specify 'version' query parameter") + } + + let config = SimulatorConfig( + deviceType: deviceType, + os: os, + version: version + ) + + do { + return try await .init( + status: .created, + message: simulatorManager + .lease(to: leaser, exclusive: exclusive, config: config) + ) + } catch SimulatorManagerError.alreadyLeased(let udid) { + return .init( + status: .badRequest, + // FIXME: Get this from the error itself + message: "PID \(leaser) has already leased another simulator: \(udid)" + ) + } + + case .DELETE: + guard pathComponents.count >= 1 else { + return .init( + status: .badRequest, + message: "Must specify " + ) + } + + guard let leaser = PID(pathComponents[0]) else { + return .init(status: .badRequest, message: "Leaser PID must be an integer") + } + + do { + try await simulatorManager.release(for: leaser) + + return .init( + status: .ok, + message: "Success" + ) + } catch SimulatorManagerError.noLease { + return .init( + status: .notFound, + // FIXME: Get this message from the error itself + message: "PID \(leaser) doesn't have a simulator leased" + ) + } + + default: + return .init(status: .methodNotAllowed, message: "Unsupported HTTP method: \(method)") + } + } +} diff --git a/apple/testing/simulator_manager/Tests/SimulatorManagerTests.swift b/apple/testing/simulator_manager/Tests/SimulatorManagerTests.swift new file mode 100644 index 0000000000..5117dc04e9 --- /dev/null +++ b/apple/testing/simulator_manager/Tests/SimulatorManagerTests.swift @@ -0,0 +1,213 @@ +@testable import simulator_manager +import XCTest + +final class SimulatorManagerTests: XCTestCase { + func test_lease_release_lease() async throws { + let mockSimulatorControl = 
MockSimulatorControl()
+        let simulatorManager = SimulatorManager(
+            simulatorControl: mockSimulatorControl,
+            deleteRecentlyUsedIdleAfter: 0,
+            deleteIdleAfter: 0,
+            recentlyUsedCapacity: 1,
+            deleteOnPIDExit: false
+        )
+        let leaser: PID = 1234
+        let config = SimulatorConfig(deviceType: "iPhone 14", os: "iOS", version: "16.4")
+
+        let simulator1 =
+            try await simulatorManager
+            .lease(to: leaser, exclusive: false, config: config)
+        let maybeBaseSimulators1 = await mockSimulatorControl.baseSimulators[config]
+        let baseSimulators1 = try XCTUnwrap(maybeBaseSimulators1)
+        XCTAssert(baseSimulators1.count == 1)
+        let baseSimulator1 = baseSimulators1[0]
+        let maybeBaseForClone = await mockSimulatorControl.baseForClones[simulator1]
+        let baseForClone = try XCTUnwrap(maybeBaseForClone)
+
+        try await simulatorManager.release(for: leaser)
+        let maybeBaseSimulators2 = await mockSimulatorControl.baseSimulators[config]
+        let baseSimulators2 = try XCTUnwrap(maybeBaseSimulators2)
+        XCTAssert(baseSimulators2.count == 1)
+        let baseSimulator2 = baseSimulators2[0]
+
+        let simulator2 =
+            try await simulatorManager
+            .lease(to: leaser, exclusive: false, config: config)
+        let maybeBaseSimulators3 = await mockSimulatorControl.baseSimulators[config]
+        let baseSimulators3 = try XCTUnwrap(maybeBaseSimulators3)
+        XCTAssert(baseSimulators3.count == 2)
+        let baseSimulator3 = baseSimulators3[1]
+
+        // Simulator is a clone (not the same as base)
+        XCTAssertNotEqual(simulator1, baseSimulator1)
+        XCTAssertEqual(baseForClone, baseSimulator1)
+
+        // Even with reuse, after last use of a simulator it's deleted
+        XCTAssertNotEqual(simulator1, simulator2)
+
+        XCTAssertEqual(baseSimulator1, baseSimulator2)
+
+        // We don't cache our clones
+        XCTAssertNotEqual(baseSimulator2, baseSimulator3)
+    }
+
+    /// Non-exclusive leases for the same config, requested by two different
+    /// leasers, are expected to hand out the same simulator.
+    func test_lease_reuse() async throws {
+        let control = MockSimulatorControl()
+        let manager = SimulatorManager(
+            simulatorControl: control,
+            deleteRecentlyUsedIdleAfter: 0,
+            deleteIdleAfter: 0,
+            recentlyUsedCapacity: 1,
+            deleteOnPIDExit: false
+        )
+        let firstLeaser: PID = 1234
+        let secondLeaser: PID = 1235
+        let config = SimulatorConfig(deviceType: "iPhone 14", os: "iOS", version: "16.4")
+
+        let firstLease = try await manager.lease(to: firstLeaser, exclusive: false, config: config)
+        let secondLease = try await manager.lease(to: secondLeaser, exclusive: false, config: config)
+
+        XCTAssertEqual(firstLease, secondLease)
+    }
+
+    /// Exclusive leases for the same config, requested by two different
+    /// leasers, are expected to hand out different simulators.
+    func test_lease_exclusive() async throws {
+        let control = MockSimulatorControl()
+        let manager = SimulatorManager(
+            simulatorControl: control,
+            deleteRecentlyUsedIdleAfter: 0,
+            deleteIdleAfter: 0,
+            recentlyUsedCapacity: 1,
+            deleteOnPIDExit: false
+        )
+        let firstLeaser: PID = 1234
+        let secondLeaser: PID = 1235
+        let config = SimulatorConfig(deviceType: "iPhone 14", os: "iOS", version: "16.4")
+
+        let firstLease = try await manager.lease(to: firstLeaser, exclusive: true, config: config)
+        let secondLease = try await manager.lease(to: secondLeaser, exclusive: true, config: config)
+
+        XCTAssertNotEqual(firstLease, secondLease)
+    }
+
+    /// A leaser that already holds a lease is expected to get an error when it
+    /// asks for a second one.
+    func test_lease_twice() async throws {
+        let control = MockSimulatorControl()
+        let manager = SimulatorManager(
+            simulatorControl: control,
+            deleteRecentlyUsedIdleAfter: 0,
+            deleteIdleAfter: 0,
+            recentlyUsedCapacity: 1,
+            deleteOnPIDExit: false
+        )
+        let leaser: PID = 1234
+        let config = SimulatorConfig(deviceType: "iPhone 14", os: "iOS", version: "16.4")
+
+        // The first lease only sets up the state; its result isn't needed.
+        _ = try await manager.lease(to: leaser, exclusive: true, config: config)
+
+        // swiftformat:disable:next hoistAwait
+        try await assertThrowsAsyncError(
+            await manager.lease(to: leaser, exclusive: true, config: config)
+        )
+    }
+
+    /// Releasing for a leaser that never leased anything is expected to throw.
+    func test_release_alone() async throws {
+        let control = MockSimulatorControl()
+        let manager = SimulatorManager(
+            simulatorControl: control,
+            deleteRecentlyUsedIdleAfter: 0,
+            deleteIdleAfter: 0,
+            recentlyUsedCapacity: 1,
+            deleteOnPIDExit: false
+        )
+        let leaser: PID = 1234
+
+        // swiftformat:disable:next hoistAwait
+        try await assertThrowsAsyncError(await manager.release(for: leaser))
+    }
+}
+
+/// Fails the current test unless evaluating `expression` throws.
+///
+/// - Parameters:
+///   - expression: The asynchronous, throwing expression to evaluate; its
+///     result is discarded.
+///   - message: Failure message to report; when empty, a default message is
+///     used instead.
+///   - file: File reported for the failure (defaults to the call site).
+///   - line: Line reported for the failure (defaults to the call site).
+///   - errorHandler: Called with the thrown error, e.g. to inspect it further.
+func assertThrowsAsyncError(
+    _ expression: @autoclosure () async throws -> some Any,
+    _ message: @autoclosure () -> String = "",
+    file: StaticString = #filePath,
+    line: UInt = #line,
+    _ errorHandler: (_ error: Error) -> Void = { _ in }
+) async {
+    do {
+        _ = try await expression()
+    } catch {
+        // This is the expected outcome: hand the error over and stop here.
+        errorHandler(error)
+        return
+    }
+
+    // No error was thrown, which is the failure case.
+    let failureText = message()
+    XCTFail(
+        failureText.isEmpty ? "Asynchronous call did not throw an error." : failureText,
+        file: file,
+        line: line
+    )
+}
+
+/// In-memory `SimulatorControl` for the tests. It only records which base
+/// simulators were created per config and which base each clone came from.
+actor MockSimulatorControl: SimulatorControl {
+    var baseSimulators: [SimulatorConfig: [SimulatorUDID]] = [:]
+    var baseForClones: [SimulatorUDID: SimulatorUDID] = [:]
+
+    /// Records a fresh random UDID as a base simulator for `config` and
+    /// returns it.
+    func createBase(
+        name: String,
+        with config: SimulatorConfig,
+        runtimeIdentifier: String
+    ) async throws -> SimulatorUDID {
+        let baseUDID = UUID().uuidString
+        baseSimulators[config, default: []].append(baseUDID)
+        return baseUDID
+    }
+
+    /// Returns a fresh random UDID and records `simulator` as its base.
+    func clone(
+        _ simulator: SimulatorUDID,
+        name: String,
+        deviceType: String,
+        runtimeIdentifier: String,
+        postBoot: String?
+    ) async throws -> SimulatorUDID {
+        let cloneUDID = UUID().uuidString
+        baseForClones[cloneUDID] = simulator
+        return cloneUDID
+    }
+
+    /// No-op in the mock.
+    func ensureBooted(
+        _ simulator: SimulatorUDID,
+        context: @escaping @autoclosure () -> String?
+    ) async throws {}
+
+    /// No-op in the mock.
+    func cleanTempFiles(in simulator: SimulatorUDID) {}
+
+    func delete(
+        _ simulator: SimulatorUDID,
+        name: String,
+        context: @escaping @autoclosure () -> String?
+    ) async throws {
+        return
+    }
+
+    func getExisting(
+        name: String,
+        deviceType: String,
+        runtimeIdentifier: String,
+        context: @escaping @autoclosure () -> String?
+    ) async throws -> String? {
+        return nil
+    }
+}
diff --git a/apple/testing/simulator_manager/prepare_simulator.sh b/apple/testing/simulator_manager/prepare_simulator.sh
new file mode 100755
index 0000000000..cb27276c1c
--- /dev/null
+++ b/apple/testing/simulator_manager/prepare_simulator.sh
@@ -0,0 +1,59 @@
+#!/bin/bash
+
+set -euo pipefail
+
+# Scratch copy of the `com.apple.iphonesimulator` defaults domain; removed on
+# exit.
+readonly plist="/tmp/$SIMULATOR_UDID.simulator.defaults.plist"
+trap 'rm -f "$plist"' EXIT
+
+# Sets a value in the `com.apple.iphonesimulator` defaults domain, creating
+# the key and any missing intermediate dictionaries.
+#
+# Arguments:
+#   $1: colon-separated key path (e.g. `DevicePreferences:<udid>:<key>`)
+#   $2: PlistBuddy type used when the key has to be added (e.g. `bool`)
+#   $3: value to store
+function set_or_add_to_simulator_app_defaults() {
+  local -r _keypath="$1"
+  local -r _type="$2"
+  local -r _value="$3"
+  local -a _parts
+  local i
+  IFS=':' read -r -a _parts <<< "$_keypath"
+
+  local -r _last_index=$((${#_parts[@]} - 1))
+
+  # Export the plist to a temporary file
+  /usr/bin/defaults export com.apple.iphonesimulator "$plist"
+
+  # Create intermediate dicts
+  local _prefix=""
+  for ((i = 0; i < ${#_parts[@]} - 1; i++)); do
+    _prefix="$_prefix:${_parts[i]}"
+    /usr/libexec/PlistBuddy -c "Print $_prefix" "$plist" \
+      > /dev/null 2>&1 || /usr/libexec/PlistBuddy -c "Add $_prefix dict" "$plist"
+  done
+
+  # Set or add the final key. `Set` takes only `<entry> <value>` (the type is
+  # that of the existing entry); `Add` is the command that takes a type.
+  local -r _full="$_prefix:${_parts[$_last_index]}"
+  if
+    ! /usr/libexec/PlistBuddy -c "Set $_full $_value" "$plist" \
+      2> /dev/null
+  then
+    /usr/libexec/PlistBuddy -c "Add $_full $_type $_value" "$plist"
+  fi
+
+  # Import the modified plist back to the simulator
+  /usr/bin/defaults import com.apple.iphonesimulator "$plist"
+}
+
+# Disable hardware keyboard
+set_or_add_to_simulator_app_defaults \
+  "DevicePreferences:$SIMULATOR_UDID:ConnectHardwareKeyboard" "bool" "false"
+
+# Disable slide typing prompt
+/usr/bin/xcrun simctl spawn "$SIMULATOR_UDID" \
+  defaults write com.apple.keyboard.preferences \
+  DidShowContinuousPathIntroduction 1
diff --git a/apple/testing/simulator_manager/start.sh b/apple/testing/simulator_manager/start.sh
new file mode 100755
index 0000000000..596ab915f7
--- /dev/null
+++ b/apple/testing/simulator_manager/start.sh
@@ -0,0 +1,267 @@
+#!/bin/bash
+
+set -euo pipefail
+
+#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! WARNING !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+#
+# Only change this version after all testing has happened.
+#
+# If this version is changed then CI won't use the staging RBE pool, and the
+# changes will impact all executors in the default pool.
+#
+#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! WARNING !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+readonly non_staging_version=41
+
+if [[ -z "${EXAMPLE_CI_STAGING_VERSION:-}" ]]; then
+  readonly expected_version="$non_staging_version"
+else
+  readonly expected_version="staging-$EXAMPLE_CI_STAGING_VERSION"
+fi
+
+# Decides whether a running simulator manager has to be replaced.
+#
+# Arguments:
+#   $1: version reported by the running manager (may be empty)
+#
+# Returns 0 (shut it down) or 1 (keep it, or nothing is running). Without a
+# staging version, an older numeric version or any non-numeric version is
+# replaced; with a staging version, anything other than the exact expected
+# version is replaced.
+function check_need_shutdown() {
+  local -r current_version="$1"
+
+  if [[ -z "$current_version" ]]; then
+    return 1
+  fi
+
+  if [[ -z "${EXAMPLE_CI_STAGING_VERSION:-}" ]]; then
+    if [[ $current_version =~ ^[0-9]+$ ]]; then
+      [[ "$current_version" -lt "$expected_version" ]]
+    else
+      return 0
+    fi
+  else
+    [[ "$current_version" != "$expected_version" ]]
+  fi
+}
+
+# While we are only allowing a single simulator on remote executors we need to
+# more aggressively clean up unused simulators. If this is too large, then we
+# have an extra 6GB memory usage sticking around. If this is too small, then we
+# will constantly be deleting and creating simulators, adding at least 10
+# seconds to test runtimes.
+readonly delete_after_idle_secs=0
+readonly delete_recently_used_after_idle_secs="${EXAMPLE_SIMULATOR_MANAGER_DELETE_RECENTLY_USED_AFTER_IDLE_SECS:-60}" # TODO: Tweak this more. Adjusted to 1 minute to help BuildBuddy with disk space issues.
+readonly recently_used_capacity="${EXAMPLE_SIMULATOR_MANAGER_RECENTLY_USED_CAPACITY:-1}"
+
+readonly mutex_timeout=60
+readonly shutdown_timeout=45
+readonly startup_timeout=10
+readonly stale_mutex_seconds=120
+readonly socket="/tmp/simulator_manager.sock"
+readonly pid_path="/tmp/simulator_manager.pid"
+readonly mutex_path="/tmp/simulator_manager_start.lock"
+readonly scripts_path="/tmp/simulator_manager.scripts"
+
+# Releases the start lock and clears the EXIT trap installed by enterMutex.
+function exitMutex() {
+  rmdir "$mutex_path" 2> /dev/null || true
+  trap - EXIT
+}
+
+# Acquires the start lock, a directory created with `mkdir` (which fails if it
+# already exists), retrying once a second for up to `mutex_timeout` seconds.
+# A lock older than `stale_mutex_seconds` is removed first. Returns 0 once the
+# lock is held (installing an EXIT trap to release it), 1 on timeout.
+function enterMutex() {
+  if [[ -d "$mutex_path" ]]; then
+    local mtime
+    # `%B` is the BSD `stat` format for the file's birth time (epoch seconds)
+    mtime=$(stat -f %B "$mutex_path")
+    local now
+    now=$(date +%s)
+    if (((now - mtime) > stale_mutex_seconds)); then
+      echo >&2 "$(date '+[%H:%M:%S]') ⚠️ Deleting stale lock file. This" \
+        "shouldn't happen."
+      rm -rf "$mutex_path"
+    fi
+  fi
+
+  for _ in $(seq 1 "$mutex_timeout"); do
+    if mkdir "$mutex_path" 2> /dev/null; then
+      trap 'exitMutex' EXIT
+      return 0
+    fi
+    sleep 1
+  done
+
+  return 1
+}
+
+if ! enterMutex; then
+  echo >&2 "$(date '+[%H:%M:%S]') ❌ Failed to acquire lock after" \
+    "$mutex_timeout seconds; exiting"
+  exit 1
+fi
+
+# Look for a running simulator manager through its pid file and, if the
+# process is alive, ask it for its version over the Unix socket.
+if
+  { server_pid=$(< "$pid_path"); } 2> /dev/null \
+    && ps -p "$server_pid" > /dev/null
+then
+  echo "$(date '+[%H:%M:%S]') Existing simulator manager found; checking" \
+    "version"
+  if version=$(
+    curl \
+      --silent \
+      --fail-with-body \
+      --unix-socket "$socket" \
+      -XGET \
+      'http:/-/version'
+  ); then
+    echo "$(date '+[%H:%M:%S]') Existing simulator manager version is $version"
+  else
+    echo >&2 "$(date '+[%H:%M:%S]') ❌ Failed to get simulator manager" \
+      "version: $version"
+    server_pid=""
+    version=""
+  fi
+else
+  echo "$(date '+[%H:%M:%S]') Existing simulator manager not found"
+  server_pid=""
+  version=""
+fi
+
+if check_need_shutdown "$version"; then
+  echo "$(date '+[%H:%M:%S]') Shutting down existing simulator manager to" \
+    "upgrade to version $expected_version"
+  if response=$(
+    curl \
+      --silent \
+      --fail-with-body \
+      --unix-socket "$socket" \
+      -XPOST \
+      'http:/-/shutdown'
+  ); then
+    echo "$(date '+[%H:%M:%S]') Sent graceful shutdown request to simulator" \
+      "manager"
+
+    for _ in $(seq 1 "$shutdown_timeout"); do
+      if ! ps -p "$server_pid" > /dev/null; then
+        echo "$(date '+[%H:%M:%S]') Simulator manager shut down successfully"
+        break
+      fi
+      sleep 1
+    done
+    if ps -p "$server_pid" > /dev/null; then
+      echo >&2 "$(date '+[%H:%M:%S]') 🛑 Simulator manager did not shutdown" \
+        "in $shutdown_timeout seconds; killing it"
+      # The process may exit between the check above and the kill; that must
+      # not abort the script under `set -e`.
+      kill -9 "$server_pid" || true
+    fi
+  else
+    echo >&2 "$(date '+[%H:%M:%S]') 🛑 Failed to gracefully shut down" \
+      "simulator manager ($response); killing it"
+    # The request may have failed because the process is already gone, so a
+    # failing kill must not abort the script under `set -e`.
+    kill -9 "$server_pid" || true
+  fi
+
+  # Kill all straggler processes, in case they started under a different pid
+  killall simulator_manager_opt 2> /dev/null || true
+
+  server_pid=""
+  version=""
+fi
+
+if [[ -z "$version" ]]; then
+  rm -f "$pid_path" || true
+
+  echo "$(date '+[%H:%M:%S]') Starting simulator manager"
+
+  # --- begin runfiles.bash initialization v3 ---
+  # Copy-pasted from the Bazel Bash runfiles library v3.
+  set +e
+  f=bazel_tools/tools/bash/runfiles/runfiles.bash
+  # shellcheck disable=SC1090
+  source "${RUNFILES_DIR:-/dev/null}/$f" 2> /dev/null \
+    || source "$(grep -sm1 "^$f " "${RUNFILES_MANIFEST_FILE:-/dev/null}" | cut -f2- -d' ')" 2> /dev/null \
+    || source "$0.runfiles/$f" 2> /dev/null \
+    || source "$(grep -sm1 "^$f " "$0.runfiles_manifest" | cut -f2- -d' ')" 2> /dev/null \
+    || source "$(grep -sm1 "^$f " "$0.exe.runfiles_manifest" | cut -f2- -d' ')" 2> /dev/null \
+    || {
+      echo >&2 "ERROR: cannot find $f"
+      exit 1
+    }
+  f=
+  set -e
+  # --- end runfiles.bash initialization v3 ---
+
+  # NOTE(review): these runfiles paths point at `tools/snoozel/...`, while this
+  # script is added under `apple/testing/simulator_manager`; confirm they are
+  # where `simulator_manager_opt` and `prepare_simulator` land in runfiles.
+  simulator_manager="$(
+    rlocation _main/tools/snoozel/simulator_manager/simulator_manager_opt
+  )"
+  bazel_prepare_simulator="$(
+    rlocation _main/tools/snoozel/simulator_manager/prepare_simulator
+  )"
+
+  # We need to copy the prepare_simulator script to a location that will stick
+  # around after the test runner exits. On RBE the workspace is deleted after
+  # an action runs, and if the simulator manager tries to run this script then
+  # it will fail with an error like:
+  #
+  # shell-init: error retrieving current directory: getcwd: cannot access parent directories: No such file or directory
+  # chdir: error retrieving current directory: getcwd: cannot access parent directories: No such file or directory
+  readonly prepare_simulator="$scripts_path/prepare_simulator.sh"
+  mkdir -p "$scripts_path"
+  cp "$bazel_prepare_simulator" "$prepare_simulator"
+
+  # Run server in background, in a new process group
+  # Adjust `PATH` since it's limited when the simulator manager is started in a
+  # test runner (in particular it doesn't have `/usr/sbin:/sbin` in `PATH`)
+  PATH="/usr/bin:/bin:/usr/sbin:/sbin" \
+    perl -e 'use POSIX setsid; setsid or die "setsid: $!"; exec @ARGV' \
+    "$simulator_manager" \
+    --version \
+    "$expected_version" \
+    --pid-path \
+    "$pid_path" \
+    --unix-socket-path \
+    "$socket" \
+    --delete-recently-used-idle-after \
+    "$delete_recently_used_after_idle_secs" \
+    --delete-idle-after \
+    "$delete_after_idle_secs" \
+    --recently-used-capacity \
+    "$recently_used_capacity" \
+    "--post-boot" \
+    "$prepare_simulator" \
+    > /dev/null 2>&1 \
+    &
+
+  # Wait for the pid file to appear, for up to `startup_timeout` seconds.
+  for _ in $(seq 1 "$startup_timeout"); do
+    if [[ -f "$pid_path" ]]; then
+      break
+    fi
+    sleep 1
+  done
+  if [[ ! -f "$pid_path" ]]; then
+    echo >&2 "$(date '+[%H:%M:%S]') error: ❌ Failed to start simulator" \
+      "manager in $startup_timeout seconds; exiting"
+    exit 1
+  fi
+
+  server_pid=$(< "$pid_path")
+
+  echo "$(date '+[%H:%M:%S]') ✅ Started simulator manager with pid $server_pid"
+else
+  echo "$(date '+[%H:%M:%S]') ✅ Simulator manager with version $version" \
+    "already running with pid $server_pid"
+fi
+
+exitMutex