diff --git a/.gitignore b/.gitignore index 38c671ead86..85cc2761334 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,7 @@ .DS_Store .artifactory .idea/* +**/.idea/* .ensime_cache/* .config/* .local/* @@ -47,3 +48,6 @@ private_docker_papi_v2_usa.options tesk_application_ftp.conf ftp_centaur_cwl_runner.conf tesk_application.conf +**/__pycache__/ +**/venv/ +exome_germline_single_sample_v1.3/ diff --git a/.travis.yml b/.travis.yml index c59e645e2b2..301a2894a22 100644 --- a/.travis.yml +++ b/.travis.yml @@ -108,6 +108,8 @@ env: BUILD_TYPE=dbms - >- BUILD_TYPE=singleWorkflowRunner + - >- + BUILD_TYPE=metadataComparisonPython script: - src/ci/bin/test.sh notifications: diff --git a/CHANGELOG.md b/CHANGELOG.md index 2799e39ec59..cfbcde6e932 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,37 @@ # Cromwell Change Log +## 51 Release Notes + +### Changes and Warnings + +The configuration format for call cache blacklisting has been updated, please see the [call caching documentation]( +https://cromwell.readthedocs.io/en/stable/Configuring/#call-caching) for details. + +### Bug fixes + +* Fixed a bug where the `size(...)` function did not work correctly on files + from a shared filesystem if `size(...)` was called in the input section on a + relative path. ++ Fixed a bug where the `use_relative_output_paths` option would not preserve intermediate folders. + +### New functionality + +#### Call caching blacklisting improvements + +Cromwell previously supported blacklisting GCS buckets containing cache hits which could not be copied for permissions +reasons. Cromwell now adds support for blacklisting individual cache hits which could not be copied for any reason, +as well as grouping blacklist caches according to a workflow option key. More information available in the [ +call caching documentation]( https://cromwell.readthedocs.io/en/stable/Configuring/#call-caching). 
+ +#### new xxh64 and fingerprint strategies for call caching + +Existing call cache strategies `path` and `path+modtime` don't work when using docker on shared filesystems +(SFS backend, i.e. not in cloud storage). The `file` (md5sum) strategy works, but uses a lot of resources. +Two faster strategies have been added for this use case: `xxh64` and +`fingerprint`. `xxh64` is a lightweight hashing algorithm, `fingerprint` is a strategy designed to be very +lightweight. Read more about it in the [call caching documentation]( +https://cromwell.readthedocs.io/en/stable/Configuring/#call-caching). + ## 50 Release Notes ### Changes and Warnings @@ -11,7 +43,6 @@ Cromwell's metadata archival configuration has changed in a backwards incompatib please see [the updated documentation](https://cromwell.readthedocs.io/en/stable/Configuring#hybrid-metadata-storage-classic-carbonite) for details. - ## 49 Release Notes ### Changes and Warnings diff --git a/backend/src/main/scala/cromwell/backend/BackendCacheHitCopyingActor.scala b/backend/src/main/scala/cromwell/backend/BackendCacheHitCopyingActor.scala index e6bae1bebf7..54849df4250 100644 --- a/backend/src/main/scala/cromwell/backend/BackendCacheHitCopyingActor.scala +++ b/backend/src/main/scala/cromwell/backend/BackendCacheHitCopyingActor.scala @@ -3,15 +3,19 @@ package cromwell.backend import cromwell.backend.MetricableCacheCopyErrorCategory.MetricableCacheCopyErrorCategory import cromwell.core.JobKey import cromwell.core.simpleton.WomValueSimpleton +import cromwell.services.CallCaching.CallCachingEntryId object BackendCacheHitCopyingActor { - final case class CopyOutputsCommand(womValueSimpletons: Seq[WomValueSimpleton], jobDetritusFiles: Map[String, String], returnCode: Option[Int]) + final case class CopyOutputsCommand(womValueSimpletons: Seq[WomValueSimpleton], jobDetritusFiles: Map[String, String], cacheHit: CallCachingEntryId, returnCode: Option[Int]) - final case class CopyingOutputsFailedResponse(jobKey: JobKey, 
cacheCopyAttempt: Int, failure: CacheCopyError) + final case class CopyingOutputsFailedResponse(jobKey: JobKey, cacheCopyAttempt: Int, failure: CacheCopyFailure) - sealed trait CacheCopyError - final case class LoggableCacheCopyError(failure: Throwable) extends CacheCopyError - final case class MetricableCacheCopyError(failureCategory: MetricableCacheCopyErrorCategory) extends CacheCopyError + sealed trait CacheCopyFailure + /** A cache hit copy was attempted but failed. */ + final case class CopyAttemptError(failure: Throwable) extends CacheCopyFailure + /** Copying was requested for a blacklisted cache hit, however the cache hit copying actor found the hit had already + * been blacklisted so no novel copy attempt was made. */ + final case class BlacklistSkip(failureCategory: MetricableCacheCopyErrorCategory) extends CacheCopyFailure } object MetricableCacheCopyErrorCategory { @@ -20,4 +24,5 @@ object MetricableCacheCopyErrorCategory { override def toString: String = getClass.getSimpleName.stripSuffix("$").toLowerCase } final case object BucketBlacklisted extends MetricableCacheCopyErrorCategory + final case object HitBlacklisted extends MetricableCacheCopyErrorCategory } diff --git a/backend/src/main/scala/cromwell/backend/standard/callcaching/BlacklistCache.scala b/backend/src/main/scala/cromwell/backend/standard/callcaching/BlacklistCache.scala index b1878731fda..ff3248123ff 100644 --- a/backend/src/main/scala/cromwell/backend/standard/callcaching/BlacklistCache.scala +++ b/backend/src/main/scala/cromwell/backend/standard/callcaching/BlacklistCache.scala @@ -2,23 +2,59 @@ package cromwell.backend.standard.callcaching import com.google.common.cache.{CacheBuilder, CacheLoader} import cromwell.core.CacheConfig +import cromwell.services.CallCaching.CallCachingEntryId -case class BlacklistCache(config: CacheConfig) { - val cache = { - // Queries to the blacklist cache return false by default (i.e. not blacklisted). 
- val falseLoader = new CacheLoader[String, java.lang.Boolean]() { - override def load(key: String): java.lang.Boolean = false +sealed trait BlacklistStatus +case object BadCacheResult extends BlacklistStatus +case object GoodCacheResult extends BlacklistStatus +case object UntestedCacheResult extends BlacklistStatus + +sealed abstract class BlacklistCache(bucketCacheConfig: CacheConfig, + hitCacheConfig: CacheConfig, + val name: Option[String]) { + val bucketCache = { + // Queries to the bucket blacklist cache return UntestedCacheResult by default. + val unknownLoader = new CacheLoader[String, BlacklistStatus]() { + override def load(key: String): BlacklistStatus = UntestedCacheResult + } + + CacheBuilder. + newBuilder(). + concurrencyLevel(bucketCacheConfig.concurrency). + maximumSize(bucketCacheConfig.size). + expireAfterWrite(bucketCacheConfig.ttl.length, bucketCacheConfig.ttl.unit). + build[String, BlacklistStatus](unknownLoader) + } + + val hitCache = { + // Queries to the hit blacklist cache return UntestedCacheResult by default (i.e. not blacklisted). + val unknownLoader = new CacheLoader[CallCachingEntryId, BlacklistStatus]() { + override def load(key: CallCachingEntryId): BlacklistStatus = UntestedCacheResult } CacheBuilder. newBuilder(). - concurrencyLevel(config.concurrency). - maximumSize(config.size). - expireAfterWrite(config.ttl.length, config.ttl.unit). - build[String, java.lang.Boolean](falseLoader) + concurrencyLevel(hitCacheConfig.concurrency). + maximumSize(hitCacheConfig.size). + expireAfterWrite(hitCacheConfig.ttl.length, hitCacheConfig.ttl.unit). 
+ build[CallCachingEntryId, BlacklistStatus](unknownLoader) } - def isBlacklisted(bucket: String): Boolean = cache.get(bucket) + def getBlacklistStatus(hit: CallCachingEntryId): BlacklistStatus = hitCache.get(hit) - def blacklist(bucket: String): Unit = cache.put(bucket, true) + def getBlacklistStatus(bucket: String): BlacklistStatus = bucketCache.get(bucket) + + def blacklist(hit: CallCachingEntryId): Unit = hitCache.put(hit, BadCacheResult) + + def blacklist(bucket: String): Unit = bucketCache.put(bucket, BadCacheResult) + + def whitelist(hit: CallCachingEntryId): Unit = hitCache.put(hit, GoodCacheResult) + + def whitelist(bucket: String): Unit = bucketCache.put(bucket, GoodCacheResult) } + +class RootWorkflowBlacklistCache(bucketCacheConfig: CacheConfig, hitCacheConfig: CacheConfig) extends + BlacklistCache(bucketCacheConfig = bucketCacheConfig, hitCacheConfig = hitCacheConfig, name = None) + +class GroupingBlacklistCache(bucketCacheConfig: CacheConfig, hitCacheConfig: CacheConfig, val group: String) extends + BlacklistCache(bucketCacheConfig = bucketCacheConfig, hitCacheConfig = hitCacheConfig, name = Option(group)) diff --git a/backend/src/main/scala/cromwell/backend/standard/callcaching/CallCachingBlacklistManager.scala b/backend/src/main/scala/cromwell/backend/standard/callcaching/CallCachingBlacklistManager.scala new file mode 100644 index 00000000000..a22631aeeb4 --- /dev/null +++ b/backend/src/main/scala/cromwell/backend/standard/callcaching/CallCachingBlacklistManager.scala @@ -0,0 +1,131 @@ +package cromwell.backend.standard.callcaching + +import akka.event.LoggingAdapter +import com.google.common.cache.{CacheBuilder, CacheLoader, LoadingCache} +import com.typesafe.config.Config +import cromwell.core.{CacheConfig, HasWorkflowIdAndSources} +import mouse.boolean._ +import net.ceedubs.ficus.Ficus._ + +import scala.concurrent.duration._ +import scala.language.postfixOps + +object CallCachingBlacklistManager { + object Defaults { + object Groupings { + val 
Concurrency = 10000 + val Size = 1000L + val Ttl = 2 hours + } + object Hits { + val Concurrency = 10000 + val Size = 20000L + val Ttl = 1 hour + } + object Buckets { + val Concurrency = 10000 + val Size = 1000L + val Ttl = 1 hour + } + } +} + +class CallCachingBlacklistManager(rootConfig: Config, logger: LoggingAdapter) { + + // Defined if "call-caching.blacklist-cache.enabled = true". + private val blacklistCacheConfig: Option[Unit] = + rootConfig.getOrElse("call-caching.blacklist-cache.enabled", false).option(()) + + // Defined if `blacklistCacheConfig` is defined and "call-caching.blacklist-cache.groupings.workflow-option" is defined. + private val blacklistGroupingWorkflowOptionKey: Option[String] = for { + _ <- blacklistCacheConfig // Only return a groupings cache if blacklisting is enabled. + workflowOption <- rootConfig.as[Option[String]]("call-caching.blacklist-cache.groupings.workflow-option") + } yield workflowOption + + // Defined if `blacklistGroupingWorkflowOptionKey` is defined. + private val blacklistGroupingCacheConfig: Option[CacheConfig] = { + import CallCachingBlacklistManager.Defaults.Groupings._ + for { + _ <- blacklistGroupingWorkflowOptionKey + groupingsOption = rootConfig.as[Option[Config]] ("call-caching.blacklist-cache.groupings") + conf = CacheConfig.config(groupingsOption, defaultConcurrency = Concurrency, defaultSize = Size, defaultTtl = Ttl) + } yield conf + } + + // Defined if `blacklistCacheConfig` is defined. + private val blacklistBucketCacheConfig: Option[CacheConfig] = { + import CallCachingBlacklistManager.Defaults.Buckets._ + for { + _ <- blacklistCacheConfig + bucketsOption = rootConfig.as[Option[Config]]("call-caching.blacklist-cache.buckets") + conf = CacheConfig.config(bucketsOption, defaultConcurrency = Concurrency, defaultSize = Size, defaultTtl = Ttl) + } yield conf + } + + // Defined if `blacklistCacheConfig` is defined. 
+ private val blacklistHitCacheConfig: Option[CacheConfig] = { + import CallCachingBlacklistManager.Defaults.Hits._ + for { + _ <- blacklistCacheConfig + hitsOption = rootConfig.as[Option[Config]]("call-caching.blacklist-cache.hits") + conf = CacheConfig.config(hitsOption, defaultConcurrency = Concurrency, defaultSize = Size, defaultTtl = Ttl) + } yield conf + } + + // If configuration allows, build a cache of blacklist groupings to BlacklistCaches. + private val blacklistGroupingsCache: Option[LoadingCache[String, BlacklistCache]] = { + def buildBlacklistGroupingsCache(groupingConfig: CacheConfig, bucketConfig: CacheConfig, hitConfig: CacheConfig): LoadingCache[String, BlacklistCache] = { + val emptyBlacklistCacheLoader = new CacheLoader[String, BlacklistCache]() { + override def load(key: String): BlacklistCache = new GroupingBlacklistCache( + bucketCacheConfig = bucketConfig, + hitCacheConfig = hitConfig, + group = key + ) + } + + CacheBuilder. + newBuilder(). + concurrencyLevel(groupingConfig.concurrency). + maximumSize(groupingConfig.size). + expireAfterWrite(groupingConfig.ttl.length, groupingConfig.ttl.unit). + build[String, BlacklistCache](emptyBlacklistCacheLoader) + } + + for { + groupingsConfig <- blacklistGroupingCacheConfig + bucketsConfig <- blacklistBucketCacheConfig + hitsConfig <- blacklistHitCacheConfig + } yield buildBlacklistGroupingsCache(groupingsConfig, bucketsConfig, hitsConfig) + } + + /** + * If configured return a group blacklist cache, otherwise if configured return a root workflow cache, + * otherwise return nothing. + */ + def blacklistCacheFor(workflow: HasWorkflowIdAndSources): Option[BlacklistCache] = { + // If configuration is set up for blacklist groups and a blacklist group is specified in workflow options, + // get the BlacklistCache for the group. 
+ val groupBlacklistCache: Option[BlacklistCache] = for { + groupings <- blacklistGroupingsCache + groupKey <- blacklistGroupingWorkflowOptionKey + groupFromWorkflowOptions <- workflow.sources.workflowOptions.get(groupKey).toOption + } yield groupings.get(groupFromWorkflowOptions) + + // Build a blacklist cache for a single, ungrouped root workflow. + def rootWorkflowBlacklistCache: Option[BlacklistCache] = for { + bucketConfig <- blacklistBucketCacheConfig + hitConfig <- blacklistHitCacheConfig + } yield new RootWorkflowBlacklistCache(bucketCacheConfig = bucketConfig, hitCacheConfig = hitConfig) + + // Return the group blacklist cache if available, otherwise a blacklist cache for the root workflow. + val maybeCache = groupBlacklistCache orElse rootWorkflowBlacklistCache + maybeCache collect { + case group: GroupingBlacklistCache => + logger.info("Workflow {} using group blacklist cache '{}' containing blacklist status for {} hits and {} buckets.", + workflow.id, group.group, group.hitCache.size(), group.bucketCache.size()) + case _: RootWorkflowBlacklistCache => + logger.info("Workflow {} using root workflow blacklist cache.", workflow.id) + } + maybeCache + } +} diff --git a/backend/src/main/scala/cromwell/backend/standard/callcaching/CopyingActorBlacklistCacheSupport.scala b/backend/src/main/scala/cromwell/backend/standard/callcaching/CopyingActorBlacklistCacheSupport.scala new file mode 100644 index 00000000000..5257c636027 --- /dev/null +++ b/backend/src/main/scala/cromwell/backend/standard/callcaching/CopyingActorBlacklistCacheSupport.scala @@ -0,0 +1,160 @@ +package cromwell.backend.standard.callcaching +import cats.data.NonEmptyList +import cromwell.backend.BackendCacheHitCopyingActor.CopyOutputsCommand +import cromwell.core.io.{IoCommand, IoCopyCommand} +import cromwell.services.CallCaching.CallCachingEntryId + + +object CopyingActorBlacklistCacheSupport { + trait HasFormatting { + def metricFormat: String = getClass.getName.toLowerCase.split('$').last + } 
+ + sealed trait Verb extends HasFormatting + case object Read extends Verb + case object Write extends Verb + + sealed trait EntityType extends HasFormatting + case object Hit extends EntityType + case object Bucket extends EntityType + + sealed trait CacheReadType + case object ReadHitOnly + case object ReadHitAndBucket +} + +trait CopyingActorBlacklistCacheSupport { + this: StandardCacheHitCopyingActor => + + import CopyingActorBlacklistCacheSupport._ + + def handleBlacklistingForGenericFailure(): Unit = { + // Not a forbidden failure so do not blacklist the bucket but do blacklist the hit. + for { + data <- stateData + cache <- standardParams.blacklistCache + _ = blacklistAndMetricHit(cache, data.cacheHit) + } yield () + () + } + + /* Whitelist by bucket and hit if appropriate. */ + def handleWhitelistingForSuccess(command: IoCommand[_]): Unit = { + for { + cache <- standardParams.blacklistCache + data <- stateData + _ = whitelistAndMetricHit(cache, data.cacheHit) + copy <- Option(command) collect { case c: IoCopyCommand => c } + prefix <- extractBlacklistPrefix(copy.source.toString) + _ = whitelistAndMetricBucket(cache, prefix) + } yield () + () + } + + def publishBlacklistMetric(blacklistCache: BlacklistCache, verb: Verb, entityType: EntityType, key: String, value: BlacklistStatus): Unit = { + val group = blacklistCache.name.getOrElse("none") + val metricPath = NonEmptyList.of( + "job", + "callcaching", "blacklist", verb.metricFormat, entityType.metricFormat, jobDescriptor.taskCall.localName, group, key, value.toString) + increment(metricPath) + } + + def blacklistAndMetricHit(blacklistCache: BlacklistCache, hit: CallCachingEntryId): Unit = { + blacklistCache.getBlacklistStatus(hit) match { + case UntestedCacheResult => + blacklistCache.blacklist(hit) + publishBlacklistMetric(blacklistCache, Write, Hit, hit.id.toString, value = BadCacheResult) + case BadCacheResult => + // Not a surprise, race conditions abound in cache hit copying. 
Do not overwrite with the same value or + // multiply publish metrics for this hit. + case GoodCacheResult => + // This hit was thought to be good but now a copy has failed for permissions reasons. Be conservative and + // mark the hit as BadCacheResult and log this strangeness. + log.warning( + "Cache hit {} found in GoodCacheResult blacklist state, but cache hit copying has failed for permissions reasons. Overwriting status to BadCacheResult state.", + hit.id) + blacklistCache.blacklist(hit) + publishBlacklistMetric(blacklistCache, Write, Hit, hit.id.toString, value = BadCacheResult) + } + } + + def blacklistAndMetricBucket(blacklistCache: BlacklistCache, bucket: String): Unit = { + blacklistCache.getBlacklistStatus(bucket) match { + case UntestedCacheResult => + blacklistCache.blacklist(bucket) + publishBlacklistMetric(blacklistCache, Write, Bucket, bucket, value = BadCacheResult) + case BadCacheResult => + // Not a surprise, race conditions abound in cache hit copying. Do not overwrite with the same value or + // multiply publish metrics for this bucket. + case GoodCacheResult => + // This bucket was thought to be good but now a copy has failed for permissions reasons. Be conservative and + // mark the bucket as BadCacheResult and log this strangeness. + log.warning( + "Bucket {} found in GoodCacheResult blacklist state, but cache hit copying has failed for permissions reasons. Overwriting status to BadCacheResult state.", + bucket) + blacklistCache.blacklist(bucket) + publishBlacklistMetric(blacklistCache, Write, Bucket, bucket, value = BadCacheResult) + } + } + + def whitelistAndMetricHit(blacklistCache: BlacklistCache, hit: CallCachingEntryId): Unit = { + blacklistCache.getBlacklistStatus(hit) match { + case UntestedCacheResult => + blacklistCache.whitelist(hit) + publishBlacklistMetric(blacklistCache, Write, Hit, hit.id.toString, value = GoodCacheResult) + case GoodCacheResult => // This hit is already known to be good, no need to rewrite or spam metrics. 
+ case BadCacheResult => + // This is surprising, a hit that we failed to copy before has now been the source of a successful copy. + // Don't overwrite this to GoodCacheResult, hopefully there are less weird cache hits out there. + log.warning( + "Cache hit {} found in BadCacheResult blacklist state, not overwriting to GoodCacheResult despite successful copy.", + hit.id) + } + } + + def whitelistAndMetricBucket(blacklistCache: BlacklistCache, bucket: String): Unit = { + blacklistCache.getBlacklistStatus(bucket) match { + case UntestedCacheResult => + blacklistCache.whitelist(bucket) + publishBlacklistMetric(blacklistCache, Write, Bucket, bucket, value = GoodCacheResult) + case GoodCacheResult => // This bucket is already known to be good, no need to rewrite or spam metrics. + case BadCacheResult => + // This is surprising, a bucket that we failed to copy from before for auth reasons has now been the source + // of a successful copy. Don't overwrite this to GoodCacheResult, hopefully there are less weird cache hits out there. + log.warning( + "Bucket {} found in BadCacheResult blacklist state, not overwriting to GoodCacheResult despite successful copy.", + bucket) + } + } + + def publishBlacklistReadMetrics(command: CopyOutputsCommand, cacheHit: CallCachingEntryId, cacheReadType: Product) = { + for { + c <- standardParams.blacklistCache + hitBlacklistStatus = c.getBlacklistStatus(cacheHit) + // If blacklisting is on the hit cache is always checked so publish a hit read metric. + _ = publishBlacklistMetric(c, Read, Hit, cacheHit.id.toString, hitBlacklistStatus) + // Conditionally publish the bucket read if the backend supports bucket / prefix blacklisting and the bucket was read. 
+ _ <- Option(cacheReadType).collect { case ReadHitAndBucket => () } + path = sourcePathFromCopyOutputsCommand(command) + prefix <- extractBlacklistPrefix(path) + bucketBlacklistStatus = c.getBlacklistStatus(prefix) + _ = publishBlacklistMetric(c, Read, Bucket, prefix, bucketBlacklistStatus) + } yield () + } + + def isSourceBlacklisted(command: CopyOutputsCommand): Boolean = { + val path = sourcePathFromCopyOutputsCommand(command) + (for { + cache <- standardParams.blacklistCache + prefix <- extractBlacklistPrefix(path) + value = cache.getBlacklistStatus(prefix) + } yield value == BadCacheResult).getOrElse(false) + } + + def isSourceBlacklisted(hit: CallCachingEntryId): Boolean = { + (for { + cache <- standardParams.blacklistCache + value = cache.getBlacklistStatus(hit) + } yield value == BadCacheResult).getOrElse(false) + } +} diff --git a/backend/src/main/scala/cromwell/backend/standard/callcaching/StandardCacheHitCopyingActor.scala b/backend/src/main/scala/cromwell/backend/standard/callcaching/StandardCacheHitCopyingActor.scala index df5d6a846c3..13bfd8abbce 100644 --- a/backend/src/main/scala/cromwell/backend/standard/callcaching/StandardCacheHitCopyingActor.scala +++ b/backend/src/main/scala/cromwell/backend/standard/callcaching/StandardCacheHitCopyingActor.scala @@ -7,11 +7,12 @@ import cats.instances.list._ import cats.instances.set._ import cats.instances.tuple._ import cats.syntax.foldable._ -import cromwell.backend.BackendCacheHitCopyingActor.{CacheCopyError, CopyOutputsCommand, CopyingOutputsFailedResponse, LoggableCacheCopyError, MetricableCacheCopyError} +import cromwell.backend.BackendCacheHitCopyingActor._ import cromwell.backend.BackendJobExecutionActor._ import cromwell.backend.BackendLifecycleActor.AbortJobCommand import cromwell.backend.io.JobPaths import cromwell.backend.standard.StandardCachingActorHelper +import cromwell.backend.standard.callcaching.CopyingActorBlacklistCacheSupport._ import 
cromwell.backend.standard.callcaching.StandardCacheHitCopyingActor._ import cromwell.backend.{BackendConfigurationDescriptor, BackendInitializationData, BackendJobDescriptor, MetricableCacheCopyErrorCategory} import cromwell.core.CallOutputs @@ -19,10 +20,13 @@ import cromwell.core.io._ import cromwell.core.logging.JobLogging import cromwell.core.path.{Path, PathCopier} import cromwell.core.simpleton.{WomValueBuilder, WomValueSimpleton} +import cromwell.services.CallCaching.CallCachingEntryId +import cromwell.services.instrumentation.CromwellInstrumentationActor import wom.values.WomSingleFile import scala.util.{Failure, Success, Try} + /** * Trait of parameters passed to a StandardCacheHitCopyingActor. */ @@ -79,6 +83,7 @@ object StandardCacheHitCopyingActor { case class StandardCacheHitCopyingActorData(commandsToWaitFor: List[Set[IoCommand[_]]], newJobOutputs: CallOutputs, newDetritus: DetritusMap, + cacheHit: CallCachingEntryId, returnCode: Option[Int] ) { @@ -109,6 +114,7 @@ object StandardCacheHitCopyingActor { private[callcaching] case object StillWaiting extends CommandSetState private[callcaching] case object AllCommandsDone extends CommandSetState private[callcaching] case class NextSubSet(commands: Set[IoCommand[_]]) extends CommandSetState + } class DefaultStandardCacheHitCopyingActor(standardParams: StandardCacheHitCopyingActorParams) extends StandardCacheHitCopyingActor(standardParams) @@ -117,7 +123,8 @@ class DefaultStandardCacheHitCopyingActor(standardParams: StandardCacheHitCopyin * Standard implementation of a BackendCacheHitCopyingActor. 
*/ abstract class StandardCacheHitCopyingActor(val standardParams: StandardCacheHitCopyingActorParams) - extends FSM[StandardCacheHitCopyingActorState, Option[StandardCacheHitCopyingActorData]] with JobLogging with StandardCachingActorHelper with IoClientHelper { + extends FSM[StandardCacheHitCopyingActorState, Option[StandardCacheHitCopyingActorData]] + with JobLogging with StandardCachingActorHelper with IoClientHelper with CromwellInstrumentationActor with CopyingActorBlacklistCacheSupport { override lazy val jobDescriptor: BackendJobDescriptor = standardParams.jobDescriptor override lazy val backendInitializationDataOption: Option[BackendInitializationData] = standardParams.backendInitializationDataOption @@ -139,48 +146,58 @@ abstract class StandardCacheHitCopyingActor(val standardParams: StandardCacheHit protected def duplicate(copyPairs: Set[PathPair]): Option[Try[Unit]] = None when(Idle) { - case Event(command: CopyOutputsCommand, None) if isSourceBlacklisted(command) => - // We don't want to log this because bucket blacklisting is a common and expected occurrence. 
- failAndStop(MetricableCacheCopyError(MetricableCacheCopyErrorCategory.BucketBlacklisted)) - - case Event(CopyOutputsCommand(simpletons, jobDetritus, returnCode), None) => - // Try to make a Path of the callRootPath from the detritus - lookupSourceCallRootPath(jobDetritus) match { - case Success(sourceCallRootPath) => - - // process simpletons and detritus to get updated paths and corresponding IoCommands - val processed = for { - (destinationCallOutputs, simpletonIoCommands) <- processSimpletons(simpletons, sourceCallRootPath) - (destinationDetritus, detritusIoCommands) <- processDetritus(jobDetritus) - } yield (destinationCallOutputs, destinationDetritus, simpletonIoCommands ++ detritusIoCommands) - - processed match { - case Success((destinationCallOutputs, destinationDetritus, detritusAndOutputsIoCommands)) => - duplicate(ioCommandsToCopyPairs(detritusAndOutputsIoCommands)) match { - // Use the duplicate override if exists - case Some(Success(_)) => succeedAndStop(returnCode, destinationCallOutputs, destinationDetritus) - case Some(Failure(failure)) => - // Something went wrong in the custom duplication code. 
We consider this loggable because it's most likely a user-permission error: - failAndStop(LoggableCacheCopyError(failure)) - // Otherwise send the first round of IoCommands (file outputs and detritus) if any - case None if detritusAndOutputsIoCommands.nonEmpty => - detritusAndOutputsIoCommands foreach sendIoCommand - - // Add potential additional commands to the list - val additionalCommands = additionalIoCommands(sourceCallRootPath, simpletons, destinationCallOutputs, jobDetritus, destinationDetritus) - val allCommands = List(detritusAndOutputsIoCommands) ++ additionalCommands - - goto(WaitingForIoResponses) using Option(StandardCacheHitCopyingActorData(allCommands, destinationCallOutputs, destinationDetritus, returnCode)) - case _ => succeedAndStop(returnCode, destinationCallOutputs, destinationDetritus) + case Event(command @ CopyOutputsCommand(simpletons, jobDetritus, cacheHit, returnCode), None) => + val (nextState, cacheReadType) = + if (isSourceBlacklisted(cacheHit)) { + // We don't want to log this because blacklisting is a common and expected occurrence. + (failAndStop(BlacklistSkip(MetricableCacheCopyErrorCategory.HitBlacklisted)), ReadHitOnly) + } else if (isSourceBlacklisted(command)) { + // We don't want to log this because blacklisting is a common and expected occurrence. 
+ (failAndStop(BlacklistSkip(MetricableCacheCopyErrorCategory.BucketBlacklisted)), ReadHitAndBucket) + } else { + // Try to make a Path of the callRootPath from the detritus + val next = lookupSourceCallRootPath(jobDetritus) match { + case Success(sourceCallRootPath) => + + // process simpletons and detritus to get updated paths and corresponding IoCommands + val processed = for { + (destinationCallOutputs, simpletonIoCommands) <- processSimpletons(simpletons, sourceCallRootPath) + (destinationDetritus, detritusIoCommands) <- processDetritus(jobDetritus) + } yield (destinationCallOutputs, destinationDetritus, simpletonIoCommands ++ detritusIoCommands) + + processed match { + case Success((destinationCallOutputs, destinationDetritus, detritusAndOutputsIoCommands)) => + duplicate(ioCommandsToCopyPairs(detritusAndOutputsIoCommands)) match { + // Use the duplicate override if exists + case Some(Success(_)) => succeedAndStop(returnCode, destinationCallOutputs, destinationDetritus) + case Some(Failure(failure)) => + // Something went wrong in the custom duplication code. We consider this loggable because it's most likely a user-permission error: + failAndStop(CopyAttemptError(failure)) + // Otherwise send the first round of IoCommands (file outputs and detritus) if any + case None if detritusAndOutputsIoCommands.nonEmpty => + detritusAndOutputsIoCommands foreach sendIoCommand + + // Add potential additional commands to the list + val additionalCommands = additionalIoCommands(sourceCallRootPath, simpletons, destinationCallOutputs, jobDetritus, destinationDetritus) + val allCommands = List(detritusAndOutputsIoCommands) ++ additionalCommands + + goto(WaitingForIoResponses) using Option(StandardCacheHitCopyingActorData(allCommands, destinationCallOutputs, destinationDetritus, cacheHit, returnCode)) + case _ => succeedAndStop(returnCode, destinationCallOutputs, destinationDetritus) + } + + // Something went wrong in generating duplication commands. 
We consider this loggable error because we don't expect this to happen: + case Failure(failure) => failAndStop(CopyAttemptError(failure)) } - // Something went wrong in generating duplication commands. We consider this loggable error because we don't expect this to happen: - case Failure(failure) => failAndStop(LoggableCacheCopyError(failure)) + // Something went wrong in looking up the call root... loggable because we don't expect this to happen: + case Failure(failure) => failAndStop(CopyAttemptError(failure)) } + (next, ReadHitAndBucket) + } - // Something went wrong in looking up the call root... loggable because we don't expect this to happen: - case Failure(failure) => failAndStop(LoggableCacheCopyError(failure)) - } + publishBlacklistReadMetrics(command, cacheHit, cacheReadType) + + nextState } when(WaitingForIoResponses) { @@ -189,30 +206,36 @@ abstract class StandardCacheHitCopyingActor(val standardParams: StandardCacheHit commandState match { case StillWaiting => stay() using Option(newData) - case AllCommandsDone => succeedAndStop(newData.returnCode, newData.newJobOutputs, newData.newDetritus) + case AllCommandsDone => + handleWhitelistingForSuccess(command) + succeedAndStop(newData.returnCode, newData.newJobOutputs, newData.newDetritus) case NextSubSet(commands) => commands foreach sendIoCommand stay() using Option(newData) } case Event(f: IoReadForbiddenFailure[_], Some(data)) => - handleForbidden( + handleBlacklistingForForbidden( path = f.forbiddenPath, // Loggable because this is an attempt-specific problem: - andThen = failAndAwaitPendingResponses(LoggableCacheCopyError(f.failure), f.command, data) + andThen = failAndAwaitPendingResponses(CopyAttemptError(f.failure), f.command, data) ) case Event(IoFailAck(command: IoCommand[_], failure), Some(data)) => + handleBlacklistingForGenericFailure() // Loggable because this is an attempt-specific problem: - failAndAwaitPendingResponses(LoggableCacheCopyError(failure), command, data) + 
failAndAwaitPendingResponses(CopyAttemptError(failure), command, data) // Should not be possible - case Event(IoFailAck(_: IoCommand[_], failure), None) => failAndStop(LoggableCacheCopyError(failure)) + case Event(IoFailAck(_: IoCommand[_], failure), None) => failAndStop(CopyAttemptError(failure)) } when(FailedState) { case Event(f: IoReadForbiddenFailure[_], Some(data)) => - handleForbidden( + handleBlacklistingForForbidden( path = f.forbiddenPath, andThen = stayOrStopInFailedState(f, data) ) + case Event(fail: IoFailAck[_], Some(data)) => + handleBlacklistingForGenericFailure() + stayOrStopInFailedState(fail, data) // At this point success or failure doesn't matter, we've already failed this hit case Event(response: IoAck[_], Some(data)) => stayOrStopInFailedState(response, data) @@ -238,12 +261,15 @@ abstract class StandardCacheHitCopyingActor(val standardParams: StandardCacheHit } } - /* Blacklist by prefix if appropriate. */ - private def handleForbidden[T](path: String, andThen: => State): State = { + /* Blacklist by bucket and hit if appropriate. */ + private def handleBlacklistingForForbidden[T](path: String, andThen: => State): State = { for { + // Blacklist the hit first in the forcomp since not all configurations will support bucket blacklisting. cache <- standardParams.blacklistCache + data <- stateData + _ = blacklistAndMetricHit(cache, data.cacheHit) prefix <- extractBlacklistPrefix(path) - _ = cache.blacklist(prefix) + _ = blacklistAndMetricBucket(cache, prefix) } yield() andThen } @@ -256,7 +282,7 @@ abstract class StandardCacheHitCopyingActor(val standardParams: StandardCacheHit stay() } - def failAndStop(failure: CacheCopyError): State = { + def failAndStop(failure: CacheCopyFailure): State = { context.parent ! 
CopyingOutputsFailedResponse(jobDescriptor.key, standardParams.cacheCopyAttempt, failure) context stop self stay() @@ -264,7 +290,7 @@ abstract class StandardCacheHitCopyingActor(val standardParams: StandardCacheHit /** If there are no responses pending this behaves like `failAndStop`, otherwise this goes to `FailedState` and waits * for all the pending responses to come back before stopping. */ - def failAndAwaitPendingResponses(failure: CacheCopyError, command: IoCommand[_], data: StandardCacheHitCopyingActorData): State = { + def failAndAwaitPendingResponses(failure: CacheCopyFailure, command: IoCommand[_], data: StandardCacheHitCopyingActorData): State = { context.parent ! CopyingOutputsFailedResponse(jobDescriptor.key, standardParams.cacheCopyAttempt, failure) val (newData, commandState) = data.commandComplete(command) @@ -297,7 +323,7 @@ abstract class StandardCacheHitCopyingActor(val standardParams: StandardCacheHit } /** - * Returns a pair of the list of simpletons with copied paths, and copy commands necessary to perform those copies. + * Returns a pair of the list of simpletons with copied paths, and copy commands necessary to perform those copies. */ protected def processSimpletons(womValueSimpletons: Seq[WomValueSimpleton], sourceCallRootPath: Path): Try[(CallOutputs, Set[IoCommand[_]])] = Try { val (destinationSimpletons, ioCommands): (List[WomValueSimpleton], Set[IoCommand[_]]) = womValueSimpletons.toList.foldMap({ @@ -324,7 +350,7 @@ abstract class StandardCacheHitCopyingActor(val standardParams: StandardCacheHit } /** - * Returns a pair of the detritus with copied paths, and copy commands necessary to perform those copies. + * Returns a pair of the detritus with copied paths, and copy commands necessary to perform those copies. 
*/ protected def processDetritus(sourceJobDetritusFiles: Map[String, String]): Try[(Map[String, Path], Set[IoCommand[_]])] = Try { val fileKeys = detritusFileKeys(sourceJobDetritusFiles) @@ -361,24 +387,17 @@ abstract class StandardCacheHitCopyingActor(val standardParams: StandardCacheHit case other => s"The Cache hit copying actor timed out waiting for an unknown I/O operation: $other" } - // Loggable because this is an attempt-specific: - failAndStop(LoggableCacheCopyError(new TimeoutException(exceptionMessage))) + // Loggable because this is attempt-specific: + failAndStop(CopyAttemptError(new TimeoutException(exceptionMessage))) () } /** - * If a subclass of this `StandardCacheHitCopyingActor` supports blacklisting then it should implement this + * If a subclass of this `StandardCacheHitCopyingActor` supports blacklisting by path then it should implement this * to return the prefix of the path from the failed copy command to use for blacklisting. */ protected def extractBlacklistPrefix(path: String): Option[String] = None - private def sourcePathFromCopyOutputsCommand(command: CopyOutputsCommand): String = command.jobDetritusFiles.values.head + def sourcePathFromCopyOutputsCommand(command: CopyOutputsCommand): String = command.jobDetritusFiles.values.head - private def isSourceBlacklisted(command: CopyOutputsCommand): Boolean = { - val path = sourcePathFromCopyOutputsCommand(command) - (for { - prefix <- extractBlacklistPrefix(path) - cache <- standardParams.blacklistCache - } yield cache.isBlacklisted(prefix)).getOrElse(false) - } } diff --git a/backend/src/test/scala/cromwell/backend/standard/callcaching/BlacklistCacheSpec.scala b/backend/src/test/scala/cromwell/backend/standard/callcaching/BlacklistCacheSpec.scala index 40af0c690f0..72025af499a 100644 --- a/backend/src/test/scala/cromwell/backend/standard/callcaching/BlacklistCacheSpec.scala +++ b/backend/src/test/scala/cromwell/backend/standard/callcaching/BlacklistCacheSpec.scala @@ -1,22 +1,44 @@ package 
cromwell.backend.standard.callcaching import cromwell.core.CacheConfig -import org.scalatest.{BeforeAndAfterAll, FlatSpec, Matchers} +import cromwell.services.CallCaching.CallCachingEntryId +import org.scalatest.concurrent.Eventually +import org.scalatest.{FlatSpec, Matchers} import scala.concurrent.duration._ import scala.language.postfixOps -class BlacklistCacheSpec extends FlatSpec with BeforeAndAfterAll with Matchers { - "The blacklist cache" should "default, blacklist and expire" in { +class BlacklistCacheSpec extends FlatSpec with Matchers with Eventually { + "The blacklist cache" should "default, blacklist, whitelist and expire" in { + val hit = CallCachingEntryId(3) val bucket = "foo" - val cacheConfig = CacheConfig(concurrency = 1, size = Integer.MAX_VALUE, ttl = 1 second) - val cache = BlacklistCache(cacheConfig) - cache.isBlacklisted(bucket) shouldBe false + + val bucketCacheConfig = CacheConfig(concurrency = 1, size = Integer.MAX_VALUE, ttl = 1 second) + val hitCacheConfig = CacheConfig(concurrency = 1, size = Integer.MAX_VALUE, ttl = 1 second) + val cache = new RootWorkflowBlacklistCache(bucketCacheConfig = bucketCacheConfig, hitCacheConfig = hitCacheConfig) + cache.getBlacklistStatus(bucket) shouldBe UntestedCacheResult + cache.getBlacklistStatus(hit) shouldBe UntestedCacheResult cache.blacklist(bucket) - cache.isBlacklisted(bucket) shouldBe true + cache.blacklist(hit) + cache.getBlacklistStatus(bucket) shouldBe BadCacheResult + cache.getBlacklistStatus(hit) shouldBe BadCacheResult + + implicit val patienceConfig = PatienceConfig(timeout = scaled(5.seconds), interval = scaled(1.second)) // Test ttl - Thread.sleep(5000L) - cache.isBlacklisted(bucket) shouldBe false + eventually { + cache.getBlacklistStatus(bucket) shouldBe UntestedCacheResult + cache.getBlacklistStatus(hit) shouldBe UntestedCacheResult + } + + cache.whitelist(bucket) + cache.whitelist(hit) + cache.getBlacklistStatus(bucket) shouldBe GoodCacheResult + cache.getBlacklistStatus(hit) 
shouldBe GoodCacheResult + + eventually { + cache.getBlacklistStatus(bucket) shouldBe UntestedCacheResult + cache.getBlacklistStatus(hit) shouldBe UntestedCacheResult + } } } diff --git a/backend/src/test/scala/cromwell/backend/standard/callcaching/CallCachingBlacklistManagerSpec.scala b/backend/src/test/scala/cromwell/backend/standard/callcaching/CallCachingBlacklistManagerSpec.scala new file mode 100644 index 00000000000..d53f2c14c93 --- /dev/null +++ b/backend/src/test/scala/cromwell/backend/standard/callcaching/CallCachingBlacklistManagerSpec.scala @@ -0,0 +1,121 @@ +package cromwell.backend.standard.callcaching + +import akka.event.NoLogging +import com.typesafe.config.ConfigFactory +import cromwell.core._ +import org.scalatest.{FlatSpec, Matchers} +import spray.json._ + + +class CallCachingBlacklistManagerSpec extends FlatSpec with Matchers { + behavior of "CallCachingBlacklistManager" + + //noinspection RedundantDefaultArgument + val workflowSourcesNoGrouping = WorkflowSourceFilesWithoutImports( + workflowSource = None, + workflowUrl = None, + workflowRoot = None, + workflowType = None, + workflowTypeVersion = None, + inputsJson = "", + workflowOptions = WorkflowOptions(JsObject.empty), + labelsJson = "", + workflowOnHold = false, + warnings = List.empty + ) + + val workflowSourcesYesGrouping = workflowSourcesNoGrouping.copy( + workflowOptions = WorkflowOptions(""" { "google_project": "blacklist_group_testing" } """.parseJson.asJsObject) + ) + + val workflowNoGrouping = new HasWorkflowIdAndSources { + override def sources: WorkflowSourceFilesCollection = workflowSourcesNoGrouping + override def id: WorkflowId = WorkflowId.randomId() + } + + val workflowYesGrouping1 = new HasWorkflowIdAndSources { + override def sources: WorkflowSourceFilesCollection = workflowSourcesYesGrouping + override def id: WorkflowId = WorkflowId.randomId() + } + + val workflowYesGrouping2 = new HasWorkflowIdAndSources { + override def sources: WorkflowSourceFilesCollection = 
workflowSourcesYesGrouping + override def id: WorkflowId = WorkflowId.randomId() + } + + it should "be off by default" in { + val configString = "" + val manager = new CallCachingBlacklistManager(ConfigFactory.parseString(configString), logger = NoLogging) + + manager.blacklistCacheFor(workflowNoGrouping) shouldBe None + } + + it should "be on with default values if blacklisting is enabled" in { + val configString = + """ + |call-caching { + | blacklist-cache { + | enabled: true + | } + |} + |""".stripMargin + val manager = new CallCachingBlacklistManager(ConfigFactory.parseString(configString), logger = NoLogging) + + val cache = manager.blacklistCacheFor(workflowNoGrouping) + val rootWorkflowCache = cache.get.asInstanceOf[RootWorkflowBlacklistCache] + + rootWorkflowCache.hitCache.size() shouldBe 0 + rootWorkflowCache.bucketCache.size() shouldBe 0 + } + + it should "use root workflow level caches if no workflow-option is specified for groupings in config" in { + val configString = + """ + |call-caching { + | blacklist-cache { + | enabled: true + | } + |} + |""".stripMargin + val manager = new CallCachingBlacklistManager(ConfigFactory.parseString(configString), logger = NoLogging) + val cache = manager.blacklistCacheFor(workflowYesGrouping1) + val _ = cache.get.asInstanceOf[RootWorkflowBlacklistCache] + } + + it should "use root workflow level caches if no workflow-option is provided in workflow options" in { + val configString = + """ + |call-caching { + | blacklist-cache { + | enabled: true + | groupings: { + | workflow-option: "google_project" + | } + | } + |} + |""".stripMargin + val manager = new CallCachingBlacklistManager(ConfigFactory.parseString(configString), logger = NoLogging) + val cache = manager.blacklistCacheFor(workflowNoGrouping) + val _ = cache.get.asInstanceOf[RootWorkflowBlacklistCache] + } + + it should "use a grouping cache if there is a workflow-option in config and its value exists as a key in workflow options" in { + val configString = + 
""" + |call-caching { + | blacklist-cache { + | enabled: true + | groupings: { + | workflow-option: "google_project" + | } + | } + |} + |""".stripMargin + val manager = new CallCachingBlacklistManager(ConfigFactory.parseString(configString), logger = NoLogging) + val cache1 = manager.blacklistCacheFor(workflowYesGrouping1).get + val _ = cache1.asInstanceOf[GroupingBlacklistCache] + + val cache2 = manager.blacklistCacheFor(workflowYesGrouping2).get + System.identityHashCode(cache1) shouldEqual System.identityHashCode(cache2) + } +} diff --git a/build.sbt b/build.sbt index 8fdd780aa90..f276912f2ec 100644 --- a/build.sbt +++ b/build.sbt @@ -223,7 +223,7 @@ lazy val awsBackend = (project in backendRoot / "aws") .dependsOn(services % "test->test") lazy val sfsBackend = (project in backendRoot / "sfs") - .withLibrarySettings("cromwell-sfs-backend") + .withLibrarySettings("cromwell-sfs-backend", sfsBackendDependencies) .dependsOn(backend) .dependsOn(gcsFileSystem) .dependsOn(httpFileSystem) diff --git a/centaur/src/it/scala/centaur/AbstractCromwellEngineOrBackendUpgradeTestCaseSpec.scala b/centaur/src/it/scala/centaur/AbstractCromwellEngineOrBackendUpgradeTestCaseSpec.scala index 1cc20910886..fa8db361352 100644 --- a/centaur/src/it/scala/centaur/AbstractCromwellEngineOrBackendUpgradeTestCaseSpec.scala +++ b/centaur/src/it/scala/centaur/AbstractCromwellEngineOrBackendUpgradeTestCaseSpec.scala @@ -7,6 +7,7 @@ import cromwell.database.slick.{EngineSlickDatabase, MetadataSlickDatabase, Slic import cromwell.database.sql.SqlDatabase import org.scalatest.{Assertions, BeforeAndAfter, DoNotDiscover} import shapeless.syntax.typeable._ +import net.ceedubs.ficus.Ficus._ import scala.concurrent.Future @@ -71,7 +72,7 @@ object AbstractCromwellEngineOrBackendUpgradeTestCaseSpec { private def recreateDatabase(slickDatabase: SlickDatabase)(implicit cs: ContextShift[IO]): IO[Unit] = { import slickDatabase.dataAccess.driver.api._ - val schemaName = 
slickDatabase.databaseConfig.getString("db.schema") + val schemaName = slickDatabase.databaseConfig.getOrElse("db.cromwell-database-name", "cromwell_test") //noinspection SqlDialectInspection for { _ <- IO.fromFuture(IO(slickDatabase.database.run(sqlu"""DROP SCHEMA IF EXISTS #$schemaName"""))) diff --git a/centaur/src/main/resources/reference.conf b/centaur/src/main/resources/reference.conf index 4f7e96e52ce..48a4a404576 100644 --- a/centaur/src/main/resources/reference.conf +++ b/centaur/src/main/resources/reference.conf @@ -34,9 +34,6 @@ centaur { # } include required(classpath("reference_database.inc.conf")) - # Override the database url to use a stable in memory database. - database.db.schema = "cromwell_test" - database.db.url = "jdbc:hsqldb:file:"${centaur.cromwell.database.db.schema}";shutdown=false;hsqldb.tx=mvcc" } # The timeout of the Centaur send/receive + unmarshal pipeline diff --git a/centaur/src/main/resources/standardTestCases/sizerelativepath.test b/centaur/src/main/resources/standardTestCases/sizerelativepath.test new file mode 100644 index 00000000000..c8c266ddb1b --- /dev/null +++ b/centaur/src/main/resources/standardTestCases/sizerelativepath.test @@ -0,0 +1,13 @@ +name: sizerelativepath +testFormat: workflowsuccess + +backends: [Local, LocalNoDocker, Slurm, SlurmNoDocker] +backendsMode: "only" + +files { + workflow: sizerelativepath/sizerelativepath.wdl +} + +metadata { + "outputs.sizerelativepath.size_string": "1495" +} diff --git a/centaur/src/main/resources/standardTestCases/sizerelativepath/1495bytes.txt b/centaur/src/main/resources/standardTestCases/sizerelativepath/1495bytes.txt new file mode 100644 index 00000000000..9d9d23e4f4f --- /dev/null +++ b/centaur/src/main/resources/standardTestCases/sizerelativepath/1495bytes.txt @@ -0,0 +1,20 @@ +KNAZHPO7MYANAWUG54SHDEXMQWBFJ4QJPTXGF4HBPB6WLJ4U7ZHXGZ53ZMLC5ZFXW6AFNPX33OZT +SJA5XBO4GKPGJFLBGQPE2FYLRUSYNYNS7VS3TNOE7MYFEISAUX34KETVE6N2CHYJQTT23JDYJTBE 
+5USVDQX5SK4WOU55ABQNALNIEV2LGGPMDCZ77EJXF25RLS3IGOER5RNQJHVIUG5RWDR5SU65JPGC +BSG5RRD2L4H5FWVBXFKILP7SMWK7RENOVCINHWJNES2GKYVAKMJDRIYWBEJHXYOVN527HDKMYUJ2 +WH6D4VNC6E4LWFTA4ACWB2MN6WXRRRIL55SP5JYQPNUQBTEXXPKB4BRHROZZ5NBTRP6ERRP77X66 +J7TTEIBFZE3SJAOR7PLFYLDVHURUXKJMF3PMKSVBL57T2ZXYR34WEQHIGCZGVNKR7FCSPVJ2Y2IW +WNBW3SBCIHKJCPWQ6CSZ4HNI2REHEI3O234ECVGFS6KRAGGHGSSAXLFITWBE2KTNSJP23PTCU3MV +EYPDUX2AQVN7TXCIGWMOO2XRZJMU5UQXNKB4AQUGIJOMNCDR73QYLWKRTXLZSNHPMRLVAVS3CZ4M +XJYNSBOB5OMJEQXSINZRSNQOUPPUVSYMMUALKW7NII33FQWF6EIOHDAE2WSOAE6CJ4SKLPWGRVFX +TQHILVF4GWVAOJZJTMPL3JXZER44ZGWFCK7ZY7NCK3AMWWV4S2F5NBUUIOEHKBNT4SSWG2LJ2GSC +FWXKHWSWCM7RVTMRX6E2P6FTR6O2S7MOJ2RBHAEONZEDV5OWO2IMYXJR4RYCPE5EN5XLPWY5SLCG +A66UTVOVGPMBJ63TY5ZND4PKB4H3JD4KKJLFYYT7C34OBUEE4C6G7I54KTKK7N6JMCCZUZG3WUJN +PEJEANB4CG76ZO4CTU2IHLE6ILJXGFNXHMN7J6QXYRTWK3WLGIDQ7GFVLHDOB4IMJ365HQOIZOYT +QPZ65NDNMJH7PX44UJMKAZARA7PNY4AS7OQRXKGJOEJPXS5NXTY3CPA5FQ7C5N757GJPWA27UKFK +PMCIJVZCQPYUARZUV4HVGUCXGQPIQ4AIHO4HME45EQ5HDVMDY66WTE43QOBEIURA23MVQEVNL7ZW +EJFQQKE2WAQ6P5QNSZOQMKPSTA7GBNLK53P2MVVRLKLKPAHFDIMVYQG6LWWGHWPQQSX5ET4XBEAT +KKNIAGIFGCJTUXAGBGYVU322HVMRJDG3WZMSC5F4OPNMUJCMEEV3JAABTGTGGNC3IUZCU47RSBTJ +T4M3FXRWBPUA2VIT6LST5QCYK4A67OJIR7TUZGIYSCFS5SSZBNENIAXE6UYWMK7Q7E5YZVHQEYV7 +RQ4LCN2YMT63227SEDXMBKWWJSF24ARCXTBZATYNTXGRJP4TPQDIPHILNG6LD2RJDMNB3XYF3HL7 +7BVXUY4SRQQFTQQPR7B4GQSNDC7X6ZHO \ No newline at end of file diff --git a/centaur/src/main/resources/standardTestCases/sizerelativepath/sizerelativepath.wdl b/centaur/src/main/resources/standardTestCases/sizerelativepath/sizerelativepath.wdl new file mode 100644 index 00000000000..5e874835f0d --- /dev/null +++ b/centaur/src/main/resources/standardTestCases/sizerelativepath/sizerelativepath.wdl @@ -0,0 +1,32 @@ +version 1.0 + +task print_size { + input { + File file + } + Int bytes = ceil(size(file)) + + command { + echo ~{bytes} + } + + output { + String out = read_string(stdout()) + } + + runtime {docker: "ubuntu:latest"} +} + +workflow sizerelativepath { + input 
{ + File file = "centaur/src/main/resources/standardTestCases/sizerelativepath/1495bytes.txt" + } + + call print_size { + input: + file = file + } + output { + String size_string = print_size.out + } +} diff --git a/core/src/main/resources/reference.conf b/core/src/main/resources/reference.conf index 1f83a963f8d..3eee76d200d 100644 --- a/core/src/main/resources/reference.conf +++ b/core/src/main/resources/reference.conf @@ -265,6 +265,9 @@ call-caching { # to fail for external reasons which should not invalidate the cache (e.g. auth differences between users): # (default: true) invalidate-bad-cache-results = true + + # The maximum number of times Cromwell will attempt to copy cache hits before giving up and running the job. + max-failed-copy-attempts = 1000000 } google { diff --git a/core/src/main/resources/reference_local_provider_config.inc.conf b/core/src/main/resources/reference_local_provider_config.inc.conf index bd5a041a1ec..2d87c62f56b 100644 --- a/core/src/main/resources/reference_local_provider_config.inc.conf +++ b/core/src/main/resources/reference_local_provider_config.inc.conf @@ -47,16 +47,15 @@ filesystems { ] caching { - # When copying a cached result, what type of file duplication should occur. Attempted in the order listed below: + # When copying a cached result, what type of file duplication should occur. + # For more information check: https://cromwell.readthedocs.io/en/stable/backends/HPC/#shared-filesystem duplication-strategy: [ "hard-link", "soft-link", "copy" ] - # Possible values: file, path - # "file" will compute an md5 hash of the file content. - # "path" will compute an md5 hash of the file path. This strategy will only be effective if the duplication-strategy (above) is set to "soft-link", - # in order to allow for the original file path to be hashed. - hashing-strategy: "file" + # Strategy to determine if a file has been used before. 
+ # For extended explanation and alternative strategies check: https://cromwell.readthedocs.io/en/stable/Configuring/#call-caching + hashing-strategy: "md5" # When true, will check if a sibling file with the same name and the .md5 extension exists, and if it does, use the content of this file as a hash. # If false or the md5 does not exist, will proceed with the above-defined hashing strategy. diff --git a/core/src/main/scala/cromwell/core/WorkflowSourceFilesCollection.scala b/core/src/main/scala/cromwell/core/WorkflowSourceFilesCollection.scala index cf5d2baa113..9359eab5430 100644 --- a/core/src/main/scala/cromwell/core/WorkflowSourceFilesCollection.scala +++ b/core/src/main/scala/cromwell/core/WorkflowSourceFilesCollection.scala @@ -33,6 +33,11 @@ sealed trait WorkflowSourceFilesCollection { } } +trait HasWorkflowIdAndSources { + def sources: WorkflowSourceFilesCollection + def id: WorkflowId +} + object WorkflowSourceFilesCollection { def apply(workflowSource: Option[WorkflowSource], workflowUrl: Option[WorkflowUrl], diff --git a/cromwell.example.backends/LocalExample.conf b/cromwell.example.backends/LocalExample.conf index cbc79ea689f..91be9586fa7 100644 --- a/cromwell.example.backends/LocalExample.conf +++ b/cromwell.example.backends/LocalExample.conf @@ -92,18 +92,15 @@ # Call caching strategies caching { - # When copying a cached result, what type of file duplication should occur. Attempted in the order listed below: + # When copying a cached result, what type of file duplication should occur. + # For more information check: https://cromwell.readthedocs.io/en/stable/backends/HPC/#shared-filesystem duplication-strategy: [ "hard-link", "soft-link", "copy" ] - # Possible values: file, path, path+modtime - # "file" will compute an md5 hash of the file content. - # "path" will compute an md5 hash of the file path. 
This strategy will only be effective if the duplication-strategy (above) is set to "soft-link", - # in order to allow for the original file path to be hashed. - # "path+modtime" will compute an md5 hash of the file path and the last modified time. The same conditions as for "path" apply here. - # Default: file - hashing-strategy: "file" + # Strategy to determine if a file has been used before. + # For extended explanation and alternative strategies check: https://cromwell.readthedocs.io/en/stable/Configuring/#call-caching + hashing-strategy: "md5" # When true, will check if a sibling file with the same name and the .md5 extension exists, and if it does, use the content of this file as a hash. # If false or the md5 does not exist, will proceed with the above-defined hashing strategy. diff --git a/cromwell.example.backends/cromwell.examples.conf b/cromwell.example.backends/cromwell.examples.conf index de67732930f..e6898473c95 100644 --- a/cromwell.example.backends/cromwell.examples.conf +++ b/cromwell.example.backends/cromwell.examples.conf @@ -153,16 +153,41 @@ call-caching { # (default: true) #invalidate-bad-cache-results = true + # The maximum number of times Cromwell will attempt to copy cache hits before giving up and running the job. + #max-failed-copy-attempts = 1000000 + # blacklist-cache { - # # The call caching blacklist cache is off by default. This cache is used to blacklist cache hit paths based on the - # # prefixes of cache hit paths that Cromwell has previously failed to copy for permissions reasons. + # # The call caching blacklist cache is off by default. This cache is used to blacklist cache hits based on cache + # # hit ids or buckets of cache hit paths that Cromwell has previously failed to copy for permissions reasons. # enabled: true - # # Guava cache concurrency. - # concurrency: 10000 - # # How long entries in the cache should live from the time of their last access. - # ttl: 20 minutes - # # Maximum number of entries in the cache. 
- # size: 1000 + # + # # A blacklist grouping can be specified in workflow options which will inform the blacklister which workflows + # # should share a blacklist cache. + # groupings { + # workflow-option: call-cache-blacklist-group + # concurrency: 10000 + # ttl: 2 hours + # size: 1000 + # } + # + # buckets { + # # Guava cache concurrency. + # concurrency: 10000 + # # How long entries in the cache should live from the time of their last access. + # ttl: 20 minutes + # # Maximum number of entries in the cache. + # size: 1000 + # } + # + # hits { + # # Guava cache concurrency. + # concurrency: 10000 + # # How long entries in the cache should live from the time of their last access. + # ttl: 20 minutes + # # Maximum number of entries in the cache. + # size: 100000 + # } + # # } } diff --git a/database/migration/src/main/resources/metadata_changesets/remove_non_summarizable_metadata_from_queue.xml b/database/migration/src/main/resources/metadata_changesets/remove_non_summarizable_metadata_from_queue.xml new file mode 100644 index 00000000000..b8c4ce8e17d --- /dev/null +++ b/database/migration/src/main/resources/metadata_changesets/remove_non_summarizable_metadata_from_queue.xml @@ -0,0 +1,67 @@ + + + + + + Delete rows from the summary queue corresponding to metadata that will not be summarized. + + + DELETE FROM SUMMARY_QUEUE_ENTRY queue WHERE queue.METADATA_JOURNAL_ID NOT IN ( + SELECT metadata.METADATA_JOURNAL_ID FROM METADATA_ENTRY metadata WHERE + metadata.METADATA_JOURNAL_ID = queue.METADATA_JOURNAL_ID AND + metadata.CALL_FQN IS NULL AND + metadata.JOB_SCATTER_INDEX IS NULL AND + metadata.JOB_RETRY_ATTEMPT IS NULL AND ( + metadata.METADATA_KEY in + ('start', 'end', 'workflowName', 'status', 'submission', 'parentWorkflowId', 'rootWorkflowId') + OR + metadata.METADATA_KEY LIKE 'labels%' + ) + ) + + + + + + Delete rows from the summary queue corresponding to metadata that will not be summarized. 
+ + + DELETE FROM "SUMMARY_QUEUE_ENTRY" queue WHERE queue."METADATA_JOURNAL_ID" NOT IN ( + SELECT metadata."METADATA_JOURNAL_ID" FROM "METADATA_ENTRY" metadata WHERE + metadata."METADATA_JOURNAL_ID" = queue."METADATA_JOURNAL_ID" AND + metadata."CALL_FQN" IS NULL AND + metadata."JOB_SCATTER_INDEX" IS NULL AND + metadata."JOB_RETRY_ATTEMPT" IS NULL AND ( + metadata."METADATA_KEY" in + ('start', 'end', 'workflowName', 'status', 'submission', 'parentWorkflowId', 'rootWorkflowId') + OR + metadata."METADATA_KEY" LIKE 'labels%' + ) + ) + + + + + + Delete rows from the summary queue corresponding to metadata that will not be summarized. + + + DELETE SUMMARY_QUEUE_ENTRY FROM SUMMARY_QUEUE_ENTRY + INNER JOIN METADATA_ENTRY ON + SUMMARY_QUEUE_ENTRY.METADATA_JOURNAL_ID = METADATA_ENTRY.METADATA_JOURNAL_ID WHERE NOT ( + METADATA_ENTRY.CALL_FQN IS NULL AND + METADATA_ENTRY.JOB_SCATTER_INDEX IS NULL AND + METADATA_ENTRY.JOB_RETRY_ATTEMPT IS NULL AND ( + METADATA_ENTRY.METADATA_KEY in + ('start', 'end', 'workflowName', 'status', 'submission', 'parentWorkflowId', 'rootWorkflowId') + OR + METADATA_ENTRY.METADATA_KEY LIKE 'labels%' + ) + ) + + + + diff --git a/database/migration/src/main/resources/sql_metadata_changelog.xml b/database/migration/src/main/resources/sql_metadata_changelog.xml index 0f74d8f6679..8ce89e5050a 100644 --- a/database/migration/src/main/resources/sql_metadata_changelog.xml +++ b/database/migration/src/main/resources/sql_metadata_changelog.xml @@ -16,5 +16,6 @@ + diff --git a/database/sql/src/main/scala/cromwell/database/slick/MetadataSlickDatabase.scala b/database/sql/src/main/scala/cromwell/database/slick/MetadataSlickDatabase.scala index 506ebee2a33..c558c6e432b 100644 --- a/database/sql/src/main/scala/cromwell/database/slick/MetadataSlickDatabase.scala +++ b/database/sql/src/main/scala/cromwell/database/slick/MetadataSlickDatabase.scala @@ -18,6 +18,33 @@ object MetadataSlickDatabase { val databaseConfig = SlickDatabase.getDatabaseConfig("metadata", 
parentConfig) new MetadataSlickDatabase(databaseConfig) } + + case class SummarizationPartitionedMetadata(nonSummarizableMetadata: Seq[MetadataEntry], + summarizableMetadata: Seq[MetadataEntry]) + + def partitionSummarizationMetadata(rawMetadataEntries: Seq[MetadataEntry], + startMetadataKey: String, + endMetadataKey: String, + nameMetadataKey: String, + statusMetadataKey: String, + submissionMetadataKey: String, + parentWorkflowIdKey: String, + rootWorkflowIdKey: String, + labelMetadataKey: String): SummarizationPartitionedMetadata = { + + val exactMatchMetadataKeys = Set(startMetadataKey, endMetadataKey, nameMetadataKey, statusMetadataKey, submissionMetadataKey, parentWorkflowIdKey, rootWorkflowIdKey) + val startsWithMetadataKeys = Set(labelMetadataKey) + + val (summarizable, nonSummarizable) = rawMetadataEntries partition { entry => + entry.callFullyQualifiedName.isEmpty && entry.jobIndex.isEmpty && entry.jobAttempt.isEmpty && + (exactMatchMetadataKeys.contains(entry.metadataKey) || startsWithMetadataKeys.exists(entry.metadataKey.startsWith)) + } + + SummarizationPartitionedMetadata( + summarizableMetadata = summarizable, + nonSummarizableMetadata = nonSummarizable + ) + } } class MetadataSlickDatabase(originalDatabaseConfig: Config) @@ -28,24 +55,56 @@ class MetadataSlickDatabase(originalDatabaseConfig: Config) override lazy val dataAccess = new MetadataDataAccessComponent(slickConfig.profile) import dataAccess.driver.api._ + import MetadataSlickDatabase._ override def existsMetadataEntries()(implicit ec: ExecutionContext): Future[Boolean] = { val action = dataAccess.metadataEntriesExists.result runTransaction(action) } - override def addMetadataEntries(metadataEntries: Iterable[MetadataEntry]) + override def addMetadataEntries(metadataEntries: Iterable[MetadataEntry], + startMetadataKey: String, + endMetadataKey: String, + nameMetadataKey: String, + statusMetadataKey: String, + submissionMetadataKey: String, + parentWorkflowIdKey: String, + rootWorkflowIdKey: 
String, + labelMetadataKey: String) (implicit ec: ExecutionContext): Future[Unit] = { - if (metadataEntries.isEmpty) Future.successful(()) else { - - val batchesToWrite = metadataEntries.grouped(insertBatchSize).toList + val partitioned = partitionSummarizationMetadata( + rawMetadataEntries = metadataEntries.toSeq, + startMetadataKey, + endMetadataKey, + nameMetadataKey, + statusMetadataKey, + submissionMetadataKey, + parentWorkflowIdKey, + rootWorkflowIdKey, + labelMetadataKey) + + // These entries also require a write to the summary queue. + def writeSummarizable(): Future[Unit] = if (partitioned.summarizableMetadata.isEmpty) Future.successful(()) else { + val batchesToWrite = partitioned.summarizableMetadata.grouped(insertBatchSize).toList val insertActions = batchesToWrite.map { batch => val insertMetadata = dataAccess.metadataEntryIdsAutoInc ++= batch insertMetadata.flatMap(ids => writeSummaryQueueEntries(ids)) } runTransaction(DBIO.sequence(insertActions)).void } + + // Non-summarizable metadata that only needs to go to the metadata table can be written much more efficiently + // than summarizable metadata. 
+ def writeNonSummarizable(): Future[Unit] = if (partitioned.nonSummarizableMetadata.isEmpty) Future.successful(()) else { + val action = DBIO.sequence(partitioned.nonSummarizableMetadata.grouped(insertBatchSize).map(dataAccess.metadataEntries ++= _)) + runLobAction(action).void + } + + for { + _ <- writeSummarizable() + _ <- writeNonSummarizable() + } yield () } override def metadataEntryExists(workflowExecutionUuid: String)(implicit ec: ExecutionContext): Future[Boolean] = { @@ -177,14 +236,7 @@ class MetadataSlickDatabase(originalDatabaseConfig: Config) } } - override def summarizeIncreasing(startMetadataKey: String, - endMetadataKey: String, - nameMetadataKey: String, - statusMetadataKey: String, - submissionMetadataKey: String, - parentWorkflowIdKey: String, - rootWorkflowIdKey: String, - labelMetadataKey: String, + override def summarizeIncreasing(labelMetadataKey: String, limit: Int, buildUpdatedSummary: (Option[WorkflowMetadataSummaryEntry], Seq[MetadataEntry]) @@ -195,13 +247,6 @@ class MetadataSlickDatabase(originalDatabaseConfig: Config) _ <- buildMetadataSummaryFromRawMetadataAndWriteToDb( rawMetadataEntries = rawMetadataEntries, - startMetadataKey = startMetadataKey, - endMetadataKey = endMetadataKey, - nameMetadataKey = nameMetadataKey, - statusMetadataKey = statusMetadataKey, - submissionMetadataKey = submissionMetadataKey, - parentWorkflowIdKey = parentWorkflowIdKey, - rootWorkflowIdKey = rootWorkflowIdKey, labelMetadataKey = labelMetadataKey, buildUpdatedSummary = buildUpdatedSummary ) @@ -214,13 +259,6 @@ class MetadataSlickDatabase(originalDatabaseConfig: Config) override def summarizeDecreasing(summaryNameDecreasing: String, summaryNameIncreasing: String, - startMetadataKey: String, - endMetadataKey: String, - nameMetadataKey: String, - statusMetadataKey: String, - submissionMetadataKey: String, - parentWorkflowIdKey: String, - rootWorkflowIdKey: String, labelMetadataKey: String, limit: Int, buildUpdatedSummary: @@ -242,13 +280,6 @@ class 
MetadataSlickDatabase(originalDatabaseConfig: Config) _ <- buildMetadataSummaryFromRawMetadataAndWriteToDb( rawMetadataEntries = rawMetadataEntries, - startMetadataKey = startMetadataKey, - endMetadataKey = endMetadataKey, - nameMetadataKey = nameMetadataKey, - statusMetadataKey = statusMetadataKey, - submissionMetadataKey = submissionMetadataKey, - parentWorkflowIdKey = parentWorkflowIdKey, - rootWorkflowIdKey = rootWorkflowIdKey, labelMetadataKey = labelMetadataKey, buildUpdatedSummary = buildUpdatedSummary ) @@ -262,33 +293,17 @@ class MetadataSlickDatabase(originalDatabaseConfig: Config) } private def buildMetadataSummaryFromRawMetadataAndWriteToDb(rawMetadataEntries: Seq[MetadataEntry], - startMetadataKey: String, - endMetadataKey: String, - nameMetadataKey: String, - statusMetadataKey: String, - submissionMetadataKey: String, - parentWorkflowIdKey: String, - rootWorkflowIdKey: String, labelMetadataKey: String, buildUpdatedSummary: (Option[WorkflowMetadataSummaryEntry], Seq[MetadataEntry]) => WorkflowMetadataSummaryEntry )(implicit ec: ExecutionContext): DBIO[Unit] = { - val exactMatchMetadataKeys = Set(startMetadataKey, endMetadataKey, nameMetadataKey, statusMetadataKey, submissionMetadataKey, parentWorkflowIdKey, rootWorkflowIdKey) - val startsWithMetadataKeys = Set(labelMetadataKey) - - val metadataEntries = rawMetadataEntries filter { entry => - entry.callFullyQualifiedName.isEmpty && entry.jobIndex.isEmpty && entry.jobAttempt.isEmpty && - (exactMatchMetadataKeys.contains(entry.metadataKey) || startsWithMetadataKeys.exists(entry.metadataKey.startsWith)) - } - val metadataWithoutLabels = metadataEntries - .filterNot(_.metadataKey.contains(labelMetadataKey)) // Why are these "contains" while the filtering is "starts with"? 
- .groupBy(_.workflowExecutionUuid) - val customLabelEntries = metadataEntries.filter(_.metadataKey.contains(labelMetadataKey)) + val (summarizableLabelsMetadata, summarizableRegularMetadata) = rawMetadataEntries.partition(_.metadataKey.contains(labelMetadataKey)) + val groupedSummarizableRegularMetadata = summarizableRegularMetadata.groupBy(_.workflowExecutionUuid) for { - _ <- DBIO.sequence(metadataWithoutLabels map updateWorkflowMetadataSummaryEntry(buildUpdatedSummary)) - _ <- DBIO.sequence(customLabelEntries map toCustomLabelEntry map upsertCustomLabelEntry) + _ <- DBIO.sequence(groupedSummarizableRegularMetadata map updateWorkflowMetadataSummaryEntry(buildUpdatedSummary)) + _ <- DBIO.sequence(summarizableLabelsMetadata map toCustomLabelEntry map upsertCustomLabelEntry) } yield () } @@ -407,4 +422,8 @@ class MetadataSlickDatabase(originalDatabaseConfig: Config) runAction( countSummaryQueueEntries() ) + + override def getMetadataTotalRowNumberByRootWorkflowId(rootWorkflowId: String, timeout: Duration)(implicit ec: ExecutionContext): Future[Int] = { + runTransaction(dataAccess.metadataTotalSizeRowsForRootWorkflowId(rootWorkflowId).result, timeout = timeout) + } } diff --git a/database/sql/src/main/scala/cromwell/database/slick/tables/MetadataEntryComponent.scala b/database/sql/src/main/scala/cromwell/database/slick/tables/MetadataEntryComponent.scala index bdcc7ac9c0c..04addbbc9b9 100644 --- a/database/sql/src/main/scala/cromwell/database/slick/tables/MetadataEntryComponent.scala +++ b/database/sql/src/main/scala/cromwell/database/slick/tables/MetadataEntryComponent.scala @@ -78,6 +78,21 @@ trait MetadataEntryComponent { } ) + val metadataTotalSizeRowsForRootWorkflowId = Compiled( + (rootWorkflowId: Rep[String]) => { + val targetWorkflowIds = for { + summary <- workflowMetadataSummaryEntries + // Uses `IX_WORKFLOW_METADATA_SUMMARY_ENTRY_RWEU`, `UC_WORKFLOW_METADATA_SUMMARY_ENTRY_WEU` + if summary.rootWorkflowExecutionUuid === rootWorkflowId || 
summary.workflowExecutionUuid === rootWorkflowId + } yield summary.workflowExecutionUuid + + for { + metadata <- metadataEntries + if metadata.workflowExecutionUuid in targetWorkflowIds // Uses `METADATA_WORKFLOW_IDX` + } yield metadata + }.size + ) + val metadataEntryExistsForWorkflowExecutionUuid = Compiled( (workflowExecutionUuid: Rep[String]) => (for { metadataEntry <- metadataEntries diff --git a/database/sql/src/main/scala/cromwell/database/sql/MetadataSqlDatabase.scala b/database/sql/src/main/scala/cromwell/database/sql/MetadataSqlDatabase.scala index 23caa602e11..631de1383b0 100644 --- a/database/sql/src/main/scala/cromwell/database/sql/MetadataSqlDatabase.scala +++ b/database/sql/src/main/scala/cromwell/database/sql/MetadataSqlDatabase.scala @@ -25,7 +25,15 @@ trait MetadataSqlDatabase extends SqlDatabase { /** * Add metadata events to the database transactionally. */ - def addMetadataEntries(metadataEntries: Iterable[MetadataEntry])(implicit ec: ExecutionContext): Future[Unit] + def addMetadataEntries(metadataEntries: Iterable[MetadataEntry], + startMetadataKey: String, + endMetadataKey: String, + nameMetadataKey: String, + statusMetadataKey: String, + submissionMetadataKey: String, + parentWorkflowIdKey: String, + rootWorkflowIdKey: String, + labelMetadataKey: String)(implicit ec: ExecutionContext): Future[Unit] def metadataEntryExists(workflowExecutionUuid: String)(implicit ec: ExecutionContext): Future[Boolean] @@ -68,14 +76,7 @@ trait MetadataSqlDatabase extends SqlDatabase { * @param buildUpdatedSummary Takes in the optional existing summary and the metadata, returns the new summary. * @return A `Future` with the number of rows summarized by the invocation, and the number of rows still to summarize. 
*/ - def summarizeIncreasing(startMetadataKey: String, - endMetadataKey: String, - nameMetadataKey: String, - statusMetadataKey: String, - submissionMetadataKey: String, - parentWorkflowIdKey: String, - rootWorkflowIdKey: String, - labelMetadataKey: String, + def summarizeIncreasing(labelMetadataKey: String, limit: Int, buildUpdatedSummary: (Option[WorkflowMetadataSummaryEntry], Seq[MetadataEntry]) @@ -90,13 +91,6 @@ trait MetadataSqlDatabase extends SqlDatabase { */ def summarizeDecreasing(summaryNameDecreasing: String, summaryNameIncreasing: String, - startMetadataKey: String, - endMetadataKey: String, - nameMetadataKey: String, - statusMetadataKey: String, - submissionMetadataKey: String, - parentWorkflowIdKey: String, - rootWorkflowIdKey: String, labelMetadataKey: String, limit: Int, buildUpdatedSummary: @@ -156,4 +150,6 @@ trait MetadataSqlDatabase extends SqlDatabase { def countRootWorkflowIdsByArchiveStatusAndEndedOnOrBeforeThresholdTimestamp(archiveStatus: Option[String], thresholdTimestamp: Timestamp)(implicit ec: ExecutionContext): Future[Int] def getSummaryQueueSize()(implicit ec: ExecutionContext): Future[Int] + + def getMetadataTotalRowNumberByRootWorkflowId(rootWorkflowId: String, timeout: Duration)(implicit ec: ExecutionContext): Future[Int] } diff --git a/docs/Configuring.md b/docs/Configuring.md index 9b31641c0a1..4aa76e3fbca 100644 --- a/docs/Configuring.md +++ b/docs/Configuring.md @@ -497,7 +497,8 @@ Read the [Abort](execution/ExecutionTwists/#abort) section to learn more about h ### Call caching -Call Caching allows Cromwell to detect when a job has been run in the past so it doesn't have to re-compute results. To learn more see [Call Caching](cromwell_features/CallCaching). +Call Caching allows Cromwell to detect when a job has been run in the past so it doesn't have to re-compute results. +To learn more see [Call Caching](cromwell_features/CallCaching). 
To enable Call Caching, add the following to your Cromwell configuration: @@ -515,25 +516,38 @@ Cromwell also accepts [Workflow Options](wf_options/Overview#call-caching-option ### Local filesystem options -When running a job on the Config (Shared Filesystem) backend, Cromwell provides some additional options in the backend's config section: +When running a job on the Config (Shared Filesystem) backend, Cromwell provides some additional options in the backend's +config section: ```HOCON config { filesystems { local { caching { - # When copying a cached result, what type of file duplication should occur. Attempted in the order listed below: + # When copying a cached result, what type of file duplication should occur. + # possible values: "hard-link", "soft-link", "copy", "cached-copy". + # For more information check: https://cromwell.readthedocs.io/en/stable/backends/HPC/#shared-filesystem + # Attempted in the order listed below: duplication-strategy: [ "hard-link", "soft-link", "copy" ] - # Possible values: file, path, path+modtime - # "file" will compute an md5 hash of the file content. + # Possible values: md5, xxh64, fingerprint, path, path+modtime + # For extended explanation check: https://cromwell.readthedocs.io/en/stable/Configuring/#call-caching + # "md5" will compute an md5 hash of the file content. + # "xxh64" will compute an xxh64 hash of the file content. Much faster than md5 + # "fingerprint" will take last modified time, size and hash the first 10 mb with xxh64 to create a file fingerprint. + # This strategy will only be effective if the duplication-strategy (above) is set to "hard-link", as copying changes the last modified time. # "path" will compute an md5 hash of the file path. This strategy will only be effective if the duplication-strategy (above) is set to "soft-link", # in order to allow for the original file path to be hashed. # "path+modtime" will compute an md5 hash of the file path and the last modified time. 
The same conditions as for "path" apply here. - # Default: file - hashing-strategy: "file" + # Default: "md5" + hashing-strategy: "md5" + + # When the 'fingerprint' strategy is used set how much of the beginning of the file is read as fingerprint. + # If the file is smaller than this size the entire file will be read. + # Default: 10485760 (10MB). + fingerprint-size: 10485760 # When true, will check if a sibling file with the same name and the .md5 extension exists, and if it does, use the content of this file as a hash. # If false or the md5 does not exist, will proceed with the above-defined hashing strategy. @@ -545,6 +559,30 @@ When running a job on the Config (Shared Filesystem) backend, Cromwell provides } ``` +#### Call cache strategy options for local filesystem + +* hash based options. These read the entire file. These strategies work with containers. + * `xxh64` (community-supported*). This uses the 64-bit implementation of the [xxHash](https://www.xxhash.com) + algorithm. This algorithm is optimized for file integrity hashing and provides a more than 10x speed improvement over + md5. + * `md5`. The well-known md5sum algorithm +* Path based options. These are based on filepath. Extremely lightweight, but only work with the `soft-link` file +caching strategy and can therefore never work with containers. + * `path` creates a md5 hash of the path. + * `path+modtime` creates a md5 hash of the path and its modification time. +* Fingerprinting. This strategy works with containers. + * `fingerprint` (community-supported*) tries to create a fingerprint for each file by taking its last modified time (milliseconds since + epoch in hexadecimal) + size (bytes in hexadecimal) + the xxh64 sum of the first 10 MB** of the file. + It is much more lightweight than the hash based options while still unique enough that collisions are unlikely. 
This + strategy works well for workflows that generate multi-gigabyte files and where hashing these files on the + cromwell instance causes CPU or I/O problems. + NOTE: This strategy requires hard-linking as a duplication strategy, as copying changes the last modified time. + +(*) The `fingerprint` and `xxh64` strategies are features that are community supported by Cromwell's HPC community. There +is no official support from the core Cromwell team. + +(**) This value is configurable. + ### Workflow log directory To change the directory where Cromwell writes workflow logs, change the directory location via the setting: diff --git a/docs/RuntimeAttributes.md b/docs/RuntimeAttributes.md index 16c0a0732b4..c7c0e8295c5 100644 --- a/docs/RuntimeAttributes.md +++ b/docs/RuntimeAttributes.md @@ -51,8 +51,8 @@ There are a number of additional runtime attributes that apply to the Google Clo - [preemptible](#preemptible) - [bootDiskSizeGb](#bootdisksizegb) - [noAddress](#noaddress) -- [gpuCount and gpuType](#gpucount-and-gputype) -- [cpuPlatform](#cpuPlatform) +- [gpuCount, gpuType, and nvidiaDriverVersion](#gpucount-gputype-and-nvidiadriverversion) +- [cpuPlatform](#cpuplatform) diff --git a/docs/cromwell_features/CallCaching.md b/docs/cromwell_features/CallCaching.md index 5a9669717bd..323bddec04d 100644 --- a/docs/cromwell_features/CallCaching.md +++ b/docs/cromwell_features/CallCaching.md @@ -27,14 +27,8 @@ or **referenced from the original cached job** depending on the Cromwell Cromwell offers the option to cache file hashes within the scope of a root workflow to prevent repeatedly requesting the hashes of the same files multiple times. File hash caching is off by default and can be turned on with the configuration option `system.file-hash-cache=true`. -***Call cache copy authorization failure prefix blacklisting*** - -Cromwell has the option to filter call cache hits based on authorization failures copying previous -call cache hits.
In a multi-user environment user A might cache hit to one of user B's results -but that doesn't necessarily mean user A is authorized to read user B's outputs from the filesystem. Call cache blacklisting -allows Cromwell to record on a per-root-workflow level which file path prefixes were involved in cache result copy authorization failures. -If Cromwell sees that the file paths for a candidate cache hit have a blacklisted prefix, Cromwell will quickly -fail the copy attempt without doing any potentially expensive I/O. +***Call cache blacklisting*** +Cromwell offers the ability to filter cache hits based on copying failures. Call cache blacklisting configuration looks like: @@ -47,23 +41,82 @@ call-caching { invalidate-bad-cache-results = false blacklist-cache { - # The call caching blacklist cache is off by default. This is used to blacklist cache hit paths based on the - # prefixes of cache hit paths that Cromwell previously failed to copy for authorization reasons. - enabled: true - # Guava cache concurrency. - concurrency: 10000 - # How long entries in the cache should live from the time of their last access. - ttl: 20 minutes - # Maximum number of entries in the cache. - size: 1000 + # The call caching blacklist cache is off by default. This cache is used to blacklist cache hits based on cache + # hit ids or buckets of cache hit paths that Cromwell has previously failed to copy for permissions reasons. + enabled: true + + # All blacklisting values below are optional. In order to use groupings (blacklist caches shared among root + # workflows) a value must be specified for `groupings.workflow-option` in configuration and the workflows to + # be grouped must be submitted with workflow options specifying the same group. + groupings { + workflow-option: call-cache-blacklist-group + concurrency: 10000 + ttl: 2 hours + size: 1000 + } + + buckets { + # Guava cache concurrency. 
+ concurrency: 10000 + # How long entries in the cache should live from the time of their last access. + ttl: 1 hour + # Maximum number of entries in the cache. + size: 1000 + } + + hits { + # Guava cache concurrency. + concurrency: 10000 + # How long entries in the cache should live from the time of their last access. + ttl: 1 hour + # Maximum number of entries in the cache. + size: 20000 + } } } ``` -Call cache blacklisting could be supported by any backend type though is currently implemented only for the Google Pipelines API (PAPI) backends. -For PAPI backends the bucket is considered the prefix for blacklisting purposes. +**** Blacklist cache grouping **** + +By default Cromwell's blacklist caches work at the granularity of root workflows, but Cromwell can also be configured to +share a blacklist cache among a group of workflows. +If a value is specified for `call-caching.blacklist-cache.groupings.workflow-option` and a workflow option is specified +having a matching key, all workflows specifying the same value will share a blacklist cache. + +For example, if Cromwell configuration contains `call-caching.blacklist-cache.groupings.workflow-option = "project"` and +a workflow is submitted with the options + +```json +{ + "project": "Mary" +} +``` + +then this workflow will share a blacklist cache with any other workflows whose workflow options contain `"project": "Mary"`. + +Grouping of blacklist caches can significantly improve blacklisting effectiveness and overall call caching performance. +Workflows should be grouped by their effective authorization to ensure the same filesystem/object store permissions +exist for every workflow in the group. + +**** Hit blacklisting **** + +If a cache hit fails copying for any reason, Cromwell will record that failure in the blacklist cache and will not use +the hit again. Hit blacklisting is particularly effective at improving call caching performance in conjunction with the +grouping feature described above.
+ +**** Path prefix (GCS bucket) blacklisting on 403 Forbidden errors **** + +In a multi-user environment user A might cache hit to one of user B's results +but that doesn't necessarily mean user A is authorized to read user B's outputs from the filesystem. Call cache blacklisting +allows Cromwell to record which file path prefixes were involved in cache result copy authorization failures. +If Cromwell sees that the file paths for a candidate cache hit have a blacklisted prefix, Cromwell will quickly +fail the copy attempt without doing any potentially expensive I/O. + +Path prefix blacklisting could be supported by any backend type though it is currently implemented only for Google +(PAPI) backends. For Google backends the GCS bucket is considered the prefix for blacklisting purposes. + -***Call cache hit path prefixes*** +***Call cache whitelisting*** In a multi-user environment where access to job outputs may be restricted among different users, it can be useful to limit cache hits to those that are more likely to actually be readable for cache hit copies. 
diff --git a/engine/src/main/scala/cromwell/engine/workflow/WorkflowManagerActor.scala b/engine/src/main/scala/cromwell/engine/workflow/WorkflowManagerActor.scala index 197b5e5845f..34efa3809e5 100644 --- a/engine/src/main/scala/cromwell/engine/workflow/WorkflowManagerActor.scala +++ b/engine/src/main/scala/cromwell/engine/workflow/WorkflowManagerActor.scala @@ -9,9 +9,9 @@ import cats.data.NonEmptyList import com.typesafe.config.Config import common.exception.ThrowableAggregation import cromwell.backend.async.KnownJobFailureException -import cromwell.backend.standard.callcaching.{BlacklistCache, RootWorkflowFileHashCacheActor} +import cromwell.backend.standard.callcaching.{CallCachingBlacklistManager, RootWorkflowFileHashCacheActor} import cromwell.core.Dispatcher.EngineDispatcher -import cromwell.core.{CacheConfig, WorkflowId} +import cromwell.core.WorkflowId import cromwell.engine.SubWorkflowStart import cromwell.engine.backend.BackendSingletonCollection import cromwell.engine.workflow.WorkflowActor._ @@ -25,7 +25,6 @@ import org.apache.commons.lang3.exception.ExceptionUtils import scala.concurrent.ExecutionContext import scala.concurrent.duration._ -import scala.language.postfixOps import scala.util.Try object WorkflowManagerActor { @@ -278,6 +277,8 @@ class WorkflowManagerActor(params: WorkflowManagerActorParams) logger.debug(s"$tag transitioning from $fromState to $toState") } + private val callCachingBlacklistManager = new CallCachingBlacklistManager(config, logger) + /** * Submit the workflow and return an updated copy of the state data reflecting the addition of a * Workflow ID -> WorkflowActorRef entry. 
@@ -293,11 +294,6 @@ class WorkflowManagerActor(params: WorkflowManagerActorParams) val fileHashCacheActorProps: Option[Props] = fileHashCacheEnabled.option(RootWorkflowFileHashCacheActor.props(params.ioActor, workflowId)) - val callCachingBlacklistCache: Option[BlacklistCache] = for { - config <- config.as[Option[Config]]("call-caching.blacklist-cache") - cacheConfig <- CacheConfig.optionalConfig(config, defaultConcurrency = 1000, defaultSize = 1000, defaultTtl = 1 hour) - } yield BlacklistCache(cacheConfig) - val wfProps = WorkflowActor.props( workflowToStart = workflow, conf = config, @@ -318,7 +314,7 @@ class WorkflowManagerActor(params: WorkflowManagerActorParams) workflowHeartbeatConfig = params.workflowHeartbeatConfig, totalJobsByRootWf = new AtomicInteger(), fileHashCacheActorProps = fileHashCacheActorProps, - blacklistCache = callCachingBlacklistCache) + blacklistCache = callCachingBlacklistManager.blacklistCacheFor(workflow)) val wfActor = context.actorOf(wfProps, name = s"WorkflowActor-$workflowId") wfActor ! 
SubscribeTransitionCallBack(self) diff --git a/engine/src/main/scala/cromwell/engine/workflow/lifecycle/deletion/DeleteWorkflowFilesActor.scala b/engine/src/main/scala/cromwell/engine/workflow/lifecycle/deletion/DeleteWorkflowFilesActor.scala index 7bf5017841a..9959de379e0 100644 --- a/engine/src/main/scala/cromwell/engine/workflow/lifecycle/deletion/DeleteWorkflowFilesActor.scala +++ b/engine/src/main/scala/cromwell/engine/workflow/lifecycle/deletion/DeleteWorkflowFilesActor.scala @@ -10,6 +10,7 @@ import cromwell.engine.io.IoAttempts.EnhancedCromwellIoException import cromwell.engine.workflow.lifecycle.deletion.DeleteWorkflowFilesActor._ import cromwell.engine.workflow.lifecycle.execution.callcaching._ import cromwell.filesystems.gcs.batch.GcsBatchCommandBuilder +import cromwell.services.CallCaching.CallCachingEntryId import cromwell.services.EngineServicesStore import cromwell.services.metadata.MetadataService.PutMetadataAction import cromwell.services.metadata.impl.FileDeletionStatus diff --git a/engine/src/main/scala/cromwell/engine/workflow/lifecycle/execution/WorkflowExecutionActor.scala b/engine/src/main/scala/cromwell/engine/workflow/lifecycle/execution/WorkflowExecutionActor.scala index 2b23d647b5a..23efde68e4b 100644 --- a/engine/src/main/scala/cromwell/engine/workflow/lifecycle/execution/WorkflowExecutionActor.scala +++ b/engine/src/main/scala/cromwell/engine/workflow/lifecycle/execution/WorkflowExecutionActor.scala @@ -621,6 +621,16 @@ case class WorkflowExecutionActor(params: WorkflowExecutionActorParams) val ejeaName = s"${workflowDescriptor.id}-EngineJobExecutionActor-${jobKey.tag}" val backendName = backendLifecycleActorFactory.name val backendSingleton = params.backendSingletonCollection.backendSingletonActors(backendName) + + val callCachingParameters = EngineJobExecutionActor.CallCachingParameters( + mode = workflowDescriptor.callCachingMode, + readActor = params.callCacheReadActor, + writeActor = params.callCacheWriteActor, + fileHashCacheActor 
= params.fileHashCacheActor, + maxFailedCopyAttempts = params.rootConfig.getInt("call-caching.max-failed-copy-attempts"), + blacklistCache = params.blacklistCache + ) + val ejeaProps = EngineJobExecutionActor.props( self, jobKey, @@ -631,15 +641,11 @@ case class WorkflowExecutionActor(params: WorkflowExecutionActorParams) serviceRegistryActor = serviceRegistryActor, ioActor = params.ioActor, jobStoreActor = params.jobStoreActor, - callCacheReadActor = params.callCacheReadActor, - callCacheWriteActor = params.callCacheWriteActor, workflowDockerLookupActor = params.workflowDockerLookupActor, jobTokenDispenserActor = params.jobTokenDispenserActor, backendSingleton, - workflowDescriptor.callCachingMode, command, - fileHashCacheActor = params.fileHashCacheActor, - blacklistCache = params.blacklistCache + callCachingParameters ) val ejeaRef = context.actorOf(ejeaProps, ejeaName) diff --git a/engine/src/main/scala/cromwell/engine/workflow/lifecycle/execution/callcaching/CallCache.scala b/engine/src/main/scala/cromwell/engine/workflow/lifecycle/execution/callcaching/CallCache.scala index 188f4d2924d..78ba0561cea 100644 --- a/engine/src/main/scala/cromwell/engine/workflow/lifecycle/execution/callcaching/CallCache.scala +++ b/engine/src/main/scala/cromwell/engine/workflow/lifecycle/execution/callcaching/CallCache.scala @@ -16,11 +16,11 @@ import cromwell.database.sql.tables._ import cromwell.engine.workflow.lifecycle.execution.callcaching.CallCache._ import cromwell.engine.workflow.lifecycle.execution.callcaching.CallCacheReadActor.AggregatedCallHashes import cromwell.engine.workflow.lifecycle.execution.callcaching.EngineJobHashingActor.CallCacheHashes +import cromwell.services.CallCaching.CallCachingEntryId import wom.core._ import scala.concurrent.{ExecutionContext, Future} -final case class CallCachingEntryId(id: Int) /** * Given a database-layer CallCacheStore, this accessor can access the database with engine-friendly data types. 
*/ diff --git a/engine/src/main/scala/cromwell/engine/workflow/lifecycle/execution/callcaching/CallCacheDiffActor.scala b/engine/src/main/scala/cromwell/engine/workflow/lifecycle/execution/callcaching/CallCacheDiffActor.scala index ad2a41a24b9..585f78f63a0 100644 --- a/engine/src/main/scala/cromwell/engine/workflow/lifecycle/execution/callcaching/CallCacheDiffActor.scala +++ b/engine/src/main/scala/cromwell/engine/workflow/lifecycle/execution/callcaching/CallCacheDiffActor.scala @@ -32,17 +32,17 @@ class CallCacheDiffActor(serviceRegistryActor: ActorRef) extends LoggingFSM[Call when(WaitingForMetadata) { // First Response // Response A - case Event(SuccessfulMetadataJsonResponse(GetMetadataAction(originalQuery, _), responseJson), data@CallCacheDiffWithRequest(queryA, _, None, None, _)) if queryA == originalQuery => + case Event(SuccessfulMetadataJsonResponse(GetMetadataAction(originalQuery, _, _), responseJson), data@CallCacheDiffWithRequest(queryA, _, None, None, _)) if queryA == originalQuery => stay() using data.copy(responseA = Option(WorkflowMetadataJson(responseJson))) // Response B - case Event(SuccessfulMetadataJsonResponse(GetMetadataAction(originalQuery, _), responseJson), data@CallCacheDiffWithRequest(_, queryB, None, None, _)) if queryB == originalQuery => + case Event(SuccessfulMetadataJsonResponse(GetMetadataAction(originalQuery, _, _), responseJson), data@CallCacheDiffWithRequest(_, queryB, None, None, _)) if queryB == originalQuery => stay() using data.copy(responseB = Option(WorkflowMetadataJson(responseJson))) // Second Response // Response A - case Event(SuccessfulMetadataJsonResponse(GetMetadataAction(originalQuery, _), responseJson), CallCacheDiffWithRequest(queryA, queryB, None, Some(responseB), replyTo)) if queryA == originalQuery => + case Event(SuccessfulMetadataJsonResponse(GetMetadataAction(originalQuery, _, _), responseJson), CallCacheDiffWithRequest(queryA, queryB, None, Some(responseB), replyTo)) if queryA == originalQuery => 
buildDiffAndRespond(queryA, queryB, WorkflowMetadataJson(responseJson), responseB, replyTo) // Response B - case Event(SuccessfulMetadataJsonResponse(GetMetadataAction(originalQuery, _), responseJson), CallCacheDiffWithRequest(queryA, queryB, Some(responseA), None, replyTo)) if queryB == originalQuery => + case Event(SuccessfulMetadataJsonResponse(GetMetadataAction(originalQuery, _, _), responseJson), CallCacheDiffWithRequest(queryA, queryB, Some(responseA), None, replyTo)) if queryB == originalQuery => buildDiffAndRespond(queryA, queryB, responseA, WorkflowMetadataJson(responseJson), replyTo) case Event(FailedMetadataJsonResponse(_, failure), data: CallCacheDiffWithRequest) => data.replyTo ! FailedCallCacheDiffResponse(failure) diff --git a/engine/src/main/scala/cromwell/engine/workflow/lifecycle/execution/callcaching/CallCacheInvalidateActor.scala b/engine/src/main/scala/cromwell/engine/workflow/lifecycle/execution/callcaching/CallCacheInvalidateActor.scala index 2a7fda049fd..16bb0fb9caa 100644 --- a/engine/src/main/scala/cromwell/engine/workflow/lifecycle/execution/callcaching/CallCacheInvalidateActor.scala +++ b/engine/src/main/scala/cromwell/engine/workflow/lifecycle/execution/callcaching/CallCacheInvalidateActor.scala @@ -3,6 +3,7 @@ package cromwell.engine.workflow.lifecycle.execution.callcaching import akka.actor.{Actor, ActorLogging, Props} import cromwell.core.Dispatcher.EngineDispatcher import cromwell.database.sql.tables.CallCachingEntry +import cromwell.services.CallCaching.CallCachingEntryId import scala.concurrent.ExecutionContext import scala.util.{Failure, Success} diff --git a/engine/src/main/scala/cromwell/engine/workflow/lifecycle/execution/callcaching/CallCacheReadActor.scala b/engine/src/main/scala/cromwell/engine/workflow/lifecycle/execution/callcaching/CallCacheReadActor.scala index ba3c213ed45..696c90099e7 100644 --- a/engine/src/main/scala/cromwell/engine/workflow/lifecycle/execution/callcaching/CallCacheReadActor.scala +++ 
b/engine/src/main/scala/cromwell/engine/workflow/lifecycle/execution/callcaching/CallCacheReadActor.scala @@ -13,6 +13,7 @@ import cromwell.services.EnhancedThrottlerActor import scala.concurrent.Future import scala.util.{Failure, Success} import CallCache._ +import cromwell.services.CallCaching.CallCachingEntryId /** * Queues up work sent to it because its receive is non-blocking. diff --git a/engine/src/main/scala/cromwell/engine/workflow/lifecycle/execution/callcaching/EngineJobHashingActor.scala b/engine/src/main/scala/cromwell/engine/workflow/lifecycle/execution/callcaching/EngineJobHashingActor.scala index f553859711c..5a3494795d5 100644 --- a/engine/src/main/scala/cromwell/engine/workflow/lifecycle/execution/callcaching/EngineJobHashingActor.scala +++ b/engine/src/main/scala/cromwell/engine/workflow/lifecycle/execution/callcaching/EngineJobHashingActor.scala @@ -11,6 +11,7 @@ import cromwell.engine.workflow.lifecycle.execution.callcaching.CallCache.CallCa import cromwell.engine.workflow.lifecycle.execution.callcaching.CallCacheHashingJobActor.{CompleteFileHashingResult, FinalFileHashingResult, InitialHashingResult, NoFileHashesResult} import cromwell.engine.workflow.lifecycle.execution.callcaching.CallCacheReadingJobActor.NextHit import cromwell.engine.workflow.lifecycle.execution.callcaching.EngineJobHashingActor._ +import cromwell.services.CallCaching.CallCachingEntryId import cromwell.services.metadata.CallMetadataKeys /** diff --git a/engine/src/main/scala/cromwell/engine/workflow/lifecycle/execution/callcaching/FetchCachedResultsActor.scala b/engine/src/main/scala/cromwell/engine/workflow/lifecycle/execution/callcaching/FetchCachedResultsActor.scala index daa13b70e2f..0eef565caf2 100644 --- a/engine/src/main/scala/cromwell/engine/workflow/lifecycle/execution/callcaching/FetchCachedResultsActor.scala +++ b/engine/src/main/scala/cromwell/engine/workflow/lifecycle/execution/callcaching/FetchCachedResultsActor.scala @@ -6,6 +6,7 @@ import 
cromwell.core.Dispatcher.EngineDispatcher import cromwell.core.simpleton.WomValueSimpleton import cromwell.database.sql.SqlConverters._ import cromwell.engine.workflow.lifecycle.execution.callcaching.FetchCachedResultsActor.{CachedOutputLookupFailed, CachedOutputLookupSucceeded} +import cromwell.services.CallCaching.CallCachingEntryId import scala.concurrent.ExecutionContext import scala.util.{Failure, Success} diff --git a/engine/src/main/scala/cromwell/engine/workflow/lifecycle/execution/job/EngineJobExecutionActor.scala b/engine/src/main/scala/cromwell/engine/workflow/lifecycle/execution/job/EngineJobExecutionActor.scala index 43c962db4d8..e0ad5290ad4 100644 --- a/engine/src/main/scala/cromwell/engine/workflow/lifecycle/execution/job/EngineJobExecutionActor.scala +++ b/engine/src/main/scala/cromwell/engine/workflow/lifecycle/execution/job/EngineJobExecutionActor.scala @@ -3,7 +3,7 @@ package cromwell.engine.workflow.lifecycle.execution.job import akka.actor.SupervisorStrategy.{Escalate, Stop} import akka.actor.{ActorInitializationException, ActorRef, LoggingFSM, OneForOneStrategy, Props} import cats.data.NonEmptyList -import cromwell.backend.BackendCacheHitCopyingActor.{CacheCopyError, CopyOutputsCommand, CopyingOutputsFailedResponse, LoggableCacheCopyError, MetricableCacheCopyError} +import cromwell.backend.BackendCacheHitCopyingActor.{CacheCopyFailure, CopyOutputsCommand, CopyingOutputsFailedResponse, CopyAttemptError, BlacklistSkip} import cromwell.backend.BackendJobExecutionActor._ import cromwell.backend.BackendLifecycleActor.AbortJobCommand import cromwell.backend.MetricableCacheCopyErrorCategory.MetricableCacheCopyErrorCategory @@ -37,6 +37,7 @@ import cromwell.engine.workflow.lifecycle.{EngineLifecycleActorAbortCommand, Tim import cromwell.engine.workflow.tokens.JobExecutionTokenDispenserActor.{JobExecutionTokenDispensed, JobExecutionTokenRequest, JobExecutionTokenReturn} import cromwell.jobstore.JobStoreActor._ import cromwell.jobstore._ +import 
cromwell.services.CallCaching.CallCachingEntryId import cromwell.services.EngineServicesStore import cromwell.services.instrumentation.CromwellInstrumentation import cromwell.services.metadata.CallMetadataKeys.CallCachingKeys @@ -57,15 +58,11 @@ class EngineJobExecutionActor(replyTo: ActorRef, val serviceRegistryActor: ActorRef, ioActor: ActorRef, jobStoreActor: ActorRef, - callCacheReadActor: ActorRef, - callCacheWriteActor: ActorRef, workflowDockerLookupActor: ActorRef, jobTokenDispenserActor: ActorRef, backendSingletonActor: Option[ActorRef], - callCachingMode: CallCachingMode, command: BackendJobExecutionActorCommand, - fileHashCachingActor: Option[ActorRef], - blacklistCache: Option[BlacklistCache]) extends LoggingFSM[EngineJobExecutionActorState, EJEAData] + callCachingParameters: CallCachingParameters) extends LoggingFSM[EngineJobExecutionActorState, EJEAData] with WorkflowLogging with CallMetadataHelper with JobInstrumentation @@ -97,8 +94,8 @@ class EngineJobExecutionActor(replyTo: ActorRef, if (backendLifecycleActorFactory.fileHashingActorProps.isEmpty) CallCachingOff else if (jobDescriptorKey.node.callable.meta.get("volatile").contains(MetaValueElementBoolean(true))) CallCachingOff else if (backendLifecycleActorFactory.cacheHitCopyingActorProps.isEmpty || jobDescriptorKey.attempt > 1) { - callCachingMode.withoutRead - } else callCachingMode + callCachingParameters.mode.withoutRead + } else callCachingParameters.mode } // For tests: @@ -226,7 +223,21 @@ class EngineJobExecutionActor(replyTo: ActorRef, writeToMetadata(Map( callCachingHitResultMetadataKey -> false, callCachingReadResultMetadataKey -> "Cache Miss")) - log.debug("Cache miss for job {}", jobTag) + + if (data.cacheHitFailureCount > 0) { + val totalHits = data.cacheHitFailureCount + val copyFails = data.failedCopyAttempts + val blacklisted = totalHits - copyFails + workflowLogger.info( + s"Could not copy a suitable cache hit for $jobTag. 
" + + s"EJEA attempted to copy $totalHits cache hits before failing. " + + s"Of these $copyFails failed to copy and $blacklisted were already blacklisted from previous attempts. " + + s"Falling back to running job." + ) + } else { + workflowLogger.info("Could not copy a suitable cache hit for {}. No copy attempts were made.", jobTag) + } + runJob(data) case Event(hashes: CallCacheHashes, data: ResponsePendingData) => addHashesAndStay(data, hashes) @@ -241,7 +252,7 @@ class EngineJobExecutionActor(replyTo: ActorRef, when(FetchingCachedOutputsFromDatabase) { case Event( CachedOutputLookupSucceeded(womValueSimpletons, jobDetritus, returnCode, cacheResultId, cacheHitDetails), - data@ResponsePendingData(_, _, _, _, Some(ejeaCacheHit), _, _), + data@ResponsePendingData(_, _, _, _, Some(ejeaCacheHit), _, _, _), ) => if (cacheResultId != ejeaCacheHit.hit.cacheResultId) { // Sanity check: was this the right set of results (a false here is a BAD thing!): @@ -267,7 +278,7 @@ class EngineJobExecutionActor(replyTo: ActorRef, // Backend copying response: case Event( response: JobSucceededResponse, - data@ResponsePendingData(_, _, Some(Success(hashes)), _, _, _, _), + data@ResponsePendingData(_, _, Some(Success(hashes)), _, _, _, _, _), ) => logCacheHitSuccessAndNotifyMetadata(data) saveCacheResults(hashes, data.withSuccessResponse(response)) @@ -280,7 +291,7 @@ class EngineJobExecutionActor(replyTo: ActorRef, saveJobCompletionToJobStore(data.withSuccessResponse(response)) case Event( CopyingOutputsFailedResponse(_, cacheCopyAttempt, reason), - data@ResponsePendingData(_, _, _, _, Some(cacheHit), _, _) + data@ResponsePendingData(_, _, _, _, Some(cacheHit), _, _, _) ) if cacheCopyAttempt == cacheHit.hitNumber => invalidateCacheHitAndTransition(cacheHit, data, reason) @@ -320,7 +331,7 @@ class EngineJobExecutionActor(replyTo: ActorRef, // writeToCache is true and all hashes have already been retrieved - save to the cache case Event( response: JobSucceededResponse, - 
data@ResponsePendingData(_, _, Some(Success(hashes)), _, _, _, _) + data@ResponsePendingData(_, _, Some(Success(hashes)), _, _, _, _, _) ) if effectiveCallCachingMode.writeToCache => eventList ++= response.executionEvents // Publish the image used now that we have it as we might lose the information if Cromwell is restarted @@ -342,7 +353,7 @@ class EngineJobExecutionActor(replyTo: ActorRef, // writeToCache is true and all hashes already retrieved - save to job store case Event( response: BackendJobFailedResponse, - data@ResponsePendingData(_, _, Some(Success(_)), _, _, _, _) + data@ResponsePendingData(_, _, Some(Success(_)), _, _, _, _, _) ) if effectiveCallCachingMode.writeToCache => saveJobCompletionToJobStore(data.withFailedResponse(response)) // Hashes are still missing and we want them (writeToCache is true) - wait for them @@ -579,7 +590,7 @@ class EngineJobExecutionActor(replyTo: ActorRef, def initializeJobHashing(jobDescriptor: BackendJobDescriptor, activity: CallCachingActivity, callCachingEligible: CallCachingEligible): Try[ActorRef] = { val maybeFileHashingActorProps = backendLifecycleActorFactory.fileHashingActorProps map { - _.apply(jobDescriptor, initializationData, serviceRegistryActor, ioActor, fileHashCachingActor) + _.apply(jobDescriptor, initializationData, serviceRegistryActor, ioActor, callCachingParameters.fileHashCacheActor) } maybeFileHashingActorProps match { @@ -590,7 +601,7 @@ class EngineJobExecutionActor(replyTo: ActorRef, jobDescriptor, initializationData, fileHashingActorProps, - CallCacheReadingJobActor.props(callCacheReadActor, callCachePathPrefixes), + CallCacheReadingJobActor.props(callCachingParameters.readActor, callCachePathPrefixes), backendLifecycleActorFactory.runtimeAttributeDefinitions(initializationData), backendLifecycleActorFactory.nameForCallCachingPurposes, activity, @@ -623,9 +634,9 @@ class EngineJobExecutionActor(replyTo: ActorRef, cacheCopyAttempt: Int) = { backendLifecycleActorFactory.cacheHitCopyingActorProps 
match { case Some(propsMaker) => - val backendCacheHitCopyingActorProps = propsMaker(data.jobDescriptor, initializationData, serviceRegistryActor, ioActor, cacheCopyAttempt, blacklistCache) + val backendCacheHitCopyingActorProps = propsMaker(data.jobDescriptor, initializationData, serviceRegistryActor, ioActor, cacheCopyAttempt, callCachingParameters.blacklistCache) val cacheHitCopyActor = context.actorOf(backendCacheHitCopyingActorProps, buildCacheHitCopyingActorName(data.jobDescriptor, cacheResultId)) - cacheHitCopyActor ! CopyOutputsCommand(womValueSimpletons, jobDetritusFiles, returnCode) + cacheHitCopyActor ! CopyOutputsCommand(womValueSimpletons, jobDetritusFiles, cacheResultId, returnCode) replyTo ! JobRunning(data.jobDescriptor.key, data.jobDescriptor.evaluatedTaskInputs) goto(BackendIsCopyingCachedOutputs) case None => @@ -668,15 +679,18 @@ class EngineJobExecutionActor(replyTo: ActorRef, } data.ejha match { - case Some(ejha) => + case Some(ejha) if data.failedCopyAttempts < callCachingParameters.maxFailedCopyAttempts => workflowLogger.debug("Trying to use another cache hit for job: {}", jobDescriptorKey) ejha ! NextHit - goto(CheckingCallCache) + goto(CheckingCallCache) using data + case Some(_) => + writeToMetadata(Map( + callCachingHitResultMetadataKey -> false, + callCachingReadResultMetadataKey -> s"Cache Miss (${callCachingParameters.maxFailedCopyAttempts} failed copy attempts)")) + log.warning("Cache miss for job {} due to exceeding the maximum of {} failed copy attempts.", jobTag, callCachingParameters.maxFailedCopyAttempts) + runJob(data) case _ => - workflowLogger.info( - "Could not find a suitable cache hit. " + - "Call cache hit process had {} total hit failures before completing unsuccessfully. " + - "Falling back to running job: {}", data.cacheHitFailureCount, jobDescriptorKey) + workflowLogger.error("Programmer error: We got a cache failure but there was no hashing actor scanning for hits. 
Falling back to running job") runJob(data) } } @@ -691,17 +705,36 @@ class EngineJobExecutionActor(replyTo: ActorRef, writeToMetadata(metadataMap) - workflowLogger.info( - "Call cache hit process had {} total hit failures before completing successfully", - data.cacheHitFailureCount, - ) + val totalFailures = data.cacheHitFailureCount + if (totalFailures > 0) { + val copyFailures = data.failedCopyAttempts + val blacklisted = totalFailures - copyFailures + + workflowLogger.info( + s"Call cache hit process had $totalFailures total hit failures before completing successfully" + + s" (of which, $copyFailures were copy failures, $blacklisted were already blacklisted)" + ) + } else { + workflowLogger.info("Call cache hit process had 0 total hit failures before completing successfully") + } } private def logCacheHitFailure(data: ResponsePendingData, reason: Throwable): Unit = { - workflowLogger.info(s"Failed copying cache results for job $jobDescriptorKey (${reason.getClass.getSimpleName}: ${reason.getMessage})") + val totalFailures = data.cacheHitFailureCount + + val multipleFailuresContext = if (totalFailures > 0) { + val copyFailures = data.failedCopyAttempts + val blacklisted = totalFailures - copyFailures + s" (this job has already failed to copy from $totalFailures other hits, of which $copyFailures were copy failures and $blacklisted were already blacklisted)" + } else "" + + workflowLogger.info( + s"Failure copying cache results for job $jobDescriptorKey (${reason.getClass.getSimpleName}: ${reason.getMessage})" + + multipleFailuresContext + ) } - private def metricizeCacheHitFailure(data: ResponsePendingData, failureCategory: MetricableCacheCopyErrorCategory): Unit = { + private def publishBlacklistReadMetrics(data: ResponsePendingData, failureCategory: MetricableCacheCopyErrorCategory): Unit = { val callCachingErrorsMetricPath: NonEmptyList[String] = NonEmptyList.of( "job", @@ -709,13 +742,21 @@ class EngineJobExecutionActor(replyTo: ActorRef, 
increment(callCachingErrorsMetricPath) } - private def invalidateCacheHitAndTransition(ejeaCacheHit: EJEACacheHit, data: ResponsePendingData, reason: CacheCopyError) = { - reason match { - case LoggableCacheCopyError(failure) => logCacheHitFailure(data, failure) - case MetricableCacheCopyError(failureCategory) => metricizeCacheHitFailure(data, failureCategory) + private def invalidateCacheHitAndTransition(ejeaCacheHit: EJEACacheHit, data: ResponsePendingData, reason: CacheCopyFailure) = { + val copyAttemptIncrement = reason match { + case CopyAttemptError(failure) => + logCacheHitFailure(data, failure) + // An actual attempt to copy was made and failed so increment the attempt counter by 1. + 1 + case BlacklistSkip(failureCategory) => + publishBlacklistReadMetrics(data, failureCategory) + // Blacklisted hits are simply skipped and do not result in incrementing the attempt counter. + 0 } - val updatedData = data.copy(cacheHitFailureCount = data.cacheHitFailureCount + 1) + // Increment the total failure count and actual copy failure count as appropriate. + val updatedData = data.copy(cacheHitFailureCount = data.cacheHitFailureCount + 1, + failedCopyAttempts = data.failedCopyAttempts + copyAttemptIncrement) if (invalidationRequired) { workflowLogger.warn(s"Invalidating cache entry ${ejeaCacheHit.hit.cacheResultId} (Cache entry details: ${ejeaCacheHit.details})") @@ -733,12 +774,12 @@ class EngineJobExecutionActor(replyTo: ActorRef, } private def checkCacheEntryExistence() = { - callCacheReadActor ! CallCacheEntryForCall(workflowIdForLogging, jobDescriptorKey) + callCachingParameters.readActor ! CallCacheEntryForCall(workflowIdForLogging, jobDescriptorKey) goto(CheckingCacheEntryExistence) } private def saveCacheResults(hashes: CallCacheHashes, data: SucceededResponseData) = { - callCacheWriteActor ! SaveCallCacheHashes(CallCacheHashBundle(workflowIdForLogging, hashes, data.response)) + callCachingParameters.writeActor ! 
SaveCallCacheHashes(CallCacheHashBundle(workflowIdForLogging, hashes, data.response)) val updatedData = data.copy(hashes = Option(Success(hashes))) goto(UpdatingCallCache) using updatedData } @@ -824,7 +865,15 @@ object EngineJobExecutionActor { } } + case class CallCachingParameters( + mode: CallCachingMode, + readActor: ActorRef, + writeActor: ActorRef, + fileHashCacheActor: Option[ActorRef], + maxFailedCopyAttempts: Int, + blacklistCache: Option[BlacklistCache] + ) /** Commands */ sealed trait EngineJobExecutionActorCommand @@ -841,15 +890,12 @@ object EngineJobExecutionActor { serviceRegistryActor: ActorRef, ioActor: ActorRef, jobStoreActor: ActorRef, - callCacheReadActor: ActorRef, - callCacheWriteActor: ActorRef, workflowDockerLookupActor: ActorRef, jobTokenDispenserActor: ActorRef, backendSingletonActor: Option[ActorRef], - callCachingMode: CallCachingMode, command: BackendJobExecutionActorCommand, - fileHashCacheActor: Option[ActorRef], - blacklistCache: Option[BlacklistCache]) = { + callCachingParameters: EngineJobExecutionActor.CallCachingParameters) = { + Props(new EngineJobExecutionActor( replyTo = replyTo, jobDescriptorKey = jobDescriptorKey, @@ -860,15 +906,11 @@ object EngineJobExecutionActor { serviceRegistryActor = serviceRegistryActor, ioActor = ioActor, jobStoreActor = jobStoreActor, - callCacheReadActor = callCacheReadActor, - callCacheWriteActor = callCacheWriteActor, workflowDockerLookupActor = workflowDockerLookupActor, jobTokenDispenserActor = jobTokenDispenserActor, backendSingletonActor = backendSingletonActor, - callCachingMode = callCachingMode, command = command, - fileHashCachingActor = fileHashCacheActor, - blacklistCache = blacklistCache)).withDispatcher(EngineDispatcher) + callCachingParameters = callCachingParameters)).withDispatcher(EngineDispatcher) } case class EJEACacheHit(hit: CacheHit, hitNumber: Int, details: Option[String]) @@ -885,7 +927,8 @@ object EngineJobExecutionActor { ejha: Option[ActorRef] = None, ejeaCacheHit: 
Option[EJEACacheHit] = None, backendJobActor: Option[ActorRef] = None, - cacheHitFailureCount: Int = 0 + cacheHitFailureCount: Int = 0, + failedCopyAttempts: Int = 0 ) extends EJEAData { def withEJHA(ejha: ActorRef): EJEAData = this.copy(ejha = Option(ejha)) diff --git a/engine/src/main/scala/cromwell/engine/workflow/lifecycle/execution/job/preparation/JobPreparationActor.scala b/engine/src/main/scala/cromwell/engine/workflow/lifecycle/execution/job/preparation/JobPreparationActor.scala index 6625d2af953..2fbaf9d5d59 100644 --- a/engine/src/main/scala/cromwell/engine/workflow/lifecycle/execution/job/preparation/JobPreparationActor.scala +++ b/engine/src/main/scala/cromwell/engine/workflow/lifecycle/execution/job/preparation/JobPreparationActor.scala @@ -26,6 +26,7 @@ import cromwell.services.metadata.{CallMetadataKeys, MetadataEvent, MetadataValu import eu.timepit.refined.api.Refined import wom.RuntimeAttributesKeys import wom.callable.Callable.InputDefinition +import wom.expression.IoFunctionSet import wom.format.MemorySize import wom.values._ @@ -59,7 +60,11 @@ class JobPreparationActor(workflowDescriptor: EngineWorkflowDescriptor, private[preparation] lazy val noResponseTimeout: FiniteDuration = 3 minutes private[preparation] val ioEc = context.system.dispatchers.lookup(Dispatcher.IoDispatcher) - private[preparation] lazy val expressionLanguageFunctions = factory.expressionLanguageFunctions(workflowDescriptor.backendDescriptor, jobKey, initializationData, ioActor, ioEc) + private[preparation] lazy val expressionLanguageFunctions = { + val ioFunctionSet: IoFunctionSet = factory.expressionLanguageFunctions(workflowDescriptor.backendDescriptor, jobKey, initializationData, ioActor, ioEc) + ioFunctionSet.makeInputSpecificFunctions + } + private[preparation] lazy val dockerHashCredentials = factory.dockerHashCredentials(workflowDescriptor.backendDescriptor, initializationData) private[preparation] lazy val runtimeAttributeDefinitions = 
factory.runtimeAttributeDefinitions(initializationData) private[preparation] lazy val hasDockerDefinition = runtimeAttributeDefinitions.exists(_.name == DockerValidation.instance.key) diff --git a/engine/src/main/scala/cromwell/engine/workflow/lifecycle/finalization/CopyWorkflowOutputsActor.scala b/engine/src/main/scala/cromwell/engine/workflow/lifecycle/finalization/CopyWorkflowOutputsActor.scala index 6da958c2fe6..0fdeafc06a1 100644 --- a/engine/src/main/scala/cromwell/engine/workflow/lifecycle/finalization/CopyWorkflowOutputsActor.scala +++ b/engine/src/main/scala/cromwell/engine/workflow/lifecycle/finalization/CopyWorkflowOutputsActor.scala @@ -99,7 +99,7 @@ class CopyWorkflowOutputsActor(workflowId: WorkflowId, override val ioActor: Act // compiled for every single file. // "execution" should be optional, because its not created on AWS. // Also cacheCopy or attempt- folders are optional. - lazy val truncateRegex = ".*/call-[^/]*/(cacheCopy/)?(attempt-[1-9]+/)?(execution/)?".r + lazy val truncateRegex = ".*/call-[^/]*/(shard-[0-9]+/)?(cacheCopy/)?(attempt-[0-9]+/)?(execution/)?".r val outputFileDestinations = rootAndFiles flatMap { case (workflowRoot, outputs) => outputs map { output => diff --git a/engine/src/main/scala/cromwell/engine/workflow/workflowstore/workflowstore_.scala b/engine/src/main/scala/cromwell/engine/workflow/workflowstore/workflowstore_.scala index 685fa5eb45d..382b963670d 100644 --- a/engine/src/main/scala/cromwell/engine/workflow/workflowstore/workflowstore_.scala +++ b/engine/src/main/scala/cromwell/engine/workflow/workflowstore/workflowstore_.scala @@ -2,7 +2,7 @@ package cromwell.engine.workflow.workflowstore import java.time.OffsetDateTime -import cromwell.core.{HogGroup, WorkflowId, WorkflowSourceFilesCollection} +import cromwell.core.{HasWorkflowIdAndSources, HogGroup, WorkflowId, WorkflowSourceFilesCollection} /** * States of a workflow for which it can be fetched from the workflow store and started. 
@@ -27,4 +27,4 @@ final case class WorkflowToStart(id: WorkflowId, submissionTime: OffsetDateTime, sources: WorkflowSourceFilesCollection, state: StartableState, - hogGroup: HogGroup) + hogGroup: HogGroup) extends HasWorkflowIdAndSources diff --git a/engine/src/test/scala/cromwell/engine/workflow/lifecycle/execution/callcaching/CallCacheReadingJobActorSpec.scala b/engine/src/test/scala/cromwell/engine/workflow/lifecycle/execution/callcaching/CallCacheReadingJobActorSpec.scala index 9a4da415240..f61806c2583 100644 --- a/engine/src/test/scala/cromwell/engine/workflow/lifecycle/execution/callcaching/CallCacheReadingJobActorSpec.scala +++ b/engine/src/test/scala/cromwell/engine/workflow/lifecycle/execution/callcaching/CallCacheReadingJobActorSpec.scala @@ -7,6 +7,7 @@ import cromwell.engine.workflow.lifecycle.execution.callcaching.CallCacheHashing import cromwell.engine.workflow.lifecycle.execution.callcaching.CallCacheReadActor._ import cromwell.engine.workflow.lifecycle.execution.callcaching.CallCacheReadingJobActor.{CCRJAWithData, WaitingForCacheHitOrMiss, _} import cromwell.engine.workflow.lifecycle.execution.callcaching.EngineJobHashingActor.{CacheHit, CacheMiss, HashError} +import cromwell.services.CallCaching.CallCachingEntryId import org.scalatest.concurrent.Eventually import org.scalatest.{FlatSpecLike, Matchers} diff --git a/engine/src/test/scala/cromwell/webservice/MetadataBuilderActorSpec.scala b/engine/src/test/scala/cromwell/webservice/MetadataBuilderActorSpec.scala index 70bb9cf2d92..d46bee3ac3f 100644 --- a/engine/src/test/scala/cromwell/webservice/MetadataBuilderActorSpec.scala +++ b/engine/src/test/scala/cromwell/webservice/MetadataBuilderActorSpec.scala @@ -28,6 +28,7 @@ class MetadataBuilderActorSpec extends TestKitSuite("Metadata") with AsyncFlatSp behavior of "MetadataBuilderActor" + val defaultSafetyRowNumberThreshold = 1000000 val defaultTimeout: FiniteDuration = 1.second.dilated implicit val timeout: Timeout = defaultTimeout @@ -39,7 +40,7 @@ 
class MetadataBuilderActorSpec extends TestKitSuite("Metadata") with AsyncFlatSp def readMetadataWorkerMaker = () => mockReadMetadataWorkerActor.props - val mba = system.actorOf(MetadataBuilderActor.props(readMetadataWorkerMaker)) + val mba = system.actorOf(MetadataBuilderActor.props(readMetadataWorkerMaker, 1000000)) val response = mba.ask(action).mapTo[MetadataJsonResponse] mockReadMetadataWorkerActor.expectMsg(defaultTimeout, action) mockReadMetadataWorkerActor.reply(MetadataLookupResponse(queryReply, events)) @@ -47,6 +48,24 @@ class MetadataBuilderActorSpec extends TestKitSuite("Metadata") with AsyncFlatSp response.mapTo[SuccessfulMetadataJsonResponse] map { b => b.responseJson shouldBe expectedRes.parseJson} } + def assertMetadataFailureResponse(action: MetadataServiceAction, + mdQuery: MetadataQuery, + metadataServiceResponse: MetadataServiceResponse, + expectedException: Exception): Future[Assertion] = { + val mockReadMetadataWorkerActor = TestProbe() + val mba = system.actorOf(MetadataBuilderActor.props(() => mockReadMetadataWorkerActor.props, defaultSafetyRowNumberThreshold)) + val response = mba.ask(action).mapTo[MetadataServiceResponse] + + mockReadMetadataWorkerActor.expectMsg(defaultTimeout, action) + mockReadMetadataWorkerActor.reply(metadataServiceResponse) + + response map { r => r shouldBe a [FailedMetadataJsonResponse] } + response.mapTo[FailedMetadataJsonResponse] map { b => + b.reason.getClass shouldBe expectedException.getClass + b.reason.getMessage shouldBe expectedException.getMessage + } + } + it should "build workflow scope tree from metadata events" in { def makeEvent(workflow: WorkflowId, key: Option[MetadataJobKey]) = { MetadataEvent(MetadataKey(workflow, key, "NOT_CHECKED"), MetadataValue("NOT_CHECKED")) @@ -494,14 +513,14 @@ class MetadataBuilderActorSpec extends TestKitSuite("Metadata") with AsyncFlatSp val mainQueryAction = GetMetadataAction(mainQuery) val subQuery = MetadataQuery(subWorkflowId, None, None, None, None, 
expandSubWorkflows = true) - val subQueryAction = GetMetadataAction(subQuery) + val subQueryAction = GetMetadataAction(subQuery, checkTotalMetadataRowNumberBeforeQuerying = false) val parentProbe = TestProbe() val mockReadMetadataWorkerActor = TestProbe() def readMetadataWorkerMaker = () => mockReadMetadataWorkerActor.props - val metadataBuilder = TestActorRef(MetadataBuilderActor.props(readMetadataWorkerMaker), parentProbe.ref, s"MetadataActor-${UUID.randomUUID()}") + val metadataBuilder = TestActorRef(MetadataBuilderActor.props(readMetadataWorkerMaker, 1000000), parentProbe.ref, s"MetadataActor-${UUID.randomUUID()}") val response = metadataBuilder.ask(mainQueryAction).mapTo[MetadataJsonResponse] mockReadMetadataWorkerActor.expectMsg(defaultTimeout, mainQueryAction) mockReadMetadataWorkerActor.reply(MetadataLookupResponse(mainQuery, mainEvents)) @@ -550,7 +569,7 @@ class MetadataBuilderActorSpec extends TestKitSuite("Metadata") with AsyncFlatSp val mockReadMetadataWorkerActor = TestProbe() def readMetadataWorkerMaker= () => mockReadMetadataWorkerActor.props - val metadataBuilder = TestActorRef(MetadataBuilderActor.props(readMetadataWorkerMaker), parentProbe.ref, s"MetadataActor-${UUID.randomUUID()}") + val metadataBuilder = TestActorRef(MetadataBuilderActor.props(readMetadataWorkerMaker, 1000000), parentProbe.ref, s"MetadataActor-${UUID.randomUUID()}") val response = metadataBuilder.ask(queryNoExpandAction).mapTo[MetadataJsonResponse] mockReadMetadataWorkerActor.expectMsg(defaultTimeout, queryNoExpandAction) mockReadMetadataWorkerActor.reply(MetadataLookupResponse(queryNoExpand, mainEvents)) @@ -662,6 +681,37 @@ class MetadataBuilderActorSpec extends TestKitSuite("Metadata") with AsyncFlatSp } matchesExpectations.reduceLeft(_ && _) shouldBe true } + + it should "politely refuse building metadata JSON if metadata number of rows is too large" in { + val workflowId = WorkflowId.randomId() + + val mdQuery = MetadataQuery(workflowId, None, None, None, None, 
expandSubWorkflows = false) + val action = GetMetadataAction(mdQuery) + + val metadataRowNumber = 100500 + val expectedException = new MetadataTooLargeNumberOfRowsException(workflowId, metadataRowNumber, defaultSafetyRowNumberThreshold) + assertMetadataFailureResponse( + action, + mdQuery, + MetadataLookupFailedTooLargeResponse(mdQuery, metadataRowNumber), + expectedException + ) + } + + it should "politely refuse building metadata JSON if timeout occurs on attempt to read metadata from database" in { + val workflowId = WorkflowId.randomId() + + val mdQuery = MetadataQuery(workflowId, None, None, None, None, expandSubWorkflows = false) + val action = GetMetadataAction(mdQuery) + + val expectedException = new MetadataTooLargeTimeoutException(workflowId) + assertMetadataFailureResponse( + action, + mdQuery, + MetadataLookupFailedTimeoutResponse(mdQuery), + expectedException + ) + } } object MetadataBuilderActorSpec { diff --git a/engine/src/test/scala/cromwell/webservice/routes/CromwellApiServiceSpec.scala b/engine/src/test/scala/cromwell/webservice/routes/CromwellApiServiceSpec.scala index 77ac86346e7..dcbba0b3fe1 100644 --- a/engine/src/test/scala/cromwell/webservice/routes/CromwellApiServiceSpec.scala +++ b/engine/src/test/scala/cromwell/webservice/routes/CromwellApiServiceSpec.scala @@ -609,7 +609,7 @@ object CromwellApiServiceSpec { sender ! SuccessfulMetadataJsonResponse(request, MetadataBuilderActor.processOutputsResponse(id, event)) case request @ GetLogs(id, _) => sender ! SuccessfulMetadataJsonResponse(request, MetadataBuilderActor.workflowMetadataResponse(id, logsEvents(id), includeCallsIfEmpty = false, Map.empty)) - case request @ GetMetadataAction(MetadataQuery(id, _, _, withKeys, withoutKeys, _), _) => + case request @ GetMetadataAction(MetadataQuery(id, _, _, withKeys, withoutKeys, _), _, _) => val withKeysList = withKeys.map(_.toList).getOrElse(List.empty) val withoutKeysList = withoutKeys.map(_.toList).getOrElse(List.empty) sender ! 
SuccessfulMetadataJsonResponse(request, responseMetadataValues(id, withKeysList, withoutKeysList)) diff --git a/hybridCarboniteMetadataService/src/main/scala/cromwell/services/metadata/hybridcarbonite/CarbonitingMetadataFreezerActor.scala b/hybridCarboniteMetadataService/src/main/scala/cromwell/services/metadata/hybridcarbonite/CarbonitingMetadataFreezerActor.scala index 476538c00a9..9c2582656b5 100644 --- a/hybridCarboniteMetadataService/src/main/scala/cromwell/services/metadata/hybridcarbonite/CarbonitingMetadataFreezerActor.scala +++ b/hybridCarboniteMetadataService/src/main/scala/cromwell/services/metadata/hybridcarbonite/CarbonitingMetadataFreezerActor.scala @@ -5,13 +5,13 @@ import java.nio.file.StandardOpenOption import akka.actor.{ActorRef, LoggingFSM, Props} import cromwell.core.WorkflowId import cromwell.core.io.{AsyncIo, DefaultIoCommandBuilder} -import cromwell.services.metadata.MetadataArchiveStatus.{ArchiveFailed, Archived} +import cromwell.services.metadata.MetadataArchiveStatus.{ArchiveFailed, Archived, TooLargeToArchive} import cromwell.services.metadata.MetadataService.GetMetadataAction import cromwell.services.metadata.hybridcarbonite.CarboniteWorkerActor.CarboniteWorkflowComplete import cromwell.services.metadata.hybridcarbonite.CarbonitingMetadataFreezerActor._ import cromwell.services.metadata.impl.MetadataDatabaseAccess import cromwell.services.metadata.{MetadataArchiveStatus, MetadataQuery} -import cromwell.services.{FailedMetadataJsonResponse, MetadataServicesStore, SuccessfulMetadataJsonResponse} +import cromwell.services.{FailedMetadataJsonResponse, MetadataServicesStore, MetadataTooLargeException, SuccessfulMetadataJsonResponse} import cromwell.util.GracefulShutdownHelper.ShutdownCommand import scala.concurrent.ExecutionContext @@ -55,6 +55,10 @@ class CarbonitingMetadataFreezerActor(freezingConfig: ActiveMetadataFreezingConf } goto(Freezing) using FreezingData(workflowId) + case Event(FailedMetadataJsonResponse(_, reason: 
MetadataTooLargeException), FetchingData(workflowId)) => + log.error(reason, s"Carboniting failure: $reason. Marking as $TooLargeToArchive") + scheduleDatabaseUpdateAndAwaitResult(workflowId, TooLargeToArchive) + case Event(FailedMetadataJsonResponse(_, reason), FetchingData(workflowId)) => log.error(reason, s"Failed to fetch workflow $workflowId's metadata to archive. Marking as $ArchiveFailed") scheduleDatabaseUpdateAndAwaitResult(workflowId, ArchiveFailed) diff --git a/project/Dependencies.scala b/project/Dependencies.scala index 04158459e79..6d7d0acfa84 100644 --- a/project/Dependencies.scala +++ b/project/Dependencies.scala @@ -56,6 +56,7 @@ object Dependencies { private val liquibaseSlf4jV = "2.0.0" private val liquibaseV = "3.6.3" private val logbackV = "1.2.3" + private val lz4JavaV = "1.7.1" private val mariadbV = "2.4.2" private val metrics3ScalaV = "3.5.10" // https://github.com/erikvanoosten/metrics-scala/tree/f733e26#download-4x private val metrics3StatsdV = "4.2.0" @@ -526,6 +527,9 @@ object Dependencies { val bcsBackendDependencies = commonDependencies ++ refinedTypeDependenciesList ++ aliyunBatchComputeDependencies val tesBackendDependencies = akkaHttpDependencies val sparkBackendDependencies = akkaHttpDependencies + val sfsBackendDependencies = List ( + "org.lz4" % "lz4-java" % lz4JavaV + ) val testDependencies = List( "org.scalatest" %% "scalatest" % scalatestV, @@ -579,6 +583,7 @@ object Dependencies { ossFileSystemDependencies ++ perfDependencies ++ serverDependencies ++ + sfsBackendDependencies ++ sparkBackendDependencies ++ spiDependencies ++ spiUtilDependencies ++ diff --git a/project/Version.scala b/project/Version.scala index d143cefa8ac..5328133a361 100644 --- a/project/Version.scala +++ b/project/Version.scala @@ -5,7 +5,7 @@ import sbt._ object Version { // Upcoming release, or current if we're on a master / hotfix branch - val cromwellVersion = "50" + val cromwellVersion = "51" /** * Returns true if this project should be considered a 
snapshot. diff --git a/scripts/metadata_comparison/README.MD b/scripts/metadata_comparison/README.MD new file mode 100644 index 00000000000..42a85cc035f --- /dev/null +++ b/scripts/metadata_comparison/README.MD @@ -0,0 +1,36 @@ +# Metadata Comparison Scripts + +This `metadata_comparison` Python project provides tools to compare workflows run +in different Cromwell environments in terms of overall cost and performance. + +## Running a script + +Choose a script to run. For this example we will use the `extractor`. + +From this top-level `metadata_comparison` directory (i.e. the one +containing this README.MD file), run: + +```sh +# python3 -m metadata_comparison.extractor +``` + +### Questions + +- Q: Why not run the scripts directly, e.g. `python3 extractor.py`? + - A: Running python from this outer directory allows it to discover the `metadata_comparison` + project, and thus allows imports across and between scripts. + +## Unit tests + +To run the python unit tests from the top-level `metadata_comparison` directory +(i.e. the one containing this README.MD file), run: +```sh +# python3 -m unittest discover -v +``` + +This will: + - Find the `metadata_comparison` project in that subdirectory. + - And make it importable to other scripts. + - Run the python built-in unittest script, which will: + - Discover the tests project in the `test` directory + - Run them, verbosely. 
\ No newline at end of file diff --git a/scripts/metadata_comparison/metadata_comparison/__init__.py b/scripts/metadata_comparison/metadata_comparison/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/scripts/metadata_comparison/metadata_comparison/comparer.py b/scripts/metadata_comparison/metadata_comparison/comparer.py new file mode 100644 index 00000000000..aed83d73128 --- /dev/null +++ b/scripts/metadata_comparison/metadata_comparison/comparer.py @@ -0,0 +1,104 @@ +#!/usr/bin/env python3 +# +# comparer.py +# +# Purpose: Compare performance metadata JSON files produced by Digester and produce result in CSV format +# +# Usage: python3 comparer.py [-h] [-v] [--json_paths JSONPATH [JSONPATH ...]] [--output_path OUTPUTPATH] +# +# Python Prereqs (at least, the ones which I needed to manually install... YMMV): +# +# * pip3 install --upgrade pandas +# * pip3 install --upgrade google-api-python-client +# * pip3 install --upgrade google-cloud-storage +# +# Remember to login to create application default credentials before use: +# % gcloud auth application-default login + +from typing import List, Tuple +import argparse +import json +import pandas +import google.auth +from google.cloud import storage +import logging +from metadata_comparison.lib.logging import set_log_verbosity, quieten_chatty_imports +from metadata_comparison.lib.storage import upload_blob +from metadata_comparison.lib.argument_regex import gcs_path_regex_validator, digester_version_regex_validator, \ + workflow_regex_validator + +logger = logging.getLogger('metadata_comparison.comparer') + +def read_digester_jsons_from_gcs(bucket_name: str, + base_path: str, + digester_version: str, + workflow_ids: List[str], + storage_client: storage.Client) -> List[Tuple[str, dict]]: + bucket = storage_client.get_bucket(bucket_name) + result = [] + for workflow_id in workflow_ids: + blob = bucket.blob(f"{base_path}/{workflow_id}/digests/{digester_version}/digest.json") + json_string_bytes = 
blob.download_as_string() + result.append((workflow_id, json.loads(json_string_bytes))) + + return result + + +def compare_jsons(workflow_ids_and_jsons: List[Tuple[str, dict]]) -> pandas.DataFrame: + """ + Uses pandas to convert each digest JSON into a dataframe indexed by its workflow id, and concatenates those dataframes into a single one. + Performs a sanity check, raising an Exception naming the offending workflow id, if a JSON's compared columns differ from those of the previous JSON. + """ + columnToCompareNameEnding = ".cromwellTotalTimeSeconds" + versionColumnName = "version" + result = pandas.DataFrame() + last_cols = [] + for workflow_id_and_json in workflow_ids_and_jsons: + df = pandas.json_normalize(workflow_id_and_json[1]) + cols = [c for c in df.columns if c.endswith(columnToCompareNameEnding)] + cols.sort() + cols.insert(0, versionColumnName) + + if last_cols and last_cols != cols: + raise Exception(f"JSON data at {workflow_id_and_json[0]} doesn't have matching subset of columns. Expected: {last_cols} but got {cols}") + + last_cols = cols + df.index = [workflow_id_and_json[0]] + result = pandas.concat([result, df[cols]]) + + renameVersionColumnTo = "digester format version" + result.rename(columns={versionColumnName: renameVersionColumnTo}, inplace=True) + result.index.name = "workflow id" + + return result + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='Compare performance metadata JSONs and produce CSV result') + parser.add_argument('-v', '--verbose', action='store_true') + parser.add_argument('--digester-version', metavar='DIGESTERVERSION', type=digester_version_regex_validator, nargs=1, + help='Compare digests produced by this version of the digester') + parser.add_argument('--digest-gcs-base-path', metavar='DIGESTGCSBASEPATH', type=gcs_path_regex_validator, nargs=1, + help='GCS base path to the directory containing JSONs produced by digester') + parser.add_argument('--output-gcs-file-path', metavar='OUTPUTGCSFILE', type=gcs_path_regex_validator, nargs=1, + help='GCS path to output 
CSV file') + parser.add_argument('--workflow-ids', metavar='WORKFLOWIDS', type=workflow_regex_validator, nargs='+', + help='Workflow ids for performance comparison') + + args = parser.parse_args() + set_log_verbosity(args.verbose) + quieten_chatty_imports() + logger.info("Starting Comparer operation.") + + credentials, project_id = google.auth.default() + storage_client = storage.Client(credentials = credentials) + input_gcs_bucket, input_gcs_path = args.digest_gcs_base_path[0] + + workflow_ids_and_jsons = read_digester_jsons_from_gcs(input_gcs_bucket, input_gcs_path, args.digester_version[0], args.workflow_ids, storage_client) + comparison_result_df = compare_jsons(workflow_ids_and_jsons) + result_csv_string = comparison_result_df.to_csv() + + output_gcs_bucket, output_gcs_path = args.output_gcs_file_path[0] + upload_blob(output_gcs_bucket, result_csv_string, output_gcs_path, storage_client, logger) + + logger.info('Comparer operation completed successfully.') diff --git a/scripts/metadata_comparison/metadata_comparison/digester.py b/scripts/metadata_comparison/metadata_comparison/digester.py new file mode 100644 index 00000000000..8e0cf9f4cea --- /dev/null +++ b/scripts/metadata_comparison/metadata_comparison/digester.py @@ -0,0 +1,109 @@ +import argparse +import json +from metadata_comparison.lib import logging, operation_ids +from metadata_comparison.lib.operation_ids import CallNameSequence, JsonObject, OperationId +from metadata_comparison.lib.comparison_paths import ComparisonPath +from metadata_comparison.lib.operations_digesters import OperationDigester + +import dateutil.parser +from typing import AnyStr, Dict + +Version = "0.0.1" + + +def main(args: argparse.Namespace) -> None: + for path in args.paths: + parent_path = ComparisonPath.create(path) + + workflow_path = parent_path / 'workflow.json' + operations_dir_path = parent_path / 'operations' + + digest_parent = parent_path / 'digests' / Version + digest_path = digest_parent / 'digest.json' + + if not 
digest_path.exists() or args.force: + digest_parent.mkdir_p() + digest_json = digest(workflow_path, operations_dir_path) + digest_string = json.dumps(digest_json, sort_keys=True, indent=4) + digest_path.write_text(digest_string) + else: + raise ValueError(f'digest file already exists at {digest_path} and --force not specified') + + +def parse_args() -> argparse.Namespace: + def validate_path(p: AnyStr) -> AnyStr: + if ComparisonPath.is_valid_path_string(p): + return p + raise ValueError(f'{p} is not a valid path whatsoever') + + parser = argparse.ArgumentParser( + description='Digest workflow metadata and job operation details, reading from and reuploading to GCS.') + parser.add_argument('-v', '--verbose', action='store_true', + help='whether to log verbosely (default False)') + parser.add_argument('-f', '--force', action='store_true', + help='whether to overwrite existing digests (default False)') + parser.add_argument('paths', metavar="PATH", nargs='+', type=validate_path, + help="Location at which to find metadata (local or GCS)") + + return parser.parse_args() + + +CallName = AnyStr + + +def digest(workflow_path: ComparisonPath, operations_path: ComparisonPath) -> JsonObject: + def call_fn(succeeded_operations: Dict[CallName, JsonObject], + operation_id: OperationId, + path: CallNameSequence, + attempt: JsonObject) -> None: + backend_status = attempt.get('backendStatus', 'Unknown') + # This script should only ever be pointed at successful workflow metadata. All jobs that have a backend status + # other than `Success` must have later been re-run successfully, so any un`Success`ful attempts are ignored. + # It's possible that a future version of the digester might actually want to look at these jobs since they + # may have completed some lifecycle events which could be useful in accumulating more performance data. 
+ if backend_status == 'Success': + string_path = '.'.join(path) + cromwell_start = attempt.get('start') + cromwell_end = attempt.get('end') + + cromwell_total_time_seconds = (dateutil.parser.parse(cromwell_end) - + dateutil.parser.parse(cromwell_start)).total_seconds() + + bare_operation_id = operation_id.split('/')[-1] + operations_file_path = operations_path / f'{bare_operation_id}.json' + operations_data = operations_file_path.read_text() + operations_metadata = json.loads(operations_data) + operation = OperationDigester.create(operations_metadata) + + papi_total_time_seconds = operation.total_time_seconds() + + cromwell_additional_total_time_seconds = \ + float("%.3f" % (cromwell_total_time_seconds - papi_total_time_seconds)) + + succeeded_operations[string_path] = { + "attempt": attempt.get('attempt'), + "shardIndex": attempt.get('shardIndex'), + "operationId": operation_id, + "cromwellStart": cromwell_start, + "cromwellEnd": cromwell_end, + "cromwellTotalTimeSeconds": cromwell_total_time_seconds, + "papiStart": operation.start_time(), + "papiEnd": operation.end_time(), + "papiTotalTimeSeconds": operation.total_time_seconds(), + "cromwellAdditionalTotalTimeSeconds": cromwell_additional_total_time_seconds, + "dockerImagePullSeconds": operation.docker_image_pull_seconds() + } + + data = workflow_path.read_text() + metadata = json.loads(data) + + shards = operation_ids.visit_papi_operations(metadata, call_fn, initial_accumulator={}) + return {'version': Version, 'calls': shards, 'workflowId': metadata['id']} + + +if __name__ == "__main__": + logging.quieten_chatty_imports() + _args = parse_args() + logging.set_log_verbosity(_args.verbose) + + main(_args) diff --git a/scripts/metadata_comparison/metadata_comparison/extractor.py b/scripts/metadata_comparison/metadata_comparison/extractor.py new file mode 100755 index 00000000000..f41a645c059 --- /dev/null +++ b/scripts/metadata_comparison/metadata_comparison/extractor.py @@ -0,0 +1,186 @@ +#!/usr/bin/env python3 
+# +# extractor.py +# +# Purpose: Read workflow metadata from Cromwell, and all metadata for its jobs, +# and upload it to a GCS bucket +# +# Usage: python3 extractor.py [ [...]] +# +# Python Prereqs (at least, the ones which I needed to manually install... YMMV): +# +# * pip3 install --upgrade requests +# * pip3 install --upgrade google-api-python-client +# * pip3 install --upgrade google-cloud +# * pip3 install --upgrade google-cloud-storage +# * pip3 install --upgrade gitpython +# +# Remember to login to create application default credentials before use: +# % gcloud auth application-default login + +import argparse +import json +import requests +from google.cloud import storage +import google.auth +from pathlib import Path +import git +import os +import zipfile +import logging +from metadata_comparison.lib.argument_regex import gcs_path_regex_validator, workflow_regex_validator +from metadata_comparison.lib.operation_ids import get_operation_id_number, visit_papi_operations, CallNameSequence, \ + JsonObject, OperationId +from metadata_comparison.lib.papi.papi_clients import PapiClients +from metadata_comparison.lib.storage import upload_blob +from typing import Any, AnyStr, List, Mapping, Sequence, Union +from metadata_comparison.lib.logging import quieten_chatty_imports, set_log_verbosity + +logger = logging.getLogger('metadata_comparison.extractor') + + +def __create_snapshot_of_local_repo(repo: git.Repo, cromwell_snapshots_path: Union[Path, str]) -> Union[Path, str]: + last_commit_hash = repo.head.commit.hexsha + if not os.path.exists(cromwell_snapshots_path): + os.makedirs(cromwell_snapshots_path) + current_snapshot_path = cromwell_snapshots_path / last_commit_hash + if not os.path.exists(current_snapshot_path): + os.makedirs(current_snapshot_path) + repo.clone(current_snapshot_path) + return current_snapshot_path + + +def __create_zip_file(zip_file_path: Union[Path, str], current_snapshot_path: Union[Path, str]): + with zipfile.ZipFile(zip_file_path, "a", 
allowZip64=False) as zip_file: + for root, dirs, files in os.walk(current_snapshot_path): + for file in files: + zip_file.write(os.path.join(root, file)) + + +def upload_local_checkout(cromwell_path: Path, + gcs_bucket: str, + gcs_path: str, + gcs_storage_client: storage.Client) -> None: + cromwell_snapshots_path = cromwell_path.parent / "cromwell_snapshots" + + repo = git.Repo(cromwell_path) + if repo.is_dirty(): + raise Exception("Unable to upload local checkout to GCS: repository is dirty - need to do check in first.") + + zip_file_name = f"cromwell_code.zip" + zip_file_path = Path(cromwell_snapshots_path / zip_file_name) + if not os.path.exists(zip_file_path): + current_snapshot_path = __create_snapshot_of_local_repo(repo, cromwell_snapshots_path) + __create_zip_file(zip_file_path, current_snapshot_path) + + upload_blob(gcs_bucket, zip_file_path.read_bytes(), f"{gcs_path}/{zip_file_name}", gcs_storage_client, logger) + + +def upload_local_config(config_path: Path, gcs_bucket: str, gcs_path: str, gcs_storage_client: storage.Client): + configuration_file_name = "cromwell.conf" + upload_blob(gcs_bucket, config_path.read_text(), f"{gcs_path}/{configuration_file_name}", gcs_storage_client, logger) + + +def fetch_raw_workflow_metadata(cromwell_url: str, workflow: str) -> (requests.Response, JsonObject): + """Fetches workflow metadata for a workflow. 
Returns the raw response and the dict read from json""" + url = f'{cromwell_url}/api/workflows/v1/{workflow}/metadata?expandSubWorkflows=true' + logger.info(f'Fetching Cromwell metadata from {url}...') + result = requests.get(url) + return result.content, result.json() + + +def upload_workflow_metadata_json(bucket_name: str, + raw_workflow_metadata: bytes, + workflow_gcs_base_path: str, + gcs_storage_client: storage.Client) -> None: + workflow_gcs_metadata_upload_path = f'{workflow_gcs_base_path}/metadata.json' + upload_blob(bucket_name, raw_workflow_metadata, workflow_gcs_metadata_upload_path, gcs_storage_client, logger) + + +def upload_operations_metadata_json(bucket_name: str, + operation_id: str, + operations_metadata: Mapping[str, Any], + workflow_gcs_base_path: str, + gcs_storage_client: storage.Client) -> None: + """Uploads metadata to cloud storage, as json""" + operation_upload_path = f'{workflow_gcs_base_path}/operations/{get_operation_id_number(operation_id)}.json' + formatted_metadata = json.dumps(operations_metadata, indent=2) + upload_blob(bucket_name, bytes(formatted_metadata, 'utf-8'), operation_upload_path, gcs_storage_client, logger) + + +def find_operation_ids_in_metadata(json_metadata: JsonObject) -> Sequence[AnyStr]: + """Finds all instances of PAPI operations IDs in a workflow""" + # Eg given: + # { + # "calls": { + # "workflow_name.task_name": [ + # { + # "jobId": "projects/broad-dsde-cromwell-dev/operations/01234567891011121314", + # ... 
+ # + # We want to extract "projects/broad-dsde-cromwell-dev/operations/01234567891011121314" + def call_fn(acc: List[AnyStr], + operation_id: OperationId, + call_name_sequence: CallNameSequence, + attempt: JsonObject) -> None: + acc.append(operation_id) + + return visit_papi_operations(json_metadata, call_fn, initial_accumulator=[]) + + +def process_workflow(cromwell_url: str, + gcs_bucket: str, + gcs_path: str, + gcs_storage_client: storage.Client, + papi_clients: PapiClients, + workflow: str) -> None: + raw_metadata, json_metadata = fetch_raw_workflow_metadata(cromwell_url, workflow) + workflow_gcs_base_path = f'{gcs_path}/{workflow}/extractor' + + operation_ids = find_operation_ids_in_metadata(json_metadata) + for id in operation_ids: + operation_metadata = papi_clients.request_operation_metadata(id) + upload_operations_metadata_json(gcs_bucket, id, operation_metadata, workflow_gcs_base_path, gcs_storage_client) + upload_workflow_metadata_json(gcs_bucket, raw_metadata, workflow_gcs_base_path, gcs_storage_client) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description='Extract metadata and operation details for workflows and upload to GCS') + parser.add_argument('-v', '--verbose', action='store_true') + parser.add_argument('cromwell_url', metavar='CROMWELL', type=str, nargs=1, + help='Cromwell host') + parser.add_argument('gcs_path', metavar='GCSPATH', type=gcs_path_regex_validator, nargs=1, + help='GCS path to upload to') + parser.add_argument('workflows', metavar='WORKFLOW', type=workflow_regex_validator, nargs='+', + help='Workflows to process') + parser.add_argument('cromwell_checkout_path', metavar='CROMWELLCHECKOUTPATH', type=Path, + help='Path to Cromwell git checkout used to run workflows') + parser.add_argument('cromwell_config_path', metavar='CROMWELLCONFIGPATH', type=Path, + help='Path to Cromwell configuration file used to run workflows') + + args = parser.parse_args() + set_log_verbosity(args.verbose) + 
quieten_chatty_imports() + + cromwell_url = args.cromwell_url[0] + gcs_bucket, gcs_path = args.gcs_path[0] + workflows = args.workflows + + credentials, project_id = google.auth.default() + storage_client = storage.Client(credentials=credentials) + papi_clients = PapiClients(credentials) + + logger.info(f'cromwell: {cromwell_url}') + logger.info(f'gcs_bucket: {gcs_bucket}; gcs_path: {gcs_path}') + logger.info(f'workflows: {workflows}') + + for workflow in workflows: + process_workflow(cromwell_url, gcs_bucket, gcs_path, storage_client, papi_clients, workflow) + + if args.cromwell_checkout_path: + upload_local_checkout(args.cromwell_checkout_path, gcs_bucket, gcs_path, storage_client) + if args.cromwell_config_path: + upload_local_config(args.cromwell_config_path, gcs_bucket, gcs_path, storage_client) + + logger.info('Extractor operation completed successfully.') diff --git a/scripts/metadata_comparison/metadata_comparison/lib/__init__.py b/scripts/metadata_comparison/metadata_comparison/lib/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/scripts/metadata_comparison/metadata_comparison/lib/argument_regex.py b/scripts/metadata_comparison/metadata_comparison/lib/argument_regex.py new file mode 100644 index 00000000000..6cee2009243 --- /dev/null +++ b/scripts/metadata_comparison/metadata_comparison/lib/argument_regex.py @@ -0,0 +1,66 @@ +#!/usr/bin/env python3 + +import argparse +import re + + +def workflow_regex_validator(value: str) -> str: + """Makes sure that a value is a valid Cromwell workflow ID then returns the workflow ID""" + workflow_regex=re.compile('^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$') + if not workflow_regex.match(value): + msg = f'Invalid workflow ID {value}. 
Expected {workflow_regex.pattern}' + raise argparse.ArgumentTypeError(msg) + else: + return value + + +def url_regex_validator(value: str) -> str: + """ + Validates then extract the root of the Cromwell URL from the various URL strings which might be provided. + Deliberately flexible because it's tedious to remember which script requires which type of format. + eg: + 'http://localhost' => 'http://localhost' + 'http://localhost:8000' => 'http://localhost:8000' + 'http://localhost:8000/' => 'http://localhost:8000' + 'http://localhost:8000/api/workflows/' => 'http://localhost:8000' + 'http://localhost:8000/custom/prefix/api/workflows/' => 'http://localhost:8000/custom/prefix' + """ + url_regex = re.compile('(http(s?)://((?!/api).)*[^/])(/(api.*)?)?') + m = url_regex.match(value) + if m: + return m.group(1) + else: + msg = f'Invalid Cromwell URL {value}. Expected {url_regex.pattern}' + raise argparse.ArgumentTypeError(msg) + + +def gcs_path_regex_validator(value: str) -> (str, str): + """ + Validates then extracts the bucket and object-path from a GS string. Returned as a pair. + eg: + 'gs://bucket/path/to/directory/' -> ('bucket', 'path/to/directory') + or + 'gs://bucket/path/to/file.ext' -> ('bucket', 'path/to/file.ext') + """ + bucket_class = 'a-zA-Z0-9-' + object_class = '_\\.' + bucket_class + gcs_regex = re.compile(f'^gs://(?P[{bucket_class}]+)/(?P([{object_class}]+/)*[{object_class}]+)/?$') + m = gcs_regex.match(value) + if m: + return m.group('bucket'), m.group('object') + else: + msg = f'Invalid GCS path {value}. Expected {gcs_regex.pattern}' + raise argparse.ArgumentTypeError(msg) + + +def digester_version_regex_validator(value: str) -> str: + """ + Validates that digester version looks like 0.0.1 + """ + digester_version_regex = re.compile('^\\d+\\.\\d+\\.\\d+$') + m = digester_version_regex.match(value) + if m: + return m.group(0) + else: + msg = f'Invalid digester version {value}. 
Expected {digester_version_regex.pattern}' + raise argparse.ArgumentTypeError(msg) diff --git a/scripts/metadata_comparison/metadata_comparison/lib/comparison_paths.py b/scripts/metadata_comparison/metadata_comparison/lib/comparison_paths.py new file mode 100644 index 00000000000..2b01f83e84c --- /dev/null +++ b/scripts/metadata_comparison/metadata_comparison/lib/comparison_paths.py @@ -0,0 +1,127 @@ +from google.cloud import storage +import google.auth +import logging +from metadata_comparison.lib import argument_regex + +from pathlib import Path, PosixPath +from typing import AnyStr, Union +from abc import ABC, abstractmethod + + +class ComparisonPath(ABC): + """ + Abstract Base Class for Local and GCS paths sharing an interface for the purpose of PAPI metadata comparison. + There's nothing particularly "Comparison" about these paths, I just couldn't think of a better name. + """ + @staticmethod + def create(path: Union[AnyStr, Path]): + if isinstance(path, PosixPath): + return LocalPath(path) + elif path.startswith('gs://'): + bucket, obj = argument_regex.gcs_path_regex_validator(path) + return GcsPath(bucket, obj) + return LocalPath(path) + + @staticmethod + def is_valid_path_string(path: AnyStr) -> bool: + # ick + return GcsPath.is_valid_path_string(path) or LocalPath.is_valid_path_string(path) + + @abstractmethod + def read_text(self, encoding: AnyStr = 'utf_8') -> AnyStr: pass + + # `/` operator, used to implement pathlib.Path style ` / ` syntax. 
+ @abstractmethod + def __truediv__(self, other): pass + + @abstractmethod + def exists(self) -> bool: pass + + @abstractmethod + def mkdir_p(self) -> None: pass + + @abstractmethod + def write_text(self, content: AnyStr, encoding: AnyStr = 'utf_8') -> None: pass + + @abstractmethod + def description(self) -> AnyStr: pass + + +class GcsPath(ComparisonPath): + def __init__(self, bucket: AnyStr, obj: AnyStr, storage_bucket: storage.Bucket = None): + self._bucket = bucket + self._object = obj + self._storage_blob = None + if storage_bucket is None: + credentials, project_id = google.auth.default() + logging.info(f'Creating storage client for bucket {bucket}') + client = storage.Client(credentials=credentials) + self._storage_bucket = client.bucket(bucket) + else: + self._storage_bucket = storage_bucket + + def __storage_blob(self) -> storage.Blob: + if self._storage_blob is None: + logging.info(f'Creating storage blob for {self}') + self._storage_blob = self._storage_bucket.blob(self._object) + return self._storage_blob + + def read_text(self, encoding: AnyStr = 'utf_8') -> AnyStr: + return self.__storage_blob().download_as_string() + + def __truediv__(self, other) -> ComparisonPath: + return GcsPath(bucket=self._bucket, + obj=f'{self._object}/{other}', + storage_bucket=self._storage_bucket) + + def exists(self) -> bool: + return self.__storage_blob().exists() + + def mkdir_p(self) -> None: + # Nothing to do here, "directory structure" is implicitly "mkdir -p"'d in GCS. 
+ pass + + def write_text(self, content: AnyStr, encoding: AnyStr = 'utf_8') -> None: + self.__storage_blob().upload_from_string(content) + + @staticmethod + def is_valid_path_string(path: AnyStr) -> bool: + if path.startswith('gs://'): + return argument_regex.gcs_path_regex_validator(path) + return False + + def __str__(self) -> AnyStr: + return f'gs://{self._bucket}/{self._object}' + + def description(self) -> AnyStr: + return 'GCS' + + +class LocalPath(ComparisonPath): + def __init__(self, local_spec: Union[AnyStr, Path]): + self.path = Path(local_spec) + + def read_text(self, encoding: AnyStr = 'utf_8') -> AnyStr: + return self.path.read_text(encoding) + + def __truediv__(self, other) -> ComparisonPath: + return LocalPath(self.path / other) + + def exists(self) -> bool: + return self.path.exists() + + def mkdir_p(self) -> None: + self.path.mkdir(parents=True, exist_ok=True) + + def write_text(self, content: AnyStr, encoding: AnyStr = 'utf_8') -> None: + self.path.write_text(content, encoding) + + @staticmethod + def is_valid_path_string(path: AnyStr) -> bool: + return True + + def __str__(self) -> AnyStr: + return str(self.path) + + def description(self) -> AnyStr: + return 'Local filesystem' diff --git a/scripts/metadata_comparison/metadata_comparison/lib/logging.py b/scripts/metadata_comparison/metadata_comparison/lib/logging.py new file mode 100644 index 00000000000..017329734fb --- /dev/null +++ b/scripts/metadata_comparison/metadata_comparison/lib/logging.py @@ -0,0 +1,16 @@ +#!/usr/bin/env python3 + +import logging +import warnings + + +def set_log_verbosity(verbose: bool) -> None: + level = logging.INFO if verbose else logging.WARNING + logging.basicConfig(format='[%(asctime)s] [%(name)s] %(message)s', level=level) + + +def quieten_chatty_imports() -> None: + logging.getLogger('googleapiclient.discovery_cache').setLevel(logging.ERROR) + logging.getLogger('googleapiclient.discovery').setLevel(logging.WARNING) + # Controversial and doesn't seem to work for 
the tests anyway, YMMV. + # warnings.filterwarnings("ignore", "Your application has authenticated using end user credentials") diff --git a/scripts/metadata_comparison/metadata_comparison/lib/operation_ids.py b/scripts/metadata_comparison/metadata_comparison/lib/operation_ids.py new file mode 100644 index 00000000000..43686f3fd1b --- /dev/null +++ b/scripts/metadata_comparison/metadata_comparison/lib/operation_ids.py @@ -0,0 +1,103 @@ +#!/usr/bin/env python3 + +import re +from typing import Any, AnyStr, Callable, Dict, List, Sequence, TypeVar, Union + +PAPI_V1_OPERATION_REGEX = re.compile('^operations/[^/]*') +PAPI_V2ALPHA1_OPERATION_REGEX = re.compile('^projects/[^/]*/operations/[0-9]*') +PAPI_V2BETA_OPERATION_REGEX = re.compile('^projects/[^/]*/locations/[^/]*/operations/[0-9]*') + +PAPI_V1_API_VERSION = 'v1alpha2' +PAPI_V2_ALPHA1_API_VERSION = 'v2alpha1' +PAPI_V2_BETA_API_VERSION = 'v2beta' + + +def get_operation_id_number(value: str) -> str: + """ + Validates then extracts from PAPI operation IDs just the final number. + eg: + papiv1: 'operations/EMj9o52aLhj78ZLxzunkiHcg0e2BmaAdKg9wcm9kdWN0aW9uUXVldWU -> EMj9o52aLhj78ZLxzunkiHcg0e2BmaAdKg9wcm9kdWN0aW9uUXVldWU' + papiv2alpha1: 'projects/project_name/operations/01234567891011121314' -> '01234567891011121314' + """ + return value.split('/')[-1] + + +def operation_id_to_api_version(value: str) -> str: + """ + Examines an operation ID and returns the PAPI API version which produced it + Luckily, this is currently a 1:1 format-to-api mapping so we don't need any other clues to tell the API version. + """ + if PAPI_V1_OPERATION_REGEX.match(value): + return PAPI_V1_API_VERSION + elif PAPI_V2ALPHA1_OPERATION_REGEX.match(value): + return PAPI_V2_ALPHA1_API_VERSION + elif PAPI_V2BETA_OPERATION_REGEX.match(value): + return PAPI_V2_BETA_API_VERSION + else: + raise Exception(f'Cannot deduce PAPI api version from unexpected operation ID format \'{value}\'') + + +# What a JSON Object works out to be. 
+JsonObject = Dict[str, Union[Union[None, AnyStr, float], Any]] +Accumulator = TypeVar('Accumulator') +OperationId = AnyStr +CallNameSequence = Sequence[AnyStr] + +OperationMappingCallFunction = Callable[[Accumulator, OperationId, CallNameSequence, JsonObject], None] + + +def visit_papi_operations(json_metadata: JsonObject, + call_fn: OperationMappingCallFunction, + initial_accumulator: Accumulator) -> Accumulator: + """ + Visits all PAPI operations represented in the Cromwell metadata of `json_metadata`. + There will be more operations than calls if any of the operations were preempted or were failed and retried. + For every PAPI operation, the function `call_fn` is invoked with the parameters: + + - Accumulator: an object of the type specified by the caller of this function per `initial_accumulator`. + - OperationId: The PAPI operation ID of the job as a string. + - CallNameSequence: The "breadcrumbs" leading to this call (e.g. [grandparent_wf, parent_wf, wf, call]) + - JsonObject: The JSON object representing the individual job being examined. + + The final Accumulator is returned as the result of this function. + """ + + accumulator = initial_accumulator + + def examine_calls(calls: JsonObject, path_so_far: List[AnyStr]) -> None: + for call_name in calls: + attempts = calls[call_name] + for attempt in attempts: + operation_id = attempt.get('jobId') + sub_workflow_metadata = attempt.get('subWorkflowMetadata') + path = build_call_path(call_name, path_so_far, attempt) + if operation_id: + call_fn(accumulator, operation_id, path, attempt) + if sub_workflow_metadata: + examine_calls(sub_workflow_metadata.get('calls', {}), path) + + def build_call_path(call_name: str, path_so_far: List[AnyStr], attempt: dict) -> List[AnyStr]: + call_path = path_so_far.copy() + + # Remove confusing duplication in subworkflow call names. + # A parent workflow would name a subworkflow call "parent_wf.sub_wf". + # The subworkflow would name its calls "sub_wf.sub_call". 
+ # If those call components were simply joined the result would be + # "parent_wf.sub_wf.sub_wf.sub_call". This logic removes the duplication of "sub_wf", + # resulting in "parent_wf.sub_wf.sub_call". + deduplicated_call_name = call_name + if len(path_so_far) > 0: + this_call_components = call_name.split('.') + if len(this_call_components) > 1 and path_so_far[-1].endswith('.' + this_call_components[0]): + deduplicated_call_name = '.'.join(this_call_components[1:]) + + call_path.append(deduplicated_call_name) + shard_index = attempt.get('shardIndex', -1) + if shard_index != -1: + call_path.append(f"shard_{shard_index:04d}") + + return call_path + + examine_calls(calls=json_metadata.get('calls', {}), path_so_far=[]) + + return accumulator diff --git a/scripts/metadata_comparison/metadata_comparison/lib/operations_digesters.py b/scripts/metadata_comparison/metadata_comparison/lib/operations_digesters.py new file mode 100644 index 00000000000..49d59f22f30 --- /dev/null +++ b/scripts/metadata_comparison/metadata_comparison/lib/operations_digesters.py @@ -0,0 +1,96 @@ + +from abc import ABC, abstractmethod +import dateutil.parser +from metadata_comparison.lib.operation_ids import JsonObject, operation_id_to_api_version, \ + PAPI_V1_API_VERSION, PAPI_V2_ALPHA1_API_VERSION, PAPI_V2_BETA_API_VERSION +from datetime import datetime +import re + +from typing import AnyStr, Iterator + + +class OperationDigester(ABC): + """ + Abstract Base Class for PAPI operation subclasses sharing an interface for the purpose of treating digesters + uniformly regardless of PAPI version. 
+ """ + + def __init__(self, operation_json: JsonObject): + self.__json = operation_json + + def __metadata(self) -> JsonObject: + return self.__json.get('metadata') + + def __events(self) -> JsonObject: + return self.__metadata()['events'] + + def start_time(self) -> AnyStr: + return self.__metadata().get('createTime') + + def end_time(self) -> AnyStr: + return self.__metadata().get('endTime') + + def total_time_seconds(self) -> float: + return (dateutil.parser.parse(self.end_time()) - dateutil.parser.parse(self.start_time())).total_seconds() + + @staticmethod + def create(operation_json: JsonObject): + operation_id = operation_json.get('name') + version = operation_id_to_api_version(operation_id) + if version == PAPI_V1_API_VERSION: + return PapiV1OperationDigester(operation_json) + elif version == PAPI_V2_ALPHA1_API_VERSION: + return PapiV2AlphaOperationDigester(operation_json) + elif version == PAPI_V2_BETA_API_VERSION: + return PapiV2BetaOperationDigester(operation_json) + else: + raise ValueError(f"Unrecognized format for PAPI operation ID {operation_id}") + + @abstractmethod + def docker_image_pull_seconds(self) -> float: pass + + def event_with_description(self, description: AnyStr) -> JsonObject: + def has_description(event: JsonObject) -> bool: + return event.get('description') == description + + for unique in filter(has_description, self.__metadata().get('events')): + return unique + + def event_with_description_like(self, description: AnyStr) -> Iterator[JsonObject]: + regex = re.compile(description) + + def has_description_like(event: JsonObject) -> bool: + return regex.match(event.get('description')) is not None + + return filter(has_description_like, self.__metadata().get('events')) + + +class PapiV1OperationDigester(OperationDigester): + def __init__(self, operation_json: JsonObject): + super(PapiV1OperationDigester, self).__init__(operation_json) + + def docker_image_pull_seconds(self) -> float: + descriptions = ['localizing-files', 
'pulling-image'] + end, start = [dateutil.parser.parse(self.event_with_description(d).get('startTime')) for d in descriptions] + return (end - start).total_seconds() + + +class PapiV2OperationDigester(OperationDigester, ABC): + def __init__(self, operation_json: JsonObject): + super(PapiV2OperationDigester, self).__init__(operation_json) + + def docker_image_pull_seconds(self) -> float: + description = "^(Started|Stopped) pulling .*" + pull_events = [dateutil.parser.parse(d.get('timestamp')) for d in self.event_with_description_like(description)] + pull_events.sort() + return (pull_events[-1] - pull_events[0]).total_seconds() + + +class PapiV2AlphaOperationDigester(PapiV2OperationDigester): + def __init__(self, operation_json: JsonObject): + super(PapiV2AlphaOperationDigester, self).__init__(operation_json) + + +class PapiV2BetaOperationDigester(PapiV2OperationDigester): + def __init__(self, operation_json: JsonObject): + super(PapiV2BetaOperationDigester, self).__init__(operation_json) diff --git a/scripts/metadata_comparison/metadata_comparison/lib/papi/papi_clients.py b/scripts/metadata_comparison/metadata_comparison/lib/papi/papi_clients.py new file mode 100644 index 00000000000..9e435750d70 --- /dev/null +++ b/scripts/metadata_comparison/metadata_comparison/lib/papi/papi_clients.py @@ -0,0 +1,71 @@ +#!/usr/bin/env python3 +# +# Initializer for various PAPI clients which only creates clients for specific APIs when we need them +# + +from googleapiclient.discovery import build as google_client_build, Resource +from google.auth.credentials import Credentials +from metadata_comparison.lib.operation_ids import operation_id_to_api_version +from typing import Mapping, Any +import logging + +logger = logging.getLogger('metadata_comparison.lib.papi.PapiClients') + + +class PapiClients: + clients = {} + + def __init__(self, credentials: Credentials) -> None: + self.credentials = credentials + + def __get_client(self, api_version: str) -> Resource: + """Gets the relevant 
client for accessing a PAPI API, or makes a new instance if necessary""" + if api_version not in self.clients: + self.clients[api_version] = self.__make_client(api_version) + return self.clients[api_version] + + def __make_client(self, api_version: str) -> Resource: + """Makes a new client for accessing a specified PAPI API""" + if api_version in ['v1alpha2', 'v2alpha1']: + return google_client_build('genomics', api_version, credentials=self.credentials) + elif api_version == 'v2beta': + return google_client_build('lifesciences', api_version, credentials=self.credentials) + else: + raise Exception(f'Unsupported client api_version: "{api_version}"') + + @staticmethod + def __read_papi_v1_operation_metadata(operation_id: str, genomics_v1_client: Resource) -> Mapping[str, Any]: + """Reads the operations metadata for a pipelines API v1 job ID. Returns a python dict""" + logger.info(f'Reading PAPI v1 operation metadata for {operation_id}...') + result = genomics_v1_client.operations().get(name=operation_id).execute() + return result + + @staticmethod + def __read_papi_v2alpha1_operation_metadata(operation_id: str, genomics_v2alpha1_client: Resource) -> Mapping[str, Any]: + """Reads the operations metadata for a pipelines API v2alpha1 job ID. Returns a python dict""" + logger.info(f'Reading PAPI v2alpha1 operation metadata for {operation_id}...') + result = genomics_v2alpha1_client.projects().operations().get(name=operation_id).execute() + return result + + @staticmethod + def __read_papi_v2beta_operation_metadata(operation_id: str, genomics_v2beta_client: Resource) -> Mapping[str, Any]: + """Reads the operations metadata for a pipelines API v2beta job ID. 
Returns a python dict""" + logger.info(f'Reading PAPI v2beta operation metadata for {operation_id}...') + result = genomics_v2beta_client.projects().locations().operations().get(name=operation_id).execute() + return result + + def request_operation_metadata(self, operation_id: str) -> Mapping[str, Any]: + """ + Reads the operations metadata for any supported pipelines API version. + Returns a python dict + """ + api_version = operation_id_to_api_version(operation_id) + client = self.__get_client(api_version) + if api_version == 'v1alpha2': + return self.__read_papi_v1_operation_metadata(operation_id, client) + elif api_version == 'v2alpha1': + return self.__read_papi_v2alpha1_operation_metadata(operation_id, client) + elif api_version == 'v2beta': + return self.__read_papi_v2beta_operation_metadata(operation_id, client) + else: + raise Exception(f'Unsupported client api_version in request_operation_metadata: "{api_version}"') diff --git a/scripts/metadata_comparison/metadata_comparison/lib/storage.py b/scripts/metadata_comparison/metadata_comparison/lib/storage.py new file mode 100644 index 00000000000..b52bba60b56 --- /dev/null +++ b/scripts/metadata_comparison/metadata_comparison/lib/storage.py @@ -0,0 +1,20 @@ +#!/usr/bin/env python3 + +from google.cloud import storage +import logging +from typing import AnyStr + + +def upload_blob(bucket_name: str, + source_file_contents: AnyStr, + destination_blob_name: str, + gcs_storage_client: storage.Client, + logger: logging.Logger) -> None: + """Uploads a file to the cloud""" + # bucket_name = "your-bucket-name" + # source_file_contents = "... some file contents..." 
+ # destination_blob_name = "storage/object/name" + bucket = gcs_storage_client.bucket(bucket_name) + logger.info(f'Uploading file content to gs://{bucket_name}/{destination_blob_name}...') + blob = bucket.blob(destination_blob_name) + blob.upload_from_string(source_file_contents) diff --git a/scripts/metadata_comparison/test/__init__.py b/scripts/metadata_comparison/test/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/scripts/metadata_comparison/test/lib/__init__.py b/scripts/metadata_comparison/test/lib/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/scripts/metadata_comparison/test/lib/helper_functions.py b/scripts/metadata_comparison/test/lib/helper_functions.py new file mode 100644 index 00000000000..b3a7e4d4334 --- /dev/null +++ b/scripts/metadata_comparison/test/lib/helper_functions.py @@ -0,0 +1,9 @@ +#!/usr/bin/env python3 + +from pathlib import Path + +""" +Can be used to read files from the resources directory, like: + (RESOURCES / filename).read_text() +""" +RESOURCES = Path(__file__).parent.parent / Path("resources") diff --git a/scripts/metadata_comparison/test/lib/storage.py b/scripts/metadata_comparison/test/lib/storage.py new file mode 100644 index 00000000000..91b0d7a81d6 --- /dev/null +++ b/scripts/metadata_comparison/test/lib/storage.py @@ -0,0 +1,9 @@ +#!/usr/bin/env python3 + +from pathlib import Path + +""" +Can be used to read files from the resources directory, like: + (RESOURCES / filename).read_text() +""" +RESOURCES = Path(__file__).parent.parent / "resources" diff --git a/scripts/metadata_comparison/test/lib/test_argument_regex.py b/scripts/metadata_comparison/test/lib/test_argument_regex.py new file mode 100755 index 00000000000..cd5b4bbfdd0 --- /dev/null +++ b/scripts/metadata_comparison/test/lib/test_argument_regex.py @@ -0,0 +1,39 @@ +#!/usr/bin/env python3 + +import unittest +from metadata_comparison.lib.argument_regex import * +from test.lib.helper_functions import * + 
+class ArgumentRegexTestMethods(unittest.TestCase): + + def test_cromwell_url_regex_valid(self): + cases = [ + ('http://localhost', 'http://localhost'), + ('http://localhost:8000', 'http://localhost:8000'), + ('http://localhost:8000/', 'http://localhost:8000'), + ('http://localhost:8000/api/workflows', 'http://localhost:8000'), + ('http://localhost:8000/prefix/to/api/workflows', 'http://localhost:8000/prefix/to') + ] + + for case in cases: + with self.subTest(case=case): + input = case[0] + expectation = case[1] + self.assertEqual(url_regex_validator(input), expectation) + + + def test_gcs_regex_valid(self): + cases = [ + ('gs://bucket/path/to/directory', ('bucket', 'path/to/directory')), + ('gs://bucket/path/to/directory', ('bucket', 'path/to/directory')) + ] + + for case in cases: + with self.subTest(case=case): + input = case[0] + expectation = case[1] + self.assertEqual(gcs_path_regex_validator(input), expectation) + + +if __name__ == '__main__': + unittest.main() diff --git a/scripts/metadata_comparison/test/lib/test_operation_ids.py b/scripts/metadata_comparison/test/lib/test_operation_ids.py new file mode 100755 index 00000000000..dc082140116 --- /dev/null +++ b/scripts/metadata_comparison/test/lib/test_operation_ids.py @@ -0,0 +1,104 @@ +#!/usr/bin/env python3 + +import unittest +import json +from metadata_comparison.lib.operation_ids import * +from metadata_comparison.extractor import find_operation_ids_in_metadata +from test.lib.helper_functions import RESOURCES + +class OperationIdTestMethods(unittest.TestCase): + + v1alpha2_ids = [ + 'operations/EMj9o52aLhj78ZLxzunkiHcg0e2BmaAdKg9wcm9kdWN0aW9uUXVldWU', + 'operations/EMLel52aLhjHt6LH_fmuqOUBINHtgZmgHSoPcHJvZHVjdGlvblF1ZXVl', + 'operations/EJGZrp2aLhix7YG3sMaj-usBINHtgZmgHSoPcHJvZHVjdGlvblF1ZXVl', + 'operations/EM79o52aLhisyJifgbTuzb4BINHtgZmgHSoPcHJvZHVjdGlvblF1ZXVl' + ] + + v2alpha1_ids = [ + 'projects/broad-dsde-cromwell-dev/operations/4960504346170163809', + 
'projects/broad-dsde-cromwell-dev/operations/1242302480525595574', + 'projects/broad-dsde-cromwell-dev/operations/11113854067012168443', + 'projects/broad-dsde-cromwell-dev/operations/14350975406210565808' + ] + + v2beta_ids = [ + 'projects/1005074806481/locations/us-central1/operations/14642412977689025366', + 'projects/1005074806481/locations/us-central1/operations/3128777609532869613', + 'projects/1005074806481/locations/us-central1/operations/18107476451113522273', + 'projects/1005074806481/locations/us-central1/operations/13032426715870634389' + ] + + def test_operation_id_number_valid(self): + v1alpha2_short_ids = [ + 'EMj9o52aLhj78ZLxzunkiHcg0e2BmaAdKg9wcm9kdWN0aW9uUXVldWU', + 'EMLel52aLhjHt6LH_fmuqOUBINHtgZmgHSoPcHJvZHVjdGlvblF1ZXVl', + 'EJGZrp2aLhix7YG3sMaj-usBINHtgZmgHSoPcHJvZHVjdGlvblF1ZXVl', + 'EM79o52aLhisyJifgbTuzb4BINHtgZmgHSoPcHJvZHVjdGlvblF1ZXVl' + ] + v1_cases = zip(self.v1alpha2_ids, v1alpha2_short_ids) + v2alpha1_short_ids = [ + '4960504346170163809', + '1242302480525595574', + '11113854067012168443', + '14350975406210565808' + ] + v2alpha1_cases = zip(self.v2alpha1_ids, v2alpha1_short_ids) + v2beta_short_ids = [ + '14642412977689025366', + '3128777609532869613', + '18107476451113522273', + '13032426715870634389' + ] + v2beta_cases = zip(self.v2beta_ids, v2beta_short_ids) + all_cases = list(v1_cases) + list(v2alpha1_cases) + list(v2beta_cases) + + for case in all_cases: + with self.subTest(case=case): + input = case[0] + expectation = case[1] + self.assertEqual(get_operation_id_number(input), expectation) + + def test_find_v1alpha2_operation_ids_in_metadata(self): + metadata = json.loads((RESOURCES / 'forkjoin_metadata_papi_v1alpha2.json').read_text()) + self.assertEqual(find_operation_ids_in_metadata(metadata), self.v1alpha2_ids) + + + def test_find_v2alpha1_operation_ids_in_metadata(self): + metadata = json.loads((RESOURCES / 'forkjoin_metadata_papi_v2alpha1.json').read_text()) + self.assertEqual(find_operation_ids_in_metadata(metadata), 
self.v2alpha1_ids) + + + def test_find_v2beta_operation_ids_in_metadata(self): + metadata = json.loads((RESOURCES / 'forkjoin_metadata_papi_v2beta.json').read_text()) + self.assertEqual(find_operation_ids_in_metadata(metadata), self.v2beta_ids) + + + def test_find_operation_ids_in_metadata_subworkflows(self): + metadata = json.loads((RESOURCES / 'subworkflow_hello_world_metadata.json').read_text()) + self.assertEqual(find_operation_ids_in_metadata(metadata), + ['projects/broad-dsde-cromwell-dev/operations/2244029211726316446']) + + + def test_operation_id_to_api_version(self): + for case in self.v1alpha2_ids: + with self.subTest(case=case): + self.assertEqual(operation_id_to_api_version(case), 'v1alpha2') + for case in self.v2alpha1_ids: + with self.subTest(case=case): + self.assertEqual(operation_id_to_api_version(case), 'v2alpha1') + for case in self.v2beta_ids: + with self.subTest(case=case): + self.assertEqual(operation_id_to_api_version(case), 'v2beta') + + + def test_invalid_operation_id_has_no_api_version(self): + case = "badstart/projects/broad-dsde-cromwell-dev/operations/4960504346170163809" + with self.assertRaises(Exception) as context: + operation_id_to_api_version(case) + self.assertEqual(str(context.exception), 'Cannot deduce PAPI api version from unexpected operation ID format \'badstart/projects/broad-dsde-cromwell-dev/operations/4960504346170163809\'') + + + +if __name__ == '__main__': + unittest.main() diff --git a/scripts/metadata_comparison/test/resources/comparer/performance_json_additional_key.json b/scripts/metadata_comparison/test/resources/comparer/performance_json_additional_key.json new file mode 100644 index 00000000000..397b38fdbb4 --- /dev/null +++ b/scripts/metadata_comparison/test/resources/comparer/performance_json_additional_key.json @@ -0,0 +1,25 @@ +{ + "version": "0.0.1", + "calls": { + "root_wf.sub_wf.shard_0.call_foo.shard_0": { + "attempt": 3, + "operationId": "123", + "cromwellTotalTimeSeconds": 11 + }, + 
"root_wf.sub_wf.shard_1.call_bar.shard_2": { + "attempt": 1, + "operationId": "456", + "cromwellTotalTimeSeconds": 99 + }, + "root_wf.baz": { + "attempt": 2, + "operationId": "789", + "cromwellTotalTimeSeconds": 93 + }, + "root_wf.excessive_key": { + "attempt": 2, + "operationId": "789", + "cromwellTotalTimeSeconds": 93 + } + } +} diff --git a/scripts/metadata_comparison/test/resources/comparer/performance_json_changed_key.json b/scripts/metadata_comparison/test/resources/comparer/performance_json_changed_key.json new file mode 100644 index 00000000000..a862e2bd180 --- /dev/null +++ b/scripts/metadata_comparison/test/resources/comparer/performance_json_changed_key.json @@ -0,0 +1,20 @@ +{ + "version": "0.0.1", + "calls": { + "root_wf.sub_wf.shard_0.call_foo.shard_0": { + "attempt": 3, + "operationId": "123", + "cromwellTotalTimeSeconds": 11 + }, + "root_wf.sub_wf.shard_1.call_bar.shard_99999999": { + "attempt": 1, + "operationId": "456", + "cromwellTotalTimeSeconds": 99 + }, + "root_wf.baz": { + "attempt": 2, + "operationId": "789", + "cromwellTotalTimeSeconds": 93 + } + } +} diff --git a/scripts/metadata_comparison/test/resources/comparer/performance_json_missing_key.json b/scripts/metadata_comparison/test/resources/comparer/performance_json_missing_key.json new file mode 100644 index 00000000000..6af210206dc --- /dev/null +++ b/scripts/metadata_comparison/test/resources/comparer/performance_json_missing_key.json @@ -0,0 +1,15 @@ +{ + "version": "0.0.1", + "calls": { + "root_wf.sub_wf.shard_0.call_foo.shard_0": { + "attempt": 3, + "operationId": "123", + "cromwellTotalTimeSeconds": 11 + }, + "root_wf.sub_wf.shard_1.call_bar.shard_2": { + "attempt": 1, + "operationId": "456", + "cromwellTotalTimeSeconds": 99 + } + } +} diff --git a/scripts/metadata_comparison/test/resources/comparer/performance_json_workflow_111.json b/scripts/metadata_comparison/test/resources/comparer/performance_json_workflow_111.json new file mode 100644 index 00000000000..5d05ee7fea8 --- 
/dev/null +++ b/scripts/metadata_comparison/test/resources/comparer/performance_json_workflow_111.json @@ -0,0 +1,20 @@ +{ + "version": "0.0.1", + "calls": { + "root_wf.sub_wf.shard_0.call_foo.shard_0": { + "attempt": 3, + "operationId": "123", + "cromwellTotalTimeSeconds": 9 + }, + "root_wf.sub_wf.shard_1.call_bar.shard_2": { + "attempt": 1, + "operationId": "456", + "cromwellTotalTimeSeconds": 42 + }, + "root_wf.baz": { + "attempt": 2, + "operationId": "789", + "cromwellTotalTimeSeconds": 99 + } + } +} diff --git a/scripts/metadata_comparison/test/resources/comparer/performance_json_workflow_222.json b/scripts/metadata_comparison/test/resources/comparer/performance_json_workflow_222.json new file mode 100644 index 00000000000..c5db8ee4508 --- /dev/null +++ b/scripts/metadata_comparison/test/resources/comparer/performance_json_workflow_222.json @@ -0,0 +1,20 @@ +{ + "version": "0.0.1", + "calls": { + "root_wf.sub_wf.shard_0.call_foo.shard_0": { + "attempt": 3, + "operationId": "123", + "cromwellTotalTimeSeconds": 11 + }, + "root_wf.sub_wf.shard_1.call_bar.shard_2": { + "attempt": 1, + "operationId": "456", + "cromwellTotalTimeSeconds": 99 + }, + "root_wf.baz": { + "attempt": 2, + "operationId": "789", + "cromwellTotalTimeSeconds": 93 + } + } +} diff --git a/scripts/metadata_comparison/test/resources/comparer/performance_json_workflow_222_differently_sorted.json b/scripts/metadata_comparison/test/resources/comparer/performance_json_workflow_222_differently_sorted.json new file mode 100644 index 00000000000..aca018b0864 --- /dev/null +++ b/scripts/metadata_comparison/test/resources/comparer/performance_json_workflow_222_differently_sorted.json @@ -0,0 +1,20 @@ +{ + "version": "0.0.1", + "calls": { + "root_wf.baz": { + "attempt": 2, + "operationId": "789", + "cromwellTotalTimeSeconds": 93 + }, + "root_wf.sub_wf.shard_0.call_foo.shard_0": { + "attempt": 3, + "operationId": "123", + "cromwellTotalTimeSeconds": 11 + }, + "root_wf.sub_wf.shard_1.call_bar.shard_2": { + 
"attempt": 1, + "operationId": "456", + "cromwellTotalTimeSeconds": 99 + } + } +} diff --git a/scripts/metadata_comparison/test/resources/comparer/valid_comparison_result.csv b/scripts/metadata_comparison/test/resources/comparer/valid_comparison_result.csv new file mode 100644 index 00000000000..d24cdb29dc9 --- /dev/null +++ b/scripts/metadata_comparison/test/resources/comparer/valid_comparison_result.csv @@ -0,0 +1,3 @@ +workflow id,digester format version,calls.root_wf.baz.cromwellTotalTimeSeconds,calls.root_wf.sub_wf.shard_0.call_foo.shard_0.cromwellTotalTimeSeconds,calls.root_wf.sub_wf.shard_1.call_bar.shard_2.cromwellTotalTimeSeconds +111,0.0.1,99,9,42 +222,0.0.1,93,11,99 \ No newline at end of file diff --git a/scripts/metadata_comparison/test/resources/forkjoin_metadata_papi_v1alpha2.json b/scripts/metadata_comparison/test/resources/forkjoin_metadata_papi_v1alpha2.json new file mode 100644 index 00000000000..9f1ee314d5b --- /dev/null +++ b/scripts/metadata_comparison/test/resources/forkjoin_metadata_papi_v1alpha2.json @@ -0,0 +1,696 @@ +{ + "workflowName": "forkjoin", + "workflowProcessingEvents": [ + { + "cromwellId": "cromid-65ffba4", + "description": "PickedUp", + "timestamp": "2020-04-22T20:47:53.884Z", + "cromwellVersion": "cromwell-version.conf-to-be-generated-by-sbt" + }, + { + "cromwellId": "cromid-65ffba4", + "description": "Finished", + "timestamp": "2020-04-22T20:57:15.733Z", + "cromwellVersion": "cromwell-version.conf-to-be-generated-by-sbt" + } + ], + "metadataSource": "Unarchived", + "actualWorkflowLanguageVersion": "draft-2", + "submittedFiles": { + "workflow": "##\n# Checks a simple branch and join operation.\n# We start with a task, branch into two parallel executions, and then rejoin to calculate the result.\n##\n\ntask mkFile {\n command {\n for i in `seq 1 1000`\n do\n echo $i\n done\n }\n output {\n File numbers = stdout()\n }\n runtime {docker: \"ubuntu:latest\"}\n}\n\ntask grep {\n String pattern\n File in_file\n command {\n grep 
'${pattern}' ${in_file} | wc -l\n }\n output {\n Int count = read_int(stdout())\n }\n runtime {docker: \"ubuntu:latest\"}\n}\n\ntask wc {\n File in_file\n command {\n cat ${in_file} | wc -l\n }\n output {\n Int count = read_int(stdout())\n }\n runtime {docker: \"ubuntu:latest\"}\n}\n\ntask join {\n Int grepCount\n Int wcCount\n command {\n expr ${wcCount} / ${grepCount}\n }\n output {\n Int proportion = read_int(stdout())\n }\n runtime {docker: \"ubuntu:latest\"}\n}\n\nworkflow forkjoin {\n call mkFile\n call grep { input: in_file = mkFile.numbers }\n call wc { input: in_file=mkFile.numbers }\n call join { input: wcCount = wc.count, grepCount = grep.count }\n output {\n join.proportion\n }\n}\n", + "root": "", + "options": "{\n \"default_runtime_attributes\": {\n \"docker\": \"ubuntu:latest\"\n },\n \"read_from_cache\": false\n}", + "inputs": "{\"forkjoin.grep.pattern\":\"10\"}", + "workflowUrl": "", + "labels": "{}" + }, + "calls": { + "forkjoin.wc": [ + { + "preemptible": false, + "executionStatus": "Done", + "stdout": "gs://cloud-cromwell-dev-self-cleaning/cromwell_execution/ci/forkjoin/6803d4ce-d912-41c6-be26-df1e483d6480/call-wc/wc-stdout.log", + "backendStatus": "Success", + "compressedDockerSize": 26726969, + "commandLine": "cat /cromwell_root/cloud-cromwell-dev-self-cleaning/cromwell_execution/ci/forkjoin/6803d4ce-d912-41c6-be26-df1e483d6480/call-mkFile/stdout | wc -l", + "shardIndex": -1, + "jes": { + "endpointUrl": "https://genomics.googleapis.com/", + "machineType": "us-central1-b/n1-standard-1", + "googleProject": "broad-dsde-cromwell-dev", + "executionBucket": "gs://cloud-cromwell-dev-self-cleaning/cromwell_execution/ci", + "zone": "us-central1-b", + "instanceName": "ggp-8579800723739687163" + }, + "outputs": { + "count": 1000 + }, + "runtimeAttributes": { + "preemptible": "0", + "failOnStderr": "false", + "bootDiskSizeGb": "10", + "disks": "local-disk 10 SSD", + "continueOnReturnCode": "0", + "docker": "ubuntu:latest", + "maxRetries": "0", + "cpu": 
"1", + "cpuMin": "1", + "noAddress": "false", + "zones": "us-central1-b", + "memoryMin": "2 GB", + "memory": "2 GB" + }, + "callCaching": { + "allowResultReuse": true, + "effectiveCallCachingMode": "WriteCache", + "hashes": { + "output count": "C4CA4238A0B923820DCC509A6F75849B", + "runtime attribute": { + "docker": "5E7DF90C2675647DEC5193F290917C18", + "continueOnReturnCode": "CFCD208495D565EF66E7DFF9F98764DA", + "failOnStderr": "68934A3E9455FA72420237EB05902327" + }, + "output expression": { + "Int count": "93114C80C6826C071C28393FDD0D5F73" + }, + "input count": "C4CA4238A0B923820DCC509A6F75849B", + "backend name": "36EF4A8AB268D1A1C74D8108C93D48ED", + "command template": "D357F0A427A86FF1FED64C347D4DFBAA", + "input": { + "File in_file": "4DC9uA==" + } + } + }, + "inputs": { + "in_file": "gs://cloud-cromwell-dev-self-cleaning/cromwell_execution/ci/forkjoin/6803d4ce-d912-41c6-be26-df1e483d6480/call-mkFile/stdout" + }, + "backendLabels": { + "wdl-task-name": "wc", + "cromwell-workflow-id": "cromwell-6803d4ce-d912-41c6-be26-df1e483d6480" + }, + "returnCode": 0, + "labels": { + "wdl-task-name": "wc", + "cromwell-workflow-id": "cromwell-6803d4ce-d912-41c6-be26-df1e483d6480" + }, + "jobId": "operations/EMj9o52aLhj78ZLxzunkiHcg0e2BmaAdKg9wcm9kdWN0aW9uUXVldWU", + "backend": "Papi", + "end": "2020-04-22T20:53:17.893Z", + "dockerImageUsed": "ubuntu@sha256:e5dd9dbb37df5b731a6688fa49f4003359f6f126958c9c928f937bec69836320", + "stderr": "gs://cloud-cromwell-dev-self-cleaning/cromwell_execution/ci/forkjoin/6803d4ce-d912-41c6-be26-df1e483d6480/call-wc/wc-stderr.log", + "callRoot": "gs://cloud-cromwell-dev-self-cleaning/cromwell_execution/ci/forkjoin/6803d4ce-d912-41c6-be26-df1e483d6480/call-wc", + "attempt": 1, + "executionEvents": [ + { + "startTime": "2020-04-22T20:51:39.000Z", + "description": "initializing VM", + "endTime": "2020-04-22T20:51:58.529Z" + }, + { + "startTime": "2020-04-22T20:51:10.114Z", + "description": "WaitingForValueStore", + "endTime": 
"2020-04-22T20:51:10.115Z" + }, + { + "startTime": "2020-04-22T20:53:16.898Z", + "description": "UpdatingJobStore", + "endTime": "2020-04-22T20:53:17.893Z" + }, + { + "startTime": "2020-04-22T20:52:01.470Z", + "description": "pulling-image", + "endTime": "2020-04-22T20:53:03.760Z" + }, + { + "startTime": "2020-04-22T20:51:10.115Z", + "description": "PreparingJob", + "endTime": "2020-04-22T20:51:10.122Z" + }, + { + "startTime": "2020-04-22T20:53:05.708Z", + "description": "running-docker", + "endTime": "2020-04-22T20:53:07.329Z" + }, + { + "startTime": "2020-04-22T20:53:11.514Z", + "description": "cromwell poll interval", + "endTime": "2020-04-22T20:53:15.625Z" + }, + { + "startTime": "2020-04-22T20:51:58.529Z", + "description": "start", + "endTime": "2020-04-22T20:52:01.470Z" + }, + { + "startTime": "2020-04-22T20:51:09.916Z", + "description": "Pending", + "endTime": "2020-04-22T20:51:09.917Z" + }, + { + "startTime": "2020-04-22T20:53:11.514Z", + "description": "ok", + "endTime": "2020-04-22T20:53:11.514Z" + }, + { + "startTime": "2020-04-22T20:53:07.329Z", + "description": "delocalizing-files", + "endTime": "2020-04-22T20:53:11.514Z" + }, + { + "startTime": "2020-04-22T20:51:10.122Z", + "description": "RunningJob", + "endTime": "2020-04-22T20:51:33.000Z" + }, + { + "startTime": "2020-04-22T20:53:15.625Z", + "description": "UpdatingCallCache", + "endTime": "2020-04-22T20:53:16.898Z" + }, + { + "startTime": "2020-04-22T20:51:33.000Z", + "description": "waiting for quota", + "endTime": "2020-04-22T20:51:39.000Z" + }, + { + "startTime": "2020-04-22T20:51:09.917Z", + "description": "RequestingExecutionToken", + "endTime": "2020-04-22T20:51:10.114Z" + }, + { + "startTime": "2020-04-22T20:53:03.760Z", + "description": "localizing-files", + "endTime": "2020-04-22T20:53:05.708Z" + } + ], + "backendLogs": { + "log": "gs://cloud-cromwell-dev-self-cleaning/cromwell_execution/ci/forkjoin/6803d4ce-d912-41c6-be26-df1e483d6480/call-wc/wc.log" + }, + "start": 
"2020-04-22T20:51:09.916Z" + } + ], + "forkjoin.mkFile": [ + { + "preemptible": false, + "executionStatus": "Done", + "stdout": "gs://cloud-cromwell-dev-self-cleaning/cromwell_execution/ci/forkjoin/6803d4ce-d912-41c6-be26-df1e483d6480/call-mkFile/mkFile-stdout.log", + "backendStatus": "Success", + "compressedDockerSize": 26726969, + "commandLine": "for i in `seq 1 1000`\ndo\n echo $i\ndone", + "shardIndex": -1, + "jes": { + "endpointUrl": "https://genomics.googleapis.com/", + "machineType": "us-central1-b/n1-standard-1", + "googleProject": "broad-dsde-cromwell-dev", + "executionBucket": "gs://cloud-cromwell-dev-self-cleaning/cromwell_execution/ci", + "zone": "us-central1-b", + "instanceName": "ggp-16523913534194097095" + }, + "outputs": { + "numbers": "gs://cloud-cromwell-dev-self-cleaning/cromwell_execution/ci/forkjoin/6803d4ce-d912-41c6-be26-df1e483d6480/call-mkFile/stdout" + }, + "runtimeAttributes": { + "preemptible": "0", + "failOnStderr": "false", + "bootDiskSizeGb": "10", + "disks": "local-disk 10 SSD", + "continueOnReturnCode": "0", + "docker": "ubuntu:latest", + "maxRetries": "0", + "cpu": "1", + "cpuMin": "1", + "noAddress": "false", + "zones": "us-central1-b", + "memoryMin": "2 GB", + "memory": "2 GB" + }, + "callCaching": { + "allowResultReuse": true, + "effectiveCallCachingMode": "WriteCache", + "hashes": { + "output count": "C4CA4238A0B923820DCC509A6F75849B", + "runtime attribute": { + "docker": "5E7DF90C2675647DEC5193F290917C18", + "continueOnReturnCode": "CFCD208495D565EF66E7DFF9F98764DA", + "failOnStderr": "68934A3E9455FA72420237EB05902327" + }, + "output expression": { + "File numbers": "174A537D0C36799273C5E3539866FDF4" + }, + "input count": "CFCD208495D565EF66E7DFF9F98764DA", + "backend name": "36EF4A8AB268D1A1C74D8108C93D48ED", + "command template": "22653C31522C78A881EEC11AA2F1D871" + } + }, + "inputs": {}, + "backendLabels": { + "wdl-task-name": "mkfile", + "cromwell-workflow-id": "cromwell-6803d4ce-d912-41c6-be26-df1e483d6480" + }, + 
"returnCode": 0, + "labels": { + "wdl-task-name": "mkFile", + "cromwell-workflow-id": "cromwell-6803d4ce-d912-41c6-be26-df1e483d6480" + }, + "jobId": "operations/EMLel52aLhjHt6LH_fmuqOUBINHtgZmgHSoPcHJvZHVjdGlvblF1ZXVl", + "backend": "Papi", + "end": "2020-04-22T20:51:08.889Z", + "dockerImageUsed": "ubuntu@sha256:e5dd9dbb37df5b731a6688fa49f4003359f6f126958c9c928f937bec69836320", + "stderr": "gs://cloud-cromwell-dev-self-cleaning/cromwell_execution/ci/forkjoin/6803d4ce-d912-41c6-be26-df1e483d6480/call-mkFile/mkFile-stderr.log", + "callRoot": "gs://cloud-cromwell-dev-self-cleaning/cromwell_execution/ci/forkjoin/6803d4ce-d912-41c6-be26-df1e483d6480/call-mkFile", + "attempt": 1, + "executionEvents": [ + { + "startTime": "2020-04-22T20:51:07.894Z", + "description": "UpdatingJobStore", + "endTime": "2020-04-22T20:51:08.889Z" + }, + { + "startTime": "2020-04-22T20:47:57.129Z", + "description": "Pending", + "endTime": "2020-04-22T20:47:57.129Z" + }, + { + "startTime": "2020-04-22T20:49:30.752Z", + "description": "pulling-image", + "endTime": "2020-04-22T20:50:05.612Z" + }, + { + "startTime": "2020-04-22T20:50:12.579Z", + "description": "ok", + "endTime": "2020-04-22T20:50:13.000Z" + }, + { + "startTime": "2020-04-22T20:47:57.129Z", + "description": "RequestingExecutionToken", + "endTime": "2020-04-22T20:47:58.099Z" + }, + { + "startTime": "2020-04-22T20:50:06.653Z", + "description": "running-docker", + "endTime": "2020-04-22T20:50:08.340Z" + }, + { + "startTime": "2020-04-22T20:47:58.100Z", + "description": "PreparingJob", + "endTime": "2020-04-22T20:47:58.110Z" + }, + { + "startTime": "2020-04-22T20:48:19.000Z", + "description": "initializing VM", + "endTime": "2020-04-22T20:49:27.848Z" + }, + { + "startTime": "2020-04-22T20:51:05.492Z", + "description": "UpdatingCallCache", + "endTime": "2020-04-22T20:51:07.894Z" + }, + { + "startTime": "2020-04-22T20:47:58.099Z", + "description": "WaitingForValueStore", + "endTime": "2020-04-22T20:47:58.100Z" + }, + { + "startTime": 
"2020-04-22T20:50:08.340Z", + "description": "delocalizing-files", + "endTime": "2020-04-22T20:50:12.579Z" + }, + { + "startTime": "2020-04-22T20:50:05.612Z", + "description": "localizing-files", + "endTime": "2020-04-22T20:50:06.653Z" + }, + { + "startTime": "2020-04-22T20:50:13.000Z", + "description": "cromwell poll interval", + "endTime": "2020-04-22T20:51:05.492Z" + }, + { + "startTime": "2020-04-22T20:49:27.848Z", + "description": "start", + "endTime": "2020-04-22T20:49:30.752Z" + }, + { + "startTime": "2020-04-22T20:47:58.110Z", + "description": "RunningJob", + "endTime": "2020-04-22T20:48:13.000Z" + }, + { + "startTime": "2020-04-22T20:48:13.000Z", + "description": "waiting for quota", + "endTime": "2020-04-22T20:48:19.000Z" + } + ], + "backendLogs": { + "log": "gs://cloud-cromwell-dev-self-cleaning/cromwell_execution/ci/forkjoin/6803d4ce-d912-41c6-be26-df1e483d6480/call-mkFile/mkFile.log" + }, + "start": "2020-04-22T20:47:57.129Z" + } + ], + "forkjoin.join": [ + { + "preemptible": false, + "executionStatus": "Done", + "stdout": "gs://cloud-cromwell-dev-self-cleaning/cromwell_execution/ci/forkjoin/6803d4ce-d912-41c6-be26-df1e483d6480/call-join/join-stdout.log", + "backendStatus": "Success", + "compressedDockerSize": 26726969, + "commandLine": "expr 1000 / 21", + "shardIndex": -1, + "jes": { + "endpointUrl": "https://genomics.googleapis.com/", + "machineType": "us-central1-b/n1-standard-1", + "googleProject": "broad-dsde-cromwell-dev", + "executionBucket": "gs://cloud-cromwell-dev-self-cleaning/cromwell_execution/ci", + "zone": "us-central1-b", + "instanceName": "ggp-17002370843040315057" + }, + "outputs": { + "proportion": 47 + }, + "runtimeAttributes": { + "preemptible": "0", + "failOnStderr": "false", + "bootDiskSizeGb": "10", + "disks": "local-disk 10 SSD", + "continueOnReturnCode": "0", + "docker": "ubuntu:latest", + "maxRetries": "0", + "cpu": "1", + "cpuMin": "1", + "noAddress": "false", + "zones": "us-central1-b", + "memoryMin": "2 GB", + "memory": "2 
GB" + }, + "callCaching": { + "allowResultReuse": true, + "effectiveCallCachingMode": "WriteCache", + "hashes": { + "output count": "C4CA4238A0B923820DCC509A6F75849B", + "runtime attribute": { + "docker": "5E7DF90C2675647DEC5193F290917C18", + "continueOnReturnCode": "CFCD208495D565EF66E7DFF9F98764DA", + "failOnStderr": "68934A3E9455FA72420237EB05902327" + }, + "output expression": { + "Int proportion": "93114C80C6826C071C28393FDD0D5F73" + }, + "input count": "C81E728D9D4C2F636F067F89CC14862C", + "backend name": "36EF4A8AB268D1A1C74D8108C93D48ED", + "command template": "CB9A57DD4ABEC16E2C1284E5D31249CD", + "input": { + "Int wcCount": "A9B7BA70783B617E9998DC4DD82EB3C5", + "Int grepCount": "3C59DC048E8850243BE8079A5C74D079" + } + } + }, + "inputs": { + "wcCount": 1000, + "grepCount": 21 + }, + "backendLabels": { + "wdl-task-name": "join", + "cromwell-workflow-id": "cromwell-6803d4ce-d912-41c6-be26-df1e483d6480" + }, + "returnCode": 0, + "labels": { + "wdl-task-name": "join", + "cromwell-workflow-id": "cromwell-6803d4ce-d912-41c6-be26-df1e483d6480" + }, + "jobId": "operations/EJGZrp2aLhix7YG3sMaj-usBINHtgZmgHSoPcHJvZHVjdGlvblF1ZXVl", + "backend": "Papi", + "end": "2020-04-22T20:57:11.901Z", + "dockerImageUsed": "ubuntu@sha256:e5dd9dbb37df5b731a6688fa49f4003359f6f126958c9c928f937bec69836320", + "stderr": "gs://cloud-cromwell-dev-self-cleaning/cromwell_execution/ci/forkjoin/6803d4ce-d912-41c6-be26-df1e483d6480/call-join/join-stderr.log", + "callRoot": "gs://cloud-cromwell-dev-self-cleaning/cromwell_execution/ci/forkjoin/6803d4ce-d912-41c6-be26-df1e483d6480/call-join", + "attempt": 1, + "executionEvents": [ + { + "startTime": "2020-04-22T20:57:10.805Z", + "description": "UpdatingCallCache", + "endTime": "2020-04-22T20:57:10.905Z" + }, + { + "startTime": "2020-04-22T20:53:56.121Z", + "description": "PreparingJob", + "endTime": "2020-04-22T20:53:56.125Z" + }, + { + "startTime": "2020-04-22T20:53:56.121Z", + "description": "WaitingForValueStore", + "endTime": 
"2020-04-22T20:53:56.121Z" + }, + { + "startTime": "2020-04-22T20:57:10.905Z", + "description": "UpdatingJobStore", + "endTime": "2020-04-22T20:57:11.901Z" + }, + { + "startTime": "2020-04-22T20:53:55.161Z", + "description": "RequestingExecutionToken", + "endTime": "2020-04-22T20:53:56.121Z" + }, + { + "startTime": "2020-04-22T20:54:21.000Z", + "description": "waiting for quota", + "endTime": "2020-04-22T20:54:26.000Z" + }, + { + "startTime": "2020-04-22T20:56:16.578Z", + "description": "cromwell poll interval", + "endTime": "2020-04-22T20:57:10.805Z" + }, + { + "startTime": "2020-04-22T20:56:09.720Z", + "description": "localizing-files", + "endTime": "2020-04-22T20:56:10.680Z" + }, + { + "startTime": "2020-04-22T20:56:10.680Z", + "description": "running-docker", + "endTime": "2020-04-22T20:56:12.392Z" + }, + { + "startTime": "2020-04-22T20:56:12.392Z", + "description": "delocalizing-files", + "endTime": "2020-04-22T20:56:16.578Z" + }, + { + "startTime": "2020-04-22T20:53:56.125Z", + "description": "RunningJob", + "endTime": "2020-04-22T20:54:21.000Z" + }, + { + "startTime": "2020-04-22T20:54:47.338Z", + "description": "start", + "endTime": "2020-04-22T20:54:50.227Z" + }, + { + "startTime": "2020-04-22T20:56:16.578Z", + "description": "ok", + "endTime": "2020-04-22T20:56:16.578Z" + }, + { + "startTime": "2020-04-22T20:54:26.000Z", + "description": "initializing VM", + "endTime": "2020-04-22T20:54:47.338Z" + }, + { + "startTime": "2020-04-22T20:54:50.227Z", + "description": "pulling-image", + "endTime": "2020-04-22T20:56:09.720Z" + }, + { + "startTime": "2020-04-22T20:53:55.160Z", + "description": "Pending", + "endTime": "2020-04-22T20:53:55.161Z" + } + ], + "backendLogs": { + "log": "gs://cloud-cromwell-dev-self-cleaning/cromwell_execution/ci/forkjoin/6803d4ce-d912-41c6-be26-df1e483d6480/call-join/join.log" + }, + "start": "2020-04-22T20:53:55.160Z" + } + ], + "forkjoin.grep": [ + { + "preemptible": false, + "executionStatus": "Done", + "stdout": 
"gs://cloud-cromwell-dev-self-cleaning/cromwell_execution/ci/forkjoin/6803d4ce-d912-41c6-be26-df1e483d6480/call-grep/grep-stdout.log", + "backendStatus": "Success", + "compressedDockerSize": 26726969, + "commandLine": "grep '10' /cromwell_root/cloud-cromwell-dev-self-cleaning/cromwell_execution/ci/forkjoin/6803d4ce-d912-41c6-be26-df1e483d6480/call-mkFile/stdout | wc -l", + "shardIndex": -1, + "jes": { + "endpointUrl": "https://genomics.googleapis.com/", + "machineType": "us-central1-b/n1-standard-1", + "googleProject": "broad-dsde-cromwell-dev", + "executionBucket": "gs://cloud-cromwell-dev-self-cleaning/cromwell_execution/ci", + "zone": "us-central1-b", + "instanceName": "ggp-13734775585776215084" + }, + "outputs": { + "count": 21 + }, + "runtimeAttributes": { + "preemptible": "0", + "failOnStderr": "false", + "bootDiskSizeGb": "10", + "disks": "local-disk 10 SSD", + "continueOnReturnCode": "0", + "docker": "ubuntu:latest", + "maxRetries": "0", + "cpu": "1", + "cpuMin": "1", + "noAddress": "false", + "zones": "us-central1-b", + "memoryMin": "2 GB", + "memory": "2 GB" + }, + "callCaching": { + "allowResultReuse": true, + "effectiveCallCachingMode": "WriteCache", + "hashes": { + "output count": "C4CA4238A0B923820DCC509A6F75849B", + "runtime attribute": { + "docker": "5E7DF90C2675647DEC5193F290917C18", + "continueOnReturnCode": "CFCD208495D565EF66E7DFF9F98764DA", + "failOnStderr": "68934A3E9455FA72420237EB05902327" + }, + "output expression": { + "Int count": "93114C80C6826C071C28393FDD0D5F73" + }, + "input count": "C81E728D9D4C2F636F067F89CC14862C", + "backend name": "36EF4A8AB268D1A1C74D8108C93D48ED", + "command template": "B473D09A649FA4B1E4646844E5DE04C6", + "input": { + "File in_file": "4DC9uA==", + "String pattern": "C76A2042F6D6B4BABE36D71F036740E2" + } + } + }, + "inputs": { + "in_file": "gs://cloud-cromwell-dev-self-cleaning/cromwell_execution/ci/forkjoin/6803d4ce-d912-41c6-be26-df1e483d6480/call-mkFile/stdout", + "pattern": "10" + }, + "backendLabels": { + 
"wdl-task-name": "grep", + "cromwell-workflow-id": "cromwell-6803d4ce-d912-41c6-be26-df1e483d6480" + }, + "returnCode": 0, + "labels": { + "wdl-task-name": "grep", + "cromwell-workflow-id": "cromwell-6803d4ce-d912-41c6-be26-df1e483d6480" + }, + "jobId": "operations/EM79o52aLhisyJifgbTuzb4BINHtgZmgHSoPcHJvZHVjdGlvblF1ZXVl", + "backend": "Papi", + "end": "2020-04-22T20:53:53.899Z", + "dockerImageUsed": "ubuntu@sha256:e5dd9dbb37df5b731a6688fa49f4003359f6f126958c9c928f937bec69836320", + "stderr": "gs://cloud-cromwell-dev-self-cleaning/cromwell_execution/ci/forkjoin/6803d4ce-d912-41c6-be26-df1e483d6480/call-grep/grep-stderr.log", + "callRoot": "gs://cloud-cromwell-dev-self-cleaning/cromwell_execution/ci/forkjoin/6803d4ce-d912-41c6-be26-df1e483d6480/call-grep", + "attempt": 1, + "executionEvents": [ + { + "startTime": "2020-04-22T20:53:05.386Z", + "description": "localizing-files", + "endTime": "2020-04-22T20:53:07.131Z" + }, + { + "startTime": "2020-04-22T20:52:01.380Z", + "description": "pulling-image", + "endTime": "2020-04-22T20:53:05.386Z" + }, + { + "startTime": "2020-04-22T20:51:09.916Z", + "description": "Pending", + "endTime": "2020-04-22T20:51:09.916Z" + }, + { + "startTime": "2020-04-22T20:53:50.682Z", + "description": "UpdatingCallCache", + "endTime": "2020-04-22T20:53:52.895Z" + }, + { + "startTime": "2020-04-22T20:51:39.000Z", + "description": "initializing VM", + "endTime": "2020-04-22T20:51:58.506Z" + }, + { + "startTime": "2020-04-22T20:53:13.000Z", + "description": "cromwell poll interval", + "endTime": "2020-04-22T20:53:50.682Z" + }, + { + "startTime": "2020-04-22T20:51:58.506Z", + "description": "start", + "endTime": "2020-04-22T20:52:01.380Z" + }, + { + "startTime": "2020-04-22T20:53:08.791Z", + "description": "delocalizing-files", + "endTime": "2020-04-22T20:53:12.878Z" + }, + { + "startTime": "2020-04-22T20:51:10.115Z", + "description": "PreparingJob", + "endTime": "2020-04-22T20:51:10.126Z" + }, + { + "startTime": "2020-04-22T20:51:09.916Z", + 
"description": "RequestingExecutionToken", + "endTime": "2020-04-22T20:51:10.114Z" + }, + { + "startTime": "2020-04-22T20:51:33.000Z", + "description": "waiting for quota", + "endTime": "2020-04-22T20:51:39.000Z" + }, + { + "startTime": "2020-04-22T20:53:12.878Z", + "description": "ok", + "endTime": "2020-04-22T20:53:13.000Z" + }, + { + "startTime": "2020-04-22T20:51:10.114Z", + "description": "WaitingForValueStore", + "endTime": "2020-04-22T20:51:10.115Z" + }, + { + "startTime": "2020-04-22T20:51:10.126Z", + "description": "RunningJob", + "endTime": "2020-04-22T20:51:33.000Z" + }, + { + "startTime": "2020-04-22T20:53:52.895Z", + "description": "UpdatingJobStore", + "endTime": "2020-04-22T20:53:53.899Z" + }, + { + "startTime": "2020-04-22T20:53:07.131Z", + "description": "running-docker", + "endTime": "2020-04-22T20:53:08.791Z" + } + ], + "backendLogs": { + "log": "gs://cloud-cromwell-dev-self-cleaning/cromwell_execution/ci/forkjoin/6803d4ce-d912-41c6-be26-df1e483d6480/call-grep/grep.log" + }, + "start": "2020-04-22T20:51:09.916Z" + } + ] + }, + "outputs": { + "forkjoin.join.proportion": 47 + }, + "workflowRoot": "gs://cloud-cromwell-dev-self-cleaning/cromwell_execution/ci/forkjoin/6803d4ce-d912-41c6-be26-df1e483d6480/", + "actualWorkflowLanguage": "WDL", + "id": "6803d4ce-d912-41c6-be26-df1e483d6480", + "inputs": { + "forkjoin.grep.pattern": "10" + }, + "labels": { + "cromwell-workflow-id": "cromwell-6803d4ce-d912-41c6-be26-df1e483d6480" + }, + "submission": "2020-04-22T20:47:53.006Z", + "status": "Succeeded", + "end": "2020-04-22T20:57:15.733Z", + "start": "2020-04-22T20:47:53.885Z" +} diff --git a/scripts/metadata_comparison/test/resources/forkjoin_metadata_papi_v2alpha1.json b/scripts/metadata_comparison/test/resources/forkjoin_metadata_papi_v2alpha1.json new file mode 100644 index 00000000000..c25e9a69af3 --- /dev/null +++ b/scripts/metadata_comparison/test/resources/forkjoin_metadata_papi_v2alpha1.json @@ -0,0 +1,765 @@ +{ + "actualWorkflowLanguage": "WDL", + 
"actualWorkflowLanguageVersion": "draft-2", + "calls": { + "forkjoin.grep": [ + { + "attempt": 1, + "backend": "Papi", + "backendLabels": { + "cromwell-workflow-id": "cromwell-515cfe12-1c50-4b93-ae77-b6675e87c0b8", + "wdl-task-name": "grep" + }, + "backendLogs": { + "log": "gs://cloud-cromwell-dev-self-cleaning/cromwell_execution/ci/forkjoin/515cfe12-1c50-4b93-ae77-b6675e87c0b8/call-grep/grep.log" + }, + "backendStatus": "Success", + "callCaching": { + "allowResultReuse": true, + "effectiveCallCachingMode": "ReadAndWriteCache", + "hashes": { + "backend name": "36EF4A8AB268D1A1C74D8108C93D48ED", + "command template": "B473D09A649FA4B1E4646844E5DE04C6", + "input": { + "File in_file": "4DC9uA==", + "String pattern": "C76A2042F6D6B4BABE36D71F036740E2" + }, + "input count": "C81E728D9D4C2F636F067F89CC14862C", + "output count": "C4CA4238A0B923820DCC509A6F75849B", + "output expression": { + "Int count": "93114C80C6826C071C28393FDD0D5F73" + }, + "runtime attribute": { + "continueOnReturnCode": "CFCD208495D565EF66E7DFF9F98764DA", + "docker": "5E7DF90C2675647DEC5193F290917C18", + "failOnStderr": "68934A3E9455FA72420237EB05902327" + } + }, + "hit": false, + "result": "Cache Miss" + }, + "callRoot": "gs://cloud-cromwell-dev-self-cleaning/cromwell_execution/ci/forkjoin/515cfe12-1c50-4b93-ae77-b6675e87c0b8/call-grep", + "commandLine": "grep '10' /cromwell_root/cloud-cromwell-dev-self-cleaning/cromwell_execution/ci/forkjoin/515cfe12-1c50-4b93-ae77-b6675e87c0b8/call-mkFile/stdout | wc -l", + "compressedDockerSize": 26726969, + "dockerImageUsed": "ubuntu@sha256:e5dd9dbb37df5b731a6688fa49f4003359f6f126958c9c928f937bec69836320", + "end": "2020-04-16T21:29:10.540Z", + "executionEvents": [ + { + "description": "ContainerSetup", + "endTime": "2020-04-16T21:27:55.730Z", + "startTime": "2020-04-16T21:27:49.750Z" + }, + { + "description": "Pulling \"gcr.io/google.com/cloudsdktool/cloud-sdk:276.0.0-slim\"", + "endTime": "2020-04-16T21:27:36.159Z", + "startTime": "2020-04-16T21:26:55.256Z" + 
}, + { + "description": "RequestingExecutionToken", + "endTime": "2020-04-16T21:25:51.771Z", + "startTime": "2020-04-16T21:25:51.222Z" + }, + { + "description": "Complete in GCE / Cromwell Poll Interval", + "endTime": "2020-04-16T21:29:07.269Z", + "startTime": "2020-04-16T21:28:26.637Z" + }, + { + "description": "CallCacheReading", + "endTime": "2020-04-16T21:25:51.794Z", + "startTime": "2020-04-16T21:25:51.785Z" + }, + { + "description": "Worker \"google-pipelines-worker-cd1d47dd20cdd038a21626b89cbcd9c8\" assigned in \"us-central1-b\"", + "endTime": "2020-04-16T21:26:55.256Z", + "startTime": "2020-04-16T21:26:22.042Z" + }, + { + "description": "Delocalization", + "endTime": "2020-04-16T21:28:26.637Z", + "startTime": "2020-04-16T21:28:13.030Z" + }, + { + "description": "waiting for quota", + "endTime": "2020-04-16T21:26:22.042Z", + "startTime": "2020-04-16T21:26:19.271Z" + }, + { + "description": "Worker released", + "endTime": "2020-04-16T21:28:26.637Z", + "startTime": "2020-04-16T21:28:26.637Z" + }, + { + "description": "PreparingJob", + "endTime": "2020-04-16T21:25:51.785Z", + "startTime": "2020-04-16T21:25:51.772Z" + }, + { + "description": "Background", + "endTime": "2020-04-16T21:27:49.377Z", + "startTime": "2020-04-16T21:27:48.761Z" + }, + { + "description": "WaitingForValueStore", + "endTime": "2020-04-16T21:25:51.772Z", + "startTime": "2020-04-16T21:25:51.771Z" + }, + { + "description": "UserAction", + "endTime": "2020-04-16T21:28:13.030Z", + "startTime": "2020-04-16T21:28:10.267Z" + }, + { + "description": "Pending", + "endTime": "2020-04-16T21:25:51.222Z", + "startTime": "2020-04-16T21:25:51.221Z" + }, + { + "description": "RunningJob", + "endTime": "2020-04-16T21:26:19.271Z", + "startTime": "2020-04-16T21:25:51.794Z" + }, + { + "description": "Pulling \"ubuntu@sha256:e5dd9dbb37df5b731a6688fa49f4003359f6f126958c9c928f937bec69836320\"", + "endTime": "2020-04-16T21:27:48.761Z", + "startTime": "2020-04-16T21:27:36.159Z" + }, + { + "description": 
"Localization", + "endTime": "2020-04-16T21:28:10.267Z", + "startTime": "2020-04-16T21:27:55.730Z" + }, + { + "description": "UpdatingJobStore", + "endTime": "2020-04-16T21:29:10.540Z", + "startTime": "2020-04-16T21:29:09.548Z" + }, + { + "description": "UpdatingCallCache", + "endTime": "2020-04-16T21:29:09.548Z", + "startTime": "2020-04-16T21:29:07.269Z" + } + ], + "executionStatus": "Done", + "inputs": { + "in_file": "gs://cloud-cromwell-dev-self-cleaning/cromwell_execution/ci/forkjoin/515cfe12-1c50-4b93-ae77-b6675e87c0b8/call-mkFile/stdout", + "pattern": "10" + }, + "jes": { + "endpointUrl": "https://genomics.googleapis.com/", + "executionBucket": "gs://cloud-cromwell-dev-self-cleaning/cromwell_execution/ci", + "googleProject": "broad-dsde-cromwell-dev", + "instanceName": "google-pipelines-worker-cd1d47dd20cdd038a21626b89cbcd9c8", + "machineType": "custom-1-2048", + "zone": "us-central1-b" + }, + "jobId": "projects/broad-dsde-cromwell-dev/operations/4960504346170163809", + "labels": { + "cromwell-workflow-id": "cromwell-515cfe12-1c50-4b93-ae77-b6675e87c0b8", + "wdl-task-name": "grep" + }, + "outputs": { + "count": 21 + }, + "preemptible": false, + "returnCode": 0, + "runtimeAttributes": { + "bootDiskSizeGb": "10", + "continueOnReturnCode": "0", + "cpu": "1", + "cpuMin": "1", + "disks": "local-disk 10 SSD", + "docker": "ubuntu:latest", + "failOnStderr": "false", + "maxRetries": "0", + "memory": "2 GB", + "memoryMin": "2 GB", + "noAddress": "false", + "preemptible": "0", + "zones": "us-central1-b" + }, + "shardIndex": -1, + "start": "2020-04-16T21:25:51.221Z", + "stderr": "gs://cloud-cromwell-dev-self-cleaning/cromwell_execution/ci/forkjoin/515cfe12-1c50-4b93-ae77-b6675e87c0b8/call-grep/stderr", + "stdout": "gs://cloud-cromwell-dev-self-cleaning/cromwell_execution/ci/forkjoin/515cfe12-1c50-4b93-ae77-b6675e87c0b8/call-grep/stdout" + } + ], + "forkjoin.join": [ + { + "attempt": 1, + "backend": "Papi", + "backendLabels": { + "cromwell-workflow-id": 
"cromwell-515cfe12-1c50-4b93-ae77-b6675e87c0b8", + "wdl-task-name": "join" + }, + "backendLogs": { + "log": "gs://cloud-cromwell-dev-self-cleaning/cromwell_execution/ci/forkjoin/515cfe12-1c50-4b93-ae77-b6675e87c0b8/call-join/join.log" + }, + "backendStatus": "Success", + "callCaching": { + "allowResultReuse": true, + "effectiveCallCachingMode": "ReadAndWriteCache", + "hashes": { + "backend name": "36EF4A8AB268D1A1C74D8108C93D48ED", + "command template": "CB9A57DD4ABEC16E2C1284E5D31249CD", + "input": { + "Int grepCount": "3C59DC048E8850243BE8079A5C74D079", + "Int wcCount": "A9B7BA70783B617E9998DC4DD82EB3C5" + }, + "input count": "C81E728D9D4C2F636F067F89CC14862C", + "output count": "C4CA4238A0B923820DCC509A6F75849B", + "output expression": { + "Int proportion": "93114C80C6826C071C28393FDD0D5F73" + }, + "runtime attribute": { + "continueOnReturnCode": "CFCD208495D565EF66E7DFF9F98764DA", + "docker": "5E7DF90C2675647DEC5193F290917C18", + "failOnStderr": "68934A3E9455FA72420237EB05902327" + } + }, + "hit": false, + "result": "Cache Miss" + }, + "callRoot": "gs://cloud-cromwell-dev-self-cleaning/cromwell_execution/ci/forkjoin/515cfe12-1c50-4b93-ae77-b6675e87c0b8/call-join", + "commandLine": "expr 1000 / 21", + "compressedDockerSize": 26726969, + "dockerImageUsed": "ubuntu@sha256:e5dd9dbb37df5b731a6688fa49f4003359f6f126958c9c928f937bec69836320", + "end": "2020-04-16T21:32:34.546Z", + "executionEvents": [ + { + "description": "ContainerSetup", + "endTime": "2020-04-16T21:31:42.498Z", + "startTime": "2020-04-16T21:31:36.730Z" + }, + { + "description": "Delocalization", + "endTime": "2020-04-16T21:32:12.308Z", + "startTime": "2020-04-16T21:31:57.324Z" + }, + { + "description": "PreparingJob", + "endTime": "2020-04-16T21:29:12.783Z", + "startTime": "2020-04-16T21:29:12.778Z" + }, + { + "description": "Pending", + "endTime": "2020-04-16T21:29:12.167Z", + "startTime": "2020-04-16T21:29:12.167Z" + }, + { + "description": "Pulling 
\"ubuntu@sha256:e5dd9dbb37df5b731a6688fa49f4003359f6f126958c9c928f937bec69836320\"", + "endTime": "2020-04-16T21:31:35.707Z", + "startTime": "2020-04-16T21:31:31.345Z" + }, + { + "description": "Worker \"google-pipelines-worker-2ee5e49dca6d92011ef10bfb13cdf5ac\" assigned in \"us-central1-b\"", + "endTime": "2020-04-16T21:30:14.570Z", + "startTime": "2020-04-16T21:29:42.831Z" + }, + { + "description": "CallCacheReading", + "endTime": "2020-04-16T21:29:12.785Z", + "startTime": "2020-04-16T21:29:12.783Z" + }, + { + "description": "Pulling \"gcr.io/google.com/cloudsdktool/cloud-sdk:276.0.0-slim\"", + "endTime": "2020-04-16T21:31:31.345Z", + "startTime": "2020-04-16T21:30:14.570Z" + }, + { + "description": "RequestingExecutionToken", + "endTime": "2020-04-16T21:29:12.777Z", + "startTime": "2020-04-16T21:29:12.167Z" + }, + { + "description": "RunningJob", + "endTime": "2020-04-16T21:29:39.917Z", + "startTime": "2020-04-16T21:29:12.785Z" + }, + { + "description": "Background", + "endTime": "2020-04-16T21:31:36.730Z", + "startTime": "2020-04-16T21:31:35.948Z" + }, + { + "description": "Localization", + "endTime": "2020-04-16T21:31:55.706Z", + "startTime": "2020-04-16T21:31:42.498Z" + }, + { + "description": "waiting for quota", + "endTime": "2020-04-16T21:29:42.831Z", + "startTime": "2020-04-16T21:29:39.917Z" + }, + { + "description": "UpdatingJobStore", + "endTime": "2020-04-16T21:32:34.546Z", + "startTime": "2020-04-16T21:32:33.554Z" + }, + { + "description": "UserAction", + "endTime": "2020-04-16T21:31:57.324Z", + "startTime": "2020-04-16T21:31:55.706Z" + }, + { + "description": "WaitingForValueStore", + "endTime": "2020-04-16T21:29:12.778Z", + "startTime": "2020-04-16T21:29:12.777Z" + }, + { + "description": "Complete in GCE / Cromwell Poll Interval", + "endTime": "2020-04-16T21:32:32.251Z", + "startTime": "2020-04-16T21:32:12.308Z" + }, + { + "description": "UpdatingCallCache", + "endTime": "2020-04-16T21:32:33.554Z", + "startTime": "2020-04-16T21:32:32.251Z" + }, + { 
+ "description": "Worker released", + "endTime": "2020-04-16T21:32:12.308Z", + "startTime": "2020-04-16T21:32:12.308Z" + } + ], + "executionStatus": "Done", + "inputs": { + "grepCount": 21, + "wcCount": 1000 + }, + "jes": { + "endpointUrl": "https://genomics.googleapis.com/", + "executionBucket": "gs://cloud-cromwell-dev-self-cleaning/cromwell_execution/ci", + "googleProject": "broad-dsde-cromwell-dev", + "instanceName": "google-pipelines-worker-2ee5e49dca6d92011ef10bfb13cdf5ac", + "machineType": "custom-1-2048", + "zone": "us-central1-b" + }, + "jobId": "projects/broad-dsde-cromwell-dev/operations/1242302480525595574", + "labels": { + "cromwell-workflow-id": "cromwell-515cfe12-1c50-4b93-ae77-b6675e87c0b8", + "wdl-task-name": "join" + }, + "outputs": { + "proportion": 47 + }, + "preemptible": false, + "returnCode": 0, + "runtimeAttributes": { + "bootDiskSizeGb": "10", + "continueOnReturnCode": "0", + "cpu": "1", + "cpuMin": "1", + "disks": "local-disk 10 SSD", + "docker": "ubuntu:latest", + "failOnStderr": "false", + "maxRetries": "0", + "memory": "2 GB", + "memoryMin": "2 GB", + "noAddress": "false", + "preemptible": "0", + "zones": "us-central1-b" + }, + "shardIndex": -1, + "start": "2020-04-16T21:29:12.167Z", + "stderr": "gs://cloud-cromwell-dev-self-cleaning/cromwell_execution/ci/forkjoin/515cfe12-1c50-4b93-ae77-b6675e87c0b8/call-join/stderr", + "stdout": "gs://cloud-cromwell-dev-self-cleaning/cromwell_execution/ci/forkjoin/515cfe12-1c50-4b93-ae77-b6675e87c0b8/call-join/stdout" + } + ], + "forkjoin.mkFile": [ + { + "attempt": 1, + "backend": "Papi", + "backendLabels": { + "cromwell-workflow-id": "cromwell-515cfe12-1c50-4b93-ae77-b6675e87c0b8", + "wdl-task-name": "mkfile" + }, + "backendLogs": { + "log": "gs://cloud-cromwell-dev-self-cleaning/cromwell_execution/ci/forkjoin/515cfe12-1c50-4b93-ae77-b6675e87c0b8/call-mkFile/mkFile.log" + }, + "backendStatus": "Success", + "callCaching": { + "allowResultReuse": true, + "effectiveCallCachingMode": 
"ReadAndWriteCache", + "hashes": { + "backend name": "36EF4A8AB268D1A1C74D8108C93D48ED", + "command template": "22653C31522C78A881EEC11AA2F1D871", + "input count": "CFCD208495D565EF66E7DFF9F98764DA", + "output count": "C4CA4238A0B923820DCC509A6F75849B", + "output expression": { + "File numbers": "174A537D0C36799273C5E3539866FDF4" + }, + "runtime attribute": { + "continueOnReturnCode": "CFCD208495D565EF66E7DFF9F98764DA", + "docker": "5E7DF90C2675647DEC5193F290917C18", + "failOnStderr": "68934A3E9455FA72420237EB05902327" + } + }, + "hit": false, + "result": "Cache Miss" + }, + "callRoot": "gs://cloud-cromwell-dev-self-cleaning/cromwell_execution/ci/forkjoin/515cfe12-1c50-4b93-ae77-b6675e87c0b8/call-mkFile", + "commandLine": "for i in `seq 1 1000`\ndo\n echo $i\ndone", + "compressedDockerSize": 26726969, + "dockerImageUsed": "ubuntu@sha256:e5dd9dbb37df5b731a6688fa49f4003359f6f126958c9c928f937bec69836320", + "end": "2020-04-16T21:25:49.557Z", + "executionEvents": [ + { + "description": "PreparingJob", + "endTime": "2020-04-16T21:22:57.368Z", + "startTime": "2020-04-16T21:22:56.802Z" + }, + { + "description": "UserAction", + "endTime": "2020-04-16T21:25:26.395Z", + "startTime": "2020-04-16T21:25:24.791Z" + }, + { + "description": "Worker released", + "endTime": "2020-04-16T21:25:41.253Z", + "startTime": "2020-04-16T21:25:41.253Z" + }, + { + "description": "WaitingForValueStore", + "endTime": "2020-04-16T21:22:56.802Z", + "startTime": "2020-04-16T21:22:56.783Z" + }, + { + "description": "waiting for quota", + "endTime": "2020-04-16T21:23:34.942Z", + "startTime": "2020-04-16T21:23:32.262Z" + }, + { + "description": "Localization", + "endTime": "2020-04-16T21:25:24.791Z", + "startTime": "2020-04-16T21:25:11.600Z" + }, + { + "description": "CallCacheReading", + "endTime": "2020-04-16T21:22:57.410Z", + "startTime": "2020-04-16T21:22:57.368Z" + }, + { + "description": "Complete in GCE / Cromwell Poll Interval", + "endTime": "2020-04-16T21:25:47.059Z", + "startTime": 
"2020-04-16T21:25:41.253Z" + }, + { + "description": "ContainerSetup", + "endTime": "2020-04-16T21:25:11.600Z", + "startTime": "2020-04-16T21:25:05.891Z" + }, + { + "description": "UpdatingJobStore", + "endTime": "2020-04-16T21:25:49.558Z", + "startTime": "2020-04-16T21:25:48.585Z" + }, + { + "description": "RequestingExecutionToken", + "endTime": "2020-04-16T21:22:56.783Z", + "startTime": "2020-04-16T21:22:55.824Z" + }, + { + "description": "Worker \"google-pipelines-worker-60f1e8162eafc88c980e5e6ffd1c2a19\" assigned in \"us-central1-b\"", + "endTime": "2020-04-16T21:24:11.706Z", + "startTime": "2020-04-16T21:23:34.942Z" + }, + { + "description": "Pulling \"gcr.io/google.com/cloudsdktool/cloud-sdk:276.0.0-slim\"", + "endTime": "2020-04-16T21:24:59.132Z", + "startTime": "2020-04-16T21:24:11.706Z" + }, + { + "description": "Background", + "endTime": "2020-04-16T21:25:05.891Z", + "startTime": "2020-04-16T21:25:04.887Z" + }, + { + "description": "Pending", + "endTime": "2020-04-16T21:22:55.824Z", + "startTime": "2020-04-16T21:22:55.799Z" + }, + { + "description": "Pulling \"ubuntu@sha256:e5dd9dbb37df5b731a6688fa49f4003359f6f126958c9c928f937bec69836320\"", + "endTime": "2020-04-16T21:25:04.887Z", + "startTime": "2020-04-16T21:24:59.132Z" + }, + { + "description": "RunningJob", + "endTime": "2020-04-16T21:23:32.262Z", + "startTime": "2020-04-16T21:22:57.410Z" + }, + { + "description": "UpdatingCallCache", + "endTime": "2020-04-16T21:25:48.585Z", + "startTime": "2020-04-16T21:25:47.059Z" + }, + { + "description": "Delocalization", + "endTime": "2020-04-16T21:25:41.253Z", + "startTime": "2020-04-16T21:25:26.395Z" + } + ], + "executionStatus": "Done", + "inputs": {}, + "jes": { + "endpointUrl": "https://genomics.googleapis.com/", + "executionBucket": "gs://cloud-cromwell-dev-self-cleaning/cromwell_execution/ci", + "googleProject": "broad-dsde-cromwell-dev", + "instanceName": "google-pipelines-worker-60f1e8162eafc88c980e5e6ffd1c2a19", + "machineType": "custom-1-2048", + 
"zone": "us-central1-b" + }, + "jobId": "projects/broad-dsde-cromwell-dev/operations/11113854067012168443", + "labels": { + "cromwell-workflow-id": "cromwell-515cfe12-1c50-4b93-ae77-b6675e87c0b8", + "wdl-task-name": "mkFile" + }, + "outputs": { + "numbers": "gs://cloud-cromwell-dev-self-cleaning/cromwell_execution/ci/forkjoin/515cfe12-1c50-4b93-ae77-b6675e87c0b8/call-mkFile/stdout" + }, + "preemptible": false, + "returnCode": 0, + "runtimeAttributes": { + "bootDiskSizeGb": "10", + "continueOnReturnCode": "0", + "cpu": "1", + "cpuMin": "1", + "disks": "local-disk 10 SSD", + "docker": "ubuntu:latest", + "failOnStderr": "false", + "maxRetries": "0", + "memory": "2 GB", + "memoryMin": "2 GB", + "noAddress": "false", + "preemptible": "0", + "zones": "us-central1-b" + }, + "shardIndex": -1, + "start": "2020-04-16T21:22:55.772Z", + "stderr": "gs://cloud-cromwell-dev-self-cleaning/cromwell_execution/ci/forkjoin/515cfe12-1c50-4b93-ae77-b6675e87c0b8/call-mkFile/stderr", + "stdout": "gs://cloud-cromwell-dev-self-cleaning/cromwell_execution/ci/forkjoin/515cfe12-1c50-4b93-ae77-b6675e87c0b8/call-mkFile/stdout" + } + ], + "forkjoin.wc": [ + { + "attempt": 1, + "backend": "Papi", + "backendLabels": { + "cromwell-workflow-id": "cromwell-515cfe12-1c50-4b93-ae77-b6675e87c0b8", + "wdl-task-name": "wc" + }, + "backendLogs": { + "log": "gs://cloud-cromwell-dev-self-cleaning/cromwell_execution/ci/forkjoin/515cfe12-1c50-4b93-ae77-b6675e87c0b8/call-wc/wc.log" + }, + "backendStatus": "Success", + "callCaching": { + "allowResultReuse": true, + "effectiveCallCachingMode": "ReadAndWriteCache", + "hashes": { + "backend name": "36EF4A8AB268D1A1C74D8108C93D48ED", + "command template": "D357F0A427A86FF1FED64C347D4DFBAA", + "input": { + "File in_file": "4DC9uA==" + }, + "input count": "C4CA4238A0B923820DCC509A6F75849B", + "output count": "C4CA4238A0B923820DCC509A6F75849B", + "output expression": { + "Int count": "93114C80C6826C071C28393FDD0D5F73" + }, + "runtime attribute": { + 
"continueOnReturnCode": "CFCD208495D565EF66E7DFF9F98764DA", + "docker": "5E7DF90C2675647DEC5193F290917C18", + "failOnStderr": "68934A3E9455FA72420237EB05902327" + } + }, + "hit": false, + "result": "Cache Miss" + }, + "callRoot": "gs://cloud-cromwell-dev-self-cleaning/cromwell_execution/ci/forkjoin/515cfe12-1c50-4b93-ae77-b6675e87c0b8/call-wc", + "commandLine": "cat /cromwell_root/cloud-cromwell-dev-self-cleaning/cromwell_execution/ci/forkjoin/515cfe12-1c50-4b93-ae77-b6675e87c0b8/call-mkFile/stdout | wc -l", + "compressedDockerSize": 26726969, + "dockerImageUsed": "ubuntu@sha256:e5dd9dbb37df5b731a6688fa49f4003359f6f126958c9c928f937bec69836320", + "end": "2020-04-16T21:29:10.540Z", + "executionEvents": [ + { + "description": "UpdatingJobStore", + "endTime": "2020-04-16T21:29:10.540Z", + "startTime": "2020-04-16T21:29:09.548Z" + }, + { + "description": "Pulling \"ubuntu@sha256:e5dd9dbb37df5b731a6688fa49f4003359f6f126958c9c928f937bec69836320\"", + "endTime": "2020-04-16T21:27:47.713Z", + "startTime": "2020-04-16T21:27:37.623Z" + }, + { + "description": "CallCacheReading", + "endTime": "2020-04-16T21:25:51.788Z", + "startTime": "2020-04-16T21:25:51.782Z" + }, + { + "description": "RunningJob", + "endTime": "2020-04-16T21:26:19.353Z", + "startTime": "2020-04-16T21:25:51.788Z" + }, + { + "description": "Delocalization", + "endTime": "2020-04-16T21:28:28.165Z", + "startTime": "2020-04-16T21:28:13.469Z" + }, + { + "description": "UserAction", + "endTime": "2020-04-16T21:28:13.469Z", + "startTime": "2020-04-16T21:28:10.152Z" + }, + { + "description": "Localization", + "endTime": "2020-04-16T21:28:10.152Z", + "startTime": "2020-04-16T21:27:55.402Z" + }, + { + "description": "RequestingExecutionToken", + "endTime": "2020-04-16T21:25:51.771Z", + "startTime": "2020-04-16T21:25:51.221Z" + }, + { + "description": "Worker released", + "endTime": "2020-04-16T21:28:28.165Z", + "startTime": "2020-04-16T21:28:28.165Z" + }, + { + "description": "waiting for quota", + "endTime": 
"2020-04-16T21:26:22.042Z", + "startTime": "2020-04-16T21:26:19.353Z" + }, + { + "description": "WaitingForValueStore", + "endTime": "2020-04-16T21:25:51.772Z", + "startTime": "2020-04-16T21:25:51.771Z" + }, + { + "description": "UpdatingCallCache", + "endTime": "2020-04-16T21:29:09.548Z", + "startTime": "2020-04-16T21:29:07.247Z" + }, + { + "description": "Complete in GCE / Cromwell Poll Interval", + "endTime": "2020-04-16T21:29:07.247Z", + "startTime": "2020-04-16T21:28:28.165Z" + }, + { + "description": "Background", + "endTime": "2020-04-16T21:27:48.412Z", + "startTime": "2020-04-16T21:27:47.713Z" + }, + { + "description": "Pulling \"gcr.io/google.com/cloudsdktool/cloud-sdk:276.0.0-slim\"", + "endTime": "2020-04-16T21:27:37.623Z", + "startTime": "2020-04-16T21:26:53.969Z" + }, + { + "description": "Worker \"google-pipelines-worker-a4137c15f137ee0020b601289d3f0782\" assigned in \"us-central1-b\"", + "endTime": "2020-04-16T21:26:53.969Z", + "startTime": "2020-04-16T21:26:22.042Z" + }, + { + "description": "ContainerSetup", + "endTime": "2020-04-16T21:27:55.402Z", + "startTime": "2020-04-16T21:27:48.866Z" + }, + { + "description": "PreparingJob", + "endTime": "2020-04-16T21:25:51.782Z", + "startTime": "2020-04-16T21:25:51.772Z" + }, + { + "description": "Pending", + "endTime": "2020-04-16T21:25:51.221Z", + "startTime": "2020-04-16T21:25:51.221Z" + } + ], + "executionStatus": "Done", + "inputs": { + "in_file": "gs://cloud-cromwell-dev-self-cleaning/cromwell_execution/ci/forkjoin/515cfe12-1c50-4b93-ae77-b6675e87c0b8/call-mkFile/stdout" + }, + "jes": { + "endpointUrl": "https://genomics.googleapis.com/", + "executionBucket": "gs://cloud-cromwell-dev-self-cleaning/cromwell_execution/ci", + "googleProject": "broad-dsde-cromwell-dev", + "instanceName": "google-pipelines-worker-a4137c15f137ee0020b601289d3f0782", + "machineType": "custom-1-2048", + "zone": "us-central1-b" + }, + "jobId": "projects/broad-dsde-cromwell-dev/operations/14350975406210565808", + "labels": { + 
"cromwell-workflow-id": "cromwell-515cfe12-1c50-4b93-ae77-b6675e87c0b8", + "wdl-task-name": "wc" + }, + "outputs": { + "count": 1000 + }, + "preemptible": false, + "returnCode": 0, + "runtimeAttributes": { + "bootDiskSizeGb": "10", + "continueOnReturnCode": "0", + "cpu": "1", + "cpuMin": "1", + "disks": "local-disk 10 SSD", + "docker": "ubuntu:latest", + "failOnStderr": "false", + "maxRetries": "0", + "memory": "2 GB", + "memoryMin": "2 GB", + "noAddress": "false", + "preemptible": "0", + "zones": "us-central1-b" + }, + "shardIndex": -1, + "start": "2020-04-16T21:25:51.221Z", + "stderr": "gs://cloud-cromwell-dev-self-cleaning/cromwell_execution/ci/forkjoin/515cfe12-1c50-4b93-ae77-b6675e87c0b8/call-wc/stderr", + "stdout": "gs://cloud-cromwell-dev-self-cleaning/cromwell_execution/ci/forkjoin/515cfe12-1c50-4b93-ae77-b6675e87c0b8/call-wc/stdout" + } + ] + }, + "end": "2020-04-16T21:32:36.239Z", + "id": "515cfe12-1c50-4b93-ae77-b6675e87c0b8", + "inputs": { + "forkjoin.grep.pattern": "10" + }, + "labels": { + "cromwell-workflow-id": "cromwell-515cfe12-1c50-4b93-ae77-b6675e87c0b8", + "newlyAddedLabel": "All mimsy were the borogoves, and the mome raths outgrabe" + }, + "metadataSource": "Archived", + "outputs": { + "forkjoin.join.proportion": 47 + }, + "start": "2020-04-16T21:22:53.375Z", + "status": "Succeeded", + "submission": "2020-04-16T21:22:52.699Z", + "submittedFiles": { + "inputs": "{\"forkjoin.grep.pattern\":\"10\"}", + "labels": "{}", + "options": "{\n\n}", + "root": "", + "workflow": "##\n# Checks a simple branch and join operation.\n# We start with a task, branch into two parallel executions, and then rejoin to calculate the result.\n##\n\ntask mkFile {\n command {\n for i in `seq 1 1000`\n do\n echo $i\n done\n }\n output {\n File numbers = stdout()\n }\n runtime {docker: \"ubuntu:latest\"}\n}\n\ntask grep {\n String pattern\n File in_file\n command {\n grep '${pattern}' ${in_file} | wc -l\n }\n output {\n Int count = read_int(stdout())\n }\n runtime {docker: 
\"ubuntu:latest\"}\n}\n\ntask wc {\n File in_file\n command {\n cat ${in_file} | wc -l\n }\n output {\n Int count = read_int(stdout())\n }\n runtime {docker: \"ubuntu:latest\"}\n}\n\ntask join {\n Int grepCount\n Int wcCount\n command {\n expr ${wcCount} / ${grepCount}\n }\n output {\n Int proportion = read_int(stdout())\n }\n runtime {docker: \"ubuntu:latest\"}\n}\n\nworkflow forkjoin {\n call mkFile\n call grep { input: in_file = mkFile.numbers }\n call wc { input: in_file=mkFile.numbers }\n call join { input: wcCount = wc.count, grepCount = grep.count }\n output {\n join.proportion\n }\n}\n", + "workflowUrl": "" + }, + "workflowName": "forkjoin", + "workflowProcessingEvents": [ + { + "cromwellId": "cromid-1444cfc", + "cromwellVersion": "cromwell-version.conf-to-be-generated-by-sbt", + "description": "Finished", + "timestamp": "2020-04-16T21:32:36.240Z" + }, + { + "cromwellId": "cromid-1444cfc", + "cromwellVersion": "cromwell-version.conf-to-be-generated-by-sbt", + "description": "PickedUp", + "timestamp": "2020-04-16T21:22:53.327Z" + } + ], + "workflowRoot": "gs://cloud-cromwell-dev-self-cleaning/cromwell_execution/ci/forkjoin/515cfe12-1c50-4b93-ae77-b6675e87c0b8/" +} diff --git a/scripts/metadata_comparison/test/resources/forkjoin_metadata_papi_v2beta.json b/scripts/metadata_comparison/test/resources/forkjoin_metadata_papi_v2beta.json new file mode 100644 index 00000000000..fb0fac33407 --- /dev/null +++ b/scripts/metadata_comparison/test/resources/forkjoin_metadata_papi_v2beta.json @@ -0,0 +1,736 @@ +{ + "workflowName": "forkjoin", + "workflowProcessingEvents": [ + { + "cromwellId": "cromid-fb0ef9d", + "description": "PickedUp", + "timestamp": "2020-04-22T20:59:09.782Z", + "cromwellVersion": "cromwell-version.conf-to-be-generated-by-sbt" + }, + { + "cromwellId": "cromid-fb0ef9d", + "description": "Finished", + "timestamp": "2020-04-22T21:09:24.732Z", + "cromwellVersion": "cromwell-version.conf-to-be-generated-by-sbt" + } + ], + "metadataSource": "Unarchived", + 
"actualWorkflowLanguageVersion": "draft-2", + "submittedFiles": { + "workflow": "##\n# Checks a simple branch and join operation.\n# We start with a task, branch into two parallel executions, and then rejoin to calculate the result.\n##\n\ntask mkFile {\n command {\n for i in `seq 1 1000`\n do\n echo $i\n done\n }\n output {\n File numbers = stdout()\n }\n runtime {docker: \"ubuntu:latest\"}\n}\n\ntask grep {\n String pattern\n File in_file\n command {\n grep '${pattern}' ${in_file} | wc -l\n }\n output {\n Int count = read_int(stdout())\n }\n runtime {docker: \"ubuntu:latest\"}\n}\n\ntask wc {\n File in_file\n command {\n cat ${in_file} | wc -l\n }\n output {\n Int count = read_int(stdout())\n }\n runtime {docker: \"ubuntu:latest\"}\n}\n\ntask join {\n Int grepCount\n Int wcCount\n command {\n expr ${wcCount} / ${grepCount}\n }\n output {\n Int proportion = read_int(stdout())\n }\n runtime {docker: \"ubuntu:latest\"}\n}\n\nworkflow forkjoin {\n call mkFile\n call grep { input: in_file = mkFile.numbers }\n call wc { input: in_file=mkFile.numbers }\n call join { input: wcCount = wc.count, grepCount = grep.count }\n output {\n join.proportion\n }\n}\n", + "root": "", + "options": "{\n \"default_runtime_attributes\": {\n \"docker\": \"ubuntu:latest\"\n },\n \"read_from_cache\": false\n}", + "inputs": "{\"forkjoin.grep.pattern\":\"10\"}", + "workflowUrl": "", + "labels": "{}" + }, + "calls": { + "forkjoin.wc": [ + { + "preemptible": false, + "executionStatus": "Done", + "stdout": "gs://cloud-cromwell-dev-self-cleaning/cromwell_execution/ci/forkjoin/b9264f9e-c5de-4285-8397-43639b7f65e1/call-wc/stdout", + "backendStatus": "Success", + "compressedDockerSize": 26726969, + "commandLine": "cat /cromwell_root/cloud-cromwell-dev-self-cleaning/cromwell_execution/ci/forkjoin/b9264f9e-c5de-4285-8397-43639b7f65e1/call-mkFile/stdout | wc -l", + "shardIndex": -1, + "jes": { + "endpointUrl": "https://lifesciences.googleapis.com/", + "machineType": "custom-1-2048", + "googleProject": 
"broad-dsde-cromwell-dev", + "executionBucket": "gs://cloud-cromwell-dev-self-cleaning/cromwell_execution/ci", + "zone": "us-central1-b", + "instanceName": "google-pipelines-worker-eabe41313f18516668171cf25dbfa1ac" + }, + "outputs": { + "count": 1000 + }, + "runtimeAttributes": { + "preemptible": "0", + "failOnStderr": "false", + "bootDiskSizeGb": "10", + "disks": "local-disk 10 SSD", + "continueOnReturnCode": "0", + "docker": "ubuntu:latest", + "maxRetries": "0", + "cpu": "1", + "cpuMin": "1", + "noAddress": "false", + "zones": "us-central1-b", + "memoryMin": "2 GB", + "memory": "2 GB" + }, + "callCaching": { + "allowResultReuse": true, + "effectiveCallCachingMode": "WriteCache", + "hashes": { + "output count": "C4CA4238A0B923820DCC509A6F75849B", + "runtime attribute": { + "docker": "5E7DF90C2675647DEC5193F290917C18", + "continueOnReturnCode": "CFCD208495D565EF66E7DFF9F98764DA", + "failOnStderr": "68934A3E9455FA72420237EB05902327" + }, + "output expression": { + "Int count": "93114C80C6826C071C28393FDD0D5F73" + }, + "input count": "C4CA4238A0B923820DCC509A6F75849B", + "backend name": "36EF4A8AB268D1A1C74D8108C93D48ED", + "command template": "D357F0A427A86FF1FED64C347D4DFBAA", + "input": { + "File in_file": "4DC9uA==" + } + } + }, + "inputs": { + "in_file": "gs://cloud-cromwell-dev-self-cleaning/cromwell_execution/ci/forkjoin/b9264f9e-c5de-4285-8397-43639b7f65e1/call-mkFile/stdout" + }, + "backendLabels": { + "wdl-task-name": "wc", + "cromwell-workflow-id": "cromwell-b9264f9e-c5de-4285-8397-43639b7f65e1" + }, + "returnCode": 0, + "labels": { + "wdl-task-name": "wc", + "cromwell-workflow-id": "cromwell-b9264f9e-c5de-4285-8397-43639b7f65e1" + }, + "jobId": "projects/1005074806481/locations/us-central1/operations/14642412977689025366", + "backend": "Papi", + "end": "2020-04-22T21:06:02.044Z", + "dockerImageUsed": "ubuntu@sha256:e5dd9dbb37df5b731a6688fa49f4003359f6f126958c9c928f937bec69836320", + "stderr": 
"gs://cloud-cromwell-dev-self-cleaning/cromwell_execution/ci/forkjoin/b9264f9e-c5de-4285-8397-43639b7f65e1/call-wc/stderr", + "callRoot": "gs://cloud-cromwell-dev-self-cleaning/cromwell_execution/ci/forkjoin/b9264f9e-c5de-4285-8397-43639b7f65e1/call-wc", + "attempt": 1, + "executionEvents": [ + { + "description": "Localization", + "startTime": "2020-04-22T21:04:41.945Z", + "endTime": "2020-04-22T21:04:56.276Z" + }, + { + "startTime": "2020-04-22T21:02:09.214Z", + "description": "RequestingExecutionToken", + "endTime": "2020-04-22T21:02:09.264Z" + }, + { + "startTime": "2020-04-22T21:02:09.279Z", + "description": "RunningJob", + "endTime": "2020-04-22T21:02:38.403Z" + }, + { + "description": "Delocalization", + "startTime": "2020-04-22T21:04:58.992Z", + "endTime": "2020-04-22T21:05:13.380Z" + }, + { + "startTime": "2020-04-22T21:02:09.264Z", + "description": "WaitingForValueStore", + "endTime": "2020-04-22T21:02:09.265Z" + }, + { + "startTime": "2020-04-22T21:02:41.092Z", + "description": "Worker \"google-pipelines-worker-eabe41313f18516668171cf25dbfa1ac\" assigned in \"us-central1-b\"", + "endTime": "2020-04-22T21:03:13.871Z" + }, + { + "startTime": "2020-04-22T21:06:01.060Z", + "description": "UpdatingJobStore", + "endTime": "2020-04-22T21:06:02.044Z" + }, + { + "startTime": "2020-04-22T21:05:13.380Z", + "description": "Complete in GCE / Cromwell Poll Interval", + "endTime": "2020-04-22T21:06:00.741Z" + }, + { + "description": "Pulling \"ubuntu@sha256:e5dd9dbb37df5b731a6688fa49f4003359f6f126958c9c928f937bec69836320\"", + "startTime": "2020-04-22T21:04:30.894Z", + "endTime": "2020-04-22T21:04:35.185Z" + }, + { + "description": "Background", + "startTime": "2020-04-22T21:04:35.797Z", + "endTime": "2020-04-22T21:04:36.161Z" + }, + { + "description": "UserAction", + "startTime": "2020-04-22T21:04:56.276Z", + "endTime": "2020-04-22T21:04:58.992Z" + }, + { + "description": "Pulling \"gcr.io/google.com/cloudsdktool/cloud-sdk:276.0.0-slim\"", + "startTime": 
"2020-04-22T21:03:13.871Z", + "endTime": "2020-04-22T21:04:30.894Z" + }, + { + "startTime": "2020-04-22T21:02:09.265Z", + "description": "PreparingJob", + "endTime": "2020-04-22T21:02:09.279Z" + }, + { + "startTime": "2020-04-22T21:05:13.380Z", + "description": "Worker released", + "endTime": "2020-04-22T21:05:13.380Z" + }, + { + "startTime": "2020-04-22T21:02:38.403Z", + "description": "waiting for quota", + "endTime": "2020-04-22T21:02:41.092Z" + }, + { + "description": "ContainerSetup", + "startTime": "2020-04-22T21:04:36.161Z", + "endTime": "2020-04-22T21:04:41.945Z" + }, + { + "startTime": "2020-04-22T21:02:09.214Z", + "description": "Pending", + "endTime": "2020-04-22T21:02:09.214Z" + }, + { + "startTime": "2020-04-22T21:06:00.741Z", + "description": "UpdatingCallCache", + "endTime": "2020-04-22T21:06:01.060Z" + } + ], + "backendLogs": { + "log": "gs://cloud-cromwell-dev-self-cleaning/cromwell_execution/ci/forkjoin/b9264f9e-c5de-4285-8397-43639b7f65e1/call-wc/wc.log" + }, + "start": "2020-04-22T21:02:09.214Z" + } + ], + "forkjoin.mkFile": [ + { + "preemptible": false, + "executionStatus": "Done", + "stdout": "gs://cloud-cromwell-dev-self-cleaning/cromwell_execution/ci/forkjoin/b9264f9e-c5de-4285-8397-43639b7f65e1/call-mkFile/stdout", + "backendStatus": "Success", + "compressedDockerSize": 26726969, + "commandLine": "for i in `seq 1 1000`\ndo\n echo $i\ndone", + "shardIndex": -1, + "jes": { + "endpointUrl": "https://lifesciences.googleapis.com/", + "machineType": "custom-1-2048", + "googleProject": "broad-dsde-cromwell-dev", + "executionBucket": "gs://cloud-cromwell-dev-self-cleaning/cromwell_execution/ci", + "zone": "us-central1-b", + "instanceName": "google-pipelines-worker-5e88bf1c818462a12174b606d0148eb6" + }, + "outputs": { + "numbers": "gs://cloud-cromwell-dev-self-cleaning/cromwell_execution/ci/forkjoin/b9264f9e-c5de-4285-8397-43639b7f65e1/call-mkFile/stdout" + }, + "runtimeAttributes": { + "preemptible": "0", + "failOnStderr": "false", + 
"bootDiskSizeGb": "10", + "disks": "local-disk 10 SSD", + "continueOnReturnCode": "0", + "docker": "ubuntu:latest", + "maxRetries": "0", + "cpu": "1", + "cpuMin": "1", + "noAddress": "false", + "zones": "us-central1-b", + "memoryMin": "2 GB", + "memory": "2 GB" + }, + "callCaching": { + "allowResultReuse": true, + "effectiveCallCachingMode": "WriteCache", + "hashes": { + "output count": "C4CA4238A0B923820DCC509A6F75849B", + "runtime attribute": { + "docker": "5E7DF90C2675647DEC5193F290917C18", + "continueOnReturnCode": "CFCD208495D565EF66E7DFF9F98764DA", + "failOnStderr": "68934A3E9455FA72420237EB05902327" + }, + "output expression": { + "File numbers": "174A537D0C36799273C5E3539866FDF4" + }, + "input count": "CFCD208495D565EF66E7DFF9F98764DA", + "backend name": "36EF4A8AB268D1A1C74D8108C93D48ED", + "command template": "22653C31522C78A881EEC11AA2F1D871" + } + }, + "inputs": {}, + "backendLabels": { + "wdl-task-name": "mkfile", + "cromwell-workflow-id": "cromwell-b9264f9e-c5de-4285-8397-43639b7f65e1" + }, + "returnCode": 0, + "labels": { + "wdl-task-name": "mkFile", + "cromwell-workflow-id": "cromwell-b9264f9e-c5de-4285-8397-43639b7f65e1" + }, + "jobId": "projects/1005074806481/locations/us-central1/operations/3128777609532869613", + "backend": "Papi", + "end": "2020-04-22T21:02:08.051Z", + "dockerImageUsed": "ubuntu@sha256:e5dd9dbb37df5b731a6688fa49f4003359f6f126958c9c928f937bec69836320", + "stderr": "gs://cloud-cromwell-dev-self-cleaning/cromwell_execution/ci/forkjoin/b9264f9e-c5de-4285-8397-43639b7f65e1/call-mkFile/stderr", + "callRoot": "gs://cloud-cromwell-dev-self-cleaning/cromwell_execution/ci/forkjoin/b9264f9e-c5de-4285-8397-43639b7f65e1/call-mkFile", + "attempt": 1, + "executionEvents": [ + { + "description": "Pulling \"gcr.io/google.com/cloudsdktool/cloud-sdk:276.0.0-slim\"", + "startTime": "2020-04-22T20:59:52.928Z", + "endTime": "2020-04-22T21:00:25.831Z" + }, + { + "startTime": "2020-04-22T21:02:05.556Z", + "description": "UpdatingCallCache", + 
"endTime": "2020-04-22T21:02:07.081Z" + }, + { + "description": "Delocalization", + "startTime": "2020-04-22T21:00:59.637Z", + "endTime": "2020-04-22T21:01:14.580Z" + }, + { + "startTime": "2020-04-22T21:01:14.580Z", + "description": "Complete in GCE / Cromwell Poll Interval", + "endTime": "2020-04-22T21:02:05.556Z" + }, + { + "description": "Pulling \"ubuntu@sha256:e5dd9dbb37df5b731a6688fa49f4003359f6f126958c9c928f937bec69836320\"", + "startTime": "2020-04-22T21:00:25.831Z", + "endTime": "2020-04-22T21:00:37.671Z" + }, + { + "startTime": "2020-04-22T20:59:21.637Z", + "description": "Worker \"google-pipelines-worker-5e88bf1c818462a12174b606d0148eb6\" assigned in \"us-central1-b\"", + "endTime": "2020-04-22T20:59:52.928Z" + }, + { + "startTime": "2020-04-22T20:59:17.401Z", + "description": "waiting for quota", + "endTime": "2020-04-22T20:59:21.637Z" + }, + { + "startTime": "2020-04-22T20:59:12.761Z", + "description": "Pending", + "endTime": "2020-04-22T20:59:12.780Z" + }, + { + "description": "Background", + "startTime": "2020-04-22T21:00:37.671Z", + "endTime": "2020-04-22T21:00:38.663Z" + }, + { + "startTime": "2020-04-22T21:02:07.081Z", + "description": "UpdatingJobStore", + "endTime": "2020-04-22T21:02:08.051Z" + }, + { + "description": "Localization", + "startTime": "2020-04-22T21:00:44.821Z", + "endTime": "2020-04-22T21:00:57.785Z" + }, + { + "startTime": "2020-04-22T20:59:13.277Z", + "description": "WaitingForValueStore", + "endTime": "2020-04-22T20:59:13.290Z" + }, + { + "startTime": "2020-04-22T20:59:13.911Z", + "description": "RunningJob", + "endTime": "2020-04-22T20:59:17.401Z" + }, + { + "startTime": "2020-04-22T20:59:12.780Z", + "description": "RequestingExecutionToken", + "endTime": "2020-04-22T20:59:13.277Z" + }, + { + "startTime": "2020-04-22T20:59:13.290Z", + "description": "PreparingJob", + "endTime": "2020-04-22T20:59:13.911Z" + }, + { + "startTime": "2020-04-22T21:01:14.580Z", + "description": "Worker released", + "endTime": 
"2020-04-22T21:01:14.580Z" + }, + { + "description": "ContainerSetup", + "startTime": "2020-04-22T21:00:38.663Z", + "endTime": "2020-04-22T21:00:44.821Z" + }, + { + "description": "UserAction", + "startTime": "2020-04-22T21:00:57.785Z", + "endTime": "2020-04-22T21:00:59.637Z" + } + ], + "backendLogs": { + "log": "gs://cloud-cromwell-dev-self-cleaning/cromwell_execution/ci/forkjoin/b9264f9e-c5de-4285-8397-43639b7f65e1/call-mkFile/mkFile.log" + }, + "start": "2020-04-22T20:59:12.746Z" + } + ], + "forkjoin.join": [ + { + "preemptible": false, + "executionStatus": "Done", + "stdout": "gs://cloud-cromwell-dev-self-cleaning/cromwell_execution/ci/forkjoin/b9264f9e-c5de-4285-8397-43639b7f65e1/call-join/stdout", + "backendStatus": "Success", + "compressedDockerSize": 26726969, + "commandLine": "expr 1000 / 21", + "shardIndex": -1, + "jes": { + "endpointUrl": "https://lifesciences.googleapis.com/", + "machineType": "custom-1-2048", + "googleProject": "broad-dsde-cromwell-dev", + "executionBucket": "gs://cloud-cromwell-dev-self-cleaning/cromwell_execution/ci", + "zone": "us-central1-b", + "instanceName": "google-pipelines-worker-5aabb0ed0606d74e9815f84ee50615ff" + }, + "outputs": { + "proportion": 47 + }, + "runtimeAttributes": { + "preemptible": "0", + "failOnStderr": "false", + "bootDiskSizeGb": "10", + "disks": "local-disk 10 SSD", + "continueOnReturnCode": "0", + "docker": "ubuntu:latest", + "maxRetries": "0", + "cpu": "1", + "cpuMin": "1", + "noAddress": "false", + "zones": "us-central1-b", + "memoryMin": "2 GB", + "memory": "2 GB" + }, + "callCaching": { + "allowResultReuse": true, + "effectiveCallCachingMode": "WriteCache", + "hashes": { + "output count": "C4CA4238A0B923820DCC509A6F75849B", + "runtime attribute": { + "docker": "5E7DF90C2675647DEC5193F290917C18", + "continueOnReturnCode": "CFCD208495D565EF66E7DFF9F98764DA", + "failOnStderr": "68934A3E9455FA72420237EB05902327" + }, + "output expression": { + "Int proportion": "93114C80C6826C071C28393FDD0D5F73" + }, + 
"input count": "C81E728D9D4C2F636F067F89CC14862C", + "backend name": "36EF4A8AB268D1A1C74D8108C93D48ED", + "command template": "CB9A57DD4ABEC16E2C1284E5D31249CD", + "input": { + "Int wcCount": "A9B7BA70783B617E9998DC4DD82EB3C5", + "Int grepCount": "3C59DC048E8850243BE8079A5C74D079" + } + } + }, + "inputs": { + "wcCount": 1000, + "grepCount": 21 + }, + "backendLabels": { + "wdl-task-name": "join", + "cromwell-workflow-id": "cromwell-b9264f9e-c5de-4285-8397-43639b7f65e1" + }, + "returnCode": 0, + "labels": { + "wdl-task-name": "join", + "cromwell-workflow-id": "cromwell-b9264f9e-c5de-4285-8397-43639b7f65e1" + }, + "jobId": "projects/1005074806481/locations/us-central1/operations/18107476451113522273", + "backend": "Papi", + "end": "2020-04-22T21:09:23.014Z", + "dockerImageUsed": "ubuntu@sha256:e5dd9dbb37df5b731a6688fa49f4003359f6f126958c9c928f937bec69836320", + "stderr": "gs://cloud-cromwell-dev-self-cleaning/cromwell_execution/ci/forkjoin/b9264f9e-c5de-4285-8397-43639b7f65e1/call-join/stderr", + "callRoot": "gs://cloud-cromwell-dev-self-cleaning/cromwell_execution/ci/forkjoin/b9264f9e-c5de-4285-8397-43639b7f65e1/call-join", + "attempt": 1, + "executionEvents": [ + { + "description": "Delocalization", + "startTime": "2020-04-22T21:08:31.659Z", + "endTime": "2020-04-22T21:08:46.137Z" + }, + { + "startTime": "2020-04-22T21:09:22.017Z", + "description": "UpdatingJobStore", + "endTime": "2020-04-22T21:09:23.014Z" + }, + { + "startTime": "2020-04-22T21:08:46.137Z", + "description": "Worker released", + "endTime": "2020-04-22T21:08:46.137Z" + }, + { + "startTime": "2020-04-22T21:06:04.270Z", + "description": "WaitingForValueStore", + "endTime": "2020-04-22T21:06:04.270Z" + }, + { + "startTime": "2020-04-22T21:06:32.685Z", + "description": "waiting for quota", + "endTime": "2020-04-22T21:06:36.167Z" + }, + { + "startTime": "2020-04-22T21:06:03.821Z", + "description": "RequestingExecutionToken", + "endTime": "2020-04-22T21:06:04.270Z" + }, + { + "startTime": 
"2020-04-22T21:06:03.821Z", + "description": "Pending", + "endTime": "2020-04-22T21:06:03.821Z" + }, + { + "description": "Pulling \"ubuntu@sha256:e5dd9dbb37df5b731a6688fa49f4003359f6f126958c9c928f937bec69836320\"", + "startTime": "2020-04-22T21:07:56.732Z", + "endTime": "2020-04-22T21:08:10.603Z" + }, + { + "description": "Background", + "startTime": "2020-04-22T21:08:10.821Z", + "endTime": "2020-04-22T21:08:11.542Z" + }, + { + "startTime": "2020-04-22T21:06:36.167Z", + "description": "Worker \"google-pipelines-worker-5aabb0ed0606d74e9815f84ee50615ff\" assigned in \"us-central1-b\"", + "endTime": "2020-04-22T21:07:10.790Z" + }, + { + "startTime": "2020-04-22T21:06:04.276Z", + "description": "RunningJob", + "endTime": "2020-04-22T21:06:32.685Z" + }, + { + "startTime": "2020-04-22T21:06:04.270Z", + "description": "PreparingJob", + "endTime": "2020-04-22T21:06:04.276Z" + }, + { + "startTime": "2020-04-22T21:08:46.137Z", + "description": "Complete in GCE / Cromwell Poll Interval", + "endTime": "2020-04-22T21:09:20.699Z" + }, + { + "description": "Pulling \"gcr.io/google.com/cloudsdktool/cloud-sdk:276.0.0-slim\"", + "startTime": "2020-04-22T21:07:10.790Z", + "endTime": "2020-04-22T21:07:56.732Z" + }, + { + "description": "Localization", + "startTime": "2020-04-22T21:08:17.207Z", + "endTime": "2020-04-22T21:08:29.937Z" + }, + { + "startTime": "2020-04-22T21:09:20.699Z", + "description": "UpdatingCallCache", + "endTime": "2020-04-22T21:09:22.017Z" + }, + { + "description": "UserAction", + "startTime": "2020-04-22T21:08:30.070Z", + "endTime": "2020-04-22T21:08:31.659Z" + }, + { + "description": "ContainerSetup", + "startTime": "2020-04-22T21:08:11.542Z", + "endTime": "2020-04-22T21:08:17.207Z" + } + ], + "backendLogs": { + "log": "gs://cloud-cromwell-dev-self-cleaning/cromwell_execution/ci/forkjoin/b9264f9e-c5de-4285-8397-43639b7f65e1/call-join/join.log" + }, + "start": "2020-04-22T21:06:03.821Z" + } + ], + "forkjoin.grep": [ + { + "preemptible": false, + 
"executionStatus": "Done", + "stdout": "gs://cloud-cromwell-dev-self-cleaning/cromwell_execution/ci/forkjoin/b9264f9e-c5de-4285-8397-43639b7f65e1/call-grep/stdout", + "backendStatus": "Success", + "compressedDockerSize": 26726969, + "commandLine": "grep '10' /cromwell_root/cloud-cromwell-dev-self-cleaning/cromwell_execution/ci/forkjoin/b9264f9e-c5de-4285-8397-43639b7f65e1/call-mkFile/stdout | wc -l", + "shardIndex": -1, + "jes": { + "endpointUrl": "https://lifesciences.googleapis.com/", + "machineType": "custom-1-2048", + "googleProject": "broad-dsde-cromwell-dev", + "executionBucket": "gs://cloud-cromwell-dev-self-cleaning/cromwell_execution/ci", + "zone": "us-central1-b", + "instanceName": "google-pipelines-worker-4c30a1d4b1059f5640105d06da9cf4e1" + }, + "outputs": { + "count": 21 + }, + "runtimeAttributes": { + "preemptible": "0", + "failOnStderr": "false", + "bootDiskSizeGb": "10", + "disks": "local-disk 10 SSD", + "continueOnReturnCode": "0", + "docker": "ubuntu:latest", + "maxRetries": "0", + "cpu": "1", + "cpuMin": "1", + "noAddress": "false", + "zones": "us-central1-b", + "memoryMin": "2 GB", + "memory": "2 GB" + }, + "callCaching": { + "allowResultReuse": true, + "effectiveCallCachingMode": "WriteCache", + "hashes": { + "output count": "C4CA4238A0B923820DCC509A6F75849B", + "runtime attribute": { + "docker": "5E7DF90C2675647DEC5193F290917C18", + "continueOnReturnCode": "CFCD208495D565EF66E7DFF9F98764DA", + "failOnStderr": "68934A3E9455FA72420237EB05902327" + }, + "output expression": { + "Int count": "93114C80C6826C071C28393FDD0D5F73" + }, + "input count": "C81E728D9D4C2F636F067F89CC14862C", + "backend name": "36EF4A8AB268D1A1C74D8108C93D48ED", + "command template": "B473D09A649FA4B1E4646844E5DE04C6", + "input": { + "File in_file": "4DC9uA==", + "String pattern": "C76A2042F6D6B4BABE36D71F036740E2" + } + } + }, + "inputs": { + "in_file": 
"gs://cloud-cromwell-dev-self-cleaning/cromwell_execution/ci/forkjoin/b9264f9e-c5de-4285-8397-43639b7f65e1/call-mkFile/stdout", + "pattern": "10" + }, + "backendLabels": { + "wdl-task-name": "grep", + "cromwell-workflow-id": "cromwell-b9264f9e-c5de-4285-8397-43639b7f65e1" + }, + "returnCode": 0, + "labels": { + "wdl-task-name": "grep", + "cromwell-workflow-id": "cromwell-b9264f9e-c5de-4285-8397-43639b7f65e1" + }, + "jobId": "projects/1005074806481/locations/us-central1/operations/13032426715870634389", + "backend": "Papi", + "end": "2020-04-22T21:05:32.054Z", + "dockerImageUsed": "ubuntu@sha256:e5dd9dbb37df5b731a6688fa49f4003359f6f126958c9c928f937bec69836320", + "stderr": "gs://cloud-cromwell-dev-self-cleaning/cromwell_execution/ci/forkjoin/b9264f9e-c5de-4285-8397-43639b7f65e1/call-grep/stderr", + "callRoot": "gs://cloud-cromwell-dev-self-cleaning/cromwell_execution/ci/forkjoin/b9264f9e-c5de-4285-8397-43639b7f65e1/call-grep", + "attempt": 1, + "executionEvents": [ + { + "startTime": "2020-04-22T21:05:12.514Z", + "description": "Complete in GCE / Cromwell Poll Interval", + "endTime": "2020-04-22T21:05:30.710Z" + }, + { + "description": "UserAction", + "startTime": "2020-04-22T21:04:55.709Z", + "endTime": "2020-04-22T21:04:58.739Z" + }, + { + "startTime": "2020-04-22T21:05:31.056Z", + "description": "UpdatingJobStore", + "endTime": "2020-04-22T21:05:32.054Z" + }, + { + "startTime": "2020-04-22T21:02:38.352Z", + "description": "waiting for quota", + "endTime": "2020-04-22T21:02:41.092Z" + }, + { + "description": "ContainerSetup", + "startTime": "2020-04-22T21:04:35.404Z", + "endTime": "2020-04-22T21:04:41.102Z" + }, + { + "startTime": "2020-04-22T21:02:09.265Z", + "description": "PreparingJob", + "endTime": "2020-04-22T21:02:09.280Z" + }, + { + "description": "Localization", + "startTime": "2020-04-22T21:04:41.102Z", + "endTime": "2020-04-22T21:04:55.709Z" + }, + { + "startTime": "2020-04-22T21:02:09.215Z", + "description": "RequestingExecutionToken", + "endTime": 
"2020-04-22T21:02:09.264Z" + }, + { + "startTime": "2020-04-22T21:05:12.514Z", + "description": "Worker released", + "endTime": "2020-04-22T21:05:12.514Z" + }, + { + "description": "Background", + "startTime": "2020-04-22T21:04:34.455Z", + "endTime": "2020-04-22T21:04:35.404Z" + }, + { + "startTime": "2020-04-22T21:02:09.264Z", + "description": "WaitingForValueStore", + "endTime": "2020-04-22T21:02:09.265Z" + }, + { + "description": "Pulling \"ubuntu@sha256:e5dd9dbb37df5b731a6688fa49f4003359f6f126958c9c928f937bec69836320\"", + "startTime": "2020-04-22T21:04:30.268Z", + "endTime": "2020-04-22T21:04:34.455Z" + }, + { + "startTime": "2020-04-22T21:02:41.092Z", + "description": "Worker \"google-pipelines-worker-4c30a1d4b1059f5640105d06da9cf4e1\" assigned in \"us-central1-b\"", + "endTime": "2020-04-22T21:03:13.997Z" + }, + { + "startTime": "2020-04-22T21:02:09.280Z", + "description": "RunningJob", + "endTime": "2020-04-22T21:02:38.352Z" + }, + { + "startTime": "2020-04-22T21:02:09.214Z", + "description": "Pending", + "endTime": "2020-04-22T21:02:09.215Z" + }, + { + "description": "Delocalization", + "startTime": "2020-04-22T21:04:58.739Z", + "endTime": "2020-04-22T21:05:12.514Z" + }, + { + "startTime": "2020-04-22T21:05:30.710Z", + "description": "UpdatingCallCache", + "endTime": "2020-04-22T21:05:31.056Z" + }, + { + "description": "Pulling \"gcr.io/google.com/cloudsdktool/cloud-sdk:276.0.0-slim\"", + "startTime": "2020-04-22T21:03:13.997Z", + "endTime": "2020-04-22T21:04:30.268Z" + } + ], + "backendLogs": { + "log": "gs://cloud-cromwell-dev-self-cleaning/cromwell_execution/ci/forkjoin/b9264f9e-c5de-4285-8397-43639b7f65e1/call-grep/grep.log" + }, + "start": "2020-04-22T21:02:09.214Z" + } + ] + }, + "outputs": { + "forkjoin.join.proportion": 47 + }, + "workflowRoot": "gs://cloud-cromwell-dev-self-cleaning/cromwell_execution/ci/forkjoin/b9264f9e-c5de-4285-8397-43639b7f65e1/", + "actualWorkflowLanguage": "WDL", + "id": "b9264f9e-c5de-4285-8397-43639b7f65e1", + "inputs": { 
+ "forkjoin.grep.pattern": "10" + }, + "labels": { + "cromwell-workflow-id": "cromwell-b9264f9e-c5de-4285-8397-43639b7f65e1" + }, + "submission": "2020-04-22T20:59:09.500Z", + "status": "Succeeded", + "end": "2020-04-22T21:09:24.732Z", + "start": "2020-04-22T20:59:09.819Z" +} diff --git a/scripts/metadata_comparison/test/resources/subworkflow_hello_world_metadata.json b/scripts/metadata_comparison/test/resources/subworkflow_hello_world_metadata.json new file mode 100644 index 00000000000..75384aaaa7b --- /dev/null +++ b/scripts/metadata_comparison/test/resources/subworkflow_hello_world_metadata.json @@ -0,0 +1,286 @@ +{ + "actualWorkflowLanguage": "WDL", + "actualWorkflowLanguageVersion": "draft-2", + "calls": { + "main_workflow.wf_hello": [ + { + "attempt": 1, + "end": "2020-04-16T20:51:20.931Z", + "executionEvents": [ + { + "description": "SubWorkflowRunningState", + "endTime": "2020-04-16T20:51:20.930Z", + "startTime": "2020-04-16T20:48:53.990Z" + }, + { + "description": "SubWorkflowPendingState", + "endTime": "2020-04-16T20:48:53.942Z", + "startTime": "2020-04-16T20:48:53.925Z" + }, + { + "description": "WaitingForValueStore", + "endTime": "2020-04-16T20:48:53.954Z", + "startTime": "2020-04-16T20:48:53.942Z" + }, + { + "description": "SubWorkflowPreparingState", + "endTime": "2020-04-16T20:48:53.990Z", + "startTime": "2020-04-16T20:48:53.954Z" + } + ], + "executionStatus": "Done", + "inputs": { + "wf_hello_input": "sub world" + }, + "outputs": { + "salutation": "Hello sub world!" 
+ }, + "shardIndex": -1, + "start": "2020-04-16T20:48:53.919Z", + "subWorkflowMetadata": { + "calls": { + "wf_hello.hello": [ + { + "attempt": 1, + "backend": "Papi", + "backendLabels": { + "cromwell-sub-workflow-name": "sub-wf-hello", + "cromwell-workflow-id": "cromwell-f6cf8176-f7f9-458c-9bf7-e2b2a416e28e", + "wdl-call-alias": "hello", + "wdl-task-name": "sub-hello" + }, + "backendLogs": { + "log": "gs://cloud-cromwell-dev-self-cleaning/cromwell_execution/ci/main_workflow/f6cf8176-f7f9-458c-9bf7-e2b2a416e28e/call-wf_hello/sub.wf_hello/e0350fb3-4a12-4f71-a651-63125a51def3/call-hello/hello.log" + }, + "backendStatus": "Success", + "callCaching": { + "allowResultReuse": true, + "effectiveCallCachingMode": "ReadAndWriteCache", + "hashes": { + "backend name": "36EF4A8AB268D1A1C74D8108C93D48ED", + "command template": "4EAADE3CD5D558C5A6CFA4FD101A1486", + "input": { + "String addressee": "B3EB934782A0B263A4A7759213CB3ADA" + }, + "input count": "C4CA4238A0B923820DCC509A6F75849B", + "output count": "C4CA4238A0B923820DCC509A6F75849B", + "output expression": { + "String salutation": "0183144CF6617D5341681C6B2F756046" + }, + "runtime attribute": { + "continueOnReturnCode": "CFCD208495D565EF66E7DFF9F98764DA", + "docker": "5E7DF90C2675647DEC5193F290917C18", + "failOnStderr": "68934A3E9455FA72420237EB05902327" + } + }, + "hit": false, + "result": "Cache Miss" + }, + "callRoot": "gs://cloud-cromwell-dev-self-cleaning/cromwell_execution/ci/main_workflow/f6cf8176-f7f9-458c-9bf7-e2b2a416e28e/call-wf_hello/sub.wf_hello/e0350fb3-4a12-4f71-a651-63125a51def3/call-hello", + "commandLine": "echo \"Hello sub world!\"", + "compressedDockerSize": 26726969, + "dockerImageUsed": "ubuntu@sha256:e5dd9dbb37df5b731a6688fa49f4003359f6f126958c9c928f937bec69836320", + "end": "2020-04-16T20:51:18.909Z", + "executionEvents": [ + { + "description": "Pending", + "endTime": "2020-04-16T20:48:56.128Z", + "startTime": "2020-04-16T20:48:56.099Z" + }, + { + "description": "Pulling 
\"gcr.io/google.com/cloudsdktool/cloud-sdk:276.0.0-slim\"", + "endTime": "2020-04-16T20:50:17.436Z", + "startTime": "2020-04-16T20:49:38.420Z" + }, + { + "description": "Complete in GCE / Cromwell Poll Interval", + "endTime": "2020-04-16T20:51:17.754Z", + "startTime": "2020-04-16T20:51:02.387Z" + }, + { + "description": "CallCacheReading", + "endTime": "2020-04-16T20:48:57.928Z", + "startTime": "2020-04-16T20:48:57.865Z" + }, + { + "description": "UserAction", + "endTime": "2020-04-16T20:50:47.250Z", + "startTime": "2020-04-16T20:50:45.652Z" + }, + { + "description": "Localization", + "endTime": "2020-04-16T20:50:45.652Z", + "startTime": "2020-04-16T20:50:32.437Z" + }, + { + "description": "RunningJob", + "endTime": "2020-04-16T20:49:01.635Z", + "startTime": "2020-04-16T20:48:57.928Z" + }, + { + "description": "Worker \"google-pipelines-worker-5750d321068586e8a46330cb5938b9c3\" assigned in \"us-central1-b\"", + "endTime": "2020-04-16T20:49:38.420Z", + "startTime": "2020-04-16T20:49:04.937Z" + }, + { + "description": "Pulling \"ubuntu@sha256:e5dd9dbb37df5b731a6688fa49f4003359f6f126958c9c928f937bec69836320\"", + "endTime": "2020-04-16T20:50:25.595Z", + "startTime": "2020-04-16T20:50:17.436Z" + }, + { + "description": "ContainerSetup", + "endTime": "2020-04-16T20:50:32.437Z", + "startTime": "2020-04-16T20:50:26.639Z" + }, + { + "description": "PreparingJob", + "endTime": "2020-04-16T20:48:57.865Z", + "startTime": "2020-04-16T20:48:57.149Z" + }, + { + "description": "Background", + "endTime": "2020-04-16T20:50:26.639Z", + "startTime": "2020-04-16T20:50:25.873Z" + }, + { + "description": "UpdatingCallCache", + "endTime": "2020-04-16T20:51:17.962Z", + "startTime": "2020-04-16T20:51:17.754Z" + }, + { + "description": "Delocalization", + "endTime": "2020-04-16T20:51:02.387Z", + "startTime": "2020-04-16T20:50:47.250Z" + }, + { + "description": "Worker released", + "endTime": "2020-04-16T20:51:02.387Z", + "startTime": "2020-04-16T20:51:02.387Z" + }, + { + "description": 
"waiting for quota", + "endTime": "2020-04-16T20:49:04.937Z", + "startTime": "2020-04-16T20:49:01.635Z" + }, + { + "description": "UpdatingJobStore", + "endTime": "2020-04-16T20:51:18.910Z", + "startTime": "2020-04-16T20:51:17.962Z" + }, + { + "description": "WaitingForValueStore", + "endTime": "2020-04-16T20:48:57.149Z", + "startTime": "2020-04-16T20:48:57.141Z" + }, + { + "description": "RequestingExecutionToken", + "endTime": "2020-04-16T20:48:57.141Z", + "startTime": "2020-04-16T20:48:56.128Z" + } + ], + "executionStatus": "Done", + "inputs": { + "addressee": "sub world" + }, + "jes": { + "endpointUrl": "https://genomics.googleapis.com/", + "executionBucket": "gs://cloud-cromwell-dev-self-cleaning/cromwell_execution/ci", + "googleProject": "broad-dsde-cromwell-dev", + "instanceName": "google-pipelines-worker-5750d321068586e8a46330cb5938b9c3", + "machineType": "custom-1-2048", + "zone": "us-central1-b" + }, + "jobId": "projects/broad-dsde-cromwell-dev/operations/2244029211726316446", + "labels": { + "cromwell-sub-workflow-name": "sub.wf_hello", + "cromwell-workflow-id": "cromwell-f6cf8176-f7f9-458c-9bf7-e2b2a416e28e", + "wdl-call-alias": "hello", + "wdl-task-name": "sub.hello" + }, + "outputs": { + "salutation": "Hello sub world!" 
+ }, + "preemptible": false, + "returnCode": 0, + "runtimeAttributes": { + "bootDiskSizeGb": "10", + "continueOnReturnCode": "0", + "cpu": "1", + "cpuMin": "1", + "disks": "local-disk 10 SSD", + "docker": "ubuntu:latest", + "failOnStderr": "false", + "maxRetries": "0", + "memory": "2 GB", + "memoryMin": "2 GB", + "noAddress": "false", + "preemptible": "0", + "zones": "us-central1-b" + }, + "shardIndex": -1, + "start": "2020-04-16T20:48:56.074Z", + "stderr": "gs://cloud-cromwell-dev-self-cleaning/cromwell_execution/ci/main_workflow/f6cf8176-f7f9-458c-9bf7-e2b2a416e28e/call-wf_hello/sub.wf_hello/e0350fb3-4a12-4f71-a651-63125a51def3/call-hello/stderr", + "stdout": "gs://cloud-cromwell-dev-self-cleaning/cromwell_execution/ci/main_workflow/f6cf8176-f7f9-458c-9bf7-e2b2a416e28e/call-wf_hello/sub.wf_hello/e0350fb3-4a12-4f71-a651-63125a51def3/call-hello/stdout" + } + ] + }, + "end": "2020-04-16T20:51:20.929Z", + "id": "e0350fb3-4a12-4f71-a651-63125a51def3", + "inputs": { + "wf_hello_input": "sub world" + }, + "outputs": { + "wf_hello.salutation": "Hello sub world!" + }, + "parentWorkflowId": "f6cf8176-f7f9-458c-9bf7-e2b2a416e28e", + "rootWorkflowId": "f6cf8176-f7f9-458c-9bf7-e2b2a416e28e", + "start": "2020-04-16T20:48:53.953Z", + "status": "Succeeded", + "workflowName": "wf_hello", + "workflowRoot": "gs://cloud-cromwell-dev-self-cleaning/cromwell_execution/ci/main_workflow/f6cf8176-f7f9-458c-9bf7-e2b2a416e28e/" + } + } + ] + }, + "end": "2020-04-16T20:51:22.877Z", + "id": "f6cf8176-f7f9-458c-9bf7-e2b2a416e28e", + "inputs": {}, + "labels": { + "cromwell-workflow-id": "cromwell-f6cf8176-f7f9-458c-9bf7-e2b2a416e28e", + "newlyAddedLabel": "All mimsy were the borogoves, and the mome raths outgrabe" + }, + "metadataSource": "Archived", + "outputs": { + "main_workflow.main_output": "Hello sub world!" 
+ }, + "start": "2020-04-16T20:48:49.736Z", + "status": "Succeeded", + "submission": "2020-04-16T20:48:48.934Z", + "submittedFiles": { + "imports": { + "sub_workflow_hello_world_import.wdl": "task hello {\n String addressee\n command {\n echo \"Hello ${addressee}!\"\n }\n runtime {\n docker: \"ubuntu:latest\"\n }\n output {\n String salutation = read_string(stdout())\n }\n}\n\nworkflow wf_hello {\n String wf_hello_input = \"world\"\n \n call hello { input: addressee = wf_hello_input }\n \n output {\n String salutation = hello.salutation\n }\n}\n" + }, + "inputs": "{}", + "labels": "{}", + "options": "{\n\n}", + "root": "", + "workflow": "import \"sub_workflow_hello_world_import.wdl\" as sub\n\nworkflow main_workflow {\n call sub.wf_hello { input: wf_hello_input = \"sub world\" }\n output {\n String main_output = wf_hello.salutation\n }\n}", + "workflowUrl": "" + }, + "workflowName": "main_workflow", + "workflowProcessingEvents": [ + { + "cromwellId": "cromid-e4f268e", + "cromwellVersion": "cromwell-version.conf-to-be-generated-by-sbt", + "description": "PickedUp", + "timestamp": "2020-04-16T20:48:49.675Z" + }, + { + "cromwellId": "cromid-e4f268e", + "cromwellVersion": "cromwell-version.conf-to-be-generated-by-sbt", + "description": "Finished", + "timestamp": "2020-04-16T20:51:22.878Z" + } + ], + "workflowRoot": "gs://cloud-cromwell-dev-self-cleaning/cromwell_execution/ci/main_workflow/f6cf8176-f7f9-458c-9bf7-e2b2a416e28e/" +} diff --git a/scripts/metadata_comparison/test/test_comparer.py b/scripts/metadata_comparison/test/test_comparer.py new file mode 100755 index 00000000000..ca68c7da46c --- /dev/null +++ b/scripts/metadata_comparison/test/test_comparer.py @@ -0,0 +1,63 @@ +#!/usr/bin/env python3 + +import unittest +import json +import pandas +from typing import List, Tuple +from pathlib import Path +from metadata_comparison.comparer import compare_jsons +from test.lib.storage import RESOURCES + +class ComparerTestMethods(unittest.TestCase): + + 
valid_comparison_result_file = RESOURCES / Path("comparer/valid_comparison_result.csv") + + def __read_test_json(self, filename: str) -> dict: + json_str = (RESOURCES / "comparer" / filename).read_text() + return json.loads(json_str) + + + def __produce_workflow_id_and_json_tuples(self, workflow_id_and_filename_1: Tuple[str, str], workflow_id_and_filename_2: Tuple[str, str]) -> List[Tuple[str, dict]]: + json1 = self.__read_test_json(workflow_id_and_filename_1[1]) + json2 = self.__read_test_json(workflow_id_and_filename_2[1]) + return [(workflow_id_and_filename_1[0], json1), (workflow_id_and_filename_2[0], json2)] + + + def test_compare_valid_jsons(self) -> None: + cases = [ + ((111, "performance_json_workflow_111.json"), (222, "performance_json_workflow_222.json")), + ((111, "performance_json_workflow_111.json"), (222, "performance_json_workflow_222_differently_sorted.json")) + ] + + for case in cases: + with self.subTest(case=case): + actual_workflow_ids_and_jsons = self.__produce_workflow_id_and_json_tuples(case[0], case[1]) + actual_df = compare_jsons(actual_workflow_ids_and_jsons) + expected_df = pandas.read_csv(self.valid_comparison_result_file, index_col = 0) + + are_equal = pandas.DataFrame.equals(expected_df, actual_df) + if are_equal == False: + # will print out dataframe having `true` in cells, which matching values and `false` otherwise + print(expected_df.eq(actual_df)) + + self.assertTrue(are_equal) + + + def test_compare_invalid_jsons(self) -> None: + cases = [ + ((111, "performance_json_workflow_111.json"), (222, "performance_json_changed_key.json")), + ((111, "performance_json_workflow_111.json"), (222, "performance_json_missing_key.json")), + ((111, "performance_json_workflow_111.json"), (222, "performance_json_additional_key.json")) + ] + + for case in cases: + with self.subTest(case=case): + actual_workflow_ids_and_jsons = self.__produce_workflow_id_and_json_tuples(case[0], case[1]) + with self.assertRaises(Exception) as context: + 
compare_jsons(actual_workflow_ids_and_jsons) + + self.assertTrue("doesn't have matching subset of columns" in str(context.exception)) + + +if __name__ == '__main__': + unittest.main() diff --git a/scripts/metadata_comparison/test/test_digester.py b/scripts/metadata_comparison/test/test_digester.py new file mode 100644 index 00000000000..dad99d860ca --- /dev/null +++ b/scripts/metadata_comparison/test/test_digester.py @@ -0,0 +1,187 @@ +#!/usr/bin/env python3 + +import os +import unittest +from typing import AnyStr, Callable +from metadata_comparison.lib.comparison_paths import ComparisonPath +from pathlib import Path +import logging +from metadata_comparison.digester import digest +from metadata_comparison.lib.logging import quieten_chatty_imports, set_log_verbosity +from metadata_comparison.lib.operation_ids import JsonObject +import google.auth +from google.cloud import storage + + +class DigesterTestMethods(unittest.TestCase): + set_log_verbosity(verbose=True) + quieten_chatty_imports() + + def test_digestion(self) -> None: + """ + This uses "real" metadata from the PAPI v2 performance spike to drive digester testing. The metadata is stored + in GCS and copied down to the local machine if not already present from an earlier run. The digester can run + against either local or GCS paths using `ComparisonPath`s. Local is nicer to iterate on than GCS since it + runs so much more quickly. Since it is slow GCS testing is off by default, it can be turned on by setting the + DIGESTER_TEST_GCS environment variable. + """ + + credentials, project_id = google.auth.default() + storage_client = storage.Client(credentials=credentials) + + bucket_name = 'papi-performance-analysis' + bucket = storage_client.get_bucket(bucket_name) + + # A cache of expensive-to-create GCS comparison paths. 
+ gcs_comparison_path_by_subdir = {} + papi_versions = [VERSION_PAPI_V1, VERSION_PAPI_V2] + + for papi_version in papi_versions: + subdir = subdir_for_papi_version(papi_version) + local_parent = ComparisonPath.create(subdir) + + for sample_name in EXPECTATIONS.keys(): + download_metadata_from_gcs_if_needed(sample_name, local_parent, bucket) + parents_to_test = [local_parent] + # Skip slow GCS testing unless this environment variable is set. + if os.environ.get('DIGESTER_TEST_GCS'): + parents_to_test.append(gcs_parent(subdir, gcs_comparison_path_by_subdir)) + + for parent in parents_to_test: + description = parent.description() + logging.info( + f"Running digester test on {description} for sample '{sample_name}' on backend {papi_version}") + sample_path = parent / sample_name + workflow_path = sample_path / 'workflow.json' + operations_path = sample_path / 'operations' + actual = digest(workflow_path, operations_path) + + expected = EXPECTATIONS[sample_name][papi_version] + calls: JsonObject = actual.get('calls') + + actual_total = len(calls) + self.assertEqual(actual_total, expected['total_jobs']) + + for num_attempts in [1, 2, 3]: + actual_len = len(list(filter(more_than_x_attempts(calls, num_attempts), calls))) + self.assertEqual(actual_len, expected[f'more_than_{num_attempts}_attempts']) + + for minutes_longer in range(3, 9): + actual_len = len(list(filter(more_than_x_minutes_longer(calls, minutes_longer), calls))) + expectation = expected[f'cromwell_time_more_than_{minutes_longer}_minutes_longer_total'] + self.assertEqual(actual_len, expectation) + + # Currently just a smoke test to assert not-completely-insane results for both v1 and v2 digesters. 
+ self.assertTrue(all([calls[name].get('dockerImagePullSeconds') > 0 for name in calls])) + + +def read_resource(filename: AnyStr) -> AnyStr: + path = Path('test/resources') / filename + with open(path, 'r') as file: + data = file.read() + return data + + +def download_metadata_from_gcs(bucket: storage.Bucket, local_sample_path: ComparisonPath) -> None: + (local_sample_path / "operations").mkdir_p() + + prefix = str(local_sample_path) + blobs = bucket.list_blobs(prefix=prefix) + for blob in blobs: + if not blob.name.endswith('/digest.json'): + logging.info(f'Downloading blob: {blob.name}') + blob.download_to_filename(blob.name) + + +def subdir_for_papi_version(papi_version: AnyStr) -> AnyStr: + if papi_version == VERSION_PAPI_V1: + path_element = 'PAPIv1' + elif papi_version == VERSION_PAPI_V2: + path_element = 'PAPIv2_alpha1/v1_style_machine_types' + else: + raise ValueError(f'Unrecognized PAPI version {papi_version}') + return f'exome_germline_single_sample_v1.3/{path_element}' + + +def download_metadata_from_gcs_if_needed(sample_name: AnyStr, local_parent: ComparisonPath, bucket: storage.Bucket) -> None: + """ + Copy down workflow and PAPI operations metadata from GCS if needed to test Local. 
+ """ + local_sample_path = local_parent / sample_name + if not local_sample_path.exists(): + logging.info(f"Local sample directory '{local_sample_path}' does not exist, downloading from GCS.") + download_metadata_from_gcs(bucket, local_sample_path) + + +VERSION_PAPI_V1 = 'PAPIv1' +VERSION_PAPI_V2 = 'PAPIv2_alpha1' + +EXPECTATIONS = { + 'dev_C1963.CHMI_CHMI3_Nex1': { + 'PAPIv1': { + 'total_jobs': 133, + 'more_than_1_attempts': 19, + 'more_than_2_attempts': 3, + 'more_than_3_attempts': 1, + 'cromwell_time_more_than_3_minutes_longer_total': 15, + 'cromwell_time_more_than_4_minutes_longer_total': 4, + 'cromwell_time_more_than_5_minutes_longer_total': 2, + 'cromwell_time_more_than_6_minutes_longer_total': 1, + 'cromwell_time_more_than_7_minutes_longer_total': 1, + 'cromwell_time_more_than_8_minutes_longer_total': 0, + }, + 'PAPIv2_alpha1': { + 'total_jobs': 133, + 'more_than_1_attempts': 12, + 'more_than_2_attempts': 1, + 'more_than_3_attempts': 0, + 'cromwell_time_more_than_3_minutes_longer_total': 21, + 'cromwell_time_more_than_4_minutes_longer_total': 7, + 'cromwell_time_more_than_5_minutes_longer_total': 4, + 'cromwell_time_more_than_6_minutes_longer_total': 2, + 'cromwell_time_more_than_7_minutes_longer_total': 1, + 'cromwell_time_more_than_8_minutes_longer_total': 0, + # insert more intelligent assertions here + } + } + # more samples if needed + # 'dev_C862.NA19238', + # 'dev_D5327.NA12878', + # 'dev_D5327.NA12891', + # 'dev_D5327.NA12892', + # 'dev_RP-1535.NA17-308' +} + + +def more_than_x_attempts(calls: JsonObject, attempts: int) -> Callable[[AnyStr], bool]: + """ + Return a function to filter the calls that had more than the specified number of attempts. 
+ """ + def inner(call_name: AnyStr) -> bool: + return calls.get(call_name).get('attempt') > attempts + + return inner + + +def more_than_x_minutes_longer(calls: JsonObject, minutes: int) -> Callable[[AnyStr], bool]: + """ + Return a function to filter the calls that ran for more than the specified number of minutes. + """ + def inner(call_name: AnyStr) -> bool: + return calls.get(call_name).get('cromwellAdditionalTotalTimeSeconds') > minutes * 60 + + return inner + + +def gcs_parent(subdir: AnyStr, gcs_comparison_path_by_subdir: dict) -> ComparisonPath: + """ + GcsComparisonPaths are somewhat expensive to create so cache them. + """ + if subdir not in gcs_comparison_path_by_subdir: + path = ComparisonPath.create(f'gs://papi-performance-analysis/{subdir}') + gcs_comparison_path_by_subdir[subdir] = path + return gcs_comparison_path_by_subdir[subdir] + + +if __name__ == '__main__': + unittest.main() diff --git a/server/src/test/scala/cromwell/engine/WorkflowStoreActorSpec.scala b/server/src/test/scala/cromwell/engine/WorkflowStoreActorSpec.scala index 4626977b069..3702cc76d32 100644 --- a/server/src/test/scala/cromwell/engine/WorkflowStoreActorSpec.scala +++ b/server/src/test/scala/cromwell/engine/WorkflowStoreActorSpec.scala @@ -213,7 +213,7 @@ class WorkflowStoreActorSpec extends CromwellTestKitWordSpec with CoordinatedWor eventually(timeout(15.seconds.dilated), interval(500.millis.dilated)) { val actorNameUniquificationString = UUID.randomUUID().toString.take(7) val readMetadataActor = system.actorOf( - ReadDatabaseMetadataWorkerActor.props(metadataReadTimeout = 30 seconds), + ReadDatabaseMetadataWorkerActor.props(metadataReadTimeout = 30 seconds, metadataReadRowNumberSafetyThreshold = 20000), s"ReadMetadataActor-FetchEncryptedOptions-$actorNameUniquificationString" ) diff --git a/server/src/test/scala/cromwell/engine/workflow/lifecycle/execution/ejea/EjeaBackendIsCopyingCachedOutputsSpec.scala 
b/server/src/test/scala/cromwell/engine/workflow/lifecycle/execution/ejea/EjeaBackendIsCopyingCachedOutputsSpec.scala index 7971b838fb5..4bd076e707b 100644 --- a/server/src/test/scala/cromwell/engine/workflow/lifecycle/execution/ejea/EjeaBackendIsCopyingCachedOutputsSpec.scala +++ b/server/src/test/scala/cromwell/engine/workflow/lifecycle/execution/ejea/EjeaBackendIsCopyingCachedOutputsSpec.scala @@ -4,10 +4,10 @@ import cats.data.NonEmptyList import cromwell.core.callcaching._ import cromwell.engine.workflow.lifecycle.execution.job.EngineJobExecutionActor._ import cromwell.engine.workflow.lifecycle.execution.callcaching.CallCacheReadingJobActor.NextHit -import cromwell.engine.workflow.lifecycle.execution.callcaching.CallCachingEntryId import cromwell.engine.workflow.lifecycle.execution.callcaching.EngineJobHashingActor.{CacheHit, CallCacheHashes, EJHAResponse, HashError} import cromwell.engine.workflow.lifecycle.execution.ejea.EngineJobExecutionActorSpec._ import cromwell.engine.workflow.lifecycle.execution.ejea.HasJobSuccessResponse.SuccessfulCallCacheHashes +import cromwell.services.CallCaching.CallCachingEntryId import cromwell.services.instrumentation.{CromwellBucket, CromwellIncrement} import cromwell.services.instrumentation.InstrumentationService.InstrumentationServiceMessage @@ -102,10 +102,10 @@ class EjeaBackendIsCopyingCachedOutputsSpec extends EngineJobExecutionActorSpec } if (mode.readFromCache) { - s"invalidate a call for caching if backend coping failed when it was going to receive $hashComboName, if call caching is $mode" in { + s"invalidate a call for caching if backend copying failed when it was going to receive $hashComboName, if call caching is $mode" in { ejea = ejeaInBackendIsCopyingCachedOutputsState(initialHashData, mode) // Send the response from the copying actor - ejea ! loggableFailedToCopyResponse(cacheHitNumber) + ejea ! 
copyAttemptFailedResponse(cacheHitNumber) expectInvalidateCallCacheActor(cacheId) eventually { @@ -118,11 +118,12 @@ class EjeaBackendIsCopyingCachedOutputsSpec extends EngineJobExecutionActorSpec Option(helper.ejhaProbe.ref), cacheHit, None, - 1, + cacheHitFailureCount = 1, + failedCopyAttempts = 1 )) } - s"not invalidate a call for caching if backend coping failed when invalidation is disabled, when it was going to receive $hashComboName, if call caching is $mode" in { + s"not invalidate a call for caching if backend copying failed when invalidation is disabled, when it was going to receive $hashComboName, if call caching is $mode" in { val invalidationDisabledOptions = CallCachingOptions(invalidateBadCacheResults = false, workflowOptionCallCachePrefixes = None) val cacheInvalidationDisabledMode = mode match { case CallCachingActivity(rw, _) => CallCachingActivity(rw, invalidationDisabledOptions) @@ -130,7 +131,7 @@ class EjeaBackendIsCopyingCachedOutputsSpec extends EngineJobExecutionActorSpec } ejea = ejeaInBackendIsCopyingCachedOutputsState(initialHashData, cacheInvalidationDisabledMode) // Send the response from the copying actor - ejea ! loggableFailedToCopyResponse(cacheHitNumber) + ejea ! 
copyAttemptFailedResponse(cacheHitNumber) helper.ejhaProbe.expectMsg(NextHit) @@ -139,7 +140,7 @@ class EjeaBackendIsCopyingCachedOutputsSpec extends EngineJobExecutionActorSpec } // Make sure we didn't start invalidating anything: helper.invalidateCacheActorCreations.hasExactlyOne should be(false) - ejea.stateData should be(ResponsePendingData(helper.backendJobDescriptor, helper.bjeaProps, initialHashData, Option(helper.ejhaProbe.ref), cacheHit)) + ejea.stateData should be(ResponsePendingData(helper.backendJobDescriptor, helper.bjeaProps, initialHashData, Option(helper.ejhaProbe.ref), cacheHit, cacheHitFailureCount = 1, failedCopyAttempts = 1)) } def checkInvalidateOnCopyFailure(expectMetric: Boolean) = { @@ -155,7 +156,7 @@ class EjeaBackendIsCopyingCachedOutputsSpec extends EngineJobExecutionActorSpec helper.jobStoreProbe.expectNoMessage(awaitAlmostNothing) // Send the response from the copying actor - val copyFailureMessage = if (expectMetric) metricableFailedToCopyResponse(cacheHitNumber) else loggableFailedToCopyResponse(cacheHitNumber) + val copyFailureMessage = if (expectMetric) cacheHitBlacklistedResponse(cacheHitNumber) else copyAttemptFailedResponse(cacheHitNumber) ejea ! copyFailureMessage expectInvalidateCallCacheActor(cacheId) @@ -169,7 +170,10 @@ class EjeaBackendIsCopyingCachedOutputsSpec extends EngineJobExecutionActorSpec Option(helper.ejhaProbe.ref), cacheHit, None, - 1, + cacheHitFailureCount = 1, + failedCopyAttempts = if (expectMetric) 0 else 1 + // In this context `expectMetric` means "was blacklisted". + // If the cache hit was blacklisted there should have been no additional copy attempt so no failed copy attempt. 
)) if (expectMetric) { diff --git a/server/src/test/scala/cromwell/engine/workflow/lifecycle/execution/ejea/EjeaCheckingCallCacheSpec.scala b/server/src/test/scala/cromwell/engine/workflow/lifecycle/execution/ejea/EjeaCheckingCallCacheSpec.scala index 51e765f4e2b..6ba6141daeb 100644 --- a/server/src/test/scala/cromwell/engine/workflow/lifecycle/execution/ejea/EjeaCheckingCallCacheSpec.scala +++ b/server/src/test/scala/cromwell/engine/workflow/lifecycle/execution/ejea/EjeaCheckingCallCacheSpec.scala @@ -2,9 +2,9 @@ package cromwell.engine.workflow.lifecycle.execution.ejea import cromwell.core.callcaching.{CallCachingActivity, CallCachingOff, ReadCache} import cromwell.engine.workflow.lifecycle.execution.job.EngineJobExecutionActor.{CheckingCallCache, FetchingCachedOutputsFromDatabase, ResponsePendingData, RunningJob} -import cromwell.engine.workflow.lifecycle.execution.callcaching.CallCachingEntryId import cromwell.engine.workflow.lifecycle.execution.callcaching.EngineJobHashingActor.{CacheHit, CacheMiss, HashError} import cromwell.engine.workflow.lifecycle.execution.ejea.EngineJobExecutionActorSpec.EnhancedTestEJEA +import cromwell.services.CallCaching.CallCachingEntryId import org.scalatest.concurrent.Eventually import scala.util.control.NoStackTrace diff --git a/server/src/test/scala/cromwell/engine/workflow/lifecycle/execution/ejea/EjeaFetchingCachedOutputsFromDatabaseSpec.scala b/server/src/test/scala/cromwell/engine/workflow/lifecycle/execution/ejea/EjeaFetchingCachedOutputsFromDatabaseSpec.scala index 55d25c28857..fab5cfa0b65 100644 --- a/server/src/test/scala/cromwell/engine/workflow/lifecycle/execution/ejea/EjeaFetchingCachedOutputsFromDatabaseSpec.scala +++ b/server/src/test/scala/cromwell/engine/workflow/lifecycle/execution/ejea/EjeaFetchingCachedOutputsFromDatabaseSpec.scala @@ -7,12 +7,11 @@ import cromwell.core.simpleton.WomValueSimpleton import cromwell.engine.workflow.lifecycle.execution.job.EngineJobExecutionActor._ import 
cromwell.engine.workflow.lifecycle.execution.callcaching.EngineJobHashingActor.{CacheHit, HashError} import cromwell.engine.workflow.lifecycle.execution.callcaching.FetchCachedResultsActor.{CachedOutputLookupFailed, CachedOutputLookupSucceeded} -import cromwell.engine.workflow.lifecycle.execution.callcaching.CallCachingEntryId import cromwell.engine.workflow.lifecycle.execution.ejea.EngineJobExecutionActorSpec._ import cromwell.engine.workflow.lifecycle.execution.ejea.HasJobSuccessResponse.SuccessfulCallCacheHashes import wom.values.WomString - import common.assertion.CaseClassAssertions._ +import cromwell.services.CallCaching.CallCachingEntryId import scala.util.{Failure, Success} @@ -41,7 +40,7 @@ class EjeaFetchingCachedOutputsFromDatabaseSpec extends EngineJobExecutionActorS val cachedReturnCode = Some(17) val sourceCacheDetails = s"${WorkflowId.randomId()}:call-someTask:1" ejea ! CachedOutputLookupSucceeded(cachedSimpletons, detritusMap, cachedReturnCode, callCachingEntryId, sourceCacheDetails) - helper.callCacheHitCopyingProbe.expectMsg(CopyOutputsCommand(cachedSimpletons, detritusMap, cachedReturnCode)) + helper.callCacheHitCopyingProbe.expectMsg(CopyOutputsCommand(cachedSimpletons, detritusMap, callCachingEntryId, cachedReturnCode)) // Check we end up in the right state: ejea.stateName should be(BackendIsCopyingCachedOutputs) diff --git a/server/src/test/scala/cromwell/engine/workflow/lifecycle/execution/ejea/EjeaInvalidatingCacheEntrySpec.scala b/server/src/test/scala/cromwell/engine/workflow/lifecycle/execution/ejea/EjeaInvalidatingCacheEntrySpec.scala index 6260ac6f2ec..bc860b8a7cc 100644 --- a/server/src/test/scala/cromwell/engine/workflow/lifecycle/execution/ejea/EjeaInvalidatingCacheEntrySpec.scala +++ b/server/src/test/scala/cromwell/engine/workflow/lifecycle/execution/ejea/EjeaInvalidatingCacheEntrySpec.scala @@ -4,8 +4,9 @@ import akka.actor.ActorRef import cromwell.core.callcaching.{CallCachingActivity, ReadCache} import 
cromwell.engine.workflow.lifecycle.execution.job.EngineJobExecutionActor._ import cromwell.engine.workflow.lifecycle.execution.callcaching.CallCacheReadingJobActor.NextHit -import cromwell.engine.workflow.lifecycle.execution.callcaching.{CallCacheInvalidatedFailure, CallCacheInvalidatedSuccess, CallCachingEntryId} +import cromwell.engine.workflow.lifecycle.execution.callcaching.{CallCacheInvalidatedFailure, CallCacheInvalidatedSuccess} import cromwell.engine.workflow.lifecycle.execution.ejea.EngineJobExecutionActorSpec._ +import cromwell.services.CallCaching.CallCachingEntryId class EjeaInvalidatingCacheEntrySpec extends EngineJobExecutionActorSpec { diff --git a/server/src/test/scala/cromwell/engine/workflow/lifecycle/execution/ejea/EngineJobExecutionActorSpecUtil.scala b/server/src/test/scala/cromwell/engine/workflow/lifecycle/execution/ejea/EngineJobExecutionActorSpecUtil.scala index d7f674f1325..9f7770e5b6f 100644 --- a/server/src/test/scala/cromwell/engine/workflow/lifecycle/execution/ejea/EngineJobExecutionActorSpecUtil.scala +++ b/server/src/test/scala/cromwell/engine/workflow/lifecycle/execution/ejea/EngineJobExecutionActorSpecUtil.scala @@ -1,14 +1,14 @@ package cromwell.engine.workflow.lifecycle.execution.ejea -import cromwell.backend.BackendCacheHitCopyingActor.{CopyingOutputsFailedResponse, LoggableCacheCopyError, MetricableCacheCopyError} +import cromwell.backend.BackendCacheHitCopyingActor.{CopyingOutputsFailedResponse, CopyAttemptError, BlacklistSkip} import cromwell.backend.{BackendJobDescriptor, MetricableCacheCopyErrorCategory} import cromwell.backend.BackendJobExecutionActor._ import cromwell.core.callcaching._ -import cromwell.engine.workflow.lifecycle.execution.callcaching.CallCachingEntryId import cromwell.engine.workflow.lifecycle.execution.callcaching.EngineJobHashingActor.{CallCacheHashes, FileHashes} import cromwell.engine.workflow.lifecycle.execution.job.EngineJobExecutionActor.{EJEAData, SucceededResponseData, UpdatingCallCache, 
UpdatingJobStore} import cromwell.jobstore.JobStoreActor.RegisterJobCompleted import cromwell.jobstore.{JobResultFailure, JobResultSuccess, JobStoreKey} +import cromwell.services.CallCaching.CallCachingEntryId import cromwell.util.WomMocks import org.scalatest.concurrent.Eventually import wom.values.{WomInteger, WomString} @@ -120,6 +120,6 @@ private[ejea] trait HasCopyFailureResponses { self: EngineJobExecutionActorSpec new Exception("Deliberate failure for test case: failed to copy cache outputs!") with NoStackTrace // Need to delay making the response because job descriptors come from the per-test "helper", which is null outside tests! - def loggableFailedToCopyResponse(attemptNumber: Int) = CopyingOutputsFailedResponse(helper.jobDescriptorKey, attemptNumber, LoggableCacheCopyError(copyFailureReason)) - def metricableFailedToCopyResponse(attemptNumber: Int) = CopyingOutputsFailedResponse(helper.jobDescriptorKey, attemptNumber, MetricableCacheCopyError(MetricableCacheCopyErrorCategory.BucketBlacklisted)) + def copyAttemptFailedResponse(attemptNumber: Int) = CopyingOutputsFailedResponse(helper.jobDescriptorKey, attemptNumber, CopyAttemptError(copyFailureReason)) + def cacheHitBlacklistedResponse(attemptNumber: Int) = CopyingOutputsFailedResponse(helper.jobDescriptorKey, attemptNumber, BlacklistSkip(MetricableCacheCopyErrorCategory.BucketBlacklisted)) } diff --git a/server/src/test/scala/cromwell/engine/workflow/lifecycle/execution/ejea/PerTestHelper.scala b/server/src/test/scala/cromwell/engine/workflow/lifecycle/execution/ejea/PerTestHelper.scala index 3c21d55dd13..eed4c1ed9c7 100644 --- a/server/src/test/scala/cromwell/engine/workflow/lifecycle/execution/ejea/PerTestHelper.scala +++ b/server/src/test/scala/cromwell/engine/workflow/lifecycle/execution/ejea/PerTestHelper.scala @@ -8,11 +8,11 @@ import cromwell.backend.standard.callcaching._ import cromwell.core.callcaching._ import cromwell.core.{CallOutputs, HogGroup, WorkflowId, WorkflowOptions} import 
cromwell.engine.EngineWorkflowDescriptor -import cromwell.engine.workflow.lifecycle.execution.callcaching.CallCachingEntryId import cromwell.engine.workflow.lifecycle.execution.ejea.EngineJobExecutionActorSpec._ import cromwell.engine.workflow.lifecycle.execution.job.EngineJobExecutionActor -import cromwell.engine.workflow.lifecycle.execution.job.EngineJobExecutionActor.{EJEAData, EngineJobExecutionActorState, ResponsePendingData} +import cromwell.engine.workflow.lifecycle.execution.job.EngineJobExecutionActor.{CallCachingParameters, EJEAData, EngineJobExecutionActorState, ResponsePendingData} import cromwell.engine.workflow.mocks.{DeclarationMock, TaskMock, WdlWomExpressionMock} +import cromwell.services.CallCaching.CallCachingEntryId import cromwell.util.AkkaTestUtil._ import cromwell.util.WomMocks import org.specs2.mock.Mockito @@ -129,6 +129,15 @@ private[ejea] class PerTestHelper(implicit val system: ActorSystem) extends Mock val factory: BackendLifecycleActorFactory = buildFactory() val descriptor = EngineWorkflowDescriptor(WomMocks.mockWorkflowDefinition(workflowName), backendWorkflowDescriptor, null, null, null, callCachingMode) + val callCachingParameters = CallCachingParameters( + mode = callCachingMode, + readActor = callCacheReadActorProbe.ref, + writeActor = callCacheWriteActorProbe.ref, + fileHashCacheActor = Option(dockerHashActorProbe.ref), + maxFailedCopyAttempts = 1000000, + blacklistCache = None + ) + val myBrandNewEjea = new TestFSMRef[EngineJobExecutionActorState, EJEAData, MockEjea](system, Props(new MockEjea( helper = this, jobPreparationProbe = jobPreparationProbe, @@ -141,11 +150,9 @@ private[ejea] class PerTestHelper(implicit val system: ActorSystem) extends Mock serviceRegistryActor = serviceRegistryProbe.ref, ioActor = ioActorProbe.ref, jobStoreActor = jobStoreProbe.ref, - callCacheReadActor = callCacheReadActorProbe.ref, - callCacheWriteActor = callCacheWriteActorProbe.ref, dockerHashActor = dockerHashActorProbe.ref, 
jobTokenDispenserActor = jobTokenDispenserProbe.ref, - callCachingMode = callCachingMode + callCachingParameters = callCachingParameters )), parentProbe.ref, s"EngineJobExecutionActorSpec-$workflowId") deathwatch watch myBrandNewEjea @@ -164,16 +171,23 @@ private[ejea] class MockEjea(helper: PerTestHelper, serviceRegistryActor: ActorRef, ioActor: ActorRef, jobStoreActor: ActorRef, - callCacheReadActor: ActorRef, - callCacheWriteActor: ActorRef, dockerHashActor: ActorRef, jobTokenDispenserActor: ActorRef, - callCachingMode: CallCachingMode) extends EngineJobExecutionActor(replyTo, jobDescriptorKey, workflowDescriptor, factory, - initializationData, restarting, serviceRegistryActor, ioActor, - jobStoreActor, callCacheReadActor, callCacheWriteActor, - dockerHashActor, jobTokenDispenserActor, None, callCachingMode, - if (restarting) RecoverJobCommand else ExecuteJobCommand, fileHashCachingActor = None, blacklistCache = None -) { + callCachingParameters: EngineJobExecutionActor.CallCachingParameters) extends EngineJobExecutionActor( + replyTo = replyTo, + jobDescriptorKey = jobDescriptorKey, + workflowDescriptor = workflowDescriptor, + backendLifecycleActorFactory = factory, + initializationData = initializationData, + restarting = restarting, + serviceRegistryActor = serviceRegistryActor, + ioActor = ioActor, + jobStoreActor = jobStoreActor, + workflowDockerLookupActor = dockerHashActor, + jobTokenDispenserActor = jobTokenDispenserActor, + backendSingletonActor = None, + command = if (restarting) RecoverJobCommand else ExecuteJobCommand, + callCachingParameters = callCachingParameters) { implicit val system = context.system override def makeFetchCachedResultsActor(cacheId: CallCachingEntryId) = helper.fetchCachedResultsActorCreations = helper.fetchCachedResultsActorCreations.foundOne((cacheId, null)) diff --git a/services/src/main/scala/cromwell/services/CallCaching.scala b/services/src/main/scala/cromwell/services/CallCaching.scala new file mode 100644 index 
00000000000..c85431ca752 --- /dev/null +++ b/services/src/main/scala/cromwell/services/CallCaching.scala @@ -0,0 +1,5 @@ +package cromwell.services + +object CallCaching { + final case class CallCachingEntryId(id: Int) +} diff --git a/services/src/main/scala/cromwell/services/instrumentation/AsynchronousThrottlingGaugeMetricActor.scala b/services/src/main/scala/cromwell/services/instrumentation/AsynchronousThrottlingGaugeMetricActor.scala index 08570a0b55b..1f1f4567514 100644 --- a/services/src/main/scala/cromwell/services/instrumentation/AsynchronousThrottlingGaugeMetricActor.scala +++ b/services/src/main/scala/cromwell/services/instrumentation/AsynchronousThrottlingGaugeMetricActor.scala @@ -28,7 +28,7 @@ class AsynchronousThrottlingGaugeMetricActor(metricPath: NonEmptyList[String], } when(MetricCalculationInProgress) { - case Event(CalculateMetricValue, _) => + case Event(CalculateMetricValue(_), _) => // metric actor is already busy calculating metric value, so we dismiss this request stay() case Event(MetricValue(value), _) => diff --git a/services/src/main/scala/cromwell/services/metadata/MetadataArchiveStatus.scala b/services/src/main/scala/cromwell/services/metadata/MetadataArchiveStatus.scala index 406d53706ca..74de125808d 100644 --- a/services/src/main/scala/cromwell/services/metadata/MetadataArchiveStatus.scala +++ b/services/src/main/scala/cromwell/services/metadata/MetadataArchiveStatus.scala @@ -7,7 +7,7 @@ import MetadataArchiveStatus._ sealed trait MetadataArchiveStatus { final def isArchived = this match { case Archived | ArchivedAndPurged => true - case ArchiveFailed | Unarchived => false + case ArchiveFailed | Unarchived | TooLargeToArchive => false } } @@ -34,5 +34,6 @@ object MetadataArchiveStatus { case object Archived extends MetadataArchiveStatus case object ArchivedAndPurged extends MetadataArchiveStatus // `purged` means that original data is deleted from METADATA_ENTRY table case object ArchiveFailed extends MetadataArchiveStatus + case 
object TooLargeToArchive extends MetadataArchiveStatus // would cause OOM on attempt to load metadata in memory } diff --git a/services/src/main/scala/cromwell/services/metadata/MetadataService.scala b/services/src/main/scala/cromwell/services/metadata/MetadataService.scala index e8d34c34bbf..e243d78a5f7 100644 --- a/services/src/main/scala/cromwell/services/metadata/MetadataService.scala +++ b/services/src/main/scala/cromwell/services/metadata/MetadataService.scala @@ -103,7 +103,11 @@ object MetadataService { } } - final case class GetMetadataAction(key: MetadataQuery, metadataSourceOverride: Option[MetadataSourceOverride] = None) extends BuildWorkflowMetadataJsonWithOverridableSourceAction { + final case class GetMetadataAction(key: MetadataQuery, + metadataSourceOverride: Option[MetadataSourceOverride] = None, + checkTotalMetadataRowNumberBeforeQuerying: Boolean = true) + extends BuildWorkflowMetadataJsonWithOverridableSourceAction { + override def workflowId: WorkflowId = key.workflowId } final case class GetStatus(workflowId: WorkflowId) extends BuildWorkflowMetadataJsonAction @@ -132,7 +136,8 @@ object MetadataService { } final case class MetadataLookupJsonResponse(query: MetadataQuery, result: Json) extends MetadataServiceResponse - final case class MetadataLookupFailed(query: MetadataQuery, reason: Throwable) + final case class MetadataLookupFailedTooLargeResponse(query: MetadataQuery, metadataSizeRows: Int) extends MetadataServiceResponse + final case class MetadataLookupFailedTimeoutResponse(query: MetadataQuery) extends MetadataServiceResponse final case class MetadataLookupResponse(query: MetadataQuery, eventList: Seq[MetadataEvent]) extends MetadataServiceResponse final case class MetadataServiceKeyLookupFailed(query: MetadataQuery, reason: Throwable) extends MetadataServiceFailure diff --git a/services/src/main/scala/cromwell/services/metadata/impl/MetadataDatabaseAccess.scala 
b/services/src/main/scala/cromwell/services/metadata/impl/MetadataDatabaseAccess.scala index ca597a6717e..844112b8014 100644 --- a/services/src/main/scala/cromwell/services/metadata/impl/MetadataDatabaseAccess.scala +++ b/services/src/main/scala/cromwell/services/metadata/impl/MetadataDatabaseAccess.scala @@ -100,7 +100,16 @@ trait MetadataDatabaseAccess { MetadataEntry(workflowUuid, jobKey.map(_._1), jobKey.flatMap(_._2), jobKey.map(_._3), key.key, value.toClobOption, valueType, timestamp) } - metadataDatabaseInterface.addMetadataEntries(metadata) + metadataDatabaseInterface.addMetadataEntries( + metadataEntries = metadata, + startMetadataKey = WorkflowMetadataKeys.StartTime, + endMetadataKey = WorkflowMetadataKeys.EndTime, + nameMetadataKey = WorkflowMetadataKeys.Name, + statusMetadataKey = WorkflowMetadataKeys.Status, + submissionMetadataKey = WorkflowMetadataKeys.SubmissionTime, + parentWorkflowIdKey = WorkflowMetadataKeys.ParentWorkflowId, + rootWorkflowIdKey = WorkflowMetadataKeys.RootWorkflowId, + labelMetadataKey = WorkflowMetadataKeys.Labels) } private def metadataToMetadataEvents(workflowId: WorkflowId)(metadata: Seq[MetadataEntry]): Seq[MetadataEvent] = { @@ -120,6 +129,10 @@ trait MetadataDatabaseAccess { } } + def queryMetadataEventsTotalRowNumber(workflowId: WorkflowId, timeout: Duration)(implicit ec: ExecutionContext): Future[Int] = { + metadataDatabaseInterface.getMetadataTotalRowNumberByRootWorkflowId(workflowId.toString, timeout) + } + def queryMetadataEvents(query: MetadataQuery, timeout: Duration)(implicit ec: ExecutionContext): Future[Seq[MetadataEvent]] = { def listKeyRequirements(keyRequirementsInput: Option[NonEmptyList[String]]): List[String] = keyRequirementsInput.map(_.toList).toList.flatten.map(_ + "%") @@ -170,26 +183,12 @@ trait MetadataDatabaseAccess { def refreshWorkflowMetadataSummaries(limit: Int)(implicit ec: ExecutionContext): Future[SummaryResult] = { for { increasingProcessed <- metadataDatabaseInterface.summarizeIncreasing( - 
startMetadataKey = WorkflowMetadataKeys.StartTime, - endMetadataKey = WorkflowMetadataKeys.EndTime, - nameMetadataKey = WorkflowMetadataKeys.Name, - statusMetadataKey = WorkflowMetadataKeys.Status, - submissionMetadataKey = WorkflowMetadataKeys.SubmissionTime, - parentWorkflowIdKey = WorkflowMetadataKeys.ParentWorkflowId, - rootWorkflowIdKey = WorkflowMetadataKeys.RootWorkflowId, labelMetadataKey = WorkflowMetadataKeys.Labels, limit = limit, buildUpdatedSummary = MetadataDatabaseAccess.buildUpdatedSummary) (decreasingProcessed, decreasingGap) <- metadataDatabaseInterface.summarizeDecreasing( summaryNameDecreasing = WorkflowMetadataKeys.SummaryNameDecreasing, summaryNameIncreasing = WorkflowMetadataKeys.SummaryNameIncreasing, - startMetadataKey = WorkflowMetadataKeys.StartTime, - endMetadataKey = WorkflowMetadataKeys.EndTime, - nameMetadataKey = WorkflowMetadataKeys.Name, - statusMetadataKey = WorkflowMetadataKeys.Status, - submissionMetadataKey = WorkflowMetadataKeys.SubmissionTime, - parentWorkflowIdKey = WorkflowMetadataKeys.ParentWorkflowId, - rootWorkflowIdKey = WorkflowMetadataKeys.RootWorkflowId, labelMetadataKey = WorkflowMetadataKeys.Labels, limit = limit, buildUpdatedSummary = MetadataDatabaseAccess.buildUpdatedSummary) diff --git a/services/src/main/scala/cromwell/services/metadata/impl/MetadataServiceActor.scala b/services/src/main/scala/cromwell/services/metadata/impl/MetadataServiceActor.scala index 9a7cceb4fcc..44b212debf1 100644 --- a/services/src/main/scala/cromwell/services/metadata/impl/MetadataServiceActor.scala +++ b/services/src/main/scala/cromwell/services/metadata/impl/MetadataServiceActor.scala @@ -49,9 +49,17 @@ case class MetadataServiceActor(serviceConfig: Config, globalConfig: Config, ser private val metadataReadTimeout: Duration = serviceConfig.getOrElse[Duration]("metadata-read-query-timeout", Duration.Inf) + private val metadataReadRowNumberSafetyThreshold: Int = + 
serviceConfig.getOrElse[Int]("metadata-read-row-number-safety-threshold", 1000000) - def readMetadataWorkerActorProps(): Props = ReadDatabaseMetadataWorkerActor.props(metadataReadTimeout).withDispatcher(ServiceDispatcher) - def metadataBuilderActorProps(): Props = MetadataBuilderActor.props(readMetadataWorkerActorProps).withDispatcher(ServiceDispatcher) + def readMetadataWorkerActorProps(): Props = + ReadDatabaseMetadataWorkerActor + .props(metadataReadTimeout, metadataReadRowNumberSafetyThreshold) + .withDispatcher(ServiceDispatcher) + + def metadataBuilderActorProps(): Props = MetadataBuilderActor + .props(readMetadataWorkerActorProps, metadataReadRowNumberSafetyThreshold) + .withDispatcher(ServiceDispatcher) val readActor = context.actorOf(ReadMetadataRegulatorActor.props(metadataBuilderActorProps, readMetadataWorkerActorProps), "ClassicMSA-ReadMetadataRegulatorActor") diff --git a/services/src/main/scala/cromwell/services/metadata/impl/ReadDatabaseMetadataWorkerActor.scala b/services/src/main/scala/cromwell/services/metadata/impl/ReadDatabaseMetadataWorkerActor.scala index f65937bbee0..21a54e48ee2 100644 --- a/services/src/main/scala/cromwell/services/metadata/impl/ReadDatabaseMetadataWorkerActor.scala +++ b/services/src/main/scala/cromwell/services/metadata/impl/ReadDatabaseMetadataWorkerActor.scala @@ -1,5 +1,7 @@ package cromwell.services.metadata.impl +import java.sql.SQLTimeoutException + import akka.actor.{Actor, ActorLogging, ActorRef, PoisonPill, Props} import cromwell.core.Dispatcher.ServiceDispatcher import cromwell.core.{WorkflowId, WorkflowSubmitted} @@ -12,15 +14,21 @@ import scala.concurrent.duration.Duration import scala.util.Try object ReadDatabaseMetadataWorkerActor { - def props(metadataReadTimeout: Duration) = Props(new ReadDatabaseMetadataWorkerActor(metadataReadTimeout)).withDispatcher(ServiceDispatcher) + def props(metadataReadTimeout: Duration, metadataReadRowNumberSafetyThreshold: Int) = + Props(new 
ReadDatabaseMetadataWorkerActor(metadataReadTimeout, metadataReadRowNumberSafetyThreshold)).withDispatcher(ServiceDispatcher) } -class ReadDatabaseMetadataWorkerActor(metadataReadTimeout: Duration) extends Actor with ActorLogging with MetadataDatabaseAccess with MetadataServicesStore { +class ReadDatabaseMetadataWorkerActor(metadataReadTimeout: Duration, metadataReadRowNumberSafetyThreshold: Int) + extends Actor + with ActorLogging + with MetadataDatabaseAccess + with MetadataServicesStore { implicit val ec = context.dispatcher def receive = { - case GetMetadataAction(query: MetadataQuery, _) => evaluateRespondAndStop(sender(), getMetadata(query)) + case GetMetadataAction(query: MetadataQuery, _, checkTotalMetadataRowNumberBeforeQuerying: Boolean) => + evaluateRespondAndStop(sender(), getMetadata(query, checkTotalMetadataRowNumberBeforeQuerying)) case GetStatus(workflowId) => evaluateRespondAndStop(sender(), getStatus(workflowId)) case GetLabels(workflowId) => evaluateRespondAndStop(sender(), queryLabelsAndRespond(workflowId)) case GetRootAndSubworkflowLabels(rootWorkflowId: WorkflowId) => evaluateRespondAndStop(sender(), queryRootAndSubworkflowLabelsAndRespond(rootWorkflowId)) @@ -41,14 +49,30 @@ class ReadDatabaseMetadataWorkerActor(metadataReadTimeout: Duration) extends Act () } - private def getMetadata(query: MetadataQuery): Future[MetadataServiceResponse] = { + private def getMetadata(query: MetadataQuery, checkResultSizeBeforeQuerying: Boolean): Future[MetadataServiceResponse] = { + if (checkResultSizeBeforeQuerying) { + queryMetadataEventsTotalRowNumber(query.workflowId, metadataReadTimeout) flatMap { size => + if (size > metadataReadRowNumberSafetyThreshold) { + Future.successful(MetadataLookupFailedTooLargeResponse(query, size)) + } else { + queryMetadata(query) + } + } recoverWith { + case _: SQLTimeoutException => Future.successful(MetadataLookupFailedTimeoutResponse(query)) + case t => Future.successful(MetadataServiceKeyLookupFailed(query, t)) + } + } 
else { + queryMetadata(query) + } + } + private def queryMetadata(query: MetadataQuery): Future[MetadataServiceResponse] = queryMetadataEvents(query, metadataReadTimeout) map { m => MetadataLookupResponse(query, m) } recover { + case _: SQLTimeoutException => MetadataLookupFailedTimeoutResponse(query) case t => MetadataServiceKeyLookupFailed(query, t) } - } private def getStatus(id: WorkflowId): Future[MetadataServiceResponse] = { diff --git a/services/src/main/scala/cromwell/services/metadata/impl/builder/MetadataBuilderActor.scala b/services/src/main/scala/cromwell/services/metadata/impl/builder/MetadataBuilderActor.scala index 6b7c50c92dd..ba93756bceb 100644 --- a/services/src/main/scala/cromwell/services/metadata/impl/builder/MetadataBuilderActor.scala +++ b/services/src/main/scala/cromwell/services/metadata/impl/builder/MetadataBuilderActor.scala @@ -43,8 +43,8 @@ object MetadataBuilderActor { def isComplete = subWorkflowsMetadata.size == waitFor } - def props(readMetadataWorkerMaker: () => Props, isForSubworkflows: Boolean = false) = { - Props(new MetadataBuilderActor(readMetadataWorkerMaker, isForSubworkflows)) + def props(readMetadataWorkerMaker: () => Props, metadataReadRowNumberSafetyThreshold: Int, isForSubworkflows: Boolean = false) = { + Props(new MetadataBuilderActor(readMetadataWorkerMaker, metadataReadRowNumberSafetyThreshold, isForSubworkflows)) } val log = LoggerFactory.getLogger("MetadataBuilder") @@ -241,7 +241,7 @@ object MetadataBuilderActor { } } -class MetadataBuilderActor(readMetadataWorkerMaker: () => Props, isForSubworkflows: Boolean) +class MetadataBuilderActor(readMetadataWorkerMaker: () => Props, metadataReadRowNumberSafetyThreshold: Int, isForSubworkflows: Boolean) extends LoggingFSM[MetadataBuilderActorState, MetadataBuilderActorData] with DefaultJsonProtocol { import MetadataBuilderActor._ @@ -278,6 +278,14 @@ class MetadataBuilderActor(readMetadataWorkerMaker: () => Props, isForSubworkflo allDone() case 
Event(MetadataLookupResponse(query, metadata), HasWorkData(target, originalRequest)) => processMetadataResponse(query, metadata, target, originalRequest) + case Event(MetadataLookupFailedTooLargeResponse(query, metadataSizeRows), HasWorkData(target, originalRequest)) => + val metadataTooLargeNumberOfRowsException = new MetadataTooLargeNumberOfRowsException(query.workflowId, metadataSizeRows, metadataReadRowNumberSafetyThreshold) + target ! FailedMetadataJsonResponse(originalRequest, metadataTooLargeNumberOfRowsException) + allDone() + case Event(MetadataLookupFailedTimeoutResponse(query), HasWorkData(target, originalRequest)) => + val metadataTooLargeTimeoutException = new MetadataTooLargeTimeoutException(query.workflowId) + target ! FailedMetadataJsonResponse(originalRequest, metadataTooLargeTimeoutException) + allDone() case Event(failure: MetadataServiceFailure, HasWorkData(target, originalRequest)) => target ! FailedMetadataJsonResponse(originalRequest, failure.reason) allDone() @@ -310,7 +318,7 @@ class MetadataBuilderActor(readMetadataWorkerMaker: () => Props, isForSubworkflo def processSubWorkflowMetadata(metadataResponse: MetadataJsonResponse, data: HasReceivedEventsData) = { metadataResponse match { - case SuccessfulMetadataJsonResponse(GetMetadataAction(queryKey, _), js) => + case SuccessfulMetadataJsonResponse(GetMetadataAction(queryKey, _, _), js) => val subId: WorkflowId = queryKey.workflowId val newData = data.withSubWorkflow(subId.toString, js) @@ -359,8 +367,8 @@ class MetadataBuilderActor(readMetadataWorkerMaker: () => Props, isForSubworkflo else { // Otherwise spin up a metadata builder actor for each sub workflow subWorkflowIds foreach { subId => - val subMetadataBuilder = context.actorOf(MetadataBuilderActor.props(readMetadataWorkerMaker, isForSubworkflows = true), uniqueActorName(subId)) - subMetadataBuilder ! 
GetMetadataAction(query.copy(workflowId = WorkflowId.fromString(subId))) + val subMetadataBuilder = context.actorOf(MetadataBuilderActor.props(readMetadataWorkerMaker, metadataReadRowNumberSafetyThreshold, isForSubworkflows = true), uniqueActorName(subId)) + subMetadataBuilder ! GetMetadataAction(query.copy(workflowId = WorkflowId.fromString(subId)), checkTotalMetadataRowNumberBeforeQuerying = false) } goto(WaitingForSubWorkflows) using HasReceivedEventsData(target, originalRequest, query, eventsList, Map.empty, subWorkflowIds.size) } diff --git a/services/src/main/scala/cromwell/services/metadata/package.scala b/services/src/main/scala/cromwell/services/metadata/package.scala index a4fef091f5a..e3a445e6723 100644 --- a/services/src/main/scala/cromwell/services/metadata/package.scala +++ b/services/src/main/scala/cromwell/services/metadata/package.scala @@ -1,5 +1,6 @@ package cromwell.services +import cromwell.core.WorkflowId import cromwell.services.metadata.MetadataService.{BuildMetadataJsonAction, MetadataServiceResponse} import spray.json.JsObject @@ -10,3 +11,16 @@ package object metadata { sealed trait MetadataJsonResponse extends MetadataServiceResponse { def originalRequest: BuildMetadataJsonAction } final case class SuccessfulMetadataJsonResponse(originalRequest: BuildMetadataJsonAction, responseJson: JsObject) extends MetadataJsonResponse final case class FailedMetadataJsonResponse(originalRequest: BuildMetadataJsonAction, reason: Throwable) extends MetadataJsonResponse + +class MetadataTooLargeException(message: String) extends RuntimeException(message) + +final class MetadataTooLargeNumberOfRowsException(workflowId: WorkflowId, metadataSizeRows: Int, metadataLimitRows: Int) + extends MetadataTooLargeException(s"Metadata for workflow $workflowId exists in " + + s"database, but cannot be served.
This is done in order to avoid Cromwell failure: metadata is too large - " + + s"$metadataSizeRows rows, and may cause Cromwell instance to die on attempt to read it in memory. Configured " + + s"metadata safety limit is $metadataLimitRows.") + +final class MetadataTooLargeTimeoutException(workflowId: WorkflowId) + extends MetadataTooLargeException(s"Metadata for workflow $workflowId exists in " + + s"database, but cannot be served. This is done in order to avoid Cromwell failure: metadata is probably too " + + s"large - timeout occurred on attempt to fetch it from the database.") diff --git a/services/src/test/scala/cromwell/services/database/MetadataSlickDatabaseSpec.scala b/services/src/test/scala/cromwell/services/database/MetadataSlickDatabaseSpec.scala index a0cef43e33d..db404c1c9a1 100644 --- a/services/src/test/scala/cromwell/services/database/MetadataSlickDatabaseSpec.scala +++ b/services/src/test/scala/cromwell/services/database/MetadataSlickDatabaseSpec.scala @@ -4,13 +4,18 @@ import java.time.OffsetDateTime import com.dimafeng.testcontainers.Container import cromwell.core.Tags.DbmsTest +import cromwell.core.{WorkflowId, WorkflowMetadataKeys} +import cromwell.database.migration.metadata.table.symbol.MetadataStatement._ +import cromwell.database.slick.MetadataSlickDatabase +import cromwell.database.slick.MetadataSlickDatabase.SummarizationPartitionedMetadata import cromwell.database.sql.tables.{MetadataEntry, WorkflowMetadataSummaryEntry} +import cromwell.services.metadata.CallMetadataKeys import org.scalatest.concurrent.PatienceConfiguration.Timeout import org.scalatest.concurrent.ScalaFutures import org.scalatest.{FlatSpec, Matchers} -import scala.concurrent.duration._ import scala.concurrent.ExecutionContext +import scala.concurrent.duration._ class MetadataSlickDatabaseSpec extends FlatSpec with Matchers with ScalaFutures { @@ -23,9 +28,8 @@ class MetadataSlickDatabaseSpec extends FlatSpec with Matchers with ScalaFutures val containerOpt:
Option[Container] = DatabaseTestKit.getDatabaseTestContainer(databaseSystem) lazy val database = DatabaseTestKit.initializeDatabaseByContainerOptTypeAndSystem(containerOpt, MetadataDatabaseType, databaseSystem) - import database.dataAccess.driver.api._ - import cromwell.database.migration.metadata.table.symbol.MetadataStatement.OffsetDateTimeToSystemTimestamp + import database.dataAccess.driver.api._ val now = OffsetDateTime.now().toSystemTimestamp it should "start container if required" taggedAs DbmsTest in { @@ -95,4 +99,118 @@ class MetadataSlickDatabaseSpec extends FlatSpec with Matchers with ScalaFutures } } + + behavior of "MetadataSlickDatabase" + it should "partition metadata for summarization correctly" in { + + def partition(metadata: Seq[MetadataEntry]): SummarizationPartitionedMetadata = { + MetadataSlickDatabase.partitionSummarizationMetadata( + rawMetadataEntries = metadata, + startMetadataKey = WorkflowMetadataKeys.StartTime, + endMetadataKey = WorkflowMetadataKeys.EndTime, + nameMetadataKey = WorkflowMetadataKeys.Name, + statusMetadataKey = WorkflowMetadataKeys.Status, + submissionMetadataKey = WorkflowMetadataKeys.SubmissionTime, + parentWorkflowIdKey = WorkflowMetadataKeys.ParentWorkflowId, + rootWorkflowIdKey = WorkflowMetadataKeys.RootWorkflowId, + labelMetadataKey = WorkflowMetadataKeys.Labels) + } + + { + // Edge condition: empty input + val partitioned = partition(List.empty) + partitioned.nonSummarizableMetadata shouldBe empty + partitioned.summarizableMetadata shouldBe empty + } + + { + // A mix of summarizable and non-summarizable keys specified at workflow and call levels. 
+ val wfid = WorkflowId.randomId().id.toString + val callName = "my.call" + + def callEntry(key: String): MetadataEntry = + MetadataEntry(wfid, Option(callName), None, Option(1), key, None, None, OffsetDateTime.now().toSystemTimestamp) + + def workflowEntry(key: String): MetadataEntry = + MetadataEntry(wfid, None, None, None, key, None, None, OffsetDateTime.now().toSystemTimestamp) + + val rightKeysCallLevel = List( + callEntry(WorkflowMetadataKeys.StartTime), + callEntry(WorkflowMetadataKeys.EndTime), + callEntry(WorkflowMetadataKeys.Name), + callEntry(WorkflowMetadataKeys.Status), + callEntry(WorkflowMetadataKeys.SubmissionTime), + callEntry(WorkflowMetadataKeys.ParentWorkflowId), + callEntry(WorkflowMetadataKeys.RootWorkflowId), + callEntry(WorkflowMetadataKeys.Labels + ":arbitrary-label") + ) + + val wrongKeysCallLevel = List( + callEntry("complete"), + callEntry("rubbish") + ) + + val thingsThatLookKindOfLikeTheRightWorkflowKeysButActuallyAreNotAndAreCallScopedAnyway = List( + callEntry(CallMetadataKeys.Inputs + ":" + WorkflowMetadataKeys.StartTime), + callEntry(CallMetadataKeys.Inputs + ":" + WorkflowMetadataKeys.EndTime), + callEntry(CallMetadataKeys.Inputs + ":" + WorkflowMetadataKeys.Name), + callEntry(CallMetadataKeys.Inputs + ":" + WorkflowMetadataKeys.Status), + callEntry(CallMetadataKeys.Inputs + ":" + WorkflowMetadataKeys.SubmissionTime), + callEntry(CallMetadataKeys.Inputs + ":" + WorkflowMetadataKeys.ParentWorkflowId), + callEntry(CallMetadataKeys.Inputs + ":" + WorkflowMetadataKeys.RootWorkflowId), + callEntry(CallMetadataKeys.Inputs + ":" + WorkflowMetadataKeys.Labels + ":arbitrary-label"), + callEntry(CallMetadataKeys.Outputs + ":" + WorkflowMetadataKeys.StartTime), + callEntry(CallMetadataKeys.Outputs + ":" + WorkflowMetadataKeys.EndTime), + callEntry(CallMetadataKeys.Outputs + ":" + WorkflowMetadataKeys.Name), + callEntry(CallMetadataKeys.Outputs + ":" + WorkflowMetadataKeys.Status), + callEntry(CallMetadataKeys.Outputs + ":" + 
WorkflowMetadataKeys.SubmissionTime), + callEntry(CallMetadataKeys.Outputs + ":" + WorkflowMetadataKeys.ParentWorkflowId), + callEntry(CallMetadataKeys.Outputs + ":" + WorkflowMetadataKeys.RootWorkflowId), + callEntry(CallMetadataKeys.Outputs + ":" + WorkflowMetadataKeys.Labels + ":arbitrary-label") + ) + + val rightKeysWorkflowLevel = List( + workflowEntry(WorkflowMetadataKeys.StartTime), + workflowEntry(WorkflowMetadataKeys.EndTime), + workflowEntry(WorkflowMetadataKeys.Name), + workflowEntry(WorkflowMetadataKeys.Status), + workflowEntry(WorkflowMetadataKeys.SubmissionTime), + workflowEntry(WorkflowMetadataKeys.ParentWorkflowId), + workflowEntry(WorkflowMetadataKeys.RootWorkflowId), + workflowEntry(WorkflowMetadataKeys.Labels + ":arbitrary-label") + ) + + val wrongKeysWorkflowLevel = List( + workflowEntry("total"), + workflowEntry("garbage") + ) + + val thingsThatLookKindOfLikeTheRightWorkflowKeysButActuallyAreNot = List( + workflowEntry(WorkflowMetadataKeys.Inputs + ":" + WorkflowMetadataKeys.StartTime), + workflowEntry(WorkflowMetadataKeys.Inputs + ":" + WorkflowMetadataKeys.EndTime), + workflowEntry(WorkflowMetadataKeys.Inputs + ":" + WorkflowMetadataKeys.Name), + workflowEntry(WorkflowMetadataKeys.Inputs + ":" + WorkflowMetadataKeys.Status), + workflowEntry(WorkflowMetadataKeys.Inputs + ":" + WorkflowMetadataKeys.SubmissionTime), + workflowEntry(WorkflowMetadataKeys.Inputs + ":" + WorkflowMetadataKeys.ParentWorkflowId), + workflowEntry(WorkflowMetadataKeys.Inputs + ":" + WorkflowMetadataKeys.RootWorkflowId), + workflowEntry(WorkflowMetadataKeys.Inputs + ":" + WorkflowMetadataKeys.Labels + ":arbitrary-label"), + workflowEntry(WorkflowMetadataKeys.Outputs + ":" + WorkflowMetadataKeys.StartTime), + workflowEntry(WorkflowMetadataKeys.Outputs + ":" + WorkflowMetadataKeys.EndTime), + workflowEntry(WorkflowMetadataKeys.Outputs + ":" + WorkflowMetadataKeys.Name), + workflowEntry(WorkflowMetadataKeys.Outputs + ":" + WorkflowMetadataKeys.Status), + 
workflowEntry(WorkflowMetadataKeys.Outputs + ":" + WorkflowMetadataKeys.SubmissionTime), + workflowEntry(WorkflowMetadataKeys.Outputs + ":" + WorkflowMetadataKeys.ParentWorkflowId), + workflowEntry(WorkflowMetadataKeys.Outputs + ":" + WorkflowMetadataKeys.RootWorkflowId), + workflowEntry(WorkflowMetadataKeys.Outputs + ":" + WorkflowMetadataKeys.Labels + ":arbitrary-label") + ) + + val allTheWrongThings = rightKeysCallLevel ++ wrongKeysCallLevel ++ wrongKeysWorkflowLevel ++ + thingsThatLookKindOfLikeTheRightWorkflowKeysButActuallyAreNot ++ + thingsThatLookKindOfLikeTheRightWorkflowKeysButActuallyAreNotAndAreCallScopedAnyway + + val partitioned = partition(rightKeysWorkflowLevel ++ allTheWrongThings) + partitioned.nonSummarizableMetadata.toSet shouldBe (allTheWrongThings).toSet + partitioned.summarizableMetadata shouldBe rightKeysWorkflowLevel + } + } } diff --git a/services/src/test/scala/cromwell/services/metadata/impl/WriteMetadataActorSpec.scala b/services/src/test/scala/cromwell/services/metadata/impl/WriteMetadataActorSpec.scala index 5fa5abc8ea4..ed6e48f5c49 100644 --- a/services/src/test/scala/cromwell/services/metadata/impl/WriteMetadataActorSpec.scala +++ b/services/src/test/scala/cromwell/services/metadata/impl/WriteMetadataActorSpec.scala @@ -130,7 +130,15 @@ class WriteMetadataActorSpec extends TestKitSuite with FlatSpecLike with Matcher var requestsSinceLastSuccess = 0 // Return successful - override def addMetadataEntries(metadataEntries: Iterable[MetadataEntry]) + override def addMetadataEntries(metadataEntries: Iterable[MetadataEntry], + startMetadataKey: String, + endMetadataKey: String, + nameMetadataKey: String, + statusMetadataKey: String, + submissionMetadataKey: String, + parentWorkflowIdKey: String, + rootWorkflowIdKey: String, + labelMetadataKey: String) (implicit ec: ExecutionContext): Future[Unit] = { if (requestsSinceLastSuccess == failuresBetweenEachSuccess) { requestsSinceLastSuccess = 0 @@ -175,14 +183,7 @@ class WriteMetadataActorSpec 
extends TestKitSuite with FlatSpecLike with Matcher timeout: Duration) (implicit ec: ExecutionContext): Nothing = notImplemented() - override def summarizeIncreasing(startMetadataKey: String, - endMetadataKey: String, - nameMetadataKey: String, - statusMetadataKey: String, - submissionMetadataKey: String, - parentWorkflowIdKey: String, - rootWorkflowIdKey: String, - labelMetadataKey: String, + override def summarizeIncreasing(labelMetadataKey: String, limit: Int, buildUpdatedSummary: (Option[WorkflowMetadataSummaryEntry], Seq[MetadataEntry]) @@ -197,13 +198,6 @@ class WriteMetadataActorSpec extends TestKitSuite with FlatSpecLike with Matcher */ override def summarizeDecreasing(summaryNameDecreasing: String, summaryNameIncreasing: String, - startMetadataKey: String, - endMetadataKey: String, - nameMetadataKey: String, - statusMetadataKey: String, - submissionMetadataKey: String, - parentWorkflowIdKey: String, - rootWorkflowIdKey: String, labelMetadataKey: String, limit: Int, buildUpdatedSummary: @@ -289,6 +283,10 @@ class WriteMetadataActorSpec extends TestKitSuite with FlatSpecLike with Matcher override def getSummaryQueueSize()(implicit ec: ExecutionContext): Future[Int] = { notImplemented() } + + override def getMetadataTotalRowNumberByRootWorkflowId(rootWorkflowId: String, timeout: Duration)(implicit ec: ExecutionContext): Future[Int] = { + notImplemented() + } } } diff --git a/src/ci/bin/test.inc.sh b/src/ci/bin/test.inc.sh index e13d9dc7adf..28f525c7d39 100644 --- a/src/ci/bin/test.inc.sh +++ b/src/ci/bin/test.inc.sh @@ -113,11 +113,21 @@ cromwell::private::create_build_variables() { fi local git_commit_message - git_commit_message="$(git log --format=%B --max-count=1 HEAD 2>/dev/null || true)" + # The commit message to analyze should be the last one in the commit range. 
+ # This works for both pull_request and push builds, unlike using 'git log HEAD' which gives a merge commit message + # on pull requests: + git_commit_message="$(git log --reverse ${TRAVIS_COMMIT_RANGE} | tail -n1 2>/dev/null || true)" + echo "Building for git commit message: ${git_commit_message}" + if [[ "${git_commit_message}" == *"[force ci]"* ]]; then CROMWELL_BUILD_FORCE_TESTS=true - else + CROMWELL_BUILD_MINIMAL_TESTS=false + elif [[ "${git_commit_message}" == *"[minimal ci]"* ]]; then CROMWELL_BUILD_FORCE_TESTS=false + CROMWELL_BUILD_MINIMAL_TESTS=true + else + CROMWELL_BUILD_FORCE_TESTS=false + CROMWELL_BUILD_MINIMAL_TESTS=false fi local git_revision @@ -132,7 +142,7 @@ cromwell::private::create_build_variables() { # branch. So, in case of push builds `git diff` will always return empty result. This is why we only use this short # circuiting logic for pull request builds cromwell::private::set_variable_if_only_some_files_changed "^mkdocs.yml|^docs/" "CROMWELL_BUILD_ONLY_DOCS_CHANGED" - cromwell::private::set_variable_if_only_some_files_changed "^scripts/" "CROMWELL_BUILD_ONLY_SCRIPTS_CHANGED" + cromwell::private::set_variable_if_only_some_files_changed "^src/ci/bin/testMetadataComparisonPython.sh|^scripts/" "CROMWELL_BUILD_ONLY_SCRIPTS_CHANGED" case "${CROMWELL_BUILD_PROVIDER}" in "${CROMWELL_BUILD_PROVIDER_TRAVIS}") @@ -157,7 +167,11 @@ cromwell::private::create_build_variables() { elif [[ "${CROMWELL_BUILD_ONLY_DOCS_CHANGED}" == "true" ]] && \ [[ "${BUILD_TYPE}" != "checkPublish" ]]; then CROMWELL_BUILD_RUN_TESTS=false - elif [[ "${CROMWELL_BUILD_ONLY_SCRIPTS_CHANGED}" == "true" ]]; then + elif [[ "${CROMWELL_BUILD_MINIMAL_TESTS}" == "true" ]] && \ + [[ "${TRAVIS_EVENT_TYPE}" != "push" ]]; then + CROMWELL_BUILD_RUN_TESTS=false + elif [[ "${CROMWELL_BUILD_ONLY_SCRIPTS_CHANGED}" == "true" ]] && \ + [[ "${BUILD_TYPE}" != "metadataComparisonPython" ]]; then CROMWELL_BUILD_RUN_TESTS=false elif [[ "${TRAVIS_EVENT_TYPE}" == "push" ]] && \ [[ "${BUILD_TYPE}" != 
"sbt" ]]; then diff --git a/src/ci/bin/testMetadataComparisonPython.sh b/src/ci/bin/testMetadataComparisonPython.sh new file mode 100755 index 00000000000..b6666a0a623 --- /dev/null +++ b/src/ci/bin/testMetadataComparisonPython.sh @@ -0,0 +1,34 @@ +#!/usr/bin/env bash + +set -o errexit -o nounset -o pipefail + +export CROMWELL_BUILD_REQUIRES_SECURE=true + +# import in shellcheck / CI / IntelliJ compatible ways +# shellcheck source=/dev/null +source "${BASH_SOURCE%/*}/test.inc.sh" || source test.inc.sh + +cromwell::build::setup_common_environment + +# Our Python scripts can only be run by Python version >= 3.6, because we use string interpolation which was introduced +# in Python 3.6. But Travis environment is Ubuntu 16.04 Xenial LTS, which officially supports only Python <= 3.5. +# Thus we have to run Python tests in Docker container. +GOOGLE_CENTAUR_SERVICE_ACCOUNT_JSON="cromwell-centaur-service-account.json" +export GOOGLE_APPLICATION_CREDENTIALS="${CROMWELL_BUILD_RESOURCES_DIRECTORY}/${GOOGLE_CENTAUR_SERVICE_ACCOUNT_JSON}" +export DIGESTER_TEST_GCS=true + +docker run -it --rm \ + -e GOOGLE_APPLICATION_CREDENTIALS \ + -e DIGESTER_TEST_GCS \ + -v "${CROMWELL_BUILD_RESOURCES_DIRECTORY}:${CROMWELL_BUILD_RESOURCES_DIRECTORY}" \ + -v "${CROMWELL_BUILD_ROOT_DIRECTORY}/scripts/metadata_comparison:/metadata_comparison" \ + python:3 /bin/bash -c " + +pip install --upgrade requests +pip install --upgrade google-api-python-client +pip install --upgrade google-cloud +pip install --upgrade google-cloud-storage +pip install --upgrade pandas +pip install --upgrade gitpython +python -m unittest discover -v /metadata_comparison +" diff --git a/supportedBackends/aws/src/main/scala/cromwell/backend/impl/aws/AwsBatchAsyncBackendJobExecutionActor.scala b/supportedBackends/aws/src/main/scala/cromwell/backend/impl/aws/AwsBatchAsyncBackendJobExecutionActor.scala index 0a911d963f5..494dd584a5f 100755 --- 
a/supportedBackends/aws/src/main/scala/cromwell/backend/impl/aws/AwsBatchAsyncBackendJobExecutionActor.scala +++ b/supportedBackends/aws/src/main/scala/cromwell/backend/impl/aws/AwsBatchAsyncBackendJobExecutionActor.scala @@ -32,6 +32,7 @@ package cromwell.backend.impl.aws import java.net.SocketTimeoutException +import java.io.FileNotFoundException import akka.actor.ActorRef import akka.pattern.AskSupport @@ -46,9 +47,10 @@ import cromwell.backend.impl.aws.OccasionalStatusPollingActor.{NotifyOfStatus, W import cromwell.backend.impl.aws.RunStatus.{Initializing, TerminalRunStatus} import cromwell.backend.impl.aws.io._ import cromwell.backend.io.DirectoryFunctions +import cromwell.backend.io.JobPaths import cromwell.backend.standard.{StandardAsyncExecutionActor, StandardAsyncExecutionActorParams, StandardAsyncJob} import cromwell.core._ -import cromwell.core.path.{DefaultPathBuilder, Path} +import cromwell.core.path.{DefaultPathBuilder, Path, PathFactory,PathBuilder} import cromwell.core.io.DefaultIoCommandBuilder import cromwell.core.retry.SimpleExponentialBackoff import cromwell.filesystems.s3.S3Path @@ -460,9 +462,28 @@ class AwsBatchAsyncBackendJobExecutionActor(override val standardParams: Standar case unknown => throw new RuntimeException(s"Attempt to get terminal metadata from non terminal status: $unknown") } } + def hostAbsoluteFilePath(jobPaths: JobPaths, pathString: String): Path = { + + val pathBuilders:List[PathBuilder] = List(DefaultPathBuilder) + val path = PathFactory.buildPath(pathString, pathBuilders) + if (!path.isAbsolute) + jobPaths.callExecutionRoot.resolve(path).toAbsolutePath + else if(jobPaths.isInExecution(path.pathAsString)) + jobPaths.hostPathFromContainerPath(path.pathAsString) + else + jobPaths.hostPathFromContainerInputs(path.pathAsString) + } override def mapOutputWomFile(womFile: WomFile): WomFile = { - womFileToPath(generateAwsBatchOutputs(jobDescriptor))(womFile) + val wfile = configuration.fileSystem match { + case 
AWSBatchStorageSystems.s3 => + womFile + case _ => + val hostPath = hostAbsoluteFilePath(jobPaths, womFile.valueString) + if (!hostPath.exists) throw new FileNotFoundException(s"Could not process output, file not found: ${hostPath.pathAsString}") + womFile mapFile { _ => hostPath.pathAsString } + } + womFileToPath(generateAwsBatchOutputs(jobDescriptor))(wfile) } private[aws] def womFileToPath(outputs: Set[AwsBatchFileOutput])(womFile: WomFile): WomFile = { diff --git a/supportedBackends/google/pipelines/common/src/main/scala/cromwell/backend/google/pipelines/common/PipelinesApiJobPaths.scala b/supportedBackends/google/pipelines/common/src/main/scala/cromwell/backend/google/pipelines/common/PipelinesApiJobPaths.scala index f6a4d5d491b..a9c359bc972 100644 --- a/supportedBackends/google/pipelines/common/src/main/scala/cromwell/backend/google/pipelines/common/PipelinesApiJobPaths.scala +++ b/supportedBackends/google/pipelines/common/src/main/scala/cromwell/backend/google/pipelines/common/PipelinesApiJobPaths.scala @@ -14,7 +14,10 @@ object PipelinesApiJobPaths { val GcsDelocalizationScriptName = "gcs_delocalization.sh" } -final case class PipelinesApiJobPaths(override val workflowPaths: PipelinesApiWorkflowPaths, jobKey: BackendJobDescriptorKey, override val isCallCacheCopyAttempt: Boolean = false) extends JobPaths { +// Non-`final` as this is mocked for testing since using a real instance proved too difficult. 
+// Do not subclass this or other case classes in production code, at least without understanding the pitfalls: +// https://nrinaudo.github.io/scala-best-practices/tricky_behaviours/final_case_classes.html +case class PipelinesApiJobPaths(override val workflowPaths: PipelinesApiWorkflowPaths, jobKey: BackendJobDescriptorKey, override val isCallCacheCopyAttempt: Boolean = false) extends JobPaths { // `jesLogBasename` is a `def` rather than a `val` because it is referenced polymorphically from // the initialization code of the extended `JobPaths` trait, but this class will not have initialized its `val`s diff --git a/supportedBackends/google/pipelines/common/src/test/scala/cromwell/backend/google/pipelines/common/callcaching/PipelinesApiBackendCacheHitCopyingActorSpec.scala b/supportedBackends/google/pipelines/common/src/test/scala/cromwell/backend/google/pipelines/common/callcaching/PipelinesApiBackendCacheHitCopyingActorSpec.scala new file mode 100644 index 00000000000..4311814260b --- /dev/null +++ b/supportedBackends/google/pipelines/common/src/test/scala/cromwell/backend/google/pipelines/common/callcaching/PipelinesApiBackendCacheHitCopyingActorSpec.scala @@ -0,0 +1,544 @@ +package cromwell.backend.google.pipelines.common.callcaching + +import akka.event.NoLogging +import akka.testkit.{ImplicitSender, TestFSMRef, TestProbe} +import com.typesafe.config.ConfigFactory +import cromwell.backend.BackendCacheHitCopyingActor.{CopyOutputsCommand, CopyingOutputsFailedResponse} +import cromwell.backend.BackendJobExecutionActor.JobSucceededResponse +import cromwell.backend.google.pipelines.common._ +import cromwell.backend.io.JobPaths +import cromwell.backend.standard.StandardValidatedRuntimeAttributesBuilder +import cromwell.backend.standard.callcaching.CopyingActorBlacklistCacheSupport.HasFormatting +import cromwell.backend.standard.callcaching.StandardCacheHitCopyingActor._ +import cromwell.backend.standard.callcaching._ +import 
cromwell.backend.validation.ValidatedRuntimeAttributes +import cromwell.backend.{BackendJobDescriptor, BackendJobDescriptorKey, BackendWorkflowDescriptor} +import cromwell.core._ +import cromwell.core.callcaching.DockerWithHash +import cromwell.core.io.DefaultIoCommand.DefaultIoCopyCommand +import cromwell.core.io.{IoFailure, IoReadForbiddenFailure, IoSuccess} +import cromwell.core.path.Path +import cromwell.services.CallCaching.CallCachingEntryId +import cromwell.services.instrumentation.CromwellCount +import cromwell.services.instrumentation.InstrumentationService.InstrumentationServiceMessage +import eu.timepit.refined.numeric.Positive +import eu.timepit.refined.refineMV +import org.scalatest.concurrent.Eventually +import org.scalatest.{FlatSpecLike, Matchers} +import org.slf4j.Logger +import org.specs2.mock.Mockito +import wom.callable.CommandTaskDefinition +import wom.graph.{CommandCallNode, FullyQualifiedName, LocalName, WomIdentifier} +import wom.values.WomValue + +import scala.concurrent.duration._ +import scala.language.postfixOps +import scala.util.{Success, Try} + + +class PipelinesApiBackendCacheHitCopyingActorSpec extends TestKitSuite("PipelinesApiBackendCacheHitCopyingActor") + with FlatSpecLike with Matchers with ImplicitSender with Mockito with Eventually { + + behavior of "PipelinesApiBackendCacheHitCopyingActor" + + private val TaskCall = "bar" + private val LockedDownBucket = "locked-down-bucket" + private val WideOpenBucket = "wide-open-bucket" + private val GoogleProject = "cache_as_cache_can" + + it should "do all the right things with blacklisting hits and buckets with groupings enabled" in { + + val configString = + """ + |call-caching { + | enabled: true + | blacklist-cache { + | enabled: true + | + | groupings { + | workflow-option: google_project + | } + | } + |} + |""".stripMargin + + val blacklistManager = new CallCachingBlacklistManager(ConfigFactory.parseString(configString), NoLogging) + val grouping = Option(GoogleProject) + val 
workflow = buildWorkflow(grouping) + val blacklistCache = blacklistManager.blacklistCacheFor(workflow).get + + // Make sure we got the expected type of cache + blacklistCache match { + case _: GroupingBlacklistCache => + case bad => fail(s"Unexpected blacklist cache type, expected GroupingBlacklistCache: ${bad.getClass.getSimpleName}") + } + + { + // Step 0: a successful copy attempt. There's a lot of darkness ahead so begin with a bit of light. + val ioActor = TestProbe() + val serviceRegistryActor = TestProbe() + val supervisor = TestProbe() + val copyActor = buildCopyActor( + workflow = workflow, + blacklistCache = blacklistCache, + fakeIoActor = ioActor, + fakeServiceRegistryActor = serviceRegistryActor, + supervisor = supervisor, + grouping = grouping) + + val copyCommand = buildCopyCommand(hitId = 0, bucket = WideOpenBucket) + supervisor watch copyActor + + copyActor ! copyCommand + + eventually { + copyActor.underlyingActor.stateName shouldBe WaitingForIoResponses + } + + ioActor.expectMsgPF(5 seconds) { + case ioCommand: DefaultIoCopyCommand => + ioActor.reply(IoSuccess(ioCommand, ())) + } + + supervisor.expectMsgPF(5 seconds) { case _: JobSucceededResponse => } + + val counts = instrumentationCounts(n = 4, serviceRegistryActor = serviceRegistryActor) + val (List(hitBegin, hitEnd), List(bucketBegin, bucketEnd)) = counts partition { + _.bucket.path.toList.contains("hit") + } + + hitBegin.bucket.path.toList shouldBe expectedMetric(Hit, Read, grouping = GoogleProject, value = "0", UntestedCacheResult) + bucketBegin.bucket.path.toList shouldBe expectedMetric(Bucket, Read, grouping = GoogleProject, value = WideOpenBucket, UntestedCacheResult) + + hitEnd.bucket.path.toList shouldBe expectedMetric(Hit, Write, grouping = GoogleProject, value = "0", GoodCacheResult) + bucketEnd.bucket.path.toList shouldBe expectedMetric(Bucket, Write, grouping = GoogleProject, value = WideOpenBucket, GoodCacheResult) + + blacklistCache.bucketCache.size() shouldBe 1 + 
blacklistCache.bucketCache.get(WideOpenBucket) shouldBe GoodCacheResult + + blacklistCache.hitCache.size() shouldBe 1 + blacklistCache.hitCache.get(CallCachingEntryId(0)) shouldBe GoodCacheResult + } + + { + // Step 1: an attempt to read a hit of unknown status from a bucket of unknown status, but the IoActor will report + // a forbidden (403) failure which should cause hit and bucket blacklisting. + val ioActor = TestProbe() + val serviceRegistryActor = TestProbe() + val supervisor = TestProbe() + + val copyActor = buildCopyActor( + workflow = workflow, + blacklistCache = blacklistCache, + fakeIoActor = ioActor, + fakeServiceRegistryActor = serviceRegistryActor, + supervisor = supervisor, + grouping = grouping) + + val command = buildCopyCommand(hitId = 1, bucket = LockedDownBucket) + supervisor watch copyActor + + copyActor ! command + + eventually { + copyActor.underlyingActor.stateName shouldBe WaitingForIoResponses + } + + ioActor.expectMsgPF(5 seconds) { + case ioCommand: DefaultIoCopyCommand => + val failedPath = command.jobDetritusFiles(JobPaths.ReturnCodePathKey) + ioActor.reply(IoReadForbiddenFailure(ioCommand, new RuntimeException(), failedPath)) + } + + supervisor.expectMsgPF(5 seconds) { case _: CopyingOutputsFailedResponse => } + + val counts = instrumentationCounts(n = 4, serviceRegistryActor = serviceRegistryActor) + + // Expect read hit and read bucket UntestedCacheResult followed by write hit and write bucket BadCacheResult. 
+ { + val (List(hitBegin, hitEnd), List(bucketBegin, bucketEnd)) = counts partition { + _.bucket.path.toList.contains("hit") + } + + hitBegin.bucket.path.toList shouldBe expectedMetric(Hit, Read, grouping = GoogleProject, value = "1", UntestedCacheResult) + bucketBegin.bucket.path.toList shouldBe expectedMetric(Bucket, Read, grouping = GoogleProject, value = LockedDownBucket, UntestedCacheResult) + + hitEnd.bucket.path.toList shouldBe expectedMetric(Hit, Write, grouping = GoogleProject, value = "1", BadCacheResult) + bucketEnd.bucket.path.toList shouldBe expectedMetric(Bucket, Write, grouping = GoogleProject, value = LockedDownBucket, BadCacheResult) + } + + // Assert blacklist entries were made for bucket and hit. + blacklistCache.bucketCache.size() shouldBe 2 + blacklistCache.bucketCache.get(WideOpenBucket) shouldBe GoodCacheResult + blacklistCache.bucketCache.get(LockedDownBucket) shouldBe BadCacheResult + + blacklistCache.hitCache.size() shouldBe 2 + blacklistCache.hitCache.get(CallCachingEntryId(0)) shouldBe GoodCacheResult + blacklistCache.hitCache.get(CallCachingEntryId(1)) shouldBe BadCacheResult + + supervisor.expectTerminated(copyActor, 5 seconds) + } + + { + // Step 2: an attempt to read an unknown hit from a blacklisted bucket. + val ioActor = TestProbe() + val serviceRegistryActor = TestProbe() + val supervisor = TestProbe() + + val copyActor = buildCopyActor( + workflow = workflow, + blacklistCache = blacklistCache, + fakeIoActor = ioActor, + fakeServiceRegistryActor = serviceRegistryActor, + supervisor = supervisor, + grouping = grouping) + + supervisor watch copyActor + + val command = buildCopyCommand(hitId = 2, bucket = LockedDownBucket) + copyActor ! command + + supervisor.expectMsgPF(5 seconds) { case _: CopyingOutputsFailedResponse => } + // In this circumstance the copy actor just stops itself without transitioning out of Idle. 
+ supervisor.expectTerminated(copyActor) + // Copying should be short-circuited by the bucket being blacklisted, so no communication with the IoActor. + ioActor.expectNoMessage(max = 5 seconds) + + val List(hitMessage, bucketMessage) = instrumentationCounts(n = 2, serviceRegistryActor = serviceRegistryActor) + + // Hit status is unknown but bucket status is known bad. + hitMessage.bucket.path.toList shouldBe expectedMetric(Hit, Read, grouping = GoogleProject, value = "2", UntestedCacheResult) + bucketMessage.bucket.path.toList shouldBe expectedMetric(Bucket, Read, grouping = GoogleProject, value = LockedDownBucket, BadCacheResult) + + blacklistCache.bucketCache.size() shouldBe 2 + blacklistCache.bucketCache.get(WideOpenBucket) shouldBe GoodCacheResult + blacklistCache.bucketCache.get(LockedDownBucket) shouldBe BadCacheResult + + blacklistCache.hitCache.size() shouldBe 3 + blacklistCache.hitCache.get(CallCachingEntryId(0)) shouldBe GoodCacheResult + blacklistCache.hitCache.get(CallCachingEntryId(1)) shouldBe BadCacheResult + blacklistCache.hitCache.get(CallCachingEntryId(2)) shouldBe UntestedCacheResult + } + + { + // Step 3: a generic failure to read a cache hit from a bucket not known to be bad should cause the hit to be + // marked bad but not its containing bucket. + val ioActor = TestProbe() + val serviceRegistryActor = TestProbe() + val supervisor = TestProbe() + val copyActor = buildCopyActor( + workflow = workflow, + blacklistCache = blacklistCache, + fakeIoActor = ioActor, + fakeServiceRegistryActor = serviceRegistryActor, + supervisor = supervisor, + grouping = grouping) + + supervisor watch copyActor + + val command = buildCopyCommand(hitId = 3, bucket = WideOpenBucket) + copyActor ! 
command + + eventually { + copyActor.underlyingActor.stateName shouldBe WaitingForIoResponses + } + + ioActor.expectMsgPF(5 seconds) { + case ioCommand: DefaultIoCopyCommand => + ioActor.reply(IoFailure(ioCommand, new RuntimeException())) + } + + val List(readHit, readBucket, writeHit) = instrumentationCounts(n = 3, serviceRegistryActor = serviceRegistryActor) + + readHit.bucket.path.toList shouldBe expectedMetric(Hit, Read, grouping = GoogleProject, value = "3", UntestedCacheResult) + readBucket.bucket.path.toList shouldBe expectedMetric(Bucket, Read, grouping = GoogleProject, value = WideOpenBucket, GoodCacheResult) + writeHit.bucket.path.toList shouldBe expectedMetric(Hit, Write, grouping = GoogleProject, value = "3", BadCacheResult) + + // Assert blacklist entries were made for bucket and hit. + blacklistCache.bucketCache.size() shouldBe 2 + // The hit is bad but because the failure was generic the bucket which was marked good should stay that way. + blacklistCache.bucketCache.get(WideOpenBucket) shouldBe GoodCacheResult + blacklistCache.bucketCache.get(LockedDownBucket) shouldBe BadCacheResult + + blacklistCache.hitCache.size() shouldBe 4 + blacklistCache.hitCache.get(CallCachingEntryId(0)) shouldBe GoodCacheResult + blacklistCache.hitCache.get(CallCachingEntryId(1)) shouldBe BadCacheResult + blacklistCache.hitCache.get(CallCachingEntryId(2)) shouldBe UntestedCacheResult + blacklistCache.hitCache.get(CallCachingEntryId(3)) shouldBe BadCacheResult + + supervisor.expectMsgPF(5 seconds) { case _: CopyingOutputsFailedResponse => } + + supervisor.expectTerminated(copyActor, 5 seconds) + } + + val workflow2 = buildWorkflow(grouping) + + { + // Step 4: a new workflow from the same grouping tries to copy the bad cache hit from step 3. 
+ val ioActor = TestProbe() + val serviceRegistryActor = TestProbe() + val supervisor = TestProbe() + val copyActor = buildCopyActor( + workflow = workflow2, + blacklistCache = blacklistCache, + fakeIoActor = ioActor, + fakeServiceRegistryActor = serviceRegistryActor, + supervisor = supervisor, + grouping = grouping) + + supervisor watch copyActor + + val command = buildCopyCommand(hitId = 3, bucket = WideOpenBucket) + copyActor ! command + + supervisor.expectMsgPF(5 seconds) { + case _: CopyingOutputsFailedResponse => + } + // The IoActor should not be consulted and the copying actor should simply stop itself without transitioning. + supervisor.expectTerminated(copyActor) + ioActor.expectNoMessage(5 seconds) + + val List(readHit) = instrumentationCounts(n = 1, serviceRegistryActor = serviceRegistryActor) + + readHit.bucket.path.toList shouldBe expectedMetric(Hit, Read, grouping = GoogleProject, value = "3", BadCacheResult) + + // Assert blacklist entries were made for bucket and hit. + blacklistCache.bucketCache.size() shouldBe 2 + // The hit is bad but because the failure was generic the bucket which was previously marked good should stay that way. + blacklistCache.bucketCache.get(WideOpenBucket) shouldBe GoodCacheResult + blacklistCache.bucketCache.get(LockedDownBucket) shouldBe BadCacheResult + + blacklistCache.hitCache.size() shouldBe 4 + blacklistCache.hitCache.get(CallCachingEntryId(0)) shouldBe GoodCacheResult + blacklistCache.hitCache.get(CallCachingEntryId(1)) shouldBe BadCacheResult + blacklistCache.hitCache.get(CallCachingEntryId(2)) shouldBe UntestedCacheResult + blacklistCache.hitCache.get(CallCachingEntryId(3)) shouldBe BadCacheResult + } + + { + // Step 5: a new workflow from the same grouping tries to copy an unknown cache hit from the bucket blacklisted in step 1. 
+ val ioActor = TestProbe() + val serviceRegistryActor = TestProbe() + val supervisor = TestProbe() + val copyActor = buildCopyActor( + workflow = workflow2, + blacklistCache = blacklistCache, + fakeIoActor = ioActor, + fakeServiceRegistryActor = serviceRegistryActor, + supervisor = supervisor, + grouping = grouping) + + supervisor watch copyActor + + val command = buildCopyCommand(hitId = 4, bucket = LockedDownBucket) + copyActor ! command + + supervisor.expectMsgPF(5 seconds) { + case _: CopyingOutputsFailedResponse => + } + // The IoActor should not be consulted and the copying actor should simply stop itself without transitioning. + supervisor.expectTerminated(copyActor) + ioActor.expectNoMessage(5 seconds) + + val List(readHit, readBucket) = instrumentationCounts(n = 2, serviceRegistryActor = serviceRegistryActor) + + readHit.bucket.path.toList shouldBe expectedMetric(Hit, Read, grouping = GoogleProject, value = "4", UntestedCacheResult) + readBucket.bucket.path.toList shouldBe expectedMetric(Bucket, Read, grouping = GoogleProject, value = LockedDownBucket, BadCacheResult) + + // No new bucket entries should have been made: the copy was short-circuited by the already-blacklisted bucket. + blacklistCache.bucketCache.size() shouldBe 2 + // Hit 4 was never attempted, so both buckets should keep the statuses recorded in the earlier steps.
+ blacklistCache.bucketCache.get(WideOpenBucket) shouldBe GoodCacheResult + blacklistCache.bucketCache.get(LockedDownBucket) shouldBe BadCacheResult + + blacklistCache.hitCache.size() shouldBe 5 + blacklistCache.hitCache.get(CallCachingEntryId(0)) shouldBe GoodCacheResult + blacklistCache.hitCache.get(CallCachingEntryId(1)) shouldBe BadCacheResult + blacklistCache.hitCache.get(CallCachingEntryId(2)) shouldBe UntestedCacheResult + blacklistCache.hitCache.get(CallCachingEntryId(3)) shouldBe BadCacheResult + blacklistCache.hitCache.get(CallCachingEntryId(4)) shouldBe UntestedCacheResult + } + } + + private def instrumentationCounts(n: Int, serviceRegistryActor: TestProbe): List[CromwellCount] = { + val received = serviceRegistryActor.receiveN(n = n, max = 5 seconds).toList + val instrumentationCounts = received collect { case InstrumentationServiceMessage(c) => c } collect { case c: CromwellCount => c } + instrumentationCounts foreach { c => c.value shouldBe 1; c.sampling shouldBe 1.0 } + + instrumentationCounts + } + + type TestFSMRefPipelinesApiBackendCacheHitCopyingActor = TestFSMRef[ + StandardCacheHitCopyingActorState, + Option[StandardCacheHitCopyingActorData], + PipelinesApiBackendCacheHitCopyingActor + ] + + private def buildWorkflow(grouping: Option[String]): HasWorkflowIdAndSources = { + val workflowId = WorkflowId.randomId() + val workflow = new HasWorkflowIdAndSources { + override val sources: WorkflowSourceFilesCollection = { + val collection = mock[WorkflowSourceFilesCollection] + val workflowOptions = grouping match { + case None => WorkflowOptions.empty + case Some(g) => WorkflowOptions.fromMap(Map("google_project" -> g)).get + } + collection.workflowOptions returns workflowOptions + + collection + } + + override def id: WorkflowId = workflowId + } + workflow + } + + private def buildCopyActor(workflow: HasWorkflowIdAndSources, + blacklistCache: BlacklistCache, + fakeIoActor: TestProbe, + fakeServiceRegistryActor: TestProbe, + supervisor: TestProbe, + 
grouping: Option[String]): TestFSMRefPipelinesApiBackendCacheHitCopyingActor = { + // Couldn't mock this, possibly due to the use of `Refined` in two parameters: + // + // Underlying exception : java.lang.IllegalArgumentException: Cannot cast to primitive type: int + // org.mockito.exceptions.base.MockitoException: + // Mockito cannot mock this class: class cromwell.backend.google.pipelines.common.PipelinesApiConfigurationAttributes. + val papiConfigurationAttributes = PipelinesApiConfigurationAttributes( + project = null, + computeServiceAccount = null, + auths = null, + restrictMetadataAccess = false, + enableFuse = false, + executionBucket = null, + endpointUrl = null, + location = null, + maxPollingInterval = 0, + qps = refineMV[Positive](10), + cacheHitDuplicationStrategy = CopyCachedOutputs, + requestWorkers = refineMV[Positive](1), + pipelineTimeout = null, + logFlushPeriod = None, + gcsTransferConfiguration = null, + virtualPrivateCloudConfiguration = None, + batchRequestTimeoutConfiguration = null, + memoryRetryConfiguration = None, + ) + + val papiConfiguration = mock[PipelinesApiConfiguration] + papiConfiguration.papiAttributes returns papiConfigurationAttributes + + val commandTaskDefinition = mock[CommandTaskDefinition] + commandTaskDefinition.outputs returns List.empty + val commandCallNode = CommandCallNode( + identifier = WomIdentifier(LocalName("bar"), FullyQualifiedName("foo.bar")), + callable = commandTaskDefinition, + inputPorts = Set.empty, + inputDefinitionMappings = List.empty, + nonInputBasedPrerequisites = Set.empty, + outputIdentifierCompoundingFunction = null, + sourceLocation = None + ) + + val backendJobDescriptorKey = BackendJobDescriptorKey( + call = commandCallNode, + index = None, + attempt = 1 + ) + + def mapper(jobPaths: PipelinesApiJobPaths, originalPath: String): String = originalPath + + val workflowDescriptor = mock[BackendWorkflowDescriptor] + workflowDescriptor.id returns workflow.id + workflowDescriptor.workflowOptions 
returns WorkflowOptions.fromMap(Map("jes_gcs_root" -> "foo")).get + + val workflowPaths = mock[PipelinesApiWorkflowPaths] + workflowPaths.standardStreamNameToFileNameMetadataMapper returns mapper + workflowPaths.workflowRoot returns mock[Path] + val pipelinesApiJobPaths = mock[PipelinesApiJobPaths] + pipelinesApiJobPaths.workflowPaths returns workflowPaths + + val copyDestinationPaths = mock[PipelinesApiJobPaths] + val copyDestinationRcPath = mock[Path] + copyDestinationPaths.detritusPaths returns Map(JobPaths.ReturnCodePathKey -> copyDestinationRcPath) + + pipelinesApiJobPaths.forCallCacheCopyAttempts returns copyDestinationPaths + pipelinesApiJobPaths.metadataPaths returns Map.empty + workflowPaths.toJobPaths(any[BackendJobDescriptor]).returns(pipelinesApiJobPaths) + + def identityPathMocker(str: Any): Try[Path] = { + val path = mock[Path] + path.toString returns str.asInstanceOf[String] + Success(path) + } + + workflowPaths.getPath(anyString).answers(identityPathMocker _) + workflowPaths.gcsAuthFilePath returns mock[Path] + + val runtimeAttributesBuilder = mock[StandardValidatedRuntimeAttributesBuilder] + runtimeAttributesBuilder.build(any[Map[String, WomValue]], any[Logger]).returns(ValidatedRuntimeAttributes(Map.empty)) + + val backendInitializationData = mock[PipelinesApiBackendInitializationData] + backendInitializationData.papiConfiguration returns papiConfiguration + backendInitializationData.workflowPaths returns workflowPaths + backendInitializationData.runtimeAttributesBuilder returns runtimeAttributesBuilder + + val backendJobDescriptor = BackendJobDescriptor( + workflowDescriptor = workflowDescriptor, + key = backendJobDescriptorKey, + runtimeAttributes = Map.empty, + evaluatedTaskInputs = Map.empty, + maybeCallCachingEligible = DockerWithHash("foo"), + dockerSize = None, + prefetchedKvStoreEntries = Map.empty + ) + + val params = DefaultStandardCacheHitCopyingActorParams( + jobDescriptor = backendJobDescriptor, + backendInitializationDataOption = 
Option(backendInitializationData), + serviceRegistryActor = fakeServiceRegistryActor.ref, + ioActor = fakeIoActor.ref, + configurationDescriptor = null, + cacheCopyAttempt = 0, + blacklistCache = Option(blacklistCache) + ) + + val actorUnderTest = TestFSMRef(new PipelinesApiBackendCacheHitCopyingActor(params), supervisor = supervisor.ref) + + eventually { + actorUnderTest.underlyingActor.stateName shouldBe Idle + } + + actorUnderTest + } + + private def buildCopyCommand(hitId: Int, bucket: String): CopyOutputsCommand = { + val callRoot = s"gs://$bucket/workflow-id/call-name" + val rcFile = callRoot + "/rc" + + CopyOutputsCommand( + womValueSimpletons = List.empty, + jobDetritusFiles = Map( + JobPaths.CallRootPathKey -> callRoot, + JobPaths.ReturnCodePathKey -> rcFile), + returnCode = Option(0), + cacheHit = CallCachingEntryId(hitId) + ) + } + + sealed trait BlacklistingType extends HasFormatting + case object Hit extends BlacklistingType + case object Bucket extends BlacklistingType + + sealed trait CacheAccessType extends HasFormatting + case object Read extends CacheAccessType + case object Write extends CacheAccessType + + private def expectedMetric(hitOrBucket: BlacklistingType, accessType: CacheAccessType, grouping: String, value: String, status: BlacklistStatus): List[String] = { + List("job", "callcaching", "blacklist", + accessType.metricFormat, + hitOrBucket.metricFormat, + TaskCall, grouping, value, + status.getClass.getSimpleName.dropRight(1)) + } +} diff --git a/supportedBackends/google/pipelines/v2beta/src/main/scala/cromwell/backend/google/pipelines/v2beta/PipelinesUtilityConversions.scala b/supportedBackends/google/pipelines/v2beta/src/main/scala/cromwell/backend/google/pipelines/v2beta/PipelinesUtilityConversions.scala index 5d62cd01e79..733e11f9a5c 100644 --- a/supportedBackends/google/pipelines/v2beta/src/main/scala/cromwell/backend/google/pipelines/v2beta/PipelinesUtilityConversions.scala +++ 
b/supportedBackends/google/pipelines/v2beta/src/main/scala/cromwell/backend/google/pipelines/v2beta/PipelinesUtilityConversions.scala @@ -12,7 +12,6 @@ import mouse.all._ import PipelinesUtilityConversions._ import scala.language.postfixOps -import scala.util.Try trait PipelinesUtilityConversions { def toAccelerator(gpuResource: GpuResource) = new Accelerator().setCount(gpuResource.gpuCount.value.toLong).setType(gpuResource.gpuType.toString) @@ -57,11 +56,17 @@ object PipelinesUtilityConversions { implicit class EnhancedEvent(val event: Event) extends AnyVal { def getActionId: Option[Integer] = { - Try(event.getContainerKilled.getActionId) - .orElse(Try(event.getContainerStarted.getActionId)) - .orElse(Try(event.getContainerStopped.getActionId)) - .orElse(Try(event.getUnexpectedExitStatus.getActionId)) - .toOption + if (event.getContainerKilled != null) { + Option(event.getContainerKilled.getActionId) + } else if (event.getContainerStarted != null) { + Option(event.getContainerStarted.getActionId) + } else if (event.getContainerStopped != null) { + Option(event.getContainerStopped.getActionId) + } else if (event.getUnexpectedExitStatus != null) { + Option(event.getUnexpectedExitStatus.getActionId) + } else { + None + } } } -} \ No newline at end of file +} diff --git a/supportedBackends/sfs/src/main/scala/cromwell/backend/impl/sfs/config/ConfigHashingStrategy.scala b/supportedBackends/sfs/src/main/scala/cromwell/backend/impl/sfs/config/ConfigHashingStrategy.scala index 6ad4e98deed..1478819f72a 100644 --- a/supportedBackends/sfs/src/main/scala/cromwell/backend/impl/sfs/config/ConfigHashingStrategy.scala +++ b/supportedBackends/sfs/src/main/scala/cromwell/backend/impl/sfs/config/ConfigHashingStrategy.scala @@ -1,6 +1,6 @@ package cromwell.backend.impl.sfs.config -import java.io.FileNotFoundException +import java.io.{FileNotFoundException, InputStream} import akka.event.LoggingAdapter import com.typesafe.config.Config @@ -9,6 +9,7 @@ import 
cromwell.backend.standard.callcaching.StandardFileHashingActor.SingleFile import cromwell.core.path.{Path, PathFactory} import cromwell.util.TryWithResource._ import net.ceedubs.ficus.Ficus._ +import net.jpountz.xxhash.XXHashFactory import org.apache.commons.codec.digest.DigestUtils import org.slf4j.{Logger, LoggerFactory} @@ -16,15 +17,23 @@ import scala.util.{Failure, Try} object ConfigHashingStrategy { val logger: Logger = LoggerFactory.getLogger(getClass) - val defaultStrategy = HashFileStrategy(checkSiblingMd5 = false) + val defaultStrategy = HashFileMd5Strategy(checkSiblingMd5 = false) def apply(hashingConfig: Config): ConfigHashingStrategy = { val checkSiblingMd5 = hashingConfig.as[Option[Boolean]]("check-sibling-md5").getOrElse(false) + // Fingerprint strategy by default checks the first 10 MiB (10485760 bytes) for performance reasons. + // 100 MB will take too much time on network file systems. 1 MB might not be unique enough. + // The value is user configurable. + lazy val fingerprintSize = hashingConfig.as[Option[Long]]("fingerprint-size").getOrElse(10L * 1024 * 1024) + hashingConfig.as[Option[String]]("hashing-strategy").getOrElse("file") match { case "path" => HashPathStrategy(checkSiblingMd5) - case "file" => HashFileStrategy(checkSiblingMd5) + case "file" => HashFileMd5Strategy(checkSiblingMd5) + case "md5" => HashFileMd5Strategy(checkSiblingMd5) case "path+modtime" => HashPathModTimeStrategy(checkSiblingMd5) + case "xxh64" => HashFileXxH64Strategy(checkSiblingMd5) + case "fingerprint" => FingerprintStrategy(checkSiblingMd5, fingerprintSize) case what => logger.warn(s"Unrecognized hashing strategy $what.") HashPathStrategy(checkSiblingMd5) @@ -87,10 +96,73 @@ final case class HashPathModTimeStrategy(checkSiblingMd5: Boolean) extends Confi override val description = "hash file path and last modified time" } -final case class HashFileStrategy(checkSiblingMd5: Boolean) extends ConfigHashingStrategy { +final case class HashFileMd5Strategy(checkSiblingMd5:
Boolean) extends ConfigHashingStrategy { override protected def hash(file: Path): Try[String] = { tryWithResource(() => file.newInputStream) { DigestUtils.md5Hex } } - override val description = "hash file content" + override val description = "hash file content with md5" +} + +final case class HashFileXxH64Strategy(checkSiblingMd5: Boolean) extends ConfigHashingStrategy { + override protected def hash(file: Path): Try[String] = { + tryWithResource(() => file.newInputStream) {HashFileXxH64StrategyMethods.xxh64sum(_)} + } + override val description = "hash file content with xxh64" +} + +final case class FingerprintStrategy(checkSiblingMd5: Boolean, fingerprintSize: Long) extends ConfigHashingStrategy { + override protected def hash(file: Path): Try[String] = { + Try { + // Calculate the xxh64 hash of last modified time and filesize. These are NOT added, as it will lead to loss of + // information. Instead their hexstrings are concatenated and then hashed. + HashFileXxH64StrategyMethods.xxh64sumString(file.lastModifiedTime.toEpochMilli.toHexString + + file.size.toHexString) + + HashFileXxH64StrategyMethods.xxh64sum(file.newInputStream, maxSize = fingerprintSize) + } + } + override val description = "fingerprint the file with last modified time, size and a xxh64 hash of the first part of the file" } + +object HashFileXxH64StrategyMethods { + // For more information about the choice of buffer size: https://github.com/rhpvorderman/hashtest/ + private lazy val defaultBufferSize: Int = 128 * 1024 + private lazy val xxhashFactory: XXHashFactory = XXHashFactory.fastestInstance() + + /** + * Returns the xxh64sum of an input stream. The input stream is read in a buffered way. + * @param inputStream an input stream + * @param bufferSize the size in bytes for the buffer. + * @param maxSize only calculate the hash for the first maxSize bytes. Must be a multiple of bufferSize. + * @return A hex string of the digest.
+ */ + def xxh64sum(inputStream: InputStream, + bufferSize: Int = defaultBufferSize, + maxSize: Long = Long.MaxValue, + seed: Long = 0L): String = { + val hasher = xxhashFactory.newStreamingHash64(seed) + var byteCounter: Long = 0 + try { + // Validate inside the try block so that the stream is closed even when the arguments are rejected. + require(bufferSize > 0, s"bufferSize must be positive but was $bufferSize") + val buffer: Array[Byte] = new Array[Byte](bufferSize) + // End of stream is detected by read() returning -1. available() is only an estimate of what can be read + // without blocking, and a -1 result must not be passed on to the hasher or added to the byte count. + def nextChunk(): Int = if (byteCounter < maxSize) inputStream.read(buffer) else -1 + var length: Int = nextChunk() + while (length != -1) { + hasher.update(buffer, 0, length) + byteCounter += length + length = nextChunk() + } + } + finally inputStream.close() + // Long.toHexString does not add leading zeros + f"%%16s".format(hasher.getValue.toHexString).replace(" ", "0") + } + + // Only instantiate the xxh64hasher once + private lazy val xxh64hasher = xxhashFactory.hash64() + + /** + * Returns the xxh64sum of a string as a zero-padded hex string of 16 characters. + * Each char is narrowed with toByte before hashing, so only the low 8 bits of each char are hashed. + * @param string the string to hash + * @param seed the seed passed to the hasher + * @return A hex string of the digest. + */ + def xxh64sumString(string: String, seed: Long = 0L): String = { + val bytes: Array[Byte] = string.toCharArray.map(_.toByte) + val hash = xxh64hasher.hash(bytes, 0, bytes.length, seed) + // Long.toHexString does not add leading zeros + f"%%16s".format(hash.toHexString).replace(" ", "0") + } +} + diff --git a/supportedBackends/sfs/src/main/scala/cromwell/backend/sfs/SharedFileSystemExpressionFunctions.scala b/supportedBackends/sfs/src/main/scala/cromwell/backend/sfs/SharedFileSystemExpressionFunctions.scala index 0e5bdb79a87..78fd4cad7c8 100644 --- a/supportedBackends/sfs/src/main/scala/cromwell/backend/sfs/SharedFileSystemExpressionFunctions.scala +++ b/supportedBackends/sfs/src/main/scala/cromwell/backend/sfs/SharedFileSystemExpressionFunctions.scala @@ -4,7 +4,8 @@ import akka.actor.ActorRef import cromwell.backend.io._ import cromwell.backend.standard.{DefaultStandardExpressionFunctionsParams, StandardExpressionFunctions, StandardExpressionFunctionsParams} import cromwell.core.CallContext -import cromwell.core.path.{DefaultPath, Path, PathBuilder} +import cromwell.core.path.{DefaultPath, DefaultPathBuilder, Path, PathBuilder} +import wom.expression.IoFunctionSet import scala.concurrent.ExecutionContext @@ -27,6 +28,8 @@ class
SharedFileSystemExpressionFunctions(standardParams: StandardExpressionFunc this(DefaultStandardExpressionFunctionsParams(pathBuilders, callContext, ioActorProxy, ec)) } + override def makeInputSpecificFunctions: IoFunctionSet = new SharedFileSystemExpressionFunctionsForInput(standardParams) + override def postMapping(path: Path) = { path match { case _: DefaultPath if !path.isAbsolute => callContext.root.resolve(path) @@ -34,3 +37,19 @@ class SharedFileSystemExpressionFunctions(standardParams: StandardExpressionFunc } } } + +class SharedFileSystemExpressionFunctionsForInput(standardParams: StandardExpressionFunctionsParams) + extends SharedFileSystemExpressionFunctions(standardParams) { + + // override needed to prevent class self-reference + override def makeInputSpecificFunctions: IoFunctionSet = this + + lazy val cromwellCwd: Path = DefaultPathBuilder.build(sys.props("user.dir")).get + + override def postMapping(path: Path) = { + path match { + case _: DefaultPath if !path.isAbsolute => cromwellCwd.resolve(path) + case _ => path + } + } +} diff --git a/supportedBackends/sfs/src/test/scala/cromwell/backend/impl/sfs/config/ConfigHashingStrategySpec.scala b/supportedBackends/sfs/src/test/scala/cromwell/backend/impl/sfs/config/ConfigHashingStrategySpec.scala index 4413339d093..920d67767a2 100644 --- a/supportedBackends/sfs/src/test/scala/cromwell/backend/impl/sfs/config/ConfigHashingStrategySpec.scala +++ b/supportedBackends/sfs/src/test/scala/cromwell/backend/impl/sfs/config/ConfigHashingStrategySpec.scala @@ -13,7 +13,6 @@ import org.scalatest.prop.TableDrivenPropertyChecks import org.scalatest.{BeforeAndAfterAll, FlatSpec, Matchers} import org.specs2.mock.Mockito import wom.values.WomSingleFile - import scala.util.Success class ConfigHashingStrategySpec extends FlatSpec with Matchers with TableDrivenPropertyChecks with Mockito with BeforeAndAfterAll { @@ -21,10 +20,11 @@ class ConfigHashingStrategySpec extends FlatSpec with Matchers with TableDrivenP behavior of 
"ConfigHashingStrategy" val steak = "Steak" - val steakHash = DigestUtils.md5Hex(steak) + val steakMd5 = DigestUtils.md5Hex(steak) + val steakXxh64 = HashFileXxH64StrategyMethods.xxh64sumString(steak) val file = DefaultPathBuilder.createTempFile() val symLinksDir = DefaultPathBuilder.createTempDirectory("sym-dir") - val pathHash = DigestUtils.md5Hex(file.pathAsString) + val pathMd5 = DigestUtils.md5Hex(file.pathAsString) val md5File = file.sibling(s"${file.name}.md5") // Not the md5 value of "Steak". This is intentional so we can verify which hash is used depending on the strategy val md5FileHash = "103508832bace55730c8ee8d89c1a45f" @@ -81,9 +81,9 @@ class ConfigHashingStrategySpec extends FlatSpec with Matchers with TableDrivenP val table = Table( ("check", "withMd5", "expected"), (true, true, md5FileHash), - (false, true, pathHash), - (true, false, pathHash), - (false, false, pathHash) + (false, true, pathMd5), + (true, false, pathMd5), + (false, false, pathMd5) ) forAll(table) { (check, withMd5, expected) => @@ -145,31 +145,31 @@ class ConfigHashingStrategySpec extends FlatSpec with Matchers with TableDrivenP } } - it should "create a file hashing strategy from config" in { + it should "create a md5 hashing strategy from config" in { val defaultSibling = makeStrategy("file") - defaultSibling.isInstanceOf[HashFileStrategy] shouldBe true + defaultSibling.isInstanceOf[HashFileMd5Strategy] shouldBe true defaultSibling.checkSiblingMd5 shouldBe false - val checkSibling = makeStrategy("file", Option(true)) + val checkSibling = makeStrategy("md5", Option(true)) - checkSibling.isInstanceOf[HashFileStrategy] shouldBe true + checkSibling.isInstanceOf[HashFileMd5Strategy] shouldBe true checkSibling.checkSiblingMd5 shouldBe true - checkSibling.toString shouldBe "Call caching hashing strategy: Check first for sibling md5 and if not found hash file content." 
+ checkSibling.toString shouldBe "Call caching hashing strategy: Check first for sibling md5 and if not found hash file content with md5." val dontCheckSibling = makeStrategy("file", Option(false)) - dontCheckSibling.isInstanceOf[HashFileStrategy] shouldBe true + dontCheckSibling.isInstanceOf[HashFileMd5Strategy] shouldBe true dontCheckSibling.checkSiblingMd5 shouldBe false - dontCheckSibling.toString shouldBe "Call caching hashing strategy: hash file content." + dontCheckSibling.toString shouldBe "Call caching hashing strategy: hash file content with md5." } it should "have a file hashing strategy and use md5 sibling file when appropriate" in { val table = Table( ("check", "withMd5", "expected"), (true, true, md5FileHash), - (false, true, steakHash), - (true, false, steakHash), - (false, false, steakHash) + (false, true, steakMd5), + (true, false, steakMd5), + (false, false, steakMd5) ) forAll(table) { (check, withMd5, expected) => @@ -186,6 +186,100 @@ class ConfigHashingStrategySpec extends FlatSpec with Matchers with TableDrivenP } } + it should "create a xxh64 hashing strategy from config" in { + val defaultSibling = makeStrategy("xxh64") + defaultSibling.isInstanceOf[HashFileXxH64Strategy] shouldBe true + defaultSibling.checkSiblingMd5 shouldBe false + + val checkSibling = makeStrategy("xxh64", Option(true)) + + checkSibling.isInstanceOf[HashFileXxH64Strategy] shouldBe true + checkSibling.checkSiblingMd5 shouldBe true + checkSibling.toString shouldBe "Call caching hashing strategy: Check first for sibling md5 and if not found hash file content with xxh64." + + val dontCheckSibling = makeStrategy("xxh64", Option(false)) + + dontCheckSibling.isInstanceOf[HashFileXxH64Strategy] shouldBe true + dontCheckSibling.checkSiblingMd5 shouldBe false + dontCheckSibling.toString shouldBe "Call caching hashing strategy: hash file content with xxh64." 
+ } + + it should "have a xxh64 hashing strategy and use md5 sibling file when appropriate" in { + val table = Table( + ("check", "withMd5", "expected"), + (true, true, md5FileHash), + (false, true, steakXxh64), + (true, false, steakXxh64), + (false, false, steakXxh64) + ) + + forAll(table) { (check, withMd5, expected) => + md5File.delete(swallowIOExceptions = true) + val checkSibling = makeStrategy("xxh64", Option(check)) + + checkSibling.getHash(mockRequest(withMd5, symlink = false), mock[LoggingAdapter]) shouldBe Success(expected) + + val symLinkRequest: SingleFileHashRequest = mockRequest(withMd5, symlink = true) + val symlink = DefaultPathBuilder.get(symLinkRequest.file.valueString) + + symlink.isSymbolicLink shouldBe true + checkSibling.getHash(symLinkRequest, mock[LoggingAdapter]) shouldBe Success(expected) + } + } + + it should "create a fingerprint strategy from config" in { + val defaultFingerprint: FingerprintStrategy = makeStrategy("fingerprint").asInstanceOf[FingerprintStrategy] + defaultFingerprint.isInstanceOf[FingerprintStrategy] shouldBe true + defaultFingerprint.checkSiblingMd5 shouldBe false + defaultFingerprint.fingerprintSize shouldBe 10 * 1024 * 1024 + + val config = ConfigFactory.parseString( + """|hashing-strategy: "fingerprint" + |fingerprint-size: 123456789 + |""".stripMargin) + val otherFingerprint: FingerprintStrategy = ConfigHashingStrategy.apply(config).asInstanceOf[FingerprintStrategy] + otherFingerprint.fingerprintSize shouldBe 123456789 + otherFingerprint.isInstanceOf[FingerprintStrategy] shouldBe true + + val checkSibling = makeStrategy("fingerprint", Option(true)) + + checkSibling.isInstanceOf[FingerprintStrategy] shouldBe true + checkSibling.checkSiblingMd5 shouldBe true + checkSibling.toString shouldBe "Call caching hashing strategy: Check first for sibling md5 and if not found fingerprint the file with last modified time, size and a xxh64 hash of the first part of the file." 
+ + val dontCheckSibling = makeStrategy("fingerprint", Option(false)) + + dontCheckSibling.isInstanceOf[FingerprintStrategy] shouldBe true + dontCheckSibling.checkSiblingMd5 shouldBe false + dontCheckSibling.toString shouldBe "Call caching hashing strategy: fingerprint the file with last modified time, size and a xxh64 hash of the first part of the file." + + } + + it should "have a fingerprint strategy and use md5 sibling file when appropriate" in { + val fingerPrintHash = HashFileXxH64StrategyMethods.xxh64sumString(file.lastModifiedTime.toEpochMilli.toHexString + + file.size.toHexString) + steakXxh64 + val table = Table( + ("check", "withMd5", "expected"), + (true, true, md5FileHash), + (false, true, fingerPrintHash), + (true, false, fingerPrintHash), + (false, false, fingerPrintHash) + ) + + forAll(table) { (check, withMd5, expected) => + md5File.delete(swallowIOExceptions = true) + val checkSibling = makeStrategy("fingerprint", Option(check)) + + checkSibling.getHash(mockRequest(withMd5, symlink = false), mock[LoggingAdapter]) shouldBe Success(expected) + + val symLinkRequest: SingleFileHashRequest = mockRequest(withMd5, symlink = true) + val symlink = DefaultPathBuilder.get(symLinkRequest.file.valueString) + + symlink.isSymbolicLink shouldBe true + checkSibling.getHash(symLinkRequest, mock[LoggingAdapter]) shouldBe Success(expected) + } + } + override def afterAll() = { file.delete(true) md5File.delete(true) diff --git a/wom/src/main/scala/wom/expression/WomExpression.scala b/wom/src/main/scala/wom/expression/WomExpression.scala index 19cb6f3aff5..05884f0f20a 100644 --- a/wom/src/main/scala/wom/expression/WomExpression.scala +++ b/wom/src/main/scala/wom/expression/WomExpression.scala @@ -163,4 +163,11 @@ trait IoFunctionSet { implicit def ec: ExecutionContext implicit def cs = IO.contextShift(ec) + + /** + * Returns an IO function set where input specific functions have been turned on. 
This allows backends such as the sfs + * backend to use a different set of functions when evaluating inputs. + * @return an IoFunctionSet + */ + def makeInputSpecificFunctions(): IoFunctionSet = this }