diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/GmsGraphQLEngine.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/GmsGraphQLEngine.java index 59335ba605a74..3c46c1a8dce35 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/GmsGraphQLEngine.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/GmsGraphQLEngine.java @@ -56,6 +56,7 @@ import com.linkedin.datahub.graphql.generated.DataJobInputOutput; import com.linkedin.datahub.graphql.generated.DataPlatform; import com.linkedin.datahub.graphql.generated.DataPlatformInstance; +import com.linkedin.datahub.graphql.generated.DataProcessInstance; import com.linkedin.datahub.graphql.generated.DataQualityContract; import com.linkedin.datahub.graphql.generated.Dataset; import com.linkedin.datahub.graphql.generated.DatasetStatsSummary; @@ -346,6 +347,7 @@ import com.linkedin.datahub.graphql.types.datajob.DataJobType; import com.linkedin.datahub.graphql.types.dataplatform.DataPlatformType; import com.linkedin.datahub.graphql.types.dataplatforminstance.DataPlatformInstanceType; +import com.linkedin.datahub.graphql.types.dataprocessinst.DataProcessInstanceType; import com.linkedin.datahub.graphql.types.dataprocessinst.mappers.DataProcessInstanceRunEventMapper; import com.linkedin.datahub.graphql.types.dataproduct.DataProductType; import com.linkedin.datahub.graphql.types.dataset.DatasetType; @@ -530,6 +532,7 @@ public class GmsGraphQLEngine { private final FormType formType; private final IncidentType incidentType; private final RestrictedType restrictedType; + private final DataProcessInstanceType dataProcessInstanceType; private final int graphQLQueryComplexityLimit; private final int graphQLQueryDepthLimit; @@ -649,6 +652,7 @@ public GmsGraphQLEngine(final GmsGraphQLEngineArgs args) { this.formType = new FormType(entityClient); this.incidentType = new IncidentType(entityClient); this.restrictedType = new RestrictedType(entityClient, restrictedService); + this.dataProcessInstanceType = new DataProcessInstanceType(entityClient, featureFlags); this.graphQLQueryComplexityLimit = args.graphQLQueryComplexityLimit; this.graphQLQueryDepthLimit = args.graphQLQueryDepthLimit; @@ -699,7 +703,8 @@ public GmsGraphQLEngine(final GmsGraphQLEngineArgs args) { formType, incidentType, restrictedType, - businessAttributeType)); + businessAttributeType, + dataProcessInstanceType)); this.loadableTypes = new ArrayList<>(entityTypes); // Extend loadable types with types from the plugins // This allows us to offer search and browse capabilities out of the box for @@ -1024,6 +1029,7 @@ private void configureQueryResolvers(final RuntimeWiring.Builder builder) { .dataFetcher("tag", getResolver(tagType)) .dataFetcher("dataFlow", getResolver(dataFlowType)) .dataFetcher("dataJob", getResolver(dataJobType)) + .dataFetcher("dataProcessInstance", getResolver(dataProcessInstanceType)) .dataFetcher("glossaryTerm", getResolver(glossaryTermType)) .dataFetcher("glossaryNode", getResolver(glossaryNodeType)) .dataFetcher("domain", getResolver((domainType))) @@ -3058,6 +3064,35 @@ private void configureDataProcessInstanceResolvers(final RuntimeWiring.Builder b "DataProcessInstance", typeWiring -> typeWiring + .dataFetcher( + "dataPlatformInstance", + new LoadableTypeResolver<>( + dataPlatformInstanceType, + (env) -> { + final DataProcessInstance dataProcessInstance = env.getSource(); + return dataProcessInstance.getDataPlatformInstance() != null + ? dataProcessInstance.getDataPlatformInstance().getUrn() + : null; + })) + .dataFetcher( + "platform", + new LoadableTypeResolver<>( + dataPlatformType, + (env) -> { + final DataProcessInstance dataProcessInstance = env.getSource(); + return dataProcessInstance.getPlatform() != null + ? dataProcessInstance.getPlatform().getUrn() + : null; + })) + .dataFetcher("parentContainers", new ParentContainersResolver(entityClient)) + .dataFetcher( + "container", + new LoadableTypeResolver<>( + containerType, + (env) -> { + final DataProcessInstance dpi = env.getSource(); + return dpi.getContainer() != null ? dpi.getContainer().getUrn() : null; + })) .dataFetcher("relationships", new EntityRelationshipsResultResolver(graphClient)) .dataFetcher( "lineage", diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/common/mappers/DataPlatformInstanceAspectMapper.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/common/mappers/DataPlatformInstanceAspectMapper.java index 4345819867617..ab3127a3ae232 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/common/mappers/DataPlatformInstanceAspectMapper.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/common/mappers/DataPlatformInstanceAspectMapper.java @@ -1,6 +1,7 @@ package com.linkedin.datahub.graphql.types.common.mappers; import com.linkedin.datahub.graphql.QueryContext; +import com.linkedin.datahub.graphql.generated.DataPlatform; import com.linkedin.datahub.graphql.generated.DataPlatformInstance; import com.linkedin.datahub.graphql.generated.EntityType; import com.linkedin.datahub.graphql.types.mappers.ModelMapper; @@ -28,6 +29,11 @@ public DataPlatformInstance apply( result.setType(EntityType.DATA_PLATFORM_INSTANCE); result.setUrn(input.getInstance().toString()); } + result.setPlatform( + DataPlatform.builder() + .setUrn(input.getPlatform().toString()) + .setType(EntityType.DATA_PLATFORM) + .build()); return result; } } diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/common/mappers/TimeStampToAuditStampMapper.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/common/mappers/TimeStampToAuditStampMapper.java new file mode 100644 index 0000000000000..58f78b146b406 --- /dev/null +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/common/mappers/TimeStampToAuditStampMapper.java @@ -0,0 +1,24 @@ +package com.linkedin.datahub.graphql.types.common.mappers; + +import com.linkedin.common.TimeStamp; +import com.linkedin.datahub.graphql.QueryContext; +import com.linkedin.datahub.graphql.generated.AuditStamp; +import javax.annotation.Nullable; + +public class TimeStampToAuditStampMapper { + + public static final TimeStampToAuditStampMapper INSTANCE = new TimeStampToAuditStampMapper(); + + public static AuditStamp map( + @Nullable final QueryContext context, @Nullable final TimeStamp input) { + if (input == null) { + return null; + } + final AuditStamp result = new AuditStamp(); + result.setTime(input.getTime()); + if (input.hasActor()) { + result.setActor(input.getActor().toString()); + } + return result; + } +} diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/common/mappers/UrnToEntityMapper.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/common/mappers/UrnToEntityMapper.java index 1988cafc486c1..eae33e6da2e56 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/common/mappers/UrnToEntityMapper.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/common/mappers/UrnToEntityMapper.java @@ -18,6 +18,7 @@ import com.linkedin.datahub.graphql.generated.DataJob; import com.linkedin.datahub.graphql.generated.DataPlatform; import com.linkedin.datahub.graphql.generated.DataPlatformInstance; +import com.linkedin.datahub.graphql.generated.DataProcessInstance; import com.linkedin.datahub.graphql.generated.DataProduct; import com.linkedin.datahub.graphql.generated.Dataset; import com.linkedin.datahub.graphql.generated.Domain; @@ -225,6 +226,11 @@ public Entity apply(@Nullable QueryContext context, Urn input) { ((BusinessAttribute) partialEntity).setUrn(input.toString()); ((BusinessAttribute) partialEntity).setType(EntityType.BUSINESS_ATTRIBUTE); } + if (input.getEntityType().equals(DATA_PROCESS_INSTANCE_ENTITY_NAME)) { + partialEntity = new DataProcessInstance(); + ((DataProcessInstance) partialEntity).setUrn(input.toString()); + ((DataProcessInstance) partialEntity).setType(EntityType.DATA_PROCESS_INSTANCE); + } return partialEntity; } } diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/dataprocessinst/DataProcessInstanceType.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/dataprocessinst/DataProcessInstanceType.java new file mode 100644 index 0000000000000..eeaaaa96f5170 --- /dev/null +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/dataprocessinst/DataProcessInstanceType.java @@ -0,0 +1,102 @@ +package com.linkedin.datahub.graphql.types.dataprocessinst; + +import static com.linkedin.metadata.Constants.*; + +import com.google.common.collect.ImmutableSet; +import com.linkedin.common.urn.Urn; +import com.linkedin.common.urn.UrnUtils; +import com.linkedin.datahub.graphql.QueryContext; +import com.linkedin.datahub.graphql.featureflags.FeatureFlags; +import com.linkedin.datahub.graphql.generated.DataProcessInstance; +import com.linkedin.datahub.graphql.generated.Entity; +import com.linkedin.datahub.graphql.generated.EntityType; +import com.linkedin.datahub.graphql.types.dataprocessinst.mappers.DataProcessInstanceMapper; +import com.linkedin.entity.EntityResponse; +import com.linkedin.entity.client.EntityClient; +import graphql.execution.DataFetcherResult; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.function.Function; +import java.util.stream.Collectors; +import javax.annotation.Nonnull; +import lombok.RequiredArgsConstructor; + +@RequiredArgsConstructor +public class DataProcessInstanceType + implements com.linkedin.datahub.graphql.types.EntityType { + + public static final Set ASPECTS_TO_FETCH = + ImmutableSet.of( + DATA_PROCESS_INSTANCE_KEY_ASPECT_NAME, + DATA_PLATFORM_INSTANCE_ASPECT_NAME, + DATA_PROCESS_INSTANCE_PROPERTIES_ASPECT_NAME, + DATA_PROCESS_INSTANCE_INPUT_ASPECT_NAME, + DATA_PROCESS_INSTANCE_OUTPUT_ASPECT_NAME, + DATA_PROCESS_INSTANCE_RUN_EVENT_ASPECT_NAME, + TEST_RESULTS_ASPECT_NAME, + DATA_PROCESS_INSTANCE_RELATIONSHIPS_ASPECT_NAME, + ML_TRAINING_RUN_PROPERTIES_ASPECT_NAME, + SUB_TYPES_ASPECT_NAME, + CONTAINER_ASPECT_NAME); + + private final EntityClient _entityClient; + private final FeatureFlags _featureFlags; + + @Override + public EntityType type() { + return EntityType.DATA_PROCESS_INSTANCE; + } + + @Override + public Function getKeyProvider() { + return Entity::getUrn; + } + + @Override + public Class objectClass() { + return DataProcessInstance.class; + } + + @Override + public List> batchLoad( + @Nonnull List urns, @Nonnull QueryContext context) throws Exception { + final List dataProcessInstanceUrns = + urns.stream().map(UrnUtils::getUrn).collect(Collectors.toList()); + + try { + Map entities = new HashMap<>(); + if (_featureFlags.isDataProcessInstanceEntityEnabled()) { + entities = + _entityClient.batchGetV2( + context.getOperationContext(), + DATA_PROCESS_INSTANCE_ENTITY_NAME, + new HashSet<>(dataProcessInstanceUrns), + ASPECTS_TO_FETCH); + } + + final List gmsResults = new ArrayList<>(); + for (Urn urn : dataProcessInstanceUrns) { + if (_featureFlags.isDataProcessInstanceEntityEnabled()) { + gmsResults.add(entities.getOrDefault(urn, null)); + } + } + + return gmsResults.stream() + .map( + gmsResult -> + gmsResult == null + ? null + : DataFetcherResult.newResult() + .data(DataProcessInstanceMapper.map(context, gmsResult)) + .build()) + .collect(Collectors.toList()); + + } catch (Exception e) { + throw new RuntimeException("Failed to load Data Process Instance entity", e); + } + } +} diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/dataprocessinst/mappers/DataProcessInstanceMapper.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/dataprocessinst/mappers/DataProcessInstanceMapper.java index 7a4d342281fe5..28c9c8936fdbf 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/dataprocessinst/mappers/DataProcessInstanceMapper.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/dataprocessinst/mappers/DataProcessInstanceMapper.java @@ -2,25 +2,38 @@ import static com.linkedin.metadata.Constants.*; +import com.linkedin.common.DataPlatformInstance; +import com.linkedin.common.SubTypes; +import com.linkedin.common.urn.Urn; import com.linkedin.data.DataMap; import com.linkedin.data.template.RecordTemplate; import com.linkedin.datahub.graphql.QueryContext; +import com.linkedin.datahub.graphql.generated.DataPlatform; import com.linkedin.datahub.graphql.generated.DataProcessInstance; import com.linkedin.datahub.graphql.generated.EntityType; import com.linkedin.datahub.graphql.types.common.mappers.AuditStampMapper; +import com.linkedin.datahub.graphql.types.common.mappers.CustomPropertiesMapper; +import com.linkedin.datahub.graphql.types.common.mappers.DataPlatformInstanceAspectMapper; +import com.linkedin.datahub.graphql.types.common.mappers.SubTypesMapper; import com.linkedin.datahub.graphql.types.common.mappers.util.MappingHelper; import com.linkedin.datahub.graphql.types.mappers.ModelMapper; +import com.linkedin.datahub.graphql.types.mlmodel.mappers.MLHyperParamMapper; +import com.linkedin.datahub.graphql.types.mlmodel.mappers.MLMetricMapper; import com.linkedin.dataprocess.DataProcessInstanceProperties; import com.linkedin.entity.EntityResponse; import com.linkedin.entity.EnvelopedAspectMap; +import com.linkedin.ml.metadata.MLTrainingRunProperties; +import java.util.stream.Collectors; import javax.annotation.Nonnull; import javax.annotation.Nullable; +import lombok.extern.slf4j.Slf4j; /** * Maps Pegasus {@link RecordTemplate} objects to objects conforming to the GQL schema. * *

To be replaced by auto-generated mappers implementations */ +@Slf4j public class DataProcessInstanceMapper implements ModelMapper { public static final DataProcessInstanceMapper INSTANCE = new DataProcessInstanceMapper(); @@ -30,6 +43,19 @@ public static DataProcessInstance map( return INSTANCE.apply(context, entityResponse); } + private void mapContainers( + @Nullable final QueryContext context, + @Nonnull DataProcessInstance dataProcessInstance, + @Nonnull DataMap dataMap) { + final com.linkedin.container.Container gmsContainer = + new com.linkedin.container.Container(dataMap); + dataProcessInstance.setContainer( + com.linkedin.datahub.graphql.generated.Container.builder() + .setType(EntityType.CONTAINER) + .setUrn(gmsContainer.getContainer().toString()) + .build()); + } + @Override public DataProcessInstance apply( @Nullable QueryContext context, @Nonnull final EntityResponse entityResponse) { @@ -37,24 +63,97 @@ public DataProcessInstance apply( result.setUrn(entityResponse.getUrn().toString()); result.setType(EntityType.DATA_PROCESS_INSTANCE); + Urn entityUrn = entityResponse.getUrn(); EnvelopedAspectMap aspectMap = entityResponse.getAspects(); MappingHelper mappingHelper = new MappingHelper<>(aspectMap, result); mappingHelper.mapToResult( - context, DATA_PROCESS_INSTANCE_PROPERTIES_ASPECT_NAME, this::mapDataProcessProperties); + DATA_PROCESS_INSTANCE_PROPERTIES_ASPECT_NAME, + (dataProcessInstance, dataMap) -> + mapDataProcessProperties(context, dataProcessInstance, dataMap, entityUrn)); + mappingHelper.mapToResult( + ML_TRAINING_RUN_PROPERTIES_ASPECT_NAME, + (dataProcessInstance, dataMap) -> + mapTrainingRunProperties(context, dataProcessInstance, dataMap)); + mappingHelper.mapToResult( + DATA_PLATFORM_INSTANCE_ASPECT_NAME, + (dataProcessInstance, dataMap) -> { + DataPlatformInstance dataPlatformInstance = new DataPlatformInstance(dataMap); + dataProcessInstance.setDataPlatformInstance( + DataPlatformInstanceAspectMapper.map(context, dataPlatformInstance)); + DataPlatform dataPlatform = new DataPlatform(); + dataPlatform.setUrn(dataPlatformInstance.getPlatform().toString()); + dataPlatform.setType(EntityType.DATA_PLATFORM); + dataProcessInstance.setPlatform(dataPlatform); + }); + mappingHelper.mapToResult( + SUB_TYPES_ASPECT_NAME, + (dataProcessInstance, dataMap) -> + dataProcessInstance.setSubTypes(SubTypesMapper.map(context, new SubTypes(dataMap)))); + mappingHelper.mapToResult( + CONTAINER_ASPECT_NAME, + (dataProcessInstance, dataMap) -> mapContainers(context, dataProcessInstance, dataMap)); return mappingHelper.getResult(); } - private void mapDataProcessProperties( + private void mapTrainingRunProperties( @Nonnull QueryContext context, @Nonnull DataProcessInstance dpi, @Nonnull DataMap dataMap) { + MLTrainingRunProperties trainingProperties = new MLTrainingRunProperties(dataMap); + + com.linkedin.datahub.graphql.generated.MLTrainingRunProperties properties = + new com.linkedin.datahub.graphql.generated.MLTrainingRunProperties(); + if (trainingProperties.hasId()) { + properties.setId(trainingProperties.getId()); + } + if (trainingProperties.hasOutputUrls()) { + properties.setOutputUrls( + trainingProperties.getOutputUrls().stream() + .map(url -> url.toString()) + .collect(Collectors.toList())); + } + if (trainingProperties.getHyperParams() != null) { + properties.setHyperParams( + trainingProperties.getHyperParams().stream() + .map(param -> MLHyperParamMapper.map(context, param)) + .collect(Collectors.toList())); + } + if (trainingProperties.getTrainingMetrics() != null) { + properties.setTrainingMetrics( + trainingProperties.getTrainingMetrics().stream() + .map(metric -> MLMetricMapper.map(context, metric)) + .collect(Collectors.toList())); + } + if (trainingProperties.hasId()) { + properties.setId(trainingProperties.getId()); + } + dpi.setMlTrainingRunProperties(properties); + } + + private void mapDataProcessProperties( + @Nonnull QueryContext context, + @Nonnull DataProcessInstance dpi, + @Nonnull DataMap dataMap, + @Nonnull Urn entityUrn) { DataProcessInstanceProperties dataProcessInstanceProperties = new DataProcessInstanceProperties(dataMap); + + com.linkedin.datahub.graphql.generated.DataProcessInstanceProperties properties = + new com.linkedin.datahub.graphql.generated.DataProcessInstanceProperties(); + dpi.setName(dataProcessInstanceProperties.getName()); - if (dataProcessInstanceProperties.hasCreated()) { - dpi.setCreated(AuditStampMapper.map(context, dataProcessInstanceProperties.getCreated())); - } + properties.setName(dataProcessInstanceProperties.getName()); if (dataProcessInstanceProperties.hasExternalUrl()) { dpi.setExternalUrl(dataProcessInstanceProperties.getExternalUrl().toString()); + properties.setExternalUrl(dataProcessInstanceProperties.getExternalUrl().toString()); + } + if (dataProcessInstanceProperties.hasCustomProperties()) { + properties.setCustomProperties( + CustomPropertiesMapper.map( + dataProcessInstanceProperties.getCustomProperties(), entityUrn)); + } + if (dataProcessInstanceProperties.hasCreated()) { + dpi.setCreated(AuditStampMapper.map(context, dataProcessInstanceProperties.getCreated())); } + dpi.setProperties(properties); } } diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/entitytype/EntityTypeUrnMapper.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/entitytype/EntityTypeUrnMapper.java index 334faf753cb8b..5b72c2b3c11c5 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/entitytype/EntityTypeUrnMapper.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/entitytype/EntityTypeUrnMapper.java @@ -77,6 +77,9 @@ public class EntityTypeUrnMapper { .put( Constants.BUSINESS_ATTRIBUTE_ENTITY_NAME, "urn:li:entityType:datahub.businessAttribute") + .put( + Constants.DATA_PROCESS_INSTANCE_ENTITY_NAME, + "urn:li:entityType:datahub.dataProcessInstance") .build(); private static final Map ENTITY_TYPE_URN_TO_NAME = diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLModelPropertiesMapper.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLModelPropertiesMapper.java index 265005c2caa9e..7b00fe88f2d68 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLModelPropertiesMapper.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLModelPropertiesMapper.java @@ -7,6 +7,7 @@ import com.linkedin.datahub.graphql.generated.MLModelGroup; import com.linkedin.datahub.graphql.generated.MLModelProperties; import com.linkedin.datahub.graphql.types.common.mappers.CustomPropertiesMapper; +import com.linkedin.datahub.graphql.types.common.mappers.TimeStampToAuditStampMapper; import com.linkedin.datahub.graphql.types.mappers.EmbeddedModelMapper; import java.util.stream.Collectors; import javax.annotation.Nonnull; @@ -31,6 +32,15 @@ public MLModelProperties apply( final MLModelProperties result = new MLModelProperties(); result.setDate(mlModelProperties.getDate()); + if (mlModelProperties.getName() != null) { + result.setName(mlModelProperties.getName()); + } else { + // backfill name from URN for backwards compatibility + result.setName(entityUrn.getEntityKey().get(1)); // indexed access is safe here + } + result.setCreated(TimeStampToAuditStampMapper.map(context, mlModelProperties.getCreated())); + result.setLastModified( + TimeStampToAuditStampMapper.map(context, mlModelProperties.getLastModified())); result.setDescription(mlModelProperties.getDescription()); if (mlModelProperties.getExternalUrl() != null) { result.setExternalUrl(mlModelProperties.getExternalUrl().toString()); diff --git a/datahub-graphql-core/src/main/resources/entity.graphql b/datahub-graphql-core/src/main/resources/entity.graphql index adb24d92587b5..9dd1948e18e04 100644 --- a/datahub-graphql-core/src/main/resources/entity.graphql +++ b/datahub-graphql-core/src/main/resources/entity.graphql @@ -10098,7 +10098,7 @@ type MLModelProperties { """ The display name of the model used in the UI """ - name: String! + name: String """ Detailed description of the model's purpose and characteristics diff --git a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/types/common/mappers/TimeStampToAuditStampMapperTest.java b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/types/common/mappers/TimeStampToAuditStampMapperTest.java new file mode 100644 index 0000000000000..4e0dbd7b1733b --- /dev/null +++ b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/types/common/mappers/TimeStampToAuditStampMapperTest.java @@ -0,0 +1,46 @@ +package com.linkedin.datahub.graphql.types.common.mappers; + +import static org.testng.Assert.*; + +import com.linkedin.common.TimeStamp; +import com.linkedin.common.urn.Urn; +import com.linkedin.datahub.graphql.generated.AuditStamp; +import org.testng.annotations.Test; + +public class TimeStampToAuditStampMapperTest { + + private static final String TEST_ACTOR_URN = "urn:li:corpuser:testUser"; + private static final long TEST_TIME = 1234567890L; + + @Test + public void testMapWithActor() throws Exception { + TimeStamp input = new TimeStamp(); + input.setTime(TEST_TIME); + input.setActor(Urn.createFromString(TEST_ACTOR_URN)); + + AuditStamp result = TimeStampToAuditStampMapper.map(null, input); + + assertNotNull(result); + assertEquals(result.getTime().longValue(), TEST_TIME); + assertEquals(result.getActor(), TEST_ACTOR_URN); + } + + @Test + public void testMapWithoutActor() { + TimeStamp input = new TimeStamp(); + input.setTime(TEST_TIME); + + AuditStamp result = TimeStampToAuditStampMapper.map(null, input); + + assertNotNull(result); + assertEquals(result.getTime().longValue(), TEST_TIME); + assertNull(result.getActor()); + } + + @Test + public void testMapNull() { + AuditStamp result = TimeStampToAuditStampMapper.map(null, null); + + assertNull(result); + } +} diff --git a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/types/dataplatforminstance/mapper/DataPlatformInstanceAspectMapperTest.java b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/types/dataplatforminstance/mapper/DataPlatformInstanceAspectMapperTest.java new file mode 100644 index 0000000000000..479d7340fef94 --- /dev/null +++ b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/types/dataplatforminstance/mapper/DataPlatformInstanceAspectMapperTest.java @@ -0,0 +1,75 @@ +package com.linkedin.datahub.graphql.types.dataplatforminstance.mapper; + +import static org.testng.Assert.*; + +import com.linkedin.common.urn.DataPlatformUrn; +import com.linkedin.common.urn.Urn; +import com.linkedin.datahub.graphql.generated.DataPlatformInstance; +import com.linkedin.datahub.graphql.generated.EntityType; +import com.linkedin.datahub.graphql.types.common.mappers.DataPlatformInstanceAspectMapper; +import org.testng.annotations.Test; + +public class DataPlatformInstanceAspectMapperTest { + + private static final String TEST_PLATFORM = "hive"; + private static final String TEST_INSTANCE = "prod"; + private static final String TEST_PLATFORM_URN = "urn:li:dataPlatform:" + TEST_PLATFORM; + private static final String TEST_INSTANCE_URN = + String.format( + "urn:li:dataPlatformInstance:(urn:li:dataPlatform:%s,%s)", TEST_PLATFORM, TEST_INSTANCE); + + @Test + public void testMapWithInstance() throws Exception { + // Create test input + com.linkedin.common.DataPlatformInstance input = new com.linkedin.common.DataPlatformInstance(); + DataPlatformUrn platformUrn = new DataPlatformUrn(TEST_PLATFORM); + Urn instanceUrn = Urn.createFromString(TEST_INSTANCE_URN); + + input.setPlatform(platformUrn); + input.setInstance(instanceUrn); + + // Map and verify + DataPlatformInstance result = DataPlatformInstanceAspectMapper.map(null, input); + + assertNotNull(result); + assertEquals(result.getType(), EntityType.DATA_PLATFORM_INSTANCE); + assertEquals(result.getUrn(), TEST_INSTANCE_URN); + + // Verify platform mapping + assertNotNull(result.getPlatform()); + assertEquals(result.getPlatform().getType(), EntityType.DATA_PLATFORM); + assertEquals(result.getPlatform().getUrn(), TEST_PLATFORM_URN); + } + + @Test + public void testMapWithoutInstance() throws Exception { + // Create test input with only platform + com.linkedin.common.DataPlatformInstance input = new com.linkedin.common.DataPlatformInstance(); + DataPlatformUrn platformUrn = new DataPlatformUrn(TEST_PLATFORM); + input.setPlatform(platformUrn); + + // Map and verify + DataPlatformInstance result = DataPlatformInstanceAspectMapper.map(null, input); + + assertNotNull(result); + assertNull(result.getType()); // Type should be null when no instance + assertNull(result.getUrn()); // URN should be null when no instance + + // Verify platform is still mapped correctly + assertNotNull(result.getPlatform()); + assertEquals(result.getPlatform().getType(), EntityType.DATA_PLATFORM); + assertEquals(result.getPlatform().getUrn(), TEST_PLATFORM_URN); + } + + @Test(expectedExceptions = NullPointerException.class) + public void testMapNull() { + DataPlatformInstanceAspectMapper.map(null, null); + } + + @Test + public void testSingleton() { + assertNotNull(DataPlatformInstanceAspectMapper.INSTANCE); + assertSame( + DataPlatformInstanceAspectMapper.INSTANCE, DataPlatformInstanceAspectMapper.INSTANCE); + } +} diff --git a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/types/dataprocessinst/DataProcessInstanceTypeTest.java b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/types/dataprocessinst/DataProcessInstanceTypeTest.java new file mode 100644 index 0000000000000..437c74ab66914 --- /dev/null +++ b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/types/dataprocessinst/DataProcessInstanceTypeTest.java @@ -0,0 +1,246 @@ +package com.linkedin.datahub.graphql.types.dataprocessinst; + +import static com.linkedin.datahub.graphql.TestUtils.getMockAllowContext; +import static org.mockito.ArgumentMatchers.any; +import static org.testng.Assert.*; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ImmutableSet; +import com.linkedin.common.DataPlatformInstance; +import com.linkedin.common.FabricType; +import com.linkedin.common.Status; +import com.linkedin.common.SubTypes; +import com.linkedin.common.UrnArray; +import com.linkedin.common.urn.DataPlatformUrn; +import com.linkedin.common.urn.DatasetUrn; +import com.linkedin.common.urn.Urn; +import com.linkedin.common.urn.UrnUtils; +import com.linkedin.container.Container; +import com.linkedin.data.template.StringArray; +import com.linkedin.datahub.graphql.QueryContext; +import com.linkedin.datahub.graphql.featureflags.FeatureFlags; +import com.linkedin.datahub.graphql.generated.DataProcessInstance; +import com.linkedin.datahub.graphql.generated.EntityType; +import com.linkedin.dataprocess.DataProcessInstanceInput; +import com.linkedin.dataprocess.DataProcessInstanceOutput; +import com.linkedin.dataprocess.DataProcessInstanceProperties; +import com.linkedin.dataprocess.DataProcessInstanceRelationships; +import com.linkedin.dataprocess.DataProcessInstanceRunEvent; +import com.linkedin.dataprocess.DataProcessRunStatus; +import com.linkedin.dataprocess.DataProcessType; +import com.linkedin.entity.Aspect; +import com.linkedin.entity.EntityResponse; +import com.linkedin.entity.EnvelopedAspect; +import com.linkedin.entity.EnvelopedAspectMap; +import com.linkedin.entity.client.EntityClient; +import com.linkedin.metadata.Constants; +import com.linkedin.metadata.key.DataProcessInstanceKey; +import com.linkedin.ml.metadata.MLTrainingRunProperties; +import com.linkedin.r2.RemoteInvocationException; +import com.linkedin.test.TestResult; +import com.linkedin.test.TestResultArray; +import com.linkedin.test.TestResultType; +import com.linkedin.test.TestResults; +import graphql.execution.DataFetcherResult; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import org.mockito.Mockito; +import org.testng.annotations.Test; + +public class DataProcessInstanceTypeTest { + + private static final String TEST_INSTANCE_URN = + "urn:li:dataProcessInstance:(test-workflow,test-instance-1)"; + private static final String TEST_DPI_1_URN = "urn:li:dataProcessInstance:id-1"; + private static final DatasetUrn DATASET_URN = + new DatasetUrn(new DataPlatformUrn("kafka"), "dataset1", FabricType.TEST); + private static final Urn DPI_URN_REL = UrnUtils.getUrn("urn:li:dataProcessInstance:id-2"); + private static final DataProcessInstanceKey TEST_DPI_1_KEY = + new DataProcessInstanceKey().setId("id-1"); + private static final DataProcessInstanceProperties TEST_DPI_1_PROPERTIES = + new DataProcessInstanceProperties().setName("Test DPI").setType(DataProcessType.STREAMING); + private static final DataProcessInstanceInput TEST_DPI_1_DPI_INPUT = + new DataProcessInstanceInput().setInputs(new UrnArray(ImmutableList.of(DATASET_URN))); + private static final DataProcessInstanceOutput TEST_DPI_1_DPI_OUTPUT = + new DataProcessInstanceOutput().setOutputs(new UrnArray(ImmutableList.of(DATASET_URN))); + private static final DataProcessInstanceRelationships TEST_DPI_1_DPI_RELATIONSHIPS = + new DataProcessInstanceRelationships() + .setParentInstance(DPI_URN_REL) + .setUpstreamInstances(new UrnArray(ImmutableList.of(DPI_URN_REL))) + .setParentTemplate(DPI_URN_REL); + private static final DataProcessInstanceRunEvent TEST_DPI_1_DPI_RUN_EVENT = + new DataProcessInstanceRunEvent().setStatus(DataProcessRunStatus.COMPLETE); + private static final DataPlatformInstance TEST_DPI_1_DATA_PLATFORM_INSTANCE = + new DataPlatformInstance().setPlatform(new DataPlatformUrn("kafka")); + private static final Status TEST_DPI_1_STATUS = new Status().setRemoved(false); + private static final TestResults TEST_DPI_1_TEST_RESULTS = + new TestResults() + .setPassing( + new TestResultArray( + ImmutableList.of( + new TestResult() + .setTest(UrnUtils.getUrn("urn:li:test:123")) + .setType(TestResultType.SUCCESS)))) + .setFailing(new TestResultArray()); + private static final SubTypes TEST_DPI_1_SUB_TYPES = + new SubTypes().setTypeNames(new StringArray("subtype1")); + private static final Container TEST_DPI_1_CONTAINER = + new Container().setContainer(UrnUtils.getUrn("urn:li:container:123")); + private static final MLTrainingRunProperties ML_TRAINING_RUN_PROPERTIES = + new MLTrainingRunProperties().setId("mytrainingrun"); + + private static final String TEST_DPI_2_URN = "urn:li:dataProcessInstance:id-2"; + + @Test + public void testBatchLoadFull() throws Exception { + EntityClient client = Mockito.mock(EntityClient.class); + + Urn dpiUrn1 = Urn.createFromString(TEST_DPI_1_URN); + Urn dpiUrn2 = Urn.createFromString(TEST_DPI_2_URN); + + Map aspectMap = new HashMap<>(); + aspectMap.put( + Constants.DATA_PROCESS_INSTANCE_KEY_ASPECT_NAME, + new EnvelopedAspect().setValue(new Aspect(TEST_DPI_1_KEY.data()))); + aspectMap.put( + Constants.DATA_PROCESS_INSTANCE_PROPERTIES_ASPECT_NAME, + new EnvelopedAspect().setValue(new Aspect(TEST_DPI_1_PROPERTIES.data()))); + aspectMap.put( + Constants.DATA_PROCESS_INSTANCE_INPUT_ASPECT_NAME, + new EnvelopedAspect().setValue(new Aspect(TEST_DPI_1_DPI_INPUT.data()))); + aspectMap.put( + Constants.DATA_PROCESS_INSTANCE_OUTPUT_ASPECT_NAME, + new EnvelopedAspect().setValue(new Aspect(TEST_DPI_1_DPI_OUTPUT.data()))); + aspectMap.put( + Constants.DATA_PROCESS_INSTANCE_RELATIONSHIPS_ASPECT_NAME, + new EnvelopedAspect().setValue(new Aspect(TEST_DPI_1_DPI_RELATIONSHIPS.data()))); + aspectMap.put( + Constants.DATA_PROCESS_INSTANCE_RUN_EVENT_ASPECT_NAME, + new EnvelopedAspect().setValue(new Aspect(TEST_DPI_1_DPI_RUN_EVENT.data()))); + aspectMap.put( + Constants.DATA_PLATFORM_INSTANCE_ASPECT_NAME, + new EnvelopedAspect().setValue(new Aspect(TEST_DPI_1_DATA_PLATFORM_INSTANCE.data()))); + aspectMap.put( + Constants.STATUS_ASPECT_NAME, + new EnvelopedAspect().setValue(new Aspect(TEST_DPI_1_STATUS.data()))); + aspectMap.put( + Constants.TEST_RESULTS_ASPECT_NAME, + new EnvelopedAspect().setValue(new Aspect(TEST_DPI_1_TEST_RESULTS.data()))); + aspectMap.put( + Constants.SUB_TYPES_ASPECT_NAME, + new EnvelopedAspect().setValue(new Aspect(TEST_DPI_1_SUB_TYPES.data()))); + aspectMap.put( + Constants.CONTAINER_ASPECT_NAME, + new EnvelopedAspect().setValue(new Aspect(TEST_DPI_1_CONTAINER.data()))); + aspectMap.put( + Constants.ML_TRAINING_RUN_PROPERTIES_ASPECT_NAME, + new EnvelopedAspect().setValue(new Aspect(ML_TRAINING_RUN_PROPERTIES.data()))); + + Mockito.when( + client.batchGetV2( + any(), + Mockito.eq(Constants.DATA_PROCESS_INSTANCE_ENTITY_NAME), + Mockito.eq(new HashSet<>(ImmutableSet.of(dpiUrn1, dpiUrn2))), + Mockito.eq(DataProcessInstanceType.ASPECTS_TO_FETCH))) + .thenReturn( + ImmutableMap.of( + dpiUrn1, + new EntityResponse() + .setEntityName(Constants.DATA_PROCESS_INSTANCE_ENTITY_NAME) + .setUrn(dpiUrn1) + .setAspects(new EnvelopedAspectMap(aspectMap)))); + + FeatureFlags mockFeatureFlags = Mockito.mock(FeatureFlags.class); + Mockito.when(mockFeatureFlags.isDataProcessInstanceEntityEnabled()).thenReturn(true); + + DataProcessInstanceType type = new DataProcessInstanceType(client, mockFeatureFlags); + + QueryContext mockContext = getMockAllowContext(); + List> result = + type.batchLoad(ImmutableList.of(TEST_DPI_1_URN, TEST_DPI_2_URN), mockContext); + + // Verify response + Mockito.verify(client, Mockito.times(1)) + .batchGetV2( + any(), + Mockito.eq(Constants.DATA_PROCESS_INSTANCE_ENTITY_NAME), + Mockito.eq(ImmutableSet.of(dpiUrn1, dpiUrn2)), + Mockito.eq(DataProcessInstanceType.ASPECTS_TO_FETCH)); + + assertEquals(result.size(), 2); + + DataProcessInstance dpi1 = result.get(0).getData(); + assertEquals(dpi1.getUrn(), TEST_DPI_1_URN); + assertEquals(dpi1.getName(), "Test DPI"); + assertEquals(dpi1.getType(), EntityType.DATA_PROCESS_INSTANCE); + + // Assert second element is null + assertNull(result.get(1)); + } + + @Test + public void testBatchLoad() throws Exception { + EntityClient mockClient = Mockito.mock(EntityClient.class); + FeatureFlags mockFeatureFlags = Mockito.mock(FeatureFlags.class); + Mockito.when(mockFeatureFlags.isDataProcessInstanceEntityEnabled()).thenReturn(true); + + DataProcessInstanceType type = new DataProcessInstanceType(mockClient, mockFeatureFlags); + + List> result = + type.batchLoad(ImmutableList.of(TEST_INSTANCE_URN), getMockAllowContext()); + + assertEquals(result.size(), 1); + } + + @Test + public void testBatchLoadFeatureFlagDisabled() throws Exception { + EntityClient mockClient = Mockito.mock(EntityClient.class); + FeatureFlags mockFeatureFlags = Mockito.mock(FeatureFlags.class); + Mockito.when(mockFeatureFlags.isDataProcessInstanceEntityEnabled()).thenReturn(false); + + DataProcessInstanceType type = new DataProcessInstanceType(mockClient, mockFeatureFlags); + + List> result = + type.batchLoad(ImmutableList.of(TEST_INSTANCE_URN), getMockAllowContext()); + + assertEquals(result.size(), 0); + + Mockito.verify(mockClient, Mockito.never()) + .batchGetV2(any(), Mockito.anyString(), Mockito.anySet(), Mockito.anySet()); + } + + @Test(expectedExceptions = RuntimeException.class) + public void testBatchLoadClientException() throws Exception { + EntityClient mockClient = Mockito.mock(EntityClient.class); + FeatureFlags mockFeatureFlags = Mockito.mock(FeatureFlags.class); + Mockito.when(mockFeatureFlags.isDataProcessInstanceEntityEnabled()).thenReturn(true); + + Mockito.doThrow(RemoteInvocationException.class) + .when(mockClient) + .batchGetV2(any(), Mockito.anyString(), Mockito.anySet(), Mockito.anySet()); + + DataProcessInstanceType type = new DataProcessInstanceType(mockClient, mockFeatureFlags); + type.batchLoad(ImmutableList.of(TEST_INSTANCE_URN), getMockAllowContext()); + } + + @Test + public void testGetType() { + EntityClient mockClient = Mockito.mock(EntityClient.class); + FeatureFlags mockFeatureFlags = Mockito.mock(FeatureFlags.class); + DataProcessInstanceType type = new DataProcessInstanceType(mockClient, mockFeatureFlags); + + assertEquals(type.type(), EntityType.DATA_PROCESS_INSTANCE); + } + + @Test + public void testObjectClass() { + EntityClient mockClient = Mockito.mock(EntityClient.class); + FeatureFlags mockFeatureFlags = Mockito.mock(FeatureFlags.class); + DataProcessInstanceType type = new DataProcessInstanceType(mockClient, mockFeatureFlags); + + assertEquals(type.objectClass(), DataProcessInstance.class); + } +} diff --git a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/types/dataprocessinst/mappers/DataProcessInstanceMapperTest.java b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/types/dataprocessinst/mappers/DataProcessInstanceMapperTest.java new file mode 100644 index 0000000000000..dc1ce935ad5ec --- /dev/null +++ b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/types/dataprocessinst/mappers/DataProcessInstanceMapperTest.java @@ -0,0 +1,127 @@ +package com.linkedin.datahub.graphql.types.dataprocessinst.mappers; + +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertNotNull; + +import com.linkedin.common.DataPlatformInstance; +import com.linkedin.common.url.Url; +import com.linkedin.common.urn.Urn; +import com.linkedin.container.Container; +import com.linkedin.data.template.RecordTemplate; +import com.linkedin.data.template.StringArray; +import com.linkedin.datahub.graphql.generated.DataProcessInstance; +import com.linkedin.datahub.graphql.generated.EntityType; +import com.linkedin.dataprocess.DataProcessInstanceProperties; +import com.linkedin.entity.Aspect; +import com.linkedin.entity.EntityResponse; +import com.linkedin.entity.EnvelopedAspect; +import com.linkedin.entity.EnvelopedAspectMap; +import com.linkedin.metadata.Constants; +import com.linkedin.ml.metadata.MLTrainingRunProperties; +import java.util.HashMap; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.Test; + +public class DataProcessInstanceMapperTest { + + private static final String TEST_PLATFORM_URN = "urn:li:dataPlatform:kafka"; + private static final String TEST_INSTANCE_URN = + "urn:li:dataProcessInstance:(test-workflow,test-instance)"; + private static final String TEST_CONTAINER_URN = "urn:li:container:testContainer"; + private static final String TEST_EXTERNAL_URL = "https://example.com/process"; + private static final String TEST_NAME = "Test Process Instance"; + + private EntityResponse entityResponse; + private Urn urn; + + @BeforeMethod + public void setup() throws Exception { + urn = Urn.createFromString(TEST_INSTANCE_URN); + entityResponse = new EntityResponse(); + entityResponse.setUrn(urn); + entityResponse.setAspects(new EnvelopedAspectMap(new HashMap<>())); + } + + @Test + public void testMapBasicFields() throws Exception { + DataProcessInstance instance = DataProcessInstanceMapper.map(null, entityResponse); + + assertNotNull(instance); + assertEquals(instance.getUrn(), urn.toString()); + assertEquals(instance.getType(), EntityType.DATA_PROCESS_INSTANCE); + } + + @Test + public void testMapDataProcessProperties() throws Exception { + // Create DataProcessInstanceProperties + DataProcessInstanceProperties properties = new DataProcessInstanceProperties(); + properties.setName(TEST_NAME); + properties.setExternalUrl(new Url(TEST_EXTERNAL_URL)); + + // Add properties aspect + addAspect(Constants.DATA_PROCESS_INSTANCE_PROPERTIES_ASPECT_NAME, properties); + + DataProcessInstance instance = DataProcessInstanceMapper.map(null, entityResponse); + + assertNotNull(instance.getProperties()); + assertEquals(instance.getName(), TEST_NAME); + assertEquals(instance.getExternalUrl(), TEST_EXTERNAL_URL); + } + + @Test + public void testMapPlatformInstance() throws Exception { + // Create DataPlatformInstance + DataPlatformInstance platformInstance = new DataPlatformInstance(); + platformInstance.setPlatform(Urn.createFromString(TEST_PLATFORM_URN)); + + // Add platform instance aspect + addAspect(Constants.DATA_PLATFORM_INSTANCE_ASPECT_NAME, platformInstance); + + DataProcessInstance instance = DataProcessInstanceMapper.map(null, entityResponse); + + assertNotNull(instance.getDataPlatformInstance()); + assertNotNull(instance.getPlatform()); + assertEquals(instance.getPlatform().getUrn(), TEST_PLATFORM_URN); + assertEquals(instance.getPlatform().getType(), EntityType.DATA_PLATFORM); + } + + @Test + public void testMapContainer() throws Exception { + // Create Container aspect + Container container = new Container(); + container.setContainer(Urn.createFromString(TEST_CONTAINER_URN)); + + // Add container aspect + addAspect(Constants.CONTAINER_ASPECT_NAME, container); + + DataProcessInstance instance = DataProcessInstanceMapper.map(null, entityResponse); + + assertNotNull(instance.getContainer()); + assertEquals(instance.getContainer().getUrn(), TEST_CONTAINER_URN); + assertEquals(instance.getContainer().getType(), EntityType.CONTAINER); + } + + @Test + public void testMapMLTrainingProperties() throws Exception { + // Create MLTrainingRunProperties + MLTrainingRunProperties trainingProperties = new MLTrainingRunProperties(); + trainingProperties.setId("test-run-id"); + trainingProperties.setOutputUrls(new StringArray("s3://test-bucket/model")); + + // Add ML training properties aspect + addAspect(Constants.ML_TRAINING_RUN_PROPERTIES_ASPECT_NAME, trainingProperties); + + DataProcessInstance instance = DataProcessInstanceMapper.map(null, entityResponse); + + assertNotNull(instance); + assertEquals(instance.getMlTrainingRunProperties().getId(), "test-run-id"); + assertEquals( + instance.getMlTrainingRunProperties().getOutputUrls().get(0), "s3://test-bucket/model"); + } + + private void addAspect(String aspectName, RecordTemplate aspect) { + EnvelopedAspect envelopedAspect = new EnvelopedAspect(); + envelopedAspect.setValue(new Aspect(aspect.data())); + entityResponse.getAspects().put(aspectName, envelopedAspect); + } +} diff --git a/datahub-web-react/src/app/buildEntityRegistry.ts b/datahub-web-react/src/app/buildEntityRegistry.ts index 181ec7d328a58..b7ff97b3a0746 100644 --- a/datahub-web-react/src/app/buildEntityRegistry.ts +++ b/datahub-web-react/src/app/buildEntityRegistry.ts @@ -25,6 +25,7 @@ import { RestrictedEntity } from './entity/restricted/RestrictedEntity'; import { BusinessAttributeEntity } from './entity/businessAttribute/BusinessAttributeEntity'; import { SchemaFieldPropertiesEntity } from './entity/schemaField/SchemaFieldPropertiesEntity'; import { StructuredPropertyEntity } from './entity/structuredProperty/StructuredPropertyEntity'; +import { DataProcessInstanceEntity } from './entity/dataProcessInstance/DataProcessInstanceEntity'; export default function buildEntityRegistry() { const registry = new EntityRegistry(); @@ -54,5 +55,6 @@ export default function buildEntityRegistry() { registry.register(new BusinessAttributeEntity()); registry.register(new SchemaFieldPropertiesEntity()); registry.register(new StructuredPropertyEntity()); + registry.register(new DataProcessInstanceEntity()); return registry; } diff --git a/datahub-web-react/src/app/entity/dataProcessInstance/DataProcessInstanceEntity.tsx b/datahub-web-react/src/app/entity/dataProcessInstance/DataProcessInstanceEntity.tsx new file mode 100644 index 0000000000000..4834a026ad94a --- /dev/null +++ b/datahub-web-react/src/app/entity/dataProcessInstance/DataProcessInstanceEntity.tsx @@ -0,0 +1,264 @@ +import React from 'react'; +import { ApiOutlined } from '@ant-design/icons'; +import { + DataProcessInstance, + Entity as GeneratedEntity, + EntityType, + OwnershipType, + SearchResult, +} from '../../../types.generated'; +import { Preview } from './preview/Preview'; +import { Entity, EntityCapabilityType, IconStyleType, PreviewType } from '../Entity'; +import { EntityProfile } from '../shared/containers/profile/EntityProfile'; +import { useGetDataProcessInstanceQuery } from '../../../graphql/dataProcessInstance.generated'; +import { PropertiesTab } from '../shared/tabs/Properties/PropertiesTab'; +import { LineageTab } from '../shared/tabs/Lineage/LineageTab'; +import { SidebarAboutSection } from '../shared/containers/profile/sidebar/AboutSection/SidebarAboutSection'; +import { SidebarTagsSection } from '../shared/containers/profile/sidebar/SidebarTagsSection'; +import { SidebarOwnerSection } from '../shared/containers/profile/sidebar/Ownership/sidebar/SidebarOwnerSection'; +import { GenericEntityProperties } from '../shared/types'; +import { getDataForEntityType } from '../shared/containers/profile/utils'; +import { SidebarDomainSection } from '../shared/containers/profile/sidebar/Domain/SidebarDomainSection'; +import { EntityMenuItems } from '../shared/EntityDropdown/EntityDropdown'; +import { capitalizeFirstLetterOnly } from '../../shared/textUtil'; +import DataProductSection from '../shared/containers/profile/sidebar/DataProduct/DataProductSection'; +import { getDataProduct } from '../shared/utils'; +// import SummaryTab from './profile/DataProcessInstaceSummary'; + +// const getProcessPlatformName = (data?: DataProcessInstance): string => { +// return ( +// data?.dataPlatformInstance?.platform?.properties?.displayName || +// capitalizeFirstLetterOnly(data?.dataPlatformInstance?.platform?.name) || +// '' +// ); +// }; + +const getParentEntities = (data: DataProcessInstance): GeneratedEntity[] => { + const parentEntity = data?.relationships?.relationships?.find( + (rel) => rel.type === 'InstanceOf' && rel.entity?.type === EntityType.DataJob, + ); + + if (!parentEntity?.entity) return []; + + // Convert to GeneratedEntity + return [ + { + type: parentEntity.entity.type, + urn: (parentEntity.entity as any).urn, // Make sure urn exists + relationships: (parentEntity.entity as any).relationships, + }, + ]; +}; +/** + * Definition of the DataHub DataProcessInstance entity. + */ +export class DataProcessInstanceEntity implements Entity { + type: EntityType = EntityType.DataProcessInstance; + + icon = (fontSize: number, styleType: IconStyleType, color?: string) => { + if (styleType === IconStyleType.TAB_VIEW) { + return ; + } + + if (styleType === IconStyleType.HIGHLIGHT) { + return ; + } + + return ( + + ); + }; + + isSearchEnabled = () => true; + + isBrowseEnabled = () => true; + + isLineageEnabled = () => true; + + getAutoCompleteFieldName = () => 'name'; + + getPathName = () => 'dataProcessInstance'; + + getEntityName = () => 'Process Instance'; + + getGraphName = () => 'dataProcessInstance'; + + getCollectionName = () => 'Process Instances'; + + useEntityQuery = useGetDataProcessInstanceQuery; + + renderProfile = (urn: string) => ( + { + // const activeIncidentCount = processInstance?.dataProcessInstance?.activeIncidents.total; + // return `Incidents${(activeIncidentCount && ` (${activeIncidentCount})`) || ''}`; + // }, + // }, + ]} + sidebarSections={this.getSidebarSections()} + /> + ); + + getSidebarSections = () => [ + { + component: SidebarAboutSection, + }, + { + component: SidebarOwnerSection, + properties: { + defaultOwnerType: OwnershipType.TechnicalOwner, + }, + }, + { + component: SidebarTagsSection, + properties: { + hasTags: true, + hasTerms: true, + }, + }, + { + component: SidebarDomainSection, + }, + { + component: DataProductSection, + }, + ]; + + getOverridePropertiesFromEntity = (processInstance?: DataProcessInstance | null): GenericEntityProperties => { + const name = processInstance?.name; + const externalUrl = processInstance?.externalUrl; + return { + name, + externalUrl, + }; + }; + + renderPreview = (_: PreviewType, data: DataProcessInstance) => { + const genericProperties = this.getGenericEntityProperties(data); + const parentEntities = getParentEntities(data); + return ( + + ); + }; + + renderSearch = (result: SearchResult) => { + const data = result.entity as DataProcessInstance; + const genericProperties = this.getGenericEntityProperties(data); + const parentEntities = getParentEntities(data); + return ( + + ); + }; + + getLineageVizConfig = (entity: DataProcessInstance) => { + return { + urn: entity?.urn, + name: this.displayName(entity), + type: EntityType.DataProcessInstance, + subtype: entity?.subTypes?.typeNames?.[0], + icon: entity?.platform?.properties?.logoUrl || undefined, + platform: entity?.platform, + container: entity?.container, + // health: entity?.health || undefined, + }; + }; + + displayName = (data: DataProcessInstance) => { + return data.properties?.name || data.urn; + }; + + getGenericEntityProperties = (data: DataProcessInstance) => { + return getDataForEntityType({ + data, + entityType: this.type, + getOverrideProperties: this.getOverridePropertiesFromEntity, + }); + }; + + supportedCapabilities = () => { + return new Set([ + EntityCapabilityType.OWNERS, + EntityCapabilityType.GLOSSARY_TERMS, + EntityCapabilityType.TAGS, + EntityCapabilityType.DOMAINS, + EntityCapabilityType.DEPRECATION, + EntityCapabilityType.SOFT_DELETE, + EntityCapabilityType.DATA_PRODUCTS, + ]); + }; +} diff --git a/datahub-web-react/src/app/entity/dataProcessInstance/preview/Preview.tsx b/datahub-web-react/src/app/entity/dataProcessInstance/preview/Preview.tsx new file mode 100644 index 0000000000000..3a3b0340695d9 --- /dev/null +++ b/datahub-web-react/src/app/entity/dataProcessInstance/preview/Preview.tsx @@ -0,0 +1,103 @@ +import React from 'react'; +import { + DataProduct, + Deprecation, + Domain, + Entity as GeneratedEntity, + EntityPath, + EntityType, + GlobalTags, + Health, + Owner, + SearchInsight, + Container, + ParentContainersResult, +} from '../../../../types.generated'; +import DefaultPreviewCard from '../../../preview/DefaultPreviewCard'; +import { useEntityRegistry } from '../../../useEntityRegistry'; +import { IconStyleType } from '../../Entity'; + +export const Preview = ({ + urn, + name, + subType, + description, + platformName, + platformLogo, + platformInstanceId, + container, + owners, + domain, + dataProduct, + deprecation, + globalTags, + snippet, + insights, + externalUrl, + degree, + paths, + health, + parentEntities, + parentContainers, +}: // duration, +// status, +// startTime, +{ + urn: string; + name: string; + subType?: string | null; + description?: string | null; + platformName?: string; + platformLogo?: string | null; + platformInstanceId?: string; + container?: Container; + owners?: Array | null; + domain?: Domain | null; + dataProduct?: DataProduct | null; + deprecation?: Deprecation | null; + globalTags?: GlobalTags | null; + snippet?: React.ReactNode | null; + insights?: Array | null; + externalUrl?: string | null; + degree?: number; + paths?: EntityPath[]; + health?: Health[] | null; + parentEntities?: Array | null; + parentContainers?: ParentContainersResult | null; + // duration?: number | null; + // status?: string | null; + // startTime?: number | null; +}): JSX.Element => { + const entityRegistry = useEntityRegistry(); + return ( + + ); +}; diff --git a/datahub-web-react/src/graphql/dataProcessInstance.graphql b/datahub-web-react/src/graphql/dataProcessInstance.graphql new file mode 100644 index 0000000000000..8f55ca4903d52 --- /dev/null +++ b/datahub-web-react/src/graphql/dataProcessInstance.graphql @@ -0,0 +1,181 @@ +fragment processInstanceRelationshipResults on EntityRelationshipsResult { + start + count + total + relationships { + type + direction + entity { + urn + type + ... on Dataset { + name + properties { + name + description + qualifiedName + } + editableProperties { + description + } + platform { + ...platformFields + } + subTypes { + typeNames + } + status { + removed + } + } + ... on DataJob { + urn + type + dataFlow { + ...nonRecursiveDataFlowFields + } + jobId + properties { + name + description + externalUrl + customProperties { + key + value + } + } + deprecation { + ...deprecationFields + } + dataPlatformInstance { + ...dataPlatformInstanceFields + } + subTypes { + typeNames + } + editableProperties { + description + } + status { + removed + } + } + } + } +} + +fragment dataProcessInstanceFields on DataProcessInstance { + urn + type + platform { + ...platformFields + } + parentContainers { + ...parentContainersFields + } + container { + ...entityContainer + } + subTypes { + typeNames + } + properties { + name + createdTS: created { + time + actor + } + customProperties { + key + value + } + } + mlTrainingRunProperties { + outputUrls + trainingMetrics { + name + description + value + } + hyperParams { + name + description + value + } + } + dataPlatformInstance { + ...dataPlatformInstanceFields + } + state(startTimeMillis: null, endTimeMillis: null, limit: 1) { + status + attempt + result { + resultType + nativeResultType + } + timestampMillis + durationMillis + } + relationships(input: { types: ["InstanceOf", "Consumes", "Produces"], direction: OUTGOING, start: 0, count: 50 }) { + ...processInstanceRelationshipResults + } +} + +query getDataProcessInstance($urn: String!) { + dataProcessInstance(urn: $urn) { + urn + type + platform { + ...platformFields + } + parentContainers { + ...parentContainersFields + } + subTypes { + typeNames + } + container { + ...entityContainer + } + name + properties { + name + created { + time + actor + } + } + mlTrainingRunProperties { + id + outputUrls + trainingMetrics { + name + description + value + } + hyperParams { + name + description + value + } + } + relationships( + input: { types: ["InstanceOf", "Consumes", "Produces"], direction: OUTGOING, start: 0, count: 50 } + ) { + ...processInstanceRelationshipResults + } + dataPlatformInstance { + ...dataPlatformInstanceFields + } + state(startTimeMillis: null, endTimeMillis: null, limit: 1) { + status + attempt + result { + resultType + nativeResultType + } + timestampMillis + durationMillis + } + } +} diff --git a/datahub-web-react/src/graphql/fragments.graphql b/datahub-web-react/src/graphql/fragments.graphql index 68c57c5cb5db5..ecac299748935 100644 --- a/datahub-web-react/src/graphql/fragments.graphql +++ b/datahub-web-react/src/graphql/fragments.graphql @@ -863,8 +863,17 @@ fragment nonRecursiveMLModel on MLModel { ...ownershipFields } properties { + name description date + created { + time + actor + } + lastModified { + time + actor + } externalUrl version type @@ -956,7 +965,12 @@ fragment nonRecursiveMLModelGroupFields on MLModelGroup { ...deprecationFields } properties { + name description + created { + time + actor + } } browsePathV2 { ...browsePathV2Fields diff --git a/datahub-web-react/src/graphql/lineage.graphql b/datahub-web-react/src/graphql/lineage.graphql index ee05811cbb72d..457936ed62cd2 100644 --- a/datahub-web-react/src/graphql/lineage.graphql +++ b/datahub-web-react/src/graphql/lineage.graphql @@ -259,6 +259,9 @@ fragment lineageNodeProperties on EntityWithRelationships { name description origin + tags { + ...globalTagsFields + } platform { ...platformFields } @@ -268,6 +271,19 @@ fragment lineageNodeProperties on EntityWithRelationships { status { removed } + properties { + createdTS: created { + time + actor + } + customProperties { + key + value + } + } + editableProperties { + description + } structuredProperties { properties { ...structuredPropertiesFields @@ -328,6 +344,9 @@ fragment lineageNodeProperties on EntityWithRelationships { urn type } + ... on DataProcessInstance { + ...dataProcessInstanceFields + } } fragment lineageFields on EntityWithRelationships { diff --git a/datahub-web-react/src/graphql/mlModelGroup.graphql b/datahub-web-react/src/graphql/mlModelGroup.graphql index 81ab65d0b9a08..4f11ed4984d37 100644 --- a/datahub-web-react/src/graphql/mlModelGroup.graphql +++ b/datahub-web-react/src/graphql/mlModelGroup.graphql @@ -2,6 +2,18 @@ query getMLModelGroup($urn: String!) { mlModelGroup(urn: $urn) { urn type + properties { + name + description + created { + time + actor + } + lastModified { + time + actor + } + } ...nonRecursiveMLModelGroupFields incoming: relationships( input: { diff --git a/li-utils/src/main/java/com/linkedin/metadata/Constants.java b/li-utils/src/main/java/com/linkedin/metadata/Constants.java index 42080e4e17596..01c33a2530efb 100644 --- a/li-utils/src/main/java/com/linkedin/metadata/Constants.java +++ b/li-utils/src/main/java/com/linkedin/metadata/Constants.java @@ -421,6 +421,10 @@ public class Constants { "dataProcessInstanceRunEvent"; public static final String DATA_PROCESS_INSTANCE_RELATIONSHIPS_ASPECT_NAME = "dataProcessInstanceRelationships"; + public static final String DATA_PROCESS_INSTANCE_INPUT_ASPECT_NAME = "dataProcessInstanceInput"; + public static final String DATA_PROCESS_INSTANCE_OUTPUT_ASPECT_NAME = "dataProcessInstanceOutput"; + public static final String DATA_PROCESS_INSTANCE_KEY_ASPECT_NAME = "dataProcessInstanceKey"; + public static final String ML_TRAINING_RUN_PROPERTIES_ASPECT_NAME = "mlTrainingRunProperties"; // Business Attribute public static final String BUSINESS_ATTRIBUTE_KEY_ASPECT_NAME = "businessAttributeKey"; diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/query/request/SearchRequestHandlerTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/query/request/SearchRequestHandlerTest.java index e51511699e345..1a91ae35c6595 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/query/request/SearchRequestHandlerTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/query/request/SearchRequestHandlerTest.java @@ -772,6 +772,11 @@ public void testQueryByDefault() { EntityType.SCHEMA_FIELD, Stream.concat(COMMON.stream(), Stream.of("schemaFieldAliases", "parent")) .collect(Collectors.toSet())) + .put( + EntityType.DATA_PROCESS_INSTANCE, + Stream.concat( + COMMON.stream(), Stream.of("parentInstance", "parentTemplate", "status")) + .collect(Collectors.toSet())) .build(); for (EntityType entityType : SEARCHABLE_ENTITY_TYPES) { diff --git a/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/MLModelGroupProperties.pdl b/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/MLModelGroupProperties.pdl index 81c5e7a240f61..b9e364bee8c65 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/MLModelGroupProperties.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/MLModelGroupProperties.pdl @@ -12,7 +12,7 @@ import com.linkedin.common.TimeStamp @Aspect = { "name": "mlModelGroupProperties" } -record MLModelGroupProperties includes CustomProperties { +record MLModelGroupProperties includes CustomProperties, MLModelLineageInfo { /** * Display name of the MLModelGroup @@ -50,18 +50,6 @@ record MLModelGroupProperties includes CustomProperties { */ lastModified: optional TimeStamp - /** - * List of jobs (if any) used to train the model group. Visible in Lineage. - */ - @Relationship = { - "/*": { - "name": "TrainedBy", - "entityTypes": [ "dataJob" ], - "isLineage": true - } - } - trainingJobs: optional array[Urn] - /** * Version of the MLModelGroup */ diff --git a/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/MLModelLineageInfo.pdl b/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/MLModelLineageInfo.pdl new file mode 100644 index 0000000000000..4c17d6e6ab1a0 --- /dev/null +++ b/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/MLModelLineageInfo.pdl @@ -0,0 +1,35 @@ +namespace com.linkedin.ml.metadata +import com.linkedin.common.Urn + + +/** +* A set of re-usable fields used to capture lineage information for ML Models and ML Model Groups +*/ +record MLModelLineageInfo { + + /** + * List of jobs or process instances (if any) used to train the model or group. Visible in Lineage. Note that ML Models can also be specified as the output of a specific Data Process Instances (runs) via the DataProcessInstanceOutputs aspect. + */ + @Relationship = { + "/*": { + "name": "TrainedBy", + "entityTypes": [ "dataJob", "dataProcessInstance" ], + "isLineage": true + } + } + trainingJobs: optional array[Urn] + + /** + * List of jobs or process instances (if any) that use the model or group. + */ + @Relationship = { + "/*": { + "name": "UsedBy", + "entityTypes": [ "dataJob", "dataProcessInstance" ], + "isLineage": true, + "isUpstream": false + } + } + downstreamJobs: optional array[Urn] + +} \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/MLModelProperties.pdl b/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/MLModelProperties.pdl index d89d07384bba1..ac10e0add13a1 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/MLModelProperties.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/MLModelProperties.pdl @@ -14,7 +14,7 @@ import com.linkedin.common.TimeStamp @Aspect = { "name": "mlModelProperties" } -record MLModelProperties includes CustomProperties, ExternalReference { +record MLModelProperties includes CustomProperties, ExternalReference, MLModelLineageInfo { /** * Display name of the MLModel @@ -116,31 +116,6 @@ record MLModelProperties includes CustomProperties, ExternalReference { } deployments: optional array[Urn] - /** - * List of jobs (if any) used to train the model. Visible in Lineage. Note that ML Models can also be specified as the output of a specific Data Process Instances (runs) via the DataProcessInstanceOutputs aspect. - */ - @Relationship = { - "/*": { - "name": "TrainedBy", - "entityTypes": [ "dataJob", "dataProcessInstance" ], - "isLineage": true - } - } - trainingJobs: optional array[Urn] - - /** - * List of jobs (if any) that use the model - */ - @Relationship = { - "/*": { - "name": "UsedBy", - "entityTypes": [ "dataJob" ], - "isLineage": true, - "isUpstream": false - } - } - downstreamJobs: optional array[Urn] - /** * Groups the model belongs to */ diff --git a/metadata-service/configuration/src/main/java/com/linkedin/datahub/graphql/featureflags/FeatureFlags.java b/metadata-service/configuration/src/main/java/com/linkedin/datahub/graphql/featureflags/FeatureFlags.java index 28abb26be1f52..97ca0dcabea9f 100644 --- a/metadata-service/configuration/src/main/java/com/linkedin/datahub/graphql/featureflags/FeatureFlags.java +++ b/metadata-service/configuration/src/main/java/com/linkedin/datahub/graphql/featureflags/FeatureFlags.java @@ -25,4 +25,5 @@ public class FeatureFlags { private boolean showSeparateSiblings = false; private boolean alternateMCPValidation = false; private boolean showManageStructuredProperties = false; + private boolean dataProcessInstanceEntityEnabled = true; } diff --git a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.aspects.snapshot.json b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.aspects.snapshot.json index 1c713fd33884b..432c4a9ddcb73 100644 --- a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.aspects.snapshot.json +++ b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.aspects.snapshot.json @@ -3827,7 +3827,43 @@ "name" : "MLModelProperties", "namespace" : "com.linkedin.ml.metadata", "doc" : "Properties associated with a ML Model\r", - "include" : [ "com.linkedin.common.CustomProperties", "com.linkedin.common.ExternalReference" ], + "include" : [ "com.linkedin.common.CustomProperties", "com.linkedin.common.ExternalReference", { + "type" : "record", + "name" : "MLModelLineageInfo", + "doc" : "A set of re-usable fields used to capture lineage information for ML Models and ML Model Groups\r", + "fields" : [ { + "name" : "trainingJobs", + "type" : { + "type" : "array", + "items" : "com.linkedin.common.Urn" + }, + "doc" : "List of jobs or process instances (if any) used to train the model or group. Visible in Lineage. Note that ML Models can also be specified as the output of a specific Data Process Instances (runs) via the DataProcessInstanceOutputs aspect.\r", + "optional" : true, + "Relationship" : { + "/*" : { + "entityTypes" : [ "dataJob", "dataProcessInstance" ], + "isLineage" : true, + "name" : "TrainedBy" + } + } + }, { + "name" : "downstreamJobs", + "type" : { + "type" : "array", + "items" : "com.linkedin.common.Urn" + }, + "doc" : "List of jobs or process instances (if any) that use the model or group.\r", + "optional" : true, + "Relationship" : { + "/*" : { + "entityTypes" : [ "dataJob", "dataProcessInstance" ], + "isLineage" : true, + "isUpstream" : false, + "name" : "UsedBy" + } + } + } ] + } ], "fields" : [ { "name" : "name", "type" : "string", @@ -4005,37 +4041,6 @@ "name" : "DeployedTo" } } - }, { - "name" : "trainingJobs", - "type" : { - "type" : "array", - "items" : "com.linkedin.common.Urn" - }, - "doc" : "List of jobs (if any) used to train the model. Visible in Lineage. Note that ML Models can also be specified as the output of a specific Data Process Instances (runs) via the DataProcessInstanceOutputs aspect.\r", - "optional" : true, - "Relationship" : { - "/*" : { - "entityTypes" : [ "dataJob", "dataProcessInstance" ], - "isLineage" : true, - "name" : "TrainedBy" - } - } - }, { - "name" : "downstreamJobs", - "type" : { - "type" : "array", - "items" : "com.linkedin.common.Urn" - }, - "doc" : "List of jobs (if any) that use the model\r", - "optional" : true, - "Relationship" : { - "/*" : { - "entityTypes" : [ "dataJob" ], - "isLineage" : true, - "isUpstream" : false, - "name" : "UsedBy" - } - } }, { "name" : "groups", "type" : { @@ -4213,7 +4218,7 @@ }, "doc" : "The order to sort the results i.e. ASCENDING or DESCENDING" } ] - }, "com.linkedin.metadata.query.filter.SortOrder", "com.linkedin.ml.metadata.BaseData", "com.linkedin.ml.metadata.CaveatDetails", "com.linkedin.ml.metadata.CaveatsAndRecommendations", "com.linkedin.ml.metadata.EthicalConsiderations", "com.linkedin.ml.metadata.EvaluationData", "com.linkedin.ml.metadata.HyperParameterValueType", "com.linkedin.ml.metadata.IntendedUse", "com.linkedin.ml.metadata.IntendedUserType", "com.linkedin.ml.metadata.MLFeatureProperties", "com.linkedin.ml.metadata.MLHyperParam", "com.linkedin.ml.metadata.MLMetric", "com.linkedin.ml.metadata.MLModelFactorPrompts", "com.linkedin.ml.metadata.MLModelFactors", "com.linkedin.ml.metadata.MLModelProperties", "com.linkedin.ml.metadata.Metrics", "com.linkedin.ml.metadata.QuantitativeAnalyses", "com.linkedin.ml.metadata.ResultsType", "com.linkedin.ml.metadata.SourceCode", "com.linkedin.ml.metadata.SourceCodeUrl", "com.linkedin.ml.metadata.SourceCodeUrlType", "com.linkedin.ml.metadata.TrainingData", "com.linkedin.mxe.GenericAspect", { + }, "com.linkedin.metadata.query.filter.SortOrder", "com.linkedin.ml.metadata.BaseData", "com.linkedin.ml.metadata.CaveatDetails", "com.linkedin.ml.metadata.CaveatsAndRecommendations", "com.linkedin.ml.metadata.EthicalConsiderations", "com.linkedin.ml.metadata.EvaluationData", "com.linkedin.ml.metadata.HyperParameterValueType", "com.linkedin.ml.metadata.IntendedUse", "com.linkedin.ml.metadata.IntendedUserType", "com.linkedin.ml.metadata.MLFeatureProperties", "com.linkedin.ml.metadata.MLHyperParam", "com.linkedin.ml.metadata.MLMetric", "com.linkedin.ml.metadata.MLModelFactorPrompts", "com.linkedin.ml.metadata.MLModelFactors", "com.linkedin.ml.metadata.MLModelLineageInfo", "com.linkedin.ml.metadata.MLModelProperties", "com.linkedin.ml.metadata.Metrics", "com.linkedin.ml.metadata.QuantitativeAnalyses", "com.linkedin.ml.metadata.ResultsType", "com.linkedin.ml.metadata.SourceCode", "com.linkedin.ml.metadata.SourceCodeUrl", "com.linkedin.ml.metadata.SourceCodeUrlType", "com.linkedin.ml.metadata.TrainingData", "com.linkedin.mxe.GenericAspect", { "type" : "record", "name" : "MetadataChangeProposal", "namespace" : "com.linkedin.mxe", diff --git a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entities.snapshot.json b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entities.snapshot.json index 77d4644f3c121..45e91873de10f 100644 --- a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entities.snapshot.json +++ b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entities.snapshot.json @@ -3985,7 +3985,43 @@ "name" : "MLModelProperties", "namespace" : "com.linkedin.ml.metadata", "doc" : "Properties associated with a ML Model\r", - "include" : [ "com.linkedin.common.CustomProperties", "com.linkedin.common.ExternalReference" ], + "include" : [ "com.linkedin.common.CustomProperties", "com.linkedin.common.ExternalReference", { + "type" : "record", + "name" : "MLModelLineageInfo", + "doc" : "A set of re-usable fields used to capture lineage information for ML Models and ML Model Groups\r", + "fields" : [ { + "name" : "trainingJobs", + "type" : { + "type" : "array", + "items" : "com.linkedin.common.Urn" + }, + "doc" : "List of jobs or process instances (if any) used to train the model or group. Visible in Lineage. Note that ML Models can also be specified as the output of a specific Data Process Instances (runs) via the DataProcessInstanceOutputs aspect.\r", + "optional" : true, + "Relationship" : { + "/*" : { + "entityTypes" : [ "dataJob", "dataProcessInstance" ], + "isLineage" : true, + "name" : "TrainedBy" + } + } + }, { + "name" : "downstreamJobs", + "type" : { + "type" : "array", + "items" : "com.linkedin.common.Urn" + }, + "doc" : "List of jobs or process instances (if any) that use the model or group.\r", + "optional" : true, + "Relationship" : { + "/*" : { + "entityTypes" : [ "dataJob", "dataProcessInstance" ], + "isLineage" : true, + "isUpstream" : false, + "name" : "UsedBy" + } + } + } ] + } ], "fields" : [ { "name" : "name", "type" : "string", @@ -4163,37 +4199,6 @@ "name" : "DeployedTo" } } - }, { - "name" : "trainingJobs", - "type" : { - "type" : "array", - "items" : "com.linkedin.common.Urn" - }, - "doc" : "List of jobs (if any) used to train the model. Visible in Lineage. Note that ML Models can also be specified as the output of a specific Data Process Instances (runs) via the DataProcessInstanceOutputs aspect.\r", - "optional" : true, - "Relationship" : { - "/*" : { - "entityTypes" : [ "dataJob", "dataProcessInstance" ], - "isLineage" : true, - "name" : "TrainedBy" - } - } - }, { - "name" : "downstreamJobs", - "type" : { - "type" : "array", - "items" : "com.linkedin.common.Urn" - }, - "doc" : "List of jobs (if any) that use the model\r", - "optional" : true, - "Relationship" : { - "/*" : { - "entityTypes" : [ "dataJob" ], - "isLineage" : true, - "isUpstream" : false, - "name" : "UsedBy" - } - } }, { "name" : "groups", "type" : { @@ -5004,7 +5009,7 @@ "name" : "MLModelGroupProperties", "namespace" : "com.linkedin.ml.metadata", "doc" : "Properties associated with an ML Model Group\r", - "include" : [ "com.linkedin.common.CustomProperties" ], + "include" : [ "com.linkedin.common.CustomProperties", "MLModelLineageInfo" ], "fields" : [ { "name" : "name", "type" : "string", @@ -5041,21 +5046,6 @@ "type" : "com.linkedin.common.TimeStamp", "doc" : "Date when the MLModelGroup was last modified\r", "optional" : true - }, { - "name" : "trainingJobs", - "type" : { - "type" : "array", - "items" : "com.linkedin.common.Urn" - }, - "doc" : "List of jobs (if any) used to train the model group. Visible in Lineage.\r", - "optional" : true, - "Relationship" : { - "/*" : { - "entityTypes" : [ "dataJob" ], - "isLineage" : true, - "name" : "TrainedBy" - } - } }, { "name" : "version", "type" : "com.linkedin.common.VersionTag", @@ -6700,7 +6690,7 @@ "type" : "int", "doc" : "The total number of entities directly under searched path" } ] - }, "com.linkedin.metadata.search.SearchResultMetadata", "com.linkedin.metadata.search.SearchSuggestion", "com.linkedin.metadata.snapshot.ChartSnapshot", "com.linkedin.metadata.snapshot.CorpGroupSnapshot", "com.linkedin.metadata.snapshot.CorpUserSnapshot", "com.linkedin.metadata.snapshot.DashboardSnapshot", "com.linkedin.metadata.snapshot.DataFlowSnapshot", "com.linkedin.metadata.snapshot.DataHubPolicySnapshot", "com.linkedin.metadata.snapshot.DataHubRetentionSnapshot", "com.linkedin.metadata.snapshot.DataJobSnapshot", "com.linkedin.metadata.snapshot.DataPlatformSnapshot", "com.linkedin.metadata.snapshot.DataProcessSnapshot", "com.linkedin.metadata.snapshot.DatasetSnapshot", "com.linkedin.metadata.snapshot.GlossaryNodeSnapshot", "com.linkedin.metadata.snapshot.GlossaryTermSnapshot", "com.linkedin.metadata.snapshot.MLFeatureSnapshot", "com.linkedin.metadata.snapshot.MLFeatureTableSnapshot", "com.linkedin.metadata.snapshot.MLModelDeploymentSnapshot", "com.linkedin.metadata.snapshot.MLModelGroupSnapshot", "com.linkedin.metadata.snapshot.MLModelSnapshot", "com.linkedin.metadata.snapshot.MLPrimaryKeySnapshot", "com.linkedin.metadata.snapshot.SchemaFieldSnapshot", "com.linkedin.metadata.snapshot.Snapshot", "com.linkedin.metadata.snapshot.TagSnapshot", "com.linkedin.ml.metadata.BaseData", "com.linkedin.ml.metadata.CaveatDetails", "com.linkedin.ml.metadata.CaveatsAndRecommendations", "com.linkedin.ml.metadata.DeploymentStatus", "com.linkedin.ml.metadata.EthicalConsiderations", "com.linkedin.ml.metadata.EvaluationData", "com.linkedin.ml.metadata.HyperParameterValueType", "com.linkedin.ml.metadata.IntendedUse", "com.linkedin.ml.metadata.IntendedUserType", "com.linkedin.ml.metadata.MLFeatureProperties", "com.linkedin.ml.metadata.MLFeatureTableProperties", "com.linkedin.ml.metadata.MLHyperParam", "com.linkedin.ml.metadata.MLMetric", "com.linkedin.ml.metadata.MLModelDeploymentProperties", "com.linkedin.ml.metadata.MLModelFactorPrompts", "com.linkedin.ml.metadata.MLModelFactors", "com.linkedin.ml.metadata.MLModelGroupProperties", "com.linkedin.ml.metadata.MLModelProperties", "com.linkedin.ml.metadata.MLPrimaryKeyProperties", "com.linkedin.ml.metadata.Metrics", "com.linkedin.ml.metadata.QuantitativeAnalyses", "com.linkedin.ml.metadata.ResultsType", "com.linkedin.ml.metadata.SourceCode", "com.linkedin.ml.metadata.SourceCodeUrl", "com.linkedin.ml.metadata.SourceCodeUrlType", "com.linkedin.ml.metadata.TrainingData", { + }, "com.linkedin.metadata.search.SearchResultMetadata", "com.linkedin.metadata.search.SearchSuggestion", "com.linkedin.metadata.snapshot.ChartSnapshot", "com.linkedin.metadata.snapshot.CorpGroupSnapshot", "com.linkedin.metadata.snapshot.CorpUserSnapshot", "com.linkedin.metadata.snapshot.DashboardSnapshot", "com.linkedin.metadata.snapshot.DataFlowSnapshot", "com.linkedin.metadata.snapshot.DataHubPolicySnapshot", "com.linkedin.metadata.snapshot.DataHubRetentionSnapshot", "com.linkedin.metadata.snapshot.DataJobSnapshot", "com.linkedin.metadata.snapshot.DataPlatformSnapshot", "com.linkedin.metadata.snapshot.DataProcessSnapshot", "com.linkedin.metadata.snapshot.DatasetSnapshot", "com.linkedin.metadata.snapshot.GlossaryNodeSnapshot", "com.linkedin.metadata.snapshot.GlossaryTermSnapshot", "com.linkedin.metadata.snapshot.MLFeatureSnapshot", "com.linkedin.metadata.snapshot.MLFeatureTableSnapshot", "com.linkedin.metadata.snapshot.MLModelDeploymentSnapshot", "com.linkedin.metadata.snapshot.MLModelGroupSnapshot", "com.linkedin.metadata.snapshot.MLModelSnapshot", "com.linkedin.metadata.snapshot.MLPrimaryKeySnapshot", "com.linkedin.metadata.snapshot.SchemaFieldSnapshot", "com.linkedin.metadata.snapshot.Snapshot", "com.linkedin.metadata.snapshot.TagSnapshot", "com.linkedin.ml.metadata.BaseData", "com.linkedin.ml.metadata.CaveatDetails", "com.linkedin.ml.metadata.CaveatsAndRecommendations", "com.linkedin.ml.metadata.DeploymentStatus", "com.linkedin.ml.metadata.EthicalConsiderations", "com.linkedin.ml.metadata.EvaluationData", "com.linkedin.ml.metadata.HyperParameterValueType", "com.linkedin.ml.metadata.IntendedUse", "com.linkedin.ml.metadata.IntendedUserType", "com.linkedin.ml.metadata.MLFeatureProperties", "com.linkedin.ml.metadata.MLFeatureTableProperties", "com.linkedin.ml.metadata.MLHyperParam", "com.linkedin.ml.metadata.MLMetric", "com.linkedin.ml.metadata.MLModelDeploymentProperties", "com.linkedin.ml.metadata.MLModelFactorPrompts", "com.linkedin.ml.metadata.MLModelFactors", "com.linkedin.ml.metadata.MLModelGroupProperties", "com.linkedin.ml.metadata.MLModelLineageInfo", "com.linkedin.ml.metadata.MLModelProperties", "com.linkedin.ml.metadata.MLPrimaryKeyProperties", "com.linkedin.ml.metadata.Metrics", "com.linkedin.ml.metadata.QuantitativeAnalyses", "com.linkedin.ml.metadata.ResultsType", "com.linkedin.ml.metadata.SourceCode", "com.linkedin.ml.metadata.SourceCodeUrl", "com.linkedin.ml.metadata.SourceCodeUrlType", "com.linkedin.ml.metadata.TrainingData", { "type" : "record", "name" : "SystemMetadata", "namespace" : "com.linkedin.mxe", diff --git a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.runs.snapshot.json b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.runs.snapshot.json index 8b6def75f7a66..9061cbff18813 100644 --- a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.runs.snapshot.json +++ b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.runs.snapshot.json @@ -3551,7 +3551,43 @@ "name" : "MLModelProperties", "namespace" : "com.linkedin.ml.metadata", "doc" : "Properties associated with a ML Model\r", - "include" : [ "com.linkedin.common.CustomProperties", "com.linkedin.common.ExternalReference" ], + "include" : [ "com.linkedin.common.CustomProperties", "com.linkedin.common.ExternalReference", { + "type" : "record", + "name" : "MLModelLineageInfo", + "doc" : "A set of re-usable fields used to capture lineage information for ML Models and ML Model Groups\r", + "fields" : [ { + "name" : "trainingJobs", + "type" : { + "type" : "array", + "items" : "com.linkedin.common.Urn" + }, + "doc" : "List of jobs or process instances (if any) used to train the model or group. Visible in Lineage. Note that ML Models can also be specified as the output of a specific Data Process Instances (runs) via the DataProcessInstanceOutputs aspect.\r", + "optional" : true, + "Relationship" : { + "/*" : { + "entityTypes" : [ "dataJob", "dataProcessInstance" ], + "isLineage" : true, + "name" : "TrainedBy" + } + } + }, { + "name" : "downstreamJobs", + "type" : { + "type" : "array", + "items" : "com.linkedin.common.Urn" + }, + "doc" : "List of jobs or process instances (if any) that use the model or group.\r", + "optional" : true, + "Relationship" : { + "/*" : { + "entityTypes" : [ "dataJob", "dataProcessInstance" ], + "isLineage" : true, + "isUpstream" : false, + "name" : "UsedBy" + } + } + } ] + } ], "fields" : [ { "name" : "name", "type" : "string", @@ -3729,37 +3765,6 @@ "name" : "DeployedTo" } } - }, { - "name" : "trainingJobs", - "type" : { - "type" : "array", - "items" : "com.linkedin.common.Urn" - }, - "doc" : "List of jobs (if any) used to train the model. Visible in Lineage. Note that ML Models can also be specified as the output of a specific Data Process Instances (runs) via the DataProcessInstanceOutputs aspect.\r", - "optional" : true, - "Relationship" : { - "/*" : { - "entityTypes" : [ "dataJob", "dataProcessInstance" ], - "isLineage" : true, - "name" : "TrainedBy" - } - } - }, { - "name" : "downstreamJobs", - "type" : { - "type" : "array", - "items" : "com.linkedin.common.Urn" - }, - "doc" : "List of jobs (if any) that use the model\r", - "optional" : true, - "Relationship" : { - "/*" : { - "entityTypes" : [ "dataJob" ], - "isLineage" : true, - "isUpstream" : false, - "name" : "UsedBy" - } - } }, { "name" : "groups", "type" : { @@ -4002,7 +4007,7 @@ } } } ] - }, "com.linkedin.metadata.run.UnsafeEntityInfo", "com.linkedin.ml.metadata.BaseData", "com.linkedin.ml.metadata.CaveatDetails", "com.linkedin.ml.metadata.CaveatsAndRecommendations", "com.linkedin.ml.metadata.EthicalConsiderations", "com.linkedin.ml.metadata.EvaluationData", "com.linkedin.ml.metadata.HyperParameterValueType", "com.linkedin.ml.metadata.IntendedUse", "com.linkedin.ml.metadata.IntendedUserType", "com.linkedin.ml.metadata.MLFeatureProperties", "com.linkedin.ml.metadata.MLHyperParam", "com.linkedin.ml.metadata.MLMetric", "com.linkedin.ml.metadata.MLModelFactorPrompts", "com.linkedin.ml.metadata.MLModelFactors", "com.linkedin.ml.metadata.MLModelProperties", "com.linkedin.ml.metadata.Metrics", "com.linkedin.ml.metadata.QuantitativeAnalyses", "com.linkedin.ml.metadata.ResultsType", "com.linkedin.ml.metadata.SourceCode", "com.linkedin.ml.metadata.SourceCodeUrl", "com.linkedin.ml.metadata.SourceCodeUrlType", "com.linkedin.ml.metadata.TrainingData", "com.linkedin.schema.ArrayType", "com.linkedin.schema.BinaryJsonSchema", "com.linkedin.schema.BooleanType", "com.linkedin.schema.BytesType", "com.linkedin.schema.DatasetFieldForeignKey", "com.linkedin.schema.DateType", "com.linkedin.schema.EditableSchemaFieldInfo", "com.linkedin.schema.EditableSchemaMetadata", "com.linkedin.schema.EnumType", "com.linkedin.schema.EspressoSchema", "com.linkedin.schema.FixedType", "com.linkedin.schema.ForeignKeyConstraint", "com.linkedin.schema.ForeignKeySpec", "com.linkedin.schema.KafkaSchema", "com.linkedin.schema.KeyValueSchema", "com.linkedin.schema.MapType", "com.linkedin.schema.MySqlDDL", "com.linkedin.schema.NullType", "com.linkedin.schema.NumberType", "com.linkedin.schema.OracleDDL", "com.linkedin.schema.OrcSchema", "com.linkedin.schema.OtherSchema", "com.linkedin.schema.PrestoDDL", "com.linkedin.schema.RecordType", "com.linkedin.schema.SchemaField", "com.linkedin.schema.SchemaFieldDataType", "com.linkedin.schema.SchemaMetadata", "com.linkedin.schema.SchemaMetadataKey", "com.linkedin.schema.Schemaless", "com.linkedin.schema.StringType", "com.linkedin.schema.TimeType", "com.linkedin.schema.UnionType", "com.linkedin.schema.UrnForeignKey", "com.linkedin.tag.TagProperties" ], + }, "com.linkedin.metadata.run.UnsafeEntityInfo", "com.linkedin.ml.metadata.BaseData", "com.linkedin.ml.metadata.CaveatDetails", "com.linkedin.ml.metadata.CaveatsAndRecommendations", "com.linkedin.ml.metadata.EthicalConsiderations", "com.linkedin.ml.metadata.EvaluationData", "com.linkedin.ml.metadata.HyperParameterValueType", "com.linkedin.ml.metadata.IntendedUse", "com.linkedin.ml.metadata.IntendedUserType", "com.linkedin.ml.metadata.MLFeatureProperties", "com.linkedin.ml.metadata.MLHyperParam", "com.linkedin.ml.metadata.MLMetric", "com.linkedin.ml.metadata.MLModelFactorPrompts", "com.linkedin.ml.metadata.MLModelFactors", "com.linkedin.ml.metadata.MLModelLineageInfo", "com.linkedin.ml.metadata.MLModelProperties", "com.linkedin.ml.metadata.Metrics", "com.linkedin.ml.metadata.QuantitativeAnalyses", "com.linkedin.ml.metadata.ResultsType", "com.linkedin.ml.metadata.SourceCode", "com.linkedin.ml.metadata.SourceCodeUrl", "com.linkedin.ml.metadata.SourceCodeUrlType", "com.linkedin.ml.metadata.TrainingData", "com.linkedin.schema.ArrayType", "com.linkedin.schema.BinaryJsonSchema", "com.linkedin.schema.BooleanType", "com.linkedin.schema.BytesType", "com.linkedin.schema.DatasetFieldForeignKey", "com.linkedin.schema.DateType", "com.linkedin.schema.EditableSchemaFieldInfo", "com.linkedin.schema.EditableSchemaMetadata", "com.linkedin.schema.EnumType", "com.linkedin.schema.EspressoSchema", "com.linkedin.schema.FixedType", "com.linkedin.schema.ForeignKeyConstraint", "com.linkedin.schema.ForeignKeySpec", "com.linkedin.schema.KafkaSchema", "com.linkedin.schema.KeyValueSchema", "com.linkedin.schema.MapType", "com.linkedin.schema.MySqlDDL", "com.linkedin.schema.NullType", "com.linkedin.schema.NumberType", "com.linkedin.schema.OracleDDL", "com.linkedin.schema.OrcSchema", "com.linkedin.schema.OtherSchema", "com.linkedin.schema.PrestoDDL", "com.linkedin.schema.RecordType", "com.linkedin.schema.SchemaField", "com.linkedin.schema.SchemaFieldDataType", "com.linkedin.schema.SchemaMetadata", "com.linkedin.schema.SchemaMetadataKey", "com.linkedin.schema.Schemaless", "com.linkedin.schema.StringType", "com.linkedin.schema.TimeType", "com.linkedin.schema.UnionType", "com.linkedin.schema.UrnForeignKey", "com.linkedin.tag.TagProperties" ], "schema" : { "name" : "runs", "namespace" : "com.linkedin.entity", diff --git a/metadata-service/restli-api/src/main/snapshot/com.linkedin.operations.operations.snapshot.json b/metadata-service/restli-api/src/main/snapshot/com.linkedin.operations.operations.snapshot.json index e4cc5c42303ee..e6be4e828c976 100644 --- a/metadata-service/restli-api/src/main/snapshot/com.linkedin.operations.operations.snapshot.json +++ b/metadata-service/restli-api/src/main/snapshot/com.linkedin.operations.operations.snapshot.json @@ -3545,7 +3545,43 @@ "name" : "MLModelProperties", "namespace" : "com.linkedin.ml.metadata", "doc" : "Properties associated with a ML Model\r", - "include" : [ "com.linkedin.common.CustomProperties", "com.linkedin.common.ExternalReference" ], + "include" : [ "com.linkedin.common.CustomProperties", "com.linkedin.common.ExternalReference", { + "type" : "record", + "name" : "MLModelLineageInfo", + "doc" : "A set of re-usable fields used to capture lineage information for ML Models and ML Model Groups\r", + "fields" : [ { + "name" : "trainingJobs", + "type" : { + "type" : "array", + "items" : "com.linkedin.common.Urn" + }, + "doc" : "List of jobs or process instances (if any) used to train the model or group. Visible in Lineage. Note that ML Models can also be specified as the output of a specific Data Process Instances (runs) via the DataProcessInstanceOutputs aspect.\r", + "optional" : true, + "Relationship" : { + "/*" : { + "entityTypes" : [ "dataJob", "dataProcessInstance" ], + "isLineage" : true, + "name" : "TrainedBy" + } + } + }, { + "name" : "downstreamJobs", + "type" : { + "type" : "array", + "items" : "com.linkedin.common.Urn" + }, + "doc" : "List of jobs or process instances (if any) that use the model or group.\r", + "optional" : true, + "Relationship" : { + "/*" : { + "entityTypes" : [ "dataJob", "dataProcessInstance" ], + "isLineage" : true, + "isUpstream" : false, + "name" : "UsedBy" + } + } + } ] + } ], "fields" : [ { "name" : "name", "type" : "string", @@ -3723,37 +3759,6 @@ "name" : "DeployedTo" } } - }, { - "name" : "trainingJobs", - "type" : { - "type" : "array", - "items" : "com.linkedin.common.Urn" - }, - "doc" : "List of jobs (if any) used to train the model. Visible in Lineage. Note that ML Models can also be specified as the output of a specific Data Process Instances (runs) via the DataProcessInstanceOutputs aspect.\r", - "optional" : true, - "Relationship" : { - "/*" : { - "entityTypes" : [ "dataJob", "dataProcessInstance" ], - "isLineage" : true, - "name" : "TrainedBy" - } - } - }, { - "name" : "downstreamJobs", - "type" : { - "type" : "array", - "items" : "com.linkedin.common.Urn" - }, - "doc" : "List of jobs (if any) that use the model\r", - "optional" : true, - "Relationship" : { - "/*" : { - "entityTypes" : [ "dataJob" ], - "isLineage" : true, - "isUpstream" : false, - "name" : "UsedBy" - } - } }, { "name" : "groups", "type" : { @@ -3908,7 +3913,7 @@ "name" : "version", "type" : "long" } ] - }, "com.linkedin.metadata.key.ChartKey", "com.linkedin.metadata.key.CorpGroupKey", "com.linkedin.metadata.key.CorpUserKey", "com.linkedin.metadata.key.DashboardKey", "com.linkedin.metadata.key.DataFlowKey", "com.linkedin.metadata.key.DataJobKey", "com.linkedin.metadata.key.GlossaryNodeKey", "com.linkedin.metadata.key.GlossaryTermKey", "com.linkedin.metadata.key.MLFeatureKey", "com.linkedin.metadata.key.MLModelKey", "com.linkedin.metadata.key.TagKey", "com.linkedin.ml.metadata.BaseData", "com.linkedin.ml.metadata.CaveatDetails", "com.linkedin.ml.metadata.CaveatsAndRecommendations", "com.linkedin.ml.metadata.EthicalConsiderations", "com.linkedin.ml.metadata.EvaluationData", "com.linkedin.ml.metadata.HyperParameterValueType", "com.linkedin.ml.metadata.IntendedUse", "com.linkedin.ml.metadata.IntendedUserType", "com.linkedin.ml.metadata.MLFeatureProperties", "com.linkedin.ml.metadata.MLHyperParam", "com.linkedin.ml.metadata.MLMetric", "com.linkedin.ml.metadata.MLModelFactorPrompts", "com.linkedin.ml.metadata.MLModelFactors", "com.linkedin.ml.metadata.MLModelProperties", "com.linkedin.ml.metadata.Metrics", "com.linkedin.ml.metadata.QuantitativeAnalyses", "com.linkedin.ml.metadata.ResultsType", "com.linkedin.ml.metadata.SourceCode", "com.linkedin.ml.metadata.SourceCodeUrl", "com.linkedin.ml.metadata.SourceCodeUrlType", "com.linkedin.ml.metadata.TrainingData", "com.linkedin.schema.ArrayType", "com.linkedin.schema.BinaryJsonSchema", "com.linkedin.schema.BooleanType", "com.linkedin.schema.BytesType", "com.linkedin.schema.DatasetFieldForeignKey", "com.linkedin.schema.DateType", "com.linkedin.schema.EditableSchemaFieldInfo", "com.linkedin.schema.EditableSchemaMetadata", "com.linkedin.schema.EnumType", "com.linkedin.schema.EspressoSchema", "com.linkedin.schema.FixedType", "com.linkedin.schema.ForeignKeyConstraint", "com.linkedin.schema.ForeignKeySpec", "com.linkedin.schema.KafkaSchema", "com.linkedin.schema.KeyValueSchema", "com.linkedin.schema.MapType", "com.linkedin.schema.MySqlDDL", "com.linkedin.schema.NullType", "com.linkedin.schema.NumberType", "com.linkedin.schema.OracleDDL", "com.linkedin.schema.OrcSchema", "com.linkedin.schema.OtherSchema", "com.linkedin.schema.PrestoDDL", "com.linkedin.schema.RecordType", "com.linkedin.schema.SchemaField", "com.linkedin.schema.SchemaFieldDataType", "com.linkedin.schema.SchemaMetadata", "com.linkedin.schema.SchemaMetadataKey", "com.linkedin.schema.Schemaless", "com.linkedin.schema.StringType", "com.linkedin.schema.TimeType", "com.linkedin.schema.UnionType", "com.linkedin.schema.UrnForeignKey", "com.linkedin.tag.TagProperties", { + }, "com.linkedin.metadata.key.ChartKey", "com.linkedin.metadata.key.CorpGroupKey", "com.linkedin.metadata.key.CorpUserKey", "com.linkedin.metadata.key.DashboardKey", "com.linkedin.metadata.key.DataFlowKey", "com.linkedin.metadata.key.DataJobKey", "com.linkedin.metadata.key.GlossaryNodeKey", "com.linkedin.metadata.key.GlossaryTermKey", "com.linkedin.metadata.key.MLFeatureKey", "com.linkedin.metadata.key.MLModelKey", "com.linkedin.metadata.key.TagKey", "com.linkedin.ml.metadata.BaseData", "com.linkedin.ml.metadata.CaveatDetails", "com.linkedin.ml.metadata.CaveatsAndRecommendations", "com.linkedin.ml.metadata.EthicalConsiderations", "com.linkedin.ml.metadata.EvaluationData", "com.linkedin.ml.metadata.HyperParameterValueType", "com.linkedin.ml.metadata.IntendedUse", "com.linkedin.ml.metadata.IntendedUserType", "com.linkedin.ml.metadata.MLFeatureProperties", "com.linkedin.ml.metadata.MLHyperParam", "com.linkedin.ml.metadata.MLMetric", "com.linkedin.ml.metadata.MLModelFactorPrompts", "com.linkedin.ml.metadata.MLModelFactors", "com.linkedin.ml.metadata.MLModelLineageInfo", "com.linkedin.ml.metadata.MLModelProperties", "com.linkedin.ml.metadata.Metrics", "com.linkedin.ml.metadata.QuantitativeAnalyses", "com.linkedin.ml.metadata.ResultsType", "com.linkedin.ml.metadata.SourceCode", "com.linkedin.ml.metadata.SourceCodeUrl", "com.linkedin.ml.metadata.SourceCodeUrlType", "com.linkedin.ml.metadata.TrainingData", "com.linkedin.schema.ArrayType", "com.linkedin.schema.BinaryJsonSchema", "com.linkedin.schema.BooleanType", "com.linkedin.schema.BytesType", "com.linkedin.schema.DatasetFieldForeignKey", "com.linkedin.schema.DateType", "com.linkedin.schema.EditableSchemaFieldInfo", "com.linkedin.schema.EditableSchemaMetadata", "com.linkedin.schema.EnumType", "com.linkedin.schema.EspressoSchema", "com.linkedin.schema.FixedType", "com.linkedin.schema.ForeignKeyConstraint", "com.linkedin.schema.ForeignKeySpec", "com.linkedin.schema.KafkaSchema", "com.linkedin.schema.KeyValueSchema", "com.linkedin.schema.MapType", "com.linkedin.schema.MySqlDDL", "com.linkedin.schema.NullType", "com.linkedin.schema.NumberType", "com.linkedin.schema.OracleDDL", "com.linkedin.schema.OrcSchema", "com.linkedin.schema.OtherSchema", "com.linkedin.schema.PrestoDDL", "com.linkedin.schema.RecordType", "com.linkedin.schema.SchemaField", "com.linkedin.schema.SchemaFieldDataType", "com.linkedin.schema.SchemaMetadata", "com.linkedin.schema.SchemaMetadataKey", "com.linkedin.schema.Schemaless", "com.linkedin.schema.StringType", "com.linkedin.schema.TimeType", "com.linkedin.schema.UnionType", "com.linkedin.schema.UrnForeignKey", "com.linkedin.tag.TagProperties", { "type" : "record", "name" : "TimeseriesIndexSizeResult", "namespace" : "com.linkedin.timeseries", diff --git a/metadata-service/restli-api/src/main/snapshot/com.linkedin.platform.platform.snapshot.json b/metadata-service/restli-api/src/main/snapshot/com.linkedin.platform.platform.snapshot.json index e375ac698ab51..10f3218d46975 100644 --- a/metadata-service/restli-api/src/main/snapshot/com.linkedin.platform.platform.snapshot.json +++ b/metadata-service/restli-api/src/main/snapshot/com.linkedin.platform.platform.snapshot.json @@ -3979,7 +3979,43 @@ "name" : "MLModelProperties", "namespace" : "com.linkedin.ml.metadata", "doc" : "Properties associated with a ML Model\r", - "include" : [ "com.linkedin.common.CustomProperties", "com.linkedin.common.ExternalReference" ], + "include" : [ "com.linkedin.common.CustomProperties", "com.linkedin.common.ExternalReference", { + "type" : "record", + "name" : "MLModelLineageInfo", + "doc" : "A set of re-usable fields used to capture lineage information for ML Models and ML Model Groups\r", + "fields" : [ { + "name" : "trainingJobs", + "type" : { + "type" : "array", + "items" : "com.linkedin.common.Urn" + }, + "doc" : "List of jobs or process instances (if any) used to train the model or group. Visible in Lineage. Note that ML Models can also be specified as the output of a specific Data Process Instances (runs) via the DataProcessInstanceOutputs aspect.\r", + "optional" : true, + "Relationship" : { + "/*" : { + "entityTypes" : [ "dataJob", "dataProcessInstance" ], + "isLineage" : true, + "name" : "TrainedBy" + } + } + }, { + "name" : "downstreamJobs", + "type" : { + "type" : "array", + "items" : "com.linkedin.common.Urn" + }, + "doc" : "List of jobs or process instances (if any) that use the model or group.\r", + "optional" : true, + "Relationship" : { + "/*" : { + "entityTypes" : [ "dataJob", "dataProcessInstance" ], + "isLineage" : true, + "isUpstream" : false, + "name" : "UsedBy" + } + } + } ] + } ], "fields" : [ { "name" : "name", "type" : "string", @@ -4157,37 +4193,6 @@ "name" : "DeployedTo" } } - }, { - "name" : "trainingJobs", - "type" : { - "type" : "array", - "items" : "com.linkedin.common.Urn" - }, - "doc" : "List of jobs (if any) used to train the model. Visible in Lineage. Note that ML Models can also be specified as the output of a specific Data Process Instances (runs) via the DataProcessInstanceOutputs aspect.\r", - "optional" : true, - "Relationship" : { - "/*" : { - "entityTypes" : [ "dataJob", "dataProcessInstance" ], - "isLineage" : true, - "name" : "TrainedBy" - } - } - }, { - "name" : "downstreamJobs", - "type" : { - "type" : "array", - "items" : "com.linkedin.common.Urn" - }, - "doc" : "List of jobs (if any) that use the model\r", - "optional" : true, - "Relationship" : { - "/*" : { - "entityTypes" : [ "dataJob" ], - "isLineage" : true, - "isUpstream" : false, - "name" : "UsedBy" - } - } }, { "name" : "groups", "type" : { @@ -4998,7 +5003,7 @@ "name" : "MLModelGroupProperties", "namespace" : "com.linkedin.ml.metadata", "doc" : "Properties associated with an ML Model Group\r", - "include" : [ "com.linkedin.common.CustomProperties" ], + "include" : [ "com.linkedin.common.CustomProperties", "MLModelLineageInfo" ], "fields" : [ { "name" : "name", "type" : "string", @@ -5035,21 +5040,6 @@ "type" : "com.linkedin.common.TimeStamp", "doc" : "Date when the MLModelGroup was last modified\r", "optional" : true - }, { - "name" : "trainingJobs", - "type" : { - "type" : "array", - "items" : "com.linkedin.common.Urn" - }, - "doc" : "List of jobs (if any) used to train the model group. Visible in Lineage.\r", - "optional" : true, - "Relationship" : { - "/*" : { - "entityTypes" : [ "dataJob" ], - "isLineage" : true, - "name" : "TrainedBy" - } - } }, { "name" : "version", "type" : "com.linkedin.common.VersionTag", @@ -5844,7 +5834,7 @@ } ] } } ] - }, "com.linkedin.glossary.GlossaryNodeInfo", "com.linkedin.glossary.GlossaryRelatedTerms", "com.linkedin.glossary.GlossaryTermInfo", "com.linkedin.identity.CorpGroupInfo", "com.linkedin.identity.CorpUserEditableInfo", "com.linkedin.identity.CorpUserInfo", "com.linkedin.identity.CorpUserStatus", "com.linkedin.identity.GroupMembership", "com.linkedin.metadata.aspect.ChartAspect", "com.linkedin.metadata.aspect.CorpGroupAspect", "com.linkedin.metadata.aspect.CorpUserAspect", "com.linkedin.metadata.aspect.DashboardAspect", "com.linkedin.metadata.aspect.DataFlowAspect", "com.linkedin.metadata.aspect.DataHubPolicyAspect", "com.linkedin.metadata.aspect.DataHubRetentionAspect", "com.linkedin.metadata.aspect.DataJobAspect", "com.linkedin.metadata.aspect.DataPlatformAspect", "com.linkedin.metadata.aspect.DataProcessAspect", "com.linkedin.metadata.aspect.DatasetAspect", "com.linkedin.metadata.aspect.GlossaryNodeAspect", "com.linkedin.metadata.aspect.GlossaryTermAspect", "com.linkedin.metadata.aspect.MLFeatureAspect", "com.linkedin.metadata.aspect.MLFeatureTableAspect", "com.linkedin.metadata.aspect.MLModelAspect", "com.linkedin.metadata.aspect.MLModelDeploymentAspect", "com.linkedin.metadata.aspect.MLModelGroupAspect", "com.linkedin.metadata.aspect.MLPrimaryKeyAspect", "com.linkedin.metadata.aspect.SchemaFieldAspect", "com.linkedin.metadata.aspect.TagAspect", "com.linkedin.metadata.key.ChartKey", "com.linkedin.metadata.key.CorpGroupKey", "com.linkedin.metadata.key.CorpUserKey", "com.linkedin.metadata.key.DashboardKey", "com.linkedin.metadata.key.DataFlowKey", "com.linkedin.metadata.key.DataHubPolicyKey", "com.linkedin.metadata.key.DataHubRetentionKey", "com.linkedin.metadata.key.DataJobKey", "com.linkedin.metadata.key.DataPlatformKey", "com.linkedin.metadata.key.DataProcessKey", "com.linkedin.metadata.key.DatasetKey", "com.linkedin.metadata.key.GlossaryNodeKey", "com.linkedin.metadata.key.GlossaryTermKey", "com.linkedin.metadata.key.MLFeatureKey", "com.linkedin.metadata.key.MLFeatureTableKey", "com.linkedin.metadata.key.MLModelDeploymentKey", "com.linkedin.metadata.key.MLModelGroupKey", "com.linkedin.metadata.key.MLModelKey", "com.linkedin.metadata.key.MLPrimaryKeyKey", "com.linkedin.metadata.key.SchemaFieldKey", "com.linkedin.metadata.key.TagKey", "com.linkedin.metadata.snapshot.ChartSnapshot", "com.linkedin.metadata.snapshot.CorpGroupSnapshot", "com.linkedin.metadata.snapshot.CorpUserSnapshot", "com.linkedin.metadata.snapshot.DashboardSnapshot", "com.linkedin.metadata.snapshot.DataFlowSnapshot", "com.linkedin.metadata.snapshot.DataHubPolicySnapshot", "com.linkedin.metadata.snapshot.DataHubRetentionSnapshot", "com.linkedin.metadata.snapshot.DataJobSnapshot", "com.linkedin.metadata.snapshot.DataPlatformSnapshot", "com.linkedin.metadata.snapshot.DataProcessSnapshot", "com.linkedin.metadata.snapshot.DatasetSnapshot", "com.linkedin.metadata.snapshot.GlossaryNodeSnapshot", "com.linkedin.metadata.snapshot.GlossaryTermSnapshot", "com.linkedin.metadata.snapshot.MLFeatureSnapshot", "com.linkedin.metadata.snapshot.MLFeatureTableSnapshot", "com.linkedin.metadata.snapshot.MLModelDeploymentSnapshot", "com.linkedin.metadata.snapshot.MLModelGroupSnapshot", "com.linkedin.metadata.snapshot.MLModelSnapshot", "com.linkedin.metadata.snapshot.MLPrimaryKeySnapshot", "com.linkedin.metadata.snapshot.SchemaFieldSnapshot", "com.linkedin.metadata.snapshot.Snapshot", "com.linkedin.metadata.snapshot.TagSnapshot", "com.linkedin.ml.metadata.BaseData", "com.linkedin.ml.metadata.CaveatDetails", "com.linkedin.ml.metadata.CaveatsAndRecommendations", "com.linkedin.ml.metadata.DeploymentStatus", "com.linkedin.ml.metadata.EthicalConsiderations", "com.linkedin.ml.metadata.EvaluationData", "com.linkedin.ml.metadata.HyperParameterValueType", "com.linkedin.ml.metadata.IntendedUse", "com.linkedin.ml.metadata.IntendedUserType", "com.linkedin.ml.metadata.MLFeatureProperties", "com.linkedin.ml.metadata.MLFeatureTableProperties", "com.linkedin.ml.metadata.MLHyperParam", "com.linkedin.ml.metadata.MLMetric", "com.linkedin.ml.metadata.MLModelDeploymentProperties", "com.linkedin.ml.metadata.MLModelFactorPrompts", "com.linkedin.ml.metadata.MLModelFactors", "com.linkedin.ml.metadata.MLModelGroupProperties", "com.linkedin.ml.metadata.MLModelProperties", "com.linkedin.ml.metadata.MLPrimaryKeyProperties", "com.linkedin.ml.metadata.Metrics", "com.linkedin.ml.metadata.QuantitativeAnalyses", "com.linkedin.ml.metadata.ResultsType", "com.linkedin.ml.metadata.SourceCode", "com.linkedin.ml.metadata.SourceCodeUrl", "com.linkedin.ml.metadata.SourceCodeUrlType", "com.linkedin.ml.metadata.TrainingData", { + }, "com.linkedin.glossary.GlossaryNodeInfo", "com.linkedin.glossary.GlossaryRelatedTerms", "com.linkedin.glossary.GlossaryTermInfo", "com.linkedin.identity.CorpGroupInfo", "com.linkedin.identity.CorpUserEditableInfo", "com.linkedin.identity.CorpUserInfo", "com.linkedin.identity.CorpUserStatus", "com.linkedin.identity.GroupMembership", "com.linkedin.metadata.aspect.ChartAspect", "com.linkedin.metadata.aspect.CorpGroupAspect", "com.linkedin.metadata.aspect.CorpUserAspect", "com.linkedin.metadata.aspect.DashboardAspect", "com.linkedin.metadata.aspect.DataFlowAspect", "com.linkedin.metadata.aspect.DataHubPolicyAspect", "com.linkedin.metadata.aspect.DataHubRetentionAspect", "com.linkedin.metadata.aspect.DataJobAspect", "com.linkedin.metadata.aspect.DataPlatformAspect", "com.linkedin.metadata.aspect.DataProcessAspect", "com.linkedin.metadata.aspect.DatasetAspect", "com.linkedin.metadata.aspect.GlossaryNodeAspect", "com.linkedin.metadata.aspect.GlossaryTermAspect", "com.linkedin.metadata.aspect.MLFeatureAspect", "com.linkedin.metadata.aspect.MLFeatureTableAspect", "com.linkedin.metadata.aspect.MLModelAspect", "com.linkedin.metadata.aspect.MLModelDeploymentAspect", "com.linkedin.metadata.aspect.MLModelGroupAspect", "com.linkedin.metadata.aspect.MLPrimaryKeyAspect", "com.linkedin.metadata.aspect.SchemaFieldAspect", "com.linkedin.metadata.aspect.TagAspect", "com.linkedin.metadata.key.ChartKey", "com.linkedin.metadata.key.CorpGroupKey", "com.linkedin.metadata.key.CorpUserKey", "com.linkedin.metadata.key.DashboardKey", "com.linkedin.metadata.key.DataFlowKey", "com.linkedin.metadata.key.DataHubPolicyKey", "com.linkedin.metadata.key.DataHubRetentionKey", "com.linkedin.metadata.key.DataJobKey", "com.linkedin.metadata.key.DataPlatformKey", "com.linkedin.metadata.key.DataProcessKey", "com.linkedin.metadata.key.DatasetKey", "com.linkedin.metadata.key.GlossaryNodeKey", "com.linkedin.metadata.key.GlossaryTermKey", "com.linkedin.metadata.key.MLFeatureKey", "com.linkedin.metadata.key.MLFeatureTableKey", "com.linkedin.metadata.key.MLModelDeploymentKey", "com.linkedin.metadata.key.MLModelGroupKey", "com.linkedin.metadata.key.MLModelKey", "com.linkedin.metadata.key.MLPrimaryKeyKey", "com.linkedin.metadata.key.SchemaFieldKey", "com.linkedin.metadata.key.TagKey", "com.linkedin.metadata.snapshot.ChartSnapshot", "com.linkedin.metadata.snapshot.CorpGroupSnapshot", "com.linkedin.metadata.snapshot.CorpUserSnapshot", "com.linkedin.metadata.snapshot.DashboardSnapshot", "com.linkedin.metadata.snapshot.DataFlowSnapshot", "com.linkedin.metadata.snapshot.DataHubPolicySnapshot", "com.linkedin.metadata.snapshot.DataHubRetentionSnapshot", "com.linkedin.metadata.snapshot.DataJobSnapshot", "com.linkedin.metadata.snapshot.DataPlatformSnapshot", "com.linkedin.metadata.snapshot.DataProcessSnapshot", "com.linkedin.metadata.snapshot.DatasetSnapshot", "com.linkedin.metadata.snapshot.GlossaryNodeSnapshot", "com.linkedin.metadata.snapshot.GlossaryTermSnapshot", "com.linkedin.metadata.snapshot.MLFeatureSnapshot", "com.linkedin.metadata.snapshot.MLFeatureTableSnapshot", "com.linkedin.metadata.snapshot.MLModelDeploymentSnapshot", "com.linkedin.metadata.snapshot.MLModelGroupSnapshot", "com.linkedin.metadata.snapshot.MLModelSnapshot", "com.linkedin.metadata.snapshot.MLPrimaryKeySnapshot", "com.linkedin.metadata.snapshot.SchemaFieldSnapshot", "com.linkedin.metadata.snapshot.Snapshot", "com.linkedin.metadata.snapshot.TagSnapshot", "com.linkedin.ml.metadata.BaseData", "com.linkedin.ml.metadata.CaveatDetails", "com.linkedin.ml.metadata.CaveatsAndRecommendations", "com.linkedin.ml.metadata.DeploymentStatus", "com.linkedin.ml.metadata.EthicalConsiderations", "com.linkedin.ml.metadata.EvaluationData", "com.linkedin.ml.metadata.HyperParameterValueType", "com.linkedin.ml.metadata.IntendedUse", "com.linkedin.ml.metadata.IntendedUserType", "com.linkedin.ml.metadata.MLFeatureProperties", "com.linkedin.ml.metadata.MLFeatureTableProperties", "com.linkedin.ml.metadata.MLHyperParam", "com.linkedin.ml.metadata.MLMetric", "com.linkedin.ml.metadata.MLModelDeploymentProperties", "com.linkedin.ml.metadata.MLModelFactorPrompts", "com.linkedin.ml.metadata.MLModelFactors", "com.linkedin.ml.metadata.MLModelGroupProperties", "com.linkedin.ml.metadata.MLModelLineageInfo", "com.linkedin.ml.metadata.MLModelProperties", "com.linkedin.ml.metadata.MLPrimaryKeyProperties", "com.linkedin.ml.metadata.Metrics", "com.linkedin.ml.metadata.QuantitativeAnalyses", "com.linkedin.ml.metadata.ResultsType", "com.linkedin.ml.metadata.SourceCode", "com.linkedin.ml.metadata.SourceCodeUrl", "com.linkedin.ml.metadata.SourceCodeUrlType", "com.linkedin.ml.metadata.TrainingData", { "type" : "record", "name" : "GenericPayload", "namespace" : "com.linkedin.mxe", diff --git a/smoke-test/tests/data_process_instance/__init__.py b/smoke-test/tests/data_process_instance/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/smoke-test/tests/data_process_instance/test_data_process_instance.py b/smoke-test/tests/data_process_instance/test_data_process_instance.py new file mode 100644 index 0000000000000..a8aca6034d5be --- /dev/null +++ b/smoke-test/tests/data_process_instance/test_data_process_instance.py @@ -0,0 +1,293 @@ +import logging +import os +import tempfile +from random import randint + +import pytest +from datahub.emitter.mcp import MetadataChangeProposalWrapper +from datahub.ingestion.api.common import PipelineContext, RecordEnvelope +from datahub.ingestion.api.sink import NoopWriteCallback +from datahub.ingestion.sink.file import FileSink, FileSinkConfig +from datahub.metadata.schema_classes import ( + AuditStampClass, + ContainerClass, + ContainerPropertiesClass, + DataPlatformInstanceClass, + DataPlatformInstancePropertiesClass, + DataProcessInstanceKeyClass, + DataProcessInstancePropertiesClass, + DataProcessInstanceRunEventClass, + MLHyperParamClass, + MLMetricClass, + MLTrainingRunPropertiesClass, + SubTypesClass, + TimeWindowSizeClass, +) + +from tests.utils import ( + delete_urns_from_file, + ingest_file_via_rest, + wait_for_writes_to_sync, +) + +logger = logging.getLogger(__name__) + +# Generate unique DPI ID +dpi_id = f"test-pipeline-run-{randint(1000, 9999)}" +dpi_urn = f"urn:li:dataProcessInstance:{dpi_id}" + + +class FileEmitter: + def __init__(self, filename: str) -> None: + self.sink: FileSink = FileSink( + ctx=PipelineContext(run_id="create_test_data"), + config=FileSinkConfig(filename=filename), + ) + + def emit(self, event): + self.sink.write_record_async( + record_envelope=RecordEnvelope(record=event, metadata={}), + write_callback=NoopWriteCallback(), + ) + + def close(self): + self.sink.close() + + +def create_test_data(filename: str): + mcps = [ + # Key aspect + MetadataChangeProposalWrapper( + entityType="dataProcessInstance", + entityUrn=dpi_urn, + aspectName="dataProcessInstanceKey", + aspect=DataProcessInstanceKeyClass(id=dpi_id), + ), + # Properties aspect + MetadataChangeProposalWrapper( + entityType="dataProcessInstance", + entityUrn=dpi_urn, + aspectName="dataProcessInstanceProperties", + aspect=DataProcessInstancePropertiesClass( + name="Test Pipeline Run", + type="BATCH_SCHEDULED", + created=AuditStampClass( + time=1640692800000, actor="urn:li:corpuser:datahub" + ), + ), + ), + # Run Event aspect + MetadataChangeProposalWrapper( + entityType="dataProcessInstance", + entityUrn=dpi_urn, + aspectName="dataProcessInstanceRunEvent", + aspect=DataProcessInstanceRunEventClass( + timestampMillis=1704067200000, + eventGranularity=TimeWindowSizeClass(unit="WEEK", multiple=1), + status="COMPLETE", + ), + ), + # Platform Instance aspect + MetadataChangeProposalWrapper( + entityType="dataProcessInstance", + entityUrn=dpi_urn, + aspectName="dataPlatformInstance", + aspect=DataPlatformInstanceClass( + platform="urn:li:dataPlatform:airflow", + instance="urn:li:dataPlatformInstance:(urn:li:dataPlatform:airflow,1234567890)", + ), + ), + MetadataChangeProposalWrapper( + entityType="dataPlatformInstance", + entityUrn="urn:li:dataPlatformInstance:(urn:li:dataPlatform:airflow,1234567890)", + aspectName="dataPlatformInstanceProperties", + aspect=DataPlatformInstancePropertiesClass( + name="my process instance", + ), + ), + # SubTypes aspect + MetadataChangeProposalWrapper( + entityType="dataProcessInstance", + entityUrn=dpi_urn, + aspectName="subTypes", + aspect=SubTypesClass(typeNames=["TEST", "BATCH_JOB"]), + ), + # Container aspect + MetadataChangeProposalWrapper( + entityType="dataProcessInstance", + entityUrn=dpi_urn, + aspectName="container", + aspect=ContainerClass(container="urn:li:container:testGroup1"), + ), + MetadataChangeProposalWrapper( + entityType="container", + entityUrn="urn:li:container:testGroup1", + aspectName="containerProperties", + aspect=ContainerPropertiesClass(name="testGroup1"), + ), + # ML Training Run Properties aspect + MetadataChangeProposalWrapper( + entityType="dataProcessInstance", + entityUrn=dpi_urn, + aspectName="mlTrainingRunProperties", + aspect=MLTrainingRunPropertiesClass( + id="test-training-run-123", + trainingMetrics=[ + MLMetricClass( + name="accuracy", + description="accuracy of the model", + value="0.95", + ), + MLMetricClass( + name="loss", + description="accuracy loss of the model", + value="0.05", + ), + ], + hyperParams=[ + MLHyperParamClass( + name="learningRate", + description="rate of learning", + value="0.001", + ), + MLHyperParamClass( + name="batchSize", description="size of the batch", value="32" + ), + ], + outputUrls=["s3://my-bucket/ml/output"], + ), + ), + ] + + file_emitter = FileEmitter(filename) + for mcp in mcps: + file_emitter.emit(mcp) + file_emitter.close() + + +@pytest.fixture(scope="module", autouse=False) +def ingest_cleanup_data(auth_session, graph_client, request): + new_file, filename = tempfile.mkstemp(suffix=".json") + try: + create_test_data(filename) + print("ingesting data process instance test data") + ingest_file_via_rest(auth_session, filename) + wait_for_writes_to_sync() + yield + print("removing data process instance test data") + delete_urns_from_file(graph_client, filename) + wait_for_writes_to_sync() + finally: + os.remove(filename) + + +@pytest.mark.integration +def test_search_dpi(auth_session, ingest_cleanup_data): + """Test DPI search and validation of returned fields using GraphQL.""" + + json = { + "query": """query scrollAcrossEntities($input: ScrollAcrossEntitiesInput!) { + scrollAcrossEntities(input: $input) { + nextScrollId + count + total + searchResults { + entity { + ... on DataProcessInstance { + urn + properties { + name + externalUrl + } + dataPlatformInstance { + platform { + urn + name + } + } + subTypes { + typeNames + } + container { + urn + } + platform { + urn + name + properties { + type + } + } + mlTrainingRunProperties { + id + trainingMetrics { + name + value + } + hyperParams { + name + value + } + outputUrls + } + } + } + } + } + }""", + "variables": { + "input": {"types": ["DATA_PROCESS_INSTANCE"], "query": dpi_id, "count": 10} + }, + } + + response = auth_session.post( + f"{auth_session.frontend_url()}/api/v2/graphql", json=json + ) + response.raise_for_status() + res_data = response.json() + + # Basic response structure validation + assert res_data, "Response should not be empty" + assert "data" in res_data, "Response should contain 'data' field" + print("RESPONSE DATA:" + str(res_data)) + assert ( + "scrollAcrossEntities" in res_data["data"] + ), "Response should contain 'scrollAcrossEntities' field" + + search_results = res_data["data"]["scrollAcrossEntities"] + assert ( + "searchResults" in search_results + ), "Response should contain 'searchResults' field" + + results = search_results["searchResults"] + assert len(results) > 0, "Should find at least one result" + + # Find our test entity + test_entity = None + for result in results: + if result["entity"]["urn"] == dpi_urn: + test_entity = result["entity"] + break + + assert test_entity is not None, f"Should find test entity with URN {dpi_urn}" + + # Validate fields + props = test_entity["properties"] + assert props["name"] == "Test Pipeline Run" + + platform_instance = test_entity["dataPlatformInstance"] + assert platform_instance["platform"]["urn"] == "urn:li:dataPlatform:airflow" + + sub_types = test_entity["subTypes"] + assert set(sub_types["typeNames"]) == {"TEST", "BATCH_JOB"} + + container = test_entity["container"] + assert container["urn"] == "urn:li:container:testGroup1" + + ml_props = test_entity["mlTrainingRunProperties"] + assert ml_props["id"] == "test-training-run-123" + assert ml_props["trainingMetrics"][0] == {"name": "accuracy", "value": "0.95"} + assert ml_props["trainingMetrics"][1] == {"name": "loss", "value": "0.05"} + assert ml_props["hyperParams"][0] == {"name": "learningRate", "value": "0.001"} + assert ml_props["hyperParams"][1] == {"name": "batchSize", "value": "32"} + assert ml_props["outputUrls"][0] == "s3://my-bucket/ml/output" diff --git a/smoke-test/tests/ml_models/__init__.py b/smoke-test/tests/ml_models/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/smoke-test/tests/ml_models/test_ml_models.py b/smoke-test/tests/ml_models/test_ml_models.py new file mode 100644 index 0000000000000..59821ab3e3cc4 --- /dev/null +++ b/smoke-test/tests/ml_models/test_ml_models.py @@ -0,0 +1,133 @@ +import logging +import os +import tempfile +from random import randint + +import pytest +from datahub.emitter.mce_builder import make_ml_model_group_urn, make_ml_model_urn +from datahub.emitter.mcp import MetadataChangeProposalWrapper +from datahub.ingestion.api.common import PipelineContext, RecordEnvelope +from datahub.ingestion.api.sink import NoopWriteCallback +from datahub.ingestion.graph.client import DataHubGraph +from datahub.ingestion.sink.file import FileSink, FileSinkConfig +from datahub.metadata.schema_classes import ( + MLModelGroupPropertiesClass, + MLModelPropertiesClass, +) + +from tests.utils import ( + delete_urns_from_file, + get_sleep_info, + ingest_file_via_rest, + wait_for_writes_to_sync, +) + +logger = logging.getLogger(__name__) + +# Generate unique model names for testing +start_index = randint(10, 10000) +model_names = [f"test_model_{i}" for i in range(start_index, start_index + 3)] +model_group_urn = make_ml_model_group_urn("workbench", "test_group", "DEV") +model_urns = [make_ml_model_urn("workbench", name, "DEV") for name in model_names] + + +class FileEmitter: + def __init__(self, filename: str) -> None: + self.sink: FileSink = FileSink( + ctx=PipelineContext(run_id="create_test_data"), + config=FileSinkConfig(filename=filename), + ) + + def emit(self, event): + self.sink.write_record_async( + record_envelope=RecordEnvelope(record=event, metadata={}), + write_callback=NoopWriteCallback(), + ) + + def close(self): + self.sink.close() + + +def create_test_data(filename: str): + # Create model group + model_group_mcp = MetadataChangeProposalWrapper( + entityUrn=str(model_group_urn), + aspect=MLModelGroupPropertiesClass( + description="Test model group for integration testing", + trainingJobs=["urn:li:dataProcessInstance:test_job"], + ), + ) + + # Create models that belong to the group + model_mcps = [ + MetadataChangeProposalWrapper( + entityUrn=model_urn, + aspect=MLModelPropertiesClass( + name=f"Test Model ({model_urn})", + description=f"Test model {model_urn}", + groups=[str(model_group_urn)], + trainingJobs=["urn:li:dataProcessInstance:test_job"], + ), + ) + for model_urn in model_urns + ] + + file_emitter = FileEmitter(filename) + for mcps in [model_group_mcp] + model_mcps: + file_emitter.emit(mcps) + + file_emitter.close() + + +sleep_sec, sleep_times = get_sleep_info() + + +@pytest.fixture(scope="module", autouse=False) +def ingest_cleanup_data(auth_session, graph_client, request): + new_file, filename = tempfile.mkstemp(suffix=".json") + try: + create_test_data(filename) + print("ingesting ml model test data") + ingest_file_via_rest(auth_session, filename) + wait_for_writes_to_sync() + yield + print("removing ml model test data") + delete_urns_from_file(graph_client, filename) + wait_for_writes_to_sync() + finally: + os.remove(filename) + + +@pytest.mark.integration +def test_create_ml_models(graph_client: DataHubGraph, ingest_cleanup_data): + """Test creation and validation of ML models and model groups.""" + + # Validate model group properties + fetched_group_props = graph_client.get_aspect( + str(model_group_urn), MLModelGroupPropertiesClass + ) + assert fetched_group_props is not None + assert fetched_group_props.description == "Test model group for integration testing" + assert fetched_group_props.trainingJobs == ["urn:li:dataProcessInstance:test_job"] + + # Validate individual models + for model_urn in model_urns: + fetched_model_props = graph_client.get_aspect(model_urn, MLModelPropertiesClass) + assert fetched_model_props is not None + assert fetched_model_props.name == f"Test Model ({model_urn})" + assert fetched_model_props.description == f"Test model {model_urn}" + assert str(model_group_urn) in (fetched_model_props.groups or []) + assert fetched_model_props.trainingJobs == [ + "urn:li:dataProcessInstance:test_job" + ] + + # Validate relationships between models and group + related_models = set() + for e in graph_client.get_related_entities( + str(model_group_urn), + relationship_types=["MemberOf"], + direction=DataHubGraph.RelationshipDirection.INCOMING, + ): + related_models.add(e.urn) + + assert set(model_urns) == related_models