diff --git a/datahub-web-react/src/app/ingest/source/builder/constants.ts b/datahub-web-react/src/app/ingest/source/builder/constants.ts index ec2d6b119d14e5..89efd267edfe77 100644 --- a/datahub-web-react/src/app/ingest/source/builder/constants.ts +++ b/datahub-web-react/src/app/ingest/source/builder/constants.ts @@ -1,5 +1,6 @@ import athenaLogo from '@images/awsathenalogo.png'; import azureLogo from '@images/azure-ad.png'; +import azureDataFactoryLogo from '@images/azuredatafactorylogo.svg'; import bigqueryLogo from '@images/bigquerylogo.png'; import cassandraLogo from '@images/cassandralogo.png'; import clickhouseLogo from '@images/clickhouselogo.png'; @@ -50,6 +51,8 @@ export const ATHENA = 'athena'; export const ATHENA_URN = `urn:li:dataPlatform:${ATHENA}`; export const AZURE = 'azure-ad'; export const AZURE_URN = `urn:li:dataPlatform:${AZURE}`; +export const AZURE_DATA_FACTORY = 'azure-data-factory'; +export const AZURE_DATA_FACTORY_URN = `urn:li:dataPlatform:${AZURE_DATA_FACTORY}`; export const BIGQUERY = 'bigquery'; export const BIGQUERY_USAGE = 'bigquery-usage'; export const BIGQUERY_BETA = 'bigquery-beta'; @@ -162,6 +165,7 @@ export const STREAMLIT_URN = `urn:li:dataPlatform:${STREAMLIT}`; export const PLATFORM_URN_TO_LOGO = { [ATHENA_URN]: athenaLogo, [AZURE_URN]: azureLogo, + [AZURE_DATA_FACTORY_URN]: azureDataFactoryLogo, [BIGQUERY_URN]: bigqueryLogo, [CLICKHOUSE_URN]: clickhouseLogo, [COCKROACHDB_URN]: cockroachdbLogo, diff --git a/datahub-web-react/src/app/ingestV2/source/builder/constants.ts b/datahub-web-react/src/app/ingestV2/source/builder/constants.ts index be3f8100650414..bf8d80bc0afa41 100644 --- a/datahub-web-react/src/app/ingestV2/source/builder/constants.ts +++ b/datahub-web-react/src/app/ingestV2/source/builder/constants.ts @@ -1,5 +1,6 @@ import athenaLogo from '@images/awsathenalogo.png'; import azureLogo from '@images/azure-ad.png'; +import azureDataFactoryLogo from '@images/azuredatafactorylogo.svg'; import bigqueryLogo from '@images/bigquerylogo.png'; import cassandraLogo from '@images/cassandralogo.png'; import clickhouseLogo from '@images/clickhouselogo.png'; @@ -48,6 +49,8 @@ export const ATHENA = 'athena'; export const ATHENA_URN = `urn:li:dataPlatform:${ATHENA}`; export const AZURE = 'azure-ad'; export const AZURE_URN = `urn:li:dataPlatform:${AZURE}`; +export const AZURE_DATA_FACTORY = 'azure-data-factory'; +export const AZURE_DATA_FACTORY_URN = `urn:li:dataPlatform:${AZURE_DATA_FACTORY}`; export const BIGQUERY = 'bigquery'; export const BIGQUERY_BETA = 'bigquery-beta'; export const BIGQUERY_URN = `urn:li:dataPlatform:${BIGQUERY}`; @@ -155,6 +158,7 @@ export const SNAPLOGIC_URN = `urn:li:dataPlatform:${SNAPLOGIC}`; export const PLATFORM_URN_TO_LOGO = { [ATHENA_URN]: athenaLogo, [AZURE_URN]: azureLogo, + [AZURE_DATA_FACTORY_URN]: azureDataFactoryLogo, [BIGQUERY_URN]: bigqueryLogo, [CLICKHOUSE_URN]: clickhouseLogo, [COCKROACHDB_URN]: cockroachdbLogo, diff --git a/datahub-web-react/src/images/azuredatafactorylogo.svg b/datahub-web-react/src/images/azuredatafactorylogo.svg new file mode 100644 index 00000000000000..22373367da353e --- /dev/null +++ b/datahub-web-react/src/images/azuredatafactorylogo.svg @@ -0,0 +1 @@ +Icon-databases-126 \ No newline at end of file diff --git a/metadata-ingestion/docs/sources/azure-data-factory/README.md b/metadata-ingestion/docs/sources/azure-data-factory/README.md new file mode 100644 index 00000000000000..407ed8bbd83e0e --- /dev/null +++ b/metadata-ingestion/docs/sources/azure-data-factory/README.md @@ -0,0 +1,80 @@ +# 
Azure Data Factory + +For context on getting started with ingestion, check out our [metadata ingestion guide](../../../../metadata-ingestion/README.md). + +## Setup + +To install this plugin, run `pip install 'acryl-datahub[azure-data-factory]'`. + +## Quickstart Recipe + +```yaml +source: + type: azure-data-factory + config: + # Required + subscription_id: ${AZURE_SUBSCRIPTION_ID} + + # Authentication (service principal) + credential: + authentication_method: service_principal + client_id: ${AZURE_CLIENT_ID} + client_secret: ${AZURE_CLIENT_SECRET} + tenant_id: ${AZURE_TENANT_ID} + + # Optional filters + factory_pattern: + allow: ["prod-.*"] + + # Features + include_lineage: true + include_execution_history: false + + env: PROD + +sink: + type: datahub-rest + config: + server: "http://localhost:8080" +``` + +## Authentication Methods + +| Method | Config Value | Use Case | +| ----------------- | ------------------- | ----------------- | +| Service Principal | `service_principal` | Production | +| Managed Identity | `managed_identity` | Azure-hosted | +| Azure CLI | `cli` | Local development | +| Auto-detect | `default` | Flexible | + +## Config Details + +| Field | Required | Description | +| ---------------------------------- | -------- | ----------------------------------------- | +| `subscription_id` | ✅ | Azure subscription ID | +| `credential.authentication_method` | | Auth method (default: `default`) | +| `credential.client_id` | | App (client) ID for service principal | +| `credential.client_secret` | | Client secret for service principal | +| `credential.tenant_id` | | Tenant (directory) ID | +| `resource_group` | | Filter to specific resource group | +| `factory_pattern` | | Regex allow/deny for factories | +| `pipeline_pattern` | | Regex allow/deny for pipelines | +| `include_lineage` | | Extract lineage (default: `true`) | +| `include_execution_history` | | Extract pipeline runs (default: `false`) | +| `execution_history_days` | | Days of history, 1-90 (default: `7`) | +| `platform_instance_map` | | Map linked services to platform instances | +| `env` | | Environment (default: `PROD`) | + +## Entity Mapping + +| ADF Concept | DataHub Entity | +| ------------ | ------------------- | +| Data Factory | Container | +| Pipeline | DataFlow | +| Activity | DataJob | +| Dataset | Dataset | +| Pipeline Run | DataProcessInstance | + +## Questions + +If you've got any questions on configuring this source, feel free to ping us on [our Slack](https://slack.datahubproject.io/). diff --git a/metadata-ingestion/docs/sources/azure-data-factory/azure-data-factory_pre.md b/metadata-ingestion/docs/sources/azure-data-factory/azure-data-factory_pre.md new file mode 100644 index 00000000000000..0f8ba02229d439 --- /dev/null +++ b/metadata-ingestion/docs/sources/azure-data-factory/azure-data-factory_pre.md @@ -0,0 +1,296 @@ +## Overview + +This connector extracts metadata from Azure Data Factory (ADF), including: + +- **Data Factories** as Containers +- **Pipelines** as DataFlows +- **Activities** as DataJobs (Copy, Data Flow, Lookup, etc.) +- **Lineage** between source and destination datasets +- **Execution History** as DataProcessInstance (optional) + +:::note Not Azure Fabric +This connector is for **Azure Data Factory** (classic), not Azure Fabric's Data Factory. Azure Fabric support is planned for a future release. 
+::: + +## Prerequisites + +### Azure Authentication + +The connector supports multiple authentication methods: + +| Method | Best For | Configuration | +| -------------------------- | ------------------------------------------------ | --------------------------------------------------- | +| **Service Principal** | Production environments | `authentication_method: service_principal` | +| **Managed Identity** | Azure-hosted deployments (VMs, AKS, App Service) | `authentication_method: managed_identity` | +| **Azure CLI** | Local development | `authentication_method: cli` (run `az login` first) | +| **DefaultAzureCredential** | Flexible environments | `authentication_method: default` | + +### Required Azure Permissions + +Grant the following role to your identity on the Data Factory resources: + +| Role | Required For | +| ---------------------------- | ----------------------------------- | +| **Reader** | Basic metadata extraction | +| **Data Factory Contributor** | Full access including pipeline runs | + +To set up a service principal: + +1. Create an App Registration in Azure Portal > Microsoft Entra ID > App registrations +2. Create a client secret under Certificates & secrets +3. Grant the service principal **Reader** or **Data Factory Contributor** role on your resource group or Data Factory + +## Concept Mapping + +| Azure Data Factory | DataHub Entity | SubType | +| ------------------ | ------------------------------------------------------------------------------------------------------ | ---------------------------- | +| Data Factory | [Container](https://docs.datahub.com/docs/generated/metamodel/entities/container/) | Data Factory | +| Pipeline | [DataFlow](https://docs.datahub.com/docs/generated/metamodel/entities/dataflow/) | Pipeline | +| Activity | [DataJob](https://docs.datahub.com/docs/generated/metamodel/entities/datajob/) | Copy, DataFlow, Lookup, etc. | +| Dataset | [Dataset](https://docs.datahub.com/docs/generated/metamodel/entities/dataset/) | Based on linked service type | +| Pipeline Run | [DataProcessInstance](https://docs.datahub.com/docs/generated/metamodel/entities/dataprocessinstance/) | - | + +## Capabilities + +| Capability | Status | Notes | +| --------------------- | ------ | ------------------------------------------- | +| Platform Instance | ✅ | Enabled by default | +| Containers | ✅ | Data Factories as containers | +| Lineage (Table-level) | ✅ | From activity inputs/outputs and Data Flows | +| Pipeline-to-Pipeline | ✅ | ExecutePipeline activities create lineage | +| Data Flow Scripts | ✅ | Stored as transformation logic | +| Execution History | ✅ | Optional, via `include_execution_history` | +| Stateful Ingestion | ✅ | Stale entity removal | + +## Lineage Extraction + +The connector extracts lineage from: + +1. **Copy Activities**: Maps input/output datasets to DataHub datasets +2. **Data Flow Activities**: Extracts sources and sinks from Data Flow definitions +3. **Lookup Activities**: Maps lookup datasets as inputs +4. **ExecutePipeline Activities**: Creates pipeline-to-pipeline lineage to child pipelines + +### Pipeline-to-Pipeline Lineage + +When a pipeline calls another pipeline via an `ExecutePipeline` activity, the connector creates a lineage edge showing the calling activity as **upstream** of the child pipeline's first activity. 
This enables: + +- Tracing orchestration hierarchies across nested pipelines +- Impact analysis when modifying child pipelines +- Understanding dependencies between modular pipelines + +**Lineage Direction:** `ExecutePipeline` → `ChildFirstActivity` + +The ExecutePipeline activity's DataJob entity will include: + +- Custom property `calls_pipeline`: Name of the child pipeline +- Custom property `child_pipeline_urn`: URN of the child DataFlow +- Custom property `child_first_activity`: Name of the first activity in the child pipeline + +The child pipeline's first activity will have the ExecutePipeline as its input/upstream dependency. + +### Supported Linked Service Mappings + +| ADF Linked Service | DataHub Platform | +| --------------------------------------------------- | ---------------- | +| AzureBlobStorage, AzureBlobFS, AzureDataLakeStore | `abs` | +| AzureSqlDatabase, AzureSqlDW, AzureSynapseAnalytics | `mssql` | +| Snowflake | `snowflake` | +| AmazonS3 | `s3` | +| GoogleBigQuery | `bigquery` | +| PostgreSql, AzurePostgreSql | `postgres` | +| MySql, AzureMySql | `mysql` | +| Oracle | `oracle` | +| Salesforce | `salesforce` | +| CosmosDb | `cosmosdb` | +| AzureDatabricks, AzureDatabricksDeltaLake | `databricks` | + +### Platform Instance Mapping + +For accurate lineage resolution to existing datasets in DataHub, map linked service names to platform instances: + +```yaml +source: + type: azure-data-factory + config: + platform_instance_map: + "snowflake-prod-connection": "prod_snowflake" + "synapse-analytics-connection": "prod_synapse" +``` + +## Data Flow Scripts + +For activities that execute ADF Data Flows (mapping data flows), the connector extracts the Data Flow script and stores it as transformation logic on the DataJob entity. + +This enables: + +- Viewing the complete Data Flow transformation script in DataHub +- Understanding the data transformations applied by each Data Flow activity +- Searching for Data Flows by their transformation logic + +The script is stored in the `dataTransformLogic` aspect and is visible in the DataHub UI under the activity's details. + +## Execution History + +When `include_execution_history: true`, the connector extracts pipeline runs as `DataProcessInstance` entities: + +```yaml +source: + type: azure-data-factory + config: + include_execution_history: true + execution_history_days: 7 # 1-90 days +``` + +This provides: + +- Pipeline run status (Succeeded, Failed, Cancelled, In Progress) +- Run duration and timestamps +- Trigger information (who/what started the run) +- Run parameters + +## When to Use Platform Instance + +The `platform_instance` configuration is used to distinguish between **separate ADF deployments** (e.g., different Azure subscriptions or tenants), not for separating factories within the same deployment. 
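+
+As a quick illustration, here is a minimal sketch using DataHub's URN builder (hypothetical factory and pipeline names; it mirrors the URN format documented below):
+
+```python
+from datahub.emitter.mce_builder import make_data_flow_urn
+
+# platform_instance is folded into the flow id alongside the factory name
+flow_id = ".".join(filter(None, ["production", "my-factory", "ETL-Pipeline"]))
+urn = make_data_flow_urn(orchestrator="azure-data-factory", flow_id=flow_id, cluster="PROD")
+# -> urn:li:dataFlow:(azure-data-factory,production.my-factory.ETL-Pipeline,PROD)
+```
+
+Omitting `platform_instance` simply drops the leading `production.` segment from the id.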
+ +### When to Use `platform_instance` + +| Scenario | Example Configuration | +| -------------------------------- | ---------------------------------------- | +| **Multiple Azure Subscriptions** | Different subscriptions for prod vs dev | +| **Multi-Tenant Organizations** | Separate Azure tenants per business unit | +| **Multi-Region Deployments** | US-East vs EU-West deployments | + +**Example: Multiple Subscriptions** + +```yaml +# Production subscription +source: + type: azure-data-factory + config: + subscription_id: "prod-subscription-id" + platform_instance: "production" + +# Development subscription +source: + type: azure-data-factory + config: + subscription_id: "dev-subscription-id" + platform_instance: "development" +``` + +**Example: Multi-Region** + +```yaml +# US Region +source: + type: azure-data-factory + config: + subscription_id: "us-east-subscription" + platform_instance: "us-east" + +# EU Region +source: + type: azure-data-factory + config: + subscription_id: "eu-west-subscription" + platform_instance: "eu-west" +``` + +### When NOT to Use `platform_instance` + +- **Single subscription** - Factory names in URNs already provide uniqueness +- **Multiple factories in same subscription** - The factory name is included in the URN automatically +- **Same logical environment** - Don't use it just to differentiate factories + +:::note URN Uniqueness +The connector automatically includes the factory name in pipeline URNs (e.g., `my-factory.ETL-Pipeline`), so you don't need `platform_instance` to distinguish pipelines across factories within the same subscription. +::: + +## URN Format + +Pipeline URNs include the factory name for uniqueness across multiple factories: + +``` +urn:li:dataFlow:(azure-data-factory,{factory_name}.{pipeline_name},{env}) +``` + +Example: `urn:li:dataFlow:(azure-data-factory,my-factory.ETL-Pipeline,PROD)` + +Activity URNs reference their parent pipeline: + +``` +urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,{factory_name}.{pipeline_name},{env}),{activity_name}) +``` + +With `platform_instance` set, it's prepended to the URN: + +``` +urn:li:dataFlow:(azure-data-factory,{platform_instance}.{factory_name}.{pipeline_name},{env}) +``` + +Example: `urn:li:dataFlow:(azure-data-factory,production.my-factory.ETL-Pipeline,PROD)` + +## Naming Rules and Uniqueness + +### Azure Naming Rules + +Azure Data Factory enforces specific naming rules documented at [Azure Data Factory naming rules](https://learn.microsoft.com/en-us/azure/data-factory/naming-rules): + +| Resource | Uniqueness | Case Sensitivity | +| --------------- | ---------------------------- | ---------------- | +| Data Factory | Globally unique across Azure | Case-insensitive | +| Pipelines | Unique within a factory | Case-insensitive | +| Datasets | Unique within a factory | Case-insensitive | +| Linked Services | Unique within a factory | Case-insensitive | +| Data Flows | Unique within a factory | Case-insensitive | + +### How DataHub Handles Uniqueness + +The connector constructs URNs using `{factory_name}.{pipeline_name}` format: + +- **Factory names are globally unique** in Azure, preventing collisions within a subscription +- **Pipeline names are unique within a factory**, so the combination is globally unique +- **No additional namespacing needed** for single-subscription deployments + +### Multi-Subscription and Multi-Tenant Scenarios + +:::warning Important +Factory names are globally unique _within Azure_, but different Azure tenants or subscriptions in different regions could have 
identically-named factories. +::: + +| Scenario | Risk | Solution | +| ------------------------------------ | ------------------------------------- | -------------------------------------------------- | +| Single subscription | None | Default URN format works | +| Multiple subscriptions (same tenant) | Low - factory names still unique | Default works, but `platform_instance` recommended | +| Multiple tenants | **High** - same factory name possible | **Must use `platform_instance`** | + +**Example: Multi-Tenant Setup** + +```yaml +# Tenant A +source: + type: azure-data-factory + config: + subscription_id: "tenant-a-sub" + platform_instance: "tenant-a" + +# Tenant B (could have same factory name!) +source: + type: azure-data-factory + config: + subscription_id: "tenant-b-sub" + platform_instance: "tenant-b" +``` + +### Case Sensitivity + +Azure treats names as **case-insensitive** (e.g., `MyFactory` and `myfactory` are the same factory). DataHub URNs are case-sensitive, but this doesn't cause issues because: + +1. Azure prevents creating duplicate names with different casing at the source +2. The connector uses exact names from the Azure API response +3. Consistent casing is maintained throughout ingestion + +:::tip +If you're ingesting from multiple Azure tenants and see unexpected entity overwrites in DataHub, ensure each ingestion recipe uses a unique `platform_instance` value. +::: diff --git a/metadata-ingestion/docs/sources/azure-data-factory/azure-data-factory_recipe.yml b/metadata-ingestion/docs/sources/azure-data-factory/azure-data-factory_recipe.yml new file mode 100644 index 00000000000000..e07c71556b0649 --- /dev/null +++ b/metadata-ingestion/docs/sources/azure-data-factory/azure-data-factory_recipe.yml @@ -0,0 +1,56 @@ +# Example recipe for Azure Data Factory source +# See README.md for full configuration options + +source: + type: azure-data-factory + config: + # Required: Azure subscription containing Data Factories + subscription_id: ${AZURE_SUBSCRIPTION_ID} + + # Optional: Filter to specific resource group + # resource_group: my-resource-group + + # Authentication (using service principal) + credential: + authentication_method: service_principal + client_id: ${AZURE_CLIENT_ID} + client_secret: ${AZURE_CLIENT_SECRET} + tenant_id: ${AZURE_TENANT_ID} + + # Optional: Filter factories by name pattern + factory_pattern: + allow: + - ".*" # Allow all factories by default + deny: [] + + # Optional: Filter pipelines by name pattern + pipeline_pattern: + allow: + - ".*" # Allow all pipelines by default + deny: [] + + # Feature flags + include_lineage: true + include_column_lineage: false # Advanced: requires Data Flow parsing + include_execution_history: false # Set to true for pipeline run history + execution_history_days: 7 # Only used when include_execution_history is true + + # Optional: Map linked services to platform instances for accurate lineage + # platform_instance_map: + # "my-snowflake-connection": "prod_snowflake" + + # Optional: Platform instance for this ADF connector + # platform_instance: "main-adf" + + # Environment + env: PROD + + # Optional: Stateful ingestion for stale entity removal + # stateful_ingestion: + # enabled: true + +sink: + type: datahub-rest + config: + server: "http://localhost:8080" + diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index e79c1f94857d5e..c27cbe4cd0a2d0 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -348,6 +348,11 @@ *path_spec_common, } +azure_data_factory = { + 
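+    # Azure credential handling plus the Data Factory management client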
"azure-identity>=1.21.0", + "azure-mgmt-datafactory>=9.0.0", +} + data_lake_profiling = { "pydeequ>=1.1.0", "pyspark~=3.5.6", @@ -450,6 +455,7 @@ "tenacity!=8.4.0", }, "azure-ad": set(), + "azure-data-factory": azure_data_factory, "bigquery": sql_common | bigquery_common | sqlglot_lib @@ -781,6 +787,7 @@ dependency for plugin in [ "athena", + "azure-data-factory", "circuit-breaker", "clickhouse", "delta-lake", @@ -817,6 +824,7 @@ "sqlalchemy = datahub.ingestion.source.sql.sql_generic:SQLAlchemyGenericSource", "athena = datahub.ingestion.source.sql.athena:AthenaSource", "azure-ad = datahub.ingestion.source.identity.azure_ad:AzureADSource", + "azure-data-factory = datahub.ingestion.source.azure_data_factory.adf_source:AzureDataFactorySource", "bigquery = datahub.ingestion.source.bigquery_v2.bigquery:BigqueryV2Source", "bigquery-queries = datahub.ingestion.source.bigquery_v2.bigquery_queries:BigQueryQueriesSource", "clickhouse = datahub.ingestion.source.sql.clickhouse:ClickHouseSource", diff --git a/metadata-ingestion/src/datahub/ingestion/autogenerated/capability_summary.json b/metadata-ingestion/src/datahub/ingestion/autogenerated/capability_summary.json index e0b59f866091a7..9e864336fc2c49 100644 --- a/metadata-ingestion/src/datahub/ingestion/autogenerated/capability_summary.json +++ b/metadata-ingestion/src/datahub/ingestion/autogenerated/capability_summary.json @@ -1,5 +1,5 @@ { - "generated_at": "2025-11-14T14:26:00.526772+00:00", + "generated_at": "2025-12-09T02:03:28.193633+00:00", "generated_by": "metadata-ingestion/scripts/capability_summary.py", "plugin_details": { "abs": { @@ -136,6 +136,38 @@ "platform_name": "Azure AD", "support_status": "CERTIFIED" }, + "azure-data-factory": { + "capabilities": [ + { + "capability": "CONTAINERS", + "description": "Enabled by default", + "subtype_modifier": null, + "supported": true + }, + { + "capability": "DELETION_DETECTION", + "description": "Enabled by default via stateful ingestion", + "subtype_modifier": null, + "supported": true + }, + { + "capability": "PLATFORM_INSTANCE", + "description": "Enabled by default", + "subtype_modifier": null, + "supported": true + }, + { + "capability": "LINEAGE_COARSE", + "description": "Extracts lineage from activity inputs/outputs", + "subtype_modifier": null, + "supported": true + } + ], + "classname": "datahub.ingestion.source.azure_data_factory.adf_source.AzureDataFactorySource", + "platform_id": "azure-data-factory", + "platform_name": "Azure Data Factory", + "support_status": "INCUBATING" + }, "bigquery": { "capabilities": [ { diff --git a/metadata-ingestion/src/datahub/ingestion/source/azure/azure_auth.py b/metadata-ingestion/src/datahub/ingestion/source/azure/azure_auth.py new file mode 100644 index 00000000000000..424a796e323fcc --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/azure/azure_auth.py @@ -0,0 +1,190 @@ +"""Unified Azure authentication module for DataHub connectors. + +This module provides a reusable authentication configuration that can be used +across all Azure connectors (ADF, Synapse, Fabric, etc.). 
+ +Supports multiple authentication methods: +- Service Principal (client_id + client_secret + tenant_id) +- Managed Identity (system-assigned or user-assigned) +- Azure CLI credentials (for local development) +- DefaultAzureCredential (auto-detects environment) +""" + +from typing import Optional + +from azure.core.credentials import TokenCredential +from azure.identity import ( + AzureCliCredential, + ClientSecretCredential, + DefaultAzureCredential, + ManagedIdentityCredential, +) +from pydantic import Field, SecretStr, model_validator + +from datahub.configuration import ConfigModel +from datahub.utilities.str_enum import StrEnum + + +class AzureAuthenticationMethod(StrEnum): + """Supported Azure authentication methods. + + - DEFAULT: Uses DefaultAzureCredential which auto-detects credentials from + environment variables, managed identity, Azure CLI, etc. + - SERVICE_PRINCIPAL: Uses client ID, client secret, and tenant ID + - MANAGED_IDENTITY: Uses Azure Managed Identity (system or user-assigned) + - CLI: Uses Azure CLI credential (requires `az login`) + """ + + DEFAULT = "default" + SERVICE_PRINCIPAL = "service_principal" + MANAGED_IDENTITY = "managed_identity" + CLI = "cli" + + +class AzureCredentialConfig(ConfigModel): + """Unified Azure authentication configuration. + + This class provides a reusable authentication configuration that can be + composed into any Azure connector's configuration. It supports multiple + authentication methods and returns a TokenCredential that works with + any Azure SDK client. + + Example usage in a connector config: + class MyAzureConnectorConfig(ConfigModel): + credential: AzureCredentialConfig = Field( + default_factory=AzureCredentialConfig, + description="Azure authentication configuration" + ) + subscription_id: str = Field(...) + """ + + authentication_method: AzureAuthenticationMethod = Field( + default=AzureAuthenticationMethod.DEFAULT, + description=( + "Authentication method to use. Options: " + "'default' (auto-detects from environment), " + "'service_principal' (client ID + secret + tenant), " + "'managed_identity' (Azure Managed Identity), " + "'cli' (Azure CLI credential). " + "Recommended: Use 'default' which tries multiple methods automatically." + ), + ) + + # Service Principal credentials (required when authentication_method = "service_principal") + client_id: Optional[str] = Field( + default=None, + description=( + "Azure Application (client) ID. Required for service_principal authentication. " + "Find this in Azure Portal > App registrations > Your app > Overview." + ), + ) + client_secret: Optional[SecretStr] = Field( + default=None, + description=( + "Azure client secret. Required for service_principal authentication. " + "Create in Azure Portal > App registrations > Your app > Certificates & secrets." + ), + ) + tenant_id: Optional[str] = Field( + default=None, + description=( + "Azure tenant (directory) ID. Required for service_principal authentication. " + "Find this in Azure Portal > Microsoft Entra ID > Overview." + ), + ) + + # Managed Identity options (optional, for user-assigned managed identity) + managed_identity_client_id: Optional[str] = Field( + default=None, + description=( + "Client ID for user-assigned managed identity. " + "Leave empty to use system-assigned managed identity. " + "Only used when authentication_method is 'managed_identity'." 
+ ), + ) + + # Additional options for DefaultAzureCredential + exclude_cli_credential: bool = Field( + default=False, + description=( + "When using 'default' authentication, exclude Azure CLI credential. " + "Useful in production to avoid accidentally using developer credentials." + ), + ) + exclude_environment_credential: bool = Field( + default=False, + description=( + "When using 'default' authentication, exclude environment variables. " + "Environment variables checked: AZURE_CLIENT_ID, AZURE_CLIENT_SECRET, AZURE_TENANT_ID." + ), + ) + exclude_managed_identity_credential: bool = Field( + default=False, + description=( + "When using 'default' authentication, exclude managed identity. " + "Useful during local development when managed identity is not available." + ), + ) + + def get_credential(self) -> TokenCredential: + """Get Azure credential based on the configured authentication method. + + Returns: + TokenCredential: An Azure credential object that can be used with + any Azure SDK client (e.g., DataFactoryManagementClient). + + Raises: + ValueError: If required credentials are missing for the chosen method. + """ + if self.authentication_method == AzureAuthenticationMethod.SERVICE_PRINCIPAL: + # Validate all required fields (also validated in validate_credentials()) + if not self.client_secret: + raise ValueError( + "client_secret is required for service_principal authentication" + ) + if not self.tenant_id: + raise ValueError( + "tenant_id is required for service_principal authentication" + ) + if not self.client_id: + raise ValueError( + "client_id is required for service_principal authentication" + ) + return ClientSecretCredential( + tenant_id=self.tenant_id, + client_id=self.client_id, + client_secret=self.client_secret.get_secret_value(), + ) + + elif self.authentication_method == AzureAuthenticationMethod.MANAGED_IDENTITY: + return ManagedIdentityCredential(client_id=self.managed_identity_client_id) + + elif self.authentication_method == AzureAuthenticationMethod.CLI: + return AzureCliCredential() + + else: # DEFAULT + return DefaultAzureCredential( + exclude_cli_credential=self.exclude_cli_credential, + exclude_environment_credential=self.exclude_environment_credential, + exclude_managed_identity_credential=self.exclude_managed_identity_credential, + ) + + @model_validator(mode="after") + def validate_credentials(self) -> "AzureCredentialConfig": + """Validate that required credentials are provided for the chosen method.""" + if self.authentication_method == AzureAuthenticationMethod.SERVICE_PRINCIPAL: + missing = [] + if not self.client_id: + missing.append("client_id") + if not self.client_secret: + missing.append("client_secret") + if not self.tenant_id: + missing.append("tenant_id") + + if missing: + raise ValueError( + f"Service principal authentication requires: {', '.join(missing)}. " + f"These can be found in Azure Portal > App registrations." + ) + + return self diff --git a/metadata-ingestion/src/datahub/ingestion/source/azure_data_factory/__init__.py b/metadata-ingestion/src/datahub/ingestion/source/azure_data_factory/__init__.py new file mode 100644 index 00000000000000..f3adb4ffcaf005 --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/azure_data_factory/__init__.py @@ -0,0 +1,22 @@ +"""Azure Data Factory DataHub connector. 
+ +This package provides a connector to ingest metadata from Azure Data Factory +into DataHub, including: + +- Data Factories as Containers +- Pipelines as DataFlows +- Activities as DataJobs +- Dataset lineage +- Execution history (optional) + +Usage: + source: + type: azure_data_factory + config: + subscription_id: ${AZURE_SUBSCRIPTION_ID} + credential: + authentication_method: service_principal + client_id: ${AZURE_CLIENT_ID} + client_secret: ${AZURE_CLIENT_SECRET} + tenant_id: ${AZURE_TENANT_ID} +""" diff --git a/metadata-ingestion/src/datahub/ingestion/source/azure_data_factory/adf_client.py b/metadata-ingestion/src/datahub/ingestion/source/azure_data_factory/adf_client.py new file mode 100644 index 00000000000000..d5e06630930643 --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/azure_data_factory/adf_client.py @@ -0,0 +1,438 @@ +"""Azure Data Factory REST API client wrapper. + +This module provides a typed client for interacting with the Azure Data Factory +REST API. It handles authentication, pagination, and error handling. + +API Documentation: https://learn.microsoft.com/en-us/rest/api/datafactory/ +""" + +import logging +from datetime import datetime, timedelta, timezone +from typing import Iterator, Optional + +from azure.core.credentials import TokenCredential +from azure.core.exceptions import HttpResponseError +from azure.mgmt.datafactory import DataFactoryManagementClient +from azure.mgmt.datafactory.models import ( + ActivityRunsQueryResponse, + PipelineRunsQueryResponse, + RunFilterParameters, +) + +from datahub.ingestion.source.azure_data_factory.adf_models import ( + ActivityRun, + DataFlow, + Dataset, + Factory, + LinkedService, + Pipeline, + PipelineRun, + Trigger, +) + +logger = logging.getLogger(__name__) + +# Maximum retention period for activity run queries (Azure limit) +MAX_ACTIVITY_RUN_RETENTION_DAYS = 90 + + +class AzureDataFactoryClient: + """Client for Azure Data Factory REST API. + + Uses the Azure SDK (azure-mgmt-datafactory) for type safety and + automatic pagination handling. + + API Reference: https://learn.microsoft.com/en-us/rest/api/datafactory/ + """ + + def __init__( + self, + credential: TokenCredential, + subscription_id: str, + ) -> None: + """Initialize the ADF client. + + Args: + credential: Azure credential for authentication (from AzureCredentialConfig) + subscription_id: Azure subscription ID containing Data Factories + """ + self.subscription_id = subscription_id + self._client = DataFactoryManagementClient( + credential=credential, + subscription_id=subscription_id, + ) + + def get_factories( + self, + resource_group: Optional[str] = None, + ) -> Iterator[Factory]: + """List all Data Factories. 
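+
+        The SDK returns a lazily-paged iterator, so factories are fetched
+        page by page as this generator is consumed.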
+ + API Reference: + - By subscription: https://learn.microsoft.com/en-us/rest/api/datafactory/factories/list + - By resource group: https://learn.microsoft.com/en-us/rest/api/datafactory/factories/list-by-resource-group + + Args: + resource_group: Optional resource group name to filter factories + + Yields: + Factory objects + """ + try: + if resource_group: + # GET /subscriptions/{sub}/resourceGroups/{rg}/providers/Microsoft.DataFactory/factories + # Docs: https://learn.microsoft.com/en-us/rest/api/datafactory/factories/list-by-resource-group + factories_response = self._client.factories.list_by_resource_group( + resource_group_name=resource_group + ) + else: + # GET /subscriptions/{sub}/providers/Microsoft.DataFactory/factories + # Docs: https://learn.microsoft.com/en-us/rest/api/datafactory/factories/list + factories_response = self._client.factories.list() + + for factory in factories_response: + yield Factory.model_validate(factory.as_dict()) + + except HttpResponseError as e: + logger.error(f"Failed to list factories: {e.message}") + raise + + def get_factory( + self, + resource_group: str, + factory_name: str, + ) -> Factory: + """Get a specific Data Factory. + + API Reference: https://learn.microsoft.com/en-us/rest/api/datafactory/factories/get + + Args: + resource_group: Resource group name + factory_name: Data Factory name + + Returns: + Factory object + """ + # GET /subscriptions/{sub}/resourceGroups/{rg}/providers/Microsoft.DataFactory/factories/{factoryName} + # Docs: https://learn.microsoft.com/en-us/rest/api/datafactory/factories/get + factory = self._client.factories.get( + resource_group_name=resource_group, + factory_name=factory_name, + ) + if factory is None: + raise ValueError(f"Factory not found: {factory_name}") + return Factory.model_validate(factory.as_dict()) + + def get_pipelines( + self, + resource_group: str, + factory_name: str, + ) -> Iterator[Pipeline]: + """List all pipelines in a Data Factory. + + API Reference: https://learn.microsoft.com/en-us/rest/api/datafactory/pipelines/list-by-factory + + Args: + resource_group: Resource group name + factory_name: Data Factory name + + Yields: + Pipeline objects with activities + """ + try: + # GET /subscriptions/{sub}/resourceGroups/{rg}/providers/Microsoft.DataFactory/factories/{factoryName}/pipelines + # Docs: https://learn.microsoft.com/en-us/rest/api/datafactory/pipelines/list-by-factory + pipelines_response = self._client.pipelines.list_by_factory( + resource_group_name=resource_group, + factory_name=factory_name, + ) + + for pipeline in pipelines_response: + yield Pipeline.model_validate(pipeline.as_dict()) + + except HttpResponseError as e: + logger.error( + f"Failed to list pipelines for factory {factory_name}: {e.message}" + ) + raise + + def get_pipeline( + self, + resource_group: str, + factory_name: str, + pipeline_name: str, + ) -> Pipeline: + """Get a specific pipeline. 
+ + API Reference: https://learn.microsoft.com/en-us/rest/api/datafactory/pipelines/get + + Args: + resource_group: Resource group name + factory_name: Data Factory name + pipeline_name: Pipeline name + + Returns: + Pipeline object with activities + """ + # GET /subscriptions/{sub}/resourceGroups/{rg}/providers/Microsoft.DataFactory/factories/{factoryName}/pipelines/{pipelineName} + # Docs: https://learn.microsoft.com/en-us/rest/api/datafactory/pipelines/get + pipeline = self._client.pipelines.get( + resource_group_name=resource_group, + factory_name=factory_name, + pipeline_name=pipeline_name, + ) + if pipeline is None: + raise ValueError(f"Pipeline not found: {pipeline_name}") + return Pipeline.model_validate(pipeline.as_dict()) + + def get_datasets( + self, + resource_group: str, + factory_name: str, + ) -> Iterator[Dataset]: + """List all datasets in a Data Factory. + + API Reference: https://learn.microsoft.com/en-us/rest/api/datafactory/datasets/list-by-factory + + Args: + resource_group: Resource group name + factory_name: Data Factory name + + Yields: + Dataset objects + """ + try: + # GET /subscriptions/{sub}/resourceGroups/{rg}/providers/Microsoft.DataFactory/factories/{factoryName}/datasets + # Docs: https://learn.microsoft.com/en-us/rest/api/datafactory/datasets/list-by-factory + datasets_response = self._client.datasets.list_by_factory( + resource_group_name=resource_group, + factory_name=factory_name, + ) + + for dataset in datasets_response: + yield Dataset.model_validate(dataset.as_dict()) + + except HttpResponseError as e: + logger.error( + f"Failed to list datasets for factory {factory_name}: {e.message}" + ) + raise + + def get_linked_services( + self, + resource_group: str, + factory_name: str, + ) -> Iterator[LinkedService]: + """List all linked services in a Data Factory. + + API Reference: https://learn.microsoft.com/en-us/rest/api/datafactory/linked-services/list-by-factory + + Args: + resource_group: Resource group name + factory_name: Data Factory name + + Yields: + LinkedService objects + """ + try: + # GET /subscriptions/{sub}/resourceGroups/{rg}/providers/Microsoft.DataFactory/factories/{factoryName}/linkedservices + # Docs: https://learn.microsoft.com/en-us/rest/api/datafactory/linked-services/list-by-factory + linked_services_response = self._client.linked_services.list_by_factory( + resource_group_name=resource_group, + factory_name=factory_name, + ) + + for linked_service in linked_services_response: + yield LinkedService.model_validate(linked_service.as_dict()) + + except HttpResponseError as e: + logger.error( + f"Failed to list linked services for factory {factory_name}: {e.message}" + ) + raise + + def get_data_flows( + self, + resource_group: str, + factory_name: str, + ) -> Iterator[DataFlow]: + """List all data flows in a Data Factory. 
+ + API Reference: https://learn.microsoft.com/en-us/rest/api/datafactory/data-flows/list-by-factory + + Args: + resource_group: Resource group name + factory_name: Data Factory name + + Yields: + DataFlow objects + """ + try: + # GET /subscriptions/{sub}/resourceGroups/{rg}/providers/Microsoft.DataFactory/factories/{factoryName}/dataflows + # Docs: https://learn.microsoft.com/en-us/rest/api/datafactory/data-flows/list-by-factory + data_flows_response = self._client.data_flows.list_by_factory( + resource_group_name=resource_group, + factory_name=factory_name, + ) + + for data_flow in data_flows_response: + yield DataFlow.model_validate(data_flow.as_dict()) + + except HttpResponseError as e: + logger.error( + f"Failed to list data flows for factory {factory_name}: {e.message}" + ) + raise + + def get_triggers( + self, + resource_group: str, + factory_name: str, + ) -> Iterator[Trigger]: + """List all triggers in a Data Factory. + + API Reference: https://learn.microsoft.com/en-us/rest/api/datafactory/triggers/list-by-factory + + Args: + resource_group: Resource group name + factory_name: Data Factory name + + Yields: + Trigger objects + """ + try: + # GET /subscriptions/{sub}/resourceGroups/{rg}/providers/Microsoft.DataFactory/factories/{factoryName}/triggers + # Docs: https://learn.microsoft.com/en-us/rest/api/datafactory/triggers/list-by-factory + triggers_response = self._client.triggers.list_by_factory( + resource_group_name=resource_group, + factory_name=factory_name, + ) + + for trigger in triggers_response: + yield Trigger.model_validate(trigger.as_dict()) + + except HttpResponseError as e: + logger.error( + f"Failed to list triggers for factory {factory_name}: {e.message}" + ) + raise + + def get_pipeline_runs( + self, + resource_group: str, + factory_name: str, + days: int = 7, + ) -> Iterator[PipelineRun]: + """Query pipeline runs for a Data Factory. 
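+
+        Follows the query API's continuation tokens, so all pages of runs
+        within the time window are yielded.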
+ + API Reference: https://learn.microsoft.com/en-us/rest/api/datafactory/pipeline-runs/query-by-factory + + Args: + resource_group: Resource group name + factory_name: Data Factory name + days: Number of days of history to fetch + + Yields: + PipelineRun objects + """ + try: + end_time = datetime.now(timezone.utc) + start_time = end_time - timedelta(days=days) + + # POST /subscriptions/{sub}/resourceGroups/{rg}/providers/Microsoft.DataFactory/factories/{factoryName}/queryPipelineRuns + # Docs: https://learn.microsoft.com/en-us/rest/api/datafactory/pipeline-runs/query-by-factory + filter_params = RunFilterParameters( + last_updated_after=start_time, + last_updated_before=end_time, + ) + + response: PipelineRunsQueryResponse = ( + self._client.pipeline_runs.query_by_factory( + resource_group_name=resource_group, + factory_name=factory_name, + filter_parameters=filter_params, + ) + ) + + for run in response.value or []: + yield PipelineRun.model_validate(run.as_dict()) + + # Handle pagination via continuation token + while response.continuation_token: + filter_params.continuation_token = response.continuation_token + response = self._client.pipeline_runs.query_by_factory( + resource_group_name=resource_group, + factory_name=factory_name, + filter_parameters=filter_params, + ) + for run in response.value or []: + yield PipelineRun.model_validate(run.as_dict()) + + except HttpResponseError as e: + logger.error( + f"Failed to query pipeline runs for factory {factory_name}: {e.message}" + ) + raise + + def get_activity_runs( + self, + resource_group: str, + factory_name: str, + run_id: str, + ) -> Iterator[ActivityRun]: + """Query activity runs for a pipeline run. + + API Reference: https://learn.microsoft.com/en-us/rest/api/datafactory/activity-runs/query-by-pipeline-run + + Args: + resource_group: Resource group name + factory_name: Data Factory name + run_id: Pipeline run ID + + Yields: + ActivityRun objects + """ + try: + end_time = datetime.now(timezone.utc) + start_time = end_time - timedelta(days=MAX_ACTIVITY_RUN_RETENTION_DAYS) + + # POST /subscriptions/{sub}/resourceGroups/{rg}/providers/Microsoft.DataFactory/factories/{factoryName}/pipelineruns/{runId}/queryActivityruns + # Docs: https://learn.microsoft.com/en-us/rest/api/datafactory/activity-runs/query-by-pipeline-run + filter_params = RunFilterParameters( + last_updated_after=start_time, + last_updated_before=end_time, + ) + + response: ActivityRunsQueryResponse = ( + self._client.activity_runs.query_by_pipeline_run( + resource_group_name=resource_group, + factory_name=factory_name, + run_id=run_id, + filter_parameters=filter_params, + ) + ) + + for run in response.value or []: + yield ActivityRun.model_validate(run.as_dict()) + + # Handle pagination via continuation token + while response.continuation_token: + filter_params.continuation_token = response.continuation_token + response = self._client.activity_runs.query_by_pipeline_run( + resource_group_name=resource_group, + factory_name=factory_name, + run_id=run_id, + filter_parameters=filter_params, + ) + for run in response.value or []: + yield ActivityRun.model_validate(run.as_dict()) + + except HttpResponseError as e: + logger.error( + f"Failed to query activity runs for pipeline run {run_id}: {e.message}" + ) + raise + + def close(self) -> None: + """Close the client and release resources.""" + self._client.close() diff --git a/metadata-ingestion/src/datahub/ingestion/source/azure_data_factory/adf_config.py 
b/metadata-ingestion/src/datahub/ingestion/source/azure_data_factory/adf_config.py new file mode 100644 index 00000000000000..2db88fecd2773d --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/azure_data_factory/adf_config.py @@ -0,0 +1,134 @@ +"""Configuration classes for Azure Data Factory connector.""" + +from typing import Optional + +from pydantic import Field + +from datahub.configuration.common import AllowDenyPattern +from datahub.configuration.source_common import ( + EnvConfigMixin, + PlatformInstanceConfigMixin, +) +from datahub.ingestion.source.azure.azure_auth import AzureCredentialConfig +from datahub.ingestion.source.state.stale_entity_removal_handler import ( + StatefulStaleMetadataRemovalConfig, +) +from datahub.ingestion.source.state.stateful_ingestion_base import ( + StatefulIngestionConfigBase, +) + + +class AzureDataFactoryConfig( + StatefulIngestionConfigBase, + PlatformInstanceConfigMixin, + EnvConfigMixin, +): + """Configuration for Azure Data Factory source. + + This connector extracts metadata from Azure Data Factory including: + - Data Factories as Containers + - Pipelines as DataFlows + - Activities as DataJobs + - Dataset lineage + - Execution history (optional) + """ + + # Azure Authentication + credential: AzureCredentialConfig = Field( + default_factory=AzureCredentialConfig, + description=( + "Azure authentication configuration. Supports service principal, " + "managed identity, Azure CLI, or auto-detection (DefaultAzureCredential). " + "See AzureCredentialConfig for detailed options." + ), + ) + + # Azure Scope + subscription_id: str = Field( + description=( + "Azure subscription ID containing the Data Factories to ingest. " + "Find this in Azure Portal > Subscriptions." + ), + ) + + resource_group: Optional[str] = Field( + default=None, + description=( + "Azure resource group name to filter Data Factories. " + "If not specified, all Data Factories in the subscription will be ingested." + ), + ) + + # Filtering + factory_pattern: AllowDenyPattern = Field( + default=AllowDenyPattern.allow_all(), + description=( + "Regex patterns to filter Data Factories by name. " + "Example: allow=['prod-.*'], deny=['.*-test']" + ), + ) + + pipeline_pattern: AllowDenyPattern = Field( + default=AllowDenyPattern.allow_all(), + description=( + "Regex patterns to filter pipelines by name. " + "Applied to all factories matching factory_pattern." + ), + ) + + # Feature Flags + include_lineage: bool = Field( + default=True, + description=( + "Extract lineage from activity inputs/outputs. " + "Maps ADF datasets to DataHub datasets based on linked service type." + ), + ) + + include_column_lineage: bool = Field( + default=True, + description=( + "Extract column-level lineage from Data Flow activities. " + "Requires parsing Data Flow definitions." + ), + ) + + include_execution_history: bool = Field( + default=True, + description=( + "Extract pipeline and activity execution history as DataProcessInstance. " + "Includes run status, duration, and parameters. " + "Enables lineage extraction from parameterized activities using actual runtime values." + ), + ) + + execution_history_days: int = Field( + default=7, + description=( + "Number of days of execution history to extract. " + "Only used when include_execution_history is True. " + "Higher values increase ingestion time." + ), + ge=1, + le=90, + ) + + # Platform Mapping + platform_instance_map: dict[str, str] = Field( + default_factory=dict, + description=( + "Map linked service names to DataHub platform instances. 
" + "Example: {'my-snowflake-connection': 'prod_snowflake'}. " + "Used for accurate lineage resolution to existing datasets." + ), + ) + + # Stateful Ingestion + stateful_ingestion: Optional[StatefulStaleMetadataRemovalConfig] = Field( + default=None, + description=( + "Configuration for stateful ingestion and stale entity removal. " + "When enabled, tracks ingested entities and removes those that " + "no longer exist in Azure Data Factory." + ), + ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/azure_data_factory/adf_models.py b/metadata-ingestion/src/datahub/ingestion/source/azure_data_factory/adf_models.py new file mode 100644 index 00000000000000..4d89df3c583c8c --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/azure_data_factory/adf_models.py @@ -0,0 +1,624 @@ +"""Pydantic models for Azure Data Factory API responses. + +These models provide type safety and validation for ADF REST API responses. +Field names match the Azure API response structure (camelCase). + +API Documentation: https://learn.microsoft.com/en-us/rest/api/datafactory/ +""" + +from datetime import datetime +from typing import Any, Optional, Union + +from pydantic import BaseModel, ConfigDict, Field, model_validator +from typing_extensions import TypedDict + +# Type aliases for common JSON value types in ADF API responses +# Azure API parameters and variables can contain primitive types +JsonPrimitive = Union[str, int, float, bool, None] + + +# TypedDict for well-known structures in ADF API responses. +# These provide type hints for commonly-used nested dictionaries from Azure SDK. +# Using total=False makes all fields optional, matching Azure's inconsistent responses. +class FolderInfo(TypedDict, total=False): + """Folder organization structure used by pipelines, datasets, etc.""" + + name: str + + +class InvokedByInfo(TypedDict, total=False): + """Information about what triggered a pipeline run.""" + + name: str + id: str + invokedByType: str + + +class UserProperty(TypedDict, total=False): + """User-defined property on an activity.""" + + name: str + value: str + + +class IntegrationRuntimeReference(TypedDict, total=False): + """Reference to an integration runtime.""" + + referenceName: str + type: str + + +class ActivityPolicy(TypedDict, total=False): + """Execution policy for an activity.""" + + timeout: str + retry: int + retryIntervalInSeconds: int + secureInput: bool + secureOutput: bool + + +class SchemaColumn(TypedDict, total=False): + """Column definition in a dataset schema.""" + + name: str + type: str + physicalType: str + precision: int + scale: int + + +class AdfResource(BaseModel): + """Base model for Azure Data Factory resources.""" + + model_config = ConfigDict(populate_by_name=True, extra="allow") + + id: str = Field(description="Azure resource ID") + name: str = Field(description="Resource name") + type: str = Field(description="Azure resource type") + etag: Optional[str] = Field(default=None, description="Resource ETag") + + +class FactoryProperties(BaseModel): + """Properties of a Data Factory.""" + + model_config = ConfigDict(populate_by_name=True, extra="allow") + + provisioning_state: Optional[str] = Field( + default=None, alias="provisioningState", description="Provisioning state" + ) + create_time: Optional[datetime] = Field( + default=None, alias="createTime", description="Factory creation time" + ) + version: Optional[str] = Field(default=None, description="Factory version") + + +class Factory(AdfResource): + """Azure Data Factory resource. 
+ + API Reference: https://learn.microsoft.com/en-us/rest/api/datafactory/factories/get + """ + + location: str = Field(description="Azure region") + tags: dict[str, str] = Field(default_factory=dict, description="Resource tags") + properties: Optional[FactoryProperties] = Field( + default=None, description="Factory properties" + ) + + +class ActivityDependency(BaseModel): + """Dependency between activities in a pipeline.""" + + model_config = ConfigDict(populate_by_name=True, extra="allow") + + activity: str = Field(description="Name of the dependent activity") + dependency_conditions: list[str] = Field( + default_factory=list, + alias="dependencyConditions", + description="Conditions for dependency (Succeeded, Failed, Skipped, Completed)", + ) + + +class DatasetReference(BaseModel): + """Reference to an ADF dataset.""" + + model_config = ConfigDict(populate_by_name=True, extra="allow") + + reference_name: str = Field(alias="referenceName", description="Dataset name") + type: str = Field(default="DatasetReference", description="Reference type") + parameters: dict[str, JsonPrimitive] = Field( + default_factory=dict, description="Dataset parameters" + ) + + +class LinkedServiceReference(BaseModel): + """Reference to a linked service.""" + + model_config = ConfigDict(populate_by_name=True, extra="allow") + + reference_name: str = Field( + alias="referenceName", description="Linked service name" + ) + type: str = Field(default="LinkedServiceReference", description="Reference type") + parameters: dict[str, JsonPrimitive] = Field( + default_factory=dict, description="Linked service parameters" + ) + + +class ActivityInput(BaseModel): + """Input configuration for an activity.""" + + model_config = ConfigDict(populate_by_name=True, extra="allow") + + # For Copy activities - source config varies by source type (SQL, Blob, etc.) + source: Optional[dict[str, Any]] = Field( + default=None, description="Source configuration" + ) + + # Dataset reference (common) + dataset: Optional[DatasetReference] = Field( + default=None, description="Input dataset reference" + ) + + +class ActivityOutput(BaseModel): + """Output configuration for an activity.""" + + model_config = ConfigDict(populate_by_name=True, extra="allow") + + # For Copy activities - sink config varies by sink type + sink: Optional[dict[str, Any]] = Field( + default=None, description="Sink configuration" + ) + + # Dataset reference (common) + dataset: Optional[DatasetReference] = Field( + default=None, description="Output dataset reference" + ) + + +class Activity(BaseModel): + """Activity within an ADF pipeline. + + API Reference: https://learn.microsoft.com/en-us/rest/api/datafactory/pipelines/get + """ + + model_config = ConfigDict(populate_by_name=True, extra="allow") + + name: str = Field(description="Activity name") + type: str = Field( + description="Activity type (e.g., Copy, DataFlow, ExecutePipeline)" + ) + description: Optional[str] = Field(default=None, description="Activity description") + + # Dependencies + depends_on: list[ActivityDependency] = Field( + default_factory=list, alias="dependsOn", description="Activity dependencies" + ) + + # Type-specific properties vary by activity type (Copy, DataFlow, ExecutePipeline, etc.) 
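+    # e.g. a Copy activity carries "source"/"sink" settings here; an ExecutePipeline, a pipeline reference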
+ # Contains nested structures like {"pipeline": {"referenceName": "...", "type": "..."}} + # Uses Any due to deeply nested and varying structures from Azure API + type_properties: Optional[dict[str, Any]] = Field( + default=None, alias="typeProperties", description="Type-specific properties" + ) + + # Inputs/Outputs (for Copy and other data activities) + inputs: list[DatasetReference] = Field( + default_factory=list, description="Input dataset references" + ) + outputs: list[DatasetReference] = Field( + default_factory=list, description="Output dataset references" + ) + + # Linked service (for some activities) + linked_service_name: Optional[LinkedServiceReference] = Field( + default=None, + alias="linkedServiceName", + description="Linked service for activity", + ) + + # Policy + policy: Optional[ActivityPolicy] = Field( + default=None, description="Activity execution policy" + ) + + # User properties + user_properties: list[UserProperty] = Field( + default_factory=list, + alias="userProperties", + description="User-defined properties", + ) + + +class PipelineProperties(BaseModel): + """Properties of an ADF pipeline.""" + + model_config = ConfigDict(populate_by_name=True, extra="allow") + + description: Optional[str] = Field(default=None, description="Pipeline description") + activities: list[Activity] = Field( + default_factory=list, description="Pipeline activities" + ) + # Parameters have complex structure: {"name": {"type": "String", "defaultValue": ...}} + parameters: dict[str, Any] = Field( + default_factory=dict, description="Pipeline parameters" + ) + # Variables have complex structure similar to parameters + variables: dict[str, Any] = Field( + default_factory=dict, description="Pipeline variables" + ) + concurrency: Optional[int] = Field(default=None, description="Max concurrent runs") + annotations: list[str] = Field( + default_factory=list, description="Pipeline annotations" + ) + folder: Optional[FolderInfo] = Field( + default=None, description="Folder path for organization" + ) + + +class Pipeline(AdfResource): + """Azure Data Factory pipeline. + + API Reference: https://learn.microsoft.com/en-us/rest/api/datafactory/pipelines/get + + Note: The Azure SDK may return pipeline data with properties at the root level + or nested under 'properties'. This model handles both cases. 
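+    See normalize_properties(), which copies root-level fields into
+    'properties' when the payload is flattened.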
+ """ + + # Properties can be nested or at root level depending on Azure SDK version + properties: Optional[PipelineProperties] = Field( + default=None, description="Pipeline properties" + ) + + # Root-level fields (used when properties are flattened) + description: Optional[str] = Field(default=None, description="Pipeline description") + activities: list[Activity] = Field( + default_factory=list, description="Pipeline activities" + ) + # Parameters have complex structure: {"name": {"type": "String", "defaultValue": ...}} + parameters: dict[str, Any] = Field( + default_factory=dict, description="Pipeline parameters" + ) + # Variables have complex structure similar to parameters + variables: dict[str, Any] = Field( + default_factory=dict, description="Pipeline variables" + ) + concurrency: Optional[int] = Field(default=None, description="Max concurrent runs") + annotations: list[str] = Field( + default_factory=list, description="Pipeline annotations" + ) + folder: Optional[FolderInfo] = Field( + default=None, description="Folder path for organization" + ) + + @model_validator(mode="after") + def normalize_properties(self) -> "Pipeline": + """Ensure properties are accessible whether nested or flat.""" + if self.properties is None: + # Properties are at root level, create a PipelineProperties object + self.properties = PipelineProperties( + description=self.description, + activities=self.activities, + parameters=self.parameters, + variables=self.variables, + concurrency=self.concurrency, + annotations=self.annotations, + folder=self.folder, + ) + return self + + +class DatasetProperties(BaseModel): + """Properties of an ADF dataset.""" + + model_config = ConfigDict(populate_by_name=True, extra="allow") + + description: Optional[str] = Field(default=None, description="Dataset description") + linked_service_name: LinkedServiceReference = Field( + alias="linkedServiceName", description="Associated linked service" + ) + # Parameters can have complex structure: {"name": {"type": "String"}} + parameters: dict[str, Any] = Field( + default_factory=dict, description="Dataset parameters" + ) + annotations: list[str] = Field( + default_factory=list, description="Dataset annotations" + ) + folder: Optional[FolderInfo] = Field( + default=None, description="Folder path for organization" + ) + type: str = Field( + description="Dataset type (e.g., AzureBlobDataset, DelimitedTextDataset)" + ) + + # Type-specific properties vary by dataset type (AzureBlobDataset, SqlTable, etc.) + # Contains nested structures for connection details, file paths, etc. + # Uses Any due to deeply nested and varying structures from Azure API + type_properties: Optional[dict[str, Any]] = Field( + default=None, alias="typeProperties", description="Type-specific properties" + ) + + # Schema (optional) - named schema_definition to avoid conflict with Pydantic's schema method + schema_definition: Optional[list[SchemaColumn]] = Field( + default=None, alias="schema", description="Dataset schema definition" + ) + + # Structure (legacy schema format) + structure: Optional[list[SchemaColumn]] = Field( + default=None, description="Dataset structure (legacy)" + ) + + +class Dataset(AdfResource): + """Azure Data Factory dataset. 
+ + API Reference: https://learn.microsoft.com/en-us/rest/api/datafactory/datasets/get + """ + + properties: DatasetProperties = Field(description="Dataset properties") + + +class LinkedServiceProperties(BaseModel): + """Properties of a linked service.""" + + model_config = ConfigDict(populate_by_name=True, extra="allow") + + description: Optional[str] = Field( + default=None, description="Linked service description" + ) + type: str = Field( + description="Linked service type (e.g., AzureBlobStorage, AzureSqlDatabase)" + ) + # Type-specific properties vary by linked service type (SQL, Blob, etc.) + # Uses Any due to deeply nested and varying structures from Azure API + type_properties: Optional[dict[str, Any]] = Field( + default=None, alias="typeProperties", description="Type-specific properties" + ) + annotations: list[str] = Field( + default_factory=list, description="Linked service annotations" + ) + connect_via: Optional[IntegrationRuntimeReference] = Field( + default=None, alias="connectVia", description="Integration runtime reference" + ) + + +class LinkedService(AdfResource): + """Azure Data Factory linked service (connection). + + API Reference: https://learn.microsoft.com/en-us/rest/api/datafactory/linked-services/get + """ + + properties: LinkedServiceProperties = Field(description="Linked service properties") + + +class DataFlowSource(BaseModel): + """Source definition in a data flow.""" + + model_config = ConfigDict(populate_by_name=True, extra="allow") + + name: str = Field(description="Source name") + dataset: Optional[DatasetReference] = Field( + default=None, description="Source dataset" + ) + linked_service: Optional[LinkedServiceReference] = Field( + default=None, alias="linkedService", description="Inline linked service" + ) + schema_linked_service: Optional[LinkedServiceReference] = Field( + default=None, alias="schemaLinkedService", description="Schema linked service" + ) + + +class DataFlowSink(BaseModel): + """Sink definition in a data flow.""" + + model_config = ConfigDict(populate_by_name=True, extra="allow") + + name: str = Field(description="Sink name") + dataset: Optional[DatasetReference] = Field( + default=None, description="Sink dataset" + ) + linked_service: Optional[LinkedServiceReference] = Field( + default=None, alias="linkedService", description="Inline linked service" + ) + schema_linked_service: Optional[LinkedServiceReference] = Field( + default=None, alias="schemaLinkedService", description="Schema linked service" + ) + + +class DataFlowTransformation(TypedDict, total=False): + """Transformation step in a data flow.""" + + name: str + description: str + + +class DataFlowProperties(BaseModel): + """Properties of a mapping data flow.""" + + model_config = ConfigDict(populate_by_name=True, extra="allow") + + description: Optional[str] = Field( + default=None, description="Data flow description" + ) + type: str = Field(default="MappingDataFlow", description="Data flow type") + # Type-specific properties contain sources, sinks, transformations, scripts + # Uses Any due to deeply nested and varying structures from Azure API + type_properties: Optional[dict[str, Any]] = Field( + default=None, alias="typeProperties", description="Type-specific properties" + ) + annotations: list[str] = Field( + default_factory=list, description="Data flow annotations" + ) + folder: Optional[FolderInfo] = Field( + default=None, description="Folder path for organization" + ) + + # Sources and sinks for lineage extraction + sources: list[DataFlowSource] = Field( + 
default_factory=list, description="Data flow sources" + ) + sinks: list[DataFlowSink] = Field( + default_factory=list, description="Data flow sinks" + ) + + # Transformations and script + transformations: list[DataFlowTransformation] = Field( + default_factory=list, description="Data flow transformations" + ) + script_lines: list[str] = Field( + default_factory=list, + alias="scriptLines", + description="Data flow script lines (DSL)", + ) + + def get_script(self) -> Optional[str]: + """Get the complete Data Flow script as a single string.""" + if self.script_lines: + return "\n".join(self.script_lines) + return None + + +class DataFlow(AdfResource): + """Azure Data Factory mapping data flow. + + API Reference: https://learn.microsoft.com/en-us/rest/api/datafactory/data-flows/get + """ + + properties: DataFlowProperties = Field(description="Data flow properties") + + +class TriggerPipelineReference(TypedDict, total=False): + """Reference to a pipeline from a trigger.""" + + pipelineReference: dict[str, str] + parameters: dict[str, str] + + +class TriggerProperties(BaseModel): + """Properties of a trigger.""" + + model_config = ConfigDict(populate_by_name=True, extra="allow") + + description: Optional[str] = Field(default=None, description="Trigger description") + type: str = Field( + description="Trigger type (e.g., ScheduleTrigger, BlobEventsTrigger)" + ) + runtime_state: Optional[str] = Field( + default=None, + alias="runtimeState", + description="Trigger state (Started, Stopped)", + ) + # Type-specific properties vary by trigger type (Schedule, BlobEvents, etc.) + # Uses Any due to deeply nested and varying structures from Azure API + type_properties: Optional[dict[str, Any]] = Field( + default=None, alias="typeProperties", description="Type-specific properties" + ) + annotations: list[str] = Field( + default_factory=list, description="Trigger annotations" + ) + pipelines: list[TriggerPipelineReference] = Field( + default_factory=list, description="Pipelines triggered" + ) + + +class Trigger(AdfResource): + """Azure Data Factory trigger. + + API Reference: https://learn.microsoft.com/en-us/rest/api/datafactory/triggers/get + """ + + properties: TriggerProperties = Field(description="Trigger properties") + + +class PipelineRun(BaseModel): + """Pipeline run execution record. 
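+
+    The status field preserves the raw ADF value; see _map_run_status in
+    adf_source.py for how terminal states map to DataHub run results.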
+ + API Reference: https://learn.microsoft.com/en-us/rest/api/datafactory/pipeline-runs/get + """ + + model_config = ConfigDict(populate_by_name=True, extra="allow") + + run_id: str = Field(alias="runId", description="Unique run identifier") + pipeline_name: str = Field(alias="pipelineName", description="Pipeline name") + status: str = Field(description="Run status (Succeeded, Failed, InProgress, etc.)") + run_start: Optional[datetime] = Field( + default=None, alias="runStart", description="Run start time" + ) + run_end: Optional[datetime] = Field( + default=None, alias="runEnd", description="Run end time" + ) + duration_in_ms: Optional[int] = Field( + default=None, alias="durationInMs", description="Duration in milliseconds" + ) + message: Optional[str] = Field(default=None, description="Run message or error") + parameters: dict[str, str] = Field( + default_factory=dict, description="Run parameters" + ) + invoked_by: Optional[InvokedByInfo] = Field( + default=None, + alias="invokedBy", + description="Trigger or user that invoked the run", + ) + last_updated: Optional[datetime] = Field( + default=None, alias="lastUpdated", description="Last update time" + ) + run_group_id: Optional[str] = Field( + default=None, alias="runGroupId", description="Run group identifier" + ) + is_latest: Optional[bool] = Field( + default=None, alias="isLatest", description="Is this the latest run" + ) + + +class ActivityRun(BaseModel): + """Activity run execution record. + + API Reference: https://learn.microsoft.com/en-us/rest/api/datafactory/activity-runs/query-by-pipeline-run + """ + + model_config = ConfigDict(populate_by_name=True, extra="allow") + + activity_run_id: str = Field( + alias="activityRunId", description="Unique run identifier" + ) + activity_name: str = Field(alias="activityName", description="Activity name") + activity_type: str = Field(alias="activityType", description="Activity type") + pipeline_run_id: str = Field( + alias="pipelineRunId", description="Parent pipeline run ID" + ) + pipeline_name: str = Field(alias="pipelineName", description="Parent pipeline name") + status: str = Field(description="Run status") + activity_run_start: Optional[datetime] = Field( + default=None, alias="activityRunStart", description="Activity start time" + ) + activity_run_end: Optional[datetime] = Field( + default=None, alias="activityRunEnd", description="Activity end time" + ) + duration_in_ms: Optional[int] = Field( + default=None, alias="durationInMs", description="Duration in milliseconds" + ) + # Input/output/error contain runtime data that varies by activity type + # These can contain deeply nested structures from Azure API + input: Optional[dict[str, Any]] = Field(default=None, description="Activity input") + output: Optional[dict[str, Any]] = Field( + default=None, description="Activity output" + ) + error: Optional[dict[str, Any]] = Field( + default=None, description="Error details if failed" + ) + + +class ListResponse(BaseModel): + """Generic list response with pagination. 
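+
+    When next_link is present it points at the next page of results; a client
+    typically keeps following it until it is absent.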
+ + API Reference: https://learn.microsoft.com/en-us/rest/api/datafactory/factories/list + """ + + model_config = ConfigDict(populate_by_name=True, extra="allow") + + # Resources contain nested structures that vary by type + value: list[dict[str, Any]] = Field(description="List of resources") + next_link: Optional[str] = Field( + default=None, alias="nextLink", description="URL for next page of results" + ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/azure_data_factory/adf_report.py b/metadata-ingestion/src/datahub/ingestion/source/azure_data_factory/adf_report.py new file mode 100644 index 00000000000000..bfc9f19842e677 --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/azure_data_factory/adf_report.py @@ -0,0 +1,150 @@ +"""Custom report class for Azure Data Factory connector.""" + +from dataclasses import dataclass, field +from typing import Dict + +from datahub.ingestion.source.state.stale_entity_removal_handler import ( + StaleEntityRemovalSourceReport, +) +from datahub.utilities.lossy_collections import LossyList + + +@dataclass +class AzureDataFactorySourceReport(StaleEntityRemovalSourceReport): + """Ingestion report for Azure Data Factory source. + + Tracks metrics specific to ADF ingestion including counts of + factories, pipelines, activities, and lineage extraction results. + """ + + # Entity counts + factories_scanned: int = 0 + pipelines_scanned: int = 0 + activities_scanned: int = 0 + datasets_scanned: int = 0 + linked_services_scanned: int = 0 + data_flows_scanned: int = 0 + triggers_scanned: int = 0 + + # Filtered entities + filtered_factories: LossyList[str] = field(default_factory=LossyList) + filtered_pipelines: LossyList[str] = field(default_factory=LossyList) + + # Lineage metrics - split by type for better visibility + dataset_lineage_extracted: int = 0 # Dataset-to-dataset lineage (Copy activities) + pipeline_lineage_extracted: int = ( + 0 # Pipeline-to-pipeline lineage (ExecutePipeline) + ) + dataflow_lineage_extracted: int = 0 # Data Flow source/sink lineage + lineage_extraction_failures: int = 0 + datasets_with_lineage: int = 0 + datasets_without_platform_mapping: LossyList[str] = field(default_factory=LossyList) + + # Execution history metrics + pipeline_runs_scanned: int = 0 + activity_runs_scanned: int = 0 + + # API metrics - granular tracking by endpoint type + api_calls_total_count: int = 0 + api_calls_total_error_count: int = 0 + api_call_counts_by_type: Dict[str, int] = field(default_factory=dict) + total_api_response_time_seconds: float = 0.0 + + def report_factory_scanned(self) -> None: + """Increment factories scanned counter.""" + self.factories_scanned += 1 + + def report_factory_filtered(self, factory_name: str) -> None: + """Record a filtered factory.""" + self.filtered_factories.append(factory_name) + + def report_pipeline_scanned(self) -> None: + """Increment pipelines scanned counter.""" + self.pipelines_scanned += 1 + + def report_pipeline_filtered(self, pipeline_name: str) -> None: + """Record a filtered pipeline.""" + self.filtered_pipelines.append(pipeline_name) + + def report_activity_scanned(self) -> None: + """Increment activities scanned counter.""" + self.activities_scanned += 1 + + def report_dataset_scanned(self) -> None: + """Increment datasets scanned counter.""" + self.datasets_scanned += 1 + + def report_linked_service_scanned(self) -> None: + """Increment linked services scanned counter.""" + self.linked_services_scanned += 1 + + def report_data_flow_scanned(self) -> None: + """Increment data flows 
scanned counter.""" + self.data_flows_scanned += 1 + + def report_trigger_scanned(self) -> None: + """Increment triggers scanned counter.""" + self.triggers_scanned += 1 + + def report_lineage_extracted(self, lineage_type: str = "dataset") -> None: + """Increment lineage edges counter by type. + + Args: + lineage_type: One of "dataset", "pipeline", or "dataflow" + """ + if lineage_type == "dataset": + self.dataset_lineage_extracted += 1 + elif lineage_type == "pipeline": + self.pipeline_lineage_extracted += 1 + elif lineage_type == "dataflow": + self.dataflow_lineage_extracted += 1 + self.datasets_with_lineage += 1 + + def report_lineage_failed(self, entity_name: str, error: str) -> None: + """Record a lineage extraction failure.""" + self.lineage_extraction_failures += 1 + self.report_warning( + title="Lineage Extraction Failed", + message="Unable to extract lineage for this entity.", + context=f"entity={entity_name}, error={error}", + ) + + def report_unmapped_platform( + self, dataset_name: str, linked_service_type: str + ) -> None: + """Record a dataset with unmapped platform.""" + self.datasets_without_platform_mapping.append( + f"{dataset_name} (type={linked_service_type})" + ) + + def report_pipeline_run_scanned(self) -> None: + """Increment pipeline runs scanned counter.""" + self.pipeline_runs_scanned += 1 + + def report_activity_run_scanned(self) -> None: + """Increment activity runs scanned counter.""" + self.activity_runs_scanned += 1 + + def report_api_call( + self, api_type: str = "general", duration_seconds: float = 0.0 + ) -> None: + """Track an API call with timing. + + Args: + api_type: Type of API call (e.g., "factories", "pipelines", "datasets") + duration_seconds: Time taken for the API call + """ + self.api_calls_total_count += 1 + self.total_api_response_time_seconds += duration_seconds + if api_type not in self.api_call_counts_by_type: + self.api_call_counts_by_type[api_type] = 0 + self.api_call_counts_by_type[api_type] += 1 + + def report_api_error(self, endpoint: str, error: str) -> None: + """Record an API error.""" + self.api_calls_total_error_count += 1 + self.report_warning( + title="API Error", + message="Failed to call Azure Data Factory API.", + context=f"endpoint={endpoint}, error={error}", + ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/azure_data_factory/adf_source.py b/metadata-ingestion/src/datahub/ingestion/source/azure_data_factory/adf_source.py new file mode 100644 index 00000000000000..13ec2b83a0a837 --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/azure_data_factory/adf_source.py @@ -0,0 +1,1266 @@ +"""Azure Data Factory ingestion source for DataHub. 
+
+This connector extracts metadata from Azure Data Factory including:
+- Data Factories as Containers
+- Pipelines as DataFlows
+- Activities as DataJobs
+- Dataset lineage (activity inputs/outputs)
+- Pipeline execution history (optional)
+
+Usage:
+    source:
+        type: azure-data-factory
+        config:
+            subscription_id: ${AZURE_SUBSCRIPTION_ID}
+            credential:
+                authentication_method: service_principal
+                client_id: ${AZURE_CLIENT_ID}
+                client_secret: ${AZURE_CLIENT_SECRET}
+                tenant_id: ${AZURE_TENANT_ID}
+"""
+
+import logging
+from typing import Iterable, Optional
+
+from datahub.api.entities.dataprocess.dataprocess_instance import (
+    DataProcessInstance,
+    InstanceRunResult,
+)
+from datahub.emitter.mcp import MetadataChangeProposalWrapper
+from datahub.emitter.mcp_builder import ContainerKey
+from datahub.ingestion.api.common import PipelineContext
+from datahub.ingestion.api.decorators import (
+    SourceCapability,
+    SupportStatus,
+    capability,
+    config_class,
+    platform_name,
+    support_status,
+)
+from datahub.ingestion.api.source import MetadataWorkUnitProcessor
+from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.ingestion.source.azure_data_factory.adf_client import (
+    AzureDataFactoryClient,
+)
+from datahub.ingestion.source.azure_data_factory.adf_config import (
+    AzureDataFactoryConfig,
+)
+from datahub.ingestion.source.azure_data_factory.adf_models import (
+    Activity,
+    DataFlow as AdfDataFlow,
+    Dataset as AdfDataset,
+    Factory,
+    LinkedService,
+    Pipeline,
+    PipelineRun,
+    Trigger,
+)
+from datahub.ingestion.source.azure_data_factory.adf_report import (
+    AzureDataFactorySourceReport,
+)
+from datahub.ingestion.source.state.stale_entity_removal_handler import (
+    StaleEntityRemovalHandler,
+)
+from datahub.ingestion.source.state.stateful_ingestion_base import (
+    StatefulIngestionSourceBase,
+)
+from datahub.metadata.schema_classes import (
+    DataJobInputOutputClass,
+    DataProcessTypeClass,
+    DataTransformClass,
+    DataTransformLogicClass,
+    QueryLanguageClass,
+    QueryStatementClass,
+)
+from datahub.metadata.urns import DataFlowUrn, DataJobUrn, DatasetUrn
+from datahub.sdk._shared import DatasetUrnOrStr
+from datahub.sdk.container import Container
+from datahub.sdk.dataflow import DataFlow
+from datahub.sdk.datajob import DataJob
+
+logger = logging.getLogger(__name__)
+
+# Platform identifier for Azure Data Factory
+PLATFORM = "azure-data-factory"
+
+# Constants for pipeline run processing
+MAX_RUN_MESSAGE_LENGTH = 500  # Truncate long error/status messages
+MAX_RUN_PARAMETERS = 10  # Limit number of parameters to store
+MAX_PARAMETER_VALUE_LENGTH = 100  # Truncate long parameter values
+
+# Mapping of ADF linked service types to DataHub platforms.
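+# Types missing from this map are skipped for lineage and reported via
+# report_unmapped_platform.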
+# Platform identifiers must match those defined in: +# metadata-service/configuration/src/main/resources/bootstrap_mcps/data-platforms.yaml +LINKED_SERVICE_PLATFORM_MAP: dict[str, str] = { + # Azure Storage - all Azure storage types map to "abs" (Azure Blob Storage) + "AzureBlobStorage": "abs", + "AzureBlobFS": "abs", # Azure Data Lake Storage Gen2 (uses abfs:// protocol) + "AzureDataLakeStore": "abs", # Azure Data Lake Storage Gen1 + "AzureDataLakeStoreCosmosStructuredStream": "abs", + "AzureFileStorage": "abs", + # Azure Databases - Synapse uses mssql protocol + "AzureSqlDatabase": "mssql", + "AzureSqlDW": "mssql", # Azure Synapse (formerly SQL DW) + "AzureSynapseAnalytics": "mssql", # Azure Synapse Analytics + "AzureSqlMI": "mssql", + "SqlServer": "mssql", + "AzurePostgreSql": "postgres", + "AzureMySql": "mysql", + "CosmosDb": "cosmosdb", + "CosmosDbMongoDbApi": "mongodb", + # Databricks + "AzureDatabricks": "databricks", + "AzureDatabricksDeltaLake": "databricks", + # Cloud Platforms + "AmazonS3": "s3", + "AmazonS3Compatible": "s3", + "GoogleCloudStorage": "gcs", + "AmazonRedshift": "redshift", + "GoogleBigQuery": "bigquery", + "Snowflake": "snowflake", + # Traditional Databases + "PostgreSql": "postgres", + "MySql": "mysql", + "Oracle": "oracle", + "OracleServiceCloud": "oracle", + "Db2": "db2", + "Sybase": "sybase", + "Teradata": "teradata", + "Informix": "informix", + "Netezza": "netezza", + "Vertica": "vertica", + "Greenplum": "greenplum", + # Data Warehouses + "Hive": "hive", + "Spark": "spark", + "Hdfs": "hdfs", + # SaaS Applications + "Salesforce": "salesforce", + "SalesforceServiceCloud": "salesforce", + "SalesforceMarketingCloud": "salesforce", + "ServiceNow": "servicenow", + "Dynamics": "dynamics", + "DynamicsAX": "dynamics", + "DynamicsCrm": "dynamics", + # File Formats (use linked service or default) + "FtpServer": "ftp", + "Sftp": "sftp", + "HttpServer": "http", + "OData": "odata", + "Rest": "rest", +} + +# Mapping of ADF activity types to DataHub subtypes +ACTIVITY_SUBTYPE_MAP: dict[str, str] = { + "Copy": "Copy Activity", + "DataFlow": "Data Flow Activity", + "ExecutePipeline": "Execute Pipeline", + "ExecuteDataFlow": "Data Flow Activity", + "Lookup": "Lookup Activity", + "GetMetadata": "Get Metadata Activity", + "SqlServerStoredProcedure": "Stored Procedure Activity", + "Script": "Script Activity", + "WebActivity": "Web Activity", + "WebHook": "Webhook Activity", + "IfCondition": "If Condition", + "ForEach": "ForEach Loop", + "Until": "Until Loop", + "Wait": "Wait Activity", + "SetVariable": "Set Variable", + "AppendVariable": "Append Variable", + "Switch": "Switch Activity", + "Filter": "Filter Activity", + "Validation": "Validation Activity", + "DatabricksNotebook": "Databricks Notebook", + "DatabricksSparkJar": "Databricks Spark Jar", + "DatabricksSparkPython": "Databricks Spark Python", + "HDInsightHive": "HDInsight Hive", + "HDInsightPig": "HDInsight Pig", + "HDInsightSpark": "HDInsight Spark", + "HDInsightMapReduce": "HDInsight MapReduce", + "HDInsightStreaming": "HDInsight Streaming", + "AzureFunctionActivity": "Azure Function Activity", + "AzureMLBatchExecution": "Azure ML Batch", + "AzureMLUpdateResource": "Azure ML Update", + "AzureMLExecutePipeline": "Azure ML Pipeline", + "Custom": "Custom Activity", + "Delete": "Delete Activity", + "SynapseNotebook": "Synapse Notebook", + "SparkJob": "Spark Job", + "SynapseSparkJob": "Synapse Spark Job", + "SqlPoolStoredProcedure": "SQL Pool Stored Procedure", + "Fail": "Fail Activity", +} + + +class 
AzureDataFactoryContainerKey(ContainerKey): + """Container key for Azure Data Factory resources.""" + + resource_group: str + factory_name: str + + +@platform_name("Azure Data Factory") +@config_class(AzureDataFactoryConfig) +@support_status(SupportStatus.INCUBATING) +@capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default") +@capability( + SourceCapability.LINEAGE_COARSE, + "Extracts lineage from activity inputs/outputs", +) +@capability(SourceCapability.CONTAINERS, "Enabled by default") +class AzureDataFactorySource(StatefulIngestionSourceBase): + """Extracts metadata from Azure Data Factory. + + This source extracts: + - Data Factories as Containers + - Pipelines as DataFlows + - Activities as DataJobs + - Dataset lineage from activity inputs/outputs + - Execution history (optional) + """ + + config: AzureDataFactoryConfig + report: AzureDataFactorySourceReport + platform: str = PLATFORM + + def __init__(self, config: AzureDataFactoryConfig, ctx: PipelineContext) -> None: + super().__init__(config, ctx) + self.config = config + self.report = AzureDataFactorySourceReport() + + # Initialize Azure client + credential = config.credential.get_credential() + self.client = AzureDataFactoryClient( + credential=credential, + subscription_id=config.subscription_id, + ) + + # Cache for datasets, linked services, data flows, pipelines, and triggers. + # Structure: {factory_key: {resource_name: resource_object}} + # - factory_key: "{resource_group}/{factory_name}" - uniquely identifies a factory + # - resource_name: Name of the ADF resource (e.g., "MyDataset", "MyPipeline") + # - resource_object: Parsed ADF resource model + # These caches enable resolution of cross-references (e.g., dataset -> linked service) + self._datasets_cache: dict[str, dict[str, AdfDataset]] = {} + self._linked_services_cache: dict[str, dict[str, LinkedService]] = {} + self._data_flows_cache: dict[str, dict[str, AdfDataFlow]] = {} + self._pipelines_cache: dict[str, dict[str, Pipeline]] = {} + self._triggers_cache: dict[str, list[Trigger]] = {} + + @classmethod + def create( + cls, config_dict: dict, ctx: PipelineContext + ) -> "AzureDataFactorySource": + config = AzureDataFactoryConfig.model_validate(config_dict) + return cls(config, ctx) + + def get_workunit_processors(self) -> list[Optional[MetadataWorkUnitProcessor]]: + return [ + *super().get_workunit_processors(), + StaleEntityRemovalHandler.create( + self, self.config, self.ctx + ).workunit_processor, + ] + + def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: + """Generate workunits for all Azure Data Factory resources.""" + logger.info( + f"Starting Azure Data Factory ingestion for subscription: {self.config.subscription_id}" + ) + if self.config.resource_group: + logger.info(f"Filtering to resource group: {self.config.resource_group}") + + # Iterate over all factories + for factory in self.client.get_factories( + resource_group=self.config.resource_group + ): + self.report.report_api_call() + + # Check if factory matches pattern + if not self.config.factory_pattern.allowed(factory.name): + self.report.report_factory_filtered(factory.name) + continue + + self.report.report_factory_scanned() + logger.info(f"Processing factory: {factory.name}") + + # Extract resource group from factory ID + # Format: /subscriptions/{sub}/resourceGroups/{rg}/providers/... 
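+            # e.g. ".../resourceGroups/my-rg/providers/.../factories/my-adf"
+            # yields "my-rg" (illustrative values)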
+ resource_group = self._extract_resource_group(factory.id) + + # Cache datasets and linked services for this factory + if self.config.include_lineage: + logger.info(f"Fetching lineage resources for factory: {factory.name}") + self._cache_factory_resources(resource_group, factory.name) + + # Emit factory as container and get the Container object for browse paths + container, container_workunits = self._emit_factory(factory, resource_group) + yield from container_workunits + + # Process pipelines, passing the Container for proper browse path hierarchy + logger.info( + f"Extracting pipelines and activities for factory: {factory.name}" + ) + yield from self._process_pipelines(factory, resource_group, container) + + # Process execution history if enabled + if self.config.include_execution_history: + yield from self._process_execution_history(factory, resource_group) + + def _extract_resource_group(self, resource_id: str) -> str: + """Extract resource group name from Azure resource ID.""" + # Format: /subscriptions/{sub}/resourceGroups/{rg}/providers/... + parts = resource_id.split("/") + try: + rg_index = parts.index("resourceGroups") + return parts[rg_index + 1] + except (ValueError, IndexError): + logger.warning(f"Could not extract resource group from: {resource_id}") + return "unknown" + + def _cache_factory_resources(self, resource_group: str, factory_name: str) -> None: + """Cache datasets and linked services for a factory.""" + factory_key = f"{resource_group}/{factory_name}" + + # Cache datasets (needed for lineage resolution) + if self.config.include_lineage: + self._datasets_cache[factory_key] = {} + for dataset in self.client.get_datasets(resource_group, factory_name): + self.report.report_api_call() + self.report.report_dataset_scanned() + self._datasets_cache[factory_key][dataset.name] = dataset + + # Cache linked services (needed for lineage resolution - maps datasets to platforms) + if self.config.include_lineage: + self._linked_services_cache[factory_key] = {} + for ls in self.client.get_linked_services(resource_group, factory_name): + self.report.report_api_call() + self.report.report_linked_service_scanned() + self._linked_services_cache[factory_key][ls.name] = ls + + # Cache triggers (for custom properties on pipelines) + self._triggers_cache[factory_key] = [] + for trigger in self.client.get_triggers(resource_group, factory_name): + self.report.report_api_call() + self.report.report_trigger_scanned() + self._triggers_cache[factory_key].append(trigger) + + # Cache data flows (for lineage extraction from Data Flow activities) + if self.config.include_lineage: + self._data_flows_cache[factory_key] = {} + for data_flow in self.client.get_data_flows(resource_group, factory_name): + self.report.report_api_call() + self.report.report_data_flow_scanned() + self._data_flows_cache[factory_key][data_flow.name] = data_flow + + def _emit_factory( + self, factory: Factory, resource_group: str + ) -> tuple[Container, Iterable[MetadataWorkUnit]]: + """Emit a Data Factory as a Container. + + Returns: + Tuple of (Container object, workunits). The Container object is needed + by child entities (DataFlows) to properly set up browse paths. 
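+
+        Typical call site (see get_workunits_internal):
+
+            container, container_workunits = self._emit_factory(factory, resource_group)
+            yield from container_workunits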
+ """ + container_key = AzureDataFactoryContainerKey( + platform=PLATFORM, + instance=self.config.platform_instance, + resource_group=resource_group, + factory_name=factory.name, + env=self.config.env, + ) + + # Build custom properties + custom_props: dict[str, str] = { + "azure_resource_id": factory.id, + "location": factory.location, + } + if factory.tags: + for key, value in factory.tags.items(): + custom_props[f"tag:{key}"] = value + if factory.properties and factory.properties.provisioning_state: + custom_props["provisioning_state"] = factory.properties.provisioning_state + + container = Container( + container_key, + display_name=factory.name, + description=f"Azure Data Factory: {factory.name}", + subtype="Data Factory", + external_url=self._get_factory_url(factory, resource_group), + extra_properties=custom_props, + parent_container=None, # Top-level container + ) + + return container, container.as_workunits() + + def _get_factory_url(self, factory: Factory, resource_group: str) -> str: + """Generate Azure Portal URL for a Data Factory.""" + return ( + f"https://adf.azure.com/en/home" + f"?factory=/subscriptions/{self.config.subscription_id}" + f"/resourceGroups/{resource_group}" + f"/providers/Microsoft.DataFactory/factories/{factory.name}" + ) + + def _process_pipelines( + self, factory: Factory, resource_group: str, container: Container + ) -> Iterable[MetadataWorkUnit]: + """Process all pipelines in a factory using two-pass approach. + + First pass: Fetch and cache all pipelines for the factory. + Second pass: Process pipelines and emit entities with proper lineage. + + This two-pass approach enables ExecutePipeline activities to reference + child pipelines that may not have been processed yet. + + Args: + factory: The Data Factory + resource_group: Azure resource group name + container: The parent Container object (for browse path hierarchy) + """ + factory_key = f"{resource_group}/{factory.name}" + + # First pass: Cache all pipelines for this factory + self._pipelines_cache[factory_key] = {} + for pipeline in self.client.get_pipelines(resource_group, factory.name): + self.report.report_api_call() + self._pipelines_cache[factory_key][pipeline.name] = pipeline + + # Second pass: Process pipelines and emit entities + for pipeline_name, pipeline in self._pipelines_cache[factory_key].items(): + # Check if pipeline matches pattern + if not self.config.pipeline_pattern.allowed(pipeline_name): + self.report.report_pipeline_filtered(pipeline_name) + continue + + self.report.report_pipeline_scanned() + logger.debug(f"Processing pipeline: {factory.name}/{pipeline_name}") + + # Emit pipeline as DataFlow, passing the Container for proper browse paths + dataflow = self._create_dataflow( + pipeline, factory, resource_group, container + ) + yield from dataflow.as_workunits() + + # Emit activities as DataJobs + if pipeline.properties is None: + logger.warning( + f"Pipeline {pipeline_name} has no properties, skipping activities" + ) + continue + for activity in pipeline.properties.activities: + self.report.report_activity_scanned() + + datajob = self._create_datajob( + activity, pipeline, factory, resource_group, dataflow, factory_key + ) + yield from datajob.as_workunits() + + # Emit dataTransformLogic for Data Flow activities + if activity.type == "ExecuteDataFlow": + yield from self._emit_data_flow_script( + activity, datajob, factory_key + ) + + # Emit pipeline-to-pipeline lineage for ExecutePipeline activities + if activity.type == "ExecutePipeline": + yield from 
self._emit_pipeline_lineage( + activity, datajob, factory, factory_key + ) + + def _create_dataflow( + self, + pipeline: Pipeline, + factory: Factory, + resource_group: str, + container: Container, + ) -> DataFlow: + """Create a DataFlow entity for a pipeline. + + Args: + pipeline: The ADF pipeline + factory: The parent Data Factory + resource_group: Azure resource group name + container: The parent Container object (enables proper browse path hierarchy) + """ + # Build flow name with factory prefix for uniqueness across factories + flow_name = f"{factory.name}.{pipeline.name}" + + # Custom properties + custom_props: dict[str, str] = { + "azure_resource_id": pipeline.id, + "factory_name": factory.name, + } + + # Extract properties if available + description: Optional[str] = None + if pipeline.properties is not None: + if pipeline.properties.concurrency: + custom_props["concurrency"] = str(pipeline.properties.concurrency) + if pipeline.properties.folder: + folder_name = pipeline.properties.folder.get("name", "") + if folder_name: + custom_props["folder"] = folder_name + if pipeline.properties.annotations: + custom_props["annotations"] = ", ".join(pipeline.properties.annotations) + description = pipeline.properties.description + + # Add trigger info if available + triggers = self._get_pipeline_triggers( + resource_group, factory.name, pipeline.name + ) + if triggers: + custom_props["triggers"] = ", ".join(triggers) + + # Pass the Container object directly so the SDK can properly build + # browse paths by inheriting from the parent container's path + dataflow = DataFlow( + platform=PLATFORM, + name=flow_name, + platform_instance=self.config.platform_instance, + env=self.config.env, + display_name=pipeline.name, + description=description, + external_url=self._get_pipeline_url(factory, resource_group, pipeline.name), + custom_properties=custom_props, + subtype="Pipeline", + parent_container=container, + ) + + return dataflow + + def _get_pipeline_triggers( + self, resource_group: str, factory_name: str, pipeline_name: str + ) -> list[str]: + """Get trigger names associated with a pipeline.""" + factory_key = f"{resource_group}/{factory_name}" + triggers = self._triggers_cache.get(factory_key, []) + + result = [] + for trigger in triggers: + # Check if trigger references this pipeline + for pipeline_ref in trigger.properties.pipelines: + ref_name = pipeline_ref.get("pipelineReference", {}).get( + "referenceName", "" + ) + if ref_name == pipeline_name: + result.append(trigger.name) + break + + return result + + def _get_pipeline_url( + self, factory: Factory, resource_group: str, pipeline_name: str + ) -> str: + """Generate Azure Portal URL for a pipeline.""" + return ( + f"https://adf.azure.com/en/authoring/pipeline/{pipeline_name}" + f"?factory=/subscriptions/{self.config.subscription_id}" + f"/resourceGroups/{resource_group}" + f"/providers/Microsoft.DataFactory/factories/{factory.name}" + ) + + def _create_datajob( + self, + activity: Activity, + pipeline: Pipeline, + factory: Factory, + resource_group: str, + dataflow: DataFlow, + factory_key: str, + ) -> DataJob: + """Create a DataJob entity for an activity.""" + # Determine activity subtype + subtype = ACTIVITY_SUBTYPE_MAP.get(activity.type, activity.type) + + # Custom properties + custom_props: dict[str, str] = { + "activity_type": activity.type, + } + if activity.description: + custom_props["activity_description"] = activity.description + + # Add policy info + if activity.policy: + if "timeout" in activity.policy: + 
custom_props["timeout"] = str(activity.policy["timeout"]) + if "retry" in activity.policy: + custom_props["retry"] = str(activity.policy["retry"]) + + # Extract lineage (inlets/outlets) + inlets: Optional[list[DatasetUrnOrStr]] = None + outlets: Optional[list[DatasetUrnOrStr]] = None + + if self.config.include_lineage: + extracted_inlets = self._extract_activity_inputs(activity, factory_key) + extracted_outlets = self._extract_activity_outputs(activity, factory_key) + if extracted_inlets: + inlets = extracted_inlets + if extracted_outlets: + outlets = extracted_outlets + + # Create DataJob with external URL to the parent pipeline + # (ADF doesn't have direct activity URLs, so we link to the pipeline) + datajob = DataJob( + name=activity.name, + flow=dataflow, + display_name=activity.name, + description=activity.description, + external_url=self._get_pipeline_url(factory, resource_group, pipeline.name), + custom_properties=custom_props, + subtype=subtype, + inlets=inlets, + outlets=outlets, + ) + + return datajob + + def _extract_activity_inputs( + self, activity: Activity, factory_key: str + ) -> list[DatasetUrnOrStr]: + """Extract input dataset URNs from an activity.""" + inputs: list[DatasetUrnOrStr] = [] + + # Process explicit inputs (for Copy activities and others) + for input_ref in activity.inputs: + dataset_urn = self._resolve_dataset_urn( + input_ref.reference_name, factory_key + ) + if dataset_urn: + inputs.append(str(dataset_urn)) + self.report.report_lineage_extracted("dataset") + + # Process Data Flow activities - extract sources as inputs + if activity.type == "ExecuteDataFlow": + data_flow_inputs = self._extract_data_flow_sources(activity, factory_key) + inputs.extend(data_flow_inputs) + + # Process source in typeProperties (for Copy activities) + if activity.type_properties and "source" in activity.type_properties: + source = activity.type_properties["source"] + if "datasetSettings" in source: + # Inline dataset configuration + pass # Complex case, skip for now + # Source might reference a dataset in storeSettings + store_settings = source.get("storeSettings", {}) + if "linkedServiceName" in store_settings: + # Could resolve to a dataset if we have schema info + pass + + return inputs + + def _extract_activity_outputs( + self, activity: Activity, factory_key: str + ) -> list[DatasetUrnOrStr]: + """Extract output dataset URNs from an activity.""" + outputs: list[DatasetUrnOrStr] = [] + + # Process explicit outputs (for Copy activities and others) + for output_ref in activity.outputs: + dataset_urn = self._resolve_dataset_urn( + output_ref.reference_name, factory_key + ) + if dataset_urn: + outputs.append(str(dataset_urn)) + self.report.report_lineage_extracted("dataset") + + # Process Data Flow activities - extract sinks as outputs + if activity.type == "ExecuteDataFlow": + data_flow_outputs = self._extract_data_flow_sinks(activity, factory_key) + outputs.extend(data_flow_outputs) + + # Process sink in typeProperties (for Copy activities) + if activity.type_properties and "sink" in activity.type_properties: + sink = activity.type_properties["sink"] + if "datasetSettings" in sink: + # Inline dataset configuration + pass # Complex case, skip for now + + return outputs + + def _get_data_flow_name_from_activity( + self, activity: Activity, factory_key: str + ) -> Optional[str]: + """Get the Data Flow name referenced by an ExecuteDataFlow activity. 
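+
+        Both spellings of the reference key are handled, e.g. (illustrative
+        payloads, both resolving to "MyFlow"):
+
+            {"typeProperties": {"dataFlow": {"referenceName": "MyFlow"}}}
+            {"typeProperties": {"dataflow": {"referenceName": "MyFlow"}}}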
+ + Due to a case-sensitivity bug in the Azure SDK where it expects + 'typeProperties.dataFlow' but the API returns 'typeProperties.dataflow', + we try multiple approaches to find the Data Flow name. + + Args: + activity: The ExecuteDataFlow activity + factory_key: Factory key for cache lookup + + Returns: + Data Flow name if found, None otherwise + """ + # Approach 1: Try typeProperties.dataFlow (SDK expected format) + if activity.type_properties: + data_flow_ref = activity.type_properties.get( + "dataFlow", activity.type_properties.get("dataflow", {}) + ) + if isinstance(data_flow_ref, dict): + name = data_flow_ref.get("referenceName") + if name: + return name + + # Approach 2: Try to match activity name to Data Flow name + # Many users name their activity similarly to the Data Flow + data_flows = self._data_flows_cache.get(factory_key, {}) + + # Exact match + if activity.name in data_flows: + logger.debug( + f"Found Data Flow by exact activity name match: {activity.name}" + ) + return activity.name + + # Fuzzy match - try removing common suffixes/variations + activity_name_normalized = activity.name.replace(" ", "").lower() + for df_name in data_flows: + df_name_normalized = df_name.replace(" ", "").lower() + if activity_name_normalized == df_name_normalized: + logger.debug( + f"Found Data Flow by fuzzy match: activity='{activity.name}' -> dataflow='{df_name}'" + ) + return df_name + + return None + + def _emit_data_flow_script( + self, activity: Activity, datajob: DataJob, factory_key: str + ) -> Iterable[MetadataWorkUnit]: + """Emit the Data Flow script as a dataTransformLogic aspect. + + For ExecuteDataFlow activities, this extracts the Data Flow DSL script + and emits it as a transformation aspect, making it viewable in the UI. + + Args: + activity: The ExecuteDataFlow activity + datajob: The DataJob entity for this activity + factory_key: Factory key for cache lookup + + Yields: + MetadataWorkUnit for the dataTransformLogic aspect + """ + # Get the Data Flow name + data_flow_name = self._get_data_flow_name_from_activity(activity, factory_key) + if not data_flow_name: + return + + # Look up the Data Flow definition + data_flows = self._data_flows_cache.get(factory_key, {}) + data_flow = data_flows.get(data_flow_name) + if not data_flow or not data_flow.properties: + return + + # Get the script from the Data Flow + script = data_flow.properties.get_script() + if not script: + logger.debug(f"No script found for Data Flow: {data_flow_name}") + return + + # Emit the dataTransformLogic aspect + # Note: Using SQL as language because UNKNOWN is not yet broadly supported + # in the UI. The Data Flow DSL is similar to SQL in structure. + logger.debug( + f"Emitting Data Flow script for activity '{activity.name}' " + f"({len(script)} chars)" + ) + yield MetadataChangeProposalWrapper( + entityUrn=str(datajob.urn), + aspect=DataTransformLogicClass( + transforms=[ + DataTransformClass( + queryStatement=QueryStatementClass( + value=script, + language=QueryLanguageClass.SQL, + ) + ) + ] + ), + ).as_workunit() + + def _extract_data_flow_endpoints( + self, activity: Activity, factory_key: str, endpoint_type: str + ) -> list[str]: + """Extract source or sink dataset URNs from a Data Flow activity. + + Data Flow activities reference a Data Flow definition which contains + sources (inputs) and sinks (outputs). This method extracts either based + on the endpoint_type parameter. 
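+
+        The thin wrappers below call this with the two supported values:
+
+            self._extract_data_flow_endpoints(activity, factory_key, "sources")
+            self._extract_data_flow_endpoints(activity, factory_key, "sinks")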
+ + Args: + activity: The ExecuteDataFlow activity + factory_key: Factory key for cache lookup + endpoint_type: "sources" or "sinks" + + Returns: + List of dataset URNs for the specified endpoint type + """ + urns: list[str] = [] + + # Get the Data Flow name using our robust lookup + data_flow_name = self._get_data_flow_name_from_activity(activity, factory_key) + + if not data_flow_name: + logger.debug( + f"Could not find Data Flow reference for activity: {activity.name}" + ) + return urns + + # Look up the Data Flow definition + data_flows = self._data_flows_cache.get(factory_key, {}) + data_flow = data_flows.get(data_flow_name) + + if not data_flow: + logger.debug(f"Data Flow not found in cache: {data_flow_name}") + return urns + + # Extract endpoints from the Data Flow + if data_flow.properties: + endpoints = getattr(data_flow.properties, endpoint_type, []) + endpoint_label = endpoint_type[:-1] # "sources" -> "source" + for endpoint in endpoints: + if endpoint.dataset: + dataset_urn = self._resolve_dataset_urn( + endpoint.dataset.reference_name, factory_key + ) + if dataset_urn: + urns.append(str(dataset_urn)) + self.report.report_lineage_extracted("dataflow") + logger.debug( + f"Extracted Data Flow {endpoint_label}: {endpoint.name} -> {dataset_urn}" + ) + + return urns + + def _extract_data_flow_sources( + self, activity: Activity, factory_key: str + ) -> list[str]: + """Extract source dataset URNs from a Data Flow activity.""" + return self._extract_data_flow_endpoints(activity, factory_key, "sources") + + def _extract_data_flow_sinks( + self, activity: Activity, factory_key: str + ) -> list[str]: + """Extract sink dataset URNs from a Data Flow activity.""" + return self._extract_data_flow_endpoints(activity, factory_key, "sinks") + + def _emit_pipeline_lineage( + self, + activity: Activity, + datajob: DataJob, + factory: Factory, + factory_key: str, + ) -> Iterable[MetadataWorkUnit]: + """Emit pipeline-to-pipeline lineage for ExecutePipeline activities. + + When a pipeline calls another pipeline via ExecutePipeline activity, + we create a DataJob-to-DataJob dependency from the calling activity + to the first activity in the child pipeline. This creates visible + lineage edges in the DataHub UI. 
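+
+        For example, if PipelineA's "RunChild" activity calls PipelineB whose
+        first activity is "CopyData" (illustrative names), the emitted edge is:
+
+            PipelineA.RunChild -> PipelineB.CopyData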
+ + Args: + activity: The ExecutePipeline activity + datajob: The DataJob entity for this activity + factory: The parent Data Factory + factory_key: Factory key for URN construction + + Yields: + MetadataWorkUnit for the pipeline dependency + """ + if not activity.type_properties: + return + + # Extract the child pipeline reference from typeProperties + pipeline_ref = activity.type_properties.get("pipeline", {}) + child_pipeline_name = pipeline_ref.get("referenceName") + + if not child_pipeline_name: + logger.debug( + f"ExecutePipeline activity {activity.name} has no pipeline reference" + ) + return + + # Build the child pipeline's DataFlow URN + child_flow_id = f"{factory.name}.{child_pipeline_name}" + child_flow_urn = DataFlowUrn.create_from_ids( + orchestrator=PLATFORM, + flow_id=child_flow_id, + env=self.config.env, + ) + + # Look up child pipeline from cache to get its first activity + pipelines = self._pipelines_cache.get(factory_key, {}) + child_pipeline = pipelines.get(child_pipeline_name) + + child_datajob_urn: Optional[DataJobUrn] = None + first_activity_name: Optional[str] = None + + if child_pipeline and child_pipeline.properties: + activities = child_pipeline.properties.activities + if activities: + first_activity_name = activities[0].name + child_datajob_urn = DataJobUrn.create_from_ids( + data_flow_urn=str(child_flow_urn), + job_id=first_activity_name, + ) + logger.debug( + f"ExecutePipeline {activity.name} -> {child_pipeline_name}." + f"{first_activity_name} (URN: {child_datajob_urn})" + ) + else: + logger.debug( + f"Child pipeline {child_pipeline_name} not found in cache or has no activities" + ) + + # Update custom properties to include the child pipeline reference + current_props = datajob.custom_properties + current_props["calls_pipeline"] = child_pipeline_name + current_props["child_pipeline_urn"] = str(child_flow_urn) + if first_activity_name: + current_props["child_first_activity"] = first_activity_name + datajob.set_custom_properties(current_props) + + self.report.report_lineage_extracted("pipeline") + + # Emit DataJobInputOutput on the CHILD's first activity, setting ExecutePipeline as upstream + # This creates lineage: ExecutePipeline -> ChildFirstActivity + # (The parent activity triggers the child, so parent is upstream of child) + if child_datajob_urn: + yield MetadataChangeProposalWrapper( + entityUrn=str(child_datajob_urn), # Child's first activity + aspect=DataJobInputOutputClass( + inputDatasets=[], + outputDatasets=[], + inputDatajobs=[ + str(datajob.urn) + ], # ExecutePipeline as input/upstream + ), + ).as_workunit() + + def _resolve_dataset_urn( + self, dataset_name: str, factory_key: str + ) -> Optional[DatasetUrn]: + """Resolve an ADF dataset reference to a DataHub DatasetUrn.""" + # Get dataset from cache + datasets = self._datasets_cache.get(factory_key, {}) + dataset = datasets.get(dataset_name) + + if not dataset: + logger.debug(f"Dataset not found in cache: {dataset_name}") + return None + + # Get linked service to determine platform + linked_service_ref = dataset.properties.linked_service_name + linked_services = self._linked_services_cache.get(factory_key, {}) + linked_service = linked_services.get(linked_service_ref.reference_name) + + if not linked_service: + logger.debug( + f"Linked service not found: {linked_service_ref.reference_name}" + ) + self.report.report_unmapped_platform(dataset_name, "unknown") + return None + + # Map linked service type to DataHub platform + ls_type = linked_service.properties.type + platform = 
LINKED_SERVICE_PLATFORM_MAP.get(ls_type)
+
+        if not platform:
+            logger.debug(f"Unknown linked service type: {ls_type}")
+            self.report.report_unmapped_platform(dataset_name, ls_type)
+            return None
+
+        # Build dataset name from type properties
+        table_name = self._extract_table_name(dataset, linked_service)
+        if not table_name:
+            table_name = dataset_name  # Fallback to ADF dataset name
+
+        # Check if there's a platform instance mapping
+        platform_instance = self.config.platform_instance_map.get(
+            linked_service_ref.reference_name
+        )
+
+        return DatasetUrn.create_from_ids(
+            platform_id=platform,
+            table_name=table_name,
+            env=self.config.env,
+            platform_instance=platform_instance,
+        )
+
+    def _extract_table_name(
+        self, dataset: AdfDataset, linked_service: LinkedService
+    ) -> Optional[str]:
+        """Extract table/file name from dataset type properties."""
+        if not dataset.properties.type_properties:
+            return None
+
+        type_props = dataset.properties.type_properties
+
+        # SQL-like datasets - check the schema/table pair first so qualified
+        # names are not short-circuited by the bare "table" key below
+        if "schema" in type_props and "table" in type_props:
+            schema = type_props.get("schema", "")
+            table = type_props.get("table", "")
+            if schema and table:
+                return f"{schema}.{table}"
+        if "tableName" in type_props:
+            return type_props["tableName"]
+        if "table" in type_props:
+            return type_props["table"]
+
+        # File-based datasets
+        if "fileName" in type_props:
+            folder = type_props.get("folderPath", "")
+            filename = type_props.get("fileName", "")
+            if folder and filename:
+                return f"{folder}/{filename}"
+            return filename
+
+        # Container/path based
+        if "location" in type_props:
+            location = type_props["location"]
+            if isinstance(location, dict):
+                container = location.get("container", "")
+                folder = location.get("folderPath", "")
+                filename = location.get("fileName", "")
+                parts = [p for p in [container, folder, filename] if p]
+                if parts:
+                    return "/".join(parts)
+
+        return None
+
+    def _process_execution_history(
+        self, factory: Factory, resource_group: str
+    ) -> Iterable[MetadataWorkUnit]:
+        """Process pipeline execution history for a factory."""
+        logger.info(
+            f"Fetching execution history for factory: {factory.name} "
+            f"(last {self.config.execution_history_days} days)"
+        )
+
+        for pipeline_run in self.client.get_pipeline_runs(
+            resource_group,
+            factory.name,
+            days=self.config.execution_history_days,
+        ):
+            self.report.report_api_call()
+            self.report.report_pipeline_run_scanned()
+
+            # Check if pipeline matches pattern
+            if not self.config.pipeline_pattern.allowed(pipeline_run.pipeline_name):
+                continue
+
+            yield from self._emit_pipeline_run(pipeline_run, factory, resource_group)
+
+    def _emit_pipeline_run(
+        self,
+        pipeline_run: PipelineRun,
+        factory: Factory,
+        resource_group: str,
+    ) -> Iterable[MetadataWorkUnit]:
+        """Emit a pipeline run as DataProcessInstance."""
+        # Build DataFlow URN for the template - include factory name for uniqueness
+        flow_name = f"{factory.name}.{pipeline_run.pipeline_name}"
+        flow_urn = DataFlowUrn.create_from_ids(
+            orchestrator=PLATFORM,
+            flow_id=flow_name,
+            env=self.config.env,
+            platform_instance=self.config.platform_instance,
+        )
+
+        # Map ADF status to InstanceRunResult
+        result = self._map_run_status(pipeline_run.status)
+
+        # Build custom properties
+        properties: dict[str, str] = {
+            "run_id": pipeline_run.run_id,
+            "status": pipeline_run.status,
+        }
+        if pipeline_run.message:
+            properties["message"] = pipeline_run.message[:MAX_RUN_MESSAGE_LENGTH]
+        if pipeline_run.invoked_by:
+            invoker_name = 
pipeline_run.invoked_by.get("name", "") + invoker_type = pipeline_run.invoked_by.get("invokedByType", "") + if invoker_name: + properties["invoked_by"] = invoker_name + if invoker_type: + properties["invoked_by_type"] = invoker_type + if pipeline_run.parameters: + for key, value in list(pipeline_run.parameters.items())[ + :MAX_RUN_PARAMETERS + ]: + properties[f"param:{key}"] = str(value)[:MAX_PARAMETER_VALUE_LENGTH] + + # Create DataProcessInstance + dpi = DataProcessInstance( + id=pipeline_run.run_id, + orchestrator=PLATFORM, + cluster=self.config.env, + type=DataProcessTypeClass.BATCH_SCHEDULED, + template_urn=flow_urn, + properties=properties, + url=self._get_pipeline_run_url( + factory, resource_group, pipeline_run.run_id + ), + data_platform_instance=self.config.platform_instance, + subtype="Pipeline Run", + ) + + # Emit the instance + for mcp in dpi.generate_mcp( + created_ts_millis=( + int(pipeline_run.run_start.timestamp() * 1000) + if pipeline_run.run_start + else None + ), + materialize_iolets=False, + ): + yield mcp.as_workunit() + + # Emit start event + if pipeline_run.run_start: + start_ts = int(pipeline_run.run_start.timestamp() * 1000) + for mcp in dpi.start_event_mcp(start_ts): + yield mcp.as_workunit() + + # Emit end event if run is complete + if pipeline_run.run_end and result: + end_ts = int(pipeline_run.run_end.timestamp() * 1000) + for mcp in dpi.end_event_mcp( + end_timestamp_millis=end_ts, + result=result, + result_type=pipeline_run.status, + ): + yield mcp.as_workunit() + + # Emit activity runs for this pipeline run + yield from self._emit_activity_runs(pipeline_run, factory, resource_group) + + def _map_run_status(self, status: str) -> Optional[InstanceRunResult]: + """Map ADF run status to DataHub InstanceRunResult.""" + status_map = { + "Succeeded": InstanceRunResult.SUCCESS, + "Failed": InstanceRunResult.FAILURE, + "Cancelled": InstanceRunResult.SKIPPED, + "Cancelling": None, # Still running + "InProgress": None, # Still running + "Queued": None, # Not started + } + return status_map.get(status) + + def _get_pipeline_run_url( + self, factory: Factory, resource_group: str, run_id: str + ) -> str: + """Generate Azure Portal URL for a pipeline run.""" + return ( + f"https://adf.azure.com/en/monitoring/pipelineruns/{run_id}" + f"?factory=/subscriptions/{self.config.subscription_id}" + f"/resourceGroups/{resource_group}" + f"/providers/Microsoft.DataFactory/factories/{factory.name}" + ) + + def _emit_activity_runs( + self, + pipeline_run: PipelineRun, + factory: Factory, + resource_group: str, + ) -> Iterable[MetadataWorkUnit]: + """Emit activity runs as DataProcessInstance for each DataJob.""" + try: + for activity_run in self.client.get_activity_runs( + resource_group, + factory.name, + pipeline_run.run_id, + ): + self.report.report_api_call() + self.report.report_activity_run_scanned() + + # Build DataJob URN for the template + flow_name = f"{factory.name}.{activity_run.pipeline_name}" + flow_urn = DataFlowUrn.create_from_ids( + orchestrator=PLATFORM, + flow_id=flow_name, + env=self.config.env, + platform_instance=self.config.platform_instance, + ) + job_urn = DataJobUrn.create_from_ids( + data_flow_urn=str(flow_urn), + job_id=activity_run.activity_name, + ) + + # Map ADF status to InstanceRunResult + result = self._map_run_status(activity_run.status) + + # Build custom properties + properties: dict[str, str] = { + "activity_run_id": activity_run.activity_run_id, + "activity_type": activity_run.activity_type, + "pipeline_run_id": activity_run.pipeline_run_id, + 
"status": activity_run.status, + } + if activity_run.duration_in_ms is not None: + properties["duration_ms"] = str(activity_run.duration_in_ms) + if activity_run.error: + error_msg = str(activity_run.error.get("message", "")) + if error_msg: + properties["error"] = error_msg[:MAX_RUN_MESSAGE_LENGTH] + + # Create DataProcessInstance linked to DataJob + dpi = DataProcessInstance( + id=activity_run.activity_run_id, + orchestrator=PLATFORM, + cluster=self.config.env, + type=DataProcessTypeClass.BATCH_SCHEDULED, + template_urn=job_urn, + properties=properties, + url=self._get_pipeline_run_url( + factory, resource_group, pipeline_run.run_id + ), + data_platform_instance=self.config.platform_instance, + subtype="Activity Run", + ) + + # Emit the instance + for mcp in dpi.generate_mcp( + created_ts_millis=( + int(activity_run.activity_run_start.timestamp() * 1000) + if activity_run.activity_run_start + else None + ), + materialize_iolets=False, + ): + yield mcp.as_workunit() + + # Emit start event + if activity_run.activity_run_start: + start_ts = int(activity_run.activity_run_start.timestamp() * 1000) + for mcp in dpi.start_event_mcp(start_ts): + yield mcp.as_workunit() + + # Emit end event if run is complete + if activity_run.activity_run_end and result: + end_ts = int(activity_run.activity_run_end.timestamp() * 1000) + for mcp in dpi.end_event_mcp( + end_timestamp_millis=end_ts, + result=result, + result_type=activity_run.status, + ): + yield mcp.as_workunit() + + except Exception as e: + logger.warning( + f"Failed to fetch activity runs for pipeline run {pipeline_run.run_id}: {e}" + ) + + def get_report(self) -> AzureDataFactorySourceReport: + return self.report + + def close(self) -> None: + """Clean up resources.""" + self.client.close() + super().close() diff --git a/metadata-ingestion/tests/integration/azure_data_factory/__init__.py b/metadata-ingestion/tests/integration/azure_data_factory/__init__.py new file mode 100644 index 00000000000000..261403a51885d4 --- /dev/null +++ b/metadata-ingestion/tests/integration/azure_data_factory/__init__.py @@ -0,0 +1 @@ +"""Integration tests for Azure Data Factory connector.""" diff --git a/metadata-ingestion/tests/integration/azure_data_factory/adf_basic_golden.json b/metadata-ingestion/tests/integration/azure_data_factory/adf_basic_golden.json new file mode 100644 index 00000000000000..9ef535630cce09 --- /dev/null +++ b/metadata-ingestion/tests/integration/azure_data_factory/adf_basic_golden.json @@ -0,0 +1,775 @@ +[ +{ + "entityType": "container", + "entityUrn": "urn:li:container:cdaebfa861d2b2a3853328719496bce5", + "changeType": "UPSERT", + "aspectName": "containerProperties", + "aspect": { + "json": { + "customProperties": { + "platform": "azure-data-factory", + "env": "PROD", + "resource_group": "test-resource-group", + "factory_name": "test-data-factory", + "azure_resource_id": "/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/test-resource-group/providers/Microsoft.DataFactory/factories/test-data-factory", + "location": "eastus", + "tag:environment": "test", + "tag:team": "data-engineering", + "provisioning_state": "Succeeded" + }, + "externalUrl": "https://adf.azure.com/en/home?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/test-resource-group/providers/Microsoft.DataFactory/factories/test-data-factory", + "name": "test-data-factory", + "description": "Azure Data Factory: test-data-factory", + "env": "PROD" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": 
"adf-test-basic", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:cdaebfa861d2b2a3853328719496bce5", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-basic", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,test-data-factory.DataProcessingPipeline,PROD),CallStoredProc)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "activity_type": "SqlServerStoredProcedure", + "activity_description": "Call stored procedure", + "timeout": "7.00:00:00", + "retry": "0" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/DataProcessingPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/test-resource-group/providers/Microsoft.DataFactory/factories/test-data-factory", + "name": "CallStoredProc", + "description": "Call stored procedure", + "type": { + "string": "COMMAND" + }, + "flowUrn": "urn:li:dataFlow:(azure-data-factory,test-data-factory.DataProcessingPipeline,PROD)", + "env": "PROD" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-basic", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:cdaebfa861d2b2a3853328719496bce5", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-basic", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,test-data-factory.DataProcessingPipeline,PROD),CallStoredProc)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Stored Procedure Activity" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-basic", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:cdaebfa861d2b2a3853328719496bce5", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Data Factory" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-basic", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,test-data-factory.DataProcessingPipeline,PROD),CallStoredProc)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-basic", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,test-data-factory.DataProcessingPipeline,PROD),CallStoredProc)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-basic", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": 
"urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-basic", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD)", + "changeType": "UPSERT", + "aspectName": "dataFlowInfo", + "aspect": { + "json": { + "customProperties": { + "azure_resource_id": "/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/test-resource-group/providers/Microsoft.DataFactory/factories/test-data-factory/pipelines/DataIngestionPipeline", + "factory_name": "test-data-factory", + "triggers": "DailyScheduleTrigger" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/DataIngestionPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/test-resource-group/providers/Microsoft.DataFactory/factories/test-data-factory", + "name": "DataIngestionPipeline", + "description": "Main data ingestion pipeline", + "env": "PROD" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-basic", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,test-data-factory.DataProcessingPipeline,PROD),CallStoredProc)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:cdaebfa861d2b2a3853328719496bce5", + "urn": "urn:li:container:cdaebfa861d2b2a3853328719496bce5" + }, + { + "id": "test-data-factory.DataProcessingPipeline", + "urn": "urn:li:dataFlow:(azure-data-factory,test-data-factory.DataProcessingPipeline,PROD)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-basic", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,test-data-factory.DataProcessingPipeline,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Pipeline" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-basic", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:cdaebfa861d2b2a3853328719496bce5", + "urn": "urn:li:container:cdaebfa861d2b2a3853328719496bce5" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-basic", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Pipeline" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-basic", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": 
"urn:li:container:cdaebfa861d2b2a3853328719496bce5" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-basic", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,test-data-factory.DataProcessingPipeline,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:cdaebfa861d2b2a3853328719496bce5" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-basic", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,test-data-factory.DataProcessingPipeline,PROD)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-basic", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,test-data-factory.DataProcessingPipeline,PROD)", + "changeType": "UPSERT", + "aspectName": "dataFlowInfo", + "aspect": { + "json": { + "customProperties": { + "azure_resource_id": "/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/test-resource-group/providers/Microsoft.DataFactory/factories/test-data-factory/pipelines/DataProcessingPipeline", + "factory_name": "test-data-factory" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/DataProcessingPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/test-resource-group/providers/Microsoft.DataFactory/factories/test-data-factory", + "name": "DataProcessingPipeline", + "description": "Data processing and transformation", + "env": "PROD" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-basic", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD),CopyBlobToSQL)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-basic", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,test-data-factory.DataProcessingPipeline,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-basic", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD),CopyBlobToSQL)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "activity_type": "Copy", + "activity_description": "Copy data from Blob to SQL", + "timeout": "7.00:00:00", + "retry": "0" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/DataIngestionPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/test-resource-group/providers/Microsoft.DataFactory/factories/test-data-factory", + "name": "CopyBlobToSQL", + "description": "Copy data from Blob to SQL", + "type": { + "string": "COMMAND" 
+ }, + "flowUrn": "urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD)", + "env": "PROD" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-basic", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD),CopyBlobToSQL)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:abs,raw-data/input/data.csv,PROD)" + ], + "outputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:mssql,ProcessedData,PROD)" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-basic", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD),CopyBlobToSQL)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:cdaebfa861d2b2a3853328719496bce5", + "urn": "urn:li:container:cdaebfa861d2b2a3853328719496bce5" + }, + { + "id": "test-data-factory.DataIngestionPipeline", + "urn": "urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-basic", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD),CopyBlobToSQL)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Copy Activity" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-basic", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,test-data-factory.DataProcessingPipeline,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:cdaebfa861d2b2a3853328719496bce5", + "urn": "urn:li:container:cdaebfa861d2b2a3853328719496bce5" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-basic", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD),LookupConfig)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "activity_type": "Lookup", + "activity_description": "Lookup configuration values", + "timeout": "7.00:00:00", + "retry": "0" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/DataIngestionPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/test-resource-group/providers/Microsoft.DataFactory/factories/test-data-factory", + "name": "LookupConfig", + "description": "Lookup configuration values", + "type": { + "string": "COMMAND" + }, + "flowUrn": "urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD)", + "env": "PROD" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-basic", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": 
"urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD),LookupConfig)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Lookup Activity" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-basic", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD),LookupConfig)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-basic", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD),TransformData)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-basic", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD),LookupConfig)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:abs,config/settings.json,PROD)" + ], + "outputDatasets": [] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-basic", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD),TransformData)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "activity_type": "ExecuteDataFlow", + "activity_description": "Execute mapping data flow", + "timeout": "7.00:00:00", + "retry": "0" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/DataIngestionPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/test-resource-group/providers/Microsoft.DataFactory/factories/test-data-factory", + "name": "TransformData", + "description": "Execute mapping data flow", + "type": { + "string": "COMMAND" + }, + "flowUrn": "urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD)", + "env": "PROD" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-basic", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD),TransformData)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:cdaebfa861d2b2a3853328719496bce5", + "urn": "urn:li:container:cdaebfa861d2b2a3853328719496bce5" + }, + { + "id": "test-data-factory.DataIngestionPipeline", + "urn": "urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-basic", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": 
"urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD),TransformData)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Data Flow Activity" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-basic", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:cdaebfa861d2b2a3853328719496bce5", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-basic", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD),LookupConfig)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-basic", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD),LookupConfig)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:cdaebfa861d2b2a3853328719496bce5", + "urn": "urn:li:container:cdaebfa861d2b2a3853328719496bce5" + }, + { + "id": "test-data-factory.DataIngestionPipeline", + "urn": "urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-basic", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD),CopyBlobToSQL)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-basic", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD),TransformData)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-basic", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-basic", + "lastRunId": "no-run-id-provided" + } +} +] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/azure_data_factory/adf_branching_golden.json b/metadata-ingestion/tests/integration/azure_data_factory/adf_branching_golden.json new file mode 100644 index 00000000000000..ab0139120c5ca9 --- /dev/null +++ b/metadata-ingestion/tests/integration/azure_data_factory/adf_branching_golden.json @@ -0,0 +1,508 @@ +[ +{ + "entityType": "container", + "entityUrn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "changeType": "UPSERT", + "aspectName": "containerProperties", + "aspect": { + "json": { + "customProperties": { + 
"platform": "azure-data-factory", + "env": "DEV", + "resource_group": "complex-test-rg", + "factory_name": "complex-data-factory", + "azure_resource_id": "/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "location": "eastus", + "tag:environment": "test", + "tag:purpose": "complex-integration-tests", + "provisioning_state": "Succeeded" + }, + "externalUrl": "https://adf.azure.com/en/home?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "name": "complex-data-factory", + "description": "Azure Data Factory: complex-data-factory", + "env": "DEV" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-branching-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-branching-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.BranchingPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Pipeline" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-branching-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-branching-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.BranchingPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-branching-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Data Factory" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-branching-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.BranchingPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-branching-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.BranchingPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "dataFlowInfo", + "aspect": { + "json": { + "customProperties": { + "azure_resource_id": 
"/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory/pipelines/BranchingPipeline", + "factory_name": "complex-data-factory" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/BranchingPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "name": "BranchingPipeline", + "description": "Pipeline with If-Condition and Switch branching", + "env": "DEV" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-branching-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.BranchingPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-branching-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.BranchingPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "urn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-branching-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.BranchingPipeline,DEV),ProcessByRegion)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "activity_type": "Switch" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/BranchingPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "name": "ProcessByRegion", + "type": { + "string": "COMMAND" + }, + "flowUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.BranchingPipeline,DEV)", + "env": "DEV" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-branching-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.BranchingPipeline,DEV),ProcessByRegion)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Switch Activity" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-branching-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.BranchingPipeline,DEV),ProcessByRegion)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-branching-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.BranchingPipeline,DEV),CheckDataExists)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + 
"json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-branching-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.BranchingPipeline,DEV),ProcessByRegion)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-branching-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.BranchingPipeline,DEV),CheckDataExists)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "activity_type": "Lookup" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/BranchingPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "name": "CheckDataExists", + "type": { + "string": "COMMAND" + }, + "flowUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.BranchingPipeline,DEV)", + "env": "DEV" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-branching-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.BranchingPipeline,DEV),CheckDataExists)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "urn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" + }, + { + "id": "complex-data-factory.BranchingPipeline", + "urn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.BranchingPipeline,DEV)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-branching-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.BranchingPipeline,DEV),DataExistsCheck)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-branching-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.BranchingPipeline,DEV),CheckDataExists)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Lookup Activity" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-branching-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.BranchingPipeline,DEV),DataExistsCheck)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "activity_type": "IfCondition" + }, + "externalUrl": 
"https://adf.azure.com/en/authoring/pipeline/BranchingPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "name": "DataExistsCheck", + "type": { + "string": "COMMAND" + }, + "flowUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.BranchingPipeline,DEV)", + "env": "DEV" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-branching-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.BranchingPipeline,DEV),DataExistsCheck)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "urn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" + }, + { + "id": "complex-data-factory.BranchingPipeline", + "urn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.BranchingPipeline,DEV)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-branching-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.BranchingPipeline,DEV),DataExistsCheck)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "If Condition" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-branching-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-branching-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.BranchingPipeline,DEV),ProcessByRegion)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "urn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" + }, + { + "id": "complex-data-factory.BranchingPipeline", + "urn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.BranchingPipeline,DEV)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-branching-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.BranchingPipeline,DEV),CheckDataExists)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-branching-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.BranchingPipeline,DEV),DataExistsCheck)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-branching-test", + "lastRunId": "no-run-id-provided" + } +} +] \ No newline at end of file diff --git 
a/metadata-ingestion/tests/integration/azure_data_factory/adf_dataflow_golden.json b/metadata-ingestion/tests/integration/azure_data_factory/adf_dataflow_golden.json new file mode 100644 index 00000000000000..80468bab9c4f0b --- /dev/null +++ b/metadata-ingestion/tests/integration/azure_data_factory/adf_dataflow_golden.json @@ -0,0 +1,308 @@ +[ +{ + "entityType": "container", + "entityUrn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "changeType": "UPSERT", + "aspectName": "containerProperties", + "aspect": { + "json": { + "customProperties": { + "platform": "azure-data-factory", + "env": "DEV", + "resource_group": "complex-test-rg", + "factory_name": "complex-data-factory", + "azure_resource_id": "/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "location": "eastus", + "tag:environment": "test", + "tag:purpose": "complex-integration-tests", + "provisioning_state": "Succeeded" + }, + "externalUrl": "https://adf.azure.com/en/home?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "name": "complex-data-factory", + "description": "Azure Data Factory: complex-data-factory", + "env": "DEV" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-dataflow-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-dataflow-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.DataFlowPipeline,DEV),RunSalesTransformation)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "activity_type": "ExecuteDataFlow" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/DataFlowPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "name": "RunSalesTransformation", + "type": { + "string": "COMMAND" + }, + "flowUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.DataFlowPipeline,DEV)", + "env": "DEV" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-dataflow-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-dataflow-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.DataFlowPipeline,DEV),RunSalesTransformation)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Data Flow Activity" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-dataflow-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": 
"urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Data Factory" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-dataflow-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.DataFlowPipeline,DEV),RunSalesTransformation)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-dataflow-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.DataFlowPipeline,DEV),RunSalesTransformation)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-dataflow-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.DataFlowPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-dataflow-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.DataFlowPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "dataFlowInfo", + "aspect": { + "json": { + "customProperties": { + "azure_resource_id": "/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory/pipelines/DataFlowPipeline", + "factory_name": "complex-data-factory" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/DataFlowPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "name": "DataFlowPipeline", + "description": "Pipeline that executes a mapping data flow", + "env": "DEV" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-dataflow-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.DataFlowPipeline,DEV),RunSalesTransformation)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "urn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" + }, + { + "id": "complex-data-factory.DataFlowPipeline", + "urn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.DataFlowPipeline,DEV)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-dataflow-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.DataFlowPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Pipeline" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": 
"adf-dataflow-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.DataFlowPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "urn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-dataflow-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.DataFlowPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-dataflow-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-dataflow-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.DataFlowPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-dataflow-test", + "lastRunId": "no-run-id-provided" + } +} +] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/azure_data_factory/adf_diverse_golden.json b/metadata-ingestion/tests/integration/azure_data_factory/adf_diverse_golden.json new file mode 100644 index 00000000000000..b9d147739ddcea --- /dev/null +++ b/metadata-ingestion/tests/integration/azure_data_factory/adf_diverse_golden.json @@ -0,0 +1,1108 @@ +[ +{ + "entityType": "container", + "entityUrn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "changeType": "UPSERT", + "aspectName": "containerProperties", + "aspect": { + "json": { + "customProperties": { + "platform": "azure-data-factory", + "env": "DEV", + "resource_group": "complex-test-rg", + "factory_name": "complex-data-factory", + "azure_resource_id": "/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "location": "eastus", + "tag:environment": "test", + "tag:purpose": "complex-integration-tests", + "provisioning_state": "Succeeded" + }, + "externalUrl": "https://adf.azure.com/en/home?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "name": "complex-data-factory", + "description": "Azure Data Factory: complex-data-factory", + "env": "DEV" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-diverse-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-diverse-test", + "lastRunId": 
"no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV),CheckOutputExists)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "activity_type": "GetMetadata" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/DiverseActivitiesPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "name": "CheckOutputExists", + "type": { + "string": "COMMAND" + }, + "flowUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV)", + "env": "DEV" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-diverse-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-diverse-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-diverse-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "dataFlowInfo", + "aspect": { + "json": { + "customProperties": { + "azure_resource_id": "/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory/pipelines/DiverseActivitiesPipeline", + "factory_name": "complex-data-factory" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/DiverseActivitiesPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "name": "DiverseActivitiesPipeline", + "description": "Pipeline demonstrating various activity types", + "env": "DEV" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-diverse-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV),CheckOutputExists)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Get Metadata Activity" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-diverse-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Data Factory" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-diverse-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": 
"urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "urn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-diverse-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Pipeline" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-diverse-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-diverse-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV),CheckOutputExists)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-diverse-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV),CheckOutputExists)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-diverse-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV),CheckOutputExists)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "urn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" + }, + { + "id": "complex-data-factory.DiverseActivitiesPipeline", + "urn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-diverse-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV),RunAnalyticsScript)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "activity_type": "Script" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/DiverseActivitiesPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "name": "RunAnalyticsScript", + "type": { + "string": "COMMAND" + }, + "flowUrn": 
"urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV)", + "env": "DEV" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-diverse-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV),RunAnalyticsScript)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Script Activity" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-diverse-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV),RunAnalyticsScript)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-diverse-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV),RunAnalyticsScript)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-diverse-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV),InitializeCounter)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-diverse-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV),RunAnalyticsScript)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "urn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" + }, + { + "id": "complex-data-factory.DiverseActivitiesPipeline", + "urn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-diverse-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV),InitializeCounter)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "activity_type": "SetVariable" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/DiverseActivitiesPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "name": "InitializeCounter", + "type": { + "string": "COMMAND" + }, + "flowUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV)", + "env": "DEV" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-diverse-test", + 
"lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV),InitializeCounter)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "urn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" + }, + { + "id": "complex-data-factory.DiverseActivitiesPipeline", + "urn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-diverse-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV),ProcessDataWithSP)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-diverse-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV),FetchConfiguration)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-diverse-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV),ProcessDataWithSP)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "activity_type": "SqlServerStoredProcedure" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/DiverseActivitiesPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "name": "ProcessDataWithSP", + "type": { + "string": "COMMAND" + }, + "flowUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV)", + "env": "DEV" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-diverse-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV),ProcessDataWithSP)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "urn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" + }, + { + "id": "complex-data-factory.DiverseActivitiesPipeline", + "urn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-diverse-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV),ProcessDataWithSP)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Stored 
Procedure Activity" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-diverse-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV),InitializeCounter)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Set Variable" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-diverse-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV),FetchConfiguration)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "activity_type": "WebActivity" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/DiverseActivitiesPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "name": "FetchConfiguration", + "type": { + "string": "COMMAND" + }, + "flowUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV)", + "env": "DEV" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-diverse-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV),FetchConfiguration)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "urn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" + }, + { + "id": "complex-data-factory.DiverseActivitiesPipeline", + "urn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-diverse-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV),FetchConfiguration)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Web Activity" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-diverse-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV),WaitForReplication)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "activity_type": "Wait" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/DiverseActivitiesPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "name": "WaitForReplication", + "type": { + "string": "COMMAND" + }, + "flowUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV)", + "env": "DEV" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-diverse-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": 
"urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV),WaitForReplication)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Wait Activity" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-diverse-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV),WaitForReplication)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-diverse-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV),WaitForReplication)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-diverse-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV),WaitForReplication)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "urn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" + }, + { + "id": "complex-data-factory.DiverseActivitiesPipeline", + "urn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-diverse-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV),InitializeCounter)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-diverse-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV),SendCompletionNotification)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-diverse-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV),FetchConfiguration)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-diverse-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV),SendCompletionNotification)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "activity_type": 
"AzureFunctionActivity" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/DiverseActivitiesPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "name": "SendCompletionNotification", + "type": { + "string": "COMMAND" + }, + "flowUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV)", + "env": "DEV" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-diverse-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV),SendCompletionNotification)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "urn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" + }, + { + "id": "complex-data-factory.DiverseActivitiesPipeline", + "urn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-diverse-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV),SendCompletionNotification)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Azure Function Activity" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-diverse-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV),RunMLTrainingNotebook)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "activity_type": "DatabricksNotebook" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/DiverseActivitiesPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "name": "RunMLTrainingNotebook", + "type": { + "string": "COMMAND" + }, + "flowUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV)", + "env": "DEV" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-diverse-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV),RunMLTrainingNotebook)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Databricks Notebook" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-diverse-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV),RunMLTrainingNotebook)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-diverse-test", + "lastRunId": 
"no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV),RunMLTrainingNotebook)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-diverse-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV),RunMLTrainingNotebook)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "urn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" + }, + { + "id": "complex-data-factory.DiverseActivitiesPipeline", + "urn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-diverse-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-diverse-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV),FailOnCriticalError)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "activity_type": "Fail" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/DiverseActivitiesPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "name": "FailOnCriticalError", + "type": { + "string": "COMMAND" + }, + "flowUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV)", + "env": "DEV" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-diverse-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV),FailOnCriticalError)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Fail Activity" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-diverse-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV),FailOnCriticalError)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-diverse-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 
1705320000000, + "runId": "adf-diverse-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV),FailOnCriticalError)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-diverse-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV),FailOnCriticalError)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "urn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" + }, + { + "id": "complex-data-factory.DiverseActivitiesPipeline", + "urn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-diverse-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV),ProcessDataWithSP)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-diverse-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV),SendCompletionNotification)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-diverse-test", + "lastRunId": "no-run-id-provided" + } +} +] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/azure_data_factory/adf_foreach_golden.json b/metadata-ingestion/tests/integration/azure_data_factory/adf_foreach_golden.json new file mode 100644 index 00000000000000..e62e3649a69bf7 --- /dev/null +++ b/metadata-ingestion/tests/integration/azure_data_factory/adf_foreach_golden.json @@ -0,0 +1,408 @@ +[ +{ + "entityType": "container", + "entityUrn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "changeType": "UPSERT", + "aspectName": "containerProperties", + "aspect": { + "json": { + "customProperties": { + "platform": "azure-data-factory", + "env": "DEV", + "resource_group": "complex-test-rg", + "factory_name": "complex-data-factory", + "azure_resource_id": "/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "location": "eastus", + "tag:environment": "test", + "tag:purpose": "complex-integration-tests", + "provisioning_state": "Succeeded" + }, + "externalUrl": "https://adf.azure.com/en/home?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "name": "complex-data-factory", + "description": "Azure Data Factory: complex-data-factory", + "env": "DEV" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-foreach-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": 
"container", + "entityUrn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-foreach-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ForEachTablePipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Pipeline" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-foreach-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-foreach-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ForEachTablePipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-foreach-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Data Factory" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-foreach-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ForEachTablePipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-foreach-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ForEachTablePipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "dataFlowInfo", + "aspect": { + "json": { + "customProperties": { + "azure_resource_id": "/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory/pipelines/ForEachTablePipeline", + "factory_name": "complex-data-factory" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/ForEachTablePipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "name": "ForEachTablePipeline", + "description": "Pipeline with ForEach loop to copy multiple tables", + "env": "DEV" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-foreach-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ForEachTablePipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 
1705320000000, + "runId": "adf-foreach-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ForEachTablePipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "urn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-foreach-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ForEachTablePipeline,DEV),IterateOverTables)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "activity_type": "ForEach" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/ForEachTablePipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "name": "IterateOverTables", + "type": { + "string": "COMMAND" + }, + "flowUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ForEachTablePipeline,DEV)", + "env": "DEV" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-foreach-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ForEachTablePipeline,DEV),IterateOverTables)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "ForEach Loop" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-foreach-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ForEachTablePipeline,DEV),IterateOverTables)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-foreach-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ForEachTablePipeline,DEV),GetTableList)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-foreach-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ForEachTablePipeline,DEV),IterateOverTables)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-foreach-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ForEachTablePipeline,DEV),GetTableList)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "activity_type": "Lookup" + }, + "externalUrl": 
"https://adf.azure.com/en/authoring/pipeline/ForEachTablePipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "name": "GetTableList", + "type": { + "string": "COMMAND" + }, + "flowUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ForEachTablePipeline,DEV)", + "env": "DEV" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-foreach-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ForEachTablePipeline,DEV),GetTableList)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "urn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" + }, + { + "id": "complex-data-factory.ForEachTablePipeline", + "urn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ForEachTablePipeline,DEV)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-foreach-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ForEachTablePipeline,DEV),GetTableList)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Lookup Activity" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-foreach-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-foreach-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ForEachTablePipeline,DEV),IterateOverTables)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "urn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" + }, + { + "id": "complex-data-factory.ForEachTablePipeline", + "urn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ForEachTablePipeline,DEV)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-foreach-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ForEachTablePipeline,DEV),GetTableList)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-foreach-test", + "lastRunId": "no-run-id-provided" + } +} +] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/azure_data_factory/adf_mixed_deps_golden.json b/metadata-ingestion/tests/integration/azure_data_factory/adf_mixed_deps_golden.json new file mode 100644 index 00000000000000..a06aee44d61042 --- /dev/null +++ b/metadata-ingestion/tests/integration/azure_data_factory/adf_mixed_deps_golden.json @@ -0,0 +1,1037 @@ +[ +{ + "entityType": "container", + "entityUrn": 
"urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "changeType": "UPSERT", + "aspectName": "containerProperties", + "aspect": { + "json": { + "customProperties": { + "platform": "azure-data-factory", + "env": "DEV", + "resource_group": "complex-test-rg", + "factory_name": "complex-data-factory", + "azure_resource_id": "/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "location": "eastus", + "tag:environment": "test", + "tag:purpose": "complex-integration-tests", + "provisioning_state": "Succeeded" + }, + "externalUrl": "https://adf.azure.com/en/home?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "name": "complex-data-factory", + "description": "Azure Data Factory: complex-data-factory", + "env": "DEV" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-mixed-deps-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-mixed-deps-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Data Factory" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-mixed-deps-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-mixed-deps-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.MixedOrchestrationPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-mixed-deps-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.MixedOrchestrationPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "dataFlowInfo", + "aspect": { + "json": { + "customProperties": { + "azure_resource_id": "/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory/pipelines/MixedOrchestrationPipeline", + "factory_name": "complex-data-factory" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/MixedOrchestrationPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "name": "MixedOrchestrationPipeline", + "description": "Pipeline demonstrating both pipeline and dataset dependencies", + "env": "DEV" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": 
"adf-mixed-deps-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.MixedOrchestrationPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Pipeline" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-mixed-deps-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.MixedOrchestrationPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-mixed-deps-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.MixedOrchestrationPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "urn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-mixed-deps-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.MixedOrchestrationPipeline,DEV),ExecuteExtract)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-mixed-deps-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.MixedOrchestrationPipeline,DEV),ExecuteExtract)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "activity_type": "ExecutePipeline", + "calls_pipeline": "ExtractDataPipeline", + "child_pipeline_urn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ExtractDataPipeline,DEV)", + "child_first_activity": "ExtractFromSource" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/MixedOrchestrationPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "name": "ExecuteExtract", + "type": { + "string": "COMMAND" + }, + "flowUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.MixedOrchestrationPipeline,DEV)", + "env": "DEV" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-mixed-deps-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.MixedOrchestrationPipeline,DEV),ExecuteExtract)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Execute Pipeline" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-mixed-deps-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.MixedOrchestrationPipeline,DEV),ExecuteExtract)", + 
"changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "urn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" + }, + { + "id": "complex-data-factory.MixedOrchestrationPipeline", + "urn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.MixedOrchestrationPipeline,DEV)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-mixed-deps-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ExtractDataPipeline,DEV),ExtractFromSource)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [], + "outputDatasets": [], + "inputDatajobs": [ + "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.MixedOrchestrationPipeline,DEV),ExecuteExtract)" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-mixed-deps-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.MixedOrchestrationPipeline,DEV),TransformInMain)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-mixed-deps-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.MixedOrchestrationPipeline,DEV),TransformInMain)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "activity_type": "Copy" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/MixedOrchestrationPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "name": "TransformInMain", + "type": { + "string": "COMMAND" + }, + "flowUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.MixedOrchestrationPipeline,DEV)", + "env": "DEV" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-mixed-deps-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.MixedOrchestrationPipeline,DEV),TransformInMain)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Copy Activity" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-mixed-deps-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.MixedOrchestrationPipeline,DEV),TransformInMain)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:abs,staging/customers,DEV)" + ], + "outputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:mssql,DimCustomers,DEV)" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-mixed-deps-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": 
"urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.MixedOrchestrationPipeline,DEV),TransformInMain)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "urn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" + }, + { + "id": "complex-data-factory.MixedOrchestrationPipeline", + "urn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.MixedOrchestrationPipeline,DEV)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-mixed-deps-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.MixedOrchestrationPipeline,DEV),ExecuteLoad)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-mixed-deps-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.MixedOrchestrationPipeline,DEV),ExecuteLoad)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "activity_type": "ExecutePipeline", + "calls_pipeline": "LoadDataPipeline", + "child_pipeline_urn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.LoadDataPipeline,DEV)", + "child_first_activity": "LoadToDestination" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/MixedOrchestrationPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "name": "ExecuteLoad", + "type": { + "string": "COMMAND" + }, + "flowUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.MixedOrchestrationPipeline,DEV)", + "env": "DEV" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-mixed-deps-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.MixedOrchestrationPipeline,DEV),ExecuteLoad)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Execute Pipeline" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-mixed-deps-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.MixedOrchestrationPipeline,DEV),ExecuteLoad)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "urn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" + }, + { + "id": "complex-data-factory.MixedOrchestrationPipeline", + "urn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.MixedOrchestrationPipeline,DEV)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-mixed-deps-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.LoadDataPipeline,DEV),LoadToDestination)", + "changeType": "UPSERT", + "aspectName": 
"dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [], + "outputDatasets": [], + "inputDatajobs": [ + "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.MixedOrchestrationPipeline,DEV),ExecuteLoad)" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-mixed-deps-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ExtractDataPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-mixed-deps-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ExtractDataPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "dataFlowInfo", + "aspect": { + "json": { + "customProperties": { + "azure_resource_id": "/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory/pipelines/ExtractDataPipeline", + "factory_name": "complex-data-factory" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/ExtractDataPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "name": "ExtractDataPipeline", + "description": "Child pipeline for extracting data from source", + "env": "DEV" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-mixed-deps-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ExtractDataPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Pipeline" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-mixed-deps-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ExtractDataPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-mixed-deps-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ExtractDataPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "urn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-mixed-deps-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ExtractDataPipeline,DEV),ExtractFromSource)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-mixed-deps-test", + "lastRunId": "no-run-id-provided" + } +}, +{ 
+ "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ExtractDataPipeline,DEV),ExtractFromSource)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "activity_type": "Copy" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/ExtractDataPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "name": "ExtractFromSource", + "type": { + "string": "COMMAND" + }, + "flowUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ExtractDataPipeline,DEV)", + "env": "DEV" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-mixed-deps-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ExtractDataPipeline,DEV),ExtractFromSource)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Copy Activity" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-mixed-deps-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ExtractDataPipeline,DEV),ExtractFromSource)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:mssql,Customers,DEV)" + ], + "outputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:abs,staging/customers,DEV)" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-mixed-deps-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ExtractDataPipeline,DEV),ExtractFromSource)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "urn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" + }, + { + "id": "complex-data-factory.ExtractDataPipeline", + "urn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ExtractDataPipeline,DEV)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-mixed-deps-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.LoadDataPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-mixed-deps-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.LoadDataPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "dataFlowInfo", + "aspect": { + "json": { + "customProperties": { + "azure_resource_id": "/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory/pipelines/LoadDataPipeline", + "factory_name": "complex-data-factory" + }, + "externalUrl": 
"https://adf.azure.com/en/authoring/pipeline/LoadDataPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "name": "LoadDataPipeline", + "description": "Child pipeline for loading data to destination", + "env": "DEV" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-mixed-deps-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.LoadDataPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Pipeline" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-mixed-deps-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.LoadDataPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-mixed-deps-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.LoadDataPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "urn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-mixed-deps-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.LoadDataPipeline,DEV),LoadToDestination)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-mixed-deps-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.LoadDataPipeline,DEV),LoadToDestination)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "activity_type": "Copy" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/LoadDataPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "name": "LoadToDestination", + "type": { + "string": "COMMAND" + }, + "flowUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.LoadDataPipeline,DEV)", + "env": "DEV" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-mixed-deps-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.LoadDataPipeline,DEV),LoadToDestination)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Copy Activity" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-mixed-deps-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": 
"urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.LoadDataPipeline,DEV),LoadToDestination)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:mssql,DimCustomers,DEV)" + ], + "outputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:abs,sales_summary,DEV)" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-mixed-deps-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.LoadDataPipeline,DEV),LoadToDestination)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "urn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" + }, + { + "id": "complex-data-factory.LoadDataPipeline", + "urn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.LoadDataPipeline,DEV)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-mixed-deps-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-mixed-deps-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ExtractDataPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-mixed-deps-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.LoadDataPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-mixed-deps-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.MixedOrchestrationPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-mixed-deps-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ExtractDataPipeline,DEV),ExtractFromSource)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-mixed-deps-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.LoadDataPipeline,DEV),LoadToDestination)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-mixed-deps-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": 
"urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.MixedOrchestrationPipeline,DEV),ExecuteExtract)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-mixed-deps-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.MixedOrchestrationPipeline,DEV),ExecuteLoad)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-mixed-deps-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.MixedOrchestrationPipeline,DEV),TransformInMain)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-mixed-deps-test", + "lastRunId": "no-run-id-provided" + } +} +] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/azure_data_factory/adf_multisource_golden.json b/metadata-ingestion/tests/integration/azure_data_factory/adf_multisource_golden.json new file mode 100644 index 00000000000000..4061b36570ef9f --- /dev/null +++ b/metadata-ingestion/tests/integration/azure_data_factory/adf_multisource_golden.json @@ -0,0 +1,813 @@ +[ +{ + "entityType": "container", + "entityUrn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "changeType": "UPSERT", + "aspectName": "containerProperties", + "aspect": { + "json": { + "customProperties": { + "platform": "azure-data-factory", + "env": "DEV", + "resource_group": "complex-test-rg", + "factory_name": "complex-data-factory", + "azure_resource_id": "/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "location": "eastus", + "tag:environment": "test", + "tag:purpose": "complex-integration-tests", + "provisioning_state": "Succeeded" + }, + "externalUrl": "https://adf.azure.com/en/home?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "name": "complex-data-factory", + "description": "Azure Data Factory: complex-data-factory", + "env": "DEV" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-multisource-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-multisource-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ETLPipeline,DEV),ArchiveToDataLake)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "activity_type": "Copy" + }, + "externalUrl": 
"https://adf.azure.com/en/authoring/pipeline/ETLPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "name": "ArchiveToDataLake", + "type": { + "string": "COMMAND" + }, + "flowUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ETLPipeline,DEV)", + "env": "DEV" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-multisource-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-multisource-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ETLPipeline,DEV),ArchiveToDataLake)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Copy Activity" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-multisource-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Data Factory" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-multisource-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ETLPipeline,DEV),ArchiveToDataLake)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-multisource-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ETLPipeline,DEV),ArchiveToDataLake)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:abs,staging/customers,DEV)" + ], + "outputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:abs,sales,DEV)" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-multisource-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ETLPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-multisource-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ETLPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "dataFlowInfo", + "aspect": { + "json": { + "customProperties": { + "azure_resource_id": "/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory/pipelines/ETLPipeline", + "factory_name": "complex-data-factory" + }, + 
"externalUrl": "https://adf.azure.com/en/authoring/pipeline/ETLPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "name": "ETLPipeline", + "description": "Full ETL pipeline: Extract from SQL, stage in Blob, load to Synapse and archive", + "env": "DEV" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-multisource-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ETLPipeline,DEV),ArchiveToDataLake)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-multisource-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ETLPipeline,DEV),ArchiveToDataLake)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "urn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" + }, + { + "id": "complex-data-factory.ETLPipeline", + "urn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ETLPipeline,DEV)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-multisource-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ETLPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "urn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-multisource-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ETLPipeline,DEV),ExtractCustomersFromSQL)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-multisource-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ETLPipeline,DEV),ExtractOrdersFromSQL)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-multisource-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ETLPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Pipeline" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-multisource-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ETLPipeline,DEV),ExtractOrdersFromSQL)", + "changeType": "UPSERT", + 
"aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "activity_type": "Copy" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/ETLPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "name": "ExtractOrdersFromSQL", + "type": { + "string": "COMMAND" + }, + "flowUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ETLPipeline,DEV)", + "env": "DEV" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-multisource-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ETLPipeline,DEV),ExtractOrdersFromSQL)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:mssql,Orders,DEV)" + ], + "outputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:abs,staging/orders,DEV)" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-multisource-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ETLPipeline,DEV),ExtractOrdersFromSQL)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "urn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" + }, + { + "id": "complex-data-factory.ETLPipeline", + "urn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ETLPipeline,DEV)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-multisource-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ETLPipeline,DEV),ExtractOrdersFromSQL)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Copy Activity" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-multisource-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ETLPipeline,DEV),ExtractCustomersFromSQL)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "activity_type": "Copy" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/ETLPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "name": "ExtractCustomersFromSQL", + "type": { + "string": "COMMAND" + }, + "flowUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ETLPipeline,DEV)", + "env": "DEV" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-multisource-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ETLPipeline,DEV),ExtractCustomersFromSQL)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:mssql,Customers,DEV)" + ], + "outputDatasets": [ + 
"urn:li:dataset:(urn:li:dataPlatform:abs,staging/customers,DEV)" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-multisource-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ETLPipeline,DEV),ExtractCustomersFromSQL)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "urn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" + }, + { + "id": "complex-data-factory.ETLPipeline", + "urn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ETLPipeline,DEV)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-multisource-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ETLPipeline,DEV),ExtractCustomersFromSQL)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Copy Activity" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-multisource-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ETLPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-multisource-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ETLPipeline,DEV),LoadOrdersToSynapse)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "activity_type": "Copy" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/ETLPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "name": "LoadOrdersToSynapse", + "type": { + "string": "COMMAND" + }, + "flowUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ETLPipeline,DEV)", + "env": "DEV" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-multisource-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ETLPipeline,DEV),LoadOrdersToSynapse)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Copy Activity" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-multisource-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ETLPipeline,DEV),LoadOrdersToSynapse)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-multisource-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": 
"urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ETLPipeline,DEV),LoadOrdersToSynapse)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:abs,staging/orders,DEV)" + ], + "outputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:mssql,FactOrders,DEV)" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-multisource-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ETLPipeline,DEV),LoadOrdersToSynapse)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-multisource-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ETLPipeline,DEV),LoadOrdersToSynapse)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "urn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" + }, + { + "id": "complex-data-factory.ETLPipeline", + "urn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ETLPipeline,DEV)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-multisource-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ETLPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-multisource-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ETLPipeline,DEV),LoadCustomersToSynapse)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "activity_type": "Copy" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/ETLPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "name": "LoadCustomersToSynapse", + "type": { + "string": "COMMAND" + }, + "flowUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ETLPipeline,DEV)", + "env": "DEV" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-multisource-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ETLPipeline,DEV),LoadCustomersToSynapse)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-multisource-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ETLPipeline,DEV),LoadCustomersToSynapse)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Copy Activity" + ] + } + }, + 
"systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-multisource-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ETLPipeline,DEV),LoadCustomersToSynapse)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:abs,staging/customers,DEV)" + ], + "outputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:mssql,DimCustomers,DEV)" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-multisource-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ETLPipeline,DEV),LoadCustomersToSynapse)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-multisource-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ETLPipeline,DEV),LoadCustomersToSynapse)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "urn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" + }, + { + "id": "complex-data-factory.ETLPipeline", + "urn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ETLPipeline,DEV)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-multisource-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ETLPipeline,DEV),ExtractCustomersFromSQL)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-multisource-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ETLPipeline,DEV),ExtractOrdersFromSQL)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-multisource-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-multisource-test", + "lastRunId": "no-run-id-provided" + } +} +] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/azure_data_factory/adf_nested_golden.json b/metadata-ingestion/tests/integration/azure_data_factory/adf_nested_golden.json new file mode 100644 index 00000000000000..62ba1ab0e9c1d2 --- /dev/null +++ b/metadata-ingestion/tests/integration/azure_data_factory/adf_nested_golden.json @@ -0,0 +1,895 @@ +[ +{ + "entityType": "container", + "entityUrn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "changeType": "UPSERT", + "aspectName": "containerProperties", + "aspect": { + "json": { + "customProperties": { + "platform": 
"azure-data-factory", + "env": "DEV", + "resource_group": "complex-test-rg", + "factory_name": "complex-data-factory", + "azure_resource_id": "/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "location": "eastus", + "tag:environment": "test", + "tag:purpose": "complex-integration-tests", + "provisioning_state": "Succeeded" + }, + "externalUrl": "https://adf.azure.com/en/home?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "name": "complex-data-factory", + "description": "Azure Data Factory: complex-data-factory", + "env": "DEV" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-nested-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-nested-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Data Factory" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-nested-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-nested-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ParentOrchestrationPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-nested-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ParentOrchestrationPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "dataFlowInfo", + "aspect": { + "json": { + "customProperties": { + "azure_resource_id": "/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory/pipelines/ParentOrchestrationPipeline", + "factory_name": "complex-data-factory" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/ParentOrchestrationPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "name": "ParentOrchestrationPipeline", + "description": "Parent orchestration pipeline that calls child pipelines", + "env": "DEV" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-nested-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ParentOrchestrationPipeline,DEV)", + 
"changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Pipeline" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-nested-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ParentOrchestrationPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-nested-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ParentOrchestrationPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "urn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-nested-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ParentOrchestrationPipeline,DEV),ExecuteDataMovement)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-nested-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ParentOrchestrationPipeline,DEV),ExecuteDataMovement)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "activity_type": "ExecutePipeline", + "calls_pipeline": "ChildDataMovementPipeline", + "child_pipeline_urn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ChildDataMovementPipeline,DEV)", + "child_first_activity": "CopyCustomersToStaging" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/ParentOrchestrationPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "name": "ExecuteDataMovement", + "type": { + "string": "COMMAND" + }, + "flowUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ParentOrchestrationPipeline,DEV)", + "env": "DEV" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-nested-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ParentOrchestrationPipeline,DEV),ExecuteDataMovement)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Execute Pipeline" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-nested-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ParentOrchestrationPipeline,DEV),ExecuteDataMovement)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "urn": 
"urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" + }, + { + "id": "complex-data-factory.ParentOrchestrationPipeline", + "urn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ParentOrchestrationPipeline,DEV)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-nested-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ChildDataMovementPipeline,DEV),CopyCustomersToStaging)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [], + "outputDatasets": [], + "inputDatajobs": [ + "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ParentOrchestrationPipeline,DEV),ExecuteDataMovement)" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-nested-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ParentOrchestrationPipeline,DEV),ExecuteTransform)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-nested-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ParentOrchestrationPipeline,DEV),ExecuteTransform)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "activity_type": "ExecutePipeline", + "calls_pipeline": "ChildTransformPipeline", + "child_pipeline_urn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ChildTransformPipeline,DEV)", + "child_first_activity": "TransformCustomerData" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/ParentOrchestrationPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "name": "ExecuteTransform", + "type": { + "string": "COMMAND" + }, + "flowUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ParentOrchestrationPipeline,DEV)", + "env": "DEV" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-nested-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ParentOrchestrationPipeline,DEV),ExecuteTransform)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Execute Pipeline" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-nested-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ParentOrchestrationPipeline,DEV),ExecuteTransform)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "urn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" + }, + { + "id": "complex-data-factory.ParentOrchestrationPipeline", + "urn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ParentOrchestrationPipeline,DEV)" + } + ] + } + }, + 
"systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-nested-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ChildTransformPipeline,DEV),TransformCustomerData)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [], + "outputDatasets": [], + "inputDatajobs": [ + "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ParentOrchestrationPipeline,DEV),ExecuteTransform)" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-nested-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ChildDataMovementPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-nested-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ChildDataMovementPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "dataFlowInfo", + "aspect": { + "json": { + "customProperties": { + "azure_resource_id": "/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory/pipelines/ChildDataMovementPipeline", + "factory_name": "complex-data-factory" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/ChildDataMovementPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "name": "ChildDataMovementPipeline", + "description": "Child pipeline for data movement", + "env": "DEV" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-nested-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ChildDataMovementPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Pipeline" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-nested-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ChildDataMovementPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-nested-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ChildDataMovementPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "urn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-nested-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": 
"urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ChildDataMovementPipeline,DEV),CopyCustomersToStaging)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-nested-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ChildDataMovementPipeline,DEV),CopyCustomersToStaging)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "activity_type": "Copy" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/ChildDataMovementPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "name": "CopyCustomersToStaging", + "type": { + "string": "COMMAND" + }, + "flowUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ChildDataMovementPipeline,DEV)", + "env": "DEV" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-nested-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ChildDataMovementPipeline,DEV),CopyCustomersToStaging)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Copy Activity" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-nested-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ChildDataMovementPipeline,DEV),CopyCustomersToStaging)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:mssql,Customers,DEV)" + ], + "outputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:abs,staging/customers,DEV)" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-nested-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ChildDataMovementPipeline,DEV),CopyCustomersToStaging)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "urn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" + }, + { + "id": "complex-data-factory.ChildDataMovementPipeline", + "urn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ChildDataMovementPipeline,DEV)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-nested-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ChildTransformPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-nested-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": 
"urn:li:dataFlow:(azure-data-factory,complex-data-factory.ChildTransformPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "dataFlowInfo", + "aspect": { + "json": { + "customProperties": { + "azure_resource_id": "/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory/pipelines/ChildTransformPipeline", + "factory_name": "complex-data-factory" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/ChildTransformPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "name": "ChildTransformPipeline", + "description": "Child pipeline for data transformation", + "env": "DEV" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-nested-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ChildTransformPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Pipeline" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-nested-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ChildTransformPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-nested-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ChildTransformPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "urn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-nested-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ChildTransformPipeline,DEV),TransformCustomerData)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-nested-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ChildTransformPipeline,DEV),TransformCustomerData)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "activity_type": "ExecuteDataFlow" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/ChildTransformPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "name": "TransformCustomerData", + "type": { + "string": "COMMAND" + }, + "flowUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ChildTransformPipeline,DEV)", + "env": "DEV" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-nested-test", + 
"lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ChildTransformPipeline,DEV),TransformCustomerData)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Data Flow Activity" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-nested-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ChildTransformPipeline,DEV),TransformCustomerData)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "urn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" + }, + { + "id": "complex-data-factory.ChildTransformPipeline", + "urn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ChildTransformPipeline,DEV)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-nested-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-nested-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ChildDataMovementPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-nested-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ChildTransformPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-nested-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ParentOrchestrationPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-nested-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ChildDataMovementPipeline,DEV),CopyCustomersToStaging)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-nested-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ChildTransformPipeline,DEV),TransformCustomerData)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-nested-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": 
"urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ParentOrchestrationPipeline,DEV),ExecuteDataMovement)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-nested-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ParentOrchestrationPipeline,DEV),ExecuteTransform)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-nested-test", + "lastRunId": "no-run-id-provided" + } +} +] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/azure_data_factory/adf_platform_instance_golden.json b/metadata-ingestion/tests/integration/azure_data_factory/adf_platform_instance_golden.json new file mode 100644 index 00000000000000..72c59841446d22 --- /dev/null +++ b/metadata-ingestion/tests/integration/azure_data_factory/adf_platform_instance_golden.json @@ -0,0 +1,812 @@ +[ +{ + "entityType": "container", + "entityUrn": "urn:li:container:99b9785e9e12713c9df27982572a999c", + "changeType": "UPSERT", + "aspectName": "containerProperties", + "aspect": { + "json": { + "customProperties": { + "platform": "azure-data-factory", + "instance": "my-adf-instance", + "env": "DEV", + "resource_group": "test-resource-group", + "factory_name": "test-data-factory", + "azure_resource_id": "/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/test-resource-group/providers/Microsoft.DataFactory/factories/test-data-factory", + "location": "eastus", + "tag:environment": "test", + "tag:team": "data-engineering", + "provisioning_state": "Succeeded" + }, + "externalUrl": "https://adf.azure.com/en/home?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/test-resource-group/providers/Microsoft.DataFactory/factories/test-data-factory", + "name": "test-data-factory", + "description": "Azure Data Factory: test-data-factory", + "env": "DEV" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-platform-instance", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:99b9785e9e12713c9df27982572a999c", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:azure-data-factory,my-adf-instance)" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-platform-instance", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV),TransformData)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "activity_type": "ExecuteDataFlow", + "activity_description": "Execute mapping data flow", + "timeout": "7.00:00:00", + "retry": "0" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/DataIngestionPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/test-resource-group/providers/Microsoft.DataFactory/factories/test-data-factory", + "name": "TransformData", + "description": "Execute mapping data 
flow", + "type": { + "string": "COMMAND" + }, + "flowUrn": "urn:li:dataFlow:(azure-data-factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV)", + "env": "DEV" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-platform-instance", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:99b9785e9e12713c9df27982572a999c", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:azure-data-factory,my-adf-instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:azure-data-factory,my-adf-instance)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-platform-instance", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV),TransformData)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Data Flow Activity" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-platform-instance", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:99b9785e9e12713c9df27982572a999c", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Data Factory" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-platform-instance", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV),TransformData)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:azure-data-factory,my-adf-instance)" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-platform-instance", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV),TransformData)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-platform-instance", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:azure-data-factory,my-adf-instance)" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-platform-instance", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV),CopyBlobToSQL)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory", + 
"instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:azure-data-factory,my-adf-instance)" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-platform-instance", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "dataFlowInfo", + "aspect": { + "json": { + "customProperties": { + "azure_resource_id": "/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/test-resource-group/providers/Microsoft.DataFactory/factories/test-data-factory/pipelines/DataIngestionPipeline", + "factory_name": "test-data-factory", + "triggers": "DailyScheduleTrigger" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/DataIngestionPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/test-resource-group/providers/Microsoft.DataFactory/factories/test-data-factory", + "name": "DataIngestionPipeline", + "description": "Main data ingestion pipeline", + "env": "DEV" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-platform-instance", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV),CopyBlobToSQL)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "activity_type": "Copy", + "activity_description": "Copy data from Blob to SQL", + "timeout": "7.00:00:00", + "retry": "0" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/DataIngestionPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/test-resource-group/providers/Microsoft.DataFactory/factories/test-data-factory", + "name": "CopyBlobToSQL", + "description": "Copy data from Blob to SQL", + "type": { + "string": "COMMAND" + }, + "flowUrn": "urn:li:dataFlow:(azure-data-factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV)", + "env": "DEV" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-platform-instance", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV),CopyBlobToSQL)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:abs,raw-data/input/data.csv,DEV)" + ], + "outputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:mssql,ProcessedData,DEV)" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-platform-instance", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV),CopyBlobToSQL)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:azure-data-factory,my-adf-instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:azure-data-factory,my-adf-instance)" + }, + { + "id": "urn:li:container:99b9785e9e12713c9df27982572a999c", + "urn": "urn:li:container:99b9785e9e12713c9df27982572a999c" + }, + { + "id": 
"my-adf-instance.test-data-factory.DataIngestionPipeline", + "urn": "urn:li:dataFlow:(azure-data-factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-platform-instance", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV),CopyBlobToSQL)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Copy Activity" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-platform-instance", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV),TransformData)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:azure-data-factory,my-adf-instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:azure-data-factory,my-adf-instance)" + }, + { + "id": "urn:li:container:99b9785e9e12713c9df27982572a999c", + "urn": "urn:li:container:99b9785e9e12713c9df27982572a999c" + }, + { + "id": "my-adf-instance.test-data-factory.DataIngestionPipeline", + "urn": "urn:li:dataFlow:(azure-data-factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-platform-instance", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Pipeline" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-platform-instance", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:azure-data-factory,my-adf-instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:azure-data-factory,my-adf-instance)" + }, + { + "id": "urn:li:container:99b9785e9e12713c9df27982572a999c", + "urn": "urn:li:container:99b9785e9e12713c9df27982572a999c" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-platform-instance", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:99b9785e9e12713c9df27982572a999c" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-platform-instance", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV),LookupConfig)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { 
+ "platform": "urn:li:dataPlatform:azure-data-factory", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:azure-data-factory,my-adf-instance)" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-platform-instance", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-platform-instance", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV),LookupConfig)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "activity_type": "Lookup", + "activity_description": "Lookup configuration values", + "timeout": "7.00:00:00", + "retry": "0" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/DataIngestionPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/test-resource-group/providers/Microsoft.DataFactory/factories/test-data-factory", + "name": "LookupConfig", + "description": "Lookup configuration values", + "type": { + "string": "COMMAND" + }, + "flowUrn": "urn:li:dataFlow:(azure-data-factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV)", + "env": "DEV" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-platform-instance", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV),LookupConfig)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:abs,config/settings.json,DEV)" + ], + "outputDatasets": [] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-platform-instance", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV),LookupConfig)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:azure-data-factory,my-adf-instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:azure-data-factory,my-adf-instance)" + }, + { + "id": "urn:li:container:99b9785e9e12713c9df27982572a999c", + "urn": "urn:li:container:99b9785e9e12713c9df27982572a999c" + }, + { + "id": "my-adf-instance.test-data-factory.DataIngestionPipeline", + "urn": "urn:li:dataFlow:(azure-data-factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-platform-instance", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV),LookupConfig)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Lookup Activity" + ] + } + }, + "systemMetadata": { + 
"lastObserved": 1705320000000, + "runId": "adf-test-platform-instance", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,my-adf-instance.test-data-factory.DataProcessingPipeline,DEV),CallStoredProc)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "activity_type": "SqlServerStoredProcedure", + "activity_description": "Call stored procedure", + "timeout": "7.00:00:00", + "retry": "0" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/DataProcessingPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/test-resource-group/providers/Microsoft.DataFactory/factories/test-data-factory", + "name": "CallStoredProc", + "description": "Call stored procedure", + "type": { + "string": "COMMAND" + }, + "flowUrn": "urn:li:dataFlow:(azure-data-factory,my-adf-instance.test-data-factory.DataProcessingPipeline,DEV)", + "env": "DEV" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-platform-instance", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,my-adf-instance.test-data-factory.DataProcessingPipeline,DEV),CallStoredProc)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Stored Procedure Activity" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-platform-instance", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,my-adf-instance.test-data-factory.DataProcessingPipeline,DEV),CallStoredProc)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:azure-data-factory,my-adf-instance)" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-platform-instance", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,my-adf-instance.test-data-factory.DataProcessingPipeline,DEV),CallStoredProc)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-platform-instance", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,my-adf-instance.test-data-factory.DataProcessingPipeline,DEV),CallStoredProc)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:azure-data-factory,my-adf-instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:azure-data-factory,my-adf-instance)" + }, + { + "id": "urn:li:container:99b9785e9e12713c9df27982572a999c", + "urn": "urn:li:container:99b9785e9e12713c9df27982572a999c" + }, + { + "id": "my-adf-instance.test-data-factory.DataProcessingPipeline", + "urn": "urn:li:dataFlow:(azure-data-factory,my-adf-instance.test-data-factory.DataProcessingPipeline,DEV)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-platform-instance", + "lastRunId": 
"no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:99b9785e9e12713c9df27982572a999c", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-platform-instance", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,my-adf-instance.test-data-factory.DataProcessingPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Pipeline" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-platform-instance", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,my-adf-instance.test-data-factory.DataProcessingPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:99b9785e9e12713c9df27982572a999c" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-platform-instance", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,my-adf-instance.test-data-factory.DataProcessingPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:azure-data-factory,my-adf-instance)" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-platform-instance", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,my-adf-instance.test-data-factory.DataProcessingPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "dataFlowInfo", + "aspect": { + "json": { + "customProperties": { + "azure_resource_id": "/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/test-resource-group/providers/Microsoft.DataFactory/factories/test-data-factory/pipelines/DataProcessingPipeline", + "factory_name": "test-data-factory" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/DataProcessingPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/test-resource-group/providers/Microsoft.DataFactory/factories/test-data-factory", + "name": "DataProcessingPipeline", + "description": "Data processing and transformation", + "env": "DEV" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-platform-instance", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,my-adf-instance.test-data-factory.DataProcessingPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-platform-instance", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV),CopyBlobToSQL)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-platform-instance", + 
"lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,my-adf-instance.test-data-factory.DataProcessingPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:azure-data-factory,my-adf-instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:azure-data-factory,my-adf-instance)" + }, + { + "id": "urn:li:container:99b9785e9e12713c9df27982572a999c", + "urn": "urn:li:container:99b9785e9e12713c9df27982572a999c" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-platform-instance", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV),LookupConfig)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-platform-instance", + "lastRunId": "no-run-id-provided" + } +} +] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/azure_data_factory/adf_with_runs_golden.json b/metadata-ingestion/tests/integration/azure_data_factory/adf_with_runs_golden.json new file mode 100644 index 00000000000000..964ed8b1d83877 --- /dev/null +++ b/metadata-ingestion/tests/integration/azure_data_factory/adf_with_runs_golden.json @@ -0,0 +1,1781 @@ +[ +{ + "entityType": "container", + "entityUrn": "urn:li:container:cdaebfa861d2b2a3853328719496bce5", + "changeType": "UPSERT", + "aspectName": "containerProperties", + "aspect": { + "json": { + "customProperties": { + "platform": "azure-data-factory", + "env": "PROD", + "resource_group": "test-resource-group", + "factory_name": "test-data-factory", + "azure_resource_id": "/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/test-resource-group/providers/Microsoft.DataFactory/factories/test-data-factory", + "location": "eastus", + "tag:environment": "test", + "tag:team": "data-engineering", + "provisioning_state": "Succeeded" + }, + "externalUrl": "https://adf.azure.com/en/home?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/test-resource-group/providers/Microsoft.DataFactory/factories/test-data-factory", + "name": "test-data-factory", + "description": "Azure Data Factory: test-data-factory", + "env": "PROD" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:cdaebfa861d2b2a3853328719496bce5", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:cdaebfa861d2b2a3853328719496bce5", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Data Factory" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:cdaebfa861d2b2a3853328719496bce5", + "changeType": 
"UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD)", + "changeType": "UPSERT", + "aspectName": "dataFlowInfo", + "aspect": { + "json": { + "customProperties": { + "azure_resource_id": "/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/test-resource-group/providers/Microsoft.DataFactory/factories/test-data-factory/pipelines/DataIngestionPipeline", + "factory_name": "test-data-factory", + "triggers": "DailyScheduleTrigger" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/DataIngestionPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/test-resource-group/providers/Microsoft.DataFactory/factories/test-data-factory", + "name": "DataIngestionPipeline", + "description": "Main data ingestion pipeline", + "env": "PROD" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Pipeline" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:cdaebfa861d2b2a3853328719496bce5" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:cdaebfa861d2b2a3853328719496bce5", + "urn": "urn:li:container:cdaebfa861d2b2a3853328719496bce5" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD),CopyBlobToSQL)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": 
"urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD),CopyBlobToSQL)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "activity_type": "Copy", + "activity_description": "Copy data from Blob to SQL", + "timeout": "7.00:00:00", + "retry": "0" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/DataIngestionPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/test-resource-group/providers/Microsoft.DataFactory/factories/test-data-factory", + "name": "CopyBlobToSQL", + "description": "Copy data from Blob to SQL", + "type": { + "string": "COMMAND" + }, + "flowUrn": "urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD)", + "env": "PROD" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD),CopyBlobToSQL)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Copy Activity" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD),CopyBlobToSQL)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:abs,raw-data/input/data.csv,PROD)" + ], + "outputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:mssql,ProcessedData,PROD)" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD),CopyBlobToSQL)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:cdaebfa861d2b2a3853328719496bce5", + "urn": "urn:li:container:cdaebfa861d2b2a3853328719496bce5" + }, + { + "id": "test-data-factory.DataIngestionPipeline", + "urn": "urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD),LookupConfig)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD),LookupConfig)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "activity_type": "Lookup", + "activity_description": "Lookup configuration values", + "timeout": "7.00:00:00", + "retry": "0" + }, + "externalUrl": 
"https://adf.azure.com/en/authoring/pipeline/DataIngestionPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/test-resource-group/providers/Microsoft.DataFactory/factories/test-data-factory", + "name": "LookupConfig", + "description": "Lookup configuration values", + "type": { + "string": "COMMAND" + }, + "flowUrn": "urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD)", + "env": "PROD" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD),LookupConfig)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Lookup Activity" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD),LookupConfig)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:abs,config/settings.json,PROD)" + ], + "outputDatasets": [] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD),LookupConfig)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:cdaebfa861d2b2a3853328719496bce5", + "urn": "urn:li:container:cdaebfa861d2b2a3853328719496bce5" + }, + { + "id": "test-data-factory.DataIngestionPipeline", + "urn": "urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD),TransformData)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD),TransformData)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "activity_type": "ExecuteDataFlow", + "activity_description": "Execute mapping data flow", + "timeout": "7.00:00:00", + "retry": "0" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/DataIngestionPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/test-resource-group/providers/Microsoft.DataFactory/factories/test-data-factory", + "name": "TransformData", + "description": "Execute mapping data flow", + "type": { + "string": "COMMAND" + }, + "flowUrn": "urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD)", + "env": "PROD" + } + }, + 
"systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD),TransformData)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Data Flow Activity" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD),TransformData)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:cdaebfa861d2b2a3853328719496bce5", + "urn": "urn:li:container:cdaebfa861d2b2a3853328719496bce5" + }, + { + "id": "test-data-factory.DataIngestionPipeline", + "urn": "urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,test-data-factory.DataProcessingPipeline,PROD)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,test-data-factory.DataProcessingPipeline,PROD)", + "changeType": "UPSERT", + "aspectName": "dataFlowInfo", + "aspect": { + "json": { + "customProperties": { + "azure_resource_id": "/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/test-resource-group/providers/Microsoft.DataFactory/factories/test-data-factory/pipelines/DataProcessingPipeline", + "factory_name": "test-data-factory" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/DataProcessingPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/test-resource-group/providers/Microsoft.DataFactory/factories/test-data-factory", + "name": "DataProcessingPipeline", + "description": "Data processing and transformation", + "env": "PROD" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,test-data-factory.DataProcessingPipeline,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Pipeline" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,test-data-factory.DataProcessingPipeline,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:cdaebfa861d2b2a3853328719496bce5" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": 
"urn:li:dataFlow:(azure-data-factory,test-data-factory.DataProcessingPipeline,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:cdaebfa861d2b2a3853328719496bce5", + "urn": "urn:li:container:cdaebfa861d2b2a3853328719496bce5" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,test-data-factory.DataProcessingPipeline,PROD),CallStoredProc)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,test-data-factory.DataProcessingPipeline,PROD),CallStoredProc)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "activity_type": "SqlServerStoredProcedure", + "activity_description": "Call stored procedure", + "timeout": "7.00:00:00", + "retry": "0" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/DataProcessingPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/test-resource-group/providers/Microsoft.DataFactory/factories/test-data-factory", + "name": "CallStoredProc", + "description": "Call stored procedure", + "type": { + "string": "COMMAND" + }, + "flowUrn": "urn:li:dataFlow:(azure-data-factory,test-data-factory.DataProcessingPipeline,PROD)", + "env": "PROD" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,test-data-factory.DataProcessingPipeline,PROD),CallStoredProc)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Stored Procedure Activity" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,test-data-factory.DataProcessingPipeline,PROD),CallStoredProc)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:cdaebfa861d2b2a3853328719496bce5", + "urn": "urn:li:container:cdaebfa861d2b2a3853328719496bce5" + }, + { + "id": "test-data-factory.DataProcessingPipeline", + "urn": "urn:li:dataFlow:(azure-data-factory,test-data-factory.DataProcessingPipeline,PROD)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:7aa70b5e31344dc1946c045ef1df4619", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceProperties", + "aspect": { + "json": { + "customProperties": { + "run_id": "run-001-abc", + "status": "Succeeded", + "invoked_by": "Manual", + "invoked_by_type": "Manual" + }, + "externalUrl": 
"https://adf.azure.com/en/monitoring/pipelineruns/run-001-abc?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/test-resource-group/providers/Microsoft.DataFactory/factories/test-data-factory", + "name": "run-001-abc", + "type": "BATCH_SCHEDULED", + "created": { + "time": 1705305600000, + "actor": "urn:li:corpuser:datahub" + } + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:7aa70b5e31344dc1946c045ef1df4619", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRelationships", + "aspect": { + "json": { + "parentTemplate": "urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD)", + "upstreamInstances": [] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:7aa70b5e31344dc1946c045ef1df4619", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Pipeline Run" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:7aa70b5e31344dc1946c045ef1df4619", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1705305600000, + "partitionSpec": { + "partition": "FULL_TABLE_SNAPSHOT", + "type": "FULL_TABLE" + }, + "status": "STARTED" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:7aa70b5e31344dc1946c045ef1df4619", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1705308300000, + "partitionSpec": { + "partition": "FULL_TABLE_SNAPSHOT", + "type": "FULL_TABLE" + }, + "status": "COMPLETE", + "result": { + "type": "SUCCESS", + "nativeResultType": "Succeeded" + } + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:564015ffc3af551e6d8e26a5f4710ea5", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceProperties", + "aspect": { + "json": { + "customProperties": { + "activity_run_id": "act-001-copy", + "activity_type": "Copy", + "pipeline_run_id": "run-001-abc", + "status": "Succeeded", + "duration_ms": "900000" + }, + "externalUrl": "https://adf.azure.com/en/monitoring/pipelineruns/run-001-abc?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/test-resource-group/providers/Microsoft.DataFactory/factories/test-data-factory", + "name": "act-001-copy", + "type": "BATCH_SCHEDULED", + "created": { + "time": 1705305900000, + "actor": "urn:li:corpuser:datahub" + } + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:564015ffc3af551e6d8e26a5f4710ea5", + "changeType": "UPSERT", + "aspectName": 
"dataProcessInstanceRelationships", + "aspect": { + "json": { + "parentTemplate": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD),CopyBlobToSQL)", + "upstreamInstances": [] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:564015ffc3af551e6d8e26a5f4710ea5", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Activity Run" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:564015ffc3af551e6d8e26a5f4710ea5", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1705305900000, + "partitionSpec": { + "partition": "FULL_TABLE_SNAPSHOT", + "type": "FULL_TABLE" + }, + "status": "STARTED" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:564015ffc3af551e6d8e26a5f4710ea5", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1705306800000, + "partitionSpec": { + "partition": "FULL_TABLE_SNAPSHOT", + "type": "FULL_TABLE" + }, + "status": "COMPLETE", + "result": { + "type": "SUCCESS", + "nativeResultType": "Succeeded" + } + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:aaef305ee765d2fac00af4c0f4d859a7", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceProperties", + "aspect": { + "json": { + "customProperties": { + "activity_run_id": "act-001-lookup", + "activity_type": "Lookup", + "pipeline_run_id": "run-001-abc", + "status": "Succeeded", + "duration_ms": "60000" + }, + "externalUrl": "https://adf.azure.com/en/monitoring/pipelineruns/run-001-abc?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/test-resource-group/providers/Microsoft.DataFactory/factories/test-data-factory", + "name": "act-001-lookup", + "type": "BATCH_SCHEDULED", + "created": { + "time": 1705306800000, + "actor": "urn:li:corpuser:datahub" + } + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:aaef305ee765d2fac00af4c0f4d859a7", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRelationships", + "aspect": { + "json": { + "parentTemplate": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD),LookupConfig)", + "upstreamInstances": [] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:aaef305ee765d2fac00af4c0f4d859a7", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Activity Run" + ] + } + }, + "systemMetadata": { + 
"lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:aaef305ee765d2fac00af4c0f4d859a7", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1705306800000, + "partitionSpec": { + "partition": "FULL_TABLE_SNAPSHOT", + "type": "FULL_TABLE" + }, + "status": "STARTED" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:aaef305ee765d2fac00af4c0f4d859a7", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1705306860000, + "partitionSpec": { + "partition": "FULL_TABLE_SNAPSHOT", + "type": "FULL_TABLE" + }, + "status": "COMPLETE", + "result": { + "type": "SUCCESS", + "nativeResultType": "Succeeded" + } + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:be65c76a61bba7d112df51643744bd7d", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceProperties", + "aspect": { + "json": { + "customProperties": { + "activity_run_id": "act-001-transform", + "activity_type": "ExecuteDataFlow", + "pipeline_run_id": "run-001-abc", + "status": "Succeeded", + "duration_ms": "1440000" + }, + "externalUrl": "https://adf.azure.com/en/monitoring/pipelineruns/run-001-abc?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/test-resource-group/providers/Microsoft.DataFactory/factories/test-data-factory", + "name": "act-001-transform", + "type": "BATCH_SCHEDULED", + "created": { + "time": 1705306860000, + "actor": "urn:li:corpuser:datahub" + } + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:be65c76a61bba7d112df51643744bd7d", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRelationships", + "aspect": { + "json": { + "parentTemplate": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD),TransformData)", + "upstreamInstances": [] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:be65c76a61bba7d112df51643744bd7d", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Activity Run" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:be65c76a61bba7d112df51643744bd7d", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1705306860000, + "partitionSpec": { + "partition": "FULL_TABLE_SNAPSHOT", + "type": "FULL_TABLE" + }, + "status": "STARTED" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + 
"entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:be65c76a61bba7d112df51643744bd7d", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1705308300000, + "partitionSpec": { + "partition": "FULL_TABLE_SNAPSHOT", + "type": "FULL_TABLE" + }, + "status": "COMPLETE", + "result": { + "type": "SUCCESS", + "nativeResultType": "Succeeded" + } + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:1f3c62dd3b62b29a83221a57582a9bf3", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceProperties", + "aspect": { + "json": { + "customProperties": { + "run_id": "run-002-def", + "status": "Failed", + "invoked_by": "Manual", + "invoked_by_type": "Manual" + }, + "externalUrl": "https://adf.azure.com/en/monitoring/pipelineruns/run-002-def?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/test-resource-group/providers/Microsoft.DataFactory/factories/test-data-factory", + "name": "run-002-def", + "type": "BATCH_SCHEDULED", + "created": { + "time": 1705219200000, + "actor": "urn:li:corpuser:datahub" + } + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:1f3c62dd3b62b29a83221a57582a9bf3", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRelationships", + "aspect": { + "json": { + "parentTemplate": "urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD)", + "upstreamInstances": [] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:1f3c62dd3b62b29a83221a57582a9bf3", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Pipeline Run" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:1f3c62dd3b62b29a83221a57582a9bf3", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1705219200000, + "partitionSpec": { + "partition": "FULL_TABLE_SNAPSHOT", + "type": "FULL_TABLE" + }, + "status": "STARTED" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:1f3c62dd3b62b29a83221a57582a9bf3", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1705220100000, + "partitionSpec": { + "partition": "FULL_TABLE_SNAPSHOT", + "type": "FULL_TABLE" + }, + "status": "COMPLETE", + "result": { + "type": "FAILURE", + "nativeResultType": "Failed" + } + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:c86ba22f37d917ce9c2933e1cc443fd5", + "changeType": "UPSERT", + 
"aspectName": "dataProcessInstanceProperties", + "aspect": { + "json": { + "customProperties": { + "activity_run_id": "act-002-copy", + "activity_type": "Copy", + "pipeline_run_id": "run-002-def", + "status": "Failed", + "duration_ms": "600000", + "error": "Connection timeout to SQL database" + }, + "externalUrl": "https://adf.azure.com/en/monitoring/pipelineruns/run-002-def?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/test-resource-group/providers/Microsoft.DataFactory/factories/test-data-factory", + "name": "act-002-copy", + "type": "BATCH_SCHEDULED", + "created": { + "time": 1705219500000, + "actor": "urn:li:corpuser:datahub" + } + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:c86ba22f37d917ce9c2933e1cc443fd5", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRelationships", + "aspect": { + "json": { + "parentTemplate": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD),CopyBlobToSQL)", + "upstreamInstances": [] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:c86ba22f37d917ce9c2933e1cc443fd5", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Activity Run" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:c86ba22f37d917ce9c2933e1cc443fd5", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1705219500000, + "partitionSpec": { + "partition": "FULL_TABLE_SNAPSHOT", + "type": "FULL_TABLE" + }, + "status": "STARTED" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:c86ba22f37d917ce9c2933e1cc443fd5", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1705220100000, + "partitionSpec": { + "partition": "FULL_TABLE_SNAPSHOT", + "type": "FULL_TABLE" + }, + "status": "COMPLETE", + "result": { + "type": "FAILURE", + "nativeResultType": "Failed" + } + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:a9a82c0897d67213837e8ee52e99bd9b", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceProperties", + "aspect": { + "json": { + "customProperties": { + "run_id": "run-003-ghi", + "status": "Succeeded", + "invoked_by": "Manual", + "invoked_by_type": "Manual" + }, + "externalUrl": "https://adf.azure.com/en/monitoring/pipelineruns/run-003-ghi?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/test-resource-group/providers/Microsoft.DataFactory/factories/test-data-factory", + "name": "run-003-ghi", + "type": "BATCH_SCHEDULED", + "created": { + "time": 1705309200000, + "actor": "urn:li:corpuser:datahub" + } + } + }, + 
"systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:a9a82c0897d67213837e8ee52e99bd9b", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRelationships", + "aspect": { + "json": { + "parentTemplate": "urn:li:dataFlow:(azure-data-factory,test-data-factory.DataProcessingPipeline,PROD)", + "upstreamInstances": [] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:a9a82c0897d67213837e8ee52e99bd9b", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Pipeline Run" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:a9a82c0897d67213837e8ee52e99bd9b", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1705309200000, + "partitionSpec": { + "partition": "FULL_TABLE_SNAPSHOT", + "type": "FULL_TABLE" + }, + "status": "STARTED" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:a9a82c0897d67213837e8ee52e99bd9b", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1705311000000, + "partitionSpec": { + "partition": "FULL_TABLE_SNAPSHOT", + "type": "FULL_TABLE" + }, + "status": "COMPLETE", + "result": { + "type": "SUCCESS", + "nativeResultType": "Succeeded" + } + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:215721aa606120d23beb87553668afb1", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceProperties", + "aspect": { + "json": { + "customProperties": { + "activity_run_id": "act-003-proc", + "activity_type": "SqlServerStoredProcedure", + "pipeline_run_id": "run-003-ghi", + "status": "Succeeded", + "duration_ms": "1500000" + }, + "externalUrl": "https://adf.azure.com/en/monitoring/pipelineruns/run-003-ghi?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/test-resource-group/providers/Microsoft.DataFactory/factories/test-data-factory", + "name": "act-003-proc", + "type": "BATCH_SCHEDULED", + "created": { + "time": 1705309500000, + "actor": "urn:li:corpuser:datahub" + } + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:215721aa606120d23beb87553668afb1", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRelationships", + "aspect": { + "json": { + "parentTemplate": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,test-data-factory.DataProcessingPipeline,PROD),CallStoredProc)", + "upstreamInstances": [] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, 
+{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:215721aa606120d23beb87553668afb1", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Activity Run" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:215721aa606120d23beb87553668afb1", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1705309500000, + "partitionSpec": { + "partition": "FULL_TABLE_SNAPSHOT", + "type": "FULL_TABLE" + }, + "status": "STARTED" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:215721aa606120d23beb87553668afb1", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1705311000000, + "partitionSpec": { + "partition": "FULL_TABLE_SNAPSHOT", + "type": "FULL_TABLE" + }, + "status": "COMPLETE", + "result": { + "type": "SUCCESS", + "nativeResultType": "Succeeded" + } + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:cdaebfa861d2b2a3853328719496bce5", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,test-data-factory.DataProcessingPipeline,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD),CopyBlobToSQL)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD),LookupConfig)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD),TransformData)", + "changeType": "UPSERT", + "aspectName": "status", + 
"aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,test-data-factory.DataProcessingPipeline,PROD),CallStoredProc)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:1f3c62dd3b62b29a83221a57582a9bf3", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:215721aa606120d23beb87553668afb1", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:564015ffc3af551e6d8e26a5f4710ea5", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:7aa70b5e31344dc1946c045ef1df4619", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:a9a82c0897d67213837e8ee52e99bd9b", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:aaef305ee765d2fac00af4c0f4d859a7", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:be65c76a61bba7d112df51643744bd7d", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:c86ba22f37d917ce9c2933e1cc443fd5", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +} +] \ No newline at end of file diff --git 
a/metadata-ingestion/tests/integration/azure_data_factory/complex_mocks.py b/metadata-ingestion/tests/integration/azure_data_factory/complex_mocks.py
new file mode 100644
index 00000000000000..3e72c96681d9ac
--- /dev/null
+++ b/metadata-ingestion/tests/integration/azure_data_factory/complex_mocks.py
@@ -0,0 +1,1399 @@
+"""Complex mock data for Azure Data Factory integration tests.
+
+This module provides mock data for testing complex ADF pipeline patterns:
+1. Nested Pipelines (Execute Pipeline activity)
+2. ForEach Loops with multiple activities
+3. Control Flow Branching (If-Condition, Switch)
+4. Mapping Data Flows with transformations
+5. Multi-Source Copy Pipelines (SQL → Blob → Synapse)
+6. Diverse Activity Types (SetVariable, WebActivity, stored procedures, etc.)
+7. Mixed Dependencies (pipeline-to-pipeline plus dataset lineage)
+"""
+
+from typing import Any, Dict, List
+
+# Common test constants
+SUBSCRIPTION_ID = "12345678-1234-1234-1234-123456789012"
+RESOURCE_GROUP = "complex-test-rg"
+FACTORY_NAME = "complex-data-factory"
+LOCATION = "eastus"
+
+
+def _base_resource_id(resource_type: str, name: str) -> str:
+    """Generate a standard Azure resource ID."""
+    return (
+        f"/subscriptions/{SUBSCRIPTION_ID}/resourceGroups/{RESOURCE_GROUP}"
+        f"/providers/Microsoft.DataFactory/factories/{FACTORY_NAME}/{resource_type}/{name}"
+    )
+
+
+# =============================================================================
+# LINKED SERVICES - Various platform types for lineage testing
+# =============================================================================
+
+
+def create_complex_linked_services() -> List[Dict[str, Any]]:
+    """Create linked services for multiple platforms."""
+    return [
+        {
+            "id": _base_resource_id("linkedservices", "SqlServerSource"),
+            "name": "SqlServerSource",
+            "type": "Microsoft.DataFactory/factories/linkedservices",
+            "properties": {
+                "type": "AzureSqlDatabase",
+                "typeProperties": {
+                    "connectionString": "Server=sql-server.database.windows.net;Database=SourceDB"
+                },
+            },
+        },
+        {
+            "id": _base_resource_id("linkedservices", "BlobStorage"),
+            "name": "BlobStorage",
+            "type": "Microsoft.DataFactory/factories/linkedservices",
+            "properties": {
+                "type": "AzureBlobStorage",
+                "typeProperties": {
+                    "connectionString": "DefaultEndpointsProtocol=https"
+                },
+            },
+        },
+        {
+            "id": _base_resource_id("linkedservices", "SynapseDestination"),
+            "name": "SynapseDestination",
+            "type": "Microsoft.DataFactory/factories/linkedservices",
+            "properties": {
+                "type": "AzureSynapseAnalytics",
+                "typeProperties": {
+                    "connectionString": "Server=synapse.sql.azuresynapse.net;Database=DW"
+                },
+            },
+        },
+        {
+            "id": _base_resource_id("linkedservices", "SnowflakeConnection"),
+            "name": "SnowflakeConnection",
+            "type": "Microsoft.DataFactory/factories/linkedservices",
+            "properties": {
+                "type": "Snowflake",
+                "typeProperties": {"connectionString": "account=myaccount"},
+            },
+        },
+        {
+            "id": _base_resource_id("linkedservices", "DataLakeStorage"),
+            "name": "DataLakeStorage",
+            "type": "Microsoft.DataFactory/factories/linkedservices",
+            "properties": {
+                "type": "AzureBlobFS",
+                "typeProperties": {"url": "https://datalake.dfs.core.windows.net"},
+            },
+        },
+    ]
+
+
+# =============================================================================
+# DATASETS - Input/output datasets for lineage
+# =============================================================================
+
+
+def create_complex_datasets() -> List[Dict[str, Any]]:
+    """Create datasets for complex lineage scenarios."""
+    return [
+        # SQL Server datasets
+        {
+            "id": _base_resource_id("datasets", "SqlCustomersTable"),
+            "name": "SqlCustomersTable",
+            "type":
"Microsoft.DataFactory/factories/datasets", + "properties": { + "linkedServiceName": { + "referenceName": "SqlServerSource", + "type": "LinkedServiceReference", + }, + "type": "AzureSqlTable", + "typeProperties": {"schema": "dbo", "table": "Customers"}, + }, + }, + { + "id": _base_resource_id("datasets", "SqlOrdersTable"), + "name": "SqlOrdersTable", + "type": "Microsoft.DataFactory/factories/datasets", + "properties": { + "linkedServiceName": { + "referenceName": "SqlServerSource", + "type": "LinkedServiceReference", + }, + "type": "AzureSqlTable", + "typeProperties": {"schema": "dbo", "table": "Orders"}, + }, + }, + { + "id": _base_resource_id("datasets", "SqlProductsTable"), + "name": "SqlProductsTable", + "type": "Microsoft.DataFactory/factories/datasets", + "properties": { + "linkedServiceName": { + "referenceName": "SqlServerSource", + "type": "LinkedServiceReference", + }, + "type": "AzureSqlTable", + "typeProperties": {"schema": "dbo", "table": "Products"}, + }, + }, + # Blob storage datasets + { + "id": _base_resource_id("datasets", "BlobStagingCustomers"), + "name": "BlobStagingCustomers", + "type": "Microsoft.DataFactory/factories/datasets", + "properties": { + "linkedServiceName": { + "referenceName": "BlobStorage", + "type": "LinkedServiceReference", + }, + "type": "DelimitedText", + "typeProperties": { + "location": { + "type": "AzureBlobStorageLocation", + "container": "staging", + "folderPath": "customers", + } + }, + }, + }, + { + "id": _base_resource_id("datasets", "BlobStagingOrders"), + "name": "BlobStagingOrders", + "type": "Microsoft.DataFactory/factories/datasets", + "properties": { + "linkedServiceName": { + "referenceName": "BlobStorage", + "type": "LinkedServiceReference", + }, + "type": "DelimitedText", + "typeProperties": { + "location": { + "type": "AzureBlobStorageLocation", + "container": "staging", + "folderPath": "orders", + } + }, + }, + }, + # Synapse datasets + { + "id": _base_resource_id("datasets", "SynapseCustomersDim"), + "name": "SynapseCustomersDim", + "type": "Microsoft.DataFactory/factories/datasets", + "properties": { + "linkedServiceName": { + "referenceName": "SynapseDestination", + "type": "LinkedServiceReference", + }, + "type": "AzureSqlDWTable", + "typeProperties": {"schema": "dw", "table": "DimCustomers"}, + }, + }, + { + "id": _base_resource_id("datasets", "SynapseOrdersFact"), + "name": "SynapseOrdersFact", + "type": "Microsoft.DataFactory/factories/datasets", + "properties": { + "linkedServiceName": { + "referenceName": "SynapseDestination", + "type": "LinkedServiceReference", + }, + "type": "AzureSqlDWTable", + "typeProperties": {"schema": "dw", "table": "FactOrders"}, + }, + }, + # Data Lake datasets for Data Flow + { + "id": _base_resource_id("datasets", "DataLakeRawData"), + "name": "DataLakeRawData", + "type": "Microsoft.DataFactory/factories/datasets", + "properties": { + "linkedServiceName": { + "referenceName": "DataLakeStorage", + "type": "LinkedServiceReference", + }, + "type": "Parquet", + "typeProperties": { + "location": { + "type": "AzureBlobFSLocation", + "fileSystem": "raw", + "folderPath": "sales", + } + }, + }, + }, + { + "id": _base_resource_id("datasets", "DataLakeCuratedData"), + "name": "DataLakeCuratedData", + "type": "Microsoft.DataFactory/factories/datasets", + "properties": { + "linkedServiceName": { + "referenceName": "DataLakeStorage", + "type": "LinkedServiceReference", + }, + "type": "Parquet", + "typeProperties": { + "location": { + "type": "AzureBlobFSLocation", + "fileSystem": "curated", + "folderPath": 
"sales_summary", + } + }, + }, + }, + ] + + +# ============================================================================= +# SCENARIO 1: NESTED PIPELINES (Execute Pipeline Activity) +# ============================================================================= + + +def create_nested_pipeline_scenario() -> Dict[str, Any]: + """Create mock data for nested pipeline scenario. + + Structure: + - ParentOrchestrationPipeline + └── ExecutePipeline: ChildDataMovementPipeline + └── Copy: SqlToBlob + └── ExecutePipeline: ChildTransformPipeline + └── DataFlow: TransformData + """ + child_data_movement = { + "id": _base_resource_id("pipelines", "ChildDataMovementPipeline"), + "name": "ChildDataMovementPipeline", + "type": "Microsoft.DataFactory/factories/pipelines", + "properties": { + "description": "Child pipeline for data movement", + "activities": [ + { + "name": "CopyCustomersToStaging", + "type": "Copy", + "inputs": [ + { + "referenceName": "SqlCustomersTable", + "type": "DatasetReference", + } + ], + "outputs": [ + { + "referenceName": "BlobStagingCustomers", + "type": "DatasetReference", + } + ], + "typeProperties": { + "source": {"type": "AzureSqlSource"}, + "sink": {"type": "DelimitedTextSink"}, + }, + } + ], + "parameters": {"sourceTable": {"type": "String"}}, + }, + } + + child_transform = { + "id": _base_resource_id("pipelines", "ChildTransformPipeline"), + "name": "ChildTransformPipeline", + "type": "Microsoft.DataFactory/factories/pipelines", + "properties": { + "description": "Child pipeline for data transformation", + "activities": [ + { + "name": "TransformCustomerData", + "type": "ExecuteDataFlow", + "typeProperties": { + "dataflow": { + "referenceName": "CustomerTransformFlow", + "type": "DataFlowReference", + } + }, + } + ], + }, + } + + parent_pipeline = { + "id": _base_resource_id("pipelines", "ParentOrchestrationPipeline"), + "name": "ParentOrchestrationPipeline", + "type": "Microsoft.DataFactory/factories/pipelines", + "properties": { + "description": "Parent orchestration pipeline that calls child pipelines", + "activities": [ + { + "name": "ExecuteDataMovement", + "type": "ExecutePipeline", + "typeProperties": { + "pipeline": { + "referenceName": "ChildDataMovementPipeline", + "type": "PipelineReference", + }, + "waitOnCompletion": True, + "parameters": {"sourceTable": "Customers"}, + }, + }, + { + "name": "ExecuteTransform", + "type": "ExecutePipeline", + "dependsOn": [ + { + "activity": "ExecuteDataMovement", + "dependencyConditions": ["Succeeded"], + } + ], + "typeProperties": { + "pipeline": { + "referenceName": "ChildTransformPipeline", + "type": "PipelineReference", + }, + "waitOnCompletion": True, + }, + }, + ], + }, + } + + return { + "pipelines": [parent_pipeline, child_data_movement, child_transform], + "expected_dataflows": 3, + "expected_datajobs": 4, # 2 ExecutePipeline + 1 Copy + 1 DataFlow + } + + +# ============================================================================= +# SCENARIO 2: FOREACH LOOPS +# ============================================================================= + + +def create_foreach_loop_scenario() -> Dict[str, Any]: + """Create mock data for ForEach loop scenario. 
+ + Structure: + - ForEachTablePipeline + └── ForEach: IterateOverTables + └── Copy: CopyTableToStaging (parametrized) + """ + pipeline = { + "id": _base_resource_id("pipelines", "ForEachTablePipeline"), + "name": "ForEachTablePipeline", + "type": "Microsoft.DataFactory/factories/pipelines", + "properties": { + "description": "Pipeline with ForEach loop to copy multiple tables", + "parameters": { + "tableList": { + "type": "Array", + "defaultValue": ["Customers", "Orders", "Products"], + } + }, + "activities": [ + { + "name": "GetTableList", + "type": "Lookup", + "typeProperties": { + "source": { + "type": "AzureSqlSource", + "sqlReaderQuery": "SELECT name FROM sys.tables", + }, + "dataset": { + "referenceName": "SqlCustomersTable", + "type": "DatasetReference", + }, + "firstRowOnly": False, + }, + }, + { + "name": "IterateOverTables", + "type": "ForEach", + "dependsOn": [ + { + "activity": "GetTableList", + "dependencyConditions": ["Succeeded"], + } + ], + "typeProperties": { + "items": { + "value": "@activity('GetTableList').output.value", + "type": "Expression", + }, + "isSequential": False, + "batchCount": 5, + "activities": [ + { + "name": "CopyTableToStaging", + "type": "Copy", + "inputs": [ + { + "referenceName": "SqlCustomersTable", + "type": "DatasetReference", + } + ], + "outputs": [ + { + "referenceName": "BlobStagingCustomers", + "type": "DatasetReference", + } + ], + "typeProperties": { + "source": {"type": "AzureSqlSource"}, + "sink": {"type": "DelimitedTextSink"}, + }, + } + ], + }, + }, + ], + }, + } + + return { + "pipelines": [pipeline], + "expected_dataflows": 1, + "expected_datajobs": 3, # Lookup + ForEach + Copy (inside ForEach) + } + + +# ============================================================================= +# SCENARIO 3: CONTROL FLOW BRANCHING (If-Condition, Switch) +# ============================================================================= + + +def create_branching_scenario() -> Dict[str, Any]: + """Create mock data for control flow branching scenario. 
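+
+    Activities are extracted statically from every branch, so the expected
+    job count sums both If branches and all Switch cases; illustratively:
+
+        # 1 Lookup + 1 IfCondition + 2 Copy + 1 Switch + 3 Copy == 8
+        assert create_branching_scenario()["expected_datajobs"] == 8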
+ + Structure: + - BranchingPipeline + └── Lookup: CheckDataExists + └── IfCondition: DataExistsCheck + ├── True: Copy: FullLoad + └── False: Copy: IncrementalLoad + └── Switch: ProcessByRegion + ├── Case "US": Copy: ProcessUSData + ├── Case "EU": Copy: ProcessEUData + └── Default: Copy: ProcessOtherData + """ + pipeline = { + "id": _base_resource_id("pipelines", "BranchingPipeline"), + "name": "BranchingPipeline", + "type": "Microsoft.DataFactory/factories/pipelines", + "properties": { + "description": "Pipeline with If-Condition and Switch branching", + "parameters": {"region": {"type": "String", "defaultValue": "US"}}, + "activities": [ + { + "name": "CheckDataExists", + "type": "Lookup", + "typeProperties": { + "source": { + "type": "AzureSqlSource", + "sqlReaderQuery": "SELECT COUNT(*) as cnt FROM dbo.Customers", + }, + "dataset": { + "referenceName": "SqlCustomersTable", + "type": "DatasetReference", + }, + }, + }, + { + "name": "DataExistsCheck", + "type": "IfCondition", + "dependsOn": [ + { + "activity": "CheckDataExists", + "dependencyConditions": ["Succeeded"], + } + ], + "typeProperties": { + "expression": { + "value": "@greater(activity('CheckDataExists').output.firstRow.cnt, 0)", + "type": "Expression", + }, + "ifTrueActivities": [ + { + "name": "FullLoad", + "type": "Copy", + "inputs": [ + { + "referenceName": "SqlCustomersTable", + "type": "DatasetReference", + } + ], + "outputs": [ + { + "referenceName": "BlobStagingCustomers", + "type": "DatasetReference", + } + ], + "typeProperties": { + "source": {"type": "AzureSqlSource"}, + "sink": {"type": "DelimitedTextSink"}, + }, + } + ], + "ifFalseActivities": [ + { + "name": "IncrementalLoad", + "type": "Copy", + "inputs": [ + { + "referenceName": "SqlOrdersTable", + "type": "DatasetReference", + } + ], + "outputs": [ + { + "referenceName": "BlobStagingOrders", + "type": "DatasetReference", + } + ], + "typeProperties": { + "source": {"type": "AzureSqlSource"}, + "sink": {"type": "DelimitedTextSink"}, + }, + } + ], + }, + }, + { + "name": "ProcessByRegion", + "type": "Switch", + "dependsOn": [ + { + "activity": "DataExistsCheck", + "dependencyConditions": ["Succeeded"], + } + ], + "typeProperties": { + "on": { + "value": "@pipeline().parameters.region", + "type": "Expression", + }, + "cases": [ + { + "value": "US", + "activities": [ + { + "name": "ProcessUSData", + "type": "Copy", + "inputs": [ + { + "referenceName": "SqlCustomersTable", + "type": "DatasetReference", + } + ], + "outputs": [ + { + "referenceName": "SynapseCustomersDim", + "type": "DatasetReference", + } + ], + "typeProperties": { + "source": {"type": "AzureSqlSource"}, + "sink": {"type": "SqlDWSink"}, + }, + } + ], + }, + { + "value": "EU", + "activities": [ + { + "name": "ProcessEUData", + "type": "Copy", + "inputs": [ + { + "referenceName": "SqlOrdersTable", + "type": "DatasetReference", + } + ], + "outputs": [ + { + "referenceName": "SynapseOrdersFact", + "type": "DatasetReference", + } + ], + "typeProperties": { + "source": {"type": "AzureSqlSource"}, + "sink": {"type": "SqlDWSink"}, + }, + } + ], + }, + ], + "defaultActivities": [ + { + "name": "ProcessOtherData", + "type": "Copy", + "inputs": [ + { + "referenceName": "SqlProductsTable", + "type": "DatasetReference", + } + ], + "outputs": [ + { + "referenceName": "BlobStagingCustomers", + "type": "DatasetReference", + } + ], + "typeProperties": { + "source": {"type": "AzureSqlSource"}, + "sink": {"type": "DelimitedTextSink"}, + }, + } + ], + }, + }, + ], + }, + } + + return { + "pipelines": [pipeline], + 
"expected_dataflows": 1, + "expected_datajobs": 8, # Lookup + IfCondition + 2 Copy (if branches) + Switch + 3 Copy (switch cases) + } + + +# ============================================================================= +# SCENARIO 4: MAPPING DATA FLOWS +# ============================================================================= + + +def create_dataflow_scenario() -> Dict[str, Any]: + """Create mock data for Mapping Data Flow scenario. + + Structure: + - DataFlowPipeline + └── ExecuteDataFlow: RunSalesTransformation + └── SalesTransformationFlow (sources → transforms → sinks) + """ + data_flow = { + "id": _base_resource_id("dataflows", "SalesTransformationFlow"), + "name": "SalesTransformationFlow", + "type": "Microsoft.DataFactory/factories/dataflows", + "properties": { + "type": "MappingDataFlow", + "description": "Complex data flow with multiple sources and transformations", + "typeProperties": { + "sources": [ + { + "name": "CustomersSource", + "dataset": { + "referenceName": "DataLakeRawData", + "type": "DatasetReference", + }, + }, + { + "name": "OrdersSource", + "dataset": { + "referenceName": "SqlOrdersTable", + "type": "DatasetReference", + }, + }, + ], + "sinks": [ + { + "name": "CuratedOutput", + "dataset": { + "referenceName": "DataLakeCuratedData", + "type": "DatasetReference", + }, + }, + { + "name": "SynapseOutput", + "dataset": { + "referenceName": "SynapseCustomersDim", + "type": "DatasetReference", + }, + }, + ], + "transformations": [ + { + "name": "FilterActiveCustomers", + "description": "Filter only active customers", + }, + { + "name": "JoinOrdersToCustomers", + "description": "Join orders with customers", + }, + { + "name": "AggregateByRegion", + "description": "Aggregate sales by region", + }, + { + "name": "DeriveMetrics", + "description": "Calculate derived metrics", + }, + ], + "scriptLines": [ + "source(output(", + " customer_id as integer,", + " name as string,", + " region as string,", + " is_active as boolean", + " ),", + " allowSchemaDrift: true) ~> CustomersSource", + "source(output(", + " order_id as integer,", + " customer_id as integer,", + " amount as decimal(10,2)", + " )) ~> OrdersSource", + "CustomersSource filter(is_active == true()) ~> FilterActiveCustomers", + "FilterActiveCustomers, OrdersSource join(", + " CustomersSource.customer_id == OrdersSource.customer_id", + " ) ~> JoinOrdersToCustomers", + "JoinOrdersToCustomers aggregate(", + " groupBy(region),", + " total_sales = sum(amount)", + " ) ~> AggregateByRegion", + "AggregateByRegion derive(", + " avg_order = total_sales / count(order_id)", + " ) ~> DeriveMetrics", + "DeriveMetrics sink() ~> CuratedOutput", + "DeriveMetrics sink() ~> SynapseOutput", + ], + }, + }, + } + + pipeline = { + "id": _base_resource_id("pipelines", "DataFlowPipeline"), + "name": "DataFlowPipeline", + "type": "Microsoft.DataFactory/factories/pipelines", + "properties": { + "description": "Pipeline that executes a mapping data flow", + "activities": [ + { + "name": "RunSalesTransformation", + "type": "ExecuteDataFlow", + "typeProperties": { + "dataflow": { + "referenceName": "SalesTransformationFlow", + "type": "DataFlowReference", + }, + "compute": {"coreCount": 8, "computeType": "General"}, + }, + } + ], + }, + } + + return { + "pipelines": [pipeline], + "data_flows": [data_flow], + "expected_dataflows": 1, + "expected_datajobs": 1, + "expected_lineage_sources": 2, # DataLakeRawData, SqlOrdersTable + "expected_lineage_sinks": 2, # DataLakeCuratedData, SynapseCustomersDim + } + + +# 
============================================================================= +# SCENARIO 5: MULTI-SOURCE COPY CHAIN (SQL → Blob → Synapse) +# ============================================================================= + + +def create_multisource_chain_scenario() -> Dict[str, Any]: + """Create mock data for multi-source copy chain scenario. + + Structure: + - ETLPipeline + └── Copy: ExtractFromSQL (SQL → Blob) + └── Copy: LoadToSynapse (Blob → Synapse) + └── Copy: ArchiveToDataLake (Blob → DataLake) + + This tests end-to-end lineage: SQL → Blob → Synapse + └─→ DataLake + """ + pipeline = { + "id": _base_resource_id("pipelines", "ETLPipeline"), + "name": "ETLPipeline", + "type": "Microsoft.DataFactory/factories/pipelines", + "properties": { + "description": "Full ETL pipeline: Extract from SQL, stage in Blob, load to Synapse and archive", + "activities": [ + { + "name": "ExtractCustomersFromSQL", + "type": "Copy", + "inputs": [ + { + "referenceName": "SqlCustomersTable", + "type": "DatasetReference", + } + ], + "outputs": [ + { + "referenceName": "BlobStagingCustomers", + "type": "DatasetReference", + } + ], + "typeProperties": { + "source": {"type": "AzureSqlSource"}, + "sink": {"type": "DelimitedTextSink"}, + }, + }, + { + "name": "ExtractOrdersFromSQL", + "type": "Copy", + "inputs": [ + {"referenceName": "SqlOrdersTable", "type": "DatasetReference"} + ], + "outputs": [ + { + "referenceName": "BlobStagingOrders", + "type": "DatasetReference", + } + ], + "typeProperties": { + "source": {"type": "AzureSqlSource"}, + "sink": {"type": "DelimitedTextSink"}, + }, + }, + { + "name": "LoadCustomersToSynapse", + "type": "Copy", + "dependsOn": [ + { + "activity": "ExtractCustomersFromSQL", + "dependencyConditions": ["Succeeded"], + } + ], + "inputs": [ + { + "referenceName": "BlobStagingCustomers", + "type": "DatasetReference", + } + ], + "outputs": [ + { + "referenceName": "SynapseCustomersDim", + "type": "DatasetReference", + } + ], + "typeProperties": { + "source": {"type": "DelimitedTextSource"}, + "sink": {"type": "SqlDWSink", "allowPolyBase": True}, + }, + }, + { + "name": "LoadOrdersToSynapse", + "type": "Copy", + "dependsOn": [ + { + "activity": "ExtractOrdersFromSQL", + "dependencyConditions": ["Succeeded"], + } + ], + "inputs": [ + { + "referenceName": "BlobStagingOrders", + "type": "DatasetReference", + } + ], + "outputs": [ + { + "referenceName": "SynapseOrdersFact", + "type": "DatasetReference", + } + ], + "typeProperties": { + "source": {"type": "DelimitedTextSource"}, + "sink": {"type": "SqlDWSink", "allowPolyBase": True}, + }, + }, + { + "name": "ArchiveToDataLake", + "type": "Copy", + "dependsOn": [ + { + "activity": "LoadCustomersToSynapse", + "dependencyConditions": ["Succeeded"], + }, + { + "activity": "LoadOrdersToSynapse", + "dependencyConditions": ["Succeeded"], + }, + ], + "inputs": [ + { + "referenceName": "BlobStagingCustomers", + "type": "DatasetReference", + } + ], + "outputs": [ + {"referenceName": "DataLakeRawData", "type": "DatasetReference"} + ], + "typeProperties": { + "source": {"type": "DelimitedTextSource"}, + "sink": {"type": "ParquetSink"}, + }, + }, + ], + }, + } + + return { + "pipelines": [pipeline], + "expected_dataflows": 1, + "expected_datajobs": 5, + "expected_lineage_edges": [ + # Stage 1: SQL → Blob + ("SqlCustomersTable", "BlobStagingCustomers"), + ("SqlOrdersTable", "BlobStagingOrders"), + # Stage 2: Blob → Synapse + ("BlobStagingCustomers", "SynapseCustomersDim"), + ("BlobStagingOrders", "SynapseOrdersFact"), + # Stage 3: Blob → DataLake + 
("BlobStagingCustomers", "DataLakeRawData"), + ], + } + + +# ============================================================================= +# SCENARIO 6: DIVERSE ACTIVITY TYPES +# ============================================================================= + + +def create_diverse_activities_scenario() -> Dict[str, Any]: + """Create mock data for testing various activity types. + + Structure: + - DiverseActivitiesPipeline + └── SetVariable: InitializeCounter + └── WebActivity: FetchConfiguration (REST API call) + └── SqlServerStoredProcedure: ProcessData + └── Wait: DelayForReplication + └── DatabricksNotebook: RunMLTraining + └── AzureFunctionActivity: SendNotification + └── Fail: FailOnError (in error handling) + """ + pipeline = { + "id": _base_resource_id("pipelines", "DiverseActivitiesPipeline"), + "name": "DiverseActivitiesPipeline", + "type": "Microsoft.DataFactory/factories/pipelines", + "properties": { + "description": "Pipeline demonstrating various activity types", + "variables": { + "counter": {"type": "Integer", "defaultValue": 0}, + "configData": {"type": "String"}, + }, + "activities": [ + # SetVariable - Initialize a pipeline variable + { + "name": "InitializeCounter", + "type": "SetVariable", + "typeProperties": { + "variableName": "counter", + "value": {"value": "1", "type": "Expression"}, + }, + }, + # WebActivity - Call an external REST API + { + "name": "FetchConfiguration", + "type": "WebActivity", + "dependsOn": [ + { + "activity": "InitializeCounter", + "dependencyConditions": ["Succeeded"], + } + ], + "typeProperties": { + "url": "https://api.example.com/config", + "method": "GET", + "headers": {"Content-Type": "application/json"}, + }, + }, + # SqlServerStoredProcedure - Execute a stored procedure + { + "name": "ProcessDataWithSP", + "type": "SqlServerStoredProcedure", + "dependsOn": [ + { + "activity": "FetchConfiguration", + "dependencyConditions": ["Succeeded"], + } + ], + "linkedServiceName": { + "referenceName": "SqlServerSource", + "type": "LinkedServiceReference", + }, + "typeProperties": { + "storedProcedureName": "sp_ProcessDailyData", + "storedProcedureParameters": { + "ProcessDate": { + "value": "@utcnow()", + "type": "DateTime", + } + }, + }, + }, + # Wait - Introduce a delay + { + "name": "WaitForReplication", + "type": "Wait", + "dependsOn": [ + { + "activity": "ProcessDataWithSP", + "dependencyConditions": ["Succeeded"], + } + ], + "typeProperties": {"waitTimeInSeconds": 30}, + }, + # GetMetadata - Get file/folder metadata + { + "name": "CheckOutputExists", + "type": "GetMetadata", + "dependsOn": [ + { + "activity": "WaitForReplication", + "dependencyConditions": ["Succeeded"], + } + ], + "typeProperties": { + "dataset": { + "referenceName": "BlobStagingCustomers", + "type": "DatasetReference", + }, + "fieldList": ["exists", "itemName", "lastModified"], + }, + }, + # DatabricksNotebook - Run a Databricks notebook + { + "name": "RunMLTrainingNotebook", + "type": "DatabricksNotebook", + "dependsOn": [ + { + "activity": "CheckOutputExists", + "dependencyConditions": ["Succeeded"], + } + ], + "linkedServiceName": { + "referenceName": "DatabricksWorkspace", + "type": "LinkedServiceReference", + }, + "typeProperties": { + "notebookPath": "/Shared/MLTraining/train_model", + "baseParameters": { + "input_path": "/mnt/data/input", + "output_path": "/mnt/data/output", + }, + }, + }, + # Script - Run a SQL script + { + "name": "RunAnalyticsScript", + "type": "Script", + "dependsOn": [ + { + "activity": "RunMLTrainingNotebook", + "dependencyConditions": 
["Succeeded"], + } + ], + "linkedServiceName": { + "referenceName": "SynapseDestination", + "type": "LinkedServiceReference", + }, + "typeProperties": { + "scripts": [ + { + "text": "EXEC sp_UpdateAnalytics @date = GETDATE()", + "type": "Query", + } + ] + }, + }, + # AzureFunctionActivity - Call an Azure Function + { + "name": "SendCompletionNotification", + "type": "AzureFunctionActivity", + "dependsOn": [ + { + "activity": "RunAnalyticsScript", + "dependencyConditions": ["Succeeded"], + } + ], + "linkedServiceName": { + "referenceName": "NotificationFunction", + "type": "LinkedServiceReference", + }, + "typeProperties": { + "functionName": "SendNotification", + "method": "POST", + "body": { + "value": '@json(concat(\'{"status": "success", "pipeline": "\', pipeline().Pipeline, \'"}\'))', + "type": "Expression", + }, + }, + }, + # Fail - Explicitly fail the pipeline (usually in error handling) + # Note: In real scenarios, this would be in an error handling path + { + "name": "FailOnCriticalError", + "type": "Fail", + "dependsOn": [ + { + "activity": "SendCompletionNotification", + "dependencyConditions": ["Failed"], + } + ], + "typeProperties": { + "message": "Pipeline failed due to notification error", + "errorCode": "500", + }, + }, + ], + }, + } + + # Add Databricks linked service for the test + databricks_linked_service = { + "id": _base_resource_id("linkedservices", "DatabricksWorkspace"), + "name": "DatabricksWorkspace", + "type": "Microsoft.DataFactory/factories/linkedservices", + "properties": { + "type": "AzureDatabricks", + "typeProperties": { + "domain": "https://adb-123456789.azuredatabricks.net", + "workspaceResourceId": "/subscriptions/xxx/resourceGroups/xxx/providers/Microsoft.Databricks/workspaces/my-workspace", + }, + }, + } + + # Add Azure Function linked service + function_linked_service = { + "id": _base_resource_id("linkedservices", "NotificationFunction"), + "name": "NotificationFunction", + "type": "Microsoft.DataFactory/factories/linkedservices", + "properties": { + "type": "AzureFunction", + "typeProperties": { + "functionAppUrl": "https://my-function-app.azurewebsites.net", + }, + }, + } + + return { + "pipelines": [pipeline], + "additional_linked_services": [ + databricks_linked_service, + function_linked_service, + ], + "expected_datajobs": 9, # All activities + "activity_types_covered": [ + "SetVariable", + "WebActivity", + "SqlServerStoredProcedure", + "Wait", + "GetMetadata", + "DatabricksNotebook", + "Script", + "AzureFunctionActivity", + "Fail", + ], + } + + +# ============================================================================= +# SCENARIO 7: MIXED DEPENDENCIES (Pipeline + Dataset Lineage) +# ============================================================================= + + +def create_mixed_dependencies_scenario() -> Dict[str, Any]: + """Create mock data for mixed pipeline and dataset dependencies. + + This scenario tests both types of lineage in a single orchestration: + 1. Pipeline-to-pipeline lineage (ExecutePipeline activities) + 2. 
Dataset lineage (Copy activities with inputs/outputs) + + Structure: + - MixedOrchestrationPipeline + └── ExecutePipeline: ExtractDataPipeline (child) + └── Copy: ExtractFromSource (reads SqlCustomersTable, writes BlobStagingCustomers) + └── Copy: TransformInMain (reads BlobStagingCustomers, writes SynapseCustomersDim) + └── ExecutePipeline: LoadDataPipeline (child) + └── Copy: LoadToDestination (reads SynapseCustomersDim, writes DataLakeCuratedData) + + Expected lineage: + - ExecuteExtract -> ExtractFromSource (pipeline lineage) + - TransformInMain -> BlobStagingCustomers (dataset input) + - TransformInMain -> SynapseCustomersDim (dataset output) + - ExecuteLoad -> LoadToDestination (pipeline lineage) + """ + # Child pipeline for extraction + extract_pipeline = { + "id": _base_resource_id("pipelines", "ExtractDataPipeline"), + "name": "ExtractDataPipeline", + "type": "Microsoft.DataFactory/factories/pipelines", + "properties": { + "description": "Child pipeline for extracting data from source", + "activities": [ + { + "name": "ExtractFromSource", + "type": "Copy", + "inputs": [ + { + "referenceName": "SqlCustomersTable", + "type": "DatasetReference", + } + ], + "outputs": [ + { + "referenceName": "BlobStagingCustomers", + "type": "DatasetReference", + } + ], + "typeProperties": { + "source": {"type": "AzureSqlSource"}, + "sink": {"type": "DelimitedTextSink"}, + }, + } + ], + }, + } + + # Child pipeline for loading + load_pipeline = { + "id": _base_resource_id("pipelines", "LoadDataPipeline"), + "name": "LoadDataPipeline", + "type": "Microsoft.DataFactory/factories/pipelines", + "properties": { + "description": "Child pipeline for loading data to destination", + "activities": [ + { + "name": "LoadToDestination", + "type": "Copy", + "inputs": [ + { + "referenceName": "SynapseCustomersDim", + "type": "DatasetReference", + } + ], + "outputs": [ + { + "referenceName": "DataLakeCuratedData", + "type": "DatasetReference", + } + ], + "typeProperties": { + "source": {"type": "SqlDWSource"}, + "sink": {"type": "ParquetSink"}, + }, + } + ], + }, + } + + # Main orchestration pipeline with both ExecutePipeline and Copy activities + main_pipeline = { + "id": _base_resource_id("pipelines", "MixedOrchestrationPipeline"), + "name": "MixedOrchestrationPipeline", + "type": "Microsoft.DataFactory/factories/pipelines", + "properties": { + "description": "Pipeline demonstrating both pipeline and dataset dependencies", + "activities": [ + # Step 1: Call child pipeline to extract data + { + "name": "ExecuteExtract", + "type": "ExecutePipeline", + "typeProperties": { + "pipeline": { + "referenceName": "ExtractDataPipeline", + "type": "PipelineReference", + }, + "waitOnCompletion": True, + }, + }, + # Step 2: Transform data in main pipeline (has dataset lineage) + { + "name": "TransformInMain", + "type": "Copy", + "dependsOn": [ + { + "activity": "ExecuteExtract", + "dependencyConditions": ["Succeeded"], + } + ], + "inputs": [ + { + "referenceName": "BlobStagingCustomers", + "type": "DatasetReference", + } + ], + "outputs": [ + { + "referenceName": "SynapseCustomersDim", + "type": "DatasetReference", + } + ], + "typeProperties": { + "source": {"type": "DelimitedTextSource"}, + "sink": {"type": "SqlDWSink"}, + }, + }, + # Step 3: Call child pipeline to load data + { + "name": "ExecuteLoad", + "type": "ExecutePipeline", + "dependsOn": [ + { + "activity": "TransformInMain", + "dependencyConditions": ["Succeeded"], + } + ], + "typeProperties": { + "pipeline": { + "referenceName": "LoadDataPipeline", + "type": 
"PipelineReference", + }, + "waitOnCompletion": True, + }, + }, + ], + }, + } + + return { + "pipelines": [main_pipeline, extract_pipeline, load_pipeline], + "expected_dataflows": 3, # 3 pipelines + "expected_datajobs": 5, # 2 ExecutePipeline + 1 Copy in main + 2 Copy in children + "expected_pipeline_lineage": 2, # 2 ExecutePipeline activities + "expected_dataset_lineage": 3, # TransformInMain (1 in, 1 out) + ExtractFromSource + LoadToDestination + } + + +# ============================================================================= +# FACTORY HELPER +# ============================================================================= + + +def create_complex_factory() -> Dict[str, Any]: + """Create the factory that contains all complex scenarios.""" + return { + "id": f"/subscriptions/{SUBSCRIPTION_ID}/resourceGroups/{RESOURCE_GROUP}/providers/Microsoft.DataFactory/factories/{FACTORY_NAME}", + "name": FACTORY_NAME, + "type": "Microsoft.DataFactory/factories", + "location": LOCATION, + "tags": {"environment": "test", "purpose": "complex-integration-tests"}, + "properties": { + "provisioningState": "Succeeded", + "createTime": "2024-01-01T00:00:00Z", + }, + } + + +def get_all_complex_pipelines() -> List[Dict[str, Any]]: + """Get all pipelines from all complex scenarios.""" + pipelines = [] + pipelines.extend(create_nested_pipeline_scenario()["pipelines"]) + pipelines.extend(create_foreach_loop_scenario()["pipelines"]) + pipelines.extend(create_branching_scenario()["pipelines"]) + pipelines.extend(create_dataflow_scenario()["pipelines"]) + pipelines.extend(create_multisource_chain_scenario()["pipelines"]) + return pipelines + + +def get_all_data_flows() -> List[Dict[str, Any]]: + """Get all data flows from scenarios that have them.""" + data_flows = [] + dataflow_scenario = create_dataflow_scenario() + if "data_flows" in dataflow_scenario: + data_flows.extend(dataflow_scenario["data_flows"]) + return data_flows diff --git a/metadata-ingestion/tests/integration/azure_data_factory/test_adf_source.py b/metadata-ingestion/tests/integration/azure_data_factory/test_adf_source.py new file mode 100644 index 00000000000000..17462be3a06f65 --- /dev/null +++ b/metadata-ingestion/tests/integration/azure_data_factory/test_adf_source.py @@ -0,0 +1,742 @@ +"""Integration tests for Azure Data Factory source. + +These tests use mocked Azure SDK responses to verify the full ingestion pipeline +produces the expected metadata events. 
+""" + +from datetime import datetime, timezone +from typing import Any, Dict, Iterator, List, Optional +from unittest import mock +from unittest.mock import MagicMock + +import pytest +from freezegun import freeze_time + +from datahub.ingestion.run.pipeline import Pipeline +from datahub.testing import mce_helpers + +FROZEN_TIME = "2024-01-15 12:00:00" + +# Mock Azure SDK response data + + +def create_mock_factory( + name: str, + resource_group: str, + subscription_id: str, + location: str = "eastus", + tags: Optional[Dict[str, str]] = None, +) -> Dict[str, Any]: + """Create a mock factory response.""" + return { + "id": f"/subscriptions/{subscription_id}/resourceGroups/{resource_group}/providers/Microsoft.DataFactory/factories/{name}", + "name": name, + "type": "Microsoft.DataFactory/factories", + "location": location, + "tags": tags or {}, + "properties": { + "provisioningState": "Succeeded", + "createTime": "2024-01-01T00:00:00Z", + }, + } + + +def create_mock_pipeline( + name: str, + factory_name: str, + resource_group: str, + subscription_id: str, + activities: Optional[List[Dict[str, Any]]] = None, + description: Optional[str] = None, +) -> Dict[str, Any]: + """Create a mock pipeline response.""" + return { + "id": f"/subscriptions/{subscription_id}/resourceGroups/{resource_group}/providers/Microsoft.DataFactory/factories/{factory_name}/pipelines/{name}", + "name": name, + "type": "Microsoft.DataFactory/factories/pipelines", + "properties": { + "description": description, + "activities": activities or [], + "parameters": {}, + "variables": {}, + "annotations": [], + }, + } + + +def create_mock_activity( + name: str, + activity_type: str, + inputs: Optional[List[Dict[str, Any]]] = None, + outputs: Optional[List[Dict[str, Any]]] = None, + depends_on: Optional[List[Dict[str, Any]]] = None, + description: Optional[str] = None, +) -> Dict[str, Any]: + """Create a mock activity definition.""" + return { + "name": name, + "type": activity_type, + "description": description, + "dependsOn": depends_on or [], + "inputs": inputs or [], + "outputs": outputs or [], + "typeProperties": {}, + "policy": {"timeout": "7.00:00:00", "retry": 0}, + "userProperties": [], + } + + +def create_mock_dataset( + name: str, + factory_name: str, + resource_group: str, + subscription_id: str, + linked_service_name: str, + dataset_type: str = "AzureBlobDataset", + type_properties: Optional[Dict[str, Any]] = None, +) -> Dict[str, Any]: + """Create a mock dataset response.""" + return { + "id": f"/subscriptions/{subscription_id}/resourceGroups/{resource_group}/providers/Microsoft.DataFactory/factories/{factory_name}/datasets/{name}", + "name": name, + "type": "Microsoft.DataFactory/factories/datasets", + "properties": { + "linkedServiceName": { + "referenceName": linked_service_name, + "type": "LinkedServiceReference", + }, + "type": dataset_type, + "typeProperties": type_properties or {}, + "annotations": [], + "parameters": {}, + }, + } + + +def create_mock_linked_service( + name: str, + factory_name: str, + resource_group: str, + subscription_id: str, + service_type: str = "AzureBlobStorage", +) -> Dict[str, Any]: + """Create a mock linked service response.""" + return { + "id": f"/subscriptions/{subscription_id}/resourceGroups/{resource_group}/providers/Microsoft.DataFactory/factories/{factory_name}/linkedservices/{name}", + "name": name, + "type": "Microsoft.DataFactory/factories/linkedservices", + "properties": { + "type": service_type, + "typeProperties": {}, + "annotations": [], + }, + } + + +def 
create_mock_trigger( + name: str, + factory_name: str, + resource_group: str, + subscription_id: str, + trigger_type: str = "ScheduleTrigger", + pipelines: Optional[List[str]] = None, +) -> Dict[str, Any]: + """Create a mock trigger response.""" + pipeline_refs = [ + { + "pipelineReference": {"referenceName": p, "type": "PipelineReference"}, + "parameters": {}, + } + for p in (pipelines or []) + ] + return { + "id": f"/subscriptions/{subscription_id}/resourceGroups/{resource_group}/providers/Microsoft.DataFactory/factories/{factory_name}/triggers/{name}", + "name": name, + "type": "Microsoft.DataFactory/factories/triggers", + "properties": { + "type": trigger_type, + "runtimeState": "Started", + "pipelines": pipeline_refs, + "typeProperties": {}, + "annotations": [], + }, + } + + +def create_mock_pipeline_run( + run_id: str, + pipeline_name: str, + status: str = "Succeeded", + start_time: Optional[datetime] = None, + end_time: Optional[datetime] = None, +) -> Dict[str, Any]: + """Create a mock pipeline run response.""" + return { + "runId": run_id, + "pipelineName": pipeline_name, + "status": status, + "runStart": ( + start_time or datetime(2024, 1, 15, 10, 0, 0, tzinfo=timezone.utc) + ).isoformat(), + "runEnd": ( + end_time or datetime(2024, 1, 15, 10, 30, 0, tzinfo=timezone.utc) + ).isoformat(), + "durationInMs": 1800000, + "message": None, + "parameters": {}, + "invokedBy": {"name": "Manual", "invokedByType": "Manual"}, + "lastUpdated": datetime( + 2024, 1, 15, 10, 30, 0, tzinfo=timezone.utc + ).isoformat(), + } + + +def create_mock_activity_run( + activity_run_id: str, + activity_name: str, + activity_type: str, + pipeline_run_id: str, + pipeline_name: str, + status: str = "Succeeded", + start_time: Optional[datetime] = None, + end_time: Optional[datetime] = None, + duration_ms: int = 30000, + error: Optional[Dict[str, Any]] = None, +) -> Dict[str, Any]: + """Create a mock activity run response.""" + return { + "activityRunId": activity_run_id, + "activityName": activity_name, + "activityType": activity_type, + "pipelineRunId": pipeline_run_id, + "pipelineName": pipeline_name, + "status": status, + "activityRunStart": ( + start_time or datetime(2024, 1, 15, 10, 5, 0, tzinfo=timezone.utc) + ).isoformat(), + "activityRunEnd": ( + end_time or datetime(2024, 1, 15, 10, 10, 0, tzinfo=timezone.utc) + ).isoformat(), + "durationInMs": duration_ms, + "input": {}, + "output": {}, + "error": error, + } + + +class MockAzureResource: + """Mock class to simulate Azure SDK resource objects.""" + + def __init__(self, data: Dict[str, Any]): + self._data = data + + def as_dict(self) -> Dict[str, Any]: + return self._data + + +class MockPagedIterator: + """Mock class to simulate Azure SDK paged iterators.""" + + def __init__(self, items: List[Dict[str, Any]]): + self._items = [MockAzureResource(item) for item in items] + + def __iter__(self) -> Iterator[MockAzureResource]: + return iter(self._items) + + +class MockQueryResponse: + """Mock class for query responses with continuation token.""" + + def __init__( + self, items: List[Dict[str, Any]], continuation_token: Optional[str] = None + ): + self.value = [MockAzureResource(item) for item in items] + self.continuation_token = continuation_token + + +# Test data constants +SUBSCRIPTION_ID = "12345678-1234-1234-1234-123456789012" +RESOURCE_GROUP = "test-resource-group" +FACTORY_NAME = "test-data-factory" + + +def get_mock_test_data() -> Dict[str, Any]: + """Generate comprehensive test data for the ADF source.""" + factories = [ + create_mock_factory( + 
name=FACTORY_NAME, + resource_group=RESOURCE_GROUP, + subscription_id=SUBSCRIPTION_ID, + tags={"environment": "test", "team": "data-engineering"}, + ), + ] + + # Create pipelines with various activities + copy_activity = create_mock_activity( + name="CopyBlobToSQL", + activity_type="Copy", + inputs=[{"referenceName": "SourceBlobDataset", "type": "DatasetReference"}], + outputs=[{"referenceName": "DestSqlDataset", "type": "DatasetReference"}], + description="Copy data from Blob to SQL", + ) + + lookup_activity = create_mock_activity( + name="LookupConfig", + activity_type="Lookup", + inputs=[{"referenceName": "ConfigDataset", "type": "DatasetReference"}], + description="Lookup configuration values", + ) + + dataflow_activity = create_mock_activity( + name="TransformData", + activity_type="ExecuteDataFlow", + depends_on=[ + {"activity": "LookupConfig", "dependencyConditions": ["Succeeded"]} + ], + description="Execute mapping data flow", + ) + + stored_proc_activity = create_mock_activity( + name="CallStoredProc", + activity_type="SqlServerStoredProcedure", + depends_on=[ + {"activity": "CopyBlobToSQL", "dependencyConditions": ["Succeeded"]} + ], + description="Call stored procedure", + ) + + pipelines = [ + create_mock_pipeline( + name="DataIngestionPipeline", + factory_name=FACTORY_NAME, + resource_group=RESOURCE_GROUP, + subscription_id=SUBSCRIPTION_ID, + description="Main data ingestion pipeline", + activities=[copy_activity, lookup_activity, dataflow_activity], + ), + create_mock_pipeline( + name="DataProcessingPipeline", + factory_name=FACTORY_NAME, + resource_group=RESOURCE_GROUP, + subscription_id=SUBSCRIPTION_ID, + description="Data processing and transformation", + activities=[stored_proc_activity], + ), + ] + + # Create datasets + datasets = [ + create_mock_dataset( + name="SourceBlobDataset", + factory_name=FACTORY_NAME, + resource_group=RESOURCE_GROUP, + subscription_id=SUBSCRIPTION_ID, + linked_service_name="AzureBlobStorageLS", + dataset_type="DelimitedTextDataset", + type_properties={ + "location": { + "container": "raw-data", + "folderPath": "input", + "fileName": "data.csv", + } + }, + ), + create_mock_dataset( + name="DestSqlDataset", + factory_name=FACTORY_NAME, + resource_group=RESOURCE_GROUP, + subscription_id=SUBSCRIPTION_ID, + linked_service_name="AzureSqlDatabaseLS", + dataset_type="AzureSqlTableDataset", + type_properties={"schema": "dbo", "table": "ProcessedData"}, + ), + create_mock_dataset( + name="ConfigDataset", + factory_name=FACTORY_NAME, + resource_group=RESOURCE_GROUP, + subscription_id=SUBSCRIPTION_ID, + linked_service_name="AzureBlobStorageLS", + dataset_type="JsonDataset", + type_properties={ + "location": { + "container": "config", + "fileName": "settings.json", + } + }, + ), + ] + + # Create linked services + linked_services = [ + create_mock_linked_service( + name="AzureBlobStorageLS", + factory_name=FACTORY_NAME, + resource_group=RESOURCE_GROUP, + subscription_id=SUBSCRIPTION_ID, + service_type="AzureBlobStorage", + ), + create_mock_linked_service( + name="AzureSqlDatabaseLS", + factory_name=FACTORY_NAME, + resource_group=RESOURCE_GROUP, + subscription_id=SUBSCRIPTION_ID, + service_type="AzureSqlDatabase", + ), + ] + + # Create triggers + triggers = [ + create_mock_trigger( + name="DailyScheduleTrigger", + factory_name=FACTORY_NAME, + resource_group=RESOURCE_GROUP, + subscription_id=SUBSCRIPTION_ID, + trigger_type="ScheduleTrigger", + pipelines=["DataIngestionPipeline"], + ), + ] + + # Create pipeline runs + pipeline_runs = [ + 
create_mock_pipeline_run( + run_id="run-001-abc", + pipeline_name="DataIngestionPipeline", + status="Succeeded", + start_time=datetime(2024, 1, 15, 8, 0, 0, tzinfo=timezone.utc), + end_time=datetime(2024, 1, 15, 8, 45, 0, tzinfo=timezone.utc), + ), + create_mock_pipeline_run( + run_id="run-002-def", + pipeline_name="DataIngestionPipeline", + status="Failed", + start_time=datetime(2024, 1, 14, 8, 0, 0, tzinfo=timezone.utc), + end_time=datetime(2024, 1, 14, 8, 15, 0, tzinfo=timezone.utc), + ), + create_mock_pipeline_run( + run_id="run-003-ghi", + pipeline_name="DataProcessingPipeline", + status="Succeeded", + start_time=datetime(2024, 1, 15, 9, 0, 0, tzinfo=timezone.utc), + end_time=datetime(2024, 1, 15, 9, 30, 0, tzinfo=timezone.utc), + ), + ] + + # Create activity runs for each pipeline run + # Activity runs are linked to DataJobs (activities), not DataFlows (pipelines) + activity_runs = { + "run-001-abc": [ # DataIngestionPipeline - Succeeded + create_mock_activity_run( + activity_run_id="act-001-copy", + activity_name="CopyBlobToSQL", + activity_type="Copy", + pipeline_run_id="run-001-abc", + pipeline_name="DataIngestionPipeline", + status="Succeeded", + start_time=datetime(2024, 1, 15, 8, 5, 0, tzinfo=timezone.utc), + end_time=datetime(2024, 1, 15, 8, 20, 0, tzinfo=timezone.utc), + duration_ms=900000, + ), + create_mock_activity_run( + activity_run_id="act-001-lookup", + activity_name="LookupConfig", + activity_type="Lookup", + pipeline_run_id="run-001-abc", + pipeline_name="DataIngestionPipeline", + status="Succeeded", + start_time=datetime(2024, 1, 15, 8, 20, 0, tzinfo=timezone.utc), + end_time=datetime(2024, 1, 15, 8, 21, 0, tzinfo=timezone.utc), + duration_ms=60000, + ), + create_mock_activity_run( + activity_run_id="act-001-transform", + activity_name="TransformData", + activity_type="ExecuteDataFlow", + pipeline_run_id="run-001-abc", + pipeline_name="DataIngestionPipeline", + status="Succeeded", + start_time=datetime(2024, 1, 15, 8, 21, 0, tzinfo=timezone.utc), + end_time=datetime(2024, 1, 15, 8, 45, 0, tzinfo=timezone.utc), + duration_ms=1440000, + ), + ], + "run-002-def": [ # DataIngestionPipeline - Failed + create_mock_activity_run( + activity_run_id="act-002-copy", + activity_name="CopyBlobToSQL", + activity_type="Copy", + pipeline_run_id="run-002-def", + pipeline_name="DataIngestionPipeline", + status="Failed", + start_time=datetime(2024, 1, 14, 8, 5, 0, tzinfo=timezone.utc), + end_time=datetime(2024, 1, 14, 8, 15, 0, tzinfo=timezone.utc), + duration_ms=600000, + error={ + "message": "Connection timeout to SQL database", + "errorCode": "2200", + }, + ), + ], + "run-003-ghi": [ # DataProcessingPipeline - Succeeded + create_mock_activity_run( + activity_run_id="act-003-proc", + activity_name="CallStoredProc", + activity_type="SqlServerStoredProcedure", + pipeline_run_id="run-003-ghi", + pipeline_name="DataProcessingPipeline", + status="Succeeded", + start_time=datetime(2024, 1, 15, 9, 5, 0, tzinfo=timezone.utc), + end_time=datetime(2024, 1, 15, 9, 30, 0, tzinfo=timezone.utc), + duration_ms=1500000, + ), + ], + } + + return { + "factories": factories, + "pipelines": pipelines, + "datasets": datasets, + "linked_services": linked_services, + "triggers": triggers, + "pipeline_runs": pipeline_runs, + "activity_runs": activity_runs, + } + + +def create_mock_client( + test_data: Dict[str, Any], include_activity_runs: bool = False +) -> MagicMock: + """Create a mock DataFactoryManagementClient. + + Args: + test_data: Dictionary containing mock data for factories, pipelines, etc. 
+ include_activity_runs: If True, return activity runs for each pipeline run. + This enables testing of the activity run extraction feature. + """ + mock_client = MagicMock() + + # Mock factories + mock_client.factories.list.return_value = MockPagedIterator(test_data["factories"]) + mock_client.factories.list_by_resource_group.return_value = MockPagedIterator( + test_data["factories"] + ) + + # Mock pipelines + mock_client.pipelines.list_by_factory.return_value = MockPagedIterator( + test_data["pipelines"] + ) + + # Mock datasets + mock_client.datasets.list_by_factory.return_value = MockPagedIterator( + test_data["datasets"] + ) + + # Mock linked services + mock_client.linked_services.list_by_factory.return_value = MockPagedIterator( + test_data["linked_services"] + ) + + # Mock triggers + mock_client.triggers.list_by_factory.return_value = MockPagedIterator( + test_data["triggers"] + ) + + # Mock data flows (empty for basic tests) + mock_client.data_flows.list_by_factory.return_value = MockPagedIterator([]) + + # Mock pipeline runs + mock_client.pipeline_runs.query_by_factory.return_value = MockQueryResponse( + test_data["pipeline_runs"] + ) + + # Mock activity runs - return based on pipeline run ID if enabled + if include_activity_runs and "activity_runs" in test_data: + activity_runs_by_pipeline = test_data["activity_runs"] + + def get_activity_runs( + resource_group_name: str, factory_name: str, run_id: str, filter_parameters + ) -> MockQueryResponse: + """Return activity runs for the given pipeline run ID.""" + runs = activity_runs_by_pipeline.get(run_id, []) + return MockQueryResponse(runs) + + mock_client.activity_runs.query_by_pipeline_run.side_effect = get_activity_runs + else: + mock_client.activity_runs.query_by_pipeline_run.return_value = ( + MockQueryResponse([]) + ) + + return mock_client + + +@freeze_time(FROZEN_TIME) +@pytest.mark.integration +def test_adf_source_basic(pytestconfig, tmp_path): + """Test basic ADF metadata extraction without execution history.""" + test_resources_dir = pytestconfig.rootpath / "tests/integration/azure_data_factory" + output_file = tmp_path / "adf_basic_events.json" + golden_file = test_resources_dir / "adf_basic_golden.json" + + test_data = get_mock_test_data() + mock_client = create_mock_client(test_data) + + with mock.patch( + "datahub.ingestion.source.azure_data_factory.adf_client.DataFactoryManagementClient" + ) as MockClientClass: + MockClientClass.return_value = mock_client + + with mock.patch( + "datahub.ingestion.source.azure.azure_auth.DefaultAzureCredential" + ): + pipeline = Pipeline.create( + { + "run_id": "adf-test-basic", + "source": { + "type": "azure-data-factory", + "config": { + "subscription_id": SUBSCRIPTION_ID, + "resource_group": RESOURCE_GROUP, + "credential": { + "authentication_method": "default", + }, + "include_lineage": True, + "include_execution_history": False, + "env": "PROD", + }, + }, + "sink": { + "type": "file", + "config": { + "filename": str(output_file), + }, + }, + } + ) + + pipeline.run() + pipeline.raise_from_status() + + # For the first run, we need to create the golden file + # In subsequent runs, this will compare against the golden file + mce_helpers.check_golden_file( + pytestconfig, + output_path=str(output_file), + golden_path=str(golden_file), + ) + + +@freeze_time(FROZEN_TIME) +@pytest.mark.integration +def test_adf_source_with_execution_history(pytestconfig, tmp_path): + """Test ADF metadata extraction with execution history. 
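+
+    A completed run is expected to emit a pair of run events, e.g. (abridged
+    from the golden file):
+
+        {"status": "STARTED"}
+        {"status": "COMPLETE",
+         "result": {"type": "SUCCESS", "nativeResultType": "Succeeded"}}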
+ + This test verifies: + - Pipeline runs are extracted as DataProcessInstance linked to DataFlow + - Activity runs are extracted as DataProcessInstance linked to DataJob + - Run status (Succeeded, Failed) is correctly mapped + - Both start and end events are emitted for completed runs + """ + test_resources_dir = pytestconfig.rootpath / "tests/integration/azure_data_factory" + output_file = tmp_path / "adf_with_runs_events.json" + golden_file = test_resources_dir / "adf_with_runs_golden.json" + + test_data = get_mock_test_data() + # Enable activity runs to test DataJob-level run history + mock_client = create_mock_client(test_data, include_activity_runs=True) + + with mock.patch( + "datahub.ingestion.source.azure_data_factory.adf_client.DataFactoryManagementClient" + ) as MockClientClass: + MockClientClass.return_value = mock_client + + with mock.patch( + "datahub.ingestion.source.azure.azure_auth.DefaultAzureCredential" + ): + pipeline = Pipeline.create( + { + "run_id": "adf-test-with-runs", + "source": { + "type": "azure-data-factory", + "config": { + "subscription_id": SUBSCRIPTION_ID, + "resource_group": RESOURCE_GROUP, + "credential": { + "authentication_method": "default", + }, + "include_lineage": True, + "include_execution_history": True, + "execution_history_days": 7, + "env": "PROD", + }, + }, + "sink": { + "type": "file", + "config": { + "filename": str(output_file), + }, + }, + } + ) + + pipeline.run() + pipeline.raise_from_status() + + mce_helpers.check_golden_file( + pytestconfig, + output_path=str(output_file), + golden_path=str(golden_file), + ) + + +@freeze_time(FROZEN_TIME) +@pytest.mark.integration +def test_adf_source_with_platform_instance(pytestconfig, tmp_path): + """Test ADF metadata extraction with platform instance configured.""" + test_resources_dir = pytestconfig.rootpath / "tests/integration/azure_data_factory" + output_file = tmp_path / "adf_platform_instance_events.json" + golden_file = test_resources_dir / "adf_platform_instance_golden.json" + + test_data = get_mock_test_data() + mock_client = create_mock_client(test_data) + + with mock.patch( + "datahub.ingestion.source.azure_data_factory.adf_client.DataFactoryManagementClient" + ) as MockClientClass: + MockClientClass.return_value = mock_client + + with mock.patch( + "datahub.ingestion.source.azure.azure_auth.DefaultAzureCredential" + ): + pipeline = Pipeline.create( + { + "run_id": "adf-test-platform-instance", + "source": { + "type": "azure-data-factory", + "config": { + "subscription_id": SUBSCRIPTION_ID, + "resource_group": RESOURCE_GROUP, + "credential": { + "authentication_method": "default", + }, + "platform_instance": "my-adf-instance", + "include_lineage": True, + "include_execution_history": False, + "env": "DEV", + }, + }, + "sink": { + "type": "file", + "config": { + "filename": str(output_file), + }, + }, + } + ) + + pipeline.run() + pipeline.raise_from_status() + + mce_helpers.check_golden_file( + pytestconfig, + output_path=str(output_file), + golden_path=str(golden_file), + ) diff --git a/metadata-ingestion/tests/integration/azure_data_factory/test_complex_pipelines.py b/metadata-ingestion/tests/integration/azure_data_factory/test_complex_pipelines.py new file mode 100644 index 00000000000000..c1053b62cae15d --- /dev/null +++ b/metadata-ingestion/tests/integration/azure_data_factory/test_complex_pipelines.py @@ -0,0 +1,1483 @@ +"""Integration tests for complex Azure Data Factory pipeline patterns. 
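+
+Each test feeds one scenario's mock pipelines through a full DataHub ingestion
+run against a mocked DataFactoryManagementClient; a sketch of the flow (the
+run_id string is arbitrary):
+
+    scenario = create_nested_pipeline_scenario()
+    pipeline = _run_test_pipeline(tmp_path, "nested-demo", scenario["pipelines"])
+    pipeline.raise_from_status()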
+ +These tests validate that the ADF connector correctly handles advanced pipeline +configurations commonly found in production environments. Each test scenario +represents a real-world pattern that data engineers use in ADF. + +Test Coverage: +============= + +1. **Nested Pipelines (Execute Pipeline Activity)** + - Parent pipelines orchestrating child pipelines + - Validates hierarchical DataFlow/DataJob relationships + - Ensures lineage propagates through nested execution + +2. **ForEach Loops** + - Iterative processing over collections (tables, files, etc.) + - Tests that loop activities and their children are properly extracted + - Validates parametrized activities within loops + +3. **Control Flow Branching (If-Condition, Switch)** + - Conditional execution paths based on runtime expressions + - Verifies all branches (true/false, switch cases, default) are captured + - Tests that lineage is recorded for activities in all branches + +4. **Mapping Data Flows** + - Complex transformations (filter, join, aggregate, derive) + - Multiple sources and sinks with transformation chains + - Validates Data Flow script extraction and lineage + +5. **Multi-Source ETL Pipelines** + - Full ETL chains: SQL → Blob → Synapse → DataLake + - Tests end-to-end lineage across multiple hops + - Validates platform mapping (mssql, abs) + +Why These Tests Matter: +====================== +Production ADF pipelines rarely use simple, linear patterns. These tests ensure +the connector handles real-world complexity without losing lineage information +or failing to capture activities in nested/conditional structures. + +Mock Data Strategy: +================== +Mock data is based on Azure REST API response structures from: +https://github.com/Azure/azure-rest-api-specs/tree/main/specification/datafactory + +The mocks simulate real Azure SDK responses, including: +- Factory, Pipeline, Dataset, LinkedService, DataFlow objects +- Proper nesting of properties and type-specific configurations +- Realistic activity structures with inputs, outputs, and dependencies +""" + +import json +from pathlib import Path +from typing import Any, Dict, Iterator, List, Optional +from unittest import mock + +import pytest +from freezegun import freeze_time + +from datahub.ingestion.run.pipeline import Pipeline +from datahub.ingestion.source.azure_data_factory.adf_source import ( + AzureDataFactorySource, +) +from datahub.testing import mce_helpers +from tests.integration.azure_data_factory.complex_mocks import ( + RESOURCE_GROUP, + SUBSCRIPTION_ID, + create_branching_scenario, + create_complex_datasets, + create_complex_factory, + create_complex_linked_services, + create_dataflow_scenario, + create_diverse_activities_scenario, + create_foreach_loop_scenario, + create_multisource_chain_scenario, + create_nested_pipeline_scenario, + get_all_data_flows, +) + +# Freeze time for deterministic test output (affects timestamps in MCPs) +FROZEN_TIME = "2024-01-15 12:00:00" + + +# ============================================================================= +# MOCK HELPERS +# ============================================================================= +# These classes simulate the Azure SDK's response objects. The Azure SDK returns +# objects that have an as_dict() method to convert to dictionaries, which our +# connector then parses into Pydantic models. + + +class MockAzureResource: + """Mock class to simulate Azure SDK resource objects. + + The Azure SDK returns resource objects (Pipeline, Dataset, etc.) that have + an as_dict() method. 
Our connector calls this method to get a dictionary + representation which is then validated against our Pydantic models. + """ + + def __init__(self, data: Dict[str, Any]): + self._data = data + + def as_dict(self) -> Dict[str, Any]: + return self._data + + +class MockPagedIterator: + """Mock class to simulate Azure SDK paged iterators. + + Azure SDK list operations return paged iterators that yield resource objects. + This mock simulates that behavior for testing without making real API calls. + """ + + def __init__(self, items: List[Dict[str, Any]]): + self._items = [MockAzureResource(item) for item in items] + + def __iter__(self) -> Iterator[MockAzureResource]: + return iter(self._items) + + +class MockQueryResponse: + """Mock class for query responses (e.g., pipeline runs) with continuation token. + + Some Azure APIs return query responses that include a continuation token + for pagination. This mock supports that pattern. + """ + + def __init__( + self, items: List[Dict[str, Any]], continuation_token: Optional[str] = None + ): + self.value = [MockAzureResource(item) for item in items] + self.continuation_token = continuation_token + + +def create_mock_client( + pipelines: List[Dict[str, Any]], + datasets: List[Dict[str, Any]], + linked_services: List[Dict[str, Any]], + data_flows: Optional[List[Dict[str, Any]]] = None, + triggers: Optional[List[Dict[str, Any]]] = None, + pipeline_runs: Optional[List[Dict[str, Any]]] = None, +) -> mock.MagicMock: + """Create a mock DataFactoryManagementClient with the given test data. + + This function creates a mock that simulates the Azure SDK's + DataFactoryManagementClient. Each method returns appropriate mock + iterators/responses that our connector will process. + + Args: + pipelines: List of pipeline definitions (will be converted to DataFlow entities) + datasets: List of dataset definitions (used for lineage resolution) + linked_services: List of linked service definitions (used for platform mapping) + data_flows: List of data flow definitions (for Mapping Data Flow activities) + triggers: List of trigger definitions (optional) + pipeline_runs: List of pipeline run records (for execution history) + + Returns: + A MagicMock configured to behave like DataFactoryManagementClient + """ + mock_client = mock.MagicMock() + + # Mock factories - the top-level container for all ADF resources + factory = create_complex_factory() + mock_client.factories.list.return_value = MockPagedIterator([factory]) + mock_client.factories.list_by_resource_group.return_value = MockPagedIterator( + [factory] + ) + + # Mock pipelines - these become DataFlow entities in DataHub + mock_client.pipelines.list_by_factory.return_value = MockPagedIterator(pipelines) + + # Mock datasets - used to resolve lineage (input/output of activities) + mock_client.datasets.list_by_factory.return_value = MockPagedIterator(datasets) + + # Mock linked services - determine the platform type for datasets + # (e.g., AzureSqlDatabase → mssql, AzureBlobStorage → abs) + mock_client.linked_services.list_by_factory.return_value = MockPagedIterator( + linked_services + ) + + # Mock triggers - schedule definitions (not heavily used in these tests) + mock_client.triggers.list_by_factory.return_value = MockPagedIterator( + triggers or [] + ) + + # Mock data flows - Mapping Data Flow definitions with sources/sinks/transforms + mock_client.data_flows.list_by_factory.return_value = MockPagedIterator( + data_flows or [] + ) + + # Mock pipeline runs - execution history (for DataProcessInstance entities) + 
mock_client.pipeline_runs.query_by_factory.return_value = MockQueryResponse( + pipeline_runs or [] + ) + + # Mock activity runs - individual activity execution records + mock_client.activity_runs.query_by_pipeline_run.return_value = MockQueryResponse([]) + + return mock_client + + +def _run_test_pipeline( + tmp_path: Any, + run_id: str, + pipelines: List[Dict[str, Any]], + datasets: Optional[List[Dict[str, Any]]] = None, + linked_services: Optional[List[Dict[str, Any]]] = None, + data_flows: Optional[List[Dict[str, Any]]] = None, + include_lineage: bool = True, +) -> Pipeline: + """Helper function to run an ingestion pipeline with mocked Azure data. + + This sets up the full DataHub ingestion pipeline with mocked Azure SDK + responses, runs the ingestion, and returns the pipeline for assertions. + + Args: + tmp_path: Pytest fixture for temporary directory + run_id: Unique identifier for this test run + pipelines: ADF pipeline definitions to ingest + datasets: Dataset definitions (defaults to standard test datasets) + linked_services: Linked service definitions (defaults to standard test services) + data_flows: Data flow definitions for Mapping Data Flow activities + include_lineage: Whether to extract lineage from activities + + Returns: + The executed Pipeline object with source report for assertions + """ + if datasets is None: + datasets = create_complex_datasets() + if linked_services is None: + linked_services = create_complex_linked_services() + + mock_client = create_mock_client( + pipelines=pipelines, + datasets=datasets, + linked_services=linked_services, + data_flows=data_flows, + ) + + output_file = tmp_path / f"{run_id}_output.json" + + config = { + "run_id": run_id, + "source": { + "type": "azure-data-factory", + "config": { + "subscription_id": SUBSCRIPTION_ID, + "resource_group": RESOURCE_GROUP, + "credential": {"authentication_method": "default"}, + "env": "DEV", + "include_lineage": include_lineage, + "include_execution_history": False, + }, + }, + "sink": {"type": "file", "config": {"filename": str(output_file)}}, + } + + with mock.patch( + "datahub.ingestion.source.azure_data_factory.adf_client.DataFactoryManagementClient" + ) as MockClientClass: + MockClientClass.return_value = mock_client + + with mock.patch( + "datahub.ingestion.source.azure.azure_auth.DefaultAzureCredential" + ): + pipeline = Pipeline.create(config) + pipeline.run() + pipeline.raise_from_status() + + return pipeline + + +# ============================================================================= +# TEST: NESTED PIPELINES (Execute Pipeline Activity) +# ============================================================================= +# +# Scenario: Parent pipeline orchestrates child pipelines +# ------------------------------------------------------- +# ParentOrchestrationPipeline +# └── ExecutePipeline: ChildDataMovementPipeline +# └── Copy: SqlToBlob +# └── ExecutePipeline: ChildTransformPipeline +# └── DataFlow: TransformData +# +# What we're testing: +# - All three pipelines are extracted as DataFlow entities +# - ExecutePipeline activities are captured as DataJob entities +# - Child pipeline activities (Copy, DataFlow) are also captured +# - Browse paths show proper hierarchy +# +# Why this matters: +# - Large organizations modularize pipelines for reusability +# - Lineage must track data movement through nested executions +# - Users need to see the full orchestration hierarchy + + +@freeze_time(FROZEN_TIME) +@pytest.mark.integration +def test_nested_pipeline_creates_all_entities(pytestconfig, 
tmp_path): + """Test that nested pipelines create correct DataFlow and DataJob entities. + + This test verifies that when a parent pipeline contains ExecutePipeline + activities that call child pipelines, all three pipelines and their + activities are properly extracted as DataHub entities. + + Expected entities: + - 3 DataFlow entities (ParentOrchestrationPipeline, ChildDataMovement, ChildTransform) + - 4 DataJob entities (2 ExecutePipeline + 1 Copy + 1 DataFlow activity) + """ + scenario = create_nested_pipeline_scenario() + + pipeline = _run_test_pipeline( + tmp_path, + run_id="nested-pipeline-test", + pipelines=scenario["pipelines"], + data_flows=get_all_data_flows(), + ) + + # Verify all pipelines were processed (not filtered out) + assert isinstance(pipeline.source, AzureDataFactorySource) + assert pipeline.source.report.pipelines_scanned == len(scenario["pipelines"]) + + +@freeze_time(FROZEN_TIME) +@pytest.mark.integration +def test_nested_pipeline_golden(pytestconfig, tmp_path): + """Golden file test for nested pipeline scenario. + + Compares the output MCPs against a known-good golden file to detect + any regressions in entity structure, URN format, or aspect content. + + The golden file captures the expected output including: + - Container for the factory + - DataFlow entities for each pipeline + - DataJob entities for each activity + - Browse paths showing hierarchy + """ + test_resources_dir = pytestconfig.rootpath / "tests/integration/azure_data_factory" + scenario = create_nested_pipeline_scenario() + + output_file = tmp_path / "adf_nested_events.json" + golden_file = test_resources_dir / "adf_nested_golden.json" + + mock_client = create_mock_client( + pipelines=scenario["pipelines"], + datasets=create_complex_datasets(), + linked_services=create_complex_linked_services(), + data_flows=get_all_data_flows(), + ) + + config = { + "run_id": "adf-nested-test", + "source": { + "type": "azure-data-factory", + "config": { + "subscription_id": SUBSCRIPTION_ID, + "resource_group": RESOURCE_GROUP, + "credential": {"authentication_method": "default"}, + "env": "DEV", + "include_lineage": True, + }, + }, + "sink": {"type": "file", "config": {"filename": str(output_file)}}, + } + + with mock.patch( + "datahub.ingestion.source.azure_data_factory.adf_client.DataFactoryManagementClient" + ) as MockClientClass: + MockClientClass.return_value = mock_client + + with mock.patch( + "datahub.ingestion.source.azure.azure_auth.DefaultAzureCredential" + ): + pipeline = Pipeline.create(config) + pipeline.run() + pipeline.raise_from_status() + + mce_helpers.check_golden_file( + pytestconfig, + output_path=str(output_file), + golden_path=str(golden_file), + ) + + +# ============================================================================= +# TEST: FOREACH LOOPS +# ============================================================================= +# +# Scenario: Iterate over a list of tables to copy +# ------------------------------------------------ +# ForEachTablePipeline +# └── Lookup: GetTableList (query sys.tables) +# └── ForEach: IterateOverTables +# └── Copy: CopyTableToStaging (parametrized) +# +# What we're testing: +# - ForEach activity is captured as a DataJob +# - Activities inside ForEach are also captured +# - Lookup activity's lineage (reading from system tables) +# +# Why this matters: +# - ForEach is used extensively for bulk data operations +# - Users need visibility into what tables/files are processed +# - The Copy activity inside ForEach creates lineage for each iteration + + 
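+# For orientation, the mocked ForEach activity roughly follows the ADF REST
+# shape sketched below. This is a simplified sketch, not the exact mock; the
+# fields the tests actually rely on live in create_foreach_loop_scenario()
+# in complex_mocks.py.
+#
+#   {
+#       "name": "IterateOverTables",
+#       "type": "ForEach",
+#       "typeProperties": {
+#           "items": {"value": "@activity('GetTableList').output.value", "type": "Expression"},
+#           "activities": [{"name": "CopyTableToStaging", "type": "Copy"}],
+#       },
+#   }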
+@freeze_time(FROZEN_TIME) +@pytest.mark.integration +def test_foreach_loop_pipeline(pytestconfig, tmp_path): + """Golden file test for ForEach loop pipeline. + + Tests a pipeline that uses ForEach to iterate over tables and copy + each one to staging. This is a common pattern for bulk data movement. + + The test verifies: + - ForEach activity is captured as a DataJob with "ForEach Loop" subtype + - Nested Copy activity is captured (though iterations aren't expanded) + - Lookup activity that provides the iteration items is captured + """ + test_resources_dir = pytestconfig.rootpath / "tests/integration/azure_data_factory" + scenario = create_foreach_loop_scenario() + + output_file = tmp_path / "adf_foreach_events.json" + golden_file = test_resources_dir / "adf_foreach_golden.json" + + mock_client = create_mock_client( + pipelines=scenario["pipelines"], + datasets=create_complex_datasets(), + linked_services=create_complex_linked_services(), + ) + + config = { + "run_id": "adf-foreach-test", + "source": { + "type": "azure-data-factory", + "config": { + "subscription_id": SUBSCRIPTION_ID, + "resource_group": RESOURCE_GROUP, + "credential": {"authentication_method": "default"}, + "env": "DEV", + "include_lineage": True, + }, + }, + "sink": {"type": "file", "config": {"filename": str(output_file)}}, + } + + with mock.patch( + "datahub.ingestion.source.azure_data_factory.adf_client.DataFactoryManagementClient" + ) as MockClientClass: + MockClientClass.return_value = mock_client + + with mock.patch( + "datahub.ingestion.source.azure.azure_auth.DefaultAzureCredential" + ): + pipeline = Pipeline.create(config) + pipeline.run() + pipeline.raise_from_status() + + mce_helpers.check_golden_file( + pytestconfig, + output_path=str(output_file), + golden_path=str(golden_file), + ) + + +# ============================================================================= +# TEST: CONTROL FLOW BRANCHING (If-Condition, Switch) +# ============================================================================= +# +# Scenario: Conditional execution based on data existence and region +# ------------------------------------------------------------------ +# BranchingPipeline +# └── Lookup: CheckDataExists +# └── IfCondition: DataExistsCheck +# ├── True: Copy: FullLoad +# └── False: Copy: IncrementalLoad +# └── Switch: ProcessByRegion +# ├── Case "US": Copy: ProcessUSData +# ├── Case "EU": Copy: ProcessEUData +# └── Default: Copy: ProcessOtherData +# +# What we're testing: +# - IfCondition activity captures both true and false branches +# - Switch activity captures all cases and default +# - Activities in all branches are extracted as DataJobs +# - Lineage is captured for activities in conditional branches +# +# Why this matters: +# - Real pipelines have complex conditional logic +# - Users need to see ALL possible execution paths +# - Lineage must include data flows in every branch + + +@freeze_time(FROZEN_TIME) +@pytest.mark.integration +def test_branching_pipeline(pytestconfig, tmp_path): + """Golden file test for If-Condition and Switch branching pipeline. + + Tests a pipeline with complex control flow: + 1. IfCondition that branches based on whether data exists + 2. 
Switch that routes processing based on region parameter + + The test verifies: + - All activities in all branches are captured + - IfCondition has "If Condition" subtype + - Switch has "Switch Activity" subtype + - Lineage captures inputs/outputs in each branch's activities + """ + test_resources_dir = pytestconfig.rootpath / "tests/integration/azure_data_factory" + scenario = create_branching_scenario() + + output_file = tmp_path / "adf_branching_events.json" + golden_file = test_resources_dir / "adf_branching_golden.json" + + mock_client = create_mock_client( + pipelines=scenario["pipelines"], + datasets=create_complex_datasets(), + linked_services=create_complex_linked_services(), + ) + + config = { + "run_id": "adf-branching-test", + "source": { + "type": "azure-data-factory", + "config": { + "subscription_id": SUBSCRIPTION_ID, + "resource_group": RESOURCE_GROUP, + "credential": {"authentication_method": "default"}, + "env": "DEV", + "include_lineage": True, + }, + }, + "sink": {"type": "file", "config": {"filename": str(output_file)}}, + } + + with mock.patch( + "datahub.ingestion.source.azure_data_factory.adf_client.DataFactoryManagementClient" + ) as MockClientClass: + MockClientClass.return_value = mock_client + + with mock.patch( + "datahub.ingestion.source.azure.azure_auth.DefaultAzureCredential" + ): + pipeline = Pipeline.create(config) + pipeline.run() + pipeline.raise_from_status() + + mce_helpers.check_golden_file( + pytestconfig, + output_path=str(output_file), + golden_path=str(golden_file), + ) + + +# ============================================================================= +# TEST: MAPPING DATA FLOWS +# ============================================================================= +# +# Scenario: Complex data transformation with multiple sources and sinks +# --------------------------------------------------------------------- +# DataFlowPipeline +# └── ExecuteDataFlow: RunSalesTransformation +# └── SalesTransformationFlow: +# Sources: CustomersSource, OrdersSource +# Transformations: Filter → Join → Aggregate → Derive +# Sinks: CuratedOutput, SynapseOutput +# +# What we're testing: +# - Data Flow definition is loaded and cached +# - ExecuteDataFlow activity extracts sources as inputs +# - ExecuteDataFlow activity extracts sinks as outputs +# - Data Flow script is captured in dataTransformLogic aspect +# +# Why this matters: +# - Mapping Data Flows contain critical transformation logic +# - Lineage from Data Flows shows complex many-to-many relationships +# - Scripts help users understand what transformations are applied + + +@freeze_time(FROZEN_TIME) +@pytest.mark.integration +def test_dataflow_pipeline_with_lineage(pytestconfig, tmp_path): + """Golden file test for Mapping Data Flow pipeline with lineage extraction. 
+ + Tests a pipeline that executes a Mapping Data Flow containing: + - Multiple sources (customers, orders) + - Multiple transformations (filter, join, aggregate, derive) + - Multiple sinks (data lake, synapse) + + The test verifies: + - ExecuteDataFlow activity has "Data Flow Activity" subtype + - Data Flow sources are captured as input datasets + - Data Flow sinks are captured as output datasets + - Data Flow script is captured (for transformation visibility) + """ + test_resources_dir = pytestconfig.rootpath / "tests/integration/azure_data_factory" + scenario = create_dataflow_scenario() + + output_file = tmp_path / "adf_dataflow_events.json" + golden_file = test_resources_dir / "adf_dataflow_golden.json" + + mock_client = create_mock_client( + pipelines=scenario["pipelines"], + datasets=create_complex_datasets(), + linked_services=create_complex_linked_services(), + data_flows=scenario.get("data_flows", []), + ) + + config = { + "run_id": "adf-dataflow-test", + "source": { + "type": "azure-data-factory", + "config": { + "subscription_id": SUBSCRIPTION_ID, + "resource_group": RESOURCE_GROUP, + "credential": {"authentication_method": "default"}, + "env": "DEV", + "include_lineage": True, + }, + }, + "sink": {"type": "file", "config": {"filename": str(output_file)}}, + } + + with mock.patch( + "datahub.ingestion.source.azure_data_factory.adf_client.DataFactoryManagementClient" + ) as MockClientClass: + MockClientClass.return_value = mock_client + + with mock.patch( + "datahub.ingestion.source.azure.azure_auth.DefaultAzureCredential" + ): + pipeline = Pipeline.create(config) + pipeline.run() + pipeline.raise_from_status() + + mce_helpers.check_golden_file( + pytestconfig, + output_path=str(output_file), + golden_path=str(golden_file), + ) + + +# ============================================================================= +# TEST: MULTI-SOURCE ETL PIPELINE +# ============================================================================= +# +# Scenario: Full ETL chain with multiple stages and destinations +# -------------------------------------------------------------- +# ETLPipeline +# ├── Copy: ExtractCustomersFromSQL (SQL → Blob) +# ├── Copy: ExtractOrdersFromSQL (SQL → Blob) +# ├── Copy: LoadCustomersToSynapse (Blob → Synapse) +# ├── Copy: LoadOrdersToSynapse (Blob → Synapse) +# └── Copy: ArchiveToDataLake (Blob → DataLake) +# +# Lineage chain: +# SQL (Customers) → Blob (Staging) → Synapse (DW) +# → DataLake (Archive) +# SQL (Orders) → Blob (Staging) → Synapse (DW) +# +# What we're testing: +# - Multi-hop lineage is captured correctly +# - Platform mapping works for different linked services: +# - AzureSqlDatabase → mssql +# - AzureBlobStorage → abs +# - AzureSynapseAnalytics → mssql +# - AzureBlobFS → abs +# - Dependencies between activities are respected +# +# Why this matters: +# - Real ETL pipelines have multiple stages +# - Users need to trace data from source to final destination +# - Platform-specific URNs enable cross-system lineage in DataHub + + +@freeze_time(FROZEN_TIME) +@pytest.mark.integration +def test_multisource_etl_pipeline(pytestconfig, tmp_path): + """Golden file test for multi-source ETL pipeline with full lineage chain. + + Tests a realistic ETL pipeline that: + 1. Extracts data from SQL databases to blob storage + 2. Loads from blob to Synapse data warehouse + 3. 
Archives to Data Lake for long-term storage + + The test verifies: + - All Copy activities are captured with correct subtypes + - Platform mapping produces correct URNs: + - mssql for SQL and Synapse datasets + - abs for Blob and Data Lake datasets + - Activity dependencies are reflected in job order + """ + test_resources_dir = pytestconfig.rootpath / "tests/integration/azure_data_factory" + scenario = create_multisource_chain_scenario() + + output_file = tmp_path / "adf_multisource_events.json" + golden_file = test_resources_dir / "adf_multisource_golden.json" + + mock_client = create_mock_client( + pipelines=scenario["pipelines"], + datasets=create_complex_datasets(), + linked_services=create_complex_linked_services(), + ) + + config = { + "run_id": "adf-multisource-test", + "source": { + "type": "azure-data-factory", + "config": { + "subscription_id": SUBSCRIPTION_ID, + "resource_group": RESOURCE_GROUP, + "credential": {"authentication_method": "default"}, + "env": "DEV", + "include_lineage": True, + }, + }, + "sink": {"type": "file", "config": {"filename": str(output_file)}}, + } + + with mock.patch( + "datahub.ingestion.source.azure_data_factory.adf_client.DataFactoryManagementClient" + ) as MockClientClass: + MockClientClass.return_value = mock_client + + with mock.patch( + "datahub.ingestion.source.azure.azure_auth.DefaultAzureCredential" + ): + pipeline = Pipeline.create(config) + pipeline.run() + pipeline.raise_from_status() + + mce_helpers.check_golden_file( + pytestconfig, + output_path=str(output_file), + golden_path=str(golden_file), + ) + + +# ============================================================================= +# LINEAGE VERIFICATION TESTS +# ============================================================================= +# +# These tests go beyond golden file comparison to programmatically verify +# that lineage is being captured correctly. They check specific assertions +# about the extracted metadata rather than comparing full output. + + +@freeze_time(FROZEN_TIME) +@pytest.mark.integration +def test_multisource_lineage_accuracy(tmp_path): + """Verify lineage edges are correct for multi-source ETL pipeline. + + This test programmatically inspects the generated MCPs to verify that: + 1. dataJobInputOutput aspects are emitted (lineage is captured) + 2. SQL sources appear as input datasets with 'mssql' platform + 3. Synapse destinations appear as output datasets with 'mssql' platform (Synapse uses mssql protocol) + + This complements the golden file test by focusing on specific lineage + properties that are critical for data governance use cases. 
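+
+    For reference, the aspect shape parsed below looks roughly like this
+    (a sketch; the assertions only check URN substrings, not exact URNs):
+
+        {"aspectName": "dataJobInputOutput",
+         "aspect": {"json": {"inputDatasets": ["urn:li:dataset:(urn:li:dataPlatform:mssql,...)"],
+                             "outputDatasets": ["..."]}}}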
+ """ + scenario = create_multisource_chain_scenario() + + mock_client = create_mock_client( + pipelines=scenario["pipelines"], + datasets=create_complex_datasets(), + linked_services=create_complex_linked_services(), + ) + + output_file = tmp_path / "lineage_test.json" + + config = { + "run_id": "lineage-test", + "source": { + "type": "azure-data-factory", + "config": { + "subscription_id": SUBSCRIPTION_ID, + "resource_group": RESOURCE_GROUP, + "credential": {"authentication_method": "default"}, + "env": "DEV", + "include_lineage": True, + }, + }, + "sink": {"type": "file", "config": {"filename": str(output_file)}}, + } + + with mock.patch( + "datahub.ingestion.source.azure_data_factory.adf_client.DataFactoryManagementClient" + ) as MockClientClass: + MockClientClass.return_value = mock_client + + with mock.patch( + "datahub.ingestion.source.azure.azure_auth.DefaultAzureCredential" + ): + pipeline = Pipeline.create(config) + pipeline.run() + + # Read output and verify lineage (file sink outputs JSON array) + with open(output_file) as f: + mcps = json.load(f) + + # Find dataJobInputOutput aspects - these contain the lineage edges + lineage_aspects = [ + mcp for mcp in mcps if mcp.get("aspectName") == "dataJobInputOutput" + ] + + # Verify lineage aspects were emitted for Copy activities + assert len(lineage_aspects) > 0, "Expected lineage aspects to be emitted" + + # Collect all input and output datasets from lineage aspects + all_inputs = [] + all_outputs = [] + for aspect in lineage_aspects: + inputs = aspect.get("aspect", {}).get("json", {}).get("inputDatasets", []) + outputs = aspect.get("aspect", {}).get("json", {}).get("outputDatasets", []) + all_inputs.extend(inputs) + all_outputs.extend(outputs) + + # Verify SQL sources are captured with correct platform + # SQL inputs should have URNs containing 'mssql' (mapped from AzureSqlDatabase) + sql_inputs = [i for i in all_inputs if "mssql" in i] + assert len(sql_inputs) > 0, "Expected SQL dataset inputs with 'mssql' platform" + + # Verify Synapse destinations are captured with correct platform + # Synapse outputs should have URNs containing 'mssql' (Synapse uses mssql protocol) + # Check for output datasets that contain common Synapse table naming patterns + mssql_outputs = [o for o in all_outputs if "mssql" in o] + assert len(mssql_outputs) > 0, ( + "Expected Synapse dataset outputs with 'mssql' platform" + ) + + +@freeze_time(FROZEN_TIME) +@pytest.mark.integration +def test_dataflow_lineage_sources_and_sinks(tmp_path): + """Verify Data Flow sources and sinks are extracted for lineage. + + This test verifies that when a pipeline executes a Mapping Data Flow, + the connector: + 1. Fetches and caches the Data Flow definition + 2. Extracts source datasets from the Data Flow + 3. Extracts sink datasets from the Data Flow + 4. 
Reports that data flows were scanned + + Data Flow lineage is critical because: + - Data Flows can have complex many-to-many relationships + - Sources/sinks are defined in the Data Flow, not the activity + - Without Data Flow inspection, lineage would be incomplete + """ + scenario = create_dataflow_scenario() + + mock_client = create_mock_client( + pipelines=scenario["pipelines"], + datasets=create_complex_datasets(), + linked_services=create_complex_linked_services(), + data_flows=scenario.get("data_flows", []), + ) + + output_file = tmp_path / "dataflow_lineage_test.json" + + config = { + "run_id": "dataflow-lineage-test", + "source": { + "type": "azure-data-factory", + "config": { + "subscription_id": SUBSCRIPTION_ID, + "resource_group": RESOURCE_GROUP, + "credential": {"authentication_method": "default"}, + "env": "DEV", + "include_lineage": True, + }, + }, + "sink": {"type": "file", "config": {"filename": str(output_file)}}, + } + + with mock.patch( + "datahub.ingestion.source.azure_data_factory.adf_client.DataFactoryManagementClient" + ) as MockClientClass: + MockClientClass.return_value = mock_client + + with mock.patch( + "datahub.ingestion.source.azure.azure_auth.DefaultAzureCredential" + ): + pipeline = Pipeline.create(config) + pipeline.run() + + # Verify that data flows were fetched and processed + # This confirms the connector is looking up Data Flow definitions + assert isinstance(pipeline.source, AzureDataFactorySource) + assert pipeline.source.report.data_flows_scanned > 0, ( + "Expected data flows to be scanned for lineage extraction" + ) + + +# ============================================================================= +# TEST: DIVERSE ACTIVITY TYPES +# ============================================================================= +# +# Scenario: Pipeline with various activity types +# ----------------------------------------------- +# DiverseActivitiesPipeline +# └── SetVariable: InitializeCounter +# └── WebActivity: FetchConfiguration +# └── SqlServerStoredProcedure: ProcessData +# └── Wait: DelayForReplication +# └── GetMetadata: CheckOutputExists +# └── DatabricksNotebook: RunMLTraining +# └── Script: RunAnalyticsScript +# └── AzureFunctionActivity: SendNotification +# └── Fail: FailOnError +# +# What we're testing: +# - All activity types are captured as DataJobs with correct subtypes +# - Each activity has the appropriate metadata (description, properties) +# - The connector doesn't fail on uncommon activity types +# +# Why this matters: +# - Real pipelines use many different activity types +# - Users need visibility into all orchestration activities +# - Activity subtypes help with filtering and understanding + + +@freeze_time(FROZEN_TIME) +@pytest.mark.integration +def test_diverse_activities_pipeline(pytestconfig, tmp_path): + """Test that diverse activity types are correctly captured. + + This test verifies that the connector handles various activity types: + - SetVariable, WebActivity, SqlServerStoredProcedure, Wait + - GetMetadata, DatabricksNotebook, Script, AzureFunctionActivity, Fail + + Each activity should be captured as a DataJob with the correct subtype. 
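+
+    For reference, a captured subtype surfaces in the emitted MCPs roughly as
+    (a sketch of the shape the companion subtype test below reads back):
+
+        {"entityType": "dataJob", "aspectName": "subTypes",
+         "aspect": {"json": {"typeNames": ["Web Activity"]}}}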
+ """ + test_resources_dir = pytestconfig.rootpath / "tests/integration/azure_data_factory" + scenario = create_diverse_activities_scenario() + + output_file = tmp_path / "adf_diverse_events.json" + golden_file = test_resources_dir / "adf_diverse_golden.json" + + # Combine standard linked services with additional ones from the scenario + all_linked_services = create_complex_linked_services() + scenario.get( + "additional_linked_services", [] + ) + + mock_client = create_mock_client( + pipelines=scenario["pipelines"], + datasets=create_complex_datasets(), + linked_services=all_linked_services, + ) + + config = { + "run_id": "adf-diverse-test", + "source": { + "type": "azure-data-factory", + "config": { + "subscription_id": SUBSCRIPTION_ID, + "resource_group": RESOURCE_GROUP, + "credential": {"authentication_method": "default"}, + "env": "DEV", + "include_lineage": True, + }, + }, + "sink": {"type": "file", "config": {"filename": str(output_file)}}, + } + + with mock.patch( + "datahub.ingestion.source.azure_data_factory.adf_client.DataFactoryManagementClient" + ) as MockClientClass: + MockClientClass.return_value = mock_client + + with mock.patch( + "datahub.ingestion.source.azure.azure_auth.DefaultAzureCredential" + ): + pipeline = Pipeline.create(config) + pipeline.run() + pipeline.raise_from_status() + + mce_helpers.check_golden_file( + pytestconfig, + output_path=str(output_file), + golden_path=str(golden_file), + ) + + +@freeze_time(FROZEN_TIME) +@pytest.mark.integration +def test_diverse_activities_subtypes(tmp_path): + """Verify that diverse activity types have correct subtypes. + + This test programmatically checks that each activity type is mapped + to the expected DataHub subtype. + """ + scenario = create_diverse_activities_scenario() + + all_linked_services = create_complex_linked_services() + scenario.get( + "additional_linked_services", [] + ) + + mock_client = create_mock_client( + pipelines=scenario["pipelines"], + datasets=create_complex_datasets(), + linked_services=all_linked_services, + ) + + output_file = tmp_path / "diverse_subtypes_test.json" + + config = { + "run_id": "diverse-subtypes-test", + "source": { + "type": "azure-data-factory", + "config": { + "subscription_id": SUBSCRIPTION_ID, + "resource_group": RESOURCE_GROUP, + "credential": {"authentication_method": "default"}, + "env": "DEV", + "include_lineage": True, + }, + }, + "sink": {"type": "file", "config": {"filename": str(output_file)}}, + } + + with mock.patch( + "datahub.ingestion.source.azure_data_factory.adf_client.DataFactoryManagementClient" + ) as MockClientClass: + MockClientClass.return_value = mock_client + + with mock.patch( + "datahub.ingestion.source.azure.azure_auth.DefaultAzureCredential" + ): + pipeline = Pipeline.create(config) + pipeline.run() + + # Read output and verify subtypes + with open(output_file) as f: + mcps = json.load(f) + + # Find subTypes aspects for DataJobs + subtype_aspects = [ + mcp + for mcp in mcps + if mcp.get("entityType") == "dataJob" and mcp.get("aspectName") == "subTypes" + ] + + # Collect all subtypes + found_subtypes = set() + for aspect in subtype_aspects: + types = aspect.get("aspect", {}).get("json", {}).get("typeNames", []) + found_subtypes.update(types) + + # Verify we captured diverse subtypes (at least some key ones) + expected_subtypes = { + "Set Variable", + "Web Activity", + "Stored Procedure Activity", + "Wait Activity", + "Get Metadata Activity", + "Databricks Notebook", + } + + found_expected = expected_subtypes.intersection(found_subtypes) + assert 
len(found_expected) >= 3, ( + f"Expected to find at least 3 activity subtypes from {expected_subtypes}, " + f"but found: {found_subtypes}" + ) + + +# ============================================================================= +# TEST: PIPELINE-TO-PIPELINE LINEAGE +# ============================================================================= +# +# Scenario: Parent pipeline calling child pipelines +# -------------------------------------------------- +# When a pipeline uses ExecutePipeline activity to call another pipeline, +# we should capture this dependency. This enables: +# - Understanding orchestration hierarchies +# - Impact analysis across pipeline boundaries +# - Tracing data flow through nested execution +# +# What we're testing: +# - ExecutePipeline activities capture child pipeline references +# - Custom properties include "calls_pipeline" and "child_pipeline_urn" +# - The dependency is visible in DataHub + + +@freeze_time(FROZEN_TIME) +@pytest.mark.integration +def test_pipeline_to_pipeline_lineage(tmp_path): + """Verify that ExecutePipeline activities create DataJob-to-DataJob lineage. + + When a parent pipeline calls a child pipeline via ExecutePipeline, + the connector should: + 1. Capture the child pipeline name in custom properties + 2. Record the child pipeline's DataFlow URN + 3. Create DataJob-to-DataJob lineage (inputDatajobs) pointing to first child activity + 4. Enable users to trace the orchestration hierarchy in the UI + + This test checks the nested pipeline scenario for these dependencies. + """ + scenario = create_nested_pipeline_scenario() + + mock_client = create_mock_client( + pipelines=scenario["pipelines"], + datasets=create_complex_datasets(), + linked_services=create_complex_linked_services(), + data_flows=get_all_data_flows(), + ) + + output_file = tmp_path / "pipeline_lineage_test.json" + + config = { + "run_id": "pipeline-lineage-test", + "source": { + "type": "azure-data-factory", + "config": { + "subscription_id": SUBSCRIPTION_ID, + "resource_group": RESOURCE_GROUP, + "credential": {"authentication_method": "default"}, + "env": "DEV", + "include_lineage": True, + }, + }, + "sink": {"type": "file", "config": {"filename": str(output_file)}}, + } + + with mock.patch( + "datahub.ingestion.source.azure_data_factory.adf_client.DataFactoryManagementClient" + ) as MockClientClass: + MockClientClass.return_value = mock_client + + with mock.patch( + "datahub.ingestion.source.azure.azure_auth.DefaultAzureCredential" + ): + pipeline = Pipeline.create(config) + pipeline.run() + + # Read output and verify pipeline-to-pipeline references + with open(output_file) as f: + mcps = json.load(f) + + # Find DataJobInfo aspects with ExecutePipeline activities + datajob_infos = [ + mcp + for mcp in mcps + if mcp.get("entityType") == "dataJob" and mcp.get("aspectName") == "dataJobInfo" + ] + + # Find DataJobInputOutput aspects + datajob_io = [ + mcp + for mcp in mcps + if mcp.get("entityType") == "dataJob" + and mcp.get("aspectName") == "dataJobInputOutput" + ] + + # Look for activities that call child pipelines + child_pipeline_refs = [] + for info in datajob_infos: + custom_props = ( + info.get("aspect", {}).get("json", {}).get("customProperties", {}) + ) + if "calls_pipeline" in custom_props: + child_pipeline_refs.append( + { + "activity": info.get("entityUrn", ""), + "calls": custom_props.get("calls_pipeline"), + "child_urn": custom_props.get("child_pipeline_urn"), + "child_first_activity": custom_props.get("child_first_activity"), + } + ) + + # The nested pipeline 
scenario has 2 ExecutePipeline activities + assert len(child_pipeline_refs) >= 2, ( + f"Expected at least 2 ExecutePipeline activities with child pipeline references, " + f"but found: {len(child_pipeline_refs)}" + ) + + # Verify the child pipeline names are captured + child_names = {ref["calls"] for ref in child_pipeline_refs} + assert "ChildDataMovementPipeline" in child_names, ( + f"Expected ChildDataMovementPipeline in child references: {child_names}" + ) + assert "ChildTransformPipeline" in child_names, ( + f"Expected ChildTransformPipeline in child references: {child_names}" + ) + + # Verify the first activity names are captured + first_activities = {ref["child_first_activity"] for ref in child_pipeline_refs} + assert "CopyCustomersToStaging" in first_activities, ( + f"Expected CopyCustomersToStaging as first activity: {first_activities}" + ) + assert "TransformCustomerData" in first_activities, ( + f"Expected TransformCustomerData as first activity: {first_activities}" + ) + + # Verify DataJobInputOutput aspects create correct lineage direction + # The child's first activity should have the parent ExecutePipeline as inputDatajobs + # This creates lineage: ExecutePipeline -> ChildFirstActivity + child_activity_inputs = {} + for io in datajob_io: + entity_urn = io.get("entityUrn", "") + input_jobs = io.get("aspect", {}).get("json", {}).get("inputDatajobs", []) + if input_jobs: + child_activity_inputs[entity_urn] = input_jobs + + # Should have at least 2 child activities with inputDatajobs (one for each ExecutePipeline) + assert len(child_activity_inputs) >= 2, ( + f"Expected at least 2 child activities with inputDatajobs lineage, " + f"but found: {len(child_activity_inputs)}" + ) + + # Verify the child activities have ExecutePipeline as their input (upstream) + # CopyCustomersToStaging should have ExecuteDataMovement as input + # TransformCustomerData should have ExecuteTransform as input + all_inputs = [] + for inputs in child_activity_inputs.values(): + all_inputs.extend(inputs) + + assert any("ExecuteDataMovement" in urn for urn in all_inputs), ( + f"Expected ExecuteDataMovement as upstream of child activity: {all_inputs}" + ) + assert any("ExecuteTransform" in urn for urn in all_inputs), ( + f"Expected ExecuteTransform as upstream of child activity: {all_inputs}" + ) + + +def test_mixed_pipeline_and_dataset_dependencies(tmp_path: Path) -> None: + """Test scenario with both pipeline-to-pipeline and dataset dependencies. + + This test verifies that the connector correctly handles pipelines that have: + 1. ExecutePipeline activities (pipeline-to-pipeline lineage) + 2. 
Copy activities with explicit inputs/outputs (dataset lineage) + + Structure: + - MixedOrchestrationPipeline + └── ExecuteExtract -> ExtractDataPipeline.ExtractFromSource + └── TransformInMain (Copy with dataset I/O) + └── ExecuteLoad -> LoadDataPipeline.LoadToDestination + + Expected results: + - Pipeline lineage: ExecuteExtract -> ExtractFromSource + - Pipeline lineage: ExecuteLoad -> LoadToDestination + - Dataset lineage: TransformInMain reads BlobStagingCustomers + - Dataset lineage: TransformInMain writes SynapseCustomersDim + """ + from tests.integration.azure_data_factory.complex_mocks import ( + create_mixed_dependencies_scenario, + ) + + scenario = create_mixed_dependencies_scenario() + output_file = tmp_path / "mixed_deps_output.json" + + # Create mock client using the existing helper + mock_client = create_mock_client( + pipelines=scenario["pipelines"], + datasets=create_complex_datasets(), + linked_services=create_complex_linked_services(), + ) + + config = { + "run_id": "mixed_deps_test", + "source": { + "type": "azure-data-factory", + "config": { + "subscription_id": SUBSCRIPTION_ID, + "resource_group": RESOURCE_GROUP, + "credential": {"authentication_method": "default"}, + "env": "DEV", + "include_lineage": True, + "include_execution_history": False, + }, + }, + "sink": {"type": "file", "config": {"filename": str(output_file)}}, + } + + with mock.patch( + "datahub.ingestion.source.azure_data_factory.adf_client.DataFactoryManagementClient" + ) as MockClientClass: + MockClientClass.return_value = mock_client + + with mock.patch( + "datahub.ingestion.source.azure.azure_auth.DefaultAzureCredential" + ): + pipeline = Pipeline.create(config) + pipeline.run() + + # Read output + with open(output_file) as f: + mcps = json.load(f) + + # ========================================================================= + # Verify Pipeline-to-Pipeline Lineage + # ========================================================================= + # Find DataJobInfo aspects to identify ExecutePipeline activities + datajob_infos = [ + mcp + for mcp in mcps + if mcp.get("entityType") == "dataJob" and mcp.get("aspectName") == "dataJobInfo" + ] + + # Find activities that call child pipelines + execute_pipeline_refs = [] + for info in datajob_infos: + custom_props = ( + info.get("aspect", {}).get("json", {}).get("customProperties", {}) + ) + if "calls_pipeline" in custom_props: + execute_pipeline_refs.append( + { + "activity_urn": info.get("entityUrn", ""), + "calls": custom_props.get("calls_pipeline"), + "child_first_activity": custom_props.get("child_first_activity"), + } + ) + + # Should have 2 ExecutePipeline activities + assert len(execute_pipeline_refs) == 2, ( + f"Expected 2 ExecutePipeline activities, found: {len(execute_pipeline_refs)}" + ) + + # Verify correct child pipelines are referenced + child_pipelines = {ref["calls"] for ref in execute_pipeline_refs} + assert "ExtractDataPipeline" in child_pipelines + assert "LoadDataPipeline" in child_pipelines + + # Verify first activities of child pipelines + first_activities = {ref["child_first_activity"] for ref in execute_pipeline_refs} + assert "ExtractFromSource" in first_activities + assert "LoadToDestination" in first_activities + + # ========================================================================= + # Verify Dataset Lineage + # ========================================================================= + # Find DataJobInputOutput aspects + datajob_io = [ + mcp + for mcp in mcps + if mcp.get("entityType") == "dataJob" + and 
mcp.get("aspectName") == "dataJobInputOutput" + ] + + # Build a map of entity URN -> (inputDatasets, outputDatasets) + dataset_lineage: dict[str, dict[str, list[str]]] = {} + for io in datajob_io: + entity_urn = io.get("entityUrn", "") + input_datasets = io.get("aspect", {}).get("json", {}).get("inputDatasets", []) + output_datasets = io.get("aspect", {}).get("json", {}).get("outputDatasets", []) + if input_datasets or output_datasets: + dataset_lineage[entity_urn] = { + "inputs": input_datasets, + "outputs": output_datasets, + } + + # Find TransformInMain activity's lineage + transform_lineage = None + for urn, lineage in dataset_lineage.items(): + if "TransformInMain" in urn: + transform_lineage = lineage + break + + assert transform_lineage is not None, ( + f"TransformInMain activity should have dataset lineage. " + f"Available URNs: {list(dataset_lineage.keys())}" + ) + + # TransformInMain should read from BlobStagingCustomers (blob storage) + assert len(transform_lineage["inputs"]) >= 1, ( + "TransformInMain should have at least 1 input dataset" + ) + # The URN uses platform and dataset path from typeProperties, not the ADF dataset name + # BlobStagingCustomers maps to abs platform with path staging/customers + assert any( + "abs" in urn or "staging" in urn for urn in transform_lineage["inputs"] + ), f"TransformInMain should read from blob storage: {transform_lineage['inputs']}" + + # TransformInMain should write to SynapseSalesTable (mssql platform) + assert len(transform_lineage["outputs"]) >= 1, ( + "TransformInMain should have at least 1 output dataset" + ) + # SynapseSalesTable maps to mssql platform (Synapse uses mssql protocol) + assert any( + "mssql" in urn or "Sales" in urn for urn in transform_lineage["outputs"] + ), ( + f"TransformInMain should write to Synapse (mssql): {transform_lineage['outputs']}" + ) + + # ========================================================================= + # Verify Both Lineage Types Coexist + # ========================================================================= + # We should have at least 3 DataJobInputOutput aspects: + # - 2 for child pipelines' first activities (inputDatajobs from pipeline lineage) + # - Several for Copy activities (inputDatasets/outputDatasets) + assert len(datajob_io) >= 3, ( + f"Expected at least 3 DataJobInputOutput aspects for mixed lineage, " + f"found: {len(datajob_io)}" + ) + + # Verify pipeline lineage exists (inputDatajobs) + pipeline_lineage_count = sum( + 1 + for io in datajob_io + if io.get("aspect", {}).get("json", {}).get("inputDatajobs", []) + ) + assert pipeline_lineage_count >= 2, ( + f"Expected at least 2 activities with pipeline lineage (inputDatajobs), " + f"found: {pipeline_lineage_count}" + ) + + # Verify dataset lineage exists (inputDatasets or outputDatasets) + dataset_lineage_count = sum( + 1 + for io in datajob_io + if io.get("aspect", {}).get("json", {}).get("inputDatasets", []) + or io.get("aspect", {}).get("json", {}).get("outputDatasets", []) + ) + assert dataset_lineage_count >= 3, ( + f"Expected at least 3 activities with dataset lineage, " + f"found: {dataset_lineage_count}" + ) + + +@freeze_time(FROZEN_TIME) +@pytest.mark.integration +def test_mixed_dependencies_golden(pytestconfig, tmp_path): + """Golden file test for mixed pipeline and dataset dependencies. + + This golden test validates the complete output when a pipeline has both: + 1. ExecutePipeline activities (pipeline-to-pipeline lineage) + 2. 
Copy activities with dataset inputs/outputs (dataset lineage) + + The golden file captures: + - Container for the factory + - DataFlow entities for all 3 pipelines + - DataJob entities for all 5 activities + - DataJobInputOutput aspects showing both pipeline and dataset lineage + - Browse paths and custom properties + """ + from tests.integration.azure_data_factory.complex_mocks import ( + create_mixed_dependencies_scenario, + ) + + test_resources_dir = pytestconfig.rootpath / "tests/integration/azure_data_factory" + scenario = create_mixed_dependencies_scenario() + + output_file = tmp_path / "adf_mixed_deps_events.json" + golden_file = test_resources_dir / "adf_mixed_deps_golden.json" + + mock_client = create_mock_client( + pipelines=scenario["pipelines"], + datasets=create_complex_datasets(), + linked_services=create_complex_linked_services(), + ) + + config = { + "run_id": "adf-mixed-deps-test", + "source": { + "type": "azure-data-factory", + "config": { + "subscription_id": SUBSCRIPTION_ID, + "resource_group": RESOURCE_GROUP, + "credential": {"authentication_method": "default"}, + "env": "DEV", + "include_lineage": True, + }, + }, + "sink": {"type": "file", "config": {"filename": str(output_file)}}, + } + + with mock.patch( + "datahub.ingestion.source.azure_data_factory.adf_client.DataFactoryManagementClient" + ) as MockClientClass: + MockClientClass.return_value = mock_client + + with mock.patch( + "datahub.ingestion.source.azure.azure_auth.DefaultAzureCredential" + ): + pipeline = Pipeline.create(config) + pipeline.run() + pipeline.raise_from_status() + + mce_helpers.check_golden_file( + pytestconfig, + output_path=str(output_file), + golden_path=str(golden_file), + ) diff --git a/metadata-ingestion/tests/unit/azure_data_factory/__init__.py b/metadata-ingestion/tests/unit/azure_data_factory/__init__.py new file mode 100644 index 00000000000000..a371633468077c --- /dev/null +++ b/metadata-ingestion/tests/unit/azure_data_factory/__init__.py @@ -0,0 +1 @@ +"""Unit tests for Azure Data Factory connector.""" diff --git a/metadata-ingestion/tests/unit/azure_data_factory/test_adf_config.py b/metadata-ingestion/tests/unit/azure_data_factory/test_adf_config.py new file mode 100644 index 00000000000000..f9d4f2e72f54a2 --- /dev/null +++ b/metadata-ingestion/tests/unit/azure_data_factory/test_adf_config.py @@ -0,0 +1,162 @@ +"""Unit tests for Azure Data Factory configuration. 
+ +Following the accelerator guidelines, we test: +- Configuration VALIDATION logic (required fields, bounds checking) +- Configuration INTERACTION logic (combinations of fields) + +We do NOT test: +- Default configuration values (anti-pattern) +- Simple getters/setters +- Pydantic framework behavior +""" + +import pytest +from pydantic import ValidationError + +from datahub.ingestion.source.azure.azure_auth import ( + AzureAuthenticationMethod, + AzureCredentialConfig, +) +from datahub.ingestion.source.azure_data_factory.adf_config import ( + AzureDataFactoryConfig, +) + + +class TestAzureCredentialConfigValidation: + """Tests for AzureCredentialConfig validation logic.""" + + def test_service_principal_requires_client_secret(self) -> None: + """Service principal auth should fail without client_secret.""" + with pytest.raises(ValidationError) as exc_info: + AzureCredentialConfig( + authentication_method=AzureAuthenticationMethod.SERVICE_PRINCIPAL, + client_id="test-client-id", + tenant_id="test-tenant-id", + # Missing client_secret + ) + assert "client_secret" in str(exc_info.value) + + def test_service_principal_requires_tenant_id(self) -> None: + """Service principal auth should fail without tenant_id.""" + with pytest.raises(ValidationError) as exc_info: + AzureCredentialConfig( + authentication_method=AzureAuthenticationMethod.SERVICE_PRINCIPAL, + client_id="test-client-id", + client_secret="test-secret", + # Missing tenant_id + ) + assert "tenant_id" in str(exc_info.value) + + def test_service_principal_requires_client_id(self) -> None: + """Service principal auth should fail without client_id.""" + with pytest.raises(ValidationError) as exc_info: + AzureCredentialConfig( + authentication_method=AzureAuthenticationMethod.SERVICE_PRINCIPAL, + client_secret="test-secret", + tenant_id="test-tenant-id", + # Missing client_id + ) + assert "client_id" in str(exc_info.value) + + def test_service_principal_valid_when_all_fields_present(self) -> None: + """Service principal should pass validation with all required fields.""" + # Should not raise + config = AzureCredentialConfig( + authentication_method=AzureAuthenticationMethod.SERVICE_PRINCIPAL, + client_id="test-client-id", + client_secret="test-client-secret", + tenant_id="test-tenant-id", + ) + # Verify config was created (not testing values, testing validation passed) + assert ( + config.authentication_method == AzureAuthenticationMethod.SERVICE_PRINCIPAL + ) + + +class TestAzureDataFactoryConfigValidation: + """Tests for AzureDataFactoryConfig validation logic.""" + + def test_execution_history_days_minimum_bound(self) -> None: + """execution_history_days should reject values below 1.""" + with pytest.raises(ValidationError): + AzureDataFactoryConfig( + subscription_id="test", + execution_history_days=0, # Below minimum + ) + + def test_execution_history_days_maximum_bound(self) -> None: + """execution_history_days should reject values above 90.""" + with pytest.raises(ValidationError): + AzureDataFactoryConfig( + subscription_id="test", + execution_history_days=91, # Above maximum + ) + + def test_execution_history_days_accepts_boundary_values(self) -> None: + """execution_history_days should accept boundary values (1 and 90).""" + # Should not raise + config_min = AzureDataFactoryConfig( + subscription_id="test", + execution_history_days=1, + ) + assert config_min.execution_history_days == 1 + + config_max = AzureDataFactoryConfig( + subscription_id="test", + execution_history_days=90, + ) + assert config_max.execution_history_days == 
90 + + def test_subscription_id_required(self) -> None: + """Config should fail without subscription_id.""" + with pytest.raises(ValidationError): + AzureDataFactoryConfig() # type: ignore[call-arg] + + def test_factory_pattern_deny_filters_correctly(self) -> None: + """Factory pattern deny should filter matching factories.""" + config = AzureDataFactoryConfig( + subscription_id="test", + factory_pattern={"allow": [".*"], "deny": [".*-test$", "dev-.*"]}, + ) + + # Test that pattern matching works as expected + assert config.factory_pattern.allowed("prod-factory") + assert not config.factory_pattern.allowed("prod-test") + assert not config.factory_pattern.allowed("dev-factory") + + def test_pipeline_pattern_filtering(self) -> None: + """Pipeline pattern should filter pipelines correctly.""" + config = AzureDataFactoryConfig( + subscription_id="test", + pipeline_pattern={"allow": ["^prod_.*"], "deny": [".*_backup$"]}, + ) + + # Test filtering logic + assert config.pipeline_pattern.allowed("prod_ingestion") + assert config.pipeline_pattern.allowed("prod_transform") + assert not config.pipeline_pattern.allowed("dev_ingestion") + assert not config.pipeline_pattern.allowed("prod_backup") + + +class TestCredentialConfigInteraction: + """Tests for how credential config interacts with main config.""" + + def test_service_principal_credential_embedded_in_config(self) -> None: + """Service principal credential should integrate with main config.""" + config = AzureDataFactoryConfig( + subscription_id="test-subscription", + credential=AzureCredentialConfig( + authentication_method=AzureAuthenticationMethod.SERVICE_PRINCIPAL, + client_id="test-client", + client_secret="test-secret", + tenant_id="test-tenant", + ), + ) + + # Verify credential is properly set + assert ( + config.credential.authentication_method + == AzureAuthenticationMethod.SERVICE_PRINCIPAL + ) + assert config.credential.client_id == "test-client" + assert config.credential.tenant_id == "test-tenant" diff --git a/metadata-ingestion/tests/unit/azure_data_factory/test_adf_source.py b/metadata-ingestion/tests/unit/azure_data_factory/test_adf_source.py new file mode 100644 index 00000000000000..73a1c0f026c0c6 --- /dev/null +++ b/metadata-ingestion/tests/unit/azure_data_factory/test_adf_source.py @@ -0,0 +1,449 @@ +"""Unit tests for Azure Data Factory source - business logic only. + +Following the accelerator guidelines, we test: +- Platform mapping logic (linked service type -> DataHub platform) +- Activity subtype mapping +- Table name extraction from dataset properties +- Run status mapping +- Lineage extraction logic patterns + +We do NOT test: +- Trivial getters/setters +- Third-party library behavior +- Pydantic validation (covered by test_adf_config.py) +""" + +from datahub.api.entities.dataprocess.dataprocess_instance import InstanceRunResult +from datahub.ingestion.source.azure_data_factory.adf_source import ( + ACTIVITY_SUBTYPE_MAP, + LINKED_SERVICE_PLATFORM_MAP, +) + + +class TestLinkedServicePlatformMapping: + """Tests for linked service to DataHub platform mapping. + + This is critical business logic - incorrect mapping would create + lineage to wrong platform URNs. 
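+
+    A sketch of the mapping shape these tests exercise (entries mirrored from
+    the assertions below; the real map covers more services):
+
+        LINKED_SERVICE_PLATFORM_MAP = {
+            "AzureSqlDatabase": "mssql",
+            "AzureBlobStorage": "abs",
+            "Snowflake": "snowflake",
+            ...
+        }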
+ """ + + def test_azure_sql_variants_map_to_mssql(self) -> None: + """All Azure SQL variants should map to mssql platform.""" + azure_sql_types = ["AzureSqlDatabase", "AzureSqlMI", "SqlServer"] + for sql_type in azure_sql_types: + assert LINKED_SERVICE_PLATFORM_MAP.get(sql_type) == "mssql", ( + f"{sql_type} should map to 'mssql'" + ) + + def test_synapse_variants_map_to_mssql(self) -> None: + """Azure Synapse variants should map to mssql platform (same protocol).""" + synapse_types = ["AzureSynapseAnalytics", "AzureSqlDW"] + for synapse_type in synapse_types: + assert LINKED_SERVICE_PLATFORM_MAP.get(synapse_type) == "mssql", ( + f"{synapse_type} should map to 'mssql'" + ) + + def test_databricks_variants_map_correctly(self) -> None: + """Databricks services should all map to databricks platform.""" + databricks_types = ["AzureDatabricks", "AzureDatabricksDeltaLake"] + for db_type in databricks_types: + assert LINKED_SERVICE_PLATFORM_MAP.get(db_type) == "databricks", ( + f"{db_type} should map to 'databricks'" + ) + + def test_azure_storage_types_map_to_abs_platform(self) -> None: + """All Azure storage types should map to abs (Azure Blob Storage) platform.""" + assert LINKED_SERVICE_PLATFORM_MAP["AzureBlobStorage"] == "abs" + assert LINKED_SERVICE_PLATFORM_MAP["AzureBlobFS"] == "abs" + assert LINKED_SERVICE_PLATFORM_MAP["AzureDataLakeStore"] == "abs" + + def test_major_cloud_databases_covered(self) -> None: + """Major cloud databases should be mapped.""" + major_databases = { + "Snowflake": "snowflake", + "GoogleBigQuery": "bigquery", + "AmazonRedshift": "redshift", + } + for service_type, expected_platform in major_databases.items(): + assert LINKED_SERVICE_PLATFORM_MAP.get(service_type) == expected_platform + + def test_common_open_source_databases_covered(self) -> None: + """Common OSS databases should be mapped.""" + oss_databases = { + "PostgreSql": "postgres", + "MySql": "mysql", + "Oracle": "oracle", + "CosmosDbMongoDbApi": "mongodb", # MongoDB via Cosmos DB API + } + for service_type, expected_platform in oss_databases.items(): + assert LINKED_SERVICE_PLATFORM_MAP.get(service_type) == expected_platform + + def test_unknown_service_type_returns_none(self) -> None: + """Unknown service types should return None (not raise).""" + assert LINKED_SERVICE_PLATFORM_MAP.get("UnknownServiceType") is None + assert LINKED_SERVICE_PLATFORM_MAP.get("CustomConnector") is None + + +class TestActivitySubtypeMapping: + """Tests for activity type to subtype mapping. + + Subtypes affect how activities appear in the UI and their grouping. 
+ """ + + def test_copy_activity_subtype(self) -> None: + """Copy activity should have descriptive subtype.""" + assert ACTIVITY_SUBTYPE_MAP["Copy"] == "Copy Activity" + + def test_dataflow_activities_grouped_together(self) -> None: + """Both DataFlow and ExecuteDataFlow should have same subtype.""" + assert ACTIVITY_SUBTYPE_MAP["DataFlow"] == "Data Flow Activity" + assert ACTIVITY_SUBTYPE_MAP["ExecuteDataFlow"] == "Data Flow Activity" + + def test_control_flow_activities_have_descriptive_names(self) -> None: + """Control flow activities should have user-friendly subtypes.""" + control_flow_map = { + "IfCondition": "If Condition", + "ForEach": "ForEach Loop", + "Until": "Until Loop", + "Switch": "Switch Activity", + "Wait": "Wait Activity", + } + for activity_type, expected_subtype in control_flow_map.items(): + assert ACTIVITY_SUBTYPE_MAP.get(activity_type) == expected_subtype + + def test_databricks_activities_identifiable(self) -> None: + """Databricks activities should be clearly identified.""" + databricks_activities = [ + "DatabricksNotebook", + "DatabricksSparkJar", + "DatabricksSparkPython", + ] + for activity in databricks_activities: + subtype = ACTIVITY_SUBTYPE_MAP.get(activity) + assert subtype is not None + assert "Databricks" in subtype + + +class TestTableNameExtractionLogic: + """Tests for the logic patterns used in table name extraction. + + These tests verify the extraction logic that would be used in + _extract_table_name without needing a full source instance. + """ + + def test_extract_simple_table_name(self) -> None: + """Should extract tableName property directly.""" + type_props = {"tableName": "dbo.customers"} + # Logic pattern from _extract_table_name + table_name = type_props.get("tableName") + assert table_name == "dbo.customers" + + def test_combine_schema_and_table(self) -> None: + """Should combine separate schema and table fields.""" + type_props = {"schema": "sales", "table": "orders"} + # Logic pattern from _extract_table_name + schema = type_props.get("schema", "") + table = type_props.get("table", "") + result = f"{schema}.{table}" if schema and table else table or schema + assert result == "sales.orders" + + def test_schema_only_returns_schema(self) -> None: + """Should return schema when table is missing.""" + type_props = {"schema": "dbo"} + schema = type_props.get("schema", "") + table = type_props.get("table", "") + result = f"{schema}.{table}" if schema and table else table or schema + assert result == "dbo" + + def test_table_only_returns_table(self) -> None: + """Should return table when schema is missing.""" + type_props = {"table": "orders"} + schema = type_props.get("schema", "") + table = type_props.get("table", "") + result = f"{schema}.{table}" if schema and table else table or schema + assert result == "orders" + + +class TestFilePathExtractionLogic: + """Tests for file path extraction from dataset properties.""" + + def test_combine_folder_and_filename(self) -> None: + """Should combine folderPath and fileName.""" + type_props = {"folderPath": "raw/data", "fileName": "file.csv"} + folder = type_props.get("folderPath", "") + filename = type_props.get("fileName", "") + result = f"{folder}/{filename}" if folder and filename else filename or folder + assert result == "raw/data/file.csv" + + def test_folder_only_returns_folder(self) -> None: + """Should return folder when filename is missing.""" + type_props = {"folderPath": "raw/data"} + folder = type_props.get("folderPath", "") + filename = type_props.get("fileName", "") + result = 
f"{folder}/{filename}" if folder and filename else filename or folder + assert result == "raw/data" + + def test_nested_location_extraction(self) -> None: + """Should extract path components from nested location object.""" + type_props = { + "location": { + "container": "mycontainer", + "folderPath": "data/raw", + "fileName": "output.parquet", + } + } + location = type_props.get("location", {}) + if isinstance(location, dict): + container = location.get("container", "") + folder = location.get("folderPath", "") + filename = location.get("fileName", "") + parts = [p for p in [container, folder, filename] if p] + result = "/".join(parts) if parts else None + else: + result = None + assert result == "mycontainer/data/raw/output.parquet" + + +class TestRunStatusMapping: + """Tests for mapping ADF run status to DataHub InstanceRunResult.""" + + def test_succeeded_maps_to_success(self) -> None: + """Succeeded status should map to SUCCESS result.""" + status_map = { + "Succeeded": InstanceRunResult.SUCCESS, + "Failed": InstanceRunResult.FAILURE, + "Cancelled": InstanceRunResult.SKIPPED, + } + assert status_map["Succeeded"] == InstanceRunResult.SUCCESS + + def test_failed_maps_to_failure(self) -> None: + """Failed status should map to FAILURE result.""" + status_map = { + "Succeeded": InstanceRunResult.SUCCESS, + "Failed": InstanceRunResult.FAILURE, + "Cancelled": InstanceRunResult.SKIPPED, + } + assert status_map["Failed"] == InstanceRunResult.FAILURE + + def test_cancelled_maps_to_skipped(self) -> None: + """Cancelled status should map to SKIPPED result.""" + status_map = { + "Cancelled": InstanceRunResult.SKIPPED, + } + assert status_map["Cancelled"] == InstanceRunResult.SKIPPED + + def test_in_progress_should_return_none(self) -> None: + """In-progress statuses should not have a final result.""" + incomplete_statuses = ["InProgress", "Queued", "Cancelling"] + status_map = { + "InProgress": None, + "Queued": None, + "Cancelling": None, + } + for status in incomplete_statuses: + assert status_map.get(status) is None + + +class TestResourceGroupExtractionLogic: + """Tests for extracting resource group from Azure resource ID.""" + + def test_extract_from_standard_resource_id(self) -> None: + """Should extract resource group from standard Azure resource ID.""" + resource_id = ( + "/subscriptions/12345678-1234-1234-1234-123456789012" + "/resourceGroups/my-resource-group" + "/providers/Microsoft.DataFactory/factories/my-factory" + ) + parts = resource_id.split("/") + rg_index = parts.index("resourceGroups") + resource_group = parts[rg_index + 1] + assert resource_group == "my-resource-group" + + def test_extract_with_complex_resource_group_name(self) -> None: + """Should handle resource groups with hyphens, underscores, and numbers.""" + test_cases = [ + ("prod-data-rg-001", "prod-data-rg-001"), + ("RG_Production_123", "RG_Production_123"), + ("simple", "simple"), + ] + for rg_name, expected in test_cases: + resource_id = ( + f"/subscriptions/00000000-0000-0000-0000-000000000000" + f"/resourceGroups/{rg_name}" + f"/providers/Microsoft.DataFactory/factories/factory1" + ) + parts = resource_id.split("/") + rg_index = parts.index("resourceGroups") + extracted = parts[rg_index + 1] + assert extracted == expected + + +class TestActivityRunPropertyExtraction: + """Tests for activity run property extraction logic. + + Activity runs create DataProcessInstance entities linked to DataJobs. + These tests verify the property extraction patterns. 
+ """ + + def test_activity_run_properties_extracted(self) -> None: + """Verify essential activity run properties are extracted.""" + activity_run = { + "activityRunId": "act-run-123", + "activityName": "CopyData", + "activityType": "Copy", + "pipelineRunId": "pipe-run-456", + "status": "Succeeded", + "durationInMs": 45000, + } + + # Logic pattern from _emit_activity_runs + properties: dict[str, str] = { + "activity_run_id": activity_run["activityRunId"], + "activity_type": activity_run["activityType"], + "pipeline_run_id": activity_run["pipelineRunId"], + "status": activity_run["status"], + } + + if activity_run.get("durationInMs") is not None: + properties["duration_ms"] = str(activity_run["durationInMs"]) + + assert properties["activity_run_id"] == "act-run-123" + assert properties["activity_type"] == "Copy" + assert properties["pipeline_run_id"] == "pipe-run-456" + assert properties["status"] == "Succeeded" + assert properties["duration_ms"] == "45000" + + def test_activity_run_error_truncated(self) -> None: + """Verify error messages are truncated to prevent oversized properties.""" + MAX_RUN_MESSAGE_LENGTH = 500 + long_error = "E" * 1000 # 1000 character error + + activity_run = { + "activityRunId": "act-run-err", + "error": {"message": long_error}, + } + + # Logic pattern from _emit_activity_runs + error = activity_run.get("error", {}) + if error: + error_msg = str(error.get("message", "")) + if error_msg: + truncated = error_msg[:MAX_RUN_MESSAGE_LENGTH] + + assert len(truncated) == MAX_RUN_MESSAGE_LENGTH + assert len(truncated) < len(long_error) + + def test_activity_run_missing_optional_fields(self) -> None: + """Verify graceful handling of missing optional fields.""" + activity_run = { + "activityRunId": "act-run-minimal", + "activityName": "MinimalActivity", + "activityType": "Copy", + "pipelineRunId": "pipe-run-789", + "status": "Succeeded", + # No durationInMs, error, input, output + } + + properties: dict[str, str] = { + "activity_run_id": activity_run["activityRunId"], + "activity_type": activity_run["activityType"], + "pipeline_run_id": activity_run["pipelineRunId"], + "status": activity_run["status"], + } + + # Optional fields should not cause errors + if activity_run.get("durationInMs") is not None: + properties["duration_ms"] = str(activity_run["durationInMs"]) + + error = activity_run.get("error") + if error: + error_msg = str(error.get("message", "")) + if error_msg: + properties["error"] = error_msg[:500] + + assert "duration_ms" not in properties + assert "error" not in properties + assert len(properties) == 4 + + +class TestActivityRunToDataJobUrnMapping: + """Tests for mapping activity runs to DataJob URNs. + + Activity runs must link to DataJob URNs (not DataFlow URNs) so the + Runs tab appears on DataJob pages in the UI. 
+ """ + + def test_datajob_urn_constructed_from_activity_run(self) -> None: + """DataJob URN should use activity name as job_id.""" + from datahub.metadata.urns import DataFlowUrn, DataJobUrn + + factory_name = "my-factory" + pipeline_name = "DataPipeline" + activity_name = "CopyActivity" + env = "PROD" + platform = "azure-data-factory" + + # Logic pattern from _emit_activity_runs + flow_name = f"{factory_name}.{pipeline_name}" + flow_urn = DataFlowUrn.create_from_ids( + orchestrator=platform, + flow_id=flow_name, + env=env, + ) + job_urn = DataJobUrn.create_from_ids( + data_flow_urn=str(flow_urn), + job_id=activity_name, + ) + + # Verify URN structure + assert "dataJob" in str(job_urn) + assert activity_name in str(job_urn) + assert flow_name in str(job_urn) + assert platform in str(job_urn) + + def test_activity_run_links_to_datajob_not_dataflow(self) -> None: + """Verify activity runs link to DataJob, enabling the Runs tab in UI.""" + from datahub.metadata.urns import DataFlowUrn, DataJobUrn + + flow_urn = DataFlowUrn.create_from_ids( + orchestrator="azure-data-factory", + flow_id="factory.pipeline", + env="PROD", + ) + job_urn = DataJobUrn.create_from_ids( + data_flow_urn=str(flow_urn), + job_id="MyActivity", + ) + + # The URN type should be dataJob, not dataFlow + assert job_urn.entity_type == "dataJob" + assert flow_urn.entity_type == "dataFlow" + + # The job URN should reference the flow URN + assert str(flow_urn) in str(job_urn) + + def test_multiple_activities_get_unique_urns(self) -> None: + """Each activity in a pipeline should have a unique DataJob URN.""" + from datahub.metadata.urns import DataFlowUrn, DataJobUrn + + flow_urn = DataFlowUrn.create_from_ids( + orchestrator="azure-data-factory", + flow_id="factory.pipeline", + env="PROD", + ) + + activities = ["CopyData", "TransformData", "LoadData"] + job_urns = [ + DataJobUrn.create_from_ids( + data_flow_urn=str(flow_urn), + job_id=activity, + ) + for activity in activities + ] + + # All URNs should be unique + assert len(set(str(u) for u in job_urns)) == len(activities) + + # Each URN should contain its activity name + for activity, urn in zip(activities, job_urns): + assert activity in str(urn) diff --git a/metadata-service/configuration/src/main/resources/bootstrap_mcps.yaml b/metadata-service/configuration/src/main/resources/bootstrap_mcps.yaml index 24f638402019b3..7f77c7867e3b98 100644 --- a/metadata-service/configuration/src/main/resources/bootstrap_mcps.yaml +++ b/metadata-service/configuration/src/main/resources/bootstrap_mcps.yaml @@ -13,7 +13,7 @@ bootstrap: mcps_location: "bootstrap_mcps/root-user.yaml" - name: data-platforms - version: v4 + version: v5 blocking: true async: false mcps_location: "bootstrap_mcps/data-platforms.yaml" diff --git a/metadata-service/configuration/src/main/resources/bootstrap_mcps/data-platforms.yaml b/metadata-service/configuration/src/main/resources/bootstrap_mcps/data-platforms.yaml index 9882c6af4537c9..084e664aa094bf 100644 --- a/metadata-service/configuration/src/main/resources/bootstrap_mcps/data-platforms.yaml +++ b/metadata-service/configuration/src/main/resources/bootstrap_mcps/data-platforms.yaml @@ -21,6 +21,16 @@ displayName: Azure Data Lake (Gen 2) type: FILE_SYSTEM logoUrl: "assets/platforms/adlslogo.png" +- entityUrn: urn:li:dataPlatform:azure-data-factory + entityType: dataPlatform + aspectName: dataPlatformInfo + changeType: UPSERT + aspect: + datasetNameDelimiter: "." 
+ name: azure-data-factory + displayName: Azure Data Factory + type: OTHERS + logoUrl: "assets/platforms/azuredatafactorylogo.svg" - entityUrn: urn:li:dataPlatform:airflow entityType: dataPlatform aspectName: dataPlatformInfo
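As a quick sanity check on the new bootstrap entry (a sketch, not part of the diff; it assumes only the SDK's standard urn helpers), the registered platform URN should round-trip through the Python SDK:

```python
from datahub.metadata.urns import DataPlatformUrn

# The bootstrap entry above registers urn:li:dataPlatform:azure-data-factory;
# the SDK helper should produce the identical string.
platform_urn = DataPlatformUrn("azure-data-factory")
assert str(platform_urn) == "urn:li:dataPlatform:azure-data-factory"
```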