From bdd75f816655477b14fb16c089cfad08d01a3bc8 Mon Sep 17 00:00:00 2001 From: Anush Kumar Date: Mon, 8 Dec 2025 16:00:01 -0800 Subject: [PATCH 01/13] feat(azure-data-factory): add Azure Data Factory connector for metadata ingestion - Implemented a new connector to extract metadata from Azure Data Factory, including Data Factories, Pipelines, Activities, and Dataset lineage. - Added support for multiple authentication methods: Service Principal, Managed Identity, Azure CLI, and DefaultAzureCredential. - Introduced configuration options for filtering factories and pipelines, as well as options for including execution history and lineage extraction. - Created comprehensive documentation and example recipes for easy setup and usage. - Added integration and unit tests to ensure functionality and reliability of the connector. --- .../docs/sources/azure_data_factory/README.md | 80 ++ .../azure_data_factory_pre.md | 214 +++ .../azure_data_factory_recipe.yml | 59 + metadata-ingestion/setup.py | 7 + .../ingestion/source/azure/azure_auth.py | 184 +++ .../source/azure_data_factory/__init__.py | 22 + .../source/azure_data_factory/adf_client.py | 435 +++++++ .../source/azure_data_factory/adf_config.py | 158 +++ .../source/azure_data_factory/adf_models.py | 536 ++++++++ .../source/azure_data_factory/adf_report.py | 123 ++ .../source/azure_data_factory/adf_source.py | 1073 +++++++++++++++ .../azure_data_factory/__init__.py | 1 + .../azure_data_factory/adf_basic_golden.json | 775 +++++++++++ .../adf_platform_instance_golden.json | 812 ++++++++++++ .../adf_with_runs_golden.json | 1150 +++++++++++++++++ .../azure_data_factory/test_adf_source.py | 608 +++++++++ .../tests/unit/azure_data_factory/__init__.py | 1 + .../azure_data_factory/test_adf_config.py | 162 +++ .../azure_data_factory/test_adf_source.py | 278 ++++ 19 files changed, 6678 insertions(+) create mode 100644 metadata-ingestion/docs/sources/azure_data_factory/README.md create mode 100644 
metadata-ingestion/docs/sources/azure_data_factory/azure_data_factory_pre.md create mode 100644 metadata-ingestion/docs/sources/azure_data_factory/azure_data_factory_recipe.yml create mode 100644 metadata-ingestion/src/datahub/ingestion/source/azure/azure_auth.py create mode 100644 metadata-ingestion/src/datahub/ingestion/source/azure_data_factory/__init__.py create mode 100644 metadata-ingestion/src/datahub/ingestion/source/azure_data_factory/adf_client.py create mode 100644 metadata-ingestion/src/datahub/ingestion/source/azure_data_factory/adf_config.py create mode 100644 metadata-ingestion/src/datahub/ingestion/source/azure_data_factory/adf_models.py create mode 100644 metadata-ingestion/src/datahub/ingestion/source/azure_data_factory/adf_report.py create mode 100644 metadata-ingestion/src/datahub/ingestion/source/azure_data_factory/adf_source.py create mode 100644 metadata-ingestion/tests/integration/azure_data_factory/__init__.py create mode 100644 metadata-ingestion/tests/integration/azure_data_factory/adf_basic_golden.json create mode 100644 metadata-ingestion/tests/integration/azure_data_factory/adf_platform_instance_golden.json create mode 100644 metadata-ingestion/tests/integration/azure_data_factory/adf_with_runs_golden.json create mode 100644 metadata-ingestion/tests/integration/azure_data_factory/test_adf_source.py create mode 100644 metadata-ingestion/tests/unit/azure_data_factory/__init__.py create mode 100644 metadata-ingestion/tests/unit/azure_data_factory/test_adf_config.py create mode 100644 metadata-ingestion/tests/unit/azure_data_factory/test_adf_source.py diff --git a/metadata-ingestion/docs/sources/azure_data_factory/README.md b/metadata-ingestion/docs/sources/azure_data_factory/README.md new file mode 100644 index 00000000000000..407ed8bbd83e0e --- /dev/null +++ b/metadata-ingestion/docs/sources/azure_data_factory/README.md @@ -0,0 +1,80 @@ +# Azure Data Factory + +For context on getting started with ingestion, check out our [metadata 
ingestion guide](../../../../metadata-ingestion/README.md). + +## Setup + +To install this plugin, run `pip install 'acryl-datahub[azure-data-factory]'`. + +## Quickstart Recipe + +```yaml +source: + type: azure-data-factory + config: + # Required + subscription_id: ${AZURE_SUBSCRIPTION_ID} + + # Authentication (service principal) + credential: + authentication_method: service_principal + client_id: ${AZURE_CLIENT_ID} + client_secret: ${AZURE_CLIENT_SECRET} + tenant_id: ${AZURE_TENANT_ID} + + # Optional filters + factory_pattern: + allow: ["prod-.*"] + + # Features + include_lineage: true + include_execution_history: false + + env: PROD + +sink: + type: datahub-rest + config: + server: "http://localhost:8080" +``` + +## Authentication Methods + +| Method | Config Value | Use Case | +| ----------------- | ------------------- | ----------------- | +| Service Principal | `service_principal` | Production | +| Managed Identity | `managed_identity` | Azure-hosted | +| Azure CLI | `cli` | Local development | +| Auto-detect | `default` | Flexible | + +## Config Details + +| Field | Required | Description | +| ---------------------------------- | -------- | ----------------------------------------- | +| `subscription_id` | ✅ | Azure subscription ID | +| `credential.authentication_method` | | Auth method (default: `default`) | +| `credential.client_id` | | App (client) ID for service principal | +| `credential.client_secret` | | Client secret for service principal | +| `credential.tenant_id` | | Tenant (directory) ID | +| `resource_group` | | Filter to specific resource group | +| `factory_pattern` | | Regex allow/deny for factories | +| `pipeline_pattern` | | Regex allow/deny for pipelines | +| `include_lineage` | | Extract lineage (default: `true`) | +| `include_execution_history` | | Extract pipeline runs (default: `false`) | +| `execution_history_days` | | Days of history, 1-90 (default: `7`) | +| `platform_instance_map` | | Map linked services to platform instances | +| 
`env` | | Environment (default: `PROD`) | + +## Entity Mapping + +| ADF Concept | DataHub Entity | +| ------------ | ------------------- | +| Data Factory | Container | +| Pipeline | DataFlow | +| Activity | DataJob | +| Dataset | Dataset | +| Pipeline Run | DataProcessInstance | + +## Questions + +If you've got any questions on configuring this source, feel free to ping us on [our Slack](https://slack.datahubproject.io/). diff --git a/metadata-ingestion/docs/sources/azure_data_factory/azure_data_factory_pre.md b/metadata-ingestion/docs/sources/azure_data_factory/azure_data_factory_pre.md new file mode 100644 index 00000000000000..c8e2d8062034a4 --- /dev/null +++ b/metadata-ingestion/docs/sources/azure_data_factory/azure_data_factory_pre.md @@ -0,0 +1,214 @@ +## Overview + +This connector extracts metadata from Azure Data Factory (ADF), including: + +- **Data Factories** as Containers +- **Pipelines** as DataFlows +- **Activities** as DataJobs (Copy, Data Flow, Lookup, etc.) +- **Lineage** between source and destination datasets +- **Execution History** as DataProcessInstance (optional) + +:::note Not Microsoft Fabric +This connector is for **Azure Data Factory** (classic), not Microsoft Fabric's Data Factory. Microsoft Fabric support is planned for a future release. 
+::: + +## Prerequisites + +### Azure Authentication + +The connector supports multiple authentication methods: + +| Method | Best For | Configuration | +| -------------------------- | ------------------------------------------------ | --------------------------------------------------- | +| **Service Principal** | Production environments | `authentication_method: service_principal` | +| **Managed Identity** | Azure-hosted deployments (VMs, AKS, App Service) | `authentication_method: managed_identity` | +| **Azure CLI** | Local development | `authentication_method: cli` (run `az login` first) | +| **DefaultAzureCredential** | Flexible environments | `authentication_method: default` | + +### Required Azure Permissions + +Grant one of the following roles to your identity on the Data Factory resources: + +| Role | Required For | +| ---------------------------- | ----------------------------------- | +| **Reader** | Basic metadata extraction | +| **Data Factory Contributor** | Full access including pipeline runs | + +To set up a service principal: + +1. Create an App Registration in Azure Portal > Microsoft Entra ID > App registrations +2. Create a client secret under Certificates & secrets +3. Grant the service principal the **Reader** or **Data Factory Contributor** role on your resource group or Data Factory + +## Concept Mapping + +| Azure Data Factory | DataHub Entity | SubType | +| ------------------ | ------------------------------------------------------------------------------------------------------ | ---------------------------- | +| Data Factory | [Container](https://docs.datahub.com/docs/generated/metamodel/entities/container/) | Data Factory | +| Pipeline | [DataFlow](https://docs.datahub.com/docs/generated/metamodel/entities/dataflow/) | Pipeline | +| Activity | [DataJob](https://docs.datahub.com/docs/generated/metamodel/entities/datajob/) | Copy, DataFlow, Lookup, etc. 
| +| Dataset | [Dataset](https://docs.datahub.com/docs/generated/metamodel/entities/dataset/) | Based on linked service type | +| Pipeline Run | [DataProcessInstance](https://docs.datahub.com/docs/generated/metamodel/entities/dataprocessinstance/) | - | + +## Capabilities + +| Capability | Status | Notes | +| --------------------- | ------ | ------------------------------------------- | +| Platform Instance | ✅ | Enabled by default | +| Containers | ✅ | Data Factories as containers | +| Lineage (Table-level) | ✅ | From activity inputs/outputs and Data Flows | +| Data Flow Scripts | ✅ | Stored as transformation logic | +| Execution History | ✅ | Optional, via `include_execution_history` | +| Stateful Ingestion | ✅ | Stale entity removal | + +## Lineage Extraction + +The connector extracts lineage from: + +1. **Copy Activities**: Maps input/output datasets to DataHub datasets +2. **Data Flow Activities**: Extracts sources and sinks from Data Flow definitions +3. **Lookup Activities**: Maps lookup datasets as inputs + +### Supported Linked Service Mappings + +| ADF Linked Service | DataHub Platform | +| ------------------------------------------ | -------------------- | +| AzureBlobStorage, AzureBlobFS | `azure_blob_storage` | +| AzureDataLakeStore, AzureDataLakeStoreGen2 | `azure_data_lake` | +| AzureSqlDatabase, AzureSqlDW | `mssql` | +| AzureSynapseAnalytics | `synapse` | +| Snowflake | `snowflake` | +| AmazonS3 | `s3` | +| GoogleBigQuery | `bigquery` | +| PostgreSql, AzurePostgreSql | `postgres` | +| MySql, AzureMySql | `mysql` | +| Oracle | `oracle` | +| Salesforce | `salesforce` | +| CosmosDb | `cosmos` | +| AzureDatabricks, DatabricksDeltaLake | `databricks` | + +### Platform Instance Mapping + +For accurate lineage resolution to existing datasets in DataHub, map linked service names to platform instances: + +```yaml +source: + type: azure-data-factory + config: + platform_instance_map: + "snowflake-prod-connection": "prod_snowflake" + 
"synapse-analytics-connection": "prod_synapse" +``` + +## Data Flow Scripts + +For activities that execute ADF Data Flows (mapping data flows), the connector extracts the Data Flow script and stores it as transformation logic on the DataJob entity. + +This enables: + +- Viewing the complete Data Flow transformation script in DataHub +- Understanding the data transformations applied by each Data Flow activity +- Searching for Data Flows by their transformation logic + +The script is stored in the `dataTransformLogic` aspect and is visible in the DataHub UI under the activity's details. + +## Execution History + +When `include_execution_history: true`, the connector extracts pipeline runs as `DataProcessInstance` entities: + +```yaml +source: + type: azure-data-factory + config: + include_execution_history: true + execution_history_days: 7 # 1-90 days +``` + +This provides: + +- Pipeline run status (Succeeded, Failed, Cancelled, In Progress) +- Run duration and timestamps +- Trigger information (who/what started the run) +- Run parameters + +## When to Use Platform Instance + +The `platform_instance` configuration is used to distinguish between **separate ADF deployments** (e.g., different Azure subscriptions or tenants), not for separating factories within the same deployment. 
+ +### When to Use `platform_instance` + +| Scenario | Example Configuration | +| -------------------------------- | ---------------------------------------- | +| **Multiple Azure Subscriptions** | Different subscriptions for prod vs dev | +| **Multi-Tenant Organizations** | Separate Azure tenants per business unit | +| **Multi-Region Deployments** | US-East vs EU-West deployments | + +**Example: Multiple Subscriptions** + +```yaml +# Production subscription +source: + type: azure-data-factory + config: + subscription_id: "prod-subscription-id" + platform_instance: "production" + +# Development subscription +source: + type: azure-data-factory + config: + subscription_id: "dev-subscription-id" + platform_instance: "development" +``` + +**Example: Multi-Region** + +```yaml +# US Region +source: + type: azure-data-factory + config: + subscription_id: "us-east-subscription" + platform_instance: "us-east" + +# EU Region +source: + type: azure-data-factory + config: + subscription_id: "eu-west-subscription" + platform_instance: "eu-west" +``` + +### When NOT to Use `platform_instance` + +- **Single subscription** - Factory names in URNs already provide uniqueness +- **Multiple factories in same subscription** - The factory name is included in the URN automatically +- **Same logical environment** - Don't use it just to differentiate factories + +:::note URN Uniqueness +The connector automatically includes the factory name in pipeline URNs (e.g., `my-factory.ETL-Pipeline`), so you don't need `platform_instance` to distinguish pipelines across factories within the same subscription. 
+::: + +## URN Format + +Pipeline URNs include the factory name for uniqueness across multiple factories: + +``` +urn:li:dataFlow:(azure_data_factory,{factory_name}.{pipeline_name},{env}) +``` + +Example: `urn:li:dataFlow:(azure_data_factory,my-factory.ETL-Pipeline,PROD)` + +Activity URNs reference their parent pipeline: + +``` +urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,{factory_name}.{pipeline_name},{env}),{activity_name}) +``` + +With `platform_instance` set, it's prepended to the URN: + +``` +urn:li:dataFlow:(azure_data_factory,{platform_instance}.{factory_name}.{pipeline_name},{env}) +``` + +Example: `urn:li:dataFlow:(azure_data_factory,production.my-factory.ETL-Pipeline,PROD)` diff --git a/metadata-ingestion/docs/sources/azure_data_factory/azure_data_factory_recipe.yml b/metadata-ingestion/docs/sources/azure_data_factory/azure_data_factory_recipe.yml new file mode 100644 index 00000000000000..175ecad33e8c27 --- /dev/null +++ b/metadata-ingestion/docs/sources/azure_data_factory/azure_data_factory_recipe.yml @@ -0,0 +1,59 @@ +# Example recipe for Azure Data Factory source +# See README.md for full configuration options + +source: + type: azure-data-factory + config: + # Required: Azure subscription containing Data Factories + subscription_id: ${AZURE_SUBSCRIPTION_ID} + + # Optional: Filter to specific resource group + # resource_group: my-resource-group + + # Authentication (using service principal) + credential: + authentication_method: service_principal + client_id: ${AZURE_CLIENT_ID} + client_secret: ${AZURE_CLIENT_SECRET} + tenant_id: ${AZURE_TENANT_ID} + + # Optional: Filter factories by name pattern + factory_pattern: + allow: + - ".*" # Allow all factories by default + deny: [] + + # Optional: Filter pipelines by name pattern + pipeline_pattern: + allow: + - ".*" # Allow all pipelines by default + deny: [] + + # Feature flags + include_lineage: true + include_column_lineage: false # Advanced: requires Data Flow parsing + 
include_execution_history: false # Set to true for pipeline run history + execution_history_days: 7 # Only used when include_execution_history is true + include_datasets: true + include_linked_services: true + include_triggers: true + + # Optional: Map linked services to platform instances for accurate lineage + # platform_instance_map: + # "my-snowflake-connection": "prod_snowflake" + + # Optional: Platform instance for this ADF connector + # platform_instance: "main-adf" + + # Environment + env: PROD + + # Optional: Stateful ingestion for stale entity removal + # stateful_ingestion: + # enabled: true + +sink: + type: datahub-rest + config: + server: "http://localhost:8080" + diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index e79c1f94857d5e..a3e50534b973e0 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -348,6 +348,11 @@ *path_spec_common, } +azure_data_factory = { + "azure-identity>=1.21.0", + "azure-mgmt-datafactory>=9.0.0", +} + data_lake_profiling = { "pydeequ>=1.1.0", "pyspark~=3.5.6", @@ -450,6 +455,7 @@ "tenacity!=8.4.0", }, "azure-ad": set(), + "azure-data-factory": azure_data_factory, "bigquery": sql_common | bigquery_common | sqlglot_lib @@ -817,6 +823,7 @@ "sqlalchemy = datahub.ingestion.source.sql.sql_generic:SQLAlchemyGenericSource", "athena = datahub.ingestion.source.sql.athena:AthenaSource", "azure-ad = datahub.ingestion.source.identity.azure_ad:AzureADSource", + "azure-data-factory = datahub.ingestion.source.azure_data_factory.adf_source:AzureDataFactorySource", "bigquery = datahub.ingestion.source.bigquery_v2.bigquery:BigqueryV2Source", "bigquery-queries = datahub.ingestion.source.bigquery_v2.bigquery_queries:BigQueryQueriesSource", "clickhouse = datahub.ingestion.source.sql.clickhouse:ClickHouseSource", diff --git a/metadata-ingestion/src/datahub/ingestion/source/azure/azure_auth.py b/metadata-ingestion/src/datahub/ingestion/source/azure/azure_auth.py new file mode 100644 index 
00000000000000..12007be8d3882b --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/azure/azure_auth.py @@ -0,0 +1,184 @@ +"""Unified Azure authentication module for DataHub connectors. + +This module provides a reusable authentication configuration that can be used +across all Azure connectors (ADF, Synapse, Fabric, etc.). + +Supports multiple authentication methods: +- Service Principal (client_id + client_secret + tenant_id) +- Managed Identity (system-assigned or user-assigned) +- Azure CLI credentials (for local development) +- DefaultAzureCredential (auto-detects environment) +""" + +from enum import Enum +from typing import Optional + +from azure.core.credentials import TokenCredential +from azure.identity import ( + AzureCliCredential, + ClientSecretCredential, + DefaultAzureCredential, + ManagedIdentityCredential, +) +from pydantic import Field, SecretStr, model_validator + +from datahub.configuration import ConfigModel + + +class AzureAuthenticationMethod(str, Enum): + """Supported Azure authentication methods. + + - DEFAULT: Uses DefaultAzureCredential which auto-detects credentials from + environment variables, managed identity, Azure CLI, etc. + - SERVICE_PRINCIPAL: Uses client ID, client secret, and tenant ID + - MANAGED_IDENTITY: Uses Azure Managed Identity (system or user-assigned) + - CLI: Uses Azure CLI credential (requires `az login`) + """ + + DEFAULT = "default" + SERVICE_PRINCIPAL = "service_principal" + MANAGED_IDENTITY = "managed_identity" + CLI = "cli" + + +class AzureCredentialConfig(ConfigModel): + """Unified Azure authentication configuration. + + This class provides a reusable authentication configuration that can be + composed into any Azure connector's configuration. It supports multiple + authentication methods and returns a TokenCredential that works with + any Azure SDK client. 
+ + Example usage in a connector config: + class MyAzureConnectorConfig(ConfigModel): + credential: AzureCredentialConfig = Field( + default_factory=AzureCredentialConfig, + description="Azure authentication configuration" + ) + subscription_id: str = Field(...) + """ + + authentication_method: AzureAuthenticationMethod = Field( + default=AzureAuthenticationMethod.DEFAULT, + description=( + "Authentication method to use. Options: " + "'default' (auto-detects from environment), " + "'service_principal' (client ID + secret + tenant), " + "'managed_identity' (Azure Managed Identity), " + "'cli' (Azure CLI credential). " + "Recommended: Use 'default' which tries multiple methods automatically." + ), + ) + + # Service Principal credentials (required when authentication_method = "service_principal") + client_id: Optional[str] = Field( + default=None, + description=( + "Azure Application (client) ID. Required for service_principal authentication. " + "Find this in Azure Portal > App registrations > Your app > Overview." + ), + ) + client_secret: Optional[SecretStr] = Field( + default=None, + description=( + "Azure client secret. Required for service_principal authentication. " + "Create in Azure Portal > App registrations > Your app > Certificates & secrets." + ), + ) + tenant_id: Optional[str] = Field( + default=None, + description=( + "Azure tenant (directory) ID. Required for service_principal authentication. " + "Find this in Azure Portal > Microsoft Entra ID > Overview." + ), + ) + + # Managed Identity options (optional, for user-assigned managed identity) + managed_identity_client_id: Optional[str] = Field( + default=None, + description=( + "Client ID for user-assigned managed identity. " + "Leave empty to use system-assigned managed identity. " + "Only used when authentication_method is 'managed_identity'." 
+ ), + ) + + # Additional options for DefaultAzureCredential + exclude_cli_credential: bool = Field( + default=False, + description=( + "When using 'default' authentication, exclude Azure CLI credential. " + "Useful in production to avoid accidentally using developer credentials." + ), + ) + exclude_environment_credential: bool = Field( + default=False, + description=( + "When using 'default' authentication, exclude environment variables. " + "Environment variables checked: AZURE_CLIENT_ID, AZURE_CLIENT_SECRET, AZURE_TENANT_ID." + ), + ) + exclude_managed_identity_credential: bool = Field( + default=False, + description=( + "When using 'default' authentication, exclude managed identity. " + "Useful during local development when managed identity is not available." + ), + ) + + def get_credential(self) -> TokenCredential: + """Get Azure credential based on the configured authentication method. + + Returns: + TokenCredential: An Azure credential object that can be used with + any Azure SDK client (e.g., DataFactoryManagementClient). + + Raises: + ValueError: If required credentials are missing for the chosen method. 
+ """ + if self.authentication_method == AzureAuthenticationMethod.SERVICE_PRINCIPAL: + if not self.client_secret: + raise ValueError( + "client_secret is required for service_principal authentication" + ) + # These are validated as required in validate_credentials() + assert self.tenant_id is not None + assert self.client_id is not None + return ClientSecretCredential( + tenant_id=self.tenant_id, + client_id=self.client_id, + client_secret=self.client_secret.get_secret_value(), + ) + + elif self.authentication_method == AzureAuthenticationMethod.MANAGED_IDENTITY: + return ManagedIdentityCredential(client_id=self.managed_identity_client_id) + + elif self.authentication_method == AzureAuthenticationMethod.CLI: + return AzureCliCredential() + + else: # DEFAULT + return DefaultAzureCredential( + exclude_cli_credential=self.exclude_cli_credential, + exclude_environment_credential=self.exclude_environment_credential, + exclude_managed_identity_credential=self.exclude_managed_identity_credential, + ) + + @model_validator(mode="after") + def validate_credentials(self) -> "AzureCredentialConfig": + """Validate that required credentials are provided for the chosen method.""" + if self.authentication_method == AzureAuthenticationMethod.SERVICE_PRINCIPAL: + missing = [] + if not self.client_id: + missing.append("client_id") + if not self.client_secret: + missing.append("client_secret") + if not self.tenant_id: + missing.append("tenant_id") + + if missing: + raise ValueError( + f"Service principal authentication requires: {', '.join(missing)}. " + f"These can be found in Azure Portal > App registrations." 
+ ) + + return self diff --git a/metadata-ingestion/src/datahub/ingestion/source/azure_data_factory/__init__.py b/metadata-ingestion/src/datahub/ingestion/source/azure_data_factory/__init__.py new file mode 100644 index 00000000000000..f3adb4ffcaf005 --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/azure_data_factory/__init__.py @@ -0,0 +1,22 @@ +"""Azure Data Factory DataHub connector. + +This package provides a connector to ingest metadata from Azure Data Factory +into DataHub, including: + +- Data Factories as Containers +- Pipelines as DataFlows +- Activities as DataJobs +- Dataset lineage +- Execution history (optional) + +Usage: + source: + type: azure-data-factory + config: + subscription_id: ${AZURE_SUBSCRIPTION_ID} + credential: + authentication_method: service_principal + client_id: ${AZURE_CLIENT_ID} + client_secret: ${AZURE_CLIENT_SECRET} + tenant_id: ${AZURE_TENANT_ID} +""" diff --git a/metadata-ingestion/src/datahub/ingestion/source/azure_data_factory/adf_client.py b/metadata-ingestion/src/datahub/ingestion/source/azure_data_factory/adf_client.py new file mode 100644 index 00000000000000..15802a2c62a9dd --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/azure_data_factory/adf_client.py @@ -0,0 +1,435 @@ +"""Azure Data Factory REST API client wrapper. + +This module provides a typed client for interacting with the Azure Data Factory +REST API. It handles authentication, pagination, and error handling. 
+ +API Documentation: https://learn.microsoft.com/en-us/rest/api/datafactory/ +""" + +import logging +from datetime import datetime, timedelta, timezone +from typing import Iterator, Optional + +from azure.core.credentials import TokenCredential +from azure.core.exceptions import HttpResponseError +from azure.mgmt.datafactory import DataFactoryManagementClient +from azure.mgmt.datafactory.models import ( + ActivityRunsQueryResponse, + PipelineRunsQueryResponse, + RunFilterParameters, +) + +from datahub.ingestion.source.azure_data_factory.adf_models import ( + ActivityRun, + DataFlow, + Dataset, + Factory, + LinkedService, + Pipeline, + PipelineRun, + Trigger, +) + +logger = logging.getLogger(__name__) + + +class AzureDataFactoryClient: + """Client for Azure Data Factory REST API. + + Uses the Azure SDK (azure-mgmt-datafactory) for type safety and + automatic pagination handling. + + API Reference: https://learn.microsoft.com/en-us/rest/api/datafactory/ + """ + + def __init__( + self, + credential: TokenCredential, + subscription_id: str, + ) -> None: + """Initialize the ADF client. + + Args: + credential: Azure credential for authentication (from AzureCredentialConfig) + subscription_id: Azure subscription ID containing Data Factories + """ + self.subscription_id = subscription_id + self._client = DataFactoryManagementClient( + credential=credential, + subscription_id=subscription_id, + ) + + def get_factories( + self, + resource_group: Optional[str] = None, + ) -> Iterator[Factory]: + """List all Data Factories. 
+ + API Reference: + - By subscription: https://learn.microsoft.com/en-us/rest/api/datafactory/factories/list + - By resource group: https://learn.microsoft.com/en-us/rest/api/datafactory/factories/list-by-resource-group + + Args: + resource_group: Optional resource group name to filter factories + + Yields: + Factory objects + """ + try: + if resource_group: + # GET /subscriptions/{sub}/resourceGroups/{rg}/providers/Microsoft.DataFactory/factories + # Docs: https://learn.microsoft.com/en-us/rest/api/datafactory/factories/list-by-resource-group + factories_response = self._client.factories.list_by_resource_group( + resource_group_name=resource_group + ) + else: + # GET /subscriptions/{sub}/providers/Microsoft.DataFactory/factories + # Docs: https://learn.microsoft.com/en-us/rest/api/datafactory/factories/list + factories_response = self._client.factories.list() + + for factory in factories_response: + yield Factory.model_validate(factory.as_dict()) + + except HttpResponseError as e: + logger.error(f"Failed to list factories: {e.message}") + raise + + def get_factory( + self, + resource_group: str, + factory_name: str, + ) -> Factory: + """Get a specific Data Factory. + + API Reference: https://learn.microsoft.com/en-us/rest/api/datafactory/factories/get + + Args: + resource_group: Resource group name + factory_name: Data Factory name + + Returns: + Factory object + """ + # GET /subscriptions/{sub}/resourceGroups/{rg}/providers/Microsoft.DataFactory/factories/{factoryName} + # Docs: https://learn.microsoft.com/en-us/rest/api/datafactory/factories/get + factory = self._client.factories.get( + resource_group_name=resource_group, + factory_name=factory_name, + ) + if factory is None: + raise ValueError(f"Factory not found: {factory_name}") + return Factory.model_validate(factory.as_dict()) + + def get_pipelines( + self, + resource_group: str, + factory_name: str, + ) -> Iterator[Pipeline]: + """List all pipelines in a Data Factory. 
+ + API Reference: https://learn.microsoft.com/en-us/rest/api/datafactory/pipelines/list-by-factory + + Args: + resource_group: Resource group name + factory_name: Data Factory name + + Yields: + Pipeline objects with activities + """ + try: + # GET /subscriptions/{sub}/resourceGroups/{rg}/providers/Microsoft.DataFactory/factories/{factoryName}/pipelines + # Docs: https://learn.microsoft.com/en-us/rest/api/datafactory/pipelines/list-by-factory + pipelines_response = self._client.pipelines.list_by_factory( + resource_group_name=resource_group, + factory_name=factory_name, + ) + + for pipeline in pipelines_response: + yield Pipeline.model_validate(pipeline.as_dict()) + + except HttpResponseError as e: + logger.error( + f"Failed to list pipelines for factory {factory_name}: {e.message}" + ) + raise + + def get_pipeline( + self, + resource_group: str, + factory_name: str, + pipeline_name: str, + ) -> Pipeline: + """Get a specific pipeline. + + API Reference: https://learn.microsoft.com/en-us/rest/api/datafactory/pipelines/get + + Args: + resource_group: Resource group name + factory_name: Data Factory name + pipeline_name: Pipeline name + + Returns: + Pipeline object with activities + """ + # GET /subscriptions/{sub}/resourceGroups/{rg}/providers/Microsoft.DataFactory/factories/{factoryName}/pipelines/{pipelineName} + # Docs: https://learn.microsoft.com/en-us/rest/api/datafactory/pipelines/get + pipeline = self._client.pipelines.get( + resource_group_name=resource_group, + factory_name=factory_name, + pipeline_name=pipeline_name, + ) + if pipeline is None: + raise ValueError(f"Pipeline not found: {pipeline_name}") + return Pipeline.model_validate(pipeline.as_dict()) + + def get_datasets( + self, + resource_group: str, + factory_name: str, + ) -> Iterator[Dataset]: + """List all datasets in a Data Factory. 
+ + API Reference: https://learn.microsoft.com/en-us/rest/api/datafactory/datasets/list-by-factory + + Args: + resource_group: Resource group name + factory_name: Data Factory name + + Yields: + Dataset objects + """ + try: + # GET /subscriptions/{sub}/resourceGroups/{rg}/providers/Microsoft.DataFactory/factories/{factoryName}/datasets + # Docs: https://learn.microsoft.com/en-us/rest/api/datafactory/datasets/list-by-factory + datasets_response = self._client.datasets.list_by_factory( + resource_group_name=resource_group, + factory_name=factory_name, + ) + + for dataset in datasets_response: + yield Dataset.model_validate(dataset.as_dict()) + + except HttpResponseError as e: + logger.error( + f"Failed to list datasets for factory {factory_name}: {e.message}" + ) + raise + + def get_linked_services( + self, + resource_group: str, + factory_name: str, + ) -> Iterator[LinkedService]: + """List all linked services in a Data Factory. + + API Reference: https://learn.microsoft.com/en-us/rest/api/datafactory/linked-services/list-by-factory + + Args: + resource_group: Resource group name + factory_name: Data Factory name + + Yields: + LinkedService objects + """ + try: + # GET /subscriptions/{sub}/resourceGroups/{rg}/providers/Microsoft.DataFactory/factories/{factoryName}/linkedservices + # Docs: https://learn.microsoft.com/en-us/rest/api/datafactory/linked-services/list-by-factory + linked_services_response = self._client.linked_services.list_by_factory( + resource_group_name=resource_group, + factory_name=factory_name, + ) + + for linked_service in linked_services_response: + yield LinkedService.model_validate(linked_service.as_dict()) + + except HttpResponseError as e: + logger.error( + f"Failed to list linked services for factory {factory_name}: {e.message}" + ) + raise + + def get_data_flows( + self, + resource_group: str, + factory_name: str, + ) -> Iterator[DataFlow]: + """List all data flows in a Data Factory. 
+ + API Reference: https://learn.microsoft.com/en-us/rest/api/datafactory/data-flows/list-by-factory + + Args: + resource_group: Resource group name + factory_name: Data Factory name + + Yields: + DataFlow objects + """ + try: + # GET /subscriptions/{sub}/resourceGroups/{rg}/providers/Microsoft.DataFactory/factories/{factoryName}/dataflows + # Docs: https://learn.microsoft.com/en-us/rest/api/datafactory/data-flows/list-by-factory + data_flows_response = self._client.data_flows.list_by_factory( + resource_group_name=resource_group, + factory_name=factory_name, + ) + + for data_flow in data_flows_response: + yield DataFlow.model_validate(data_flow.as_dict()) + + except HttpResponseError as e: + logger.error( + f"Failed to list data flows for factory {factory_name}: {e.message}" + ) + raise + + def get_triggers( + self, + resource_group: str, + factory_name: str, + ) -> Iterator[Trigger]: + """List all triggers in a Data Factory. + + API Reference: https://learn.microsoft.com/en-us/rest/api/datafactory/triggers/list-by-factory + + Args: + resource_group: Resource group name + factory_name: Data Factory name + + Yields: + Trigger objects + """ + try: + # GET /subscriptions/{sub}/resourceGroups/{rg}/providers/Microsoft.DataFactory/factories/{factoryName}/triggers + # Docs: https://learn.microsoft.com/en-us/rest/api/datafactory/triggers/list-by-factory + triggers_response = self._client.triggers.list_by_factory( + resource_group_name=resource_group, + factory_name=factory_name, + ) + + for trigger in triggers_response: + yield Trigger.model_validate(trigger.as_dict()) + + except HttpResponseError as e: + logger.error( + f"Failed to list triggers for factory {factory_name}: {e.message}" + ) + raise + + def get_pipeline_runs( + self, + resource_group: str, + factory_name: str, + days: int = 7, + ) -> Iterator[PipelineRun]: + """Query pipeline runs for a Data Factory. 
+ + API Reference: https://learn.microsoft.com/en-us/rest/api/datafactory/pipeline-runs/query-by-factory + + Args: + resource_group: Resource group name + factory_name: Data Factory name + days: Number of days of history to fetch + + Yields: + PipelineRun objects + """ + try: + end_time = datetime.now(timezone.utc) + start_time = end_time - timedelta(days=days) + + # POST /subscriptions/{sub}/resourceGroups/{rg}/providers/Microsoft.DataFactory/factories/{factoryName}/queryPipelineRuns + # Docs: https://learn.microsoft.com/en-us/rest/api/datafactory/pipeline-runs/query-by-factory + filter_params = RunFilterParameters( + last_updated_after=start_time, + last_updated_before=end_time, + ) + + response: PipelineRunsQueryResponse = ( + self._client.pipeline_runs.query_by_factory( + resource_group_name=resource_group, + factory_name=factory_name, + filter_parameters=filter_params, + ) + ) + + for run in response.value or []: + yield PipelineRun.model_validate(run.as_dict()) + + # Handle pagination via continuation token + while response.continuation_token: + filter_params.continuation_token = response.continuation_token + response = self._client.pipeline_runs.query_by_factory( + resource_group_name=resource_group, + factory_name=factory_name, + filter_parameters=filter_params, + ) + for run in response.value or []: + yield PipelineRun.model_validate(run.as_dict()) + + except HttpResponseError as e: + logger.error( + f"Failed to query pipeline runs for factory {factory_name}: {e.message}" + ) + raise + + def get_activity_runs( + self, + resource_group: str, + factory_name: str, + run_id: str, + ) -> Iterator[ActivityRun]: + """Query activity runs for a pipeline run. 
+ + API Reference: https://learn.microsoft.com/en-us/rest/api/datafactory/activity-runs/query-by-pipeline-run + + Args: + resource_group: Resource group name + factory_name: Data Factory name + run_id: Pipeline run ID + + Yields: + ActivityRun objects + """ + try: + end_time = datetime.now(timezone.utc) + start_time = end_time - timedelta(days=90) # Max retention + + # POST /subscriptions/{sub}/resourceGroups/{rg}/providers/Microsoft.DataFactory/factories/{factoryName}/pipelineruns/{runId}/queryActivityruns + # Docs: https://learn.microsoft.com/en-us/rest/api/datafactory/activity-runs/query-by-pipeline-run + filter_params = RunFilterParameters( + last_updated_after=start_time, + last_updated_before=end_time, + ) + + response: ActivityRunsQueryResponse = ( + self._client.activity_runs.query_by_pipeline_run( + resource_group_name=resource_group, + factory_name=factory_name, + run_id=run_id, + filter_parameters=filter_params, + ) + ) + + for run in response.value or []: + yield ActivityRun.model_validate(run.as_dict()) + + # Handle pagination via continuation token + while response.continuation_token: + filter_params.continuation_token = response.continuation_token + response = self._client.activity_runs.query_by_pipeline_run( + resource_group_name=resource_group, + factory_name=factory_name, + run_id=run_id, + filter_parameters=filter_params, + ) + for run in response.value or []: + yield ActivityRun.model_validate(run.as_dict()) + + except HttpResponseError as e: + logger.error( + f"Failed to query activity runs for pipeline run {run_id}: {e.message}" + ) + raise + + def close(self) -> None: + """Close the client and release resources.""" + self._client.close() diff --git a/metadata-ingestion/src/datahub/ingestion/source/azure_data_factory/adf_config.py b/metadata-ingestion/src/datahub/ingestion/source/azure_data_factory/adf_config.py new file mode 100644 index 00000000000000..c48e998ba24e6c --- /dev/null +++ 
class AzureDataFactoryConfig(
    StatefulIngestionConfigBase,
    PlatformInstanceConfigMixin,
    EnvConfigMixin,
):
    """Configuration for Azure Data Factory source.

    This connector extracts metadata from Azure Data Factory including:
    - Data Factories as Containers
    - Pipelines as DataFlows
    - Activities as DataJobs
    - Dataset lineage
    - Execution history (optional)

    Of the fields declared on this class, only ``subscription_id`` has no
    default and must be supplied. ``execution_history_days`` is validated to
    lie between 1 and 90 inclusive.
    """

    # Azure Authentication
    credential: AzureCredentialConfig = Field(
        default_factory=AzureCredentialConfig,
        description=(
            "Azure authentication configuration. Supports service principal, "
            "managed identity, Azure CLI, or auto-detection (DefaultAzureCredential). "
            "See AzureCredentialConfig for detailed options."
        ),
    )

    # Azure Scope
    # Required: there is no default subscription.
    subscription_id: str = Field(
        description=(
            "Azure subscription ID containing the Data Factories to ingest. "
            "Find this in Azure Portal > Subscriptions."
        ),
    )

    # Optional narrowing of the scope to a single resource group.
    resource_group: Optional[str] = Field(
        default=None,
        description=(
            "Azure resource group name to filter Data Factories. "
            "If not specified, all Data Factories in the subscription will be ingested."
        ),
    )

    # Filtering
    factory_pattern: AllowDenyPattern = Field(
        default=AllowDenyPattern.allow_all(),
        description=(
            "Regex patterns to filter Data Factories by name. "
            "Example: allow=['prod-.*'], deny=['.*-test']"
        ),
    )

    pipeline_pattern: AllowDenyPattern = Field(
        default=AllowDenyPattern.allow_all(),
        description=(
            "Regex patterns to filter pipelines by name. "
            "Applied to all factories matching factory_pattern."
        ),
    )

    # Feature Flags
    include_lineage: bool = Field(
        default=True,
        description=(
            "Extract lineage from activity inputs/outputs. "
            "Maps ADF datasets to DataHub datasets based on linked service type."
        ),
    )

    # Off by default.
    include_column_lineage: bool = Field(
        default=False,
        description=(
            "Extract column-level lineage from Data Flow activities. "
            "Requires parsing Data Flow definitions. "
            "Note: This is an advanced feature and may increase ingestion time."
        ),
    )

    # Off by default.
    include_execution_history: bool = Field(
        default=False,
        description=(
            "Extract pipeline and activity execution history as DataProcessInstance. "
            "Includes run status, duration, and parameters."
        ),
    )

    # Bounded to 1..90 days by the ge/le validators below.
    execution_history_days: int = Field(
        default=7,
        description=(
            "Number of days of execution history to extract. "
            "Only used when include_execution_history is True. "
            "Higher values increase ingestion time."
        ),
        ge=1,
        le=90,
    )

    include_datasets: bool = Field(
        default=True,
        description=(
            "Include ADF dataset definitions in the metadata. "
            "Datasets are used to resolve lineage to external platforms."
        ),
    )

    include_linked_services: bool = Field(
        default=True,
        description=(
            "Include linked service connection information as custom properties. "
            "Sensitive connection strings are not extracted."
        ),
    )

    include_triggers: bool = Field(
        default=True,
        description=(
            "Include trigger information as custom properties on pipelines. "
            "Shows schedule and event triggers associated with pipelines."
        ),
    )

    # Platform Mapping
    # Keys are linked service names, values are DataHub platform instances.
    platform_instance_map: dict[str, str] = Field(
        default_factory=dict,
        description=(
            "Map linked service names to DataHub platform instances. "
            "Example: {'my-snowflake-connection': 'prod_snowflake'}. "
            "Used for accurate lineage resolution to existing datasets."
        ),
    )

    # Stateful Ingestion
    # Defaults to None, i.e. no stale-entity-removal config unless supplied.
    stateful_ingestion: Optional[StatefulStaleMetadataRemovalConfig] = Field(
        default=None,
        description=(
            "Configuration for stateful ingestion and stale entity removal. "
            "When enabled, tracks ingested entities and removes those that "
            "no longer exist in Azure Data Factory."
        ),
    )
class AdfResource(BaseModel):
    """Base model for Azure Data Factory resources.

    All models in this module share the same pydantic config: values may be
    supplied under either the Python field name or its camelCase alias
    (``populate_by_name=True``), and keys with no matching field are kept
    rather than rejected (``extra="allow"``).
    """

    model_config = ConfigDict(populate_by_name=True, extra="allow")

    id: str = Field(description="Azure resource ID")
    name: str = Field(description="Resource name")
    type: str = Field(description="Azure resource type")
    etag: Optional[str] = Field(default=None, description="Resource ETag")


class FactoryProperties(BaseModel):
    """Properties of a Data Factory."""

    model_config = ConfigDict(populate_by_name=True, extra="allow")

    provisioning_state: Optional[str] = Field(
        default=None, alias="provisioningState", description="Provisioning state"
    )
    create_time: Optional[datetime] = Field(
        default=None, alias="createTime", description="Factory creation time"
    )
    version: Optional[str] = Field(default=None, description="Factory version")


class Factory(AdfResource):
    """Azure Data Factory resource.

    API Reference: https://learn.microsoft.com/en-us/rest/api/datafactory/factories/get
    """

    # Required, unlike tags and properties which fall back to empty/None.
    location: str = Field(description="Azure region")
    tags: dict[str, str] = Field(default_factory=dict, description="Resource tags")
    properties: Optional[FactoryProperties] = Field(
        default=None, description="Factory properties"
    )


class ActivityDependency(BaseModel):
    """Dependency between activities in a pipeline."""

    model_config = ConfigDict(populate_by_name=True, extra="allow")

    # NOTE(review): presumably the name of the activity that must finish first
    # (the one being depended on) — confirm against the ADF pipeline schema.
    activity: str = Field(description="Name of the dependent activity")
    dependency_conditions: list[str] = Field(
        default_factory=list,
        alias="dependencyConditions",
        description="Conditions for dependency (Succeeded, Failed, Skipped, Completed)",
    )


class DatasetReference(BaseModel):
    """Reference to an ADF dataset."""

    model_config = ConfigDict(populate_by_name=True, extra="allow")

    reference_name: str = Field(alias="referenceName", description="Dataset name")
    type: str = Field(default="DatasetReference", description="Reference type")
    parameters: dict[str, Any] = Field(
        default_factory=dict, description="Dataset parameters"
    )


class LinkedServiceReference(BaseModel):
    """Reference to a linked service."""

    model_config = ConfigDict(populate_by_name=True, extra="allow")

    reference_name: str = Field(
        alias="referenceName", description="Linked service name"
    )
    type: str = Field(default="LinkedServiceReference", description="Reference type")
    parameters: dict[str, Any] = Field(
        default_factory=dict, description="Linked service parameters"
    )


class ActivityInput(BaseModel):
    """Input configuration for an activity."""

    model_config = ConfigDict(populate_by_name=True, extra="allow")

    # For Copy activities
    source: Optional[dict[str, Any]] = Field(
        default=None, description="Source configuration"
    )

    # Dataset reference (common)
    dataset: Optional[DatasetReference] = Field(
        default=None, description="Input dataset reference"
    )


class ActivityOutput(BaseModel):
    """Output configuration for an activity."""

    model_config = ConfigDict(populate_by_name=True, extra="allow")

    # For Copy activities
    sink: Optional[dict[str, Any]] = Field(
        default=None, description="Sink configuration"
    )

    # Dataset reference (common)
    dataset: Optional[DatasetReference] = Field(
        default=None, description="Output dataset reference"
    )


class Activity(BaseModel):
    """Activity within an ADF pipeline.

    Only ``name`` and ``type`` are required; every other field has a default.

    API Reference: https://learn.microsoft.com/en-us/rest/api/datafactory/pipelines/get
    """

    model_config = ConfigDict(populate_by_name=True, extra="allow")

    name: str = Field(description="Activity name")
    type: str = Field(
        description="Activity type (e.g., Copy, DataFlow, ExecutePipeline)"
    )
    description: Optional[str] = Field(default=None, description="Activity description")

    # Dependencies
    depends_on: list[ActivityDependency] = Field(
        default_factory=list, alias="dependsOn", description="Activity dependencies"
    )

    # Type-specific properties stored here
    type_properties: Optional[dict[str, Any]] = Field(
        default=None, alias="typeProperties", description="Type-specific properties"
    )

    # Inputs/Outputs (for Copy and other data activities)
    inputs: list[DatasetReference] = Field(
        default_factory=list, description="Input dataset references"
    )
    outputs: list[DatasetReference] = Field(
        default_factory=list, description="Output dataset references"
    )

    # Linked service (for some activities)
    linked_service_name: Optional[LinkedServiceReference] = Field(
        default=None,
        alias="linkedServiceName",
        description="Linked service for activity",
    )

    # Policy
    policy: Optional[dict[str, Any]] = Field(
        default=None, description="Activity execution policy"
    )

    # User properties
    user_properties: list[dict[str, Any]] = Field(
        default_factory=list,
        alias="userProperties",
        description="User-defined properties",
    )


class PipelineProperties(BaseModel):
    """Properties of an ADF pipeline."""

    model_config = ConfigDict(populate_by_name=True, extra="allow")

    description: Optional[str] = Field(default=None, description="Pipeline description")
    activities: list[Activity] = Field(
        default_factory=list, description="Pipeline activities"
    )
    parameters: dict[str, Any] = Field(
        default_factory=dict, description="Pipeline parameters"
    )
    variables: dict[str, Any] = Field(
        default_factory=dict, description="Pipeline variables"
    )
    concurrency: Optional[int] = Field(default=None, description="Max concurrent runs")
    annotations: list[str] = Field(
        default_factory=list, description="Pipeline annotations"
    )
    folder: Optional[dict[str, str]] = Field(
        default=None, description="Folder path for organization"
    )


class Pipeline(AdfResource):
    """Azure Data Factory pipeline.

    API Reference: https://learn.microsoft.com/en-us/rest/api/datafactory/pipelines/get

    Note: The Azure SDK may return pipeline data with properties at the root level
    or nested under 'properties'. This model handles both cases.
    """

    # Properties can be nested or at root level depending on Azure SDK version
    properties: Optional[PipelineProperties] = Field(
        default=None, description="Pipeline properties"
    )

    # Root-level fields (used when properties are flattened)
    # These mirror the fields of PipelineProperties one-for-one.
    description: Optional[str] = Field(default=None, description="Pipeline description")
    activities: list[Activity] = Field(
        default_factory=list, description="Pipeline activities"
    )
    parameters: dict[str, Any] = Field(
        default_factory=dict, description="Pipeline parameters"
    )
    variables: dict[str, Any] = Field(
        default_factory=dict, description="Pipeline variables"
    )
    concurrency: Optional[int] = Field(default=None, description="Max concurrent runs")
    annotations: list[str] = Field(
        default_factory=list, description="Pipeline annotations"
    )
    folder: Optional[dict[str, str]] = Field(
        default=None, description="Folder path for organization"
    )

    @model_validator(mode="after")
    def normalize_properties(self) -> "Pipeline":
        """Ensure properties are accessible whether nested or flat.

        Runs after field validation. When ``properties`` was not supplied, a
        ``PipelineProperties`` is built from the root-level fields so callers
        can always read ``pipeline.properties``. When ``properties`` was
        supplied it is left as-is; root-level values are not merged into it.

        Returns:
            This same instance, with ``properties`` populated.
        """
        if self.properties is None:
            # Properties are at root level, create a PipelineProperties object
            self.properties = PipelineProperties(
                description=self.description,
                activities=self.activities,
                parameters=self.parameters,
                variables=self.variables,
                concurrency=self.concurrency,
                annotations=self.annotations,
                folder=self.folder,
            )
        return self


class DatasetProperties(BaseModel):
    """Properties of an ADF dataset.

    ``linked_service_name`` and ``type`` are required; validation fails when
    either is missing.
    """

    model_config = ConfigDict(populate_by_name=True, extra="allow")

    description: Optional[str] = Field(default=None, description="Dataset description")
    linked_service_name: LinkedServiceReference = Field(
        alias="linkedServiceName", description="Associated linked service"
    )
    parameters: dict[str, Any] = Field(
        default_factory=dict, description="Dataset parameters"
    )
    annotations: list[str] = Field(
        default_factory=list, description="Dataset annotations"
    )
    folder: Optional[dict[str, str]] = Field(
        default=None, description="Folder path for organization"
    )
    type: str = Field(
        description="Dataset type (e.g., AzureBlobDataset, DelimitedTextDataset)"
    )

    # Type-specific properties
    type_properties: Optional[dict[str, Any]] = Field(
        default=None, alias="typeProperties", description="Type-specific properties"
    )

    # Schema (optional) - named schema_definition to avoid conflict with Pydantic's schema method
    schema_definition: Optional[list[dict[str, Any]]] = Field(
        default=None, alias="schema", description="Dataset schema definition"
    )

    # Structure (legacy schema format)
    structure: Optional[list[dict[str, Any]]] = Field(
        default=None, description="Dataset structure (legacy)"
    )


class Dataset(AdfResource):
    """Azure Data Factory dataset.

    API Reference: https://learn.microsoft.com/en-us/rest/api/datafactory/datasets/get
    """

    # Required (no default), unlike Factory.properties and Pipeline.properties.
    properties: DatasetProperties = Field(description="Dataset properties")


class LinkedServiceProperties(BaseModel):
    """Properties of a linked service."""

    model_config = ConfigDict(populate_by_name=True, extra="allow")

    description: Optional[str] = Field(
        default=None, description="Linked service description"
    )
    type: str = Field(
        description="Linked service type (e.g., AzureBlobStorage, AzureSqlDatabase)"
    )
    type_properties: Optional[dict[str, Any]] = Field(
        default=None, alias="typeProperties", description="Type-specific properties"
    )
    annotations: list[str] = Field(
        default_factory=list, description="Linked service annotations"
    )
    connect_via: Optional[dict[str, Any]] = Field(
        default=None, alias="connectVia", description="Integration runtime reference"
    )


class LinkedService(AdfResource):
    """Azure Data Factory linked service (connection).

    API Reference: https://learn.microsoft.com/en-us/rest/api/datafactory/linked-services/get
    """

    properties: LinkedServiceProperties = Field(description="Linked service properties")


class DataFlowSource(BaseModel):
    """Source definition in a data flow."""

    model_config = ConfigDict(populate_by_name=True, extra="allow")

    name: str = Field(description="Source name")
    dataset: Optional[DatasetReference] = Field(
        default=None, description="Source dataset"
    )
    linked_service: Optional[LinkedServiceReference] = Field(
        default=None, alias="linkedService", description="Inline linked service"
    )
    schema_linked_service: Optional[LinkedServiceReference] = Field(
        default=None, alias="schemaLinkedService", description="Schema linked service"
    )


class DataFlowSink(BaseModel):
    """Sink definition in a data flow."""

    model_config = ConfigDict(populate_by_name=True, extra="allow")

    name: str = Field(description="Sink name")
    dataset: Optional[DatasetReference] = Field(
        default=None, description="Sink dataset"
    )
    linked_service: Optional[LinkedServiceReference] = Field(
        default=None, alias="linkedService", description="Inline linked service"
    )
    schema_linked_service: Optional[LinkedServiceReference] = Field(
        default=None, alias="schemaLinkedService", description="Schema linked service"
    )


class DataFlowProperties(BaseModel):
    """Properties of a mapping data flow."""

    model_config = ConfigDict(populate_by_name=True, extra="allow")

    description: Optional[str] = Field(
        default=None, description="Data flow description"
    )
    type: str = Field(default="MappingDataFlow", description="Data flow type")
    type_properties: Optional[dict[str, Any]] = Field(
        default=None, alias="typeProperties", description="Type-specific properties"
    )
    annotations: list[str] = Field(
        default_factory=list, description="Data flow annotations"
    )
    folder: Optional[dict[str, str]] = Field(
        default=None, description="Folder path for organization"
    )

    # Sources and sinks for lineage extraction
    sources: list[DataFlowSource] = Field(
        default_factory=list, description="Data flow sources"
    )
    sinks: list[DataFlowSink] = Field(
        default_factory=list, description="Data flow sinks"
    )

    # Transformations and script
    transformations: list[dict[str, Any]] = Field(
        default_factory=list, description="Data flow transformations"
    )
    script_lines: list[str] = Field(
        default_factory=list,
        alias="scriptLines",
        description="Data flow script lines (DSL)",
    )

    def get_script(self) -> Optional[str]:
        """Get the complete Data Flow script as a single string.

        Returns:
            The entries of ``script_lines`` joined with newlines, or ``None``
            when ``script_lines`` is empty.
        """
        if self.script_lines:
            return "\n".join(self.script_lines)
        return None


class DataFlow(AdfResource):
    """Azure Data Factory mapping data flow.

    API Reference: https://learn.microsoft.com/en-us/rest/api/datafactory/data-flows/get
    """

    properties: DataFlowProperties = Field(description="Data flow properties")


class TriggerProperties(BaseModel):
    """Properties of a trigger."""

    model_config = ConfigDict(populate_by_name=True, extra="allow")

    description: Optional[str] = Field(default=None, description="Trigger description")
    type: str = Field(
        description="Trigger type (e.g., ScheduleTrigger, BlobEventsTrigger)"
    )
    runtime_state: Optional[str] = Field(
        default=None,
        alias="runtimeState",
        description="Trigger state (Started, Stopped)",
    )
    type_properties: Optional[dict[str, Any]] = Field(
        default=None, alias="typeProperties", description="Type-specific properties"
    )
    annotations: list[str] = Field(
        default_factory=list, description="Trigger annotations"
    )
    # Kept as raw dicts; the entries are not parsed into a typed model here.
    pipelines: list[dict[str, Any]] = Field(
        default_factory=list, description="Pipelines triggered"
    )


class Trigger(AdfResource):
    """Azure Data Factory trigger.

    API Reference: https://learn.microsoft.com/en-us/rest/api/datafactory/triggers/get
    """

    properties: TriggerProperties = Field(description="Trigger properties")


class PipelineRun(BaseModel):
    """Pipeline run execution record.

    ``run_id``, ``pipeline_name`` and ``status`` are required; the remaining
    fields are optional or default to empty.

    API Reference: https://learn.microsoft.com/en-us/rest/api/datafactory/pipeline-runs/get
    """

    model_config = ConfigDict(populate_by_name=True, extra="allow")

    run_id: str = Field(alias="runId", description="Unique run identifier")
    pipeline_name: str = Field(alias="pipelineName", description="Pipeline name")
    status: str = Field(description="Run status (Succeeded, Failed, InProgress, etc.)")
    run_start: Optional[datetime] = Field(
        default=None, alias="runStart", description="Run start time"
    )
    run_end: Optional[datetime] = Field(
        default=None, alias="runEnd", description="Run end time"
    )
    duration_in_ms: Optional[int] = Field(
        default=None, alias="durationInMs", description="Duration in milliseconds"
    )
    message: Optional[str] = Field(default=None, description="Run message or error")
    # Typed as str -> str, so a non-string parameter value fails validation.
    parameters: dict[str, str] = Field(
        default_factory=dict, description="Run parameters"
    )
    invoked_by: Optional[dict[str, str]] = Field(
        default=None,
        alias="invokedBy",
        description="Trigger or user that invoked the run",
    )
    last_updated: Optional[datetime] = Field(
        default=None, alias="lastUpdated", description="Last update time"
    )
    run_group_id: Optional[str] = Field(
        default=None, alias="runGroupId", description="Run group identifier"
    )
    is_latest: Optional[bool] = Field(
        default=None, alias="isLatest", description="Is this the latest run"
    )


class ActivityRun(BaseModel):
    """Activity run execution record.

    The identifying fields (activity run ID, name and type, plus the parent
    pipeline's run ID and name) and ``status`` are required.

    API Reference: https://learn.microsoft.com/en-us/rest/api/datafactory/activity-runs/query-by-pipeline-run
    """

    model_config = ConfigDict(populate_by_name=True, extra="allow")

    activity_run_id: str = Field(
        alias="activityRunId", description="Unique run identifier"
    )
    activity_name: str = Field(alias="activityName", description="Activity name")
    activity_type: str = Field(alias="activityType", description="Activity type")
    pipeline_run_id: str = Field(
        alias="pipelineRunId", description="Parent pipeline run ID"
    )
    pipeline_name: str = Field(alias="pipelineName", description="Parent pipeline name")
    status: str = Field(description="Run status")
    activity_run_start: Optional[datetime] = Field(
        default=None, alias="activityRunStart", description="Activity start time"
    )
    activity_run_end: Optional[datetime] = Field(
        default=None, alias="activityRunEnd", description="Activity end time"
    )
    duration_in_ms: Optional[int] = Field(
        default=None, alias="durationInMs", description="Duration in milliseconds"
    )
    # input/output/error are kept as raw dicts without further typing.
    input: Optional[dict[str, Any]] = Field(default=None, description="Activity input")
    output: Optional[dict[str, Any]] = Field(
        default=None, description="Activity output"
    )
    error: Optional[dict[str, Any]] = Field(
        default=None, description="Error details if failed"
    )


class ListResponse(BaseModel):
    """Generic list response with pagination.

    API Reference: https://learn.microsoft.com/en-us/rest/api/datafactory/factories/list
    """

    model_config = ConfigDict(populate_by_name=True, extra="allow")

    # Required; items are left as raw dicts and not parsed into resource models.
    value: list[dict[str, Any]] = Field(description="List of resources")
    next_link: Optional[str] = Field(
        default=None, alias="nextLink", description="URL for next page of results"
    )
@dataclass
class AzureDataFactorySourceReport(StaleEntityRemovalSourceReport):
    """Report object that accumulates statistics for one ADF ingestion run.

    On top of the stale-entity-removal report it keeps per-entity scan
    counters, the names of factories and pipelines dropped by filters,
    lineage extraction outcomes, execution-history counters, and API
    call/error totals. Each ``report_*`` method updates one of these.
    """

    # Entity counts
    factories_scanned: int = 0
    pipelines_scanned: int = 0
    activities_scanned: int = 0
    datasets_scanned: int = 0
    linked_services_scanned: int = 0
    data_flows_scanned: int = 0
    triggers_scanned: int = 0

    # Filtered entities
    filtered_factories: LossyList[str] = field(default_factory=LossyList)
    filtered_pipelines: LossyList[str] = field(default_factory=LossyList)

    # Lineage metrics
    lineage_edges_extracted: int = 0
    lineage_extraction_failures: int = 0
    datasets_with_lineage: int = 0
    datasets_without_platform_mapping: LossyList[str] = field(default_factory=LossyList)

    # Execution history metrics
    pipeline_runs_scanned: int = 0
    activity_runs_scanned: int = 0

    # API metrics
    api_calls: int = 0
    api_errors: int = 0

    def report_factory_scanned(self) -> None:
        """Count one more scanned factory."""
        self.factories_scanned = self.factories_scanned + 1

    def report_factory_filtered(self, factory_name: str) -> None:
        """Remember the name of a factory that was filtered out."""
        self.filtered_factories.append(factory_name)

    def report_pipeline_scanned(self) -> None:
        """Count one more scanned pipeline."""
        self.pipelines_scanned = self.pipelines_scanned + 1

    def report_pipeline_filtered(self, pipeline_name: str) -> None:
        """Remember the name of a pipeline that was filtered out."""
        self.filtered_pipelines.append(pipeline_name)

    def report_activity_scanned(self) -> None:
        """Count one more scanned activity."""
        self.activities_scanned = self.activities_scanned + 1

    def report_dataset_scanned(self) -> None:
        """Count one more scanned dataset."""
        self.datasets_scanned = self.datasets_scanned + 1

    def report_linked_service_scanned(self) -> None:
        """Count one more scanned linked service."""
        self.linked_services_scanned = self.linked_services_scanned + 1

    def report_data_flow_scanned(self) -> None:
        """Count one more scanned data flow."""
        self.data_flows_scanned = self.data_flows_scanned + 1

    def report_trigger_scanned(self) -> None:
        """Count one more scanned trigger."""
        self.triggers_scanned = self.triggers_scanned + 1

    def report_lineage_extracted(self) -> None:
        """Count one extracted lineage edge.

        Both ``lineage_edges_extracted`` and ``datasets_with_lineage`` go up
        by one per call, so the two counters always move together.
        """
        self.lineage_edges_extracted = self.lineage_edges_extracted + 1
        self.datasets_with_lineage = self.datasets_with_lineage + 1

    def report_lineage_failed(self, entity_name: str, error: str) -> None:
        """Count a lineage extraction failure and raise a report warning for it."""
        self.lineage_extraction_failures = self.lineage_extraction_failures + 1
        failure_context = f"entity={entity_name}, error={error}"
        self.report_warning(
            title="Lineage Extraction Failed",
            message="Unable to extract lineage for this entity.",
            context=failure_context,
        )

    def report_unmapped_platform(
        self, dataset_name: str, linked_service_type: str
    ) -> None:
        """Remember a dataset whose linked service type has no platform mapping."""
        unmapped_entry = f"{dataset_name} (type={linked_service_type})"
        self.datasets_without_platform_mapping.append(unmapped_entry)

    def report_pipeline_run_scanned(self) -> None:
        """Count one more scanned pipeline run."""
        self.pipeline_runs_scanned = self.pipeline_runs_scanned + 1

    def report_activity_run_scanned(self) -> None:
        """Count one more scanned activity run."""
        self.activity_runs_scanned = self.activity_runs_scanned + 1

    def report_api_call(self) -> None:
        """Count one API call."""
        self.api_calls = self.api_calls + 1

    def report_api_error(self, endpoint: str, error: str) -> None:
        """Count an API error and raise a report warning naming the endpoint."""
        self.api_errors = self.api_errors + 1
        error_context = f"endpoint={endpoint}, error={error}"
        self.report_warning(
            title="API Error",
            message="Failed to call Azure Data Factory API.",
            context=error_context,
        )
+ +This connector extracts metadata from Azure Data Factory including: +- Data Factories as Containers +- Pipelines as DataFlows +- Activities as DataJobs +- Dataset lineage (activity inputs/outputs) +- Pipeline execution history (optional) + +Usage: + source: + type: azure_data_factory + config: + subscription_id: ${AZURE_SUBSCRIPTION_ID} + credential: + authentication_method: service_principal + client_id: ${AZURE_CLIENT_ID} + client_secret: ${AZURE_CLIENT_SECRET} + tenant_id: ${AZURE_TENANT_ID} +""" + +import logging +from typing import Dict, Iterable, List, Optional, Tuple + +from datahub.api.entities.dataprocess.dataprocess_instance import ( + DataProcessInstance, + InstanceRunResult, +) +from datahub.emitter.mcp import MetadataChangeProposalWrapper +from datahub.emitter.mcp_builder import ContainerKey +from datahub.ingestion.api.common import PipelineContext +from datahub.ingestion.api.decorators import ( + SourceCapability, + SupportStatus, + capability, + config_class, + platform_name, + support_status, +) +from datahub.ingestion.api.source import MetadataWorkUnitProcessor +from datahub.ingestion.api.workunit import MetadataWorkUnit +from datahub.ingestion.source.azure_data_factory.adf_client import ( + AzureDataFactoryClient, +) +from datahub.ingestion.source.azure_data_factory.adf_config import ( + AzureDataFactoryConfig, +) +from datahub.ingestion.source.azure_data_factory.adf_models import ( + Activity, + DataFlow as AdfDataFlow, + Dataset as AdfDataset, + Factory, + LinkedService, + Pipeline, + PipelineRun, + Trigger, +) +from datahub.ingestion.source.azure_data_factory.adf_report import ( + AzureDataFactorySourceReport, +) +from datahub.ingestion.source.state.stale_entity_removal_handler import ( + StaleEntityRemovalHandler, +) +from datahub.ingestion.source.state.stateful_ingestion_base import ( + StatefulIngestionSourceBase, +) +from datahub.metadata.schema_classes import ( + DataProcessTypeClass, + DataTransformClass, + DataTransformLogicClass, + 
QueryLanguageClass, + QueryStatementClass, +) +from datahub.metadata.urns import DataFlowUrn, DatasetUrn +from datahub.sdk.container import Container +from datahub.sdk.dataflow import DataFlow +from datahub.sdk.datajob import DataJob + +logger = logging.getLogger(__name__) + +# Platform identifier for Azure Data Factory +PLATFORM = "azure_data_factory" + +# Mapping of ADF linked service types to DataHub platforms +LINKED_SERVICE_PLATFORM_MAP: Dict[str, str] = { + # Azure Storage + "AzureBlobStorage": "azure_blob_storage", + "AzureBlobFS": "azure_data_lake", + "AzureDataLakeStore": "azure_data_lake", + "AzureDataLakeStoreCosmosStructuredStream": "azure_data_lake", + "AzureFileStorage": "azure_file_storage", + # Azure Databases + "AzureSqlDatabase": "mssql", + "AzureSqlDW": "synapse", + "AzureSynapseAnalytics": "synapse", + "AzureSqlMI": "mssql", + "SqlServer": "mssql", + "AzurePostgreSql": "postgres", + "AzureMySql": "mysql", + "CosmosDb": "cosmosdb", + "CosmosDbMongoDbApi": "mongodb", + # Databricks + "AzureDatabricks": "databricks", + "AzureDatabricksDeltaLake": "databricks", + # Cloud Platforms + "AmazonS3": "s3", + "AmazonS3Compatible": "s3", + "GoogleCloudStorage": "gcs", + "AmazonRedshift": "redshift", + "GoogleBigQuery": "bigquery", + "Snowflake": "snowflake", + # Traditional Databases + "PostgreSql": "postgres", + "MySql": "mysql", + "Oracle": "oracle", + "OracleServiceCloud": "oracle", + "Db2": "db2", + "Sybase": "sybase", + "Teradata": "teradata", + "Informix": "informix", + "Netezza": "netezza", + "Vertica": "vertica", + "Greenplum": "greenplum", + # Data Warehouses + "Hive": "hive", + "Spark": "spark", + "Hdfs": "hdfs", + # SaaS Applications + "Salesforce": "salesforce", + "SalesforceServiceCloud": "salesforce", + "SalesforceMarketingCloud": "salesforce", + "ServiceNow": "servicenow", + "Dynamics": "dynamics", + "DynamicsAX": "dynamics", + "DynamicsCrm": "dynamics", + # File Formats (use linked service or default) + "FtpServer": "ftp", + "Sftp": "sftp", 
+ "HttpServer": "http", + "OData": "odata", + "Rest": "rest", +} + +# Mapping of ADF activity types to DataHub subtypes +ACTIVITY_SUBTYPE_MAP: Dict[str, str] = { + "Copy": "Copy Activity", + "DataFlow": "Data Flow Activity", + "ExecutePipeline": "Execute Pipeline", + "ExecuteDataFlow": "Data Flow Activity", + "Lookup": "Lookup Activity", + "GetMetadata": "Get Metadata Activity", + "SqlServerStoredProcedure": "Stored Procedure Activity", + "Script": "Script Activity", + "WebActivity": "Web Activity", + "WebHook": "Webhook Activity", + "IfCondition": "If Condition", + "ForEach": "ForEach Loop", + "Until": "Until Loop", + "Wait": "Wait Activity", + "SetVariable": "Set Variable", + "AppendVariable": "Append Variable", + "Switch": "Switch Activity", + "Filter": "Filter Activity", + "Validation": "Validation Activity", + "DatabricksNotebook": "Databricks Notebook", + "DatabricksSparkJar": "Databricks Spark Jar", + "DatabricksSparkPython": "Databricks Spark Python", + "HDInsightHive": "HDInsight Hive", + "HDInsightPig": "HDInsight Pig", + "HDInsightSpark": "HDInsight Spark", + "HDInsightMapReduce": "HDInsight MapReduce", + "HDInsightStreaming": "HDInsight Streaming", + "AzureFunctionActivity": "Azure Function Activity", + "AzureMLBatchExecution": "Azure ML Batch", + "AzureMLUpdateResource": "Azure ML Update", + "AzureMLExecutePipeline": "Azure ML Pipeline", + "Custom": "Custom Activity", + "Delete": "Delete Activity", + "SynapseNotebook": "Synapse Notebook", + "SparkJob": "Spark Job", + "SynapseSparkJob": "Synapse Spark Job", + "SqlPoolStoredProcedure": "SQL Pool Stored Procedure", + "Fail": "Fail Activity", +} + + +class AzureDataFactoryContainerKey(ContainerKey): + """Container key for Azure Data Factory resources.""" + + resource_group: str + factory_name: str + + +@platform_name("Azure Data Factory") +@config_class(AzureDataFactoryConfig) +@support_status(SupportStatus.INCUBATING) +@capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default") +@capability( + 
@classmethod
def create(
    cls, config_dict: Dict, ctx: PipelineContext
) -> "AzureDataFactorySource":
    """Build the source from a raw recipe dictionary.

    Args:
        config_dict: Unvalidated source configuration.
        ctx: Pipeline context forwarded to the constructor.

    Returns:
        A source instance built from the validated configuration.
    """
    # Validation and construction in a single expression; the validated
    # config is not needed for anything else here.
    return cls(AzureDataFactoryConfig.model_validate(config_dict), ctx)
Check if factory matches pattern + if not self.config.factory_pattern.allowed(factory.name): + self.report.report_factory_filtered(factory.name) + continue + + self.report.report_factory_scanned() + logger.info(f"Processing factory: {factory.name}") + + # Extract resource group from factory ID + # Format: /subscriptions/{sub}/resourceGroups/{rg}/providers/... + resource_group = self._extract_resource_group(factory.id) + + # Cache datasets and linked services for this factory + self._cache_factory_resources(resource_group, factory.name) + + # Emit factory as container and get the Container object for browse paths + container, container_workunits = self._emit_factory(factory, resource_group) + yield from container_workunits + + # Process pipelines, passing the Container for proper browse path hierarchy + yield from self._process_pipelines(factory, resource_group, container) + + # Process execution history if enabled + if self.config.include_execution_history: + yield from self._process_execution_history(factory, resource_group) + + def _extract_resource_group(self, resource_id: str) -> str: + """Extract resource group name from Azure resource ID.""" + # Format: /subscriptions/{sub}/resourceGroups/{rg}/providers/... 
+ parts = resource_id.split("/") + try: + rg_index = parts.index("resourceGroups") + return parts[rg_index + 1] + except (ValueError, IndexError): + logger.warning(f"Could not extract resource group from: {resource_id}") + return "unknown" + + def _cache_factory_resources(self, resource_group: str, factory_name: str) -> None: + """Cache datasets and linked services for a factory.""" + factory_key = f"{resource_group}/{factory_name}" + + # Cache datasets + if self.config.include_datasets: + self._datasets_cache[factory_key] = {} + for dataset in self.client.get_datasets(resource_group, factory_name): + self.report.report_api_call() + self.report.report_dataset_scanned() + self._datasets_cache[factory_key][dataset.name] = dataset + + # Cache linked services + if self.config.include_linked_services: + self._linked_services_cache[factory_key] = {} + for ls in self.client.get_linked_services(resource_group, factory_name): + self.report.report_api_call() + self.report.report_linked_service_scanned() + self._linked_services_cache[factory_key][ls.name] = ls + + # Cache triggers + if self.config.include_triggers: + self._triggers_cache[factory_key] = [] + for trigger in self.client.get_triggers(resource_group, factory_name): + self.report.report_api_call() + self.report.report_trigger_scanned() + self._triggers_cache[factory_key].append(trigger) + + # Cache data flows (for lineage extraction from Data Flow activities) + if self.config.include_lineage: + self._data_flows_cache[factory_key] = {} + for data_flow in self.client.get_data_flows(resource_group, factory_name): + self.report.report_api_call() + self.report.report_data_flow_scanned() + self._data_flows_cache[factory_key][data_flow.name] = data_flow + + def _emit_factory( + self, factory: Factory, resource_group: str + ) -> Tuple[Container, Iterable[MetadataWorkUnit]]: + """Emit a Data Factory as a Container. + + Returns: + Tuple of (Container object, workunits). 
The Container object is needed + by child entities (DataFlows) to properly set up browse paths. + """ + container_key = AzureDataFactoryContainerKey( + platform=PLATFORM, + instance=self.config.platform_instance, + resource_group=resource_group, + factory_name=factory.name, + env=self.config.env, + ) + + # Build custom properties + custom_props: Dict[str, str] = { + "azure_resource_id": factory.id, + "location": factory.location, + } + if factory.tags: + for key, value in factory.tags.items(): + custom_props[f"tag:{key}"] = value + if factory.properties and factory.properties.provisioning_state: + custom_props["provisioning_state"] = factory.properties.provisioning_state + + container = Container( + container_key, + display_name=factory.name, + description=f"Azure Data Factory: {factory.name}", + subtype="Data Factory", + external_url=self._get_factory_url(factory, resource_group), + extra_properties=custom_props, + parent_container=None, # Top-level container + ) + + return container, container.as_workunits() + + def _get_factory_url(self, factory: Factory, resource_group: str) -> str: + """Generate Azure Portal URL for a Data Factory.""" + return ( + f"https://adf.azure.com/en/home" + f"?factory=/subscriptions/{self.config.subscription_id}" + f"/resourceGroups/{resource_group}" + f"/providers/Microsoft.DataFactory/factories/{factory.name}" + ) + + def _process_pipelines( + self, factory: Factory, resource_group: str, container: Container + ) -> Iterable[MetadataWorkUnit]: + """Process all pipelines in a factory. 
def _create_dataflow(
    self,
    pipeline: Pipeline,
    factory: Factory,
    resource_group: str,
    container: Container,
) -> DataFlow:
    """Create a DataFlow entity for a pipeline.

    Args:
        pipeline: The ADF pipeline.
        factory: The parent Data Factory.
        resource_group: Azure resource group name.
        container: The parent Container object, passed to the DataFlow as
            its parent container.

    Returns:
        The DataFlow describing the pipeline. Its name is
        ``{factory.name}.{pipeline.name}`` so that pipelines with the same
        name in different factories do not collide.
    """
    # Build flow name with factory prefix for uniqueness across factories
    flow_name = f"{factory.name}.{pipeline.name}"

    custom_props: Dict[str, str] = {
        "azure_resource_id": pipeline.id,
        "factory_name": factory.name,
    }

    # Optional pipeline properties
    description: Optional[str] = None
    if pipeline.properties is not None:
        if pipeline.properties.concurrency:
            custom_props["concurrency"] = str(pipeline.properties.concurrency)
        if pipeline.properties.folder:
            folder_name = pipeline.properties.folder.get("name", "")
            if folder_name:
                custom_props["folder"] = folder_name
        if pipeline.properties.annotations:
            # Annotations are not guaranteed to be strings; stringify each
            # one so that str.join cannot raise TypeError.
            custom_props["annotations"] = ", ".join(
                str(annotation) for annotation in pipeline.properties.annotations
            )
        description = pipeline.properties.description

    # Add trigger info if available
    triggers = self._get_pipeline_triggers(
        resource_group, factory.name, pipeline.name
    )
    if triggers:
        custom_props["triggers"] = ", ".join(triggers)

    # Pass the Container object directly so the SDK can properly build
    # browse paths by inheriting from the parent container's path
    dataflow = DataFlow(
        platform=PLATFORM,
        name=flow_name,
        platform_instance=self.config.platform_instance,
        env=self.config.env,
        display_name=pipeline.name,
        description=description,
        external_url=self._get_pipeline_url(factory, resource_group, pipeline.name),
        custom_properties=custom_props,
        subtype="Pipeline",
        parent_container=container,
    )

    return dataflow
self._triggers_cache.get(factory_key, []) + + result = [] + for trigger in triggers: + # Check if trigger references this pipeline + for pipeline_ref in trigger.properties.pipelines: + ref_name = pipeline_ref.get("pipelineReference", {}).get( + "referenceName", "" + ) + if ref_name == pipeline_name: + result.append(trigger.name) + break + + return result + + def _get_pipeline_url( + self, factory: Factory, resource_group: str, pipeline_name: str + ) -> str: + """Generate Azure Portal URL for a pipeline.""" + return ( + f"https://adf.azure.com/en/authoring/pipeline/{pipeline_name}" + f"?factory=/subscriptions/{self.config.subscription_id}" + f"/resourceGroups/{resource_group}" + f"/providers/Microsoft.DataFactory/factories/{factory.name}" + ) + + def _create_datajob( + self, + activity: Activity, + pipeline: Pipeline, + factory: Factory, + resource_group: str, + dataflow: DataFlow, + factory_key: str, + ) -> DataJob: + """Create a DataJob entity for an activity.""" + # Determine activity subtype + subtype = ACTIVITY_SUBTYPE_MAP.get(activity.type, activity.type) + + # Custom properties + custom_props: Dict[str, str] = { + "activity_type": activity.type, + } + if activity.description: + custom_props["activity_description"] = activity.description + + # Add policy info + if activity.policy: + if "timeout" in activity.policy: + custom_props["timeout"] = str(activity.policy["timeout"]) + if "retry" in activity.policy: + custom_props["retry"] = str(activity.policy["retry"]) + + # Extract lineage (inlets/outlets) + inlets: Optional[List[str]] = None + outlets: Optional[List[str]] = None + + if self.config.include_lineage: + extracted_inlets = self._extract_activity_inputs(activity, factory_key) + extracted_outlets = self._extract_activity_outputs(activity, factory_key) + if extracted_inlets: + inlets = extracted_inlets + if extracted_outlets: + outlets = extracted_outlets + + # Create DataJob with external URL to the parent pipeline + # (ADF doesn't have direct activity 
URLs, so we link to the pipeline) + datajob = DataJob( + name=activity.name, + flow=dataflow, + display_name=activity.name, + description=activity.description, + external_url=self._get_pipeline_url(factory, resource_group, pipeline.name), + custom_properties=custom_props, + subtype=subtype, + inlets=inlets, # type: ignore[arg-type] + outlets=outlets, # type: ignore[arg-type] + ) + + return datajob + + def _extract_activity_inputs( + self, activity: Activity, factory_key: str + ) -> List[str]: + """Extract input dataset URNs from an activity.""" + inputs: List[str] = [] + + # Process explicit inputs (for Copy activities and others) + for input_ref in activity.inputs: + dataset_urn = self._resolve_dataset_urn( + input_ref.reference_name, factory_key + ) + if dataset_urn: + inputs.append(str(dataset_urn)) + self.report.report_lineage_extracted() + + # Process Data Flow activities - extract sources as inputs + if activity.type == "ExecuteDataFlow": + data_flow_inputs = self._extract_data_flow_sources(activity, factory_key) + inputs.extend(data_flow_inputs) + + # Process source in typeProperties (for Copy activities) + if activity.type_properties and "source" in activity.type_properties: + source = activity.type_properties["source"] + if "datasetSettings" in source: + # Inline dataset configuration + pass # Complex case, skip for now + # Source might reference a dataset in storeSettings + store_settings = source.get("storeSettings", {}) + if "linkedServiceName" in store_settings: + # Could resolve to a dataset if we have schema info + pass + + return inputs + + def _extract_activity_outputs( + self, activity: Activity, factory_key: str + ) -> List[str]: + """Extract output dataset URNs from an activity.""" + outputs: List[str] = [] + + # Process explicit outputs (for Copy activities and others) + for output_ref in activity.outputs: + dataset_urn = self._resolve_dataset_urn( + output_ref.reference_name, factory_key + ) + if dataset_urn: + outputs.append(str(dataset_urn)) 
+ self.report.report_lineage_extracted() + + # Process Data Flow activities - extract sinks as outputs + if activity.type == "ExecuteDataFlow": + data_flow_outputs = self._extract_data_flow_sinks(activity, factory_key) + outputs.extend(data_flow_outputs) + + # Process sink in typeProperties (for Copy activities) + if activity.type_properties and "sink" in activity.type_properties: + sink = activity.type_properties["sink"] + if "datasetSettings" in sink: + # Inline dataset configuration + pass # Complex case, skip for now + + return outputs + + def _get_data_flow_name_from_activity( + self, activity: Activity, factory_key: str + ) -> Optional[str]: + """Get the Data Flow name referenced by an ExecuteDataFlow activity. + + Due to a case-sensitivity bug in the Azure SDK where it expects + 'typeProperties.dataFlow' but the API returns 'typeProperties.dataflow', + we try multiple approaches to find the Data Flow name. + + Args: + activity: The ExecuteDataFlow activity + factory_key: Factory key for cache lookup + + Returns: + Data Flow name if found, None otherwise + """ + # Approach 1: Try typeProperties.dataFlow (SDK expected format) + if activity.type_properties: + data_flow_ref = activity.type_properties.get( + "dataFlow", activity.type_properties.get("dataflow", {}) + ) + if isinstance(data_flow_ref, dict): + name = data_flow_ref.get("referenceName") + if name: + return name + + # Approach 2: Try to match activity name to Data Flow name + # Many users name their activity similarly to the Data Flow + data_flows = self._data_flows_cache.get(factory_key, {}) + + # Exact match + if activity.name in data_flows: + logger.debug( + f"Found Data Flow by exact activity name match: {activity.name}" + ) + return activity.name + + # Fuzzy match - try removing common suffixes/variations + activity_name_normalized = activity.name.replace(" ", "").lower() + for df_name in data_flows: + df_name_normalized = df_name.replace(" ", "").lower() + if activity_name_normalized == 
df_name_normalized: + logger.debug( + f"Found Data Flow by fuzzy match: activity='{activity.name}' -> dataflow='{df_name}'" + ) + return df_name + + return None + + def _emit_data_flow_script( + self, activity: Activity, datajob: DataJob, factory_key: str + ) -> Iterable[MetadataWorkUnit]: + """Emit the Data Flow script as a dataTransformLogic aspect. + + For ExecuteDataFlow activities, this extracts the Data Flow DSL script + and emits it as a transformation aspect, making it viewable in the UI. + + Args: + activity: The ExecuteDataFlow activity + datajob: The DataJob entity for this activity + factory_key: Factory key for cache lookup + + Yields: + MetadataWorkUnit for the dataTransformLogic aspect + """ + # Get the Data Flow name + data_flow_name = self._get_data_flow_name_from_activity(activity, factory_key) + if not data_flow_name: + return + + # Look up the Data Flow definition + data_flows = self._data_flows_cache.get(factory_key, {}) + data_flow = data_flows.get(data_flow_name) + if not data_flow or not data_flow.properties: + return + + # Get the script from the Data Flow + script = data_flow.properties.get_script() + if not script: + logger.debug(f"No script found for Data Flow: {data_flow_name}") + return + + # Emit the dataTransformLogic aspect + # Note: Using SQL as language because UNKNOWN is not yet broadly supported + # in the UI. The Data Flow DSL is similar to SQL in structure. + logger.debug( + f"Emitting Data Flow script for activity '{activity.name}' " + f"({len(script)} chars)" + ) + yield MetadataChangeProposalWrapper( + entityUrn=str(datajob.urn), + aspect=DataTransformLogicClass( + transforms=[ + DataTransformClass( + queryStatement=QueryStatementClass( + value=script, + language=QueryLanguageClass.SQL, + ) + ) + ] + ), + ).as_workunit() + + def _extract_data_flow_sources( + self, activity: Activity, factory_key: str + ) -> List[str]: + """Extract source dataset URNs from a Data Flow activity. 
+ + Data Flow activities reference a Data Flow definition which contains + sources (inputs) and sinks (outputs). This method extracts the sources. + + Args: + activity: The ExecuteDataFlow activity + factory_key: Factory key for cache lookup + + Returns: + List of source dataset URNs + """ + inputs: List[str] = [] + + # Get the Data Flow name using our robust lookup + data_flow_name = self._get_data_flow_name_from_activity(activity, factory_key) + + if not data_flow_name: + logger.debug( + f"Could not find Data Flow reference for activity: {activity.name}" + ) + return inputs + + # Look up the Data Flow definition + data_flows = self._data_flows_cache.get(factory_key, {}) + data_flow = data_flows.get(data_flow_name) + + if not data_flow: + logger.debug(f"Data Flow not found in cache: {data_flow_name}") + return inputs + + # Extract sources from the Data Flow + if data_flow.properties: + for source in data_flow.properties.sources: + if source.dataset: + dataset_urn = self._resolve_dataset_urn( + source.dataset.reference_name, factory_key + ) + if dataset_urn: + inputs.append(str(dataset_urn)) + self.report.report_lineage_extracted() + logger.debug( + f"Extracted Data Flow source: {source.name} -> {dataset_urn}" + ) + + return inputs + + def _extract_data_flow_sinks( + self, activity: Activity, factory_key: str + ) -> List[str]: + """Extract sink dataset URNs from a Data Flow activity. + + Data Flow activities reference a Data Flow definition which contains + sources (inputs) and sinks (outputs). This method extracts the sinks. 
+ + Args: + activity: The ExecuteDataFlow activity + factory_key: Factory key for cache lookup + + Returns: + List of sink dataset URNs + """ + outputs: List[str] = [] + + # Get the Data Flow name using our robust lookup + data_flow_name = self._get_data_flow_name_from_activity(activity, factory_key) + + if not data_flow_name: + logger.debug( + f"Could not find Data Flow reference for activity: {activity.name}" + ) + return outputs + + # Look up the Data Flow definition + data_flows = self._data_flows_cache.get(factory_key, {}) + data_flow = data_flows.get(data_flow_name) + + if not data_flow: + logger.debug(f"Data Flow not found in cache: {data_flow_name}") + return outputs + + # Extract sinks from the Data Flow + if data_flow.properties: + for sink in data_flow.properties.sinks: + if sink.dataset: + dataset_urn = self._resolve_dataset_urn( + sink.dataset.reference_name, factory_key + ) + if dataset_urn: + outputs.append(str(dataset_urn)) + self.report.report_lineage_extracted() + logger.debug( + f"Extracted Data Flow sink: {sink.name} -> {dataset_urn}" + ) + + return outputs + + def _resolve_dataset_urn( + self, dataset_name: str, factory_key: str + ) -> Optional[DatasetUrn]: + """Resolve an ADF dataset reference to a DataHub DatasetUrn.""" + # Get dataset from cache + datasets = self._datasets_cache.get(factory_key, {}) + dataset = datasets.get(dataset_name) + + if not dataset: + logger.debug(f"Dataset not found in cache: {dataset_name}") + return None + + # Get linked service to determine platform + linked_service_ref = dataset.properties.linked_service_name + linked_services = self._linked_services_cache.get(factory_key, {}) + linked_service = linked_services.get(linked_service_ref.reference_name) + + if not linked_service: + logger.debug( + f"Linked service not found: {linked_service_ref.reference_name}" + ) + self.report.report_unmapped_platform(dataset_name, "unknown") + return None + + # Map linked service type to DataHub platform + ls_type = 
linked_service.properties.type + platform = LINKED_SERVICE_PLATFORM_MAP.get(ls_type) + + if not platform: + logger.debug(f"Unknown linked service type: {ls_type}") + self.report.report_unmapped_platform(dataset_name, ls_type) + return None + + # Build dataset name from type properties + table_name = self._extract_table_name(dataset, linked_service) + if not table_name: + table_name = dataset_name # Fallback to ADF dataset name + + # Check if there's a platform instance mapping + platform_instance = self.config.platform_instance_map.get( + linked_service_ref.reference_name + ) + + return DatasetUrn.create_from_ids( + platform_id=platform, + table_name=table_name, + env=self.config.env, + platform_instance=platform_instance, + ) + + def _extract_table_name( + self, dataset: AdfDataset, linked_service: LinkedService + ) -> Optional[str]: + """Extract table/file name from dataset type properties.""" + if not dataset.properties.type_properties: + return None + + type_props = dataset.properties.type_properties + + # SQL-like datasets + if "tableName" in type_props: + return type_props["tableName"] + if "table" in type_props: + return type_props["table"] + + # Structured table reference + if "schema" in type_props and "table" in type_props: + schema = type_props.get("schema", "") + table = type_props.get("table", "") + if schema and table: + return f"{schema}.{table}" + + # File-based datasets + if "fileName" in type_props: + folder = type_props.get("folderPath", "") + filename = type_props.get("fileName", "") + if folder and filename: + return f"{folder}/{filename}" + return filename + + # Container/path based + if "location" in type_props: + location = type_props["location"] + if isinstance(location, dict): + container = location.get("container", "") + folder = location.get("folderPath", "") + filename = location.get("fileName", "") + parts = [p for p in [container, folder, filename] if p] + if parts: + return "/".join(parts) + + return None + + def 
_process_execution_history( + self, factory: Factory, resource_group: str + ) -> Iterable[MetadataWorkUnit]: + """Process pipeline execution history for a factory.""" + logger.info( + f"Processing execution history for factory: {factory.name} " + f"(last {self.config.execution_history_days} days)" + ) + + for pipeline_run in self.client.get_pipeline_runs( + resource_group, + factory.name, + days=self.config.execution_history_days, + ): + self.report.report_api_call() + self.report.report_pipeline_run_scanned() + + # Check if pipeline matches pattern + if not self.config.pipeline_pattern.allowed(pipeline_run.pipeline_name): + continue + + yield from self._emit_pipeline_run(pipeline_run, factory, resource_group) + + def _emit_pipeline_run( + self, + pipeline_run: PipelineRun, + factory: Factory, + resource_group: str, + ) -> Iterable[MetadataWorkUnit]: + """Emit a pipeline run as DataProcessInstance.""" + # Build DataFlow URN for the template - include factory name for uniqueness + flow_name = f"{factory.name}.{pipeline_run.pipeline_name}" + flow_urn = DataFlowUrn.create_from_ids( + orchestrator=PLATFORM, + flow_id=flow_name, + env=self.config.env, + platform_instance=self.config.platform_instance, + ) + + # Map ADF status to InstanceRunResult + result = self._map_run_status(pipeline_run.status) + + # Build custom properties + properties: Dict[str, str] = { + "run_id": pipeline_run.run_id, + "status": pipeline_run.status, + } + if pipeline_run.message: + properties["message"] = pipeline_run.message[:500] # Truncate long messages + if pipeline_run.invoked_by: + invoker_name = pipeline_run.invoked_by.get("name", "") + invoker_type = pipeline_run.invoked_by.get("invokedByType", "") + if invoker_name: + properties["invoked_by"] = invoker_name + if invoker_type: + properties["invoked_by_type"] = invoker_type + if pipeline_run.parameters: + for key, value in list(pipeline_run.parameters.items())[ + :10 + ]: # Limit params + properties[f"param:{key}"] = str(value)[:100] + + 
def _map_run_status(self, status: str) -> Optional[InstanceRunResult]:
    """Translate an ADF run status into a DataHub InstanceRunResult.

    Args:
        status: Status string of the pipeline run.

    Returns:
        SUCCESS for "Succeeded", FAILURE for "Failed", SKIPPED for
        "Cancelled". Every other status, including the in-flight
        "Cancelling", "InProgress" and "Queued", gives None.
    """
    # Only finished states have a result; anything absent from this table
    # falls through to None via dict.get.
    finished_results = {
        "Succeeded": InstanceRunResult.SUCCESS,
        "Failed": InstanceRunResult.FAILURE,
        "Cancelled": InstanceRunResult.SKIPPED,
    }
    return finished_results.get(status)
close(self) -> None: + """Clean up resources.""" + self.client.close() + super().close() diff --git a/metadata-ingestion/tests/integration/azure_data_factory/__init__.py b/metadata-ingestion/tests/integration/azure_data_factory/__init__.py new file mode 100644 index 00000000000000..261403a51885d4 --- /dev/null +++ b/metadata-ingestion/tests/integration/azure_data_factory/__init__.py @@ -0,0 +1 @@ +"""Integration tests for Azure Data Factory connector.""" diff --git a/metadata-ingestion/tests/integration/azure_data_factory/adf_basic_golden.json b/metadata-ingestion/tests/integration/azure_data_factory/adf_basic_golden.json new file mode 100644 index 00000000000000..19e52adb2e48da --- /dev/null +++ b/metadata-ingestion/tests/integration/azure_data_factory/adf_basic_golden.json @@ -0,0 +1,775 @@ +[ +{ + "entityType": "container", + "entityUrn": "urn:li:container:0690c0f5c38fc7fe7ad0518a59384f93", + "changeType": "UPSERT", + "aspectName": "containerProperties", + "aspect": { + "json": { + "customProperties": { + "platform": "azure_data_factory", + "env": "PROD", + "resource_group": "test-resource-group", + "factory_name": "test-data-factory", + "azure_resource_id": "/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/test-resource-group/providers/Microsoft.DataFactory/factories/test-data-factory", + "location": "eastus", + "tag:environment": "test", + "tag:team": "data-engineering", + "provisioning_state": "Succeeded" + }, + "externalUrl": "https://adf.azure.com/en/home?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/test-resource-group/providers/Microsoft.DataFactory/factories/test-data-factory", + "name": "test-data-factory", + "description": "Azure Data Factory: test-data-factory", + "env": "PROD" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-basic", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": 
"urn:li:container:0690c0f5c38fc7fe7ad0518a59384f93", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure_data_factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-basic", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:0690c0f5c38fc7fe7ad0518a59384f93", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Data Factory" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-basic", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:0690c0f5c38fc7fe7ad0518a59384f93", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-basic", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure_data_factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-basic", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:0690c0f5c38fc7fe7ad0518a59384f93", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-basic", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD)", + "changeType": "UPSERT", + "aspectName": "dataFlowInfo", + "aspect": { + "json": { + 
"customProperties": { + "azure_resource_id": "/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/test-resource-group/providers/Microsoft.DataFactory/factories/test-data-factory/pipelines/DataIngestionPipeline", + "factory_name": "test-data-factory", + "triggers": "DailyScheduleTrigger" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/DataIngestionPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/test-resource-group/providers/Microsoft.DataFactory/factories/test-data-factory", + "name": "DataIngestionPipeline", + "description": "Main data ingestion pipeline", + "env": "PROD" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-basic", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD),TransformData)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:0690c0f5c38fc7fe7ad0518a59384f93", + "urn": "urn:li:container:0690c0f5c38fc7fe7ad0518a59384f93" + }, + { + "id": "test-data-factory.DataIngestionPipeline", + "urn": "urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-basic", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Pipeline" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-basic", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": 
"urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD),TransformData)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Data Flow Activity" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-basic", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:0690c0f5c38fc7fe7ad0518a59384f93" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-basic", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:0690c0f5c38fc7fe7ad0518a59384f93", + "urn": "urn:li:container:0690c0f5c38fc7fe7ad0518a59384f93" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-basic", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD),TransformData)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure_data_factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-basic", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD),TransformData)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } 
+ }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-basic", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD),CopyBlobToSQL)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure_data_factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-basic", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD),CopyBlobToSQL)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "activity_type": "Copy", + "activity_description": "Copy data from Blob to SQL", + "timeout": "7.00:00:00", + "retry": "0" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/DataIngestionPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/test-resource-group/providers/Microsoft.DataFactory/factories/test-data-factory", + "name": "CopyBlobToSQL", + "description": "Copy data from Blob to SQL", + "type": { + "string": "COMMAND" + }, + "flowUrn": "urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD)", + "env": "PROD" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-basic", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD),TransformData)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "activity_type": "ExecuteDataFlow", + "activity_description": "Execute mapping data flow", + "timeout": "7.00:00:00", + "retry": "0" + }, + 
"externalUrl": "https://adf.azure.com/en/authoring/pipeline/DataIngestionPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/test-resource-group/providers/Microsoft.DataFactory/factories/test-data-factory", + "name": "TransformData", + "description": "Execute mapping data flow", + "type": { + "string": "COMMAND" + }, + "flowUrn": "urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD)", + "env": "PROD" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-basic", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD),CopyBlobToSQL)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:0690c0f5c38fc7fe7ad0518a59384f93", + "urn": "urn:li:container:0690c0f5c38fc7fe7ad0518a59384f93" + }, + { + "id": "test-data-factory.DataIngestionPipeline", + "urn": "urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-basic", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD),CopyBlobToSQL)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Copy Activity" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-basic", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD),LookupConfig)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": 
"urn:li:dataPlatform:azure_data_factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-basic", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD),LookupConfig)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "activity_type": "Lookup", + "activity_description": "Lookup configuration values", + "timeout": "7.00:00:00", + "retry": "0" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/DataIngestionPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/test-resource-group/providers/Microsoft.DataFactory/factories/test-data-factory", + "name": "LookupConfig", + "description": "Lookup configuration values", + "type": { + "string": "COMMAND" + }, + "flowUrn": "urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD)", + "env": "PROD" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-basic", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD),CopyBlobToSQL)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:azure_blob_storage,raw-data/input/data.csv,PROD)" + ], + "outputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:mssql,ProcessedData,PROD)" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-basic", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD),CopyBlobToSQL)", + "changeType": "UPSERT", + "aspectName": "status", + 
"aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-basic", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure_data_factory,test-data-factory.DataProcessingPipeline,PROD)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure_data_factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-basic", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD),LookupConfig)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Lookup Activity" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-basic", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure_data_factory,test-data-factory.DataProcessingPipeline,PROD)", + "changeType": "UPSERT", + "aspectName": "dataFlowInfo", + "aspect": { + "json": { + "customProperties": { + "azure_resource_id": "/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/test-resource-group/providers/Microsoft.DataFactory/factories/test-data-factory/pipelines/DataProcessingPipeline", + "factory_name": "test-data-factory" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/DataProcessingPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/test-resource-group/providers/Microsoft.DataFactory/factories/test-data-factory", + "name": "DataProcessingPipeline", + "description": "Data processing and transformation", + "env": "PROD" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-basic", + "lastRunId": "no-run-id-provided" + } +}, +{ + 
"entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD),LookupConfig)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:azure_blob_storage,config/settings.json,PROD)" + ], + "outputDatasets": [] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-basic", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure_data_factory,test-data-factory.DataProcessingPipeline,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Pipeline" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-basic", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,test-data-factory.DataProcessingPipeline,PROD),CallStoredProc)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:0690c0f5c38fc7fe7ad0518a59384f93", + "urn": "urn:li:container:0690c0f5c38fc7fe7ad0518a59384f93" + }, + { + "id": "test-data-factory.DataProcessingPipeline", + "urn": "urn:li:dataFlow:(azure_data_factory,test-data-factory.DataProcessingPipeline,PROD)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-basic", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure_data_factory,test-data-factory.DataProcessingPipeline,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:0690c0f5c38fc7fe7ad0518a59384f93" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-basic", + "lastRunId": "no-run-id-provided" + } 
+}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure_data_factory,test-data-factory.DataProcessingPipeline,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:0690c0f5c38fc7fe7ad0518a59384f93", + "urn": "urn:li:container:0690c0f5c38fc7fe7ad0518a59384f93" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-basic", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,test-data-factory.DataProcessingPipeline,PROD),CallStoredProc)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Stored Procedure Activity" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-basic", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD),LookupConfig)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:0690c0f5c38fc7fe7ad0518a59384f93", + "urn": "urn:li:container:0690c0f5c38fc7fe7ad0518a59384f93" + }, + { + "id": "test-data-factory.DataIngestionPipeline", + "urn": "urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-basic", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,test-data-factory.DataProcessingPipeline,PROD),CallStoredProc)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure_data_factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + 
"runId": "adf-test-basic", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure_data_factory,test-data-factory.DataProcessingPipeline,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-basic", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,test-data-factory.DataProcessingPipeline,PROD),CallStoredProc)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-basic", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,test-data-factory.DataProcessingPipeline,PROD),CallStoredProc)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "activity_type": "SqlServerStoredProcedure", + "activity_description": "Call stored procedure", + "timeout": "7.00:00:00", + "retry": "0" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/DataProcessingPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/test-resource-group/providers/Microsoft.DataFactory/factories/test-data-factory", + "name": "CallStoredProc", + "description": "Call stored procedure", + "type": { + "string": "COMMAND" + }, + "flowUrn": "urn:li:dataFlow:(azure_data_factory,test-data-factory.DataProcessingPipeline,PROD)", + "env": "PROD" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-basic", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": 
"urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD),LookupConfig)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-basic", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-basic", + "lastRunId": "no-run-id-provided" + } +} +] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/azure_data_factory/adf_platform_instance_golden.json b/metadata-ingestion/tests/integration/azure_data_factory/adf_platform_instance_golden.json new file mode 100644 index 00000000000000..05d62f667a6fcc --- /dev/null +++ b/metadata-ingestion/tests/integration/azure_data_factory/adf_platform_instance_golden.json @@ -0,0 +1,812 @@ +[ +{ + "entityType": "container", + "entityUrn": "urn:li:container:b5eba2c40de50f1afa7f4c8876a663d4", + "changeType": "UPSERT", + "aspectName": "containerProperties", + "aspect": { + "json": { + "customProperties": { + "platform": "azure_data_factory", + "instance": "my-adf-instance", + "env": "DEV", + "resource_group": "test-resource-group", + "factory_name": "test-data-factory", + "azure_resource_id": "/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/test-resource-group/providers/Microsoft.DataFactory/factories/test-data-factory", + "location": "eastus", + "tag:environment": "test", + "tag:team": "data-engineering", + "provisioning_state": "Succeeded" + }, + "externalUrl": 
"https://adf.azure.com/en/home?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/test-resource-group/providers/Microsoft.DataFactory/factories/test-data-factory", + "name": "test-data-factory", + "description": "Azure Data Factory: test-data-factory", + "env": "DEV" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-platform-instance", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:b5eba2c40de50f1afa7f4c8876a663d4", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure_data_factory", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:azure_data_factory,my-adf-instance)" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-platform-instance", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:b5eba2c40de50f1afa7f4c8876a663d4", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Data Factory" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-platform-instance", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:b5eba2c40de50f1afa7f4c8876a663d4", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:azure_data_factory,my-adf-instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:azure_data_factory,my-adf-instance)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-platform-instance", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": 
"urn:li:dataFlow:(azure_data_factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure_data_factory", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:azure_data_factory,my-adf-instance)" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-platform-instance", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:b5eba2c40de50f1afa7f4c8876a663d4", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-platform-instance", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure_data_factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "dataFlowInfo", + "aspect": { + "json": { + "customProperties": { + "azure_resource_id": "/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/test-resource-group/providers/Microsoft.DataFactory/factories/test-data-factory/pipelines/DataIngestionPipeline", + "factory_name": "test-data-factory", + "triggers": "DailyScheduleTrigger" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/DataIngestionPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/test-resource-group/providers/Microsoft.DataFactory/factories/test-data-factory", + "name": "DataIngestionPipeline", + "description": "Main data ingestion pipeline", + "env": "DEV" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-platform-instance", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": 
"urn:li:dataFlow:(azure_data_factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:b5eba2c40de50f1afa7f4c8876a663d4" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-platform-instance", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure_data_factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Pipeline" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-platform-instance", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV),CopyBlobToSQL)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure_data_factory", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:azure_data_factory,my-adf-instance)" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-platform-instance", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV),CopyBlobToSQL)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "activity_type": "Copy", + "activity_description": "Copy data from Blob to SQL", + "timeout": "7.00:00:00", + "retry": "0" + }, + "externalUrl": 
"https://adf.azure.com/en/authoring/pipeline/DataIngestionPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/test-resource-group/providers/Microsoft.DataFactory/factories/test-data-factory", + "name": "CopyBlobToSQL", + "description": "Copy data from Blob to SQL", + "type": { + "string": "COMMAND" + }, + "flowUrn": "urn:li:dataFlow:(azure_data_factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV)", + "env": "DEV" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-platform-instance", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure_data_factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:azure_data_factory,my-adf-instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:azure_data_factory,my-adf-instance)" + }, + { + "id": "urn:li:container:b5eba2c40de50f1afa7f4c8876a663d4", + "urn": "urn:li:container:b5eba2c40de50f1afa7f4c8876a663d4" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-platform-instance", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure_data_factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-platform-instance", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV),CopyBlobToSQL)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + 
"typeNames": [ + "Copy Activity" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-platform-instance", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV),CopyBlobToSQL)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:azure_blob_storage,raw-data/input/data.csv,DEV)" + ], + "outputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:mssql,ProcessedData,DEV)" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-platform-instance", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,my-adf-instance.test-data-factory.DataProcessingPipeline,DEV),CallStoredProc)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:azure_data_factory,my-adf-instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:azure_data_factory,my-adf-instance)" + }, + { + "id": "urn:li:container:b5eba2c40de50f1afa7f4c8876a663d4", + "urn": "urn:li:container:b5eba2c40de50f1afa7f4c8876a663d4" + }, + { + "id": "my-adf-instance.test-data-factory.DataProcessingPipeline", + "urn": "urn:li:dataFlow:(azure_data_factory,my-adf-instance.test-data-factory.DataProcessingPipeline,DEV)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-platform-instance", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,my-adf-instance.test-data-factory.DataProcessingPipeline,DEV),CallStoredProc)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + 
"json": { + "typeNames": [ + "Stored Procedure Activity" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-platform-instance", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV),CopyBlobToSQL)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:azure_data_factory,my-adf-instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:azure_data_factory,my-adf-instance)" + }, + { + "id": "urn:li:container:b5eba2c40de50f1afa7f4c8876a663d4", + "urn": "urn:li:container:b5eba2c40de50f1afa7f4c8876a663d4" + }, + { + "id": "my-adf-instance.test-data-factory.DataIngestionPipeline", + "urn": "urn:li:dataFlow:(azure_data_factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-platform-instance", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,my-adf-instance.test-data-factory.DataProcessingPipeline,DEV),CallStoredProc)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure_data_factory", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:azure_data_factory,my-adf-instance)" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-platform-instance", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV),LookupConfig)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + 
"platform": "urn:li:dataPlatform:azure_data_factory", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:azure_data_factory,my-adf-instance)" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-platform-instance", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV),LookupConfig)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "activity_type": "Lookup", + "activity_description": "Lookup configuration values", + "timeout": "7.00:00:00", + "retry": "0" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/DataIngestionPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/test-resource-group/providers/Microsoft.DataFactory/factories/test-data-factory", + "name": "LookupConfig", + "description": "Lookup configuration values", + "type": { + "string": "COMMAND" + }, + "flowUrn": "urn:li:dataFlow:(azure_data_factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV)", + "env": "DEV" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-platform-instance", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,my-adf-instance.test-data-factory.DataProcessingPipeline,DEV),CallStoredProc)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-platform-instance", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,my-adf-instance.test-data-factory.DataProcessingPipeline,DEV),CallStoredProc)", + "changeType": "UPSERT", + "aspectName": 
"dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "activity_type": "SqlServerStoredProcedure", + "activity_description": "Call stored procedure", + "timeout": "7.00:00:00", + "retry": "0" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/DataProcessingPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/test-resource-group/providers/Microsoft.DataFactory/factories/test-data-factory", + "name": "CallStoredProc", + "description": "Call stored procedure", + "type": { + "string": "COMMAND" + }, + "flowUrn": "urn:li:dataFlow:(azure_data_factory,my-adf-instance.test-data-factory.DataProcessingPipeline,DEV)", + "env": "DEV" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-platform-instance", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV),LookupConfig)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Lookup Activity" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-platform-instance", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV),LookupConfig)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:azure_blob_storage,config/settings.json,DEV)" + ], + "outputDatasets": [] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-platform-instance", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure_data_factory,my-adf-instance.test-data-factory.DataProcessingPipeline,DEV)", + "changeType": 
"UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:b5eba2c40de50f1afa7f4c8876a663d4" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-platform-instance", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure_data_factory,my-adf-instance.test-data-factory.DataProcessingPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:azure_data_factory,my-adf-instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:azure_data_factory,my-adf-instance)" + }, + { + "id": "urn:li:container:b5eba2c40de50f1afa7f4c8876a663d4", + "urn": "urn:li:container:b5eba2c40de50f1afa7f4c8876a663d4" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-platform-instance", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV),LookupConfig)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:azure_data_factory,my-adf-instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:azure_data_factory,my-adf-instance)" + }, + { + "id": "urn:li:container:b5eba2c40de50f1afa7f4c8876a663d4", + "urn": "urn:li:container:b5eba2c40de50f1afa7f4c8876a663d4" + }, + { + "id": "my-adf-instance.test-data-factory.DataIngestionPipeline", + "urn": "urn:li:dataFlow:(azure_data_factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-platform-instance", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": 
"urn:li:dataFlow:(azure_data_factory,my-adf-instance.test-data-factory.DataProcessingPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Pipeline" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-platform-instance", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV),TransformData)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure_data_factory", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:azure_data_factory,my-adf-instance)" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-platform-instance", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV),TransformData)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "activity_type": "ExecuteDataFlow", + "activity_description": "Execute mapping data flow", + "timeout": "7.00:00:00", + "retry": "0" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/DataIngestionPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/test-resource-group/providers/Microsoft.DataFactory/factories/test-data-factory", + "name": "TransformData", + "description": "Execute mapping data flow", + "type": { + "string": "COMMAND" + }, + "flowUrn": "urn:li:dataFlow:(azure_data_factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV)", + "env": "DEV" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-platform-instance", + "lastRunId": "no-run-id-provided" + } +}, +{ + 
"entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure_data_factory,my-adf-instance.test-data-factory.DataProcessingPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "dataFlowInfo", + "aspect": { + "json": { + "customProperties": { + "azure_resource_id": "/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/test-resource-group/providers/Microsoft.DataFactory/factories/test-data-factory/pipelines/DataProcessingPipeline", + "factory_name": "test-data-factory" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/DataProcessingPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/test-resource-group/providers/Microsoft.DataFactory/factories/test-data-factory", + "name": "DataProcessingPipeline", + "description": "Data processing and transformation", + "env": "DEV" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-platform-instance", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure_data_factory,my-adf-instance.test-data-factory.DataProcessingPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure_data_factory", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:azure_data_factory,my-adf-instance)" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-platform-instance", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV),TransformData)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Data Flow Activity" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-platform-instance", + "lastRunId": "no-run-id-provided" + } +}, +{ + 
"entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure_data_factory,my-adf-instance.test-data-factory.DataProcessingPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-platform-instance", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV),TransformData)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:azure_data_factory,my-adf-instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:azure_data_factory,my-adf-instance)" + }, + { + "id": "urn:li:container:b5eba2c40de50f1afa7f4c8876a663d4", + "urn": "urn:li:container:b5eba2c40de50f1afa7f4c8876a663d4" + }, + { + "id": "my-adf-instance.test-data-factory.DataIngestionPipeline", + "urn": "urn:li:dataFlow:(azure_data_factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-platform-instance", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV),CopyBlobToSQL)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-platform-instance", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV),TransformData)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + 
"json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-platform-instance", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV),LookupConfig)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-platform-instance", + "lastRunId": "no-run-id-provided" + } +} +] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/azure_data_factory/adf_with_runs_golden.json b/metadata-ingestion/tests/integration/azure_data_factory/adf_with_runs_golden.json new file mode 100644 index 00000000000000..2043c73b4bbbdd --- /dev/null +++ b/metadata-ingestion/tests/integration/azure_data_factory/adf_with_runs_golden.json @@ -0,0 +1,1150 @@ +[ +{ + "entityType": "container", + "entityUrn": "urn:li:container:0690c0f5c38fc7fe7ad0518a59384f93", + "changeType": "UPSERT", + "aspectName": "containerProperties", + "aspect": { + "json": { + "customProperties": { + "platform": "azure_data_factory", + "env": "PROD", + "resource_group": "test-resource-group", + "factory_name": "test-data-factory", + "azure_resource_id": "/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/test-resource-group/providers/Microsoft.DataFactory/factories/test-data-factory", + "location": "eastus", + "tag:environment": "test", + "tag:team": "data-engineering", + "provisioning_state": "Succeeded" + }, + "externalUrl": "https://adf.azure.com/en/home?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/test-resource-group/providers/Microsoft.DataFactory/factories/test-data-factory", + "name": "test-data-factory", + "description": "Azure Data Factory: test-data-factory", + "env": "PROD" + } + }, + "systemMetadata": { + "lastObserved": 
1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:0690c0f5c38fc7fe7ad0518a59384f93", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure_data_factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:0690c0f5c38fc7fe7ad0518a59384f93", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Data Factory" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:0690c0f5c38fc7fe7ad0518a59384f93", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure_data_factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD)", + "changeType": "UPSERT", + "aspectName": "dataFlowInfo", + "aspect": { + "json": { + "customProperties": { + "azure_resource_id": 
"/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/test-resource-group/providers/Microsoft.DataFactory/factories/test-data-factory/pipelines/DataIngestionPipeline", + "factory_name": "test-data-factory", + "triggers": "DailyScheduleTrigger" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/DataIngestionPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/test-resource-group/providers/Microsoft.DataFactory/factories/test-data-factory", + "name": "DataIngestionPipeline", + "description": "Main data ingestion pipeline", + "env": "PROD" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Pipeline" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:0690c0f5c38fc7fe7ad0518a59384f93" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:0690c0f5c38fc7fe7ad0518a59384f93", + "urn": "urn:li:container:0690c0f5c38fc7fe7ad0518a59384f93" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": 
"adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD),CopyBlobToSQL)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure_data_factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD),CopyBlobToSQL)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "activity_type": "Copy", + "activity_description": "Copy data from Blob to SQL", + "timeout": "7.00:00:00", + "retry": "0" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/DataIngestionPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/test-resource-group/providers/Microsoft.DataFactory/factories/test-data-factory", + "name": "CopyBlobToSQL", + "description": "Copy data from Blob to SQL", + "type": { + "string": "COMMAND" + }, + "flowUrn": "urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD)", + "env": "PROD" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD),CopyBlobToSQL)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Copy Activity" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": 
"urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD),CopyBlobToSQL)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:azure_blob_storage,raw-data/input/data.csv,PROD)" + ], + "outputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:mssql,ProcessedData,PROD)" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD),CopyBlobToSQL)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:0690c0f5c38fc7fe7ad0518a59384f93", + "urn": "urn:li:container:0690c0f5c38fc7fe7ad0518a59384f93" + }, + { + "id": "test-data-factory.DataIngestionPipeline", + "urn": "urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD),LookupConfig)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure_data_factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD),LookupConfig)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "activity_type": "Lookup", + 
"activity_description": "Lookup configuration values", + "timeout": "7.00:00:00", + "retry": "0" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/DataIngestionPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/test-resource-group/providers/Microsoft.DataFactory/factories/test-data-factory", + "name": "LookupConfig", + "description": "Lookup configuration values", + "type": { + "string": "COMMAND" + }, + "flowUrn": "urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD)", + "env": "PROD" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD),LookupConfig)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Lookup Activity" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD),LookupConfig)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:azure_blob_storage,config/settings.json,PROD)" + ], + "outputDatasets": [] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD),LookupConfig)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:0690c0f5c38fc7fe7ad0518a59384f93", + "urn": 
"urn:li:container:0690c0f5c38fc7fe7ad0518a59384f93" + }, + { + "id": "test-data-factory.DataIngestionPipeline", + "urn": "urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD),TransformData)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure_data_factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD),TransformData)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "activity_type": "ExecuteDataFlow", + "activity_description": "Execute mapping data flow", + "timeout": "7.00:00:00", + "retry": "0" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/DataIngestionPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/test-resource-group/providers/Microsoft.DataFactory/factories/test-data-factory", + "name": "TransformData", + "description": "Execute mapping data flow", + "type": { + "string": "COMMAND" + }, + "flowUrn": "urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD)", + "env": "PROD" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD),TransformData)", + "changeType": 
"UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Data Flow Activity" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD),TransformData)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:0690c0f5c38fc7fe7ad0518a59384f93", + "urn": "urn:li:container:0690c0f5c38fc7fe7ad0518a59384f93" + }, + { + "id": "test-data-factory.DataIngestionPipeline", + "urn": "urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure_data_factory,test-data-factory.DataProcessingPipeline,PROD)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure_data_factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure_data_factory,test-data-factory.DataProcessingPipeline,PROD)", + "changeType": "UPSERT", + "aspectName": "dataFlowInfo", + "aspect": { + "json": { + "customProperties": { + "azure_resource_id": "/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/test-resource-group/providers/Microsoft.DataFactory/factories/test-data-factory/pipelines/DataProcessingPipeline", + "factory_name": "test-data-factory" + }, + "externalUrl": 
"https://adf.azure.com/en/authoring/pipeline/DataProcessingPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/test-resource-group/providers/Microsoft.DataFactory/factories/test-data-factory", + "name": "DataProcessingPipeline", + "description": "Data processing and transformation", + "env": "PROD" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure_data_factory,test-data-factory.DataProcessingPipeline,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Pipeline" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure_data_factory,test-data-factory.DataProcessingPipeline,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:0690c0f5c38fc7fe7ad0518a59384f93" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure_data_factory,test-data-factory.DataProcessingPipeline,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:0690c0f5c38fc7fe7ad0518a59384f93", + "urn": "urn:li:container:0690c0f5c38fc7fe7ad0518a59384f93" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,test-data-factory.DataProcessingPipeline,PROD),CallStoredProc)", + "changeType": "UPSERT", + "aspectName": 
"dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure_data_factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,test-data-factory.DataProcessingPipeline,PROD),CallStoredProc)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "activity_type": "SqlServerStoredProcedure", + "activity_description": "Call stored procedure", + "timeout": "7.00:00:00", + "retry": "0" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/DataProcessingPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/test-resource-group/providers/Microsoft.DataFactory/factories/test-data-factory", + "name": "CallStoredProc", + "description": "Call stored procedure", + "type": { + "string": "COMMAND" + }, + "flowUrn": "urn:li:dataFlow:(azure_data_factory,test-data-factory.DataProcessingPipeline,PROD)", + "env": "PROD" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,test-data-factory.DataProcessingPipeline,PROD),CallStoredProc)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Stored Procedure Activity" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,test-data-factory.DataProcessingPipeline,PROD),CallStoredProc)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": 
"urn:li:container:0690c0f5c38fc7fe7ad0518a59384f93", + "urn": "urn:li:container:0690c0f5c38fc7fe7ad0518a59384f93" + }, + { + "id": "test-data-factory.DataProcessingPipeline", + "urn": "urn:li:dataFlow:(azure_data_factory,test-data-factory.DataProcessingPipeline,PROD)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:01640a3ec425fdac43877636b9eeafba", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceProperties", + "aspect": { + "json": { + "customProperties": { + "run_id": "run-001-abc", + "status": "Succeeded", + "invoked_by": "Manual", + "invoked_by_type": "Manual" + }, + "externalUrl": "https://adf.azure.com/en/monitoring/pipelineruns/run-001-abc?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/test-resource-group/providers/Microsoft.DataFactory/factories/test-data-factory", + "name": "run-001-abc", + "type": "BATCH_SCHEDULED", + "created": { + "time": 1705305600000, + "actor": "urn:li:corpuser:datahub" + } + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:01640a3ec425fdac43877636b9eeafba", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRelationships", + "aspect": { + "json": { + "parentTemplate": "urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD)", + "upstreamInstances": [] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:01640a3ec425fdac43877636b9eeafba", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": 
[ + "Pipeline Run" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:01640a3ec425fdac43877636b9eeafba", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1705305600000, + "partitionSpec": { + "partition": "FULL_TABLE_SNAPSHOT", + "type": "FULL_TABLE" + }, + "status": "STARTED" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:01640a3ec425fdac43877636b9eeafba", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1705308300000, + "partitionSpec": { + "partition": "FULL_TABLE_SNAPSHOT", + "type": "FULL_TABLE" + }, + "status": "COMPLETE", + "result": { + "type": "SUCCESS", + "nativeResultType": "Succeeded" + } + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:346ef377189e5b4227f7687c2d3fc47c", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceProperties", + "aspect": { + "json": { + "customProperties": { + "run_id": "run-002-def", + "status": "Failed", + "invoked_by": "Manual", + "invoked_by_type": "Manual" + }, + "externalUrl": "https://adf.azure.com/en/monitoring/pipelineruns/run-002-def?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/test-resource-group/providers/Microsoft.DataFactory/factories/test-data-factory", + "name": "run-002-def", + "type": "BATCH_SCHEDULED", + "created": { + "time": 1705219200000, + "actor": "urn:li:corpuser:datahub" + } + } + }, + 
"systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:346ef377189e5b4227f7687c2d3fc47c", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRelationships", + "aspect": { + "json": { + "parentTemplate": "urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD)", + "upstreamInstances": [] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:346ef377189e5b4227f7687c2d3fc47c", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Pipeline Run" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:346ef377189e5b4227f7687c2d3fc47c", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1705219200000, + "partitionSpec": { + "partition": "FULL_TABLE_SNAPSHOT", + "type": "FULL_TABLE" + }, + "status": "STARTED" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:346ef377189e5b4227f7687c2d3fc47c", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1705220100000, + "partitionSpec": { + "partition": "FULL_TABLE_SNAPSHOT", + "type": "FULL_TABLE" + }, + "status": "COMPLETE", + "result": { + "type": "FAILURE", + "nativeResultType": "Failed" + } + } + }, + "systemMetadata": { + "lastObserved": 
1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:e94dcdc0d6c440103bda5f316ca28a9e", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceProperties", + "aspect": { + "json": { + "customProperties": { + "run_id": "run-003-ghi", + "status": "Succeeded", + "invoked_by": "Manual", + "invoked_by_type": "Manual" + }, + "externalUrl": "https://adf.azure.com/en/monitoring/pipelineruns/run-003-ghi?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/test-resource-group/providers/Microsoft.DataFactory/factories/test-data-factory", + "name": "run-003-ghi", + "type": "BATCH_SCHEDULED", + "created": { + "time": 1705309200000, + "actor": "urn:li:corpuser:datahub" + } + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:e94dcdc0d6c440103bda5f316ca28a9e", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRelationships", + "aspect": { + "json": { + "parentTemplate": "urn:li:dataFlow:(azure_data_factory,test-data-factory.DataProcessingPipeline,PROD)", + "upstreamInstances": [] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:e94dcdc0d6c440103bda5f316ca28a9e", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Pipeline Run" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:e94dcdc0d6c440103bda5f316ca28a9e", + "changeType": "UPSERT", + "aspectName": 
"dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1705309200000, + "partitionSpec": { + "partition": "FULL_TABLE_SNAPSHOT", + "type": "FULL_TABLE" + }, + "status": "STARTED" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:e94dcdc0d6c440103bda5f316ca28a9e", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1705311000000, + "partitionSpec": { + "partition": "FULL_TABLE_SNAPSHOT", + "type": "FULL_TABLE" + }, + "status": "COMPLETE", + "result": { + "type": "SUCCESS", + "nativeResultType": "Succeeded" + } + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:0690c0f5c38fc7fe7ad0518a59384f93", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure_data_factory,test-data-factory.DataProcessingPipeline,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, 
+{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD),CopyBlobToSQL)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD),LookupConfig)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD),TransformData)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,test-data-factory.DataProcessingPipeline,PROD),CallStoredProc)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:01640a3ec425fdac43877636b9eeafba", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + 
"entityUrn": "urn:li:dataProcessInstance:346ef377189e5b4227f7687c2d3fc47c", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:e94dcdc0d6c440103bda5f316ca28a9e", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +} +] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/azure_data_factory/test_adf_source.py b/metadata-ingestion/tests/integration/azure_data_factory/test_adf_source.py new file mode 100644 index 00000000000000..36b9a215bb4eb0 --- /dev/null +++ b/metadata-ingestion/tests/integration/azure_data_factory/test_adf_source.py @@ -0,0 +1,608 @@ +"""Integration tests for Azure Data Factory source. + +These tests use mocked Azure SDK responses to verify the full ingestion pipeline +produces the expected metadata events. 
+""" + +from datetime import datetime, timezone +from typing import Any, Dict, Iterator, List +from unittest import mock +from unittest.mock import MagicMock + +import pytest +from freezegun import freeze_time + +from datahub.ingestion.run.pipeline import Pipeline +from datahub.testing import mce_helpers + +FROZEN_TIME = "2024-01-15 12:00:00" + +# Mock Azure SDK response data + + +def create_mock_factory( + name: str, + resource_group: str, + subscription_id: str, + location: str = "eastus", + tags: Dict[str, str] | None = None, +) -> Dict[str, Any]: + """Create a mock factory response.""" + return { + "id": f"/subscriptions/{subscription_id}/resourceGroups/{resource_group}/providers/Microsoft.DataFactory/factories/{name}", + "name": name, + "type": "Microsoft.DataFactory/factories", + "location": location, + "tags": tags or {}, + "properties": { + "provisioningState": "Succeeded", + "createTime": "2024-01-01T00:00:00Z", + }, + } + + +def create_mock_pipeline( + name: str, + factory_name: str, + resource_group: str, + subscription_id: str, + activities: List[Dict[str, Any]] | None = None, + description: str | None = None, +) -> Dict[str, Any]: + """Create a mock pipeline response.""" + return { + "id": f"/subscriptions/{subscription_id}/resourceGroups/{resource_group}/providers/Microsoft.DataFactory/factories/{factory_name}/pipelines/{name}", + "name": name, + "type": "Microsoft.DataFactory/factories/pipelines", + "properties": { + "description": description, + "activities": activities or [], + "parameters": {}, + "variables": {}, + "annotations": [], + }, + } + + +def create_mock_activity( + name: str, + activity_type: str, + inputs: List[Dict[str, Any]] | None = None, + outputs: List[Dict[str, Any]] | None = None, + depends_on: List[Dict[str, Any]] | None = None, + description: str | None = None, +) -> Dict[str, Any]: + """Create a mock activity definition.""" + return { + "name": name, + "type": activity_type, + "description": description, + "dependsOn": 
depends_on or [], + "inputs": inputs or [], + "outputs": outputs or [], + "typeProperties": {}, + "policy": {"timeout": "7.00:00:00", "retry": 0}, + "userProperties": [], + } + + +def create_mock_dataset( + name: str, + factory_name: str, + resource_group: str, + subscription_id: str, + linked_service_name: str, + dataset_type: str = "AzureBlobDataset", + type_properties: Dict[str, Any] | None = None, +) -> Dict[str, Any]: + """Create a mock dataset response.""" + return { + "id": f"/subscriptions/{subscription_id}/resourceGroups/{resource_group}/providers/Microsoft.DataFactory/factories/{factory_name}/datasets/{name}", + "name": name, + "type": "Microsoft.DataFactory/factories/datasets", + "properties": { + "linkedServiceName": { + "referenceName": linked_service_name, + "type": "LinkedServiceReference", + }, + "type": dataset_type, + "typeProperties": type_properties or {}, + "annotations": [], + "parameters": {}, + }, + } + + +def create_mock_linked_service( + name: str, + factory_name: str, + resource_group: str, + subscription_id: str, + service_type: str = "AzureBlobStorage", +) -> Dict[str, Any]: + """Create a mock linked service response.""" + return { + "id": f"/subscriptions/{subscription_id}/resourceGroups/{resource_group}/providers/Microsoft.DataFactory/factories/{factory_name}/linkedservices/{name}", + "name": name, + "type": "Microsoft.DataFactory/factories/linkedservices", + "properties": { + "type": service_type, + "typeProperties": {}, + "annotations": [], + }, + } + + +def create_mock_trigger( + name: str, + factory_name: str, + resource_group: str, + subscription_id: str, + trigger_type: str = "ScheduleTrigger", + pipelines: List[str] | None = None, +) -> Dict[str, Any]: + """Create a mock trigger response.""" + pipeline_refs = [ + { + "pipelineReference": {"referenceName": p, "type": "PipelineReference"}, + "parameters": {}, + } + for p in (pipelines or []) + ] + return { + "id": 
f"/subscriptions/{subscription_id}/resourceGroups/{resource_group}/providers/Microsoft.DataFactory/factories/{factory_name}/triggers/{name}", + "name": name, + "type": "Microsoft.DataFactory/factories/triggers", + "properties": { + "type": trigger_type, + "runtimeState": "Started", + "pipelines": pipeline_refs, + "typeProperties": {}, + "annotations": [], + }, + } + + +def create_mock_pipeline_run( + run_id: str, + pipeline_name: str, + status: str = "Succeeded", + start_time: datetime | None = None, + end_time: datetime | None = None, +) -> Dict[str, Any]: + """Create a mock pipeline run response.""" + return { + "runId": run_id, + "pipelineName": pipeline_name, + "status": status, + "runStart": ( + start_time or datetime(2024, 1, 15, 10, 0, 0, tzinfo=timezone.utc) + ).isoformat(), + "runEnd": ( + end_time or datetime(2024, 1, 15, 10, 30, 0, tzinfo=timezone.utc) + ).isoformat(), + "durationInMs": 1800000, + "message": None, + "parameters": {}, + "invokedBy": {"name": "Manual", "invokedByType": "Manual"}, + "lastUpdated": datetime( + 2024, 1, 15, 10, 30, 0, tzinfo=timezone.utc + ).isoformat(), + } + + +class MockAzureResource: + """Mock class to simulate Azure SDK resource objects.""" + + def __init__(self, data: Dict[str, Any]): + self._data = data + + def as_dict(self) -> Dict[str, Any]: + return self._data + + +class MockPagedIterator: + """Mock class to simulate Azure SDK paged iterators.""" + + def __init__(self, items: List[Dict[str, Any]]): + self._items = [MockAzureResource(item) for item in items] + + def __iter__(self) -> Iterator[MockAzureResource]: + return iter(self._items) + + +class MockQueryResponse: + """Mock class for query responses with continuation token.""" + + def __init__( + self, items: List[Dict[str, Any]], continuation_token: str | None = None + ): + self.value = [MockAzureResource(item) for item in items] + self.continuation_token = continuation_token + + +# Test data constants +SUBSCRIPTION_ID = "12345678-1234-1234-1234-123456789012" 
RESOURCE_GROUP = "test-resource-group"
FACTORY_NAME = "test-data-factory"


def get_mock_test_data() -> Dict[str, Any]:
    """Generate comprehensive test data for the ADF source.

    Returns:
        A dict with the keys ``factories``, ``pipelines``, ``datasets``,
        ``linked_services``, ``triggers`` and ``pipeline_runs``, each holding
        a list of mock response dicts for the single test factory.
    """
    # Every child resource lives in the same factory / resource group /
    # subscription, so the shared keyword arguments are spelled out once.
    scope = {
        "factory_name": FACTORY_NAME,
        "resource_group": RESOURCE_GROUP,
        "subscription_id": SUBSCRIPTION_ID,
    }

    factories = [
        create_mock_factory(
            name=FACTORY_NAME,
            resource_group=RESOURCE_GROUP,
            subscription_id=SUBSCRIPTION_ID,
            tags={"environment": "test", "team": "data-engineering"},
        ),
    ]

    # Create pipelines with various activities
    copy_activity = create_mock_activity(
        name="CopyBlobToSQL",
        activity_type="Copy",
        inputs=[{"referenceName": "SourceBlobDataset", "type": "DatasetReference"}],
        outputs=[{"referenceName": "DestSqlDataset", "type": "DatasetReference"}],
        description="Copy data from Blob to SQL",
    )

    lookup_activity = create_mock_activity(
        name="LookupConfig",
        activity_type="Lookup",
        inputs=[{"referenceName": "ConfigDataset", "type": "DatasetReference"}],
        description="Lookup configuration values",
    )

    dataflow_activity = create_mock_activity(
        name="TransformData",
        activity_type="ExecuteDataFlow",
        depends_on=[
            {"activity": "LookupConfig", "dependencyConditions": ["Succeeded"]}
        ],
        description="Execute mapping data flow",
    )

    stored_proc_activity = create_mock_activity(
        name="CallStoredProc",
        activity_type="SqlServerStoredProcedure",
        depends_on=[
            {"activity": "CopyBlobToSQL", "dependencyConditions": ["Succeeded"]}
        ],
        description="Call stored procedure",
    )

    pipelines = [
        create_mock_pipeline(
            name="DataIngestionPipeline",
            description="Main data ingestion pipeline",
            activities=[copy_activity, lookup_activity, dataflow_activity],
            **scope,
        ),
        create_mock_pipeline(
            name="DataProcessingPipeline",
            description="Data processing and transformation",
            activities=[stored_proc_activity],
            **scope,
        ),
    ]

    # Create datasets
    datasets = [
        create_mock_dataset(
            name="SourceBlobDataset",
            linked_service_name="AzureBlobStorageLS",
            dataset_type="DelimitedTextDataset",
            type_properties={
                "location": {
                    "container": "raw-data",
                    "folderPath": "input",
                    "fileName": "data.csv",
                }
            },
            **scope,
        ),
        create_mock_dataset(
            name="DestSqlDataset",
            linked_service_name="AzureSqlDatabaseLS",
            dataset_type="AzureSqlTableDataset",
            type_properties={"schema": "dbo", "table": "ProcessedData"},
            **scope,
        ),
        create_mock_dataset(
            name="ConfigDataset",
            linked_service_name="AzureBlobStorageLS",
            dataset_type="JsonDataset",
            type_properties={
                "location": {
                    "container": "config",
                    "fileName": "settings.json",
                }
            },
            **scope,
        ),
    ]

    # Create linked services
    linked_services = [
        create_mock_linked_service(
            name="AzureBlobStorageLS",
            service_type="AzureBlobStorage",
            **scope,
        ),
        create_mock_linked_service(
            name="AzureSqlDatabaseLS",
            service_type="AzureSqlDatabase",
            **scope,
        ),
    ]

    # Create triggers
    triggers = [
        create_mock_trigger(
            name="DailyScheduleTrigger",
            trigger_type="ScheduleTrigger",
            pipelines=["DataIngestionPipeline"],
            **scope,
        ),
    ]

    # Create pipeline runs
    pipeline_runs = [
        create_mock_pipeline_run(
            run_id="run-001-abc",
            pipeline_name="DataIngestionPipeline",
            status="Succeeded",
            start_time=datetime(2024, 1, 15, 8, 0, 0, tzinfo=timezone.utc),
            end_time=datetime(2024, 1, 15, 8, 45, 0, tzinfo=timezone.utc),
        ),
        create_mock_pipeline_run(
            run_id="run-002-def",
            pipeline_name="DataIngestionPipeline",
            status="Failed",
            start_time=datetime(2024, 1, 14, 8, 0, 0, tzinfo=timezone.utc),
            end_time=datetime(2024, 1, 14, 8, 15, 0, tzinfo=timezone.utc),
        ),
        create_mock_pipeline_run(
            run_id="run-003-ghi",
            pipeline_name="DataProcessingPipeline",
            status="Succeeded",
            start_time=datetime(2024, 1, 15, 9, 0, 0, tzinfo=timezone.utc),
            end_time=datetime(2024, 1, 15, 9, 30, 0, tzinfo=timezone.utc),
        ),
    ]

    return {
        "factories": factories,
        "pipelines": pipelines,
        "datasets": datasets,
        "linked_services": linked_services,
        "triggers": triggers,
        "pipeline_runs": pipeline_runs,
    }


def create_mock_client(test_data: Dict[str, Any]) -> MagicMock:
    """Create a mock DataFactoryManagementClient.

    Args:
        test_data: Dict shaped like the return value of
            ``get_mock_test_data``; its lists back the client's list/query
            methods.

    Returns:
        A ``MagicMock`` whose ``*.list*`` methods return ``MockPagedIterator``
        objects and whose ``*.query_by_*`` methods return
        ``MockQueryResponse`` objects. Data flows and activity runs are empty.
    """
    mock_client = MagicMock()

    # Mock factories
    mock_client.factories.list.return_value = MockPagedIterator(test_data["factories"])
    mock_client.factories.list_by_resource_group.return_value = MockPagedIterator(
        test_data["factories"]
    )

    # Mock pipelines
    mock_client.pipelines.list_by_factory.return_value = MockPagedIterator(
        test_data["pipelines"]
    )

    # Mock datasets
    mock_client.datasets.list_by_factory.return_value = MockPagedIterator(
        test_data["datasets"]
    )

    # Mock linked services
    mock_client.linked_services.list_by_factory.return_value = MockPagedIterator(
        test_data["linked_services"]
    )

    # Mock triggers
    mock_client.triggers.list_by_factory.return_value = MockPagedIterator(
        test_data["triggers"]
    )

    # Mock data flows (empty for basic tests)
    mock_client.data_flows.list_by_factory.return_value = MockPagedIterator([])

    # Mock pipeline runs
    mock_client.pipeline_runs.query_by_factory.return_value = MockQueryResponse(
        test_data["pipeline_runs"]
    )

    # Mock activity runs (empty for basic tests)
    mock_client.activity_runs.query_by_pipeline_run.return_value = MockQueryResponse([])

    return mock_client


def _run_adf_ingestion_and_check_golden(
    pytestconfig,
    tmp_path,
    *,
    run_id: str,
    output_name: str,
    golden_name: str,
    source_options: Dict[str, Any],
) -> None:
    """Run the ADF source against the mocked SDK and compare with a golden file.

    Args:
        pytestconfig: pytest's config fixture; supplies ``rootpath``.
        tmp_path: pytest's temporary directory fixture; receives the output.
        run_id: ``run_id`` of the ingestion recipe.
        output_name: File name of the sink output inside ``tmp_path``.
        golden_name: File name of the golden file in this test directory.
        source_options: Source config entries that differ between tests; they
            are merged over the shared subscription / resource group /
            credential / lineage settings.
    """
    test_resources_dir = pytestconfig.rootpath / "tests/integration/azure_data_factory"
    output_file = tmp_path / output_name
    golden_file = test_resources_dir / golden_name

    mock_client = create_mock_client(get_mock_test_data())

    source_config: Dict[str, Any] = {
        "subscription_id": SUBSCRIPTION_ID,
        "resource_group": RESOURCE_GROUP,
        "credential": {
            "authentication_method": "default",
        },
        "include_lineage": True,
        **source_options,
    }

    # Both the management client and the credential class are patched, so the
    # pipeline is created and run without the real Azure classes.
    with mock.patch(
        "datahub.ingestion.source.azure_data_factory.adf_client.DataFactoryManagementClient"
    ) as MockClientClass:
        MockClientClass.return_value = mock_client

        with mock.patch(
            "datahub.ingestion.source.azure.azure_auth.DefaultAzureCredential"
        ):
            pipeline = Pipeline.create(
                {
                    "run_id": run_id,
                    "source": {
                        "type": "azure-data-factory",
                        "config": source_config,
                    },
                    "sink": {
                        "type": "file",
                        "config": {
                            "filename": str(output_file),
                        },
                    },
                }
            )

            pipeline.run()
            pipeline.raise_from_status()

    # For the first run, the golden file has to be created; in subsequent
    # runs this compares the output against it.
    mce_helpers.check_golden_file(
        pytestconfig,
        output_path=str(output_file),
        golden_path=str(golden_file),
    )


@freeze_time(FROZEN_TIME)
@pytest.mark.integration
def test_adf_source_basic(pytestconfig, tmp_path):
    """Test basic ADF metadata extraction without execution history."""
    _run_adf_ingestion_and_check_golden(
        pytestconfig,
        tmp_path,
        run_id="adf-test-basic",
        output_name="adf_basic_events.json",
        golden_name="adf_basic_golden.json",
        source_options={
            "include_execution_history": False,
            "env": "PROD",
        },
    )


@freeze_time(FROZEN_TIME)
@pytest.mark.integration
def test_adf_source_with_execution_history(pytestconfig, tmp_path):
    """Test ADF metadata extraction with execution history."""
    _run_adf_ingestion_and_check_golden(
        pytestconfig,
        tmp_path,
        run_id="adf-test-with-runs",
        output_name="adf_with_runs_events.json",
        golden_name="adf_with_runs_golden.json",
        source_options={
            "include_execution_history": True,
            "execution_history_days": 7,
            "env": "PROD",
        },
    )


@freeze_time(FROZEN_TIME)
@pytest.mark.integration
def test_adf_source_with_platform_instance(pytestconfig, tmp_path):
    """Test ADF metadata extraction with platform instance configured."""
    _run_adf_ingestion_and_check_golden(
        pytestconfig,
        tmp_path,
        run_id="adf-test-platform-instance",
        output_name="adf_platform_instance_events.json",
        golden_name="adf_platform_instance_golden.json",
        source_options={
            "platform_instance": "my-adf-instance",
            "include_execution_history": False,
            "env": "DEV",
        },
    )
class TestAzureCredentialConfigValidation:
    """Validation rules of AzureCredentialConfig for service-principal auth."""

    def test_service_principal_requires_client_secret(self) -> None:
        """Omitting client_secret must be rejected and named in the error."""
        with pytest.raises(ValidationError, match="client_secret"):
            AzureCredentialConfig(
                authentication_method=AzureAuthenticationMethod.SERVICE_PRINCIPAL,
                client_id="test-client-id",
                tenant_id="test-tenant-id",
                # client_secret deliberately left out
            )

    def test_service_principal_requires_tenant_id(self) -> None:
        """Omitting tenant_id must be rejected and named in the error."""
        with pytest.raises(ValidationError, match="tenant_id"):
            AzureCredentialConfig(
                authentication_method=AzureAuthenticationMethod.SERVICE_PRINCIPAL,
                client_id="test-client-id",
                client_secret="test-secret",
                # tenant_id deliberately left out
            )

    def test_service_principal_requires_client_id(self) -> None:
        """Omitting client_id must be rejected and named in the error."""
        with pytest.raises(ValidationError, match="client_id"):
            AzureCredentialConfig(
                authentication_method=AzureAuthenticationMethod.SERVICE_PRINCIPAL,
                client_secret="test-secret",
                tenant_id="test-tenant-id",
                # client_id deliberately left out
            )

    def test_service_principal_valid_when_all_fields_present(self) -> None:
        """A complete service-principal credential must construct cleanly."""
        expected_method = AzureAuthenticationMethod.SERVICE_PRINCIPAL
        # Construction itself is the check: it must not raise.
        credential = AzureCredentialConfig(
            authentication_method=expected_method,
            client_id="test-client-id",
            client_secret="test-client-secret",
            tenant_id="test-tenant-id",
        )
        # Confirms an object came back; the point is that validation passed.
        assert credential.authentication_method == expected_method


class TestAzureDataFactoryConfigValidation:
    """Validation rules of AzureDataFactoryConfig."""

    def test_execution_history_days_minimum_bound(self) -> None:
        """A value of 0 for execution_history_days must be rejected."""
        too_small = 0
        with pytest.raises(ValidationError):
            AzureDataFactoryConfig(
                subscription_id="test",
                execution_history_days=too_small,
            )

    def test_execution_history_days_maximum_bound(self) -> None:
        """A value of 91 for execution_history_days must be rejected."""
        too_large = 91
        with pytest.raises(ValidationError):
            AzureDataFactoryConfig(
                subscription_id="test",
                execution_history_days=too_large,
            )

    def test_execution_history_days_accepts_boundary_values(self) -> None:
        """The boundary values 1 and 90 must both be accepted unchanged."""
        for boundary in (1, 90):
            # Construction must not raise for either end of the range.
            accepted = AzureDataFactoryConfig(
                subscription_id="test",
                execution_history_days=boundary,
            )
            assert accepted.execution_history_days == boundary

    def test_subscription_id_required(self) -> None:
        """Constructing the config with no subscription_id must fail."""
        with pytest.raises(ValidationError):
            AzureDataFactoryConfig()  # type: ignore[call-arg]

    def test_factory_pattern_deny_filters_correctly(self) -> None:
        """Deny entries must exclude matching factory names."""
        factory_allowed = AzureDataFactoryConfig(
            subscription_id="test",
            factory_pattern={"allow": [".*"], "deny": [".*-test$", "dev-.*"]},
        ).factory_pattern.allowed

        # One name that passes, then one name per deny entry.
        assert factory_allowed("prod-factory")
        assert not factory_allowed("prod-test")
        assert not factory_allowed("dev-factory")

    def test_pipeline_pattern_filtering(self) -> None:
        """Allow and deny entries must combine when filtering pipelines."""
        pipeline_allowed = AzureDataFactoryConfig(
            subscription_id="test",
            pipeline_pattern={"allow": ["^prod_.*"], "deny": [".*_backup$"]},
        ).pipeline_pattern.allowed

        # Names matching the allow entry and not the deny entry pass.
        assert pipeline_allowed("prod_ingestion")
        assert pipeline_allowed("prod_transform")
        # Outside the allow entry, or caught by the deny entry: rejected.
        assert not pipeline_allowed("dev_ingestion")
        assert not pipeline_allowed("prod_backup")


class TestCredentialConfigInteraction:
    """How a credential config behaves once embedded in the main config."""

    def test_service_principal_credential_embedded_in_config(self) -> None:
        """A service-principal credential must survive embedding intact."""
        service_principal = AzureCredentialConfig(
            authentication_method=AzureAuthenticationMethod.SERVICE_PRINCIPAL,
            client_id="test-client",
            client_secret="test-secret",
            tenant_id="test-tenant",
        )
        config = AzureDataFactoryConfig(
            subscription_id="test-subscription",
            credential=service_principal,
        )

        # The nested credential must expose the values it was built with.
        embedded = config.credential
        assert (
            embedded.authentication_method
            == AzureAuthenticationMethod.SERVICE_PRINCIPAL
        )
        assert embedded.client_id == "test-client"
        assert embedded.tenant_id == "test-tenant"
+ +Following the accelerator guidelines, we test: +- Platform mapping logic (linked service type -> DataHub platform) +- Activity subtype mapping +- Table name extraction from dataset properties +- Run status mapping +- Lineage extraction logic patterns + +We do NOT test: +- Trivial getters/setters +- Third-party library behavior +- Pydantic validation (covered by test_adf_config.py) +""" + +from datahub.api.entities.dataprocess.dataprocess_instance import InstanceRunResult +from datahub.ingestion.source.azure_data_factory.adf_source import ( + ACTIVITY_SUBTYPE_MAP, + LINKED_SERVICE_PLATFORM_MAP, +) + + +class TestLinkedServicePlatformMapping: + """Tests for linked service to DataHub platform mapping. + + This is critical business logic - incorrect mapping would create + lineage to wrong platform URNs. + """ + + def test_azure_sql_variants_map_to_mssql(self) -> None: + """All Azure SQL variants should map to mssql platform.""" + azure_sql_types = ["AzureSqlDatabase", "AzureSqlMI", "SqlServer"] + for sql_type in azure_sql_types: + assert LINKED_SERVICE_PLATFORM_MAP.get(sql_type) == "mssql", ( + f"{sql_type} should map to 'mssql'" + ) + + def test_synapse_variants_map_correctly(self) -> None: + """Azure Synapse variants should map to synapse platform.""" + synapse_types = ["AzureSynapseAnalytics", "AzureSqlDW"] + for synapse_type in synapse_types: + assert LINKED_SERVICE_PLATFORM_MAP.get(synapse_type) == "synapse", ( + f"{synapse_type} should map to 'synapse'" + ) + + def test_databricks_variants_map_correctly(self) -> None: + """Databricks services should all map to databricks platform.""" + databricks_types = ["AzureDatabricks", "AzureDatabricksDeltaLake"] + for db_type in databricks_types: + assert LINKED_SERVICE_PLATFORM_MAP.get(db_type) == "databricks", ( + f"{db_type} should map to 'databricks'" + ) + + def test_azure_storage_types_map_to_distinct_platforms(self) -> None: + """Different Azure storage types should map to distinct platforms.""" + assert 
LINKED_SERVICE_PLATFORM_MAP["AzureBlobStorage"] == "azure_blob_storage" + assert LINKED_SERVICE_PLATFORM_MAP["AzureBlobFS"] == "azure_data_lake" + assert LINKED_SERVICE_PLATFORM_MAP["AzureDataLakeStore"] == "azure_data_lake" + + def test_major_cloud_databases_covered(self) -> None: + """Major cloud databases should be mapped.""" + major_databases = { + "Snowflake": "snowflake", + "GoogleBigQuery": "bigquery", + "AmazonRedshift": "redshift", + } + for service_type, expected_platform in major_databases.items(): + assert LINKED_SERVICE_PLATFORM_MAP.get(service_type) == expected_platform + + def test_common_open_source_databases_covered(self) -> None: + """Common OSS databases should be mapped.""" + oss_databases = { + "PostgreSql": "postgres", + "MySql": "mysql", + "Oracle": "oracle", + "CosmosDbMongoDbApi": "mongodb", # MongoDB via Cosmos DB API + } + for service_type, expected_platform in oss_databases.items(): + assert LINKED_SERVICE_PLATFORM_MAP.get(service_type) == expected_platform + + def test_unknown_service_type_returns_none(self) -> None: + """Unknown service types should return None (not raise).""" + assert LINKED_SERVICE_PLATFORM_MAP.get("UnknownServiceType") is None + assert LINKED_SERVICE_PLATFORM_MAP.get("CustomConnector") is None + + +class TestActivitySubtypeMapping: + """Tests for activity type to subtype mapping. + + Subtypes affect how activities appear in the UI and their grouping. 
+ """ + + def test_copy_activity_subtype(self) -> None: + """Copy activity should have descriptive subtype.""" + assert ACTIVITY_SUBTYPE_MAP["Copy"] == "Copy Activity" + + def test_dataflow_activities_grouped_together(self) -> None: + """Both DataFlow and ExecuteDataFlow should have same subtype.""" + assert ACTIVITY_SUBTYPE_MAP["DataFlow"] == "Data Flow Activity" + assert ACTIVITY_SUBTYPE_MAP["ExecuteDataFlow"] == "Data Flow Activity" + + def test_control_flow_activities_have_descriptive_names(self) -> None: + """Control flow activities should have user-friendly subtypes.""" + control_flow_map = { + "IfCondition": "If Condition", + "ForEach": "ForEach Loop", + "Until": "Until Loop", + "Switch": "Switch Activity", + "Wait": "Wait Activity", + } + for activity_type, expected_subtype in control_flow_map.items(): + assert ACTIVITY_SUBTYPE_MAP.get(activity_type) == expected_subtype + + def test_databricks_activities_identifiable(self) -> None: + """Databricks activities should be clearly identified.""" + databricks_activities = [ + "DatabricksNotebook", + "DatabricksSparkJar", + "DatabricksSparkPython", + ] + for activity in databricks_activities: + subtype = ACTIVITY_SUBTYPE_MAP.get(activity) + assert subtype is not None + assert "Databricks" in subtype + + +class TestTableNameExtractionLogic: + """Tests for the logic patterns used in table name extraction. + + These tests verify the extraction logic that would be used in + _extract_table_name without needing a full source instance. 
+ """ + + def test_extract_simple_table_name(self) -> None: + """Should extract tableName property directly.""" + type_props = {"tableName": "dbo.customers"} + # Logic pattern from _extract_table_name + table_name = type_props.get("tableName") + assert table_name == "dbo.customers" + + def test_combine_schema_and_table(self) -> None: + """Should combine separate schema and table fields.""" + type_props = {"schema": "sales", "table": "orders"} + # Logic pattern from _extract_table_name + schema = type_props.get("schema", "") + table = type_props.get("table", "") + result = f"{schema}.{table}" if schema and table else table or schema + assert result == "sales.orders" + + def test_schema_only_returns_schema(self) -> None: + """Should return schema when table is missing.""" + type_props = {"schema": "dbo"} + schema = type_props.get("schema", "") + table = type_props.get("table", "") + result = f"{schema}.{table}" if schema and table else table or schema + assert result == "dbo" + + def test_table_only_returns_table(self) -> None: + """Should return table when schema is missing.""" + type_props = {"table": "orders"} + schema = type_props.get("schema", "") + table = type_props.get("table", "") + result = f"{schema}.{table}" if schema and table else table or schema + assert result == "orders" + + +class TestFilePathExtractionLogic: + """Tests for file path extraction from dataset properties.""" + + def test_combine_folder_and_filename(self) -> None: + """Should combine folderPath and fileName.""" + type_props = {"folderPath": "raw/data", "fileName": "file.csv"} + folder = type_props.get("folderPath", "") + filename = type_props.get("fileName", "") + result = f"{folder}/{filename}" if folder and filename else filename or folder + assert result == "raw/data/file.csv" + + def test_folder_only_returns_folder(self) -> None: + """Should return folder when filename is missing.""" + type_props = {"folderPath": "raw/data"} + folder = type_props.get("folderPath", "") + filename = 
type_props.get("fileName", "") + result = f"{folder}/{filename}" if folder and filename else filename or folder + assert result == "raw/data" + + def test_nested_location_extraction(self) -> None: + """Should extract path components from nested location object.""" + type_props = { + "location": { + "container": "mycontainer", + "folderPath": "data/raw", + "fileName": "output.parquet", + } + } + location = type_props.get("location", {}) + if isinstance(location, dict): + container = location.get("container", "") + folder = location.get("folderPath", "") + filename = location.get("fileName", "") + parts = [p for p in [container, folder, filename] if p] + result = "/".join(parts) if parts else None + else: + result = None + assert result == "mycontainer/data/raw/output.parquet" + + +class TestRunStatusMapping: + """Tests for mapping ADF run status to DataHub InstanceRunResult.""" + + def test_succeeded_maps_to_success(self) -> None: + """Succeeded status should map to SUCCESS result.""" + status_map = { + "Succeeded": InstanceRunResult.SUCCESS, + "Failed": InstanceRunResult.FAILURE, + "Cancelled": InstanceRunResult.SKIPPED, + } + assert status_map["Succeeded"] == InstanceRunResult.SUCCESS + + def test_failed_maps_to_failure(self) -> None: + """Failed status should map to FAILURE result.""" + status_map = { + "Succeeded": InstanceRunResult.SUCCESS, + "Failed": InstanceRunResult.FAILURE, + "Cancelled": InstanceRunResult.SKIPPED, + } + assert status_map["Failed"] == InstanceRunResult.FAILURE + + def test_cancelled_maps_to_skipped(self) -> None: + """Cancelled status should map to SKIPPED result.""" + status_map = { + "Cancelled": InstanceRunResult.SKIPPED, + } + assert status_map["Cancelled"] == InstanceRunResult.SKIPPED + + def test_in_progress_should_return_none(self) -> None: + """In-progress statuses should not have a final result.""" + incomplete_statuses = ["InProgress", "Queued", "Cancelling"] + status_map = { + "InProgress": None, + "Queued": None, + 
"Cancelling": None, + } + for status in incomplete_statuses: + assert status_map.get(status) is None + + +class TestResourceGroupExtractionLogic: + """Tests for extracting resource group from Azure resource ID.""" + + def test_extract_from_standard_resource_id(self) -> None: + """Should extract resource group from standard Azure resource ID.""" + resource_id = ( + "/subscriptions/12345678-1234-1234-1234-123456789012" + "/resourceGroups/my-resource-group" + "/providers/Microsoft.DataFactory/factories/my-factory" + ) + parts = resource_id.split("/") + rg_index = parts.index("resourceGroups") + resource_group = parts[rg_index + 1] + assert resource_group == "my-resource-group" + + def test_extract_with_complex_resource_group_name(self) -> None: + """Should handle resource groups with hyphens, underscores, and numbers.""" + test_cases = [ + ("prod-data-rg-001", "prod-data-rg-001"), + ("RG_Production_123", "RG_Production_123"), + ("simple", "simple"), + ] + for rg_name, expected in test_cases: + resource_id = ( + f"/subscriptions/00000000-0000-0000-0000-000000000000" + f"/resourceGroups/{rg_name}" + f"/providers/Microsoft.DataFactory/factories/factory1" + ) + parts = resource_id.split("/") + rg_index = parts.index("resourceGroups") + extracted = parts[rg_index + 1] + assert extracted == expected From 5a08dfb26bcbbb4c6c4326ad0789ec55b5ceb210 Mon Sep 17 00:00:00 2001 From: Anush Kumar Date: Mon, 8 Dec 2025 17:21:37 -0800 Subject: [PATCH 02/13] feat(azure-data-factory): enhance Azure Data Factory integration - Added support for Azure Data Factory logos and updated constants for platform identification. - Implemented pipeline-to-pipeline lineage tracking for ExecutePipeline activities, enabling better visibility of dependencies in the DataHub UI. - Updated documentation to reflect new features and improved metadata ingestion capabilities. - Refactored code for better clarity and maintainability, including type definitions for ADF API responses. 
- Adjusted test cases to ensure accuracy with the new changes. --- .../app/ingest/source/builder/constants.ts | 4 + .../app/ingestV2/source/builder/constants.ts | 4 + .../src/images/azuredatafactorylogo.svg | 1 + .../azure_data_factory_pre.md | 27 +- .../ingestion/source/azure/azure_auth.py | 12 +- .../source/azure_data_factory/adf_client.py | 5 +- .../source/azure_data_factory/adf_models.py | 124 +- .../source/azure_data_factory/adf_source.py | 254 ++-- .../azure_data_factory/adf_basic_golden.json | 446 +++--- .../adf_branching_golden.json | 508 +++++++ .../adf_dataflow_golden.json | 308 +++++ .../adf_diverse_golden.json | 1108 +++++++++++++++ .../adf_foreach_golden.json | 408 ++++++ .../adf_multisource_golden.json | 813 +++++++++++ .../azure_data_factory/adf_nested_golden.json | 895 ++++++++++++ .../adf_platform_instance_golden.json | 492 +++---- .../adf_with_runs_golden.json | 180 +-- .../azure_data_factory/complex_mocks.py | 1231 +++++++++++++++++ .../test_complex_pipelines.py | 1208 ++++++++++++++++ .../src/main/resources/bootstrap_mcps.yaml | 2 +- .../bootstrap_mcps/data-platforms.yaml | 10 + 21 files changed, 7367 insertions(+), 673 deletions(-) create mode 100644 datahub-web-react/src/images/azuredatafactorylogo.svg create mode 100644 metadata-ingestion/tests/integration/azure_data_factory/adf_branching_golden.json create mode 100644 metadata-ingestion/tests/integration/azure_data_factory/adf_dataflow_golden.json create mode 100644 metadata-ingestion/tests/integration/azure_data_factory/adf_diverse_golden.json create mode 100644 metadata-ingestion/tests/integration/azure_data_factory/adf_foreach_golden.json create mode 100644 metadata-ingestion/tests/integration/azure_data_factory/adf_multisource_golden.json create mode 100644 metadata-ingestion/tests/integration/azure_data_factory/adf_nested_golden.json create mode 100644 metadata-ingestion/tests/integration/azure_data_factory/complex_mocks.py create mode 100644 
metadata-ingestion/tests/integration/azure_data_factory/test_complex_pipelines.py diff --git a/datahub-web-react/src/app/ingest/source/builder/constants.ts b/datahub-web-react/src/app/ingest/source/builder/constants.ts index ec2d6b119d14e5..c9997484d57ca1 100644 --- a/datahub-web-react/src/app/ingest/source/builder/constants.ts +++ b/datahub-web-react/src/app/ingest/source/builder/constants.ts @@ -1,4 +1,5 @@ import athenaLogo from '@images/awsathenalogo.png'; +import azureDataFactoryLogo from '@images/azuredatafactorylogo.svg'; import azureLogo from '@images/azure-ad.png'; import bigqueryLogo from '@images/bigquerylogo.png'; import cassandraLogo from '@images/cassandralogo.png'; @@ -50,6 +51,8 @@ export const ATHENA = 'athena'; export const ATHENA_URN = `urn:li:dataPlatform:${ATHENA}`; export const AZURE = 'azure-ad'; export const AZURE_URN = `urn:li:dataPlatform:${AZURE}`; +export const AZURE_DATA_FACTORY = 'azure-data-factory'; +export const AZURE_DATA_FACTORY_URN = `urn:li:dataPlatform:${AZURE_DATA_FACTORY}`; export const BIGQUERY = 'bigquery'; export const BIGQUERY_USAGE = 'bigquery-usage'; export const BIGQUERY_BETA = 'bigquery-beta'; @@ -162,6 +165,7 @@ export const STREAMLIT_URN = `urn:li:dataPlatform:${STREAMLIT}`; export const PLATFORM_URN_TO_LOGO = { [ATHENA_URN]: athenaLogo, [AZURE_URN]: azureLogo, + [AZURE_DATA_FACTORY_URN]: azureDataFactoryLogo, [BIGQUERY_URN]: bigqueryLogo, [CLICKHOUSE_URN]: clickhouseLogo, [COCKROACHDB_URN]: cockroachdbLogo, diff --git a/datahub-web-react/src/app/ingestV2/source/builder/constants.ts b/datahub-web-react/src/app/ingestV2/source/builder/constants.ts index be3f8100650414..695d6a12b5bcd1 100644 --- a/datahub-web-react/src/app/ingestV2/source/builder/constants.ts +++ b/datahub-web-react/src/app/ingestV2/source/builder/constants.ts @@ -1,4 +1,5 @@ import athenaLogo from '@images/awsathenalogo.png'; +import azureDataFactoryLogo from '@images/azuredatafactorylogo.svg'; import azureLogo from '@images/azure-ad.png'; import 
bigqueryLogo from '@images/bigquerylogo.png'; import cassandraLogo from '@images/cassandralogo.png'; @@ -48,6 +49,8 @@ export const ATHENA = 'athena'; export const ATHENA_URN = `urn:li:dataPlatform:${ATHENA}`; export const AZURE = 'azure-ad'; export const AZURE_URN = `urn:li:dataPlatform:${AZURE}`; +export const AZURE_DATA_FACTORY = 'azure-data-factory'; +export const AZURE_DATA_FACTORY_URN = `urn:li:dataPlatform:${AZURE_DATA_FACTORY}`; export const BIGQUERY = 'bigquery'; export const BIGQUERY_BETA = 'bigquery-beta'; export const BIGQUERY_URN = `urn:li:dataPlatform:${BIGQUERY}`; @@ -155,6 +158,7 @@ export const SNAPLOGIC_URN = `urn:li:dataPlatform:${SNAPLOGIC}`; export const PLATFORM_URN_TO_LOGO = { [ATHENA_URN]: athenaLogo, [AZURE_URN]: azureLogo, + [AZURE_DATA_FACTORY_URN]: azureDataFactoryLogo, [BIGQUERY_URN]: bigqueryLogo, [CLICKHOUSE_URN]: clickhouseLogo, [COCKROACHDB_URN]: cockroachdbLogo, diff --git a/datahub-web-react/src/images/azuredatafactorylogo.svg b/datahub-web-react/src/images/azuredatafactorylogo.svg new file mode 100644 index 00000000000000..22373367da353e --- /dev/null +++ b/datahub-web-react/src/images/azuredatafactorylogo.svg @@ -0,0 +1 @@ +Icon-databases-126 \ No newline at end of file diff --git a/metadata-ingestion/docs/sources/azure_data_factory/azure_data_factory_pre.md b/metadata-ingestion/docs/sources/azure_data_factory/azure_data_factory_pre.md index c8e2d8062034a4..8b8fc995e9be9a 100644 --- a/metadata-ingestion/docs/sources/azure_data_factory/azure_data_factory_pre.md +++ b/metadata-ingestion/docs/sources/azure_data_factory/azure_data_factory_pre.md @@ -57,6 +57,7 @@ To set up a service principal: | Platform Instance | ✅ | Enabled by default | | Containers | ✅ | Data Factories as containers | | Lineage (Table-level) | ✅ | From activity inputs/outputs and Data Flows | +| Pipeline-to-Pipeline | ✅ | ExecutePipeline activities create lineage | | Data Flow Scripts | ✅ | Stored as transformation logic | | Execution History | ✅ | Optional, via 
`include_execution_history` | | Stateful Ingestion | ✅ | Stale entity removal | @@ -68,6 +69,22 @@ The connector extracts lineage from: 1. **Copy Activities**: Maps input/output datasets to DataHub datasets 2. **Data Flow Activities**: Extracts sources and sinks from Data Flow definitions 3. **Lookup Activities**: Maps lookup datasets as inputs +4. **ExecutePipeline Activities**: Creates pipeline-to-pipeline lineage to child pipelines + +### Pipeline-to-Pipeline Lineage + +When a pipeline calls another pipeline via an `ExecutePipeline` activity, the connector creates a lineage edge from the calling activity to the first activity in the child pipeline. This enables: + +- Tracing orchestration hierarchies across nested pipelines +- Impact analysis when modifying child pipelines +- Understanding dependencies between modular pipelines + +The ExecutePipeline activity's DataJob entity will include: + +- Custom property `calls_pipeline`: Name of the child pipeline +- Custom property `child_pipeline_urn`: URN of the child DataFlow +- Custom property `child_first_activity`: Name of the first activity in the child pipeline +- Lineage edge to the first DataJob in the child pipeline ### Supported Linked Service Mappings @@ -194,21 +211,21 @@ The connector automatically includes the factory name in pipeline URNs (e.g., `m Pipeline URNs include the factory name for uniqueness across multiple factories: ``` -urn:li:dataFlow:(azure_data_factory,{factory_name}.{pipeline_name},{env}) +urn:li:dataFlow:(azure-data-factory,{factory_name}.{pipeline_name},{env}) ``` -Example: `urn:li:dataFlow:(azure_data_factory,my-factory.ETL-Pipeline,PROD)` +Example: `urn:li:dataFlow:(azure-data-factory,my-factory.ETL-Pipeline,PROD)` Activity URNs reference their parent pipeline: ``` -urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,{factory_name}.{pipeline_name},{env}),{activity_name}) +urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,{factory_name}.{pipeline_name},{env}),{activity_name}) ``` 
With `platform_instance` set, it's prepended to the URN: ``` -urn:li:dataFlow:(azure_data_factory,{platform_instance}.{factory_name}.{pipeline_name},{env}) +urn:li:dataFlow:(azure-data-factory,{platform_instance}.{factory_name}.{pipeline_name},{env}) ``` -Example: `urn:li:dataFlow:(azure_data_factory,production.my-factory.ETL-Pipeline,PROD)` +Example: `urn:li:dataFlow:(azure-data-factory,production.my-factory.ETL-Pipeline,PROD)` diff --git a/metadata-ingestion/src/datahub/ingestion/source/azure/azure_auth.py b/metadata-ingestion/src/datahub/ingestion/source/azure/azure_auth.py index 12007be8d3882b..9c2427bddaf349 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/azure/azure_auth.py +++ b/metadata-ingestion/src/datahub/ingestion/source/azure/azure_auth.py @@ -137,13 +137,19 @@ def get_credential(self) -> TokenCredential: ValueError: If required credentials are missing for the chosen method. """ if self.authentication_method == AzureAuthenticationMethod.SERVICE_PRINCIPAL: + # Validate all required fields (also validated in validate_credentials()) if not self.client_secret: raise ValueError( "client_secret is required for service_principal authentication" ) - # These are validated as required in validate_credentials() - assert self.tenant_id is not None - assert self.client_id is not None + if not self.tenant_id: + raise ValueError( + "tenant_id is required for service_principal authentication" + ) + if not self.client_id: + raise ValueError( + "client_id is required for service_principal authentication" + ) return ClientSecretCredential( tenant_id=self.tenant_id, client_id=self.client_id, diff --git a/metadata-ingestion/src/datahub/ingestion/source/azure_data_factory/adf_client.py b/metadata-ingestion/src/datahub/ingestion/source/azure_data_factory/adf_client.py index 15802a2c62a9dd..d5e06630930643 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/azure_data_factory/adf_client.py +++ 
b/metadata-ingestion/src/datahub/ingestion/source/azure_data_factory/adf_client.py @@ -32,6 +32,9 @@ logger = logging.getLogger(__name__) +# Maximum retention period for activity run queries (Azure limit) +MAX_ACTIVITY_RUN_RETENTION_DAYS = 90 + class AzureDataFactoryClient: """Client for Azure Data Factory REST API. @@ -391,7 +394,7 @@ def get_activity_runs( """ try: end_time = datetime.now(timezone.utc) - start_time = end_time - timedelta(days=90) # Max retention + start_time = end_time - timedelta(days=MAX_ACTIVITY_RUN_RETENTION_DAYS) # POST /subscriptions/{sub}/resourceGroups/{rg}/providers/Microsoft.DataFactory/factories/{factoryName}/pipelineruns/{runId}/queryActivityruns # Docs: https://learn.microsoft.com/en-us/rest/api/datafactory/activity-runs/query-by-pipeline-run diff --git a/metadata-ingestion/src/datahub/ingestion/source/azure_data_factory/adf_models.py b/metadata-ingestion/src/datahub/ingestion/source/azure_data_factory/adf_models.py index 47af993ea2e69f..b92f0375595013 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/azure_data_factory/adf_models.py +++ b/metadata-ingestion/src/datahub/ingestion/source/azure_data_factory/adf_models.py @@ -7,9 +7,63 @@ """ from datetime import datetime -from typing import Any, Optional +from typing import Any, Optional, Union from pydantic import BaseModel, ConfigDict, Field, model_validator +from typing_extensions import TypedDict + +# Type aliases for common JSON value types in ADF API responses +# Azure API parameters and variables can contain primitive types +JsonPrimitive = Union[str, int, float, bool, None] + + +# TypedDict for well-known structures in ADF API responses +class FolderInfo(TypedDict, total=False): + """Folder organization structure used by pipelines, datasets, etc.""" + + name: str + + +class InvokedByInfo(TypedDict, total=False): + """Information about what triggered a pipeline run.""" + + name: str + id: str + invokedByType: str + + +class UserProperty(TypedDict, total=False): + 
"""User-defined property on an activity.""" + + name: str + value: str + + +class IntegrationRuntimeReference(TypedDict, total=False): + """Reference to an integration runtime.""" + + referenceName: str + type: str + + +class ActivityPolicy(TypedDict, total=False): + """Execution policy for an activity.""" + + timeout: str + retry: int + retryIntervalInSeconds: int + secureInput: bool + secureOutput: bool + + +class SchemaColumn(TypedDict, total=False): + """Column definition in a dataset schema.""" + + name: str + type: str + physicalType: str + precision: int + scale: int class AdfResource(BaseModel): @@ -70,7 +124,7 @@ class DatasetReference(BaseModel): reference_name: str = Field(alias="referenceName", description="Dataset name") type: str = Field(default="DatasetReference", description="Reference type") - parameters: dict[str, Any] = Field( + parameters: dict[str, JsonPrimitive] = Field( default_factory=dict, description="Dataset parameters" ) @@ -84,7 +138,7 @@ class LinkedServiceReference(BaseModel): alias="referenceName", description="Linked service name" ) type: str = Field(default="LinkedServiceReference", description="Reference type") - parameters: dict[str, Any] = Field( + parameters: dict[str, JsonPrimitive] = Field( default_factory=dict, description="Linked service parameters" ) @@ -94,7 +148,7 @@ class ActivityInput(BaseModel): model_config = ConfigDict(populate_by_name=True, extra="allow") - # For Copy activities + # For Copy activities - source config varies by source type (SQL, Blob, etc.) 
source: Optional[dict[str, Any]] = Field( default=None, description="Source configuration" ) @@ -110,7 +164,7 @@ class ActivityOutput(BaseModel): model_config = ConfigDict(populate_by_name=True, extra="allow") - # For Copy activities + # For Copy activities - sink config varies by sink type sink: Optional[dict[str, Any]] = Field( default=None, description="Sink configuration" ) @@ -140,7 +194,9 @@ class Activity(BaseModel): default_factory=list, alias="dependsOn", description="Activity dependencies" ) - # Type-specific properties stored here + # Type-specific properties vary by activity type (Copy, DataFlow, ExecutePipeline, etc.) + # Contains nested structures like {"pipeline": {"referenceName": "...", "type": "..."}} + # Uses Any due to deeply nested and varying structures from Azure API type_properties: Optional[dict[str, Any]] = Field( default=None, alias="typeProperties", description="Type-specific properties" ) @@ -161,12 +217,12 @@ class Activity(BaseModel): ) # Policy - policy: Optional[dict[str, Any]] = Field( + policy: Optional[ActivityPolicy] = Field( default=None, description="Activity execution policy" ) # User properties - user_properties: list[dict[str, Any]] = Field( + user_properties: list[UserProperty] = Field( default_factory=list, alias="userProperties", description="User-defined properties", @@ -182,9 +238,11 @@ class PipelineProperties(BaseModel): activities: list[Activity] = Field( default_factory=list, description="Pipeline activities" ) + # Parameters have complex structure: {"name": {"type": "String", "defaultValue": ...}} parameters: dict[str, Any] = Field( default_factory=dict, description="Pipeline parameters" ) + # Variables have complex structure similar to parameters variables: dict[str, Any] = Field( default_factory=dict, description="Pipeline variables" ) @@ -192,7 +250,7 @@ class PipelineProperties(BaseModel): annotations: list[str] = Field( default_factory=list, description="Pipeline annotations" ) - folder: Optional[dict[str, 
str]] = Field( + folder: Optional[FolderInfo] = Field( default=None, description="Folder path for organization" ) @@ -216,9 +274,11 @@ class Pipeline(AdfResource): activities: list[Activity] = Field( default_factory=list, description="Pipeline activities" ) + # Parameters have complex structure: {"name": {"type": "String", "defaultValue": ...}} parameters: dict[str, Any] = Field( default_factory=dict, description="Pipeline parameters" ) + # Variables have complex structure similar to parameters variables: dict[str, Any] = Field( default_factory=dict, description="Pipeline variables" ) @@ -226,7 +286,7 @@ class Pipeline(AdfResource): annotations: list[str] = Field( default_factory=list, description="Pipeline annotations" ) - folder: Optional[dict[str, str]] = Field( + folder: Optional[FolderInfo] = Field( default=None, description="Folder path for organization" ) @@ -256,31 +316,34 @@ class DatasetProperties(BaseModel): linked_service_name: LinkedServiceReference = Field( alias="linkedServiceName", description="Associated linked service" ) + # Parameters can have complex structure: {"name": {"type": "String"}} parameters: dict[str, Any] = Field( default_factory=dict, description="Dataset parameters" ) annotations: list[str] = Field( default_factory=list, description="Dataset annotations" ) - folder: Optional[dict[str, str]] = Field( + folder: Optional[FolderInfo] = Field( default=None, description="Folder path for organization" ) type: str = Field( description="Dataset type (e.g., AzureBlobDataset, DelimitedTextDataset)" ) - # Type-specific properties + # Type-specific properties vary by dataset type (AzureBlobDataset, SqlTable, etc.) + # Contains nested structures for connection details, file paths, etc. 
+ # Uses Any due to deeply nested and varying structures from Azure API type_properties: Optional[dict[str, Any]] = Field( default=None, alias="typeProperties", description="Type-specific properties" ) # Schema (optional) - named schema_definition to avoid conflict with Pydantic's schema method - schema_definition: Optional[list[dict[str, Any]]] = Field( + schema_definition: Optional[list[SchemaColumn]] = Field( default=None, alias="schema", description="Dataset schema definition" ) # Structure (legacy schema format) - structure: Optional[list[dict[str, Any]]] = Field( + structure: Optional[list[SchemaColumn]] = Field( default=None, description="Dataset structure (legacy)" ) @@ -305,13 +368,15 @@ class LinkedServiceProperties(BaseModel): type: str = Field( description="Linked service type (e.g., AzureBlobStorage, AzureSqlDatabase)" ) + # Type-specific properties vary by linked service type (SQL, Blob, etc.) + # Uses Any due to deeply nested and varying structures from Azure API type_properties: Optional[dict[str, Any]] = Field( default=None, alias="typeProperties", description="Type-specific properties" ) annotations: list[str] = Field( default_factory=list, description="Linked service annotations" ) - connect_via: Optional[dict[str, Any]] = Field( + connect_via: Optional[IntegrationRuntimeReference] = Field( default=None, alias="connectVia", description="Integration runtime reference" ) @@ -359,6 +424,13 @@ class DataFlowSink(BaseModel): ) +class DataFlowTransformation(TypedDict, total=False): + """Transformation step in a data flow.""" + + name: str + description: str + + class DataFlowProperties(BaseModel): """Properties of a mapping data flow.""" @@ -368,13 +440,15 @@ class DataFlowProperties(BaseModel): default=None, description="Data flow description" ) type: str = Field(default="MappingDataFlow", description="Data flow type") + # Type-specific properties contain sources, sinks, transformations, scripts + # Uses Any due to deeply nested and varying structures 
from Azure API type_properties: Optional[dict[str, Any]] = Field( default=None, alias="typeProperties", description="Type-specific properties" ) annotations: list[str] = Field( default_factory=list, description="Data flow annotations" ) - folder: Optional[dict[str, str]] = Field( + folder: Optional[FolderInfo] = Field( default=None, description="Folder path for organization" ) @@ -387,7 +461,7 @@ class DataFlowProperties(BaseModel): ) # Transformations and script - transformations: list[dict[str, Any]] = Field( + transformations: list[DataFlowTransformation] = Field( default_factory=list, description="Data flow transformations" ) script_lines: list[str] = Field( @@ -412,6 +486,13 @@ class DataFlow(AdfResource): properties: DataFlowProperties = Field(description="Data flow properties") +class TriggerPipelineReference(TypedDict, total=False): + """Reference to a pipeline from a trigger.""" + + pipelineReference: dict[str, str] + parameters: dict[str, str] + + class TriggerProperties(BaseModel): """Properties of a trigger.""" @@ -426,13 +507,15 @@ class TriggerProperties(BaseModel): alias="runtimeState", description="Trigger state (Started, Stopped)", ) + # Type-specific properties vary by trigger type (Schedule, BlobEvents, etc.) 
+ # Uses Any due to deeply nested and varying structures from Azure API type_properties: Optional[dict[str, Any]] = Field( default=None, alias="typeProperties", description="Type-specific properties" ) annotations: list[str] = Field( default_factory=list, description="Trigger annotations" ) - pipelines: list[dict[str, Any]] = Field( + pipelines: list[TriggerPipelineReference] = Field( default_factory=list, description="Pipelines triggered" ) @@ -470,7 +553,7 @@ class PipelineRun(BaseModel): parameters: dict[str, str] = Field( default_factory=dict, description="Run parameters" ) - invoked_by: Optional[dict[str, str]] = Field( + invoked_by: Optional[InvokedByInfo] = Field( default=None, alias="invokedBy", description="Trigger or user that invoked the run", @@ -513,6 +596,8 @@ class ActivityRun(BaseModel): duration_in_ms: Optional[int] = Field( default=None, alias="durationInMs", description="Duration in milliseconds" ) + # Input/output/error contain runtime data that varies by activity type + # These can contain deeply nested structures from Azure API input: Optional[dict[str, Any]] = Field(default=None, description="Activity input") output: Optional[dict[str, Any]] = Field( default=None, description="Activity output" @@ -530,6 +615,7 @@ class ListResponse(BaseModel): model_config = ConfigDict(populate_by_name=True, extra="allow") + # Resources contain nested structures that vary by type value: list[dict[str, Any]] = Field(description="List of resources") next_link: Optional[str] = Field( default=None, alias="nextLink", description="URL for next page of results" diff --git a/metadata-ingestion/src/datahub/ingestion/source/azure_data_factory/adf_source.py b/metadata-ingestion/src/datahub/ingestion/source/azure_data_factory/adf_source.py index 20dcdafd2c6c48..67f6f77d9c7c9b 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/azure_data_factory/adf_source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/azure_data_factory/adf_source.py @@ -20,7 +20,7 @@ 
""" import logging -from typing import Dict, Iterable, List, Optional, Tuple +from typing import Iterable, Optional from datahub.api.entities.dataprocess.dataprocess_instance import ( DataProcessInstance, @@ -65,13 +65,15 @@ StatefulIngestionSourceBase, ) from datahub.metadata.schema_classes import ( + DataJobInputOutputClass, DataProcessTypeClass, DataTransformClass, DataTransformLogicClass, QueryLanguageClass, QueryStatementClass, ) -from datahub.metadata.urns import DataFlowUrn, DatasetUrn +from datahub.metadata.urns import DataFlowUrn, DataJobUrn, DatasetUrn +from datahub.sdk._shared import DatasetUrnOrStr from datahub.sdk.container import Container from datahub.sdk.dataflow import DataFlow from datahub.sdk.datajob import DataJob @@ -79,10 +81,15 @@ logger = logging.getLogger(__name__) # Platform identifier for Azure Data Factory -PLATFORM = "azure_data_factory" +PLATFORM = "azure-data-factory" + +# Constants for pipeline run processing +MAX_RUN_MESSAGE_LENGTH = 500 # Truncate long error/status messages +MAX_RUN_PARAMETERS = 10 # Limit number of parameters to store +MAX_PARAMETER_VALUE_LENGTH = 100 # Truncate long parameter values # Mapping of ADF linked service types to DataHub platforms -LINKED_SERVICE_PLATFORM_MAP: Dict[str, str] = { +LINKED_SERVICE_PLATFORM_MAP: dict[str, str] = { # Azure Storage "AzureBlobStorage": "azure_blob_storage", "AzureBlobFS": "azure_data_lake", @@ -142,7 +149,7 @@ } # Mapping of ADF activity types to DataHub subtypes -ACTIVITY_SUBTYPE_MAP: Dict[str, str] = { +ACTIVITY_SUBTYPE_MAP: dict[str, str] = { "Copy": "Copy Activity", "DataFlow": "Data Flow Activity", "ExecutePipeline": "Execute Pipeline", @@ -227,20 +234,21 @@ def __init__(self, config: AzureDataFactoryConfig, ctx: PipelineContext) -> None subscription_id=config.subscription_id, ) - # Cache for datasets, linked services, data flows, and triggers (per factory) - self._datasets_cache: Dict[str, Dict[str, AdfDataset]] = {} - self._linked_services_cache: Dict[str, Dict[str, 
LinkedService]] = {} - self._data_flows_cache: Dict[str, Dict[str, AdfDataFlow]] = {} - self._triggers_cache: Dict[str, List[Trigger]] = {} + # Cache for datasets, linked services, data flows, pipelines, and triggers (per factory) + self._datasets_cache: dict[str, dict[str, AdfDataset]] = {} + self._linked_services_cache: dict[str, dict[str, LinkedService]] = {} + self._data_flows_cache: dict[str, dict[str, AdfDataFlow]] = {} + self._pipelines_cache: dict[str, dict[str, Pipeline]] = {} + self._triggers_cache: dict[str, list[Trigger]] = {} @classmethod def create( - cls, config_dict: Dict, ctx: PipelineContext + cls, config_dict: dict, ctx: PipelineContext ) -> "AzureDataFactorySource": config = AzureDataFactoryConfig.model_validate(config_dict) return cls(config, ctx) - def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: + def get_workunit_processors(self) -> list[Optional[MetadataWorkUnitProcessor]]: return [ *super().get_workunit_processors(), StaleEntityRemovalHandler.create( @@ -332,7 +340,7 @@ def _cache_factory_resources(self, resource_group: str, factory_name: str) -> No def _emit_factory( self, factory: Factory, resource_group: str - ) -> Tuple[Container, Iterable[MetadataWorkUnit]]: + ) -> tuple[Container, Iterable[MetadataWorkUnit]]: """Emit a Data Factory as a Container. Returns: @@ -348,7 +356,7 @@ def _emit_factory( ) # Build custom properties - custom_props: Dict[str, str] = { + custom_props: dict[str, str] = { "azure_resource_id": factory.id, "location": factory.location, } @@ -382,7 +390,13 @@ def _get_factory_url(self, factory: Factory, resource_group: str) -> str: def _process_pipelines( self, factory: Factory, resource_group: str, container: Container ) -> Iterable[MetadataWorkUnit]: - """Process all pipelines in a factory. + """Process all pipelines in a factory using two-pass approach. + + First pass: Fetch and cache all pipelines for the factory. 
+ Second pass: Process pipelines and emit entities with proper lineage. + + This two-pass approach enables ExecutePipeline activities to reference + child pipelines that may not have been processed yet. Args: factory: The Data Factory @@ -391,16 +405,21 @@ def _process_pipelines( """ factory_key = f"{resource_group}/{factory.name}" + # First pass: Cache all pipelines for this factory + self._pipelines_cache[factory_key] = {} for pipeline in self.client.get_pipelines(resource_group, factory.name): self.report.report_api_call() + self._pipelines_cache[factory_key][pipeline.name] = pipeline + # Second pass: Process pipelines and emit entities + for pipeline_name, pipeline in self._pipelines_cache[factory_key].items(): # Check if pipeline matches pattern - if not self.config.pipeline_pattern.allowed(pipeline.name): - self.report.report_pipeline_filtered(pipeline.name) + if not self.config.pipeline_pattern.allowed(pipeline_name): + self.report.report_pipeline_filtered(pipeline_name) continue self.report.report_pipeline_scanned() - logger.debug(f"Processing pipeline: {factory.name}/{pipeline.name}") + logger.debug(f"Processing pipeline: {factory.name}/{pipeline_name}") # Emit pipeline as DataFlow, passing the Container for proper browse paths dataflow = self._create_dataflow( @@ -411,7 +430,7 @@ def _process_pipelines( # Emit activities as DataJobs if pipeline.properties is None: logger.warning( - f"Pipeline {pipeline.name} has no properties, skipping activities" + f"Pipeline {pipeline_name} has no properties, skipping activities" ) continue for activity in pipeline.properties.activities: @@ -428,6 +447,12 @@ def _process_pipelines( activity, datajob, factory_key ) + # Emit pipeline-to-pipeline lineage for ExecutePipeline activities + if activity.type == "ExecutePipeline": + yield from self._emit_pipeline_lineage( + activity, datajob, factory, factory_key + ) + def _create_dataflow( self, pipeline: Pipeline, @@ -447,7 +472,7 @@ def _create_dataflow( flow_name = 
f"{factory.name}.{pipeline.name}" # Custom properties - custom_props: Dict[str, str] = { + custom_props: dict[str, str] = { "azure_resource_id": pipeline.id, "factory_name": factory.name, } @@ -491,7 +516,7 @@ def _create_dataflow( def _get_pipeline_triggers( self, resource_group: str, factory_name: str, pipeline_name: str - ) -> List[str]: + ) -> list[str]: """Get trigger names associated with a pipeline.""" if not self.config.include_triggers: return [] @@ -537,7 +562,7 @@ def _create_datajob( subtype = ACTIVITY_SUBTYPE_MAP.get(activity.type, activity.type) # Custom properties - custom_props: Dict[str, str] = { + custom_props: dict[str, str] = { "activity_type": activity.type, } if activity.description: @@ -551,8 +576,8 @@ def _create_datajob( custom_props["retry"] = str(activity.policy["retry"]) # Extract lineage (inlets/outlets) - inlets: Optional[List[str]] = None - outlets: Optional[List[str]] = None + inlets: Optional[list[DatasetUrnOrStr]] = None + outlets: Optional[list[DatasetUrnOrStr]] = None if self.config.include_lineage: extracted_inlets = self._extract_activity_inputs(activity, factory_key) @@ -572,17 +597,17 @@ def _create_datajob( external_url=self._get_pipeline_url(factory, resource_group, pipeline.name), custom_properties=custom_props, subtype=subtype, - inlets=inlets, # type: ignore[arg-type] - outlets=outlets, # type: ignore[arg-type] + inlets=inlets, + outlets=outlets, ) return datajob def _extract_activity_inputs( self, activity: Activity, factory_key: str - ) -> List[str]: + ) -> list[DatasetUrnOrStr]: """Extract input dataset URNs from an activity.""" - inputs: List[str] = [] + inputs: list[DatasetUrnOrStr] = [] # Process explicit inputs (for Copy activities and others) for input_ref in activity.inputs: @@ -614,9 +639,9 @@ def _extract_activity_inputs( def _extract_activity_outputs( self, activity: Activity, factory_key: str - ) -> List[str]: + ) -> list[DatasetUrnOrStr]: """Extract output dataset URNs from an activity.""" - outputs: 
List[str] = [] + outputs: list[DatasetUrnOrStr] = [] # Process explicit outputs (for Copy activities and others) for output_ref in activity.outputs: @@ -744,22 +769,24 @@ def _emit_data_flow_script( ), ).as_workunit() - def _extract_data_flow_sources( - self, activity: Activity, factory_key: str - ) -> List[str]: - """Extract source dataset URNs from a Data Flow activity. + def _extract_data_flow_endpoints( + self, activity: Activity, factory_key: str, endpoint_type: str + ) -> list[str]: + """Extract source or sink dataset URNs from a Data Flow activity. Data Flow activities reference a Data Flow definition which contains - sources (inputs) and sinks (outputs). This method extracts the sources. + sources (inputs) and sinks (outputs). This method extracts either based + on the endpoint_type parameter. Args: activity: The ExecuteDataFlow activity factory_key: Factory key for cache lookup + endpoint_type: "sources" or "sinks" Returns: - List of source dataset URNs + List of dataset URNs for the specified endpoint type """ - inputs: List[str] = [] + urns: list[str] = [] # Get the Data Flow name using our robust lookup data_flow_name = self._get_data_flow_name_from_activity(activity, factory_key) @@ -768,7 +795,7 @@ def _extract_data_flow_sources( logger.debug( f"Could not find Data Flow reference for activity: {activity.name}" ) - return inputs + return urns # Look up the Data Flow definition data_flows = self._data_flows_cache.get(factory_key, {}) @@ -776,73 +803,130 @@ def _extract_data_flow_sources( if not data_flow: logger.debug(f"Data Flow not found in cache: {data_flow_name}") - return inputs + return urns - # Extract sources from the Data Flow + # Extract endpoints from the Data Flow if data_flow.properties: - for source in data_flow.properties.sources: - if source.dataset: + endpoints = getattr(data_flow.properties, endpoint_type, []) + endpoint_label = endpoint_type[:-1] # "sources" -> "source" + for endpoint in endpoints: + if endpoint.dataset: dataset_urn = 
self._resolve_dataset_urn( - source.dataset.reference_name, factory_key + endpoint.dataset.reference_name, factory_key ) if dataset_urn: - inputs.append(str(dataset_urn)) + urns.append(str(dataset_urn)) self.report.report_lineage_extracted() logger.debug( - f"Extracted Data Flow source: {source.name} -> {dataset_urn}" + f"Extracted Data Flow {endpoint_label}: {endpoint.name} -> {dataset_urn}" ) - return inputs + return urns + + def _extract_data_flow_sources( + self, activity: Activity, factory_key: str + ) -> list[str]: + """Extract source dataset URNs from a Data Flow activity.""" + return self._extract_data_flow_endpoints(activity, factory_key, "sources") def _extract_data_flow_sinks( self, activity: Activity, factory_key: str - ) -> List[str]: - """Extract sink dataset URNs from a Data Flow activity. + ) -> list[str]: + """Extract sink dataset URNs from a Data Flow activity.""" + return self._extract_data_flow_endpoints(activity, factory_key, "sinks") - Data Flow activities reference a Data Flow definition which contains - sources (inputs) and sinks (outputs). This method extracts the sinks. + def _emit_pipeline_lineage( + self, + activity: Activity, + datajob: DataJob, + factory: Factory, + factory_key: str, + ) -> Iterable[MetadataWorkUnit]: + """Emit pipeline-to-pipeline lineage for ExecutePipeline activities. + + When a pipeline calls another pipeline via ExecutePipeline activity, + we create a DataJob-to-DataJob dependency from the calling activity + to the first activity in the child pipeline. This creates visible + lineage edges in the DataHub UI. 
Args: - activity: The ExecuteDataFlow activity - factory_key: Factory key for cache lookup + activity: The ExecutePipeline activity + datajob: The DataJob entity for this activity + factory: The parent Data Factory + factory_key: Factory key for URN construction - Returns: - List of sink dataset URNs + Yields: + MetadataWorkUnit for the pipeline dependency """ - outputs: List[str] = [] + if not activity.type_properties: + return - # Get the Data Flow name using our robust lookup - data_flow_name = self._get_data_flow_name_from_activity(activity, factory_key) + # Extract the child pipeline reference from typeProperties + pipeline_ref = activity.type_properties.get("pipeline", {}) + child_pipeline_name = pipeline_ref.get("referenceName") - if not data_flow_name: + if not child_pipeline_name: logger.debug( - f"Could not find Data Flow reference for activity: {activity.name}" + f"ExecutePipeline activity {activity.name} has no pipeline reference" ) - return outputs + return - # Look up the Data Flow definition - data_flows = self._data_flows_cache.get(factory_key, {}) - data_flow = data_flows.get(data_flow_name) + # Build the child pipeline's DataFlow URN + child_flow_id = f"{factory.name}.{child_pipeline_name}" + child_flow_urn = DataFlowUrn.create_from_ids( + orchestrator=PLATFORM, + flow_id=child_flow_id, + env=self.config.env, + ) - if not data_flow: - logger.debug(f"Data Flow not found in cache: {data_flow_name}") - return outputs + # Look up child pipeline from cache to get its first activity + pipelines = self._pipelines_cache.get(factory_key, {}) + child_pipeline = pipelines.get(child_pipeline_name) - # Extract sinks from the Data Flow - if data_flow.properties: - for sink in data_flow.properties.sinks: - if sink.dataset: - dataset_urn = self._resolve_dataset_urn( - sink.dataset.reference_name, factory_key - ) - if dataset_urn: - outputs.append(str(dataset_urn)) - self.report.report_lineage_extracted() - logger.debug( - f"Extracted Data Flow sink: {sink.name} 
-> {dataset_urn}" - ) + child_datajob_urn: Optional[DataJobUrn] = None + first_activity_name: Optional[str] = None - return outputs + if child_pipeline and child_pipeline.properties: + activities = child_pipeline.properties.activities + if activities: + first_activity_name = activities[0].name + child_datajob_urn = DataJobUrn.create_from_ids( + data_flow_urn=str(child_flow_urn), + job_id=first_activity_name, + ) + logger.debug( + f"ExecutePipeline {activity.name} -> {child_pipeline_name}." + f"{first_activity_name} (URN: {child_datajob_urn})" + ) + else: + logger.debug( + f"Child pipeline {child_pipeline_name} not found in cache or has no activities" + ) + + # Update custom properties to include the child pipeline reference + current_props = datajob.custom_properties + current_props["calls_pipeline"] = child_pipeline_name + current_props["child_pipeline_urn"] = str(child_flow_urn) + if first_activity_name: + current_props["child_first_activity"] = first_activity_name + datajob.set_custom_properties(current_props) + + self.report.report_lineage_extracted() + + # Emit DataJobInputOutput with the child DataJob as an input dependency + # This creates a visible lineage edge in the DataHub UI + input_datajobs: list[str] = [] + if child_datajob_urn: + input_datajobs.append(str(child_datajob_urn)) + + yield MetadataChangeProposalWrapper( + entityUrn=str(datajob.urn), + aspect=DataJobInputOutputClass( + inputDatasets=[], + outputDatasets=[], + inputDatajobs=input_datajobs, + ), + ).as_workunit() def _resolve_dataset_urn( self, dataset_name: str, factory_key: str @@ -980,12 +1064,12 @@ def _emit_pipeline_run( result = self._map_run_status(pipeline_run.status) # Build custom properties - properties: Dict[str, str] = { + properties: dict[str, str] = { "run_id": pipeline_run.run_id, "status": pipeline_run.status, } if pipeline_run.message: - properties["message"] = pipeline_run.message[:500] # Truncate long messages + properties["message"] = 
pipeline_run.message[:MAX_RUN_MESSAGE_LENGTH] if pipeline_run.invoked_by: invoker_name = pipeline_run.invoked_by.get("name", "") invoker_type = pipeline_run.invoked_by.get("invokedByType", "") @@ -995,9 +1079,9 @@ def _emit_pipeline_run( properties["invoked_by_type"] = invoker_type if pipeline_run.parameters: for key, value in list(pipeline_run.parameters.items())[ - :10 - ]: # Limit params - properties[f"param:{key}"] = str(value)[:100] + :MAX_RUN_PARAMETERS + ]: + properties[f"param:{key}"] = str(value)[:MAX_PARAMETER_VALUE_LENGTH] # Create DataProcessInstance dpi = DataProcessInstance( diff --git a/metadata-ingestion/tests/integration/azure_data_factory/adf_basic_golden.json b/metadata-ingestion/tests/integration/azure_data_factory/adf_basic_golden.json index 19e52adb2e48da..eab65cfd2c3a9b 100644 --- a/metadata-ingestion/tests/integration/azure_data_factory/adf_basic_golden.json +++ b/metadata-ingestion/tests/integration/azure_data_factory/adf_basic_golden.json @@ -1,13 +1,13 @@ [ { "entityType": "container", - "entityUrn": "urn:li:container:0690c0f5c38fc7fe7ad0518a59384f93", + "entityUrn": "urn:li:container:cdaebfa861d2b2a3853328719496bce5", "changeType": "UPSERT", "aspectName": "containerProperties", "aspect": { "json": { "customProperties": { - "platform": "azure_data_factory", + "platform": "azure-data-factory", "env": "PROD", "resource_group": "test-resource-group", "factory_name": "test-data-factory", @@ -31,12 +31,41 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:0690c0f5c38fc7fe7ad0518a59384f93", + "entityUrn": "urn:li:container:cdaebfa861d2b2a3853328719496bce5", "changeType": "UPSERT", "aspectName": "dataPlatformInstance", "aspect": { "json": { - "platform": "urn:li:dataPlatform:azure_data_factory" + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-basic", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": 
"urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,test-data-factory.DataProcessingPipeline,PROD),CallStoredProc)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "activity_type": "SqlServerStoredProcedure", + "activity_description": "Call stored procedure", + "timeout": "7.00:00:00", + "retry": "0" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/DataProcessingPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/test-resource-group/providers/Microsoft.DataFactory/factories/test-data-factory", + "name": "CallStoredProc", + "description": "Call stored procedure", + "type": { + "string": "COMMAND" + }, + "flowUrn": "urn:li:dataFlow:(azure-data-factory,test-data-factory.DataProcessingPipeline,PROD)", + "env": "PROD" } }, "systemMetadata": { @@ -47,13 +76,29 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:0690c0f5c38fc7fe7ad0518a59384f93", + "entityUrn": "urn:li:container:cdaebfa861d2b2a3853328719496bce5", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-basic", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,test-data-factory.DataProcessingPipeline,PROD),CallStoredProc)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { "json": { "typeNames": [ - "Data Factory" + "Stored Procedure Activity" ] } }, @@ -65,12 +110,14 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:0690c0f5c38fc7fe7ad0518a59384f93", + "entityUrn": "urn:li:container:cdaebfa861d2b2a3853328719496bce5", "changeType": "UPSERT", - "aspectName": "browsePathsV2", + "aspectName": "subTypes", "aspect": { "json": { - "path": [] + "typeNames": [ + "Data Factory" + ] } }, "systemMetadata": { @@ -80,13 +127,13 @@ } }, { - 
"entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD)", + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,test-data-factory.DataProcessingPipeline,PROD),CallStoredProc)", "changeType": "UPSERT", "aspectName": "dataPlatformInstance", "aspect": { "json": { - "platform": "urn:li:dataPlatform:azure_data_factory" + "platform": "urn:li:dataPlatform:azure-data-factory" } }, "systemMetadata": { @@ -96,8 +143,8 @@ } }, { - "entityType": "container", - "entityUrn": "urn:li:container:0690c0f5c38fc7fe7ad0518a59384f93", + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,test-data-factory.DataProcessingPipeline,PROD),CallStoredProc)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -113,7 +160,23 @@ }, { "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD)", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-basic", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD)", "changeType": "UPSERT", "aspectName": "dataFlowInfo", "aspect": { @@ -137,19 +200,19 @@ }, { "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD),TransformData)", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,test-data-factory.DataProcessingPipeline,PROD),CallStoredProc)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { "json": { "path": [ { - "id": 
"urn:li:container:0690c0f5c38fc7fe7ad0518a59384f93", - "urn": "urn:li:container:0690c0f5c38fc7fe7ad0518a59384f93" + "id": "urn:li:container:cdaebfa861d2b2a3853328719496bce5", + "urn": "urn:li:container:cdaebfa861d2b2a3853328719496bce5" }, { - "id": "test-data-factory.DataIngestionPipeline", - "urn": "urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD)" + "id": "test-data-factory.DataProcessingPipeline", + "urn": "urn:li:dataFlow:(azure-data-factory,test-data-factory.DataProcessingPipeline,PROD)" } ] } @@ -162,7 +225,7 @@ }, { "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD)", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,test-data-factory.DataProcessingPipeline,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -179,14 +242,35 @@ } }, { - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD),TransformData)", + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:cdaebfa861d2b2a3853328719496bce5", + "urn": "urn:li:container:cdaebfa861d2b2a3853328719496bce5" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-basic", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { "json": { "typeNames": [ - "Data Flow Activity" + "Pipeline" ] } }, @@ -198,12 +282,12 @@ }, { "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD)", + "entityUrn": 
"urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD)", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:0690c0f5c38fc7fe7ad0518a59384f93" + "container": "urn:li:container:cdaebfa861d2b2a3853328719496bce5" } }, "systemMetadata": { @@ -214,17 +298,12 @@ }, { "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD)", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,test-data-factory.DataProcessingPipeline,PROD)", "changeType": "UPSERT", - "aspectName": "browsePathsV2", + "aspectName": "container", "aspect": { "json": { - "path": [ - { - "id": "urn:li:container:0690c0f5c38fc7fe7ad0518a59384f93", - "urn": "urn:li:container:0690c0f5c38fc7fe7ad0518a59384f93" - } - ] + "container": "urn:li:container:cdaebfa861d2b2a3853328719496bce5" } }, "systemMetadata": { @@ -234,13 +313,13 @@ } }, { - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD),TransformData)", + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,test-data-factory.DataProcessingPipeline,PROD)", "changeType": "UPSERT", "aspectName": "dataPlatformInstance", "aspect": { "json": { - "platform": "urn:li:dataPlatform:azure_data_factory" + "platform": "urn:li:dataPlatform:azure-data-factory" } }, "systemMetadata": { @@ -250,13 +329,20 @@ } }, { - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD),TransformData)", + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,test-data-factory.DataProcessingPipeline,PROD)", "changeType": "UPSERT", - "aspectName": "status", + "aspectName": "dataFlowInfo", "aspect": { "json": { - "removed": false + "customProperties": { + "azure_resource_id": 
"/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/test-resource-group/providers/Microsoft.DataFactory/factories/test-data-factory/pipelines/DataProcessingPipeline", + "factory_name": "test-data-factory" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/DataProcessingPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/test-resource-group/providers/Microsoft.DataFactory/factories/test-data-factory", + "name": "DataProcessingPipeline", + "description": "Data processing and transformation", + "env": "PROD" } }, "systemMetadata": { @@ -267,12 +353,28 @@ }, { "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD),CopyBlobToSQL)", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD),CopyBlobToSQL)", "changeType": "UPSERT", "aspectName": "dataPlatformInstance", "aspect": { "json": { - "platform": "urn:li:dataPlatform:azure_data_factory" + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-basic", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,test-data-factory.DataProcessingPipeline,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false } }, "systemMetadata": { @@ -283,7 +385,7 @@ }, { "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD),CopyBlobToSQL)", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD),CopyBlobToSQL)", "changeType": "UPSERT", "aspectName": "dataJobInfo", "aspect": { @@ -300,7 +402,7 @@ "type": { "string": "COMMAND" }, - "flowUrn": 
"urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD)", + "flowUrn": "urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD)", "env": "PROD" } }, @@ -312,25 +414,17 @@ }, { "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD),TransformData)", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD),CopyBlobToSQL)", "changeType": "UPSERT", - "aspectName": "dataJobInfo", + "aspectName": "dataJobInputOutput", "aspect": { "json": { - "customProperties": { - "activity_type": "ExecuteDataFlow", - "activity_description": "Execute mapping data flow", - "timeout": "7.00:00:00", - "retry": "0" - }, - "externalUrl": "https://adf.azure.com/en/authoring/pipeline/DataIngestionPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/test-resource-group/providers/Microsoft.DataFactory/factories/test-data-factory", - "name": "TransformData", - "description": "Execute mapping data flow", - "type": { - "string": "COMMAND" - }, - "flowUrn": "urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD)", - "env": "PROD" + "inputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:azure_blob_storage,raw-data/input/data.csv,PROD)" + ], + "outputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:mssql,ProcessedData,PROD)" + ] } }, "systemMetadata": { @@ -341,19 +435,19 @@ }, { "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD),CopyBlobToSQL)", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD),CopyBlobToSQL)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { "json": { "path": [ { - "id": "urn:li:container:0690c0f5c38fc7fe7ad0518a59384f93", - "urn": 
"urn:li:container:0690c0f5c38fc7fe7ad0518a59384f93" + "id": "urn:li:container:cdaebfa861d2b2a3853328719496bce5", + "urn": "urn:li:container:cdaebfa861d2b2a3853328719496bce5" }, { "id": "test-data-factory.DataIngestionPipeline", - "urn": "urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD)" + "urn": "urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD)" } ] } @@ -366,7 +460,7 @@ }, { "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD),CopyBlobToSQL)", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD),CopyBlobToSQL)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -383,13 +477,18 @@ } }, { - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD),LookupConfig)", + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,test-data-factory.DataProcessingPipeline,PROD)", "changeType": "UPSERT", - "aspectName": "dataPlatformInstance", + "aspectName": "browsePathsV2", "aspect": { "json": { - "platform": "urn:li:dataPlatform:azure_data_factory" + "path": [ + { + "id": "urn:li:container:cdaebfa861d2b2a3853328719496bce5", + "urn": "urn:li:container:cdaebfa861d2b2a3853328719496bce5" + } + ] } }, "systemMetadata": { @@ -400,7 +499,7 @@ }, { "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD),LookupConfig)", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD),LookupConfig)", "changeType": "UPSERT", "aspectName": "dataJobInfo", "aspect": { @@ -417,7 +516,7 @@ "type": { "string": "COMMAND" }, - "flowUrn": "urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD)", + "flowUrn": 
"urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD)", "env": "PROD" } }, @@ -429,16 +528,13 @@ }, { "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD),CopyBlobToSQL)", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD),LookupConfig)", "changeType": "UPSERT", - "aspectName": "dataJobInputOutput", + "aspectName": "subTypes", "aspect": { "json": { - "inputDatasets": [ - "urn:li:dataset:(urn:li:dataPlatform:azure_blob_storage,raw-data/input/data.csv,PROD)" - ], - "outputDatasets": [ - "urn:li:dataset:(urn:li:dataPlatform:mssql,ProcessedData,PROD)" + "typeNames": [ + "Lookup Activity" ] } }, @@ -450,28 +546,12 @@ }, { "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD),CopyBlobToSQL)", - "changeType": "UPSERT", - "aspectName": "status", - "aspect": { - "json": { - "removed": false - } - }, - "systemMetadata": { - "lastObserved": 1705320000000, - "runId": "adf-test-basic", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(azure_data_factory,test-data-factory.DataProcessingPipeline,PROD)", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD),LookupConfig)", "changeType": "UPSERT", "aspectName": "dataPlatformInstance", "aspect": { "json": { - "platform": "urn:li:dataPlatform:azure_data_factory" + "platform": "urn:li:dataPlatform:azure-data-factory" } }, "systemMetadata": { @@ -482,37 +562,12 @@ }, { "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD),LookupConfig)", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD),TransformData)", 
"changeType": "UPSERT", - "aspectName": "subTypes", - "aspect": { - "json": { - "typeNames": [ - "Lookup Activity" - ] - } - }, - "systemMetadata": { - "lastObserved": 1705320000000, - "runId": "adf-test-basic", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(azure_data_factory,test-data-factory.DataProcessingPipeline,PROD)", - "changeType": "UPSERT", - "aspectName": "dataFlowInfo", + "aspectName": "dataPlatformInstance", "aspect": { "json": { - "customProperties": { - "azure_resource_id": "/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/test-resource-group/providers/Microsoft.DataFactory/factories/test-data-factory/pipelines/DataProcessingPipeline", - "factory_name": "test-data-factory" - }, - "externalUrl": "https://adf.azure.com/en/authoring/pipeline/DataProcessingPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/test-resource-group/providers/Microsoft.DataFactory/factories/test-data-factory", - "name": "DataProcessingPipeline", - "description": "Data processing and transformation", - "env": "PROD" + "platform": "urn:li:dataPlatform:azure-data-factory" } }, "systemMetadata": { @@ -523,7 +578,7 @@ }, { "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD),LookupConfig)", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD),LookupConfig)", "changeType": "UPSERT", "aspectName": "dataJobInputOutput", "aspect": { @@ -541,15 +596,26 @@ } }, { - "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(azure_data_factory,test-data-factory.DataProcessingPipeline,PROD)", + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD),TransformData)", "changeType": "UPSERT", - "aspectName": "subTypes", + "aspectName": "dataJobInfo", 
"aspect": { "json": { - "typeNames": [ - "Pipeline" - ] + "customProperties": { + "activity_type": "ExecuteDataFlow", + "activity_description": "Execute mapping data flow", + "timeout": "7.00:00:00", + "retry": "0" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/DataIngestionPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/test-resource-group/providers/Microsoft.DataFactory/factories/test-data-factory", + "name": "TransformData", + "description": "Execute mapping data flow", + "type": { + "string": "COMMAND" + }, + "flowUrn": "urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD)", + "env": "PROD" } }, "systemMetadata": { @@ -560,19 +626,19 @@ }, { "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,test-data-factory.DataProcessingPipeline,PROD),CallStoredProc)", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD),TransformData)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { "json": { "path": [ { - "id": "urn:li:container:0690c0f5c38fc7fe7ad0518a59384f93", - "urn": "urn:li:container:0690c0f5c38fc7fe7ad0518a59384f93" + "id": "urn:li:container:cdaebfa861d2b2a3853328719496bce5", + "urn": "urn:li:container:cdaebfa861d2b2a3853328719496bce5" }, { - "id": "test-data-factory.DataProcessingPipeline", - "urn": "urn:li:dataFlow:(azure_data_factory,test-data-factory.DataProcessingPipeline,PROD)" + "id": "test-data-factory.DataIngestionPipeline", + "urn": "urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD)" } ] } @@ -584,13 +650,15 @@ } }, { - "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(azure_data_factory,test-data-factory.DataProcessingPipeline,PROD)", + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD),TransformData)", "changeType": 
"UPSERT", - "aspectName": "container", + "aspectName": "subTypes", "aspect": { "json": { - "container": "urn:li:container:0690c0f5c38fc7fe7ad0518a59384f93" + "typeNames": [ + "Data Flow Activity" + ] } }, "systemMetadata": { @@ -600,18 +668,13 @@ } }, { - "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(azure_data_factory,test-data-factory.DataProcessingPipeline,PROD)", + "entityType": "container", + "entityUrn": "urn:li:container:cdaebfa861d2b2a3853328719496bce5", "changeType": "UPSERT", - "aspectName": "browsePathsV2", + "aspectName": "status", "aspect": { "json": { - "path": [ - { - "id": "urn:li:container:0690c0f5c38fc7fe7ad0518a59384f93", - "urn": "urn:li:container:0690c0f5c38fc7fe7ad0518a59384f93" - } - ] + "removed": false } }, "systemMetadata": { @@ -622,14 +685,12 @@ }, { "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,test-data-factory.DataProcessingPipeline,PROD),CallStoredProc)", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD),LookupConfig)", "changeType": "UPSERT", - "aspectName": "subTypes", + "aspectName": "status", "aspect": { "json": { - "typeNames": [ - "Stored Procedure Activity" - ] + "removed": false } }, "systemMetadata": { @@ -640,19 +701,19 @@ }, { "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD),LookupConfig)", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD),LookupConfig)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { "json": { "path": [ { - "id": "urn:li:container:0690c0f5c38fc7fe7ad0518a59384f93", - "urn": "urn:li:container:0690c0f5c38fc7fe7ad0518a59384f93" + "id": "urn:li:container:cdaebfa861d2b2a3853328719496bce5", + "urn": "urn:li:container:cdaebfa861d2b2a3853328719496bce5" }, { "id": "test-data-factory.DataIngestionPipeline", - "urn": 
"urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD)" + "urn": "urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD)" } ] } @@ -665,23 +726,7 @@ }, { "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,test-data-factory.DataProcessingPipeline,PROD),CallStoredProc)", - "changeType": "UPSERT", - "aspectName": "dataPlatformInstance", - "aspect": { - "json": { - "platform": "urn:li:dataPlatform:azure_data_factory" - } - }, - "systemMetadata": { - "lastObserved": 1705320000000, - "runId": "adf-test-basic", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(azure_data_factory,test-data-factory.DataProcessingPipeline,PROD)", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD),CopyBlobToSQL)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -697,52 +742,7 @@ }, { "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,test-data-factory.DataProcessingPipeline,PROD),CallStoredProc)", - "changeType": "UPSERT", - "aspectName": "status", - "aspect": { - "json": { - "removed": false - } - }, - "systemMetadata": { - "lastObserved": 1705320000000, - "runId": "adf-test-basic", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,test-data-factory.DataProcessingPipeline,PROD),CallStoredProc)", - "changeType": "UPSERT", - "aspectName": "dataJobInfo", - "aspect": { - "json": { - "customProperties": { - "activity_type": "SqlServerStoredProcedure", - "activity_description": "Call stored procedure", - "timeout": "7.00:00:00", - "retry": "0" - }, - "externalUrl": 
"https://adf.azure.com/en/authoring/pipeline/DataProcessingPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/test-resource-group/providers/Microsoft.DataFactory/factories/test-data-factory", - "name": "CallStoredProc", - "description": "Call stored procedure", - "type": { - "string": "COMMAND" - }, - "flowUrn": "urn:li:dataFlow:(azure_data_factory,test-data-factory.DataProcessingPipeline,PROD)", - "env": "PROD" - } - }, - "systemMetadata": { - "lastObserved": 1705320000000, - "runId": "adf-test-basic", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD),LookupConfig)", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD),TransformData)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -758,7 +758,7 @@ }, { "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD)", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD)", "changeType": "UPSERT", "aspectName": "status", "aspect": { diff --git a/metadata-ingestion/tests/integration/azure_data_factory/adf_branching_golden.json b/metadata-ingestion/tests/integration/azure_data_factory/adf_branching_golden.json new file mode 100644 index 00000000000000..ab0139120c5ca9 --- /dev/null +++ b/metadata-ingestion/tests/integration/azure_data_factory/adf_branching_golden.json @@ -0,0 +1,508 @@ +[ +{ + "entityType": "container", + "entityUrn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "changeType": "UPSERT", + "aspectName": "containerProperties", + "aspect": { + "json": { + "customProperties": { + "platform": "azure-data-factory", + "env": "DEV", + "resource_group": "complex-test-rg", + "factory_name": "complex-data-factory", + "azure_resource_id": 
"/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "location": "eastus", + "tag:environment": "test", + "tag:purpose": "complex-integration-tests", + "provisioning_state": "Succeeded" + }, + "externalUrl": "https://adf.azure.com/en/home?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "name": "complex-data-factory", + "description": "Azure Data Factory: complex-data-factory", + "env": "DEV" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-branching-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-branching-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.BranchingPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Pipeline" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-branching-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-branching-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.BranchingPipeline,DEV)", + "changeType": 
"UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-branching-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Data Factory" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-branching-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.BranchingPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-branching-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.BranchingPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "dataFlowInfo", + "aspect": { + "json": { + "customProperties": { + "azure_resource_id": "/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory/pipelines/BranchingPipeline", + "factory_name": "complex-data-factory" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/BranchingPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "name": "BranchingPipeline", + "description": "Pipeline with If-Condition and Switch branching", + "env": "DEV" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-branching-test", + "lastRunId": 
"no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.BranchingPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-branching-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.BranchingPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "urn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-branching-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.BranchingPipeline,DEV),ProcessByRegion)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "activity_type": "Switch" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/BranchingPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "name": "ProcessByRegion", + "type": { + "string": "COMMAND" + }, + "flowUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.BranchingPipeline,DEV)", + "env": "DEV" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-branching-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.BranchingPipeline,DEV),ProcessByRegion)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + 
"typeNames": [ + "Switch Activity" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-branching-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.BranchingPipeline,DEV),ProcessByRegion)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-branching-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.BranchingPipeline,DEV),CheckDataExists)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-branching-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.BranchingPipeline,DEV),ProcessByRegion)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-branching-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.BranchingPipeline,DEV),CheckDataExists)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "activity_type": "Lookup" + }, + "externalUrl": 
"https://adf.azure.com/en/authoring/pipeline/BranchingPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "name": "CheckDataExists", + "type": { + "string": "COMMAND" + }, + "flowUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.BranchingPipeline,DEV)", + "env": "DEV" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-branching-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.BranchingPipeline,DEV),CheckDataExists)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "urn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" + }, + { + "id": "complex-data-factory.BranchingPipeline", + "urn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.BranchingPipeline,DEV)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-branching-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.BranchingPipeline,DEV),DataExistsCheck)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-branching-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.BranchingPipeline,DEV),CheckDataExists)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Lookup Activity" + ] + } + }, + "systemMetadata": { + 
"lastObserved": 1705320000000, + "runId": "adf-branching-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.BranchingPipeline,DEV),DataExistsCheck)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "activity_type": "IfCondition" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/BranchingPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "name": "DataExistsCheck", + "type": { + "string": "COMMAND" + }, + "flowUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.BranchingPipeline,DEV)", + "env": "DEV" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-branching-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.BranchingPipeline,DEV),DataExistsCheck)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "urn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" + }, + { + "id": "complex-data-factory.BranchingPipeline", + "urn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.BranchingPipeline,DEV)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-branching-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.BranchingPipeline,DEV),DataExistsCheck)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "If Condition" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + 
"runId": "adf-branching-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-branching-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.BranchingPipeline,DEV),ProcessByRegion)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "urn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" + }, + { + "id": "complex-data-factory.BranchingPipeline", + "urn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.BranchingPipeline,DEV)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-branching-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.BranchingPipeline,DEV),CheckDataExists)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-branching-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.BranchingPipeline,DEV),DataExistsCheck)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-branching-test", + "lastRunId": "no-run-id-provided" + } +} +] \ No newline at end of file diff --git 
a/metadata-ingestion/tests/integration/azure_data_factory/adf_dataflow_golden.json b/metadata-ingestion/tests/integration/azure_data_factory/adf_dataflow_golden.json new file mode 100644 index 00000000000000..80468bab9c4f0b --- /dev/null +++ b/metadata-ingestion/tests/integration/azure_data_factory/adf_dataflow_golden.json @@ -0,0 +1,308 @@ +[ +{ + "entityType": "container", + "entityUrn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "changeType": "UPSERT", + "aspectName": "containerProperties", + "aspect": { + "json": { + "customProperties": { + "platform": "azure-data-factory", + "env": "DEV", + "resource_group": "complex-test-rg", + "factory_name": "complex-data-factory", + "azure_resource_id": "/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "location": "eastus", + "tag:environment": "test", + "tag:purpose": "complex-integration-tests", + "provisioning_state": "Succeeded" + }, + "externalUrl": "https://adf.azure.com/en/home?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "name": "complex-data-factory", + "description": "Azure Data Factory: complex-data-factory", + "env": "DEV" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-dataflow-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-dataflow-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": 
"urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.DataFlowPipeline,DEV),RunSalesTransformation)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "activity_type": "ExecuteDataFlow" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/DataFlowPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "name": "RunSalesTransformation", + "type": { + "string": "COMMAND" + }, + "flowUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.DataFlowPipeline,DEV)", + "env": "DEV" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-dataflow-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-dataflow-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.DataFlowPipeline,DEV),RunSalesTransformation)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Data Flow Activity" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-dataflow-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Data Factory" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-dataflow-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + 
"entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.DataFlowPipeline,DEV),RunSalesTransformation)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-dataflow-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.DataFlowPipeline,DEV),RunSalesTransformation)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-dataflow-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.DataFlowPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-dataflow-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.DataFlowPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "dataFlowInfo", + "aspect": { + "json": { + "customProperties": { + "azure_resource_id": "/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory/pipelines/DataFlowPipeline", + "factory_name": "complex-data-factory" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/DataFlowPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "name": "DataFlowPipeline", + 
"description": "Pipeline that executes a mapping data flow", + "env": "DEV" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-dataflow-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.DataFlowPipeline,DEV),RunSalesTransformation)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "urn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" + }, + { + "id": "complex-data-factory.DataFlowPipeline", + "urn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.DataFlowPipeline,DEV)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-dataflow-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.DataFlowPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Pipeline" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-dataflow-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.DataFlowPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "urn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-dataflow-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.DataFlowPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + 
"container": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-dataflow-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-dataflow-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.DataFlowPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-dataflow-test", + "lastRunId": "no-run-id-provided" + } +} +] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/azure_data_factory/adf_diverse_golden.json b/metadata-ingestion/tests/integration/azure_data_factory/adf_diverse_golden.json new file mode 100644 index 00000000000000..b9d147739ddcea --- /dev/null +++ b/metadata-ingestion/tests/integration/azure_data_factory/adf_diverse_golden.json @@ -0,0 +1,1108 @@ +[ +{ + "entityType": "container", + "entityUrn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "changeType": "UPSERT", + "aspectName": "containerProperties", + "aspect": { + "json": { + "customProperties": { + "platform": "azure-data-factory", + "env": "DEV", + "resource_group": "complex-test-rg", + "factory_name": "complex-data-factory", + "azure_resource_id": "/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "location": "eastus", + "tag:environment": "test", + "tag:purpose": "complex-integration-tests", + "provisioning_state": "Succeeded" + }, + "externalUrl": 
"https://adf.azure.com/en/home?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "name": "complex-data-factory", + "description": "Azure Data Factory: complex-data-factory", + "env": "DEV" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-diverse-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-diverse-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV),CheckOutputExists)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "activity_type": "GetMetadata" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/DiverseActivitiesPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "name": "CheckOutputExists", + "type": { + "string": "COMMAND" + }, + "flowUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV)", + "env": "DEV" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-diverse-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": 
"adf-diverse-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-diverse-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "dataFlowInfo", + "aspect": { + "json": { + "customProperties": { + "azure_resource_id": "/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory/pipelines/DiverseActivitiesPipeline", + "factory_name": "complex-data-factory" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/DiverseActivitiesPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "name": "DiverseActivitiesPipeline", + "description": "Pipeline demonstrating various activity types", + "env": "DEV" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-diverse-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV),CheckOutputExists)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Get Metadata Activity" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-diverse-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": 
"urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Data Factory" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-diverse-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "urn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-diverse-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Pipeline" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-diverse-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-diverse-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV),CheckOutputExists)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 
1705320000000, + "runId": "adf-diverse-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV),CheckOutputExists)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-diverse-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV),CheckOutputExists)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "urn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" + }, + { + "id": "complex-data-factory.DiverseActivitiesPipeline", + "urn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-diverse-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV),RunAnalyticsScript)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "activity_type": "Script" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/DiverseActivitiesPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "name": "RunAnalyticsScript", + "type": { + "string": "COMMAND" + }, + "flowUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV)", + "env": "DEV" + } + }, + "systemMetadata": { + "lastObserved": 
1705320000000, + "runId": "adf-diverse-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV),RunAnalyticsScript)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Script Activity" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-diverse-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV),RunAnalyticsScript)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-diverse-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV),RunAnalyticsScript)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-diverse-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV),InitializeCounter)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-diverse-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": 
"urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV),RunAnalyticsScript)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "urn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" + }, + { + "id": "complex-data-factory.DiverseActivitiesPipeline", + "urn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-diverse-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV),InitializeCounter)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "activity_type": "SetVariable" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/DiverseActivitiesPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "name": "InitializeCounter", + "type": { + "string": "COMMAND" + }, + "flowUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV)", + "env": "DEV" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-diverse-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV),InitializeCounter)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "urn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" + }, + { + "id": 
"complex-data-factory.DiverseActivitiesPipeline", + "urn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-diverse-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV),ProcessDataWithSP)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-diverse-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV),FetchConfiguration)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-diverse-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV),ProcessDataWithSP)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "activity_type": "SqlServerStoredProcedure" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/DiverseActivitiesPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "name": "ProcessDataWithSP", + "type": { + "string": "COMMAND" + }, + "flowUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV)", + "env": "DEV" + } + }, + 
"systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-diverse-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV),ProcessDataWithSP)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "urn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" + }, + { + "id": "complex-data-factory.DiverseActivitiesPipeline", + "urn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-diverse-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV),ProcessDataWithSP)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Stored Procedure Activity" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-diverse-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV),InitializeCounter)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Set Variable" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-diverse-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV),FetchConfiguration)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + 
"activity_type": "WebActivity" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/DiverseActivitiesPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "name": "FetchConfiguration", + "type": { + "string": "COMMAND" + }, + "flowUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV)", + "env": "DEV" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-diverse-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV),FetchConfiguration)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "urn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" + }, + { + "id": "complex-data-factory.DiverseActivitiesPipeline", + "urn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-diverse-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV),FetchConfiguration)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Web Activity" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-diverse-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV),WaitForReplication)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": 
{ + "customProperties": { + "activity_type": "Wait" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/DiverseActivitiesPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "name": "WaitForReplication", + "type": { + "string": "COMMAND" + }, + "flowUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV)", + "env": "DEV" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-diverse-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV),WaitForReplication)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Wait Activity" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-diverse-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV),WaitForReplication)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-diverse-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV),WaitForReplication)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-diverse-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": 
"urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV),WaitForReplication)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "urn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" + }, + { + "id": "complex-data-factory.DiverseActivitiesPipeline", + "urn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-diverse-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV),InitializeCounter)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-diverse-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV),SendCompletionNotification)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-diverse-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV),FetchConfiguration)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-diverse-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": 
"urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV),SendCompletionNotification)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "activity_type": "AzureFunctionActivity" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/DiverseActivitiesPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "name": "SendCompletionNotification", + "type": { + "string": "COMMAND" + }, + "flowUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV)", + "env": "DEV" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-diverse-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV),SendCompletionNotification)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "urn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" + }, + { + "id": "complex-data-factory.DiverseActivitiesPipeline", + "urn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-diverse-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV),SendCompletionNotification)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Azure Function Activity" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-diverse-test", + 
"lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV),RunMLTrainingNotebook)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "activity_type": "DatabricksNotebook" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/DiverseActivitiesPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "name": "RunMLTrainingNotebook", + "type": { + "string": "COMMAND" + }, + "flowUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV)", + "env": "DEV" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-diverse-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV),RunMLTrainingNotebook)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Databricks Notebook" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-diverse-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV),RunMLTrainingNotebook)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-diverse-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": 
"urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV),RunMLTrainingNotebook)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-diverse-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV),RunMLTrainingNotebook)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "urn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" + }, + { + "id": "complex-data-factory.DiverseActivitiesPipeline", + "urn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-diverse-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-diverse-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV),FailOnCriticalError)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "activity_type": "Fail" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/DiverseActivitiesPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "name": 
"FailOnCriticalError", + "type": { + "string": "COMMAND" + }, + "flowUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV)", + "env": "DEV" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-diverse-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV),FailOnCriticalError)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Fail Activity" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-diverse-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV),FailOnCriticalError)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-diverse-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-diverse-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV),FailOnCriticalError)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-diverse-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + 
"entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV),FailOnCriticalError)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "urn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" + }, + { + "id": "complex-data-factory.DiverseActivitiesPipeline", + "urn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-diverse-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV),ProcessDataWithSP)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-diverse-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.DiverseActivitiesPipeline,DEV),SendCompletionNotification)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-diverse-test", + "lastRunId": "no-run-id-provided" + } +} +] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/azure_data_factory/adf_foreach_golden.json b/metadata-ingestion/tests/integration/azure_data_factory/adf_foreach_golden.json new file mode 100644 index 00000000000000..e62e3649a69bf7 --- /dev/null +++ b/metadata-ingestion/tests/integration/azure_data_factory/adf_foreach_golden.json @@ -0,0 +1,408 @@ +[ +{ + "entityType": "container", + "entityUrn": 
"urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "changeType": "UPSERT", + "aspectName": "containerProperties", + "aspect": { + "json": { + "customProperties": { + "platform": "azure-data-factory", + "env": "DEV", + "resource_group": "complex-test-rg", + "factory_name": "complex-data-factory", + "azure_resource_id": "/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "location": "eastus", + "tag:environment": "test", + "tag:purpose": "complex-integration-tests", + "provisioning_state": "Succeeded" + }, + "externalUrl": "https://adf.azure.com/en/home?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "name": "complex-data-factory", + "description": "Azure Data Factory: complex-data-factory", + "env": "DEV" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-foreach-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-foreach-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ForEachTablePipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Pipeline" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-foreach-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + 
"aspect": { + "json": { + "path": [] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-foreach-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ForEachTablePipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-foreach-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Data Factory" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-foreach-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ForEachTablePipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-foreach-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ForEachTablePipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "dataFlowInfo", + "aspect": { + "json": { + "customProperties": { + "azure_resource_id": "/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory/pipelines/ForEachTablePipeline", + "factory_name": "complex-data-factory" + }, + "externalUrl": 
"https://adf.azure.com/en/authoring/pipeline/ForEachTablePipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "name": "ForEachTablePipeline", + "description": "Pipeline with ForEach loop to copy multiple tables", + "env": "DEV" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-foreach-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ForEachTablePipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-foreach-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ForEachTablePipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "urn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-foreach-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ForEachTablePipeline,DEV),IterateOverTables)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "activity_type": "ForEach" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/ForEachTablePipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "name": "IterateOverTables", + "type": { + "string": "COMMAND" + }, + "flowUrn": 
"urn:li:dataFlow:(azure-data-factory,complex-data-factory.ForEachTablePipeline,DEV)", + "env": "DEV" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-foreach-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ForEachTablePipeline,DEV),IterateOverTables)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "ForEach Loop" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-foreach-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ForEachTablePipeline,DEV),IterateOverTables)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-foreach-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ForEachTablePipeline,DEV),GetTableList)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-foreach-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ForEachTablePipeline,DEV),IterateOverTables)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-foreach-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + 
"entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ForEachTablePipeline,DEV),GetTableList)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "activity_type": "Lookup" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/ForEachTablePipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "name": "GetTableList", + "type": { + "string": "COMMAND" + }, + "flowUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ForEachTablePipeline,DEV)", + "env": "DEV" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-foreach-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ForEachTablePipeline,DEV),GetTableList)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "urn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" + }, + { + "id": "complex-data-factory.ForEachTablePipeline", + "urn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ForEachTablePipeline,DEV)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-foreach-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ForEachTablePipeline,DEV),GetTableList)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Lookup Activity" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-foreach-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": 
"urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-foreach-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ForEachTablePipeline,DEV),IterateOverTables)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "urn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" + }, + { + "id": "complex-data-factory.ForEachTablePipeline", + "urn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ForEachTablePipeline,DEV)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-foreach-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ForEachTablePipeline,DEV),GetTableList)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-foreach-test", + "lastRunId": "no-run-id-provided" + } +} +] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/azure_data_factory/adf_multisource_golden.json b/metadata-ingestion/tests/integration/azure_data_factory/adf_multisource_golden.json new file mode 100644 index 00000000000000..830c84f0b559cb --- /dev/null +++ b/metadata-ingestion/tests/integration/azure_data_factory/adf_multisource_golden.json @@ -0,0 +1,813 @@ +[ +{ + "entityType": "container", + "entityUrn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "changeType": "UPSERT", + "aspectName": "containerProperties", + "aspect": { + "json": { + "customProperties": { + 
"platform": "azure-data-factory", + "env": "DEV", + "resource_group": "complex-test-rg", + "factory_name": "complex-data-factory", + "azure_resource_id": "/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "location": "eastus", + "tag:environment": "test", + "tag:purpose": "complex-integration-tests", + "provisioning_state": "Succeeded" + }, + "externalUrl": "https://adf.azure.com/en/home?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "name": "complex-data-factory", + "description": "Azure Data Factory: complex-data-factory", + "env": "DEV" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-multisource-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-multisource-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ETLPipeline,DEV),ArchiveToDataLake)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "activity_type": "Copy" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/ETLPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "name": "ArchiveToDataLake", + "type": { + "string": "COMMAND" + }, + "flowUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ETLPipeline,DEV)", + "env": "DEV" + } + }, 
+ "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-multisource-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-multisource-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ETLPipeline,DEV),ArchiveToDataLake)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Copy Activity" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-multisource-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Data Factory" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-multisource-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ETLPipeline,DEV),ArchiveToDataLake)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-multisource-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ETLPipeline,DEV),ArchiveToDataLake)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [ + 
"urn:li:dataset:(urn:li:dataPlatform:azure_blob_storage,staging/customers,DEV)" + ], + "outputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:azure_data_lake,sales,DEV)" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-multisource-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ETLPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-multisource-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ETLPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "dataFlowInfo", + "aspect": { + "json": { + "customProperties": { + "azure_resource_id": "/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory/pipelines/ETLPipeline", + "factory_name": "complex-data-factory" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/ETLPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "name": "ETLPipeline", + "description": "Full ETL pipeline: Extract from SQL, stage in Blob, load to Synapse and archive", + "env": "DEV" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-multisource-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ETLPipeline,DEV),ArchiveToDataLake)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + 
"systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-multisource-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ETLPipeline,DEV),ArchiveToDataLake)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "urn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" + }, + { + "id": "complex-data-factory.ETLPipeline", + "urn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ETLPipeline,DEV)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-multisource-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ETLPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "urn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-multisource-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ETLPipeline,DEV),ExtractCustomersFromSQL)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-multisource-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ETLPipeline,DEV),ExtractOrdersFromSQL)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + 
"json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-multisource-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ETLPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Pipeline" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-multisource-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ETLPipeline,DEV),ExtractOrdersFromSQL)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "activity_type": "Copy" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/ETLPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "name": "ExtractOrdersFromSQL", + "type": { + "string": "COMMAND" + }, + "flowUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ETLPipeline,DEV)", + "env": "DEV" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-multisource-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ETLPipeline,DEV),ExtractOrdersFromSQL)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:mssql,Orders,DEV)" + ], + "outputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:azure_blob_storage,staging/orders,DEV)" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-multisource-test", + "lastRunId": 
"no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ETLPipeline,DEV),ExtractOrdersFromSQL)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "urn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" + }, + { + "id": "complex-data-factory.ETLPipeline", + "urn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ETLPipeline,DEV)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-multisource-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ETLPipeline,DEV),ExtractOrdersFromSQL)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Copy Activity" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-multisource-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ETLPipeline,DEV),ExtractCustomersFromSQL)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "activity_type": "Copy" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/ETLPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "name": "ExtractCustomersFromSQL", + "type": { + "string": "COMMAND" + }, + "flowUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ETLPipeline,DEV)", + "env": "DEV" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-multisource-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": 
"dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ETLPipeline,DEV),ExtractCustomersFromSQL)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:mssql,Customers,DEV)" + ], + "outputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:azure_blob_storage,staging/customers,DEV)" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-multisource-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ETLPipeline,DEV),ExtractCustomersFromSQL)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "urn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" + }, + { + "id": "complex-data-factory.ETLPipeline", + "urn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ETLPipeline,DEV)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-multisource-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ETLPipeline,DEV),ExtractCustomersFromSQL)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Copy Activity" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-multisource-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ETLPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" + } + }, + "systemMetadata": { + "lastObserved": 
1705320000000, + "runId": "adf-multisource-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ETLPipeline,DEV),LoadOrdersToSynapse)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "activity_type": "Copy" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/ETLPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "name": "LoadOrdersToSynapse", + "type": { + "string": "COMMAND" + }, + "flowUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ETLPipeline,DEV)", + "env": "DEV" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-multisource-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ETLPipeline,DEV),LoadOrdersToSynapse)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Copy Activity" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-multisource-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ETLPipeline,DEV),LoadOrdersToSynapse)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-multisource-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ETLPipeline,DEV),LoadOrdersToSynapse)", + "changeType": "UPSERT", + 
"aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:azure_blob_storage,staging/orders,DEV)" + ], + "outputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:synapse,FactOrders,DEV)" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-multisource-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ETLPipeline,DEV),LoadOrdersToSynapse)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-multisource-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ETLPipeline,DEV),LoadOrdersToSynapse)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "urn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" + }, + { + "id": "complex-data-factory.ETLPipeline", + "urn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ETLPipeline,DEV)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-multisource-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ETLPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-multisource-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ETLPipeline,DEV),LoadCustomersToSynapse)", + 
"changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "activity_type": "Copy" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/ETLPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "name": "LoadCustomersToSynapse", + "type": { + "string": "COMMAND" + }, + "flowUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ETLPipeline,DEV)", + "env": "DEV" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-multisource-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ETLPipeline,DEV),LoadCustomersToSynapse)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-multisource-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ETLPipeline,DEV),LoadCustomersToSynapse)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Copy Activity" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-multisource-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ETLPipeline,DEV),LoadCustomersToSynapse)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:azure_blob_storage,staging/customers,DEV)" + ], + "outputDatasets": [ + 
"urn:li:dataset:(urn:li:dataPlatform:synapse,DimCustomers,DEV)" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-multisource-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ETLPipeline,DEV),LoadCustomersToSynapse)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-multisource-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ETLPipeline,DEV),LoadCustomersToSynapse)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "urn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" + }, + { + "id": "complex-data-factory.ETLPipeline", + "urn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ETLPipeline,DEV)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-multisource-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ETLPipeline,DEV),ExtractCustomersFromSQL)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-multisource-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ETLPipeline,DEV),ExtractOrdersFromSQL)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 
1705320000000, + "runId": "adf-multisource-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-multisource-test", + "lastRunId": "no-run-id-provided" + } +} +] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/azure_data_factory/adf_nested_golden.json b/metadata-ingestion/tests/integration/azure_data_factory/adf_nested_golden.json new file mode 100644 index 00000000000000..2a7e65206305d5 --- /dev/null +++ b/metadata-ingestion/tests/integration/azure_data_factory/adf_nested_golden.json @@ -0,0 +1,895 @@ +[ +{ + "entityType": "container", + "entityUrn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "changeType": "UPSERT", + "aspectName": "containerProperties", + "aspect": { + "json": { + "customProperties": { + "platform": "azure-data-factory", + "env": "DEV", + "resource_group": "complex-test-rg", + "factory_name": "complex-data-factory", + "azure_resource_id": "/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "location": "eastus", + "tag:environment": "test", + "tag:purpose": "complex-integration-tests", + "provisioning_state": "Succeeded" + }, + "externalUrl": "https://adf.azure.com/en/home?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "name": "complex-data-factory", + "description": "Azure Data Factory: complex-data-factory", + "env": "DEV" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-nested-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": 
"urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-nested-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ChildDataMovementPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Pipeline" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-nested-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-nested-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ChildDataMovementPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-nested-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Data Factory" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-nested-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ChildDataMovementPipeline,DEV)", + "changeType": 
"UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-nested-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ChildDataMovementPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "dataFlowInfo", + "aspect": { + "json": { + "customProperties": { + "azure_resource_id": "/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory/pipelines/ChildDataMovementPipeline", + "factory_name": "complex-data-factory" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/ChildDataMovementPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "name": "ChildDataMovementPipeline", + "description": "Child pipeline for data movement", + "env": "DEV" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-nested-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ParentOrchestrationPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-nested-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ParentOrchestrationPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "dataFlowInfo", + "aspect": { + "json": { + "customProperties": { + "azure_resource_id": 
"/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory/pipelines/ParentOrchestrationPipeline", + "factory_name": "complex-data-factory" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/ParentOrchestrationPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "name": "ParentOrchestrationPipeline", + "description": "Parent orchestration pipeline that calls child pipelines", + "env": "DEV" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-nested-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ChildDataMovementPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-nested-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ChildDataMovementPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "urn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-nested-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ParentOrchestrationPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "urn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" + } + ] + 
} + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-nested-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ParentOrchestrationPipeline,DEV),ExecuteDataMovement)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-nested-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ParentOrchestrationPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Pipeline" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-nested-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ParentOrchestrationPipeline,DEV),ExecuteDataMovement)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "activity_type": "ExecutePipeline", + "calls_pipeline": "ChildDataMovementPipeline", + "child_pipeline_urn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ChildDataMovementPipeline,DEV)", + "child_first_activity": "CopyCustomersToStaging" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/ParentOrchestrationPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "name": "ExecuteDataMovement", + "type": { + "string": "COMMAND" + }, + "flowUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ParentOrchestrationPipeline,DEV)", + "env": "DEV" + } + }, + "systemMetadata": { + 
"lastObserved": 1705320000000, + "runId": "adf-nested-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ParentOrchestrationPipeline,DEV),ExecuteDataMovement)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [], + "outputDatasets": [], + "inputDatajobs": [ + "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ChildDataMovementPipeline,DEV),CopyCustomersToStaging)" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-nested-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ParentOrchestrationPipeline,DEV),ExecuteDataMovement)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "urn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" + }, + { + "id": "complex-data-factory.ParentOrchestrationPipeline", + "urn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ParentOrchestrationPipeline,DEV)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-nested-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ParentOrchestrationPipeline,DEV),ExecuteTransform)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-nested-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": 
"urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ParentOrchestrationPipeline,DEV),ExecuteDataMovement)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Execute Pipeline" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-nested-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ParentOrchestrationPipeline,DEV),ExecuteTransform)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "activity_type": "ExecutePipeline", + "calls_pipeline": "ChildTransformPipeline", + "child_pipeline_urn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ChildTransformPipeline,DEV)", + "child_first_activity": "TransformCustomerData" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/ParentOrchestrationPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "name": "ExecuteTransform", + "type": { + "string": "COMMAND" + }, + "flowUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ParentOrchestrationPipeline,DEV)", + "env": "DEV" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-nested-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ParentOrchestrationPipeline,DEV),ExecuteTransform)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [], + "outputDatasets": [], + "inputDatajobs": [ + "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ChildTransformPipeline,DEV),TransformCustomerData)" + ] + } + }, + "systemMetadata": { + 
"lastObserved": 1705320000000, + "runId": "adf-nested-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ParentOrchestrationPipeline,DEV),ExecuteTransform)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "urn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" + }, + { + "id": "complex-data-factory.ParentOrchestrationPipeline", + "urn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ParentOrchestrationPipeline,DEV)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-nested-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ParentOrchestrationPipeline,DEV),ExecuteTransform)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Execute Pipeline" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-nested-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ParentOrchestrationPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-nested-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ParentOrchestrationPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": 
"adf-nested-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ParentOrchestrationPipeline,DEV),ExecuteDataMovement)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-nested-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ParentOrchestrationPipeline,DEV),ExecuteTransform)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-nested-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ChildDataMovementPipeline,DEV),CopyCustomersToStaging)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-nested-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ChildTransformPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Pipeline" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-nested-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ChildDataMovementPipeline,DEV),CopyCustomersToStaging)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + 
"customProperties": { + "activity_type": "Copy" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/ChildDataMovementPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "name": "CopyCustomersToStaging", + "type": { + "string": "COMMAND" + }, + "flowUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ChildDataMovementPipeline,DEV)", + "env": "DEV" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-nested-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ChildDataMovementPipeline,DEV),CopyCustomersToStaging)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:mssql,Customers,DEV)" + ], + "outputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:azure_blob_storage,staging/customers,DEV)" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-nested-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ChildDataMovementPipeline,DEV),CopyCustomersToStaging)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "urn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" + }, + { + "id": "complex-data-factory.ChildDataMovementPipeline", + "urn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ChildDataMovementPipeline,DEV)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-nested-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": 
"urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ChildDataMovementPipeline,DEV),CopyCustomersToStaging)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Copy Activity" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-nested-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ChildTransformPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-nested-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ChildTransformPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-nested-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ChildTransformPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "dataFlowInfo", + "aspect": { + "json": { + "customProperties": { + "azure_resource_id": "/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory/pipelines/ChildTransformPipeline", + "factory_name": "complex-data-factory" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/ChildTransformPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "name": "ChildTransformPipeline", + 
"description": "Child pipeline for data transformation", + "env": "DEV" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-nested-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ChildTransformPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-nested-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ChildTransformPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "urn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-nested-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-nested-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ChildTransformPipeline,DEV),TransformCustomerData)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "activity_type": "ExecuteDataFlow" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/ChildTransformPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "name": 
"TransformCustomerData", + "type": { + "string": "COMMAND" + }, + "flowUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ChildTransformPipeline,DEV)", + "env": "DEV" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-nested-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ChildTransformPipeline,DEV),TransformCustomerData)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Data Flow Activity" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-nested-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ChildTransformPipeline,DEV),TransformCustomerData)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-nested-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ChildTransformPipeline,DEV),TransformCustomerData)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-nested-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ChildTransformPipeline,DEV),TransformCustomerData)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "urn": 
"urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" + }, + { + "id": "complex-data-factory.ChildTransformPipeline", + "urn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ChildTransformPipeline,DEV)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-nested-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ChildDataMovementPipeline,DEV),CopyCustomersToStaging)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-nested-test", + "lastRunId": "no-run-id-provided" + } +} +] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/azure_data_factory/adf_platform_instance_golden.json b/metadata-ingestion/tests/integration/azure_data_factory/adf_platform_instance_golden.json index 05d62f667a6fcc..5da85bdc49314c 100644 --- a/metadata-ingestion/tests/integration/azure_data_factory/adf_platform_instance_golden.json +++ b/metadata-ingestion/tests/integration/azure_data_factory/adf_platform_instance_golden.json @@ -1,13 +1,13 @@ [ { "entityType": "container", - "entityUrn": "urn:li:container:b5eba2c40de50f1afa7f4c8876a663d4", + "entityUrn": "urn:li:container:99b9785e9e12713c9df27982572a999c", "changeType": "UPSERT", "aspectName": "containerProperties", "aspect": { "json": { "customProperties": { - "platform": "azure_data_factory", + "platform": "azure-data-factory", "instance": "my-adf-instance", "env": "DEV", "resource_group": "test-resource-group", @@ -32,13 +32,13 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:b5eba2c40de50f1afa7f4c8876a663d4", + "entityUrn": "urn:li:container:99b9785e9e12713c9df27982572a999c", "changeType": "UPSERT", "aspectName": "dataPlatformInstance", "aspect": { "json": { - "platform": 
"urn:li:dataPlatform:azure_data_factory", - "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:azure_data_factory,my-adf-instance)" + "platform": "urn:li:dataPlatform:azure-data-factory", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:azure-data-factory,my-adf-instance)" } }, "systemMetadata": { @@ -48,15 +48,26 @@ } }, { - "entityType": "container", - "entityUrn": "urn:li:container:b5eba2c40de50f1afa7f4c8876a663d4", + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV),TransformData)", "changeType": "UPSERT", - "aspectName": "subTypes", + "aspectName": "dataJobInfo", "aspect": { "json": { - "typeNames": [ - "Data Factory" - ] + "customProperties": { + "activity_type": "ExecuteDataFlow", + "activity_description": "Execute mapping data flow", + "timeout": "7.00:00:00", + "retry": "0" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/DataIngestionPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/test-resource-group/providers/Microsoft.DataFactory/factories/test-data-factory", + "name": "TransformData", + "description": "Execute mapping data flow", + "type": { + "string": "COMMAND" + }, + "flowUrn": "urn:li:dataFlow:(azure-data-factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV)", + "env": "DEV" } }, "systemMetadata": { @@ -67,15 +78,15 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:b5eba2c40de50f1afa7f4c8876a663d4", + "entityUrn": "urn:li:container:99b9785e9e12713c9df27982572a999c", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { "json": { "path": [ { - "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:azure_data_factory,my-adf-instance)", - "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:azure_data_factory,my-adf-instance)" + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:azure-data-factory,my-adf-instance)", 
+ "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:azure-data-factory,my-adf-instance)" } ] } @@ -87,14 +98,15 @@ } }, { - "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(azure_data_factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV)", + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV),TransformData)", "changeType": "UPSERT", - "aspectName": "dataPlatformInstance", + "aspectName": "subTypes", "aspect": { "json": { - "platform": "urn:li:dataPlatform:azure_data_factory", - "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:azure_data_factory,my-adf-instance)" + "typeNames": [ + "Data Flow Activity" + ] } }, "systemMetadata": { @@ -105,12 +117,14 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:b5eba2c40de50f1afa7f4c8876a663d4", + "entityUrn": "urn:li:container:99b9785e9e12713c9df27982572a999c", "changeType": "UPSERT", - "aspectName": "status", + "aspectName": "subTypes", "aspect": { "json": { - "removed": false + "typeNames": [ + "Data Factory" + ] } }, "systemMetadata": { @@ -120,21 +134,14 @@ } }, { - "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(azure_data_factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV)", + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV),TransformData)", "changeType": "UPSERT", - "aspectName": "dataFlowInfo", + "aspectName": "dataPlatformInstance", "aspect": { "json": { - "customProperties": { - "azure_resource_id": "/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/test-resource-group/providers/Microsoft.DataFactory/factories/test-data-factory/pipelines/DataIngestionPipeline", - "factory_name": "test-data-factory", - "triggers": "DailyScheduleTrigger" - }, - "externalUrl": 
"https://adf.azure.com/en/authoring/pipeline/DataIngestionPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/test-resource-group/providers/Microsoft.DataFactory/factories/test-data-factory", - "name": "DataIngestionPipeline", - "description": "Main data ingestion pipeline", - "env": "DEV" + "platform": "urn:li:dataPlatform:azure-data-factory", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:azure-data-factory,my-adf-instance)" } }, "systemMetadata": { @@ -144,13 +151,13 @@ } }, { - "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(azure_data_factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV)", + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV),TransformData)", "changeType": "UPSERT", - "aspectName": "container", + "aspectName": "status", "aspect": { "json": { - "container": "urn:li:container:b5eba2c40de50f1afa7f4c8876a663d4" + "removed": false } }, "systemMetadata": { @@ -161,14 +168,13 @@ }, { "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(azure_data_factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV)", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV)", "changeType": "UPSERT", - "aspectName": "subTypes", + "aspectName": "dataPlatformInstance", "aspect": { "json": { - "typeNames": [ - "Pipeline" - ] + "platform": "urn:li:dataPlatform:azure-data-factory", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:azure-data-factory,my-adf-instance)" } }, "systemMetadata": { @@ -179,13 +185,37 @@ }, { "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV),CopyBlobToSQL)", + "entityUrn": 
"urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV),CopyBlobToSQL)", "changeType": "UPSERT", "aspectName": "dataPlatformInstance", "aspect": { "json": { - "platform": "urn:li:dataPlatform:azure_data_factory", - "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:azure_data_factory,my-adf-instance)" + "platform": "urn:li:dataPlatform:azure-data-factory", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:azure-data-factory,my-adf-instance)" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-platform-instance", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "dataFlowInfo", + "aspect": { + "json": { + "customProperties": { + "azure_resource_id": "/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/test-resource-group/providers/Microsoft.DataFactory/factories/test-data-factory/pipelines/DataIngestionPipeline", + "factory_name": "test-data-factory", + "triggers": "DailyScheduleTrigger" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/DataIngestionPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/test-resource-group/providers/Microsoft.DataFactory/factories/test-data-factory", + "name": "DataIngestionPipeline", + "description": "Main data ingestion pipeline", + "env": "DEV" } }, "systemMetadata": { @@ -196,7 +226,7 @@ }, { "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV),CopyBlobToSQL)", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV),CopyBlobToSQL)", "changeType": "UPSERT", "aspectName": "dataJobInfo", "aspect": { @@ -213,7 
+243,7 @@ "type": { "string": "COMMAND" }, - "flowUrn": "urn:li:dataFlow:(azure_data_factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV)", + "flowUrn": "urn:li:dataFlow:(azure-data-factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV)", "env": "DEV" } }, @@ -224,21 +254,17 @@ } }, { - "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(azure_data_factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV)", + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV),CopyBlobToSQL)", "changeType": "UPSERT", - "aspectName": "browsePathsV2", + "aspectName": "dataJobInputOutput", "aspect": { "json": { - "path": [ - { - "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:azure_data_factory,my-adf-instance)", - "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:azure_data_factory,my-adf-instance)" - }, - { - "id": "urn:li:container:b5eba2c40de50f1afa7f4c8876a663d4", - "urn": "urn:li:container:b5eba2c40de50f1afa7f4c8876a663d4" - } + "inputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:azure_blob_storage,raw-data/input/data.csv,DEV)" + ], + "outputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:mssql,ProcessedData,DEV)" ] } }, @@ -249,13 +275,26 @@ } }, { - "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(azure_data_factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV)", + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV),CopyBlobToSQL)", "changeType": "UPSERT", - "aspectName": "status", + "aspectName": "browsePathsV2", "aspect": { "json": { - "removed": false + "path": [ + { + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:azure-data-factory,my-adf-instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:azure-data-factory,my-adf-instance)" + }, + { + "id": 
"urn:li:container:99b9785e9e12713c9df27982572a999c", + "urn": "urn:li:container:99b9785e9e12713c9df27982572a999c" + }, + { + "id": "my-adf-instance.test-data-factory.DataIngestionPipeline", + "urn": "urn:li:dataFlow:(azure-data-factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV)" + } + ] } }, "systemMetadata": { @@ -266,7 +305,7 @@ }, { "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV),CopyBlobToSQL)", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV),CopyBlobToSQL)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -284,44 +323,23 @@ }, { "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV),CopyBlobToSQL)", - "changeType": "UPSERT", - "aspectName": "dataJobInputOutput", - "aspect": { - "json": { - "inputDatasets": [ - "urn:li:dataset:(urn:li:dataPlatform:azure_blob_storage,raw-data/input/data.csv,DEV)" - ], - "outputDatasets": [ - "urn:li:dataset:(urn:li:dataPlatform:mssql,ProcessedData,DEV)" - ] - } - }, - "systemMetadata": { - "lastObserved": 1705320000000, - "runId": "adf-test-platform-instance", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,my-adf-instance.test-data-factory.DataProcessingPipeline,DEV),CallStoredProc)", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV),TransformData)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { "json": { "path": [ { - "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:azure_data_factory,my-adf-instance)", - "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:azure_data_factory,my-adf-instance)" + "id": 
"urn:li:dataPlatformInstance:(urn:li:dataPlatform:azure-data-factory,my-adf-instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:azure-data-factory,my-adf-instance)" }, { - "id": "urn:li:container:b5eba2c40de50f1afa7f4c8876a663d4", - "urn": "urn:li:container:b5eba2c40de50f1afa7f4c8876a663d4" + "id": "urn:li:container:99b9785e9e12713c9df27982572a999c", + "urn": "urn:li:container:99b9785e9e12713c9df27982572a999c" }, { - "id": "my-adf-instance.test-data-factory.DataProcessingPipeline", - "urn": "urn:li:dataFlow:(azure_data_factory,my-adf-instance.test-data-factory.DataProcessingPipeline,DEV)" + "id": "my-adf-instance.test-data-factory.DataIngestionPipeline", + "urn": "urn:li:dataFlow:(azure-data-factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV)" } ] } @@ -333,14 +351,14 @@ } }, { - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,my-adf-instance.test-data-factory.DataProcessingPipeline,DEV),CallStoredProc)", + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { "json": { "typeNames": [ - "Stored Procedure Activity" + "Pipeline" ] } }, @@ -351,24 +369,20 @@ } }, { - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV),CopyBlobToSQL)", + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { "json": { "path": [ { - "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:azure_data_factory,my-adf-instance)", - "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:azure_data_factory,my-adf-instance)" - }, - { - "id": "urn:li:container:b5eba2c40de50f1afa7f4c8876a663d4", - "urn": 
"urn:li:container:b5eba2c40de50f1afa7f4c8876a663d4" + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:azure-data-factory,my-adf-instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:azure-data-factory,my-adf-instance)" }, { - "id": "my-adf-instance.test-data-factory.DataIngestionPipeline", - "urn": "urn:li:dataFlow:(azure_data_factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV)" + "id": "urn:li:container:99b9785e9e12713c9df27982572a999c", + "urn": "urn:li:container:99b9785e9e12713c9df27982572a999c" } ] } @@ -380,14 +394,13 @@ } }, { - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,my-adf-instance.test-data-factory.DataProcessingPipeline,DEV),CallStoredProc)", + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV)", "changeType": "UPSERT", - "aspectName": "dataPlatformInstance", + "aspectName": "container", "aspect": { "json": { - "platform": "urn:li:dataPlatform:azure_data_factory", - "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:azure_data_factory,my-adf-instance)" + "container": "urn:li:container:99b9785e9e12713c9df27982572a999c" } }, "systemMetadata": { @@ -398,13 +411,29 @@ }, { "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV),LookupConfig)", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV),LookupConfig)", "changeType": "UPSERT", "aspectName": "dataPlatformInstance", "aspect": { "json": { - "platform": "urn:li:dataPlatform:azure_data_factory", - "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:azure_data_factory,my-adf-instance)" + "platform": "urn:li:dataPlatform:azure-data-factory", + "instance": 
"urn:li:dataPlatformInstance:(urn:li:dataPlatform:azure-data-factory,my-adf-instance)" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-platform-instance", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false } }, "systemMetadata": { @@ -415,7 +444,7 @@ }, { "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV),LookupConfig)", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV),LookupConfig)", "changeType": "UPSERT", "aspectName": "dataJobInfo", "aspect": { @@ -432,7 +461,7 @@ "type": { "string": "COMMAND" }, - "flowUrn": "urn:li:dataFlow:(azure_data_factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV)", + "flowUrn": "urn:li:dataFlow:(azure-data-factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV)", "env": "DEV" } }, @@ -444,12 +473,62 @@ }, { "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,my-adf-instance.test-data-factory.DataProcessingPipeline,DEV),CallStoredProc)", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV),LookupConfig)", "changeType": "UPSERT", - "aspectName": "status", + "aspectName": "dataJobInputOutput", "aspect": { "json": { - "removed": false + "inputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:azure_blob_storage,config/settings.json,DEV)" + ], + "outputDatasets": [] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-platform-instance", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + 
"entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV),LookupConfig)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:azure-data-factory,my-adf-instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:azure-data-factory,my-adf-instance)" + }, + { + "id": "urn:li:container:99b9785e9e12713c9df27982572a999c", + "urn": "urn:li:container:99b9785e9e12713c9df27982572a999c" + }, + { + "id": "my-adf-instance.test-data-factory.DataIngestionPipeline", + "urn": "urn:li:dataFlow:(azure-data-factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-platform-instance", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV),LookupConfig)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Lookup Activity" + ] } }, "systemMetadata": { @@ -460,7 +539,7 @@ }, { "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,my-adf-instance.test-data-factory.DataProcessingPipeline,DEV),CallStoredProc)", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,my-adf-instance.test-data-factory.DataProcessingPipeline,DEV),CallStoredProc)", "changeType": "UPSERT", "aspectName": "dataJobInfo", "aspect": { @@ -477,7 +556,7 @@ "type": { "string": "COMMAND" }, - "flowUrn": "urn:li:dataFlow:(azure_data_factory,my-adf-instance.test-data-factory.DataProcessingPipeline,DEV)", + "flowUrn": "urn:li:dataFlow:(azure-data-factory,my-adf-instance.test-data-factory.DataProcessingPipeline,DEV)", "env": "DEV" } }, @@ -489,13 +568,13 @@ }, { "entityType": "dataJob", - 
"entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV),LookupConfig)", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,my-adf-instance.test-data-factory.DataProcessingPipeline,DEV),CallStoredProc)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { "json": { "typeNames": [ - "Lookup Activity" + "Stored Procedure Activity" ] } }, @@ -507,15 +586,13 @@ }, { "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV),LookupConfig)", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,my-adf-instance.test-data-factory.DataProcessingPipeline,DEV),CallStoredProc)", "changeType": "UPSERT", - "aspectName": "dataJobInputOutput", + "aspectName": "dataPlatformInstance", "aspect": { "json": { - "inputDatasets": [ - "urn:li:dataset:(urn:li:dataPlatform:azure_blob_storage,config/settings.json,DEV)" - ], - "outputDatasets": [] + "platform": "urn:li:dataPlatform:azure-data-factory", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:azure-data-factory,my-adf-instance)" } }, "systemMetadata": { @@ -525,13 +602,13 @@ } }, { - "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(azure_data_factory,my-adf-instance.test-data-factory.DataProcessingPipeline,DEV)", + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,my-adf-instance.test-data-factory.DataProcessingPipeline,DEV),CallStoredProc)", "changeType": "UPSERT", - "aspectName": "container", + "aspectName": "status", "aspect": { "json": { - "container": "urn:li:container:b5eba2c40de50f1afa7f4c8876a663d4" + "removed": false } }, "systemMetadata": { @@ -541,20 +618,24 @@ } }, { - "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(azure_data_factory,my-adf-instance.test-data-factory.DataProcessingPipeline,DEV)", + "entityType": "dataJob", + "entityUrn": 
"urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,my-adf-instance.test-data-factory.DataProcessingPipeline,DEV),CallStoredProc)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { "json": { "path": [ { - "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:azure_data_factory,my-adf-instance)", - "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:azure_data_factory,my-adf-instance)" + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:azure-data-factory,my-adf-instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:azure-data-factory,my-adf-instance)" + }, + { + "id": "urn:li:container:99b9785e9e12713c9df27982572a999c", + "urn": "urn:li:container:99b9785e9e12713c9df27982572a999c" }, { - "id": "urn:li:container:b5eba2c40de50f1afa7f4c8876a663d4", - "urn": "urn:li:container:b5eba2c40de50f1afa7f4c8876a663d4" + "id": "my-adf-instance.test-data-factory.DataProcessingPipeline", + "urn": "urn:li:dataFlow:(azure-data-factory,my-adf-instance.test-data-factory.DataProcessingPipeline,DEV)" } ] } @@ -566,26 +647,13 @@ } }, { - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV),LookupConfig)", + "entityType": "container", + "entityUrn": "urn:li:container:99b9785e9e12713c9df27982572a999c", "changeType": "UPSERT", - "aspectName": "browsePathsV2", + "aspectName": "status", "aspect": { "json": { - "path": [ - { - "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:azure_data_factory,my-adf-instance)", - "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:azure_data_factory,my-adf-instance)" - }, - { - "id": "urn:li:container:b5eba2c40de50f1afa7f4c8876a663d4", - "urn": "urn:li:container:b5eba2c40de50f1afa7f4c8876a663d4" - }, - { - "id": "my-adf-instance.test-data-factory.DataIngestionPipeline", - "urn": "urn:li:dataFlow:(azure_data_factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV)" - } - ] + "removed": false 
} }, "systemMetadata": { @@ -596,7 +664,7 @@ }, { "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(azure_data_factory,my-adf-instance.test-data-factory.DataProcessingPipeline,DEV)", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,my-adf-instance.test-data-factory.DataProcessingPipeline,DEV)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -613,14 +681,13 @@ } }, { - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV),TransformData)", + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,my-adf-instance.test-data-factory.DataProcessingPipeline,DEV)", "changeType": "UPSERT", - "aspectName": "dataPlatformInstance", + "aspectName": "container", "aspect": { "json": { - "platform": "urn:li:dataPlatform:azure_data_factory", - "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:azure_data_factory,my-adf-instance)" + "container": "urn:li:container:99b9785e9e12713c9df27982572a999c" } }, "systemMetadata": { @@ -630,26 +697,14 @@ } }, { - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV),TransformData)", + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,my-adf-instance.test-data-factory.DataProcessingPipeline,DEV)", "changeType": "UPSERT", - "aspectName": "dataJobInfo", + "aspectName": "dataPlatformInstance", "aspect": { "json": { - "customProperties": { - "activity_type": "ExecuteDataFlow", - "activity_description": "Execute mapping data flow", - "timeout": "7.00:00:00", - "retry": "0" - }, - "externalUrl": "https://adf.azure.com/en/authoring/pipeline/DataIngestionPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/test-resource-group/providers/Microsoft.DataFactory/factories/test-data-factory", - "name": "TransformData", - "description": 
"Execute mapping data flow", - "type": { - "string": "COMMAND" - }, - "flowUrn": "urn:li:dataFlow:(azure_data_factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV)", - "env": "DEV" + "platform": "urn:li:dataPlatform:azure-data-factory", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:azure-data-factory,my-adf-instance)" } }, "systemMetadata": { @@ -660,7 +715,7 @@ }, { "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(azure_data_factory,my-adf-instance.test-data-factory.DataProcessingPipeline,DEV)", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,my-adf-instance.test-data-factory.DataProcessingPipeline,DEV)", "changeType": "UPSERT", "aspectName": "dataFlowInfo", "aspect": { @@ -683,13 +738,12 @@ }, { "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(azure_data_factory,my-adf-instance.test-data-factory.DataProcessingPipeline,DEV)", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,my-adf-instance.test-data-factory.DataProcessingPipeline,DEV)", "changeType": "UPSERT", - "aspectName": "dataPlatformInstance", + "aspectName": "status", "aspect": { "json": { - "platform": "urn:li:dataPlatform:azure_data_factory", - "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:azure_data_factory,my-adf-instance)" + "removed": false } }, "systemMetadata": { @@ -700,25 +754,7 @@ }, { "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV),TransformData)", - "changeType": "UPSERT", - "aspectName": "subTypes", - "aspect": { - "json": { - "typeNames": [ - "Data Flow Activity" - ] - } - }, - "systemMetadata": { - "lastObserved": 1705320000000, - "runId": "adf-test-platform-instance", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(azure_data_factory,my-adf-instance.test-data-factory.DataProcessingPipeline,DEV)", + "entityUrn": 
"urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV),CopyBlobToSQL)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -733,24 +769,20 @@ } }, { - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV),TransformData)", + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,my-adf-instance.test-data-factory.DataProcessingPipeline,DEV)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { "json": { "path": [ { - "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:azure_data_factory,my-adf-instance)", - "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:azure_data_factory,my-adf-instance)" - }, - { - "id": "urn:li:container:b5eba2c40de50f1afa7f4c8876a663d4", - "urn": "urn:li:container:b5eba2c40de50f1afa7f4c8876a663d4" + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:azure-data-factory,my-adf-instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:azure-data-factory,my-adf-instance)" }, { - "id": "my-adf-instance.test-data-factory.DataIngestionPipeline", - "urn": "urn:li:dataFlow:(azure_data_factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV)" + "id": "urn:li:container:99b9785e9e12713c9df27982572a999c", + "urn": "urn:li:container:99b9785e9e12713c9df27982572a999c" } ] } @@ -763,39 +795,7 @@ }, { "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV),CopyBlobToSQL)", - "changeType": "UPSERT", - "aspectName": "status", - "aspect": { - "json": { - "removed": false - } - }, - "systemMetadata": { - "lastObserved": 1705320000000, - "runId": "adf-test-platform-instance", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataJob", - "entityUrn": 
"urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV),TransformData)", - "changeType": "UPSERT", - "aspectName": "status", - "aspect": { - "json": { - "removed": false - } - }, - "systemMetadata": { - "lastObserved": 1705320000000, - "runId": "adf-test-platform-instance", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV),LookupConfig)", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,my-adf-instance.test-data-factory.DataIngestionPipeline,DEV),LookupConfig)", "changeType": "UPSERT", "aspectName": "status", "aspect": { diff --git a/metadata-ingestion/tests/integration/azure_data_factory/adf_with_runs_golden.json b/metadata-ingestion/tests/integration/azure_data_factory/adf_with_runs_golden.json index 2043c73b4bbbdd..4d324a76d50694 100644 --- a/metadata-ingestion/tests/integration/azure_data_factory/adf_with_runs_golden.json +++ b/metadata-ingestion/tests/integration/azure_data_factory/adf_with_runs_golden.json @@ -1,13 +1,13 @@ [ { "entityType": "container", - "entityUrn": "urn:li:container:0690c0f5c38fc7fe7ad0518a59384f93", + "entityUrn": "urn:li:container:cdaebfa861d2b2a3853328719496bce5", "changeType": "UPSERT", "aspectName": "containerProperties", "aspect": { "json": { "customProperties": { - "platform": "azure_data_factory", + "platform": "azure-data-factory", "env": "PROD", "resource_group": "test-resource-group", "factory_name": "test-data-factory", @@ -31,12 +31,12 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:0690c0f5c38fc7fe7ad0518a59384f93", + "entityUrn": "urn:li:container:cdaebfa861d2b2a3853328719496bce5", "changeType": "UPSERT", "aspectName": "dataPlatformInstance", "aspect": { "json": { - "platform": "urn:li:dataPlatform:azure_data_factory" + "platform": "urn:li:dataPlatform:azure-data-factory" } }, 
"systemMetadata": { @@ -47,7 +47,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:0690c0f5c38fc7fe7ad0518a59384f93", + "entityUrn": "urn:li:container:cdaebfa861d2b2a3853328719496bce5", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -65,7 +65,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:0690c0f5c38fc7fe7ad0518a59384f93", + "entityUrn": "urn:li:container:cdaebfa861d2b2a3853328719496bce5", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { @@ -81,12 +81,12 @@ }, { "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD)", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD)", "changeType": "UPSERT", "aspectName": "dataPlatformInstance", "aspect": { "json": { - "platform": "urn:li:dataPlatform:azure_data_factory" + "platform": "urn:li:dataPlatform:azure-data-factory" } }, "systemMetadata": { @@ -97,7 +97,7 @@ }, { "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD)", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD)", "changeType": "UPSERT", "aspectName": "dataFlowInfo", "aspect": { @@ -121,7 +121,7 @@ }, { "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD)", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -139,12 +139,12 @@ }, { "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD)", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD)", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": 
"urn:li:container:0690c0f5c38fc7fe7ad0518a59384f93" + "container": "urn:li:container:cdaebfa861d2b2a3853328719496bce5" } }, "systemMetadata": { @@ -155,15 +155,15 @@ }, { "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD)", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { "json": { "path": [ { - "id": "urn:li:container:0690c0f5c38fc7fe7ad0518a59384f93", - "urn": "urn:li:container:0690c0f5c38fc7fe7ad0518a59384f93" + "id": "urn:li:container:cdaebfa861d2b2a3853328719496bce5", + "urn": "urn:li:container:cdaebfa861d2b2a3853328719496bce5" } ] } @@ -176,12 +176,12 @@ }, { "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD),CopyBlobToSQL)", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD),CopyBlobToSQL)", "changeType": "UPSERT", "aspectName": "dataPlatformInstance", "aspect": { "json": { - "platform": "urn:li:dataPlatform:azure_data_factory" + "platform": "urn:li:dataPlatform:azure-data-factory" } }, "systemMetadata": { @@ -192,7 +192,7 @@ }, { "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD),CopyBlobToSQL)", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD),CopyBlobToSQL)", "changeType": "UPSERT", "aspectName": "dataJobInfo", "aspect": { @@ -209,7 +209,7 @@ "type": { "string": "COMMAND" }, - "flowUrn": "urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD)", + "flowUrn": "urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD)", "env": "PROD" } }, @@ -221,7 +221,7 @@ }, { "entityType": "dataJob", - "entityUrn": 
"urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD),CopyBlobToSQL)", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD),CopyBlobToSQL)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -239,7 +239,7 @@ }, { "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD),CopyBlobToSQL)", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD),CopyBlobToSQL)", "changeType": "UPSERT", "aspectName": "dataJobInputOutput", "aspect": { @@ -260,19 +260,19 @@ }, { "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD),CopyBlobToSQL)", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD),CopyBlobToSQL)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { "json": { "path": [ { - "id": "urn:li:container:0690c0f5c38fc7fe7ad0518a59384f93", - "urn": "urn:li:container:0690c0f5c38fc7fe7ad0518a59384f93" + "id": "urn:li:container:cdaebfa861d2b2a3853328719496bce5", + "urn": "urn:li:container:cdaebfa861d2b2a3853328719496bce5" }, { "id": "test-data-factory.DataIngestionPipeline", - "urn": "urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD)" + "urn": "urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD)" } ] } @@ -285,12 +285,12 @@ }, { "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD),LookupConfig)", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD),LookupConfig)", "changeType": "UPSERT", "aspectName": "dataPlatformInstance", "aspect": { "json": { - "platform": 
"urn:li:dataPlatform:azure_data_factory" + "platform": "urn:li:dataPlatform:azure-data-factory" } }, "systemMetadata": { @@ -301,7 +301,7 @@ }, { "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD),LookupConfig)", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD),LookupConfig)", "changeType": "UPSERT", "aspectName": "dataJobInfo", "aspect": { @@ -318,7 +318,7 @@ "type": { "string": "COMMAND" }, - "flowUrn": "urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD)", + "flowUrn": "urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD)", "env": "PROD" } }, @@ -330,7 +330,7 @@ }, { "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD),LookupConfig)", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD),LookupConfig)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -348,7 +348,7 @@ }, { "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD),LookupConfig)", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD),LookupConfig)", "changeType": "UPSERT", "aspectName": "dataJobInputOutput", "aspect": { @@ -367,19 +367,19 @@ }, { "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD),LookupConfig)", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD),LookupConfig)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { "json": { "path": [ { - "id": "urn:li:container:0690c0f5c38fc7fe7ad0518a59384f93", - "urn": 
"urn:li:container:0690c0f5c38fc7fe7ad0518a59384f93" + "id": "urn:li:container:cdaebfa861d2b2a3853328719496bce5", + "urn": "urn:li:container:cdaebfa861d2b2a3853328719496bce5" }, { "id": "test-data-factory.DataIngestionPipeline", - "urn": "urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD)" + "urn": "urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD)" } ] } @@ -392,12 +392,12 @@ }, { "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD),TransformData)", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD),TransformData)", "changeType": "UPSERT", "aspectName": "dataPlatformInstance", "aspect": { "json": { - "platform": "urn:li:dataPlatform:azure_data_factory" + "platform": "urn:li:dataPlatform:azure-data-factory" } }, "systemMetadata": { @@ -408,7 +408,7 @@ }, { "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD),TransformData)", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD),TransformData)", "changeType": "UPSERT", "aspectName": "dataJobInfo", "aspect": { @@ -425,7 +425,7 @@ "type": { "string": "COMMAND" }, - "flowUrn": "urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD)", + "flowUrn": "urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD)", "env": "PROD" } }, @@ -437,7 +437,7 @@ }, { "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD),TransformData)", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD),TransformData)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -455,19 +455,19 @@ }, { 
"entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD),TransformData)", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD),TransformData)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { "json": { "path": [ { - "id": "urn:li:container:0690c0f5c38fc7fe7ad0518a59384f93", - "urn": "urn:li:container:0690c0f5c38fc7fe7ad0518a59384f93" + "id": "urn:li:container:cdaebfa861d2b2a3853328719496bce5", + "urn": "urn:li:container:cdaebfa861d2b2a3853328719496bce5" }, { "id": "test-data-factory.DataIngestionPipeline", - "urn": "urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD)" + "urn": "urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD)" } ] } @@ -480,12 +480,12 @@ }, { "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(azure_data_factory,test-data-factory.DataProcessingPipeline,PROD)", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,test-data-factory.DataProcessingPipeline,PROD)", "changeType": "UPSERT", "aspectName": "dataPlatformInstance", "aspect": { "json": { - "platform": "urn:li:dataPlatform:azure_data_factory" + "platform": "urn:li:dataPlatform:azure-data-factory" } }, "systemMetadata": { @@ -496,7 +496,7 @@ }, { "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(azure_data_factory,test-data-factory.DataProcessingPipeline,PROD)", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,test-data-factory.DataProcessingPipeline,PROD)", "changeType": "UPSERT", "aspectName": "dataFlowInfo", "aspect": { @@ -519,7 +519,7 @@ }, { "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(azure_data_factory,test-data-factory.DataProcessingPipeline,PROD)", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,test-data-factory.DataProcessingPipeline,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -537,12 +537,12 @@ }, 
{ "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(azure_data_factory,test-data-factory.DataProcessingPipeline,PROD)", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,test-data-factory.DataProcessingPipeline,PROD)", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:0690c0f5c38fc7fe7ad0518a59384f93" + "container": "urn:li:container:cdaebfa861d2b2a3853328719496bce5" } }, "systemMetadata": { @@ -553,15 +553,15 @@ }, { "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(azure_data_factory,test-data-factory.DataProcessingPipeline,PROD)", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,test-data-factory.DataProcessingPipeline,PROD)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { "json": { "path": [ { - "id": "urn:li:container:0690c0f5c38fc7fe7ad0518a59384f93", - "urn": "urn:li:container:0690c0f5c38fc7fe7ad0518a59384f93" + "id": "urn:li:container:cdaebfa861d2b2a3853328719496bce5", + "urn": "urn:li:container:cdaebfa861d2b2a3853328719496bce5" } ] } @@ -574,12 +574,12 @@ }, { "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,test-data-factory.DataProcessingPipeline,PROD),CallStoredProc)", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,test-data-factory.DataProcessingPipeline,PROD),CallStoredProc)", "changeType": "UPSERT", "aspectName": "dataPlatformInstance", "aspect": { "json": { - "platform": "urn:li:dataPlatform:azure_data_factory" + "platform": "urn:li:dataPlatform:azure-data-factory" } }, "systemMetadata": { @@ -590,7 +590,7 @@ }, { "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,test-data-factory.DataProcessingPipeline,PROD),CallStoredProc)", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,test-data-factory.DataProcessingPipeline,PROD),CallStoredProc)", "changeType": "UPSERT", "aspectName": "dataJobInfo", "aspect": { @@ -607,7 +607,7 @@ 
"type": { "string": "COMMAND" }, - "flowUrn": "urn:li:dataFlow:(azure_data_factory,test-data-factory.DataProcessingPipeline,PROD)", + "flowUrn": "urn:li:dataFlow:(azure-data-factory,test-data-factory.DataProcessingPipeline,PROD)", "env": "PROD" } }, @@ -619,7 +619,7 @@ }, { "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,test-data-factory.DataProcessingPipeline,PROD),CallStoredProc)", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,test-data-factory.DataProcessingPipeline,PROD),CallStoredProc)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -637,19 +637,19 @@ }, { "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,test-data-factory.DataProcessingPipeline,PROD),CallStoredProc)", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,test-data-factory.DataProcessingPipeline,PROD),CallStoredProc)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { "json": { "path": [ { - "id": "urn:li:container:0690c0f5c38fc7fe7ad0518a59384f93", - "urn": "urn:li:container:0690c0f5c38fc7fe7ad0518a59384f93" + "id": "urn:li:container:cdaebfa861d2b2a3853328719496bce5", + "urn": "urn:li:container:cdaebfa861d2b2a3853328719496bce5" }, { "id": "test-data-factory.DataProcessingPipeline", - "urn": "urn:li:dataFlow:(azure_data_factory,test-data-factory.DataProcessingPipeline,PROD)" + "urn": "urn:li:dataFlow:(azure-data-factory,test-data-factory.DataProcessingPipeline,PROD)" } ] } @@ -662,7 +662,7 @@ }, { "entityType": "dataProcessInstance", - "entityUrn": "urn:li:dataProcessInstance:01640a3ec425fdac43877636b9eeafba", + "entityUrn": "urn:li:dataProcessInstance:7aa70b5e31344dc1946c045ef1df4619", "changeType": "UPSERT", "aspectName": "dataProcessInstanceProperties", "aspect": { @@ -690,12 +690,12 @@ }, { "entityType": "dataProcessInstance", - "entityUrn": "urn:li:dataProcessInstance:01640a3ec425fdac43877636b9eeafba", + "entityUrn": 
"urn:li:dataProcessInstance:7aa70b5e31344dc1946c045ef1df4619", "changeType": "UPSERT", "aspectName": "dataProcessInstanceRelationships", "aspect": { "json": { - "parentTemplate": "urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD)", + "parentTemplate": "urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD)", "upstreamInstances": [] } }, @@ -707,7 +707,7 @@ }, { "entityType": "dataProcessInstance", - "entityUrn": "urn:li:dataProcessInstance:01640a3ec425fdac43877636b9eeafba", + "entityUrn": "urn:li:dataProcessInstance:7aa70b5e31344dc1946c045ef1df4619", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -725,7 +725,7 @@ }, { "entityType": "dataProcessInstance", - "entityUrn": "urn:li:dataProcessInstance:01640a3ec425fdac43877636b9eeafba", + "entityUrn": "urn:li:dataProcessInstance:7aa70b5e31344dc1946c045ef1df4619", "changeType": "UPSERT", "aspectName": "dataProcessInstanceRunEvent", "aspect": { @@ -746,7 +746,7 @@ }, { "entityType": "dataProcessInstance", - "entityUrn": "urn:li:dataProcessInstance:01640a3ec425fdac43877636b9eeafba", + "entityUrn": "urn:li:dataProcessInstance:7aa70b5e31344dc1946c045ef1df4619", "changeType": "UPSERT", "aspectName": "dataProcessInstanceRunEvent", "aspect": { @@ -771,7 +771,7 @@ }, { "entityType": "dataProcessInstance", - "entityUrn": "urn:li:dataProcessInstance:346ef377189e5b4227f7687c2d3fc47c", + "entityUrn": "urn:li:dataProcessInstance:1f3c62dd3b62b29a83221a57582a9bf3", "changeType": "UPSERT", "aspectName": "dataProcessInstanceProperties", "aspect": { @@ -799,12 +799,12 @@ }, { "entityType": "dataProcessInstance", - "entityUrn": "urn:li:dataProcessInstance:346ef377189e5b4227f7687c2d3fc47c", + "entityUrn": "urn:li:dataProcessInstance:1f3c62dd3b62b29a83221a57582a9bf3", "changeType": "UPSERT", "aspectName": "dataProcessInstanceRelationships", "aspect": { "json": { - "parentTemplate": "urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD)", + 
"parentTemplate": "urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD)", "upstreamInstances": [] } }, @@ -816,7 +816,7 @@ }, { "entityType": "dataProcessInstance", - "entityUrn": "urn:li:dataProcessInstance:346ef377189e5b4227f7687c2d3fc47c", + "entityUrn": "urn:li:dataProcessInstance:1f3c62dd3b62b29a83221a57582a9bf3", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -834,7 +834,7 @@ }, { "entityType": "dataProcessInstance", - "entityUrn": "urn:li:dataProcessInstance:346ef377189e5b4227f7687c2d3fc47c", + "entityUrn": "urn:li:dataProcessInstance:1f3c62dd3b62b29a83221a57582a9bf3", "changeType": "UPSERT", "aspectName": "dataProcessInstanceRunEvent", "aspect": { @@ -855,7 +855,7 @@ }, { "entityType": "dataProcessInstance", - "entityUrn": "urn:li:dataProcessInstance:346ef377189e5b4227f7687c2d3fc47c", + "entityUrn": "urn:li:dataProcessInstance:1f3c62dd3b62b29a83221a57582a9bf3", "changeType": "UPSERT", "aspectName": "dataProcessInstanceRunEvent", "aspect": { @@ -880,7 +880,7 @@ }, { "entityType": "dataProcessInstance", - "entityUrn": "urn:li:dataProcessInstance:e94dcdc0d6c440103bda5f316ca28a9e", + "entityUrn": "urn:li:dataProcessInstance:a9a82c0897d67213837e8ee52e99bd9b", "changeType": "UPSERT", "aspectName": "dataProcessInstanceProperties", "aspect": { @@ -908,12 +908,12 @@ }, { "entityType": "dataProcessInstance", - "entityUrn": "urn:li:dataProcessInstance:e94dcdc0d6c440103bda5f316ca28a9e", + "entityUrn": "urn:li:dataProcessInstance:a9a82c0897d67213837e8ee52e99bd9b", "changeType": "UPSERT", "aspectName": "dataProcessInstanceRelationships", "aspect": { "json": { - "parentTemplate": "urn:li:dataFlow:(azure_data_factory,test-data-factory.DataProcessingPipeline,PROD)", + "parentTemplate": "urn:li:dataFlow:(azure-data-factory,test-data-factory.DataProcessingPipeline,PROD)", "upstreamInstances": [] } }, @@ -925,7 +925,7 @@ }, { "entityType": "dataProcessInstance", - "entityUrn": 
"urn:li:dataProcessInstance:e94dcdc0d6c440103bda5f316ca28a9e", + "entityUrn": "urn:li:dataProcessInstance:a9a82c0897d67213837e8ee52e99bd9b", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -943,7 +943,7 @@ }, { "entityType": "dataProcessInstance", - "entityUrn": "urn:li:dataProcessInstance:e94dcdc0d6c440103bda5f316ca28a9e", + "entityUrn": "urn:li:dataProcessInstance:a9a82c0897d67213837e8ee52e99bd9b", "changeType": "UPSERT", "aspectName": "dataProcessInstanceRunEvent", "aspect": { @@ -964,7 +964,7 @@ }, { "entityType": "dataProcessInstance", - "entityUrn": "urn:li:dataProcessInstance:e94dcdc0d6c440103bda5f316ca28a9e", + "entityUrn": "urn:li:dataProcessInstance:a9a82c0897d67213837e8ee52e99bd9b", "changeType": "UPSERT", "aspectName": "dataProcessInstanceRunEvent", "aspect": { @@ -989,7 +989,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:0690c0f5c38fc7fe7ad0518a59384f93", + "entityUrn": "urn:li:container:cdaebfa861d2b2a3853328719496bce5", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -1005,7 +1005,7 @@ }, { "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD)", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -1021,7 +1021,7 @@ }, { "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(azure_data_factory,test-data-factory.DataProcessingPipeline,PROD)", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,test-data-factory.DataProcessingPipeline,PROD)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -1037,7 +1037,7 @@ }, { "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD),CopyBlobToSQL)", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD),CopyBlobToSQL)", 
"changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -1053,7 +1053,7 @@ }, { "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD),LookupConfig)", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD),LookupConfig)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -1069,7 +1069,7 @@ }, { "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,test-data-factory.DataIngestionPipeline,PROD),TransformData)", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD),TransformData)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -1085,7 +1085,7 @@ }, { "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure_data_factory,test-data-factory.DataProcessingPipeline,PROD),CallStoredProc)", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,test-data-factory.DataProcessingPipeline,PROD),CallStoredProc)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -1101,7 +1101,7 @@ }, { "entityType": "dataProcessInstance", - "entityUrn": "urn:li:dataProcessInstance:01640a3ec425fdac43877636b9eeafba", + "entityUrn": "urn:li:dataProcessInstance:1f3c62dd3b62b29a83221a57582a9bf3", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -1117,7 +1117,7 @@ }, { "entityType": "dataProcessInstance", - "entityUrn": "urn:li:dataProcessInstance:346ef377189e5b4227f7687c2d3fc47c", + "entityUrn": "urn:li:dataProcessInstance:7aa70b5e31344dc1946c045ef1df4619", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -1133,7 +1133,7 @@ }, { "entityType": "dataProcessInstance", - "entityUrn": "urn:li:dataProcessInstance:e94dcdc0d6c440103bda5f316ca28a9e", + "entityUrn": "urn:li:dataProcessInstance:a9a82c0897d67213837e8ee52e99bd9b", "changeType": "UPSERT", 
"aspectName": "status", "aspect": { diff --git a/metadata-ingestion/tests/integration/azure_data_factory/complex_mocks.py b/metadata-ingestion/tests/integration/azure_data_factory/complex_mocks.py new file mode 100644 index 00000000000000..42eb5691a98e3c --- /dev/null +++ b/metadata-ingestion/tests/integration/azure_data_factory/complex_mocks.py @@ -0,0 +1,1231 @@ +"""Complex mock data for Azure Data Factory integration tests. + +This module provides mock data for testing complex ADF pipeline patterns: +1. Nested Pipelines (Execute Pipeline activity) +2. ForEach Loops with multiple activities +3. Control Flow Branching (If-Condition, Switch) +4. Mapping Data Flows with transformations +5. Multi-Source Copy Pipelines (SQL → Blob → Synapse) +""" + +from typing import Any, Dict, List + +# Common test constants +SUBSCRIPTION_ID = "12345678-1234-1234-1234-123456789012" +RESOURCE_GROUP = "complex-test-rg" +FACTORY_NAME = "complex-data-factory" +LOCATION = "eastus" + + +def _base_resource_id(resource_type: str, name: str) -> str: + """Generate a standard Azure resource ID.""" + return ( + f"/subscriptions/{SUBSCRIPTION_ID}/resourceGroups/{RESOURCE_GROUP}" + f"/providers/Microsoft.DataFactory/factories/{FACTORY_NAME}/{resource_type}/{name}" + ) + + +# ============================================================================= +# LINKED SERVICES - Various platform types for lineage testing +# ============================================================================= + + +def create_complex_linked_services() -> List[Dict[str, Any]]: + """Create linked services for multiple platforms.""" + return [ + { + "id": _base_resource_id("linkedservices", "SqlServerSource"), + "name": "SqlServerSource", + "type": "Microsoft.DataFactory/factories/linkedservices", + "properties": { + "type": "AzureSqlDatabase", + "typeProperties": { + "connectionString": "Server=sql-server.database.windows.net;Database=SourceDB" + }, + }, + }, + { + "id": _base_resource_id("linkedservices", 
"BlobStorage"), + "name": "BlobStorage", + "type": "Microsoft.DataFactory/factories/linkedservices", + "properties": { + "type": "AzureBlobStorage", + "typeProperties": { + "connectionString": "DefaultEndpointsProtocol=https" + }, + }, + }, + { + "id": _base_resource_id("linkedservices", "SynapseDestination"), + "name": "SynapseDestination", + "type": "Microsoft.DataFactory/factories/linkedservices", + "properties": { + "type": "AzureSynapseAnalytics", + "typeProperties": { + "connectionString": "Server=synapse.sql.azuresynapse.net;Database=DW" + }, + }, + }, + { + "id": _base_resource_id("linkedservices", "SnowflakeConnection"), + "name": "SnowflakeConnection", + "type": "Microsoft.DataFactory/factories/linkedservices", + "properties": { + "type": "Snowflake", + "typeProperties": {"connectionString": "account=myaccount"}, + }, + }, + { + "id": _base_resource_id("linkedservices", "DataLakeStorage"), + "name": "DataLakeStorage", + "type": "Microsoft.DataFactory/factories/linkedservices", + "properties": { + "type": "AzureBlobFS", + "typeProperties": {"url": "https://datalake.dfs.core.windows.net"}, + }, + }, + ] + + +# ============================================================================= +# DATASETS - Input/output datasets for lineage +# ============================================================================= + + +def create_complex_datasets() -> List[Dict[str, Any]]: + """Create datasets for complex lineage scenarios.""" + return [ + # SQL Server datasets + { + "id": _base_resource_id("datasets", "SqlCustomersTable"), + "name": "SqlCustomersTable", + "type": "Microsoft.DataFactory/factories/datasets", + "properties": { + "linkedServiceName": { + "referenceName": "SqlServerSource", + "type": "LinkedServiceReference", + }, + "type": "AzureSqlTable", + "typeProperties": {"schema": "dbo", "table": "Customers"}, + }, + }, + { + "id": _base_resource_id("datasets", "SqlOrdersTable"), + "name": "SqlOrdersTable", + "type": 
"Microsoft.DataFactory/factories/datasets", + "properties": { + "linkedServiceName": { + "referenceName": "SqlServerSource", + "type": "LinkedServiceReference", + }, + "type": "AzureSqlTable", + "typeProperties": {"schema": "dbo", "table": "Orders"}, + }, + }, + { + "id": _base_resource_id("datasets", "SqlProductsTable"), + "name": "SqlProductsTable", + "type": "Microsoft.DataFactory/factories/datasets", + "properties": { + "linkedServiceName": { + "referenceName": "SqlServerSource", + "type": "LinkedServiceReference", + }, + "type": "AzureSqlTable", + "typeProperties": {"schema": "dbo", "table": "Products"}, + }, + }, + # Blob storage datasets + { + "id": _base_resource_id("datasets", "BlobStagingCustomers"), + "name": "BlobStagingCustomers", + "type": "Microsoft.DataFactory/factories/datasets", + "properties": { + "linkedServiceName": { + "referenceName": "BlobStorage", + "type": "LinkedServiceReference", + }, + "type": "DelimitedText", + "typeProperties": { + "location": { + "type": "AzureBlobStorageLocation", + "container": "staging", + "folderPath": "customers", + } + }, + }, + }, + { + "id": _base_resource_id("datasets", "BlobStagingOrders"), + "name": "BlobStagingOrders", + "type": "Microsoft.DataFactory/factories/datasets", + "properties": { + "linkedServiceName": { + "referenceName": "BlobStorage", + "type": "LinkedServiceReference", + }, + "type": "DelimitedText", + "typeProperties": { + "location": { + "type": "AzureBlobStorageLocation", + "container": "staging", + "folderPath": "orders", + } + }, + }, + }, + # Synapse datasets + { + "id": _base_resource_id("datasets", "SynapseCustomersDim"), + "name": "SynapseCustomersDim", + "type": "Microsoft.DataFactory/factories/datasets", + "properties": { + "linkedServiceName": { + "referenceName": "SynapseDestination", + "type": "LinkedServiceReference", + }, + "type": "AzureSqlDWTable", + "typeProperties": {"schema": "dw", "table": "DimCustomers"}, + }, + }, + { + "id": _base_resource_id("datasets", 
"SynapseOrdersFact"), + "name": "SynapseOrdersFact", + "type": "Microsoft.DataFactory/factories/datasets", + "properties": { + "linkedServiceName": { + "referenceName": "SynapseDestination", + "type": "LinkedServiceReference", + }, + "type": "AzureSqlDWTable", + "typeProperties": {"schema": "dw", "table": "FactOrders"}, + }, + }, + # Data Lake datasets for Data Flow + { + "id": _base_resource_id("datasets", "DataLakeRawData"), + "name": "DataLakeRawData", + "type": "Microsoft.DataFactory/factories/datasets", + "properties": { + "linkedServiceName": { + "referenceName": "DataLakeStorage", + "type": "LinkedServiceReference", + }, + "type": "Parquet", + "typeProperties": { + "location": { + "type": "AzureBlobFSLocation", + "fileSystem": "raw", + "folderPath": "sales", + } + }, + }, + }, + { + "id": _base_resource_id("datasets", "DataLakeCuratedData"), + "name": "DataLakeCuratedData", + "type": "Microsoft.DataFactory/factories/datasets", + "properties": { + "linkedServiceName": { + "referenceName": "DataLakeStorage", + "type": "LinkedServiceReference", + }, + "type": "Parquet", + "typeProperties": { + "location": { + "type": "AzureBlobFSLocation", + "fileSystem": "curated", + "folderPath": "sales_summary", + } + }, + }, + }, + ] + + +# ============================================================================= +# SCENARIO 1: NESTED PIPELINES (Execute Pipeline Activity) +# ============================================================================= + + +def create_nested_pipeline_scenario() -> Dict[str, Any]: + """Create mock data for nested pipeline scenario. 
+ + Structure: + - ParentOrchestrationPipeline + └── ExecutePipeline: ChildDataMovementPipeline + └── Copy: SqlToBlob + └── ExecutePipeline: ChildTransformPipeline + └── DataFlow: TransformData + """ + child_data_movement = { + "id": _base_resource_id("pipelines", "ChildDataMovementPipeline"), + "name": "ChildDataMovementPipeline", + "type": "Microsoft.DataFactory/factories/pipelines", + "properties": { + "description": "Child pipeline for data movement", + "activities": [ + { + "name": "CopyCustomersToStaging", + "type": "Copy", + "inputs": [ + { + "referenceName": "SqlCustomersTable", + "type": "DatasetReference", + } + ], + "outputs": [ + { + "referenceName": "BlobStagingCustomers", + "type": "DatasetReference", + } + ], + "typeProperties": { + "source": {"type": "AzureSqlSource"}, + "sink": {"type": "DelimitedTextSink"}, + }, + } + ], + "parameters": {"sourceTable": {"type": "String"}}, + }, + } + + child_transform = { + "id": _base_resource_id("pipelines", "ChildTransformPipeline"), + "name": "ChildTransformPipeline", + "type": "Microsoft.DataFactory/factories/pipelines", + "properties": { + "description": "Child pipeline for data transformation", + "activities": [ + { + "name": "TransformCustomerData", + "type": "ExecuteDataFlow", + "typeProperties": { + "dataflow": { + "referenceName": "CustomerTransformFlow", + "type": "DataFlowReference", + } + }, + } + ], + }, + } + + parent_pipeline = { + "id": _base_resource_id("pipelines", "ParentOrchestrationPipeline"), + "name": "ParentOrchestrationPipeline", + "type": "Microsoft.DataFactory/factories/pipelines", + "properties": { + "description": "Parent orchestration pipeline that calls child pipelines", + "activities": [ + { + "name": "ExecuteDataMovement", + "type": "ExecutePipeline", + "typeProperties": { + "pipeline": { + "referenceName": "ChildDataMovementPipeline", + "type": "PipelineReference", + }, + "waitOnCompletion": True, + "parameters": {"sourceTable": "Customers"}, + }, + }, + { + "name": 
"ExecuteTransform", + "type": "ExecutePipeline", + "dependsOn": [ + { + "activity": "ExecuteDataMovement", + "dependencyConditions": ["Succeeded"], + } + ], + "typeProperties": { + "pipeline": { + "referenceName": "ChildTransformPipeline", + "type": "PipelineReference", + }, + "waitOnCompletion": True, + }, + }, + ], + }, + } + + return { + "pipelines": [parent_pipeline, child_data_movement, child_transform], + "expected_dataflows": 3, + "expected_datajobs": 4, # 2 ExecutePipeline + 1 Copy + 1 DataFlow + } + + +# ============================================================================= +# SCENARIO 2: FOREACH LOOPS +# ============================================================================= + + +def create_foreach_loop_scenario() -> Dict[str, Any]: + """Create mock data for ForEach loop scenario. + + Structure: + - ForEachTablePipeline + └── ForEach: IterateOverTables + └── Copy: CopyTableToStaging (parametrized) + """ + pipeline = { + "id": _base_resource_id("pipelines", "ForEachTablePipeline"), + "name": "ForEachTablePipeline", + "type": "Microsoft.DataFactory/factories/pipelines", + "properties": { + "description": "Pipeline with ForEach loop to copy multiple tables", + "parameters": { + "tableList": { + "type": "Array", + "defaultValue": ["Customers", "Orders", "Products"], + } + }, + "activities": [ + { + "name": "GetTableList", + "type": "Lookup", + "typeProperties": { + "source": { + "type": "AzureSqlSource", + "sqlReaderQuery": "SELECT name FROM sys.tables", + }, + "dataset": { + "referenceName": "SqlCustomersTable", + "type": "DatasetReference", + }, + "firstRowOnly": False, + }, + }, + { + "name": "IterateOverTables", + "type": "ForEach", + "dependsOn": [ + { + "activity": "GetTableList", + "dependencyConditions": ["Succeeded"], + } + ], + "typeProperties": { + "items": { + "value": "@activity('GetTableList').output.value", + "type": "Expression", + }, + "isSequential": False, + "batchCount": 5, + "activities": [ + { + "name": "CopyTableToStaging", 
+ "type": "Copy", + "inputs": [ + { + "referenceName": "SqlCustomersTable", + "type": "DatasetReference", + } + ], + "outputs": [ + { + "referenceName": "BlobStagingCustomers", + "type": "DatasetReference", + } + ], + "typeProperties": { + "source": {"type": "AzureSqlSource"}, + "sink": {"type": "DelimitedTextSink"}, + }, + } + ], + }, + }, + ], + }, + } + + return { + "pipelines": [pipeline], + "expected_dataflows": 1, + "expected_datajobs": 3, # Lookup + ForEach + Copy (inside ForEach) + } + + +# ============================================================================= +# SCENARIO 3: CONTROL FLOW BRANCHING (If-Condition, Switch) +# ============================================================================= + + +def create_branching_scenario() -> Dict[str, Any]: + """Create mock data for control flow branching scenario. + + Structure: + - BranchingPipeline + └── Lookup: CheckDataExists + └── IfCondition: DataExistsCheck + ├── True: Copy: FullLoad + └── False: Copy: IncrementalLoad + └── Switch: ProcessByRegion + ├── Case "US": Copy: ProcessUSData + ├── Case "EU": Copy: ProcessEUData + └── Default: Copy: ProcessOtherData + """ + pipeline = { + "id": _base_resource_id("pipelines", "BranchingPipeline"), + "name": "BranchingPipeline", + "type": "Microsoft.DataFactory/factories/pipelines", + "properties": { + "description": "Pipeline with If-Condition and Switch branching", + "parameters": {"region": {"type": "String", "defaultValue": "US"}}, + "activities": [ + { + "name": "CheckDataExists", + "type": "Lookup", + "typeProperties": { + "source": { + "type": "AzureSqlSource", + "sqlReaderQuery": "SELECT COUNT(*) as cnt FROM dbo.Customers", + }, + "dataset": { + "referenceName": "SqlCustomersTable", + "type": "DatasetReference", + }, + }, + }, + { + "name": "DataExistsCheck", + "type": "IfCondition", + "dependsOn": [ + { + "activity": "CheckDataExists", + "dependencyConditions": ["Succeeded"], + } + ], + "typeProperties": { + "expression": { + "value": 
"@greater(activity('CheckDataExists').output.firstRow.cnt, 0)", + "type": "Expression", + }, + "ifTrueActivities": [ + { + "name": "FullLoad", + "type": "Copy", + "inputs": [ + { + "referenceName": "SqlCustomersTable", + "type": "DatasetReference", + } + ], + "outputs": [ + { + "referenceName": "BlobStagingCustomers", + "type": "DatasetReference", + } + ], + "typeProperties": { + "source": {"type": "AzureSqlSource"}, + "sink": {"type": "DelimitedTextSink"}, + }, + } + ], + "ifFalseActivities": [ + { + "name": "IncrementalLoad", + "type": "Copy", + "inputs": [ + { + "referenceName": "SqlOrdersTable", + "type": "DatasetReference", + } + ], + "outputs": [ + { + "referenceName": "BlobStagingOrders", + "type": "DatasetReference", + } + ], + "typeProperties": { + "source": {"type": "AzureSqlSource"}, + "sink": {"type": "DelimitedTextSink"}, + }, + } + ], + }, + }, + { + "name": "ProcessByRegion", + "type": "Switch", + "dependsOn": [ + { + "activity": "DataExistsCheck", + "dependencyConditions": ["Succeeded"], + } + ], + "typeProperties": { + "on": { + "value": "@pipeline().parameters.region", + "type": "Expression", + }, + "cases": [ + { + "value": "US", + "activities": [ + { + "name": "ProcessUSData", + "type": "Copy", + "inputs": [ + { + "referenceName": "SqlCustomersTable", + "type": "DatasetReference", + } + ], + "outputs": [ + { + "referenceName": "SynapseCustomersDim", + "type": "DatasetReference", + } + ], + "typeProperties": { + "source": {"type": "AzureSqlSource"}, + "sink": {"type": "SqlDWSink"}, + }, + } + ], + }, + { + "value": "EU", + "activities": [ + { + "name": "ProcessEUData", + "type": "Copy", + "inputs": [ + { + "referenceName": "SqlOrdersTable", + "type": "DatasetReference", + } + ], + "outputs": [ + { + "referenceName": "SynapseOrdersFact", + "type": "DatasetReference", + } + ], + "typeProperties": { + "source": {"type": "AzureSqlSource"}, + "sink": {"type": "SqlDWSink"}, + }, + } + ], + }, + ], + "defaultActivities": [ + { + "name": 
"ProcessOtherData", + "type": "Copy", + "inputs": [ + { + "referenceName": "SqlProductsTable", + "type": "DatasetReference", + } + ], + "outputs": [ + { + "referenceName": "BlobStagingCustomers", + "type": "DatasetReference", + } + ], + "typeProperties": { + "source": {"type": "AzureSqlSource"}, + "sink": {"type": "DelimitedTextSink"}, + }, + } + ], + }, + }, + ], + }, + } + + return { + "pipelines": [pipeline], + "expected_dataflows": 1, + "expected_datajobs": 8, # Lookup + IfCondition + 2 Copy (if branches) + Switch + 3 Copy (switch cases) + } + + +# ============================================================================= +# SCENARIO 4: MAPPING DATA FLOWS +# ============================================================================= + + +def create_dataflow_scenario() -> Dict[str, Any]: + """Create mock data for Mapping Data Flow scenario. + + Structure: + - DataFlowPipeline + └── ExecuteDataFlow: RunSalesTransformation + └── SalesTransformationFlow (sources → transforms → sinks) + """ + data_flow = { + "id": _base_resource_id("dataflows", "SalesTransformationFlow"), + "name": "SalesTransformationFlow", + "type": "Microsoft.DataFactory/factories/dataflows", + "properties": { + "type": "MappingDataFlow", + "description": "Complex data flow with multiple sources and transformations", + "typeProperties": { + "sources": [ + { + "name": "CustomersSource", + "dataset": { + "referenceName": "DataLakeRawData", + "type": "DatasetReference", + }, + }, + { + "name": "OrdersSource", + "dataset": { + "referenceName": "SqlOrdersTable", + "type": "DatasetReference", + }, + }, + ], + "sinks": [ + { + "name": "CuratedOutput", + "dataset": { + "referenceName": "DataLakeCuratedData", + "type": "DatasetReference", + }, + }, + { + "name": "SynapseOutput", + "dataset": { + "referenceName": "SynapseCustomersDim", + "type": "DatasetReference", + }, + }, + ], + "transformations": [ + { + "name": "FilterActiveCustomers", + "description": "Filter only active customers", + }, + { + 
"name": "JoinOrdersToCustomers", + "description": "Join orders with customers", + }, + { + "name": "AggregateByRegion", + "description": "Aggregate sales by region", + }, + { + "name": "DeriveMetrics", + "description": "Calculate derived metrics", + }, + ], + "scriptLines": [ + "source(output(", + " customer_id as integer,", + " name as string,", + " region as string,", + " is_active as boolean", + " ),", + " allowSchemaDrift: true) ~> CustomersSource", + "source(output(", + " order_id as integer,", + " customer_id as integer,", + " amount as decimal(10,2)", + " )) ~> OrdersSource", + "CustomersSource filter(is_active == true()) ~> FilterActiveCustomers", + "FilterActiveCustomers, OrdersSource join(", + " CustomersSource.customer_id == OrdersSource.customer_id", + " ) ~> JoinOrdersToCustomers", + "JoinOrdersToCustomers aggregate(", + " groupBy(region),", + " total_sales = sum(amount)", + " ) ~> AggregateByRegion", + "AggregateByRegion derive(", + " avg_order = total_sales / count(order_id)", + " ) ~> DeriveMetrics", + "DeriveMetrics sink() ~> CuratedOutput", + "DeriveMetrics sink() ~> SynapseOutput", + ], + }, + }, + } + + pipeline = { + "id": _base_resource_id("pipelines", "DataFlowPipeline"), + "name": "DataFlowPipeline", + "type": "Microsoft.DataFactory/factories/pipelines", + "properties": { + "description": "Pipeline that executes a mapping data flow", + "activities": [ + { + "name": "RunSalesTransformation", + "type": "ExecuteDataFlow", + "typeProperties": { + "dataflow": { + "referenceName": "SalesTransformationFlow", + "type": "DataFlowReference", + }, + "compute": {"coreCount": 8, "computeType": "General"}, + }, + } + ], + }, + } + + return { + "pipelines": [pipeline], + "data_flows": [data_flow], + "expected_dataflows": 1, + "expected_datajobs": 1, + "expected_lineage_sources": 2, # DataLakeRawData, SqlOrdersTable + "expected_lineage_sinks": 2, # DataLakeCuratedData, SynapseCustomersDim + } + + +# 
============================================================================= +# SCENARIO 5: MULTI-SOURCE COPY CHAIN (SQL → Blob → Synapse) +# ============================================================================= + + +def create_multisource_chain_scenario() -> Dict[str, Any]: + """Create mock data for multi-source copy chain scenario. + + Structure: + - ETLPipeline + └── Copy: ExtractFromSQL (SQL → Blob) + └── Copy: LoadToSynapse (Blob → Synapse) + └── Copy: ArchiveToDataLake (Blob → DataLake) + + This tests end-to-end lineage: SQL → Blob → Synapse + └─→ DataLake + """ + pipeline = { + "id": _base_resource_id("pipelines", "ETLPipeline"), + "name": "ETLPipeline", + "type": "Microsoft.DataFactory/factories/pipelines", + "properties": { + "description": "Full ETL pipeline: Extract from SQL, stage in Blob, load to Synapse and archive", + "activities": [ + { + "name": "ExtractCustomersFromSQL", + "type": "Copy", + "inputs": [ + { + "referenceName": "SqlCustomersTable", + "type": "DatasetReference", + } + ], + "outputs": [ + { + "referenceName": "BlobStagingCustomers", + "type": "DatasetReference", + } + ], + "typeProperties": { + "source": {"type": "AzureSqlSource"}, + "sink": {"type": "DelimitedTextSink"}, + }, + }, + { + "name": "ExtractOrdersFromSQL", + "type": "Copy", + "inputs": [ + {"referenceName": "SqlOrdersTable", "type": "DatasetReference"} + ], + "outputs": [ + { + "referenceName": "BlobStagingOrders", + "type": "DatasetReference", + } + ], + "typeProperties": { + "source": {"type": "AzureSqlSource"}, + "sink": {"type": "DelimitedTextSink"}, + }, + }, + { + "name": "LoadCustomersToSynapse", + "type": "Copy", + "dependsOn": [ + { + "activity": "ExtractCustomersFromSQL", + "dependencyConditions": ["Succeeded"], + } + ], + "inputs": [ + { + "referenceName": "BlobStagingCustomers", + "type": "DatasetReference", + } + ], + "outputs": [ + { + "referenceName": "SynapseCustomersDim", + "type": "DatasetReference", + } + ], + "typeProperties": { + "source": 
{"type": "DelimitedTextSource"}, + "sink": {"type": "SqlDWSink", "allowPolyBase": True}, + }, + }, + { + "name": "LoadOrdersToSynapse", + "type": "Copy", + "dependsOn": [ + { + "activity": "ExtractOrdersFromSQL", + "dependencyConditions": ["Succeeded"], + } + ], + "inputs": [ + { + "referenceName": "BlobStagingOrders", + "type": "DatasetReference", + } + ], + "outputs": [ + { + "referenceName": "SynapseOrdersFact", + "type": "DatasetReference", + } + ], + "typeProperties": { + "source": {"type": "DelimitedTextSource"}, + "sink": {"type": "SqlDWSink", "allowPolyBase": True}, + }, + }, + { + "name": "ArchiveToDataLake", + "type": "Copy", + "dependsOn": [ + { + "activity": "LoadCustomersToSynapse", + "dependencyConditions": ["Succeeded"], + }, + { + "activity": "LoadOrdersToSynapse", + "dependencyConditions": ["Succeeded"], + }, + ], + "inputs": [ + { + "referenceName": "BlobStagingCustomers", + "type": "DatasetReference", + } + ], + "outputs": [ + {"referenceName": "DataLakeRawData", "type": "DatasetReference"} + ], + "typeProperties": { + "source": {"type": "DelimitedTextSource"}, + "sink": {"type": "ParquetSink"}, + }, + }, + ], + }, + } + + return { + "pipelines": [pipeline], + "expected_dataflows": 1, + "expected_datajobs": 5, + "expected_lineage_edges": [ + # Stage 1: SQL → Blob + ("SqlCustomersTable", "BlobStagingCustomers"), + ("SqlOrdersTable", "BlobStagingOrders"), + # Stage 2: Blob → Synapse + ("BlobStagingCustomers", "SynapseCustomersDim"), + ("BlobStagingOrders", "SynapseOrdersFact"), + # Stage 3: Blob → DataLake + ("BlobStagingCustomers", "DataLakeRawData"), + ], + } + + +# ============================================================================= +# SCENARIO 6: DIVERSE ACTIVITY TYPES +# ============================================================================= + + +def create_diverse_activities_scenario() -> Dict[str, Any]: + """Create mock data for testing various activity types. 
+ + Structure: + - DiverseActivitiesPipeline + └── SetVariable: InitializeCounter + └── WebActivity: FetchConfiguration (REST API call) + └── SqlServerStoredProcedure: ProcessData + └── Wait: DelayForReplication + └── DatabricksNotebook: RunMLTraining + └── AzureFunctionActivity: SendNotification + └── Fail: FailOnError (in error handling) + """ + pipeline = { + "id": _base_resource_id("pipelines", "DiverseActivitiesPipeline"), + "name": "DiverseActivitiesPipeline", + "type": "Microsoft.DataFactory/factories/pipelines", + "properties": { + "description": "Pipeline demonstrating various activity types", + "variables": { + "counter": {"type": "Integer", "defaultValue": 0}, + "configData": {"type": "String"}, + }, + "activities": [ + # SetVariable - Initialize a pipeline variable + { + "name": "InitializeCounter", + "type": "SetVariable", + "typeProperties": { + "variableName": "counter", + "value": {"value": "1", "type": "Expression"}, + }, + }, + # WebActivity - Call an external REST API + { + "name": "FetchConfiguration", + "type": "WebActivity", + "dependsOn": [ + { + "activity": "InitializeCounter", + "dependencyConditions": ["Succeeded"], + } + ], + "typeProperties": { + "url": "https://api.example.com/config", + "method": "GET", + "headers": {"Content-Type": "application/json"}, + }, + }, + # SqlServerStoredProcedure - Execute a stored procedure + { + "name": "ProcessDataWithSP", + "type": "SqlServerStoredProcedure", + "dependsOn": [ + { + "activity": "FetchConfiguration", + "dependencyConditions": ["Succeeded"], + } + ], + "linkedServiceName": { + "referenceName": "SqlServerSource", + "type": "LinkedServiceReference", + }, + "typeProperties": { + "storedProcedureName": "sp_ProcessDailyData", + "storedProcedureParameters": { + "ProcessDate": { + "value": "@utcnow()", + "type": "DateTime", + } + }, + }, + }, + # Wait - Introduce a delay + { + "name": "WaitForReplication", + "type": "Wait", + "dependsOn": [ + { + "activity": "ProcessDataWithSP", + 
"dependencyConditions": ["Succeeded"], + } + ], + "typeProperties": {"waitTimeInSeconds": 30}, + }, + # GetMetadata - Get file/folder metadata + { + "name": "CheckOutputExists", + "type": "GetMetadata", + "dependsOn": [ + { + "activity": "WaitForReplication", + "dependencyConditions": ["Succeeded"], + } + ], + "typeProperties": { + "dataset": { + "referenceName": "BlobStagingCustomers", + "type": "DatasetReference", + }, + "fieldList": ["exists", "itemName", "lastModified"], + }, + }, + # DatabricksNotebook - Run a Databricks notebook + { + "name": "RunMLTrainingNotebook", + "type": "DatabricksNotebook", + "dependsOn": [ + { + "activity": "CheckOutputExists", + "dependencyConditions": ["Succeeded"], + } + ], + "linkedServiceName": { + "referenceName": "DatabricksWorkspace", + "type": "LinkedServiceReference", + }, + "typeProperties": { + "notebookPath": "/Shared/MLTraining/train_model", + "baseParameters": { + "input_path": "/mnt/data/input", + "output_path": "/mnt/data/output", + }, + }, + }, + # Script - Run a SQL script + { + "name": "RunAnalyticsScript", + "type": "Script", + "dependsOn": [ + { + "activity": "RunMLTrainingNotebook", + "dependencyConditions": ["Succeeded"], + } + ], + "linkedServiceName": { + "referenceName": "SynapseDestination", + "type": "LinkedServiceReference", + }, + "typeProperties": { + "scripts": [ + { + "text": "EXEC sp_UpdateAnalytics @date = GETDATE()", + "type": "Query", + } + ] + }, + }, + # AzureFunctionActivity - Call an Azure Function + { + "name": "SendCompletionNotification", + "type": "AzureFunctionActivity", + "dependsOn": [ + { + "activity": "RunAnalyticsScript", + "dependencyConditions": ["Succeeded"], + } + ], + "linkedServiceName": { + "referenceName": "NotificationFunction", + "type": "LinkedServiceReference", + }, + "typeProperties": { + "functionName": "SendNotification", + "method": "POST", + "body": { + "value": '@json(concat(\'{"status": "success", "pipeline": "\', pipeline().Pipeline, \'"}\'))', + "type": 
"Expression", + }, + }, + }, + # Fail - Explicitly fail the pipeline (usually in error handling) + # Note: In real scenarios, this would be in an error handling path + { + "name": "FailOnCriticalError", + "type": "Fail", + "dependsOn": [ + { + "activity": "SendCompletionNotification", + "dependencyConditions": ["Failed"], + } + ], + "typeProperties": { + "message": "Pipeline failed due to notification error", + "errorCode": "500", + }, + }, + ], + }, + } + + # Add Databricks linked service for the test + databricks_linked_service = { + "id": _base_resource_id("linkedservices", "DatabricksWorkspace"), + "name": "DatabricksWorkspace", + "type": "Microsoft.DataFactory/factories/linkedservices", + "properties": { + "type": "AzureDatabricks", + "typeProperties": { + "domain": "https://adb-123456789.azuredatabricks.net", + "workspaceResourceId": "/subscriptions/xxx/resourceGroups/xxx/providers/Microsoft.Databricks/workspaces/my-workspace", + }, + }, + } + + # Add Azure Function linked service + function_linked_service = { + "id": _base_resource_id("linkedservices", "NotificationFunction"), + "name": "NotificationFunction", + "type": "Microsoft.DataFactory/factories/linkedservices", + "properties": { + "type": "AzureFunction", + "typeProperties": { + "functionAppUrl": "https://my-function-app.azurewebsites.net", + }, + }, + } + + return { + "pipelines": [pipeline], + "additional_linked_services": [ + databricks_linked_service, + function_linked_service, + ], + "expected_datajobs": 9, # All activities + "activity_types_covered": [ + "SetVariable", + "WebActivity", + "SqlServerStoredProcedure", + "Wait", + "GetMetadata", + "DatabricksNotebook", + "Script", + "AzureFunctionActivity", + "Fail", + ], + } + + +# ============================================================================= +# FACTORY HELPER +# ============================================================================= + + +def create_complex_factory() -> Dict[str, Any]: + """Create the factory that contains 
all complex scenarios.""" + return { + "id": f"/subscriptions/{SUBSCRIPTION_ID}/resourceGroups/{RESOURCE_GROUP}/providers/Microsoft.DataFactory/factories/{FACTORY_NAME}", + "name": FACTORY_NAME, + "type": "Microsoft.DataFactory/factories", + "location": LOCATION, + "tags": {"environment": "test", "purpose": "complex-integration-tests"}, + "properties": { + "provisioningState": "Succeeded", + "createTime": "2024-01-01T00:00:00Z", + }, + } + + +def get_all_complex_pipelines() -> List[Dict[str, Any]]: + """Get all pipelines from all complex scenarios.""" + pipelines = [] + pipelines.extend(create_nested_pipeline_scenario()["pipelines"]) + pipelines.extend(create_foreach_loop_scenario()["pipelines"]) + pipelines.extend(create_branching_scenario()["pipelines"]) + pipelines.extend(create_dataflow_scenario()["pipelines"]) + pipelines.extend(create_multisource_chain_scenario()["pipelines"]) + return pipelines + + +def get_all_data_flows() -> List[Dict[str, Any]]: + """Get all data flows from scenarios that have them.""" + data_flows = [] + dataflow_scenario = create_dataflow_scenario() + if "data_flows" in dataflow_scenario: + data_flows.extend(dataflow_scenario["data_flows"]) + return data_flows diff --git a/metadata-ingestion/tests/integration/azure_data_factory/test_complex_pipelines.py b/metadata-ingestion/tests/integration/azure_data_factory/test_complex_pipelines.py new file mode 100644 index 00000000000000..06ff70920d608a --- /dev/null +++ b/metadata-ingestion/tests/integration/azure_data_factory/test_complex_pipelines.py @@ -0,0 +1,1208 @@ +"""Integration tests for complex Azure Data Factory pipeline patterns. + +These tests validate that the ADF connector correctly handles advanced pipeline +configurations commonly found in production environments. Each test scenario +represents a real-world pattern that data engineers use in ADF. + +Test Coverage: +============= + +1. 
**Nested Pipelines (Execute Pipeline Activity)** + - Parent pipelines orchestrating child pipelines + - Validates hierarchical DataFlow/DataJob relationships + - Ensures lineage propagates through nested execution + +2. **ForEach Loops** + - Iterative processing over collections (tables, files, etc.) + - Tests that loop activities and their children are properly extracted + - Validates parametrized activities within loops + +3. **Control Flow Branching (If-Condition, Switch)** + - Conditional execution paths based on runtime expressions + - Verifies all branches (true/false, switch cases, default) are captured + - Tests that lineage is recorded for activities in all branches + +4. **Mapping Data Flows** + - Complex transformations (filter, join, aggregate, derive) + - Multiple sources and sinks with transformation chains + - Validates Data Flow script extraction and lineage + +5. **Multi-Source ETL Pipelines** + - Full ETL chains: SQL → Blob → Synapse → DataLake + - Tests end-to-end lineage across multiple hops + - Validates platform mapping (mssql, azure_blob_storage, synapse) + +Why These Tests Matter: +====================== +Production ADF pipelines rarely use simple, linear patterns. These tests ensure +the connector handles real-world complexity without losing lineage information +or failing to capture activities in nested/conditional structures. 
+ +Mock Data Strategy: +================== +Mock data is based on Azure REST API response structures from: +https://github.com/Azure/azure-rest-api-specs/tree/main/specification/datafactory + +The mocks simulate real Azure SDK responses, including: +- Factory, Pipeline, Dataset, LinkedService, DataFlow objects +- Proper nesting of properties and type-specific configurations +- Realistic activity structures with inputs, outputs, and dependencies +""" + +import json +from typing import Any, Dict, Iterator, List +from unittest import mock + +import pytest +from freezegun import freeze_time + +from datahub.ingestion.run.pipeline import Pipeline +from datahub.testing import mce_helpers +from tests.integration.azure_data_factory.complex_mocks import ( + RESOURCE_GROUP, + SUBSCRIPTION_ID, + create_branching_scenario, + create_complex_datasets, + create_complex_factory, + create_complex_linked_services, + create_dataflow_scenario, + create_diverse_activities_scenario, + create_foreach_loop_scenario, + create_multisource_chain_scenario, + create_nested_pipeline_scenario, + get_all_data_flows, +) + +# Freeze time for deterministic test output (affects timestamps in MCPs) +FROZEN_TIME = "2024-01-15 12:00:00" + + +# ============================================================================= +# MOCK HELPERS +# ============================================================================= +# These classes simulate the Azure SDK's response objects. The Azure SDK returns +# objects that have an as_dict() method to convert to dictionaries, which our +# connector then parses into Pydantic models. + + +class MockAzureResource: + """Mock class to simulate Azure SDK resource objects. + + The Azure SDK returns resource objects (Pipeline, Dataset, etc.) that have + an as_dict() method. Our connector calls this method to get a dictionary + representation which is then validated against our Pydantic models. 
+ """ + + def __init__(self, data: Dict[str, Any]): + self._data = data + + def as_dict(self) -> Dict[str, Any]: + return self._data + + +class MockPagedIterator: + """Mock class to simulate Azure SDK paged iterators. + + Azure SDK list operations return paged iterators that yield resource objects. + This mock simulates that behavior for testing without making real API calls. + """ + + def __init__(self, items: List[Dict[str, Any]]): + self._items = [MockAzureResource(item) for item in items] + + def __iter__(self) -> Iterator[MockAzureResource]: + return iter(self._items) + + +class MockQueryResponse: + """Mock class for query responses (e.g., pipeline runs) with continuation token. + + Some Azure APIs return query responses that include a continuation token + for pagination. This mock supports that pattern. + """ + + def __init__( + self, items: List[Dict[str, Any]], continuation_token: str | None = None + ): + self.value = [MockAzureResource(item) for item in items] + self.continuation_token = continuation_token + + +def create_mock_client( + pipelines: List[Dict[str, Any]], + datasets: List[Dict[str, Any]], + linked_services: List[Dict[str, Any]], + data_flows: List[Dict[str, Any]] | None = None, + triggers: List[Dict[str, Any]] | None = None, + pipeline_runs: List[Dict[str, Any]] | None = None, +) -> mock.MagicMock: + """Create a mock DataFactoryManagementClient with the given test data. + + This function creates a mock that simulates the Azure SDK's + DataFactoryManagementClient. Each method returns appropriate mock + iterators/responses that our connector will process. 
+ + Args: + pipelines: List of pipeline definitions (will be converted to DataFlow entities) + datasets: List of dataset definitions (used for lineage resolution) + linked_services: List of linked service definitions (used for platform mapping) + data_flows: List of data flow definitions (for Mapping Data Flow activities) + triggers: List of trigger definitions (optional) + pipeline_runs: List of pipeline run records (for execution history) + + Returns: + A MagicMock configured to behave like DataFactoryManagementClient + """ + mock_client = mock.MagicMock() + + # Mock factories - the top-level container for all ADF resources + factory = create_complex_factory() + mock_client.factories.list.return_value = MockPagedIterator([factory]) + mock_client.factories.list_by_resource_group.return_value = MockPagedIterator( + [factory] + ) + + # Mock pipelines - these become DataFlow entities in DataHub + mock_client.pipelines.list_by_factory.return_value = MockPagedIterator(pipelines) + + # Mock datasets - used to resolve lineage (input/output of activities) + mock_client.datasets.list_by_factory.return_value = MockPagedIterator(datasets) + + # Mock linked services - determine the platform type for datasets + # (e.g., AzureSqlDatabase → mssql, AzureBlobStorage → azure_blob_storage) + mock_client.linked_services.list_by_factory.return_value = MockPagedIterator( + linked_services + ) + + # Mock triggers - schedule definitions (not heavily used in these tests) + mock_client.triggers.list_by_factory.return_value = MockPagedIterator( + triggers or [] + ) + + # Mock data flows - Mapping Data Flow definitions with sources/sinks/transforms + mock_client.data_flows.list_by_factory.return_value = MockPagedIterator( + data_flows or [] + ) + + # Mock pipeline runs - execution history (for DataProcessInstance entities) + mock_client.pipeline_runs.query_by_factory.return_value = MockQueryResponse( + pipeline_runs or [] + ) + + # Mock activity runs - individual activity execution records + 
mock_client.activity_runs.query_by_pipeline_run.return_value = MockQueryResponse([]) + + return mock_client + + +def _run_test_pipeline( + tmp_path, + run_id: str, + pipelines: List[Dict[str, Any]], + datasets: List[Dict[str, Any]] | None = None, + linked_services: List[Dict[str, Any]] | None = None, + data_flows: List[Dict[str, Any]] | None = None, + include_lineage: bool = True, +) -> Pipeline: + """Helper function to run an ingestion pipeline with mocked Azure data. + + This sets up the full DataHub ingestion pipeline with mocked Azure SDK + responses, runs the ingestion, and returns the pipeline for assertions. + + Args: + tmp_path: Pytest fixture for temporary directory + run_id: Unique identifier for this test run + pipelines: ADF pipeline definitions to ingest + datasets: Dataset definitions (defaults to standard test datasets) + linked_services: Linked service definitions (defaults to standard test services) + data_flows: Data flow definitions for Mapping Data Flow activities + include_lineage: Whether to extract lineage from activities + + Returns: + The executed Pipeline object with source report for assertions + """ + if datasets is None: + datasets = create_complex_datasets() + if linked_services is None: + linked_services = create_complex_linked_services() + + mock_client = create_mock_client( + pipelines=pipelines, + datasets=datasets, + linked_services=linked_services, + data_flows=data_flows, + ) + + output_file = tmp_path / f"{run_id}_output.json" + + config = { + "run_id": run_id, + "source": { + "type": "azure-data-factory", + "config": { + "subscription_id": SUBSCRIPTION_ID, + "resource_group": RESOURCE_GROUP, + "credential": {"authentication_method": "default"}, + "env": "DEV", + "include_lineage": include_lineage, + "include_execution_history": False, + }, + }, + "sink": {"type": "file", "config": {"filename": str(output_file)}}, + } + + with mock.patch( + "datahub.ingestion.source.azure_data_factory.adf_client.DataFactoryManagementClient" + ) 
as MockClientClass: + MockClientClass.return_value = mock_client + + with mock.patch( + "datahub.ingestion.source.azure.azure_auth.DefaultAzureCredential" + ): + pipeline = Pipeline.create(config) + pipeline.run() + pipeline.raise_from_status() + + return pipeline + + +# ============================================================================= +# TEST: NESTED PIPELINES (Execute Pipeline Activity) +# ============================================================================= +# +# Scenario: Parent pipeline orchestrates child pipelines +# ------------------------------------------------------- +# ParentOrchestrationPipeline +# └── ExecutePipeline: ChildDataMovementPipeline +# └── Copy: SqlToBlob +# └── ExecutePipeline: ChildTransformPipeline +# └── DataFlow: TransformData +# +# What we're testing: +# - All three pipelines are extracted as DataFlow entities +# - ExecutePipeline activities are captured as DataJob entities +# - Child pipeline activities (Copy, DataFlow) are also captured +# - Browse paths show proper hierarchy +# +# Why this matters: +# - Large organizations modularize pipelines for reusability +# - Lineage must track data movement through nested executions +# - Users need to see the full orchestration hierarchy + + +@freeze_time(FROZEN_TIME) +@pytest.mark.integration +def test_nested_pipeline_creates_all_entities(pytestconfig, tmp_path): + """Test that nested pipelines create correct DataFlow and DataJob entities. + + This test verifies that when a parent pipeline contains ExecutePipeline + activities that call child pipelines, all three pipelines and their + activities are properly extracted as DataHub entities. 
+ + Expected entities: + - 3 DataFlow entities (ParentOrchestrationPipeline, ChildDataMovement, ChildTransform) + - 4 DataJob entities (2 ExecutePipeline + 1 Copy + 1 DataFlow activity) + """ + scenario = create_nested_pipeline_scenario() + + pipeline = _run_test_pipeline( + tmp_path, + run_id="nested-pipeline-test", + pipelines=scenario["pipelines"], + data_flows=get_all_data_flows(), + ) + + # Verify all pipelines were processed (not filtered out) + assert pipeline.source.report.pipelines_scanned == len(scenario["pipelines"]) + + +@freeze_time(FROZEN_TIME) +@pytest.mark.integration +def test_nested_pipeline_golden(pytestconfig, tmp_path): + """Golden file test for nested pipeline scenario. + + Compares the output MCPs against a known-good golden file to detect + any regressions in entity structure, URN format, or aspect content. + + The golden file captures the expected output including: + - Container for the factory + - DataFlow entities for each pipeline + - DataJob entities for each activity + - Browse paths showing hierarchy + """ + test_resources_dir = pytestconfig.rootpath / "tests/integration/azure_data_factory" + scenario = create_nested_pipeline_scenario() + + output_file = tmp_path / "adf_nested_events.json" + golden_file = test_resources_dir / "adf_nested_golden.json" + + mock_client = create_mock_client( + pipelines=scenario["pipelines"], + datasets=create_complex_datasets(), + linked_services=create_complex_linked_services(), + data_flows=get_all_data_flows(), + ) + + config = { + "run_id": "adf-nested-test", + "source": { + "type": "azure-data-factory", + "config": { + "subscription_id": SUBSCRIPTION_ID, + "resource_group": RESOURCE_GROUP, + "credential": {"authentication_method": "default"}, + "env": "DEV", + "include_lineage": True, + }, + }, + "sink": {"type": "file", "config": {"filename": str(output_file)}}, + } + + with mock.patch( + "datahub.ingestion.source.azure_data_factory.adf_client.DataFactoryManagementClient" + ) as MockClientClass: + 
MockClientClass.return_value = mock_client + + with mock.patch( + "datahub.ingestion.source.azure.azure_auth.DefaultAzureCredential" + ): + pipeline = Pipeline.create(config) + pipeline.run() + pipeline.raise_from_status() + + mce_helpers.check_golden_file( + pytestconfig, + output_path=str(output_file), + golden_path=str(golden_file), + ) + + +# ============================================================================= +# TEST: FOREACH LOOPS +# ============================================================================= +# +# Scenario: Iterate over a list of tables to copy +# ------------------------------------------------ +# ForEachTablePipeline +# └── Lookup: GetTableList (query sys.tables) +# └── ForEach: IterateOverTables +# └── Copy: CopyTableToStaging (parametrized) +# +# What we're testing: +# - ForEach activity is captured as a DataJob +# - Activities inside ForEach are also captured +# - Lookup activity's lineage (reading from system tables) +# +# Why this matters: +# - ForEach is used extensively for bulk data operations +# - Users need visibility into what tables/files are processed +# - The Copy activity inside ForEach creates lineage for each iteration + + +@freeze_time(FROZEN_TIME) +@pytest.mark.integration +def test_foreach_loop_pipeline(pytestconfig, tmp_path): + """Golden file test for ForEach loop pipeline. + + Tests a pipeline that uses ForEach to iterate over tables and copy + each one to staging. This is a common pattern for bulk data movement. 
+ + The test verifies: + - ForEach activity is captured as a DataJob with "ForEach Loop" subtype + - Nested Copy activity is captured (though iterations aren't expanded) + - Lookup activity that provides the iteration items is captured + """ + test_resources_dir = pytestconfig.rootpath / "tests/integration/azure_data_factory" + scenario = create_foreach_loop_scenario() + + output_file = tmp_path / "adf_foreach_events.json" + golden_file = test_resources_dir / "adf_foreach_golden.json" + + mock_client = create_mock_client( + pipelines=scenario["pipelines"], + datasets=create_complex_datasets(), + linked_services=create_complex_linked_services(), + ) + + config = { + "run_id": "adf-foreach-test", + "source": { + "type": "azure-data-factory", + "config": { + "subscription_id": SUBSCRIPTION_ID, + "resource_group": RESOURCE_GROUP, + "credential": {"authentication_method": "default"}, + "env": "DEV", + "include_lineage": True, + }, + }, + "sink": {"type": "file", "config": {"filename": str(output_file)}}, + } + + with mock.patch( + "datahub.ingestion.source.azure_data_factory.adf_client.DataFactoryManagementClient" + ) as MockClientClass: + MockClientClass.return_value = mock_client + + with mock.patch( + "datahub.ingestion.source.azure.azure_auth.DefaultAzureCredential" + ): + pipeline = Pipeline.create(config) + pipeline.run() + pipeline.raise_from_status() + + mce_helpers.check_golden_file( + pytestconfig, + output_path=str(output_file), + golden_path=str(golden_file), + ) + + +# ============================================================================= +# TEST: CONTROL FLOW BRANCHING (If-Condition, Switch) +# ============================================================================= +# +# Scenario: Conditional execution based on data existence and region +# ------------------------------------------------------------------ +# BranchingPipeline +# └── Lookup: CheckDataExists +# └── IfCondition: DataExistsCheck +# ├── True: Copy: FullLoad +# └── False: Copy: 
IncrementalLoad +# └── Switch: ProcessByRegion +# ├── Case "US": Copy: ProcessUSData +# ├── Case "EU": Copy: ProcessEUData +# └── Default: Copy: ProcessOtherData +# +# What we're testing: +# - IfCondition activity captures both true and false branches +# - Switch activity captures all cases and default +# - Activities in all branches are extracted as DataJobs +# - Lineage is captured for activities in conditional branches +# +# Why this matters: +# - Real pipelines have complex conditional logic +# - Users need to see ALL possible execution paths +# - Lineage must include data flows in every branch + + +@freeze_time(FROZEN_TIME) +@pytest.mark.integration +def test_branching_pipeline(pytestconfig, tmp_path): + """Golden file test for If-Condition and Switch branching pipeline. + + Tests a pipeline with complex control flow: + 1. IfCondition that branches based on whether data exists + 2. Switch that routes processing based on region parameter + + The test verifies: + - All activities in all branches are captured + - IfCondition has "If Condition" subtype + - Switch has "Switch Activity" subtype + - Lineage captures inputs/outputs in each branch's activities + """ + test_resources_dir = pytestconfig.rootpath / "tests/integration/azure_data_factory" + scenario = create_branching_scenario() + + output_file = tmp_path / "adf_branching_events.json" + golden_file = test_resources_dir / "adf_branching_golden.json" + + mock_client = create_mock_client( + pipelines=scenario["pipelines"], + datasets=create_complex_datasets(), + linked_services=create_complex_linked_services(), + ) + + config = { + "run_id": "adf-branching-test", + "source": { + "type": "azure-data-factory", + "config": { + "subscription_id": SUBSCRIPTION_ID, + "resource_group": RESOURCE_GROUP, + "credential": {"authentication_method": "default"}, + "env": "DEV", + "include_lineage": True, + }, + }, + "sink": {"type": "file", "config": {"filename": str(output_file)}}, + } + + with mock.patch( + 
"datahub.ingestion.source.azure_data_factory.adf_client.DataFactoryManagementClient" + ) as MockClientClass: + MockClientClass.return_value = mock_client + + with mock.patch( + "datahub.ingestion.source.azure.azure_auth.DefaultAzureCredential" + ): + pipeline = Pipeline.create(config) + pipeline.run() + pipeline.raise_from_status() + + mce_helpers.check_golden_file( + pytestconfig, + output_path=str(output_file), + golden_path=str(golden_file), + ) + + +# ============================================================================= +# TEST: MAPPING DATA FLOWS +# ============================================================================= +# +# Scenario: Complex data transformation with multiple sources and sinks +# --------------------------------------------------------------------- +# DataFlowPipeline +# └── ExecuteDataFlow: RunSalesTransformation +# └── SalesTransformationFlow: +# Sources: CustomersSource, OrdersSource +# Transformations: Filter → Join → Aggregate → Derive +# Sinks: CuratedOutput, SynapseOutput +# +# What we're testing: +# - Data Flow definition is loaded and cached +# - ExecuteDataFlow activity extracts sources as inputs +# - ExecuteDataFlow activity extracts sinks as outputs +# - Data Flow script is captured in dataTransformLogic aspect +# +# Why this matters: +# - Mapping Data Flows contain critical transformation logic +# - Lineage from Data Flows shows complex many-to-many relationships +# - Scripts help users understand what transformations are applied + + +@freeze_time(FROZEN_TIME) +@pytest.mark.integration +def test_dataflow_pipeline_with_lineage(pytestconfig, tmp_path): + """Golden file test for Mapping Data Flow pipeline with lineage extraction. 
+ + Tests a pipeline that executes a Mapping Data Flow containing: + - Multiple sources (customers, orders) + - Multiple transformations (filter, join, aggregate, derive) + - Multiple sinks (data lake, synapse) + + The test verifies: + - ExecuteDataFlow activity has "Data Flow Activity" subtype + - Data Flow sources are captured as input datasets + - Data Flow sinks are captured as output datasets + - Data Flow script is captured (for transformation visibility) + """ + test_resources_dir = pytestconfig.rootpath / "tests/integration/azure_data_factory" + scenario = create_dataflow_scenario() + + output_file = tmp_path / "adf_dataflow_events.json" + golden_file = test_resources_dir / "adf_dataflow_golden.json" + + mock_client = create_mock_client( + pipelines=scenario["pipelines"], + datasets=create_complex_datasets(), + linked_services=create_complex_linked_services(), + data_flows=scenario.get("data_flows", []), + ) + + config = { + "run_id": "adf-dataflow-test", + "source": { + "type": "azure-data-factory", + "config": { + "subscription_id": SUBSCRIPTION_ID, + "resource_group": RESOURCE_GROUP, + "credential": {"authentication_method": "default"}, + "env": "DEV", + "include_lineage": True, + }, + }, + "sink": {"type": "file", "config": {"filename": str(output_file)}}, + } + + with mock.patch( + "datahub.ingestion.source.azure_data_factory.adf_client.DataFactoryManagementClient" + ) as MockClientClass: + MockClientClass.return_value = mock_client + + with mock.patch( + "datahub.ingestion.source.azure.azure_auth.DefaultAzureCredential" + ): + pipeline = Pipeline.create(config) + pipeline.run() + pipeline.raise_from_status() + + mce_helpers.check_golden_file( + pytestconfig, + output_path=str(output_file), + golden_path=str(golden_file), + ) + + +# ============================================================================= +# TEST: MULTI-SOURCE ETL PIPELINE +# ============================================================================= +# +# Scenario: Full ETL 
chain with multiple stages and destinations +# -------------------------------------------------------------- +# ETLPipeline +# ├── Copy: ExtractCustomersFromSQL (SQL → Blob) +# ├── Copy: ExtractOrdersFromSQL (SQL → Blob) +# ├── Copy: LoadCustomersToSynapse (Blob → Synapse) +# ├── Copy: LoadOrdersToSynapse (Blob → Synapse) +# └── Copy: ArchiveToDataLake (Blob → DataLake) +# +# Lineage chain: +# SQL (Customers) → Blob (Staging) → Synapse (DW) +# → DataLake (Archive) +# SQL (Orders) → Blob (Staging) → Synapse (DW) +# +# What we're testing: +# - Multi-hop lineage is captured correctly +# - Platform mapping works for different linked services: +# - AzureSqlDatabase → mssql +# - AzureBlobStorage → azure_blob_storage +# - AzureSynapseAnalytics → synapse +# - AzureBlobFS → azure_data_lake +# - Dependencies between activities are respected +# +# Why this matters: +# - Real ETL pipelines have multiple stages +# - Users need to trace data from source to final destination +# - Platform-specific URNs enable cross-system lineage in DataHub + + +@freeze_time(FROZEN_TIME) +@pytest.mark.integration +def test_multisource_etl_pipeline(pytestconfig, tmp_path): + """Golden file test for multi-source ETL pipeline with full lineage chain. + + Tests a realistic ETL pipeline that: + 1. Extracts data from SQL databases to blob storage + 2. Loads from blob to Synapse data warehouse + 3. 
Archives to Data Lake for long-term storage + + The test verifies: + - All Copy activities are captured with correct subtypes + - Platform mapping produces correct URNs: + - mssql for SQL datasets + - azure_blob_storage for Blob datasets + - synapse for Synapse datasets + - azure_data_lake for Data Lake datasets + - Activity dependencies are reflected in job order + """ + test_resources_dir = pytestconfig.rootpath / "tests/integration/azure_data_factory" + scenario = create_multisource_chain_scenario() + + output_file = tmp_path / "adf_multisource_events.json" + golden_file = test_resources_dir / "adf_multisource_golden.json" + + mock_client = create_mock_client( + pipelines=scenario["pipelines"], + datasets=create_complex_datasets(), + linked_services=create_complex_linked_services(), + ) + + config = { + "run_id": "adf-multisource-test", + "source": { + "type": "azure-data-factory", + "config": { + "subscription_id": SUBSCRIPTION_ID, + "resource_group": RESOURCE_GROUP, + "credential": {"authentication_method": "default"}, + "env": "DEV", + "include_lineage": True, + }, + }, + "sink": {"type": "file", "config": {"filename": str(output_file)}}, + } + + with mock.patch( + "datahub.ingestion.source.azure_data_factory.adf_client.DataFactoryManagementClient" + ) as MockClientClass: + MockClientClass.return_value = mock_client + + with mock.patch( + "datahub.ingestion.source.azure.azure_auth.DefaultAzureCredential" + ): + pipeline = Pipeline.create(config) + pipeline.run() + pipeline.raise_from_status() + + mce_helpers.check_golden_file( + pytestconfig, + output_path=str(output_file), + golden_path=str(golden_file), + ) + + +# ============================================================================= +# LINEAGE VERIFICATION TESTS +# ============================================================================= +# +# These tests go beyond golden file comparison to programmatically verify +# that lineage is being captured correctly. 
They check specific assertions +# about the extracted metadata rather than comparing full output. + + +@freeze_time(FROZEN_TIME) +@pytest.mark.integration +def test_multisource_lineage_accuracy(tmp_path): + """Verify lineage edges are correct for multi-source ETL pipeline. + + This test programmatically inspects the generated MCPs to verify that: + 1. dataJobInputOutput aspects are emitted (lineage is captured) + 2. SQL sources appear as input datasets with 'mssql' platform + 3. Synapse destinations appear as output datasets with 'synapse' platform + + This complements the golden file test by focusing on specific lineage + properties that are critical for data governance use cases. + """ + scenario = create_multisource_chain_scenario() + + mock_client = create_mock_client( + pipelines=scenario["pipelines"], + datasets=create_complex_datasets(), + linked_services=create_complex_linked_services(), + ) + + output_file = tmp_path / "lineage_test.json" + + config = { + "run_id": "lineage-test", + "source": { + "type": "azure-data-factory", + "config": { + "subscription_id": SUBSCRIPTION_ID, + "resource_group": RESOURCE_GROUP, + "credential": {"authentication_method": "default"}, + "env": "DEV", + "include_lineage": True, + }, + }, + "sink": {"type": "file", "config": {"filename": str(output_file)}}, + } + + with mock.patch( + "datahub.ingestion.source.azure_data_factory.adf_client.DataFactoryManagementClient" + ) as MockClientClass: + MockClientClass.return_value = mock_client + + with mock.patch( + "datahub.ingestion.source.azure.azure_auth.DefaultAzureCredential" + ): + pipeline = Pipeline.create(config) + pipeline.run() + + # Read output and verify lineage (file sink outputs JSON array) + with open(output_file) as f: + mcps = json.load(f) + + # Find dataJobInputOutput aspects - these contain the lineage edges + lineage_aspects = [ + mcp for mcp in mcps if mcp.get("aspectName") == "dataJobInputOutput" + ] + + # Verify lineage aspects were emitted for Copy activities + 
assert len(lineage_aspects) > 0, "Expected lineage aspects to be emitted" + + # Collect all input and output datasets from lineage aspects + all_inputs = [] + all_outputs = [] + for aspect in lineage_aspects: + inputs = aspect.get("aspect", {}).get("json", {}).get("inputDatasets", []) + outputs = aspect.get("aspect", {}).get("json", {}).get("outputDatasets", []) + all_inputs.extend(inputs) + all_outputs.extend(outputs) + + # Verify SQL sources are captured with correct platform + # SQL inputs should have URNs containing 'mssql' (mapped from AzureSqlDatabase) + sql_inputs = [i for i in all_inputs if "mssql" in i] + assert len(sql_inputs) > 0, "Expected SQL dataset inputs with 'mssql' platform" + + # Verify Synapse destinations are captured with correct platform + # Synapse outputs should have URNs containing 'synapse' + synapse_outputs = [o for o in all_outputs if "synapse" in o] + assert len(synapse_outputs) > 0, ( + "Expected Synapse dataset outputs with 'synapse' platform" + ) + + +@freeze_time(FROZEN_TIME) +@pytest.mark.integration +def test_dataflow_lineage_sources_and_sinks(tmp_path): + """Verify Data Flow sources and sinks are extracted for lineage. + + This test verifies that when a pipeline executes a Mapping Data Flow, + the connector: + 1. Fetches and caches the Data Flow definition + 2. Extracts source datasets from the Data Flow + 3. Extracts sink datasets from the Data Flow + 4. 
Reports that data flows were scanned + + Data Flow lineage is critical because: + - Data Flows can have complex many-to-many relationships + - Sources/sinks are defined in the Data Flow, not the activity + - Without Data Flow inspection, lineage would be incomplete + """ + scenario = create_dataflow_scenario() + + mock_client = create_mock_client( + pipelines=scenario["pipelines"], + datasets=create_complex_datasets(), + linked_services=create_complex_linked_services(), + data_flows=scenario.get("data_flows", []), + ) + + output_file = tmp_path / "dataflow_lineage_test.json" + + config = { + "run_id": "dataflow-lineage-test", + "source": { + "type": "azure-data-factory", + "config": { + "subscription_id": SUBSCRIPTION_ID, + "resource_group": RESOURCE_GROUP, + "credential": {"authentication_method": "default"}, + "env": "DEV", + "include_lineage": True, + }, + }, + "sink": {"type": "file", "config": {"filename": str(output_file)}}, + } + + with mock.patch( + "datahub.ingestion.source.azure_data_factory.adf_client.DataFactoryManagementClient" + ) as MockClientClass: + MockClientClass.return_value = mock_client + + with mock.patch( + "datahub.ingestion.source.azure.azure_auth.DefaultAzureCredential" + ): + pipeline = Pipeline.create(config) + pipeline.run() + + # Verify that data flows were fetched and processed + # This confirms the connector is looking up Data Flow definitions + assert pipeline.source.report.data_flows_scanned > 0, ( + "Expected data flows to be scanned for lineage extraction" + ) + + +# ============================================================================= +# TEST: DIVERSE ACTIVITY TYPES +# ============================================================================= +# +# Scenario: Pipeline with various activity types +# ----------------------------------------------- +# DiverseActivitiesPipeline +# └── SetVariable: InitializeCounter +# └── WebActivity: FetchConfiguration +# └── SqlServerStoredProcedure: ProcessData +# └── Wait: 
DelayForReplication +# └── GetMetadata: CheckOutputExists +# └── DatabricksNotebook: RunMLTraining +# └── Script: RunAnalyticsScript +# └── AzureFunctionActivity: SendNotification +# └── Fail: FailOnError +# +# What we're testing: +# - All activity types are captured as DataJobs with correct subtypes +# - Each activity has the appropriate metadata (description, properties) +# - The connector doesn't fail on uncommon activity types +# +# Why this matters: +# - Real pipelines use many different activity types +# - Users need visibility into all orchestration activities +# - Activity subtypes help with filtering and understanding + + +@freeze_time(FROZEN_TIME) +@pytest.mark.integration +def test_diverse_activities_pipeline(pytestconfig, tmp_path): + """Test that diverse activity types are correctly captured. + + This test verifies that the connector handles various activity types: + - SetVariable, WebActivity, SqlServerStoredProcedure, Wait + - GetMetadata, DatabricksNotebook, Script, AzureFunctionActivity, Fail + + Each activity should be captured as a DataJob with the correct subtype. 
+ """ + test_resources_dir = pytestconfig.rootpath / "tests/integration/azure_data_factory" + scenario = create_diverse_activities_scenario() + + output_file = tmp_path / "adf_diverse_events.json" + golden_file = test_resources_dir / "adf_diverse_golden.json" + + # Combine standard linked services with additional ones from the scenario + all_linked_services = create_complex_linked_services() + scenario.get( + "additional_linked_services", [] + ) + + mock_client = create_mock_client( + pipelines=scenario["pipelines"], + datasets=create_complex_datasets(), + linked_services=all_linked_services, + ) + + config = { + "run_id": "adf-diverse-test", + "source": { + "type": "azure-data-factory", + "config": { + "subscription_id": SUBSCRIPTION_ID, + "resource_group": RESOURCE_GROUP, + "credential": {"authentication_method": "default"}, + "env": "DEV", + "include_lineage": True, + }, + }, + "sink": {"type": "file", "config": {"filename": str(output_file)}}, + } + + with mock.patch( + "datahub.ingestion.source.azure_data_factory.adf_client.DataFactoryManagementClient" + ) as MockClientClass: + MockClientClass.return_value = mock_client + + with mock.patch( + "datahub.ingestion.source.azure.azure_auth.DefaultAzureCredential" + ): + pipeline = Pipeline.create(config) + pipeline.run() + pipeline.raise_from_status() + + mce_helpers.check_golden_file( + pytestconfig, + output_path=str(output_file), + golden_path=str(golden_file), + ) + + +@freeze_time(FROZEN_TIME) +@pytest.mark.integration +def test_diverse_activities_subtypes(tmp_path): + """Verify that diverse activity types have correct subtypes. + + This test programmatically checks that each activity type is mapped + to the expected DataHub subtype. 
+ """ + scenario = create_diverse_activities_scenario() + + all_linked_services = create_complex_linked_services() + scenario.get( + "additional_linked_services", [] + ) + + mock_client = create_mock_client( + pipelines=scenario["pipelines"], + datasets=create_complex_datasets(), + linked_services=all_linked_services, + ) + + output_file = tmp_path / "diverse_subtypes_test.json" + + config = { + "run_id": "diverse-subtypes-test", + "source": { + "type": "azure-data-factory", + "config": { + "subscription_id": SUBSCRIPTION_ID, + "resource_group": RESOURCE_GROUP, + "credential": {"authentication_method": "default"}, + "env": "DEV", + "include_lineage": True, + }, + }, + "sink": {"type": "file", "config": {"filename": str(output_file)}}, + } + + with mock.patch( + "datahub.ingestion.source.azure_data_factory.adf_client.DataFactoryManagementClient" + ) as MockClientClass: + MockClientClass.return_value = mock_client + + with mock.patch( + "datahub.ingestion.source.azure.azure_auth.DefaultAzureCredential" + ): + pipeline = Pipeline.create(config) + pipeline.run() + + # Read output and verify subtypes + with open(output_file) as f: + mcps = json.load(f) + + # Find subTypes aspects for DataJobs + subtype_aspects = [ + mcp + for mcp in mcps + if mcp.get("entityType") == "dataJob" and mcp.get("aspectName") == "subTypes" + ] + + # Collect all subtypes + found_subtypes = set() + for aspect in subtype_aspects: + types = aspect.get("aspect", {}).get("json", {}).get("typeNames", []) + found_subtypes.update(types) + + # Verify we captured diverse subtypes (at least some key ones) + expected_subtypes = { + "Set Variable", + "Web Activity", + "Stored Procedure Activity", + "Wait Activity", + "Get Metadata Activity", + "Databricks Notebook", + } + + found_expected = expected_subtypes.intersection(found_subtypes) + assert len(found_expected) >= 3, ( + f"Expected to find at least 3 activity subtypes from {expected_subtypes}, " + f"but found: {found_subtypes}" + ) + + +# 
============================================================================= +# TEST: PIPELINE-TO-PIPELINE LINEAGE +# ============================================================================= +# +# Scenario: Parent pipeline calling child pipelines +# -------------------------------------------------- +# When a pipeline uses ExecutePipeline activity to call another pipeline, +# we should capture this dependency. This enables: +# - Understanding orchestration hierarchies +# - Impact analysis across pipeline boundaries +# - Tracing data flow through nested execution +# +# What we're testing: +# - ExecutePipeline activities capture child pipeline references +# - Custom properties include "calls_pipeline" and "child_pipeline_urn" +# - The dependency is visible in DataHub + + +@freeze_time(FROZEN_TIME) +@pytest.mark.integration +def test_pipeline_to_pipeline_lineage(tmp_path): + """Verify that ExecutePipeline activities create DataJob-to-DataJob lineage. + + When a parent pipeline calls a child pipeline via ExecutePipeline, + the connector should: + 1. Capture the child pipeline name in custom properties + 2. Record the child pipeline's DataFlow URN + 3. Create DataJob-to-DataJob lineage (inputDatajobs) pointing to first child activity + 4. Enable users to trace the orchestration hierarchy in the UI + + This test checks the nested pipeline scenario for these dependencies. 
+ """ + scenario = create_nested_pipeline_scenario() + + mock_client = create_mock_client( + pipelines=scenario["pipelines"], + datasets=create_complex_datasets(), + linked_services=create_complex_linked_services(), + data_flows=get_all_data_flows(), + ) + + output_file = tmp_path / "pipeline_lineage_test.json" + + config = { + "run_id": "pipeline-lineage-test", + "source": { + "type": "azure-data-factory", + "config": { + "subscription_id": SUBSCRIPTION_ID, + "resource_group": RESOURCE_GROUP, + "credential": {"authentication_method": "default"}, + "env": "DEV", + "include_lineage": True, + }, + }, + "sink": {"type": "file", "config": {"filename": str(output_file)}}, + } + + with mock.patch( + "datahub.ingestion.source.azure_data_factory.adf_client.DataFactoryManagementClient" + ) as MockClientClass: + MockClientClass.return_value = mock_client + + with mock.patch( + "datahub.ingestion.source.azure.azure_auth.DefaultAzureCredential" + ): + pipeline = Pipeline.create(config) + pipeline.run() + + # Read output and verify pipeline-to-pipeline references + with open(output_file) as f: + mcps = json.load(f) + + # Find DataJobInfo aspects with ExecutePipeline activities + datajob_infos = [ + mcp + for mcp in mcps + if mcp.get("entityType") == "dataJob" and mcp.get("aspectName") == "dataJobInfo" + ] + + # Find DataJobInputOutput aspects + datajob_io = [ + mcp + for mcp in mcps + if mcp.get("entityType") == "dataJob" + and mcp.get("aspectName") == "dataJobInputOutput" + ] + + # Look for activities that call child pipelines + child_pipeline_refs = [] + for info in datajob_infos: + custom_props = ( + info.get("aspect", {}).get("json", {}).get("customProperties", {}) + ) + if "calls_pipeline" in custom_props: + child_pipeline_refs.append( + { + "activity": info.get("entityUrn", ""), + "calls": custom_props.get("calls_pipeline"), + "child_urn": custom_props.get("child_pipeline_urn"), + "child_first_activity": custom_props.get("child_first_activity"), + } + ) + + # The nested 
pipeline scenario has 2 ExecutePipeline activities + assert len(child_pipeline_refs) >= 2, ( + f"Expected at least 2 ExecutePipeline activities with child pipeline references, " + f"but found: {len(child_pipeline_refs)}" + ) + + # Verify the child pipeline names are captured + child_names = {ref["calls"] for ref in child_pipeline_refs} + assert "ChildDataMovementPipeline" in child_names, ( + f"Expected ChildDataMovementPipeline in child references: {child_names}" + ) + assert "ChildTransformPipeline" in child_names, ( + f"Expected ChildTransformPipeline in child references: {child_names}" + ) + + # Verify the first activity names are captured + first_activities = {ref["child_first_activity"] for ref in child_pipeline_refs} + assert "CopyCustomersToStaging" in first_activities, ( + f"Expected CopyCustomersToStaging as first activity: {first_activities}" + ) + assert "TransformCustomerData" in first_activities, ( + f"Expected TransformCustomerData as first activity: {first_activities}" + ) + + # Verify DataJobInputOutput aspects have inputDatajobs for pipeline-to-pipeline lineage + input_datajobs_found = [] + for io in datajob_io: + input_jobs = io.get("aspect", {}).get("json", {}).get("inputDatajobs", []) + if input_jobs: + input_datajobs_found.extend(input_jobs) + + # Should have at least 2 inputDatajobs references (one for each ExecutePipeline) + assert len(input_datajobs_found) >= 2, ( + f"Expected at least 2 inputDatajobs for pipeline-to-pipeline lineage, " + f"but found: {len(input_datajobs_found)}" + ) + + # Verify the inputDatajobs point to child pipeline activities + assert any("CopyCustomersToStaging" in urn for urn in input_datajobs_found), ( + f"Expected inputDatajobs to reference CopyCustomersToStaging: {input_datajobs_found}" + ) + assert any("TransformCustomerData" in urn for urn in input_datajobs_found), ( + f"Expected inputDatajobs to reference TransformCustomerData: {input_datajobs_found}" + ) diff --git 
a/metadata-service/configuration/src/main/resources/bootstrap_mcps.yaml b/metadata-service/configuration/src/main/resources/bootstrap_mcps.yaml index 24f638402019b3..7f77c7867e3b98 100644 --- a/metadata-service/configuration/src/main/resources/bootstrap_mcps.yaml +++ b/metadata-service/configuration/src/main/resources/bootstrap_mcps.yaml @@ -13,7 +13,7 @@ bootstrap: mcps_location: "bootstrap_mcps/root-user.yaml" - name: data-platforms - version: v4 + version: v5 blocking: true async: false mcps_location: "bootstrap_mcps/data-platforms.yaml" diff --git a/metadata-service/configuration/src/main/resources/bootstrap_mcps/data-platforms.yaml b/metadata-service/configuration/src/main/resources/bootstrap_mcps/data-platforms.yaml index 9882c6af4537c9..084e664aa094bf 100644 --- a/metadata-service/configuration/src/main/resources/bootstrap_mcps/data-platforms.yaml +++ b/metadata-service/configuration/src/main/resources/bootstrap_mcps/data-platforms.yaml @@ -21,6 +21,16 @@ displayName: Azure Data Lake (Gen 2) type: FILE_SYSTEM logoUrl: "assets/platforms/adlslogo.png" +- entityUrn: urn:li:dataPlatform:azure-data-factory + entityType: dataPlatform + aspectName: dataPlatformInfo + changeType: UPSERT + aspect: + datasetNameDelimiter: "." + name: azure-data-factory + displayName: Azure Data Factory + type: OTHERS + logoUrl: "assets/platforms/azuredatafactorylogo.svg" - entityUrn: urn:li:dataPlatform:airflow entityType: dataPlatform aspectName: dataPlatformInfo From d4c3b6e2d5b21298bb9361a0169f85218221ab76 Mon Sep 17 00:00:00 2001 From: Anush Kumar Date: Mon, 8 Dec 2025 17:40:54 -0800 Subject: [PATCH 03/13] feat(azure-data-factory): enhance mixed dependencies handling in ADF integration - Implemented support for mixed pipeline and dataset dependencies in Azure Data Factory, allowing for both pipeline-to-pipeline and dataset lineage tracking. - Updated documentation to reflect new features and improved clarity on lineage extraction. 
- Added integration tests to validate the handling of mixed dependencies, ensuring accurate lineage representation in the DataHub UI. - Refactored existing tests to accommodate new scenarios and ensure comprehensive coverage of ADF functionalities. --- .../azure_data_factory_pre.md | 7 +- .../source/azure_data_factory/adf_source.py | 26 +- .../adf_mixed_deps_golden.json | 1037 +++++++++++++++++ .../azure_data_factory/adf_nested_golden.json | 404 +++---- .../azure_data_factory/complex_mocks.py | 168 +++ .../test_complex_pipelines.py | 294 ++++- 6 files changed, 1707 insertions(+), 229 deletions(-) create mode 100644 metadata-ingestion/tests/integration/azure_data_factory/adf_mixed_deps_golden.json diff --git a/metadata-ingestion/docs/sources/azure_data_factory/azure_data_factory_pre.md b/metadata-ingestion/docs/sources/azure_data_factory/azure_data_factory_pre.md index 8b8fc995e9be9a..0f77bffcf78d96 100644 --- a/metadata-ingestion/docs/sources/azure_data_factory/azure_data_factory_pre.md +++ b/metadata-ingestion/docs/sources/azure_data_factory/azure_data_factory_pre.md @@ -73,18 +73,21 @@ The connector extracts lineage from: ### Pipeline-to-Pipeline Lineage -When a pipeline calls another pipeline via an `ExecutePipeline` activity, the connector creates a lineage edge from the calling activity to the first activity in the child pipeline. This enables: +When a pipeline calls another pipeline via an `ExecutePipeline` activity, the connector creates a lineage edge showing the calling activity as **upstream** of the child pipeline's first activity. 
This enables: - Tracing orchestration hierarchies across nested pipelines - Impact analysis when modifying child pipelines - Understanding dependencies between modular pipelines +**Lineage Direction:** `ExecutePipeline` → `ChildFirstActivity` + The ExecutePipeline activity's DataJob entity will include: - Custom property `calls_pipeline`: Name of the child pipeline - Custom property `child_pipeline_urn`: URN of the child DataFlow - Custom property `child_first_activity`: Name of the first activity in the child pipeline -- Lineage edge to the first DataJob in the child pipeline + +The child pipeline's first activity will have the ExecutePipeline as its input/upstream dependency. ### Supported Linked Service Mappings diff --git a/metadata-ingestion/src/datahub/ingestion/source/azure_data_factory/adf_source.py b/metadata-ingestion/src/datahub/ingestion/source/azure_data_factory/adf_source.py index 67f6f77d9c7c9b..bdb96077af00af 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/azure_data_factory/adf_source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/azure_data_factory/adf_source.py @@ -913,20 +913,20 @@ def _emit_pipeline_lineage( self.report.report_lineage_extracted() - # Emit DataJobInputOutput with the child DataJob as an input dependency - # This creates a visible lineage edge in the DataHub UI - input_datajobs: list[str] = [] + # Emit DataJobInputOutput on the CHILD's first activity, setting ExecutePipeline as upstream + # This creates lineage: ExecutePipeline -> ChildFirstActivity + # (The parent activity triggers the child, so parent is upstream of child) if child_datajob_urn: - input_datajobs.append(str(child_datajob_urn)) - - yield MetadataChangeProposalWrapper( - entityUrn=str(datajob.urn), - aspect=DataJobInputOutputClass( - inputDatasets=[], - outputDatasets=[], - inputDatajobs=input_datajobs, - ), - ).as_workunit() + yield MetadataChangeProposalWrapper( + entityUrn=str(child_datajob_urn), # Child's first activity + 
aspect=DataJobInputOutputClass( + inputDatasets=[], + outputDatasets=[], + inputDatajobs=[ + str(datajob.urn) + ], # ExecutePipeline as input/upstream + ), + ).as_workunit() def _resolve_dataset_urn( self, dataset_name: str, factory_key: str diff --git a/metadata-ingestion/tests/integration/azure_data_factory/adf_mixed_deps_golden.json b/metadata-ingestion/tests/integration/azure_data_factory/adf_mixed_deps_golden.json new file mode 100644 index 00000000000000..f45d4aafc4121b --- /dev/null +++ b/metadata-ingestion/tests/integration/azure_data_factory/adf_mixed_deps_golden.json @@ -0,0 +1,1037 @@ +[ +{ + "entityType": "container", + "entityUrn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "changeType": "UPSERT", + "aspectName": "containerProperties", + "aspect": { + "json": { + "customProperties": { + "platform": "azure-data-factory", + "env": "DEV", + "resource_group": "complex-test-rg", + "factory_name": "complex-data-factory", + "azure_resource_id": "/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "location": "eastus", + "tag:environment": "test", + "tag:purpose": "complex-integration-tests", + "provisioning_state": "Succeeded" + }, + "externalUrl": "https://adf.azure.com/en/home?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "name": "complex-data-factory", + "description": "Azure Data Factory: complex-data-factory", + "env": "DEV" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-mixed-deps-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { 
+ "lastObserved": 1705320000000, + "runId": "adf-mixed-deps-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Data Factory" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-mixed-deps-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-mixed-deps-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.MixedOrchestrationPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-mixed-deps-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.MixedOrchestrationPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "dataFlowInfo", + "aspect": { + "json": { + "customProperties": { + "azure_resource_id": "/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory/pipelines/MixedOrchestrationPipeline", + "factory_name": "complex-data-factory" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/MixedOrchestrationPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "name": 
"MixedOrchestrationPipeline", + "description": "Pipeline demonstrating both pipeline and dataset dependencies", + "env": "DEV" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-mixed-deps-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.MixedOrchestrationPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Pipeline" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-mixed-deps-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.MixedOrchestrationPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-mixed-deps-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.MixedOrchestrationPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "urn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-mixed-deps-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.MixedOrchestrationPipeline,DEV),ExecuteExtract)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": 
"adf-mixed-deps-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.MixedOrchestrationPipeline,DEV),ExecuteExtract)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "activity_type": "ExecutePipeline", + "calls_pipeline": "ExtractDataPipeline", + "child_pipeline_urn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ExtractDataPipeline,DEV)", + "child_first_activity": "ExtractFromSource" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/MixedOrchestrationPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "name": "ExecuteExtract", + "type": { + "string": "COMMAND" + }, + "flowUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.MixedOrchestrationPipeline,DEV)", + "env": "DEV" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-mixed-deps-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.MixedOrchestrationPipeline,DEV),ExecuteExtract)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Execute Pipeline" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-mixed-deps-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.MixedOrchestrationPipeline,DEV),ExecuteExtract)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "urn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" + }, + { + "id": 
"complex-data-factory.MixedOrchestrationPipeline", + "urn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.MixedOrchestrationPipeline,DEV)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-mixed-deps-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ExtractDataPipeline,DEV),ExtractFromSource)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [], + "outputDatasets": [], + "inputDatajobs": [ + "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.MixedOrchestrationPipeline,DEV),ExecuteExtract)" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-mixed-deps-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.MixedOrchestrationPipeline,DEV),TransformInMain)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-mixed-deps-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.MixedOrchestrationPipeline,DEV),TransformInMain)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "activity_type": "Copy" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/MixedOrchestrationPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "name": "TransformInMain", + "type": { + "string": "COMMAND" + }, + "flowUrn": 
"urn:li:dataFlow:(azure-data-factory,complex-data-factory.MixedOrchestrationPipeline,DEV)", + "env": "DEV" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-mixed-deps-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.MixedOrchestrationPipeline,DEV),TransformInMain)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Copy Activity" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-mixed-deps-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.MixedOrchestrationPipeline,DEV),TransformInMain)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:azure_blob_storage,staging/customers,DEV)" + ], + "outputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:synapse,DimCustomers,DEV)" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-mixed-deps-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.MixedOrchestrationPipeline,DEV),TransformInMain)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "urn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" + }, + { + "id": "complex-data-factory.MixedOrchestrationPipeline", + "urn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.MixedOrchestrationPipeline,DEV)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-mixed-deps-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + 
"entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.MixedOrchestrationPipeline,DEV),ExecuteLoad)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-mixed-deps-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.MixedOrchestrationPipeline,DEV),ExecuteLoad)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "activity_type": "ExecutePipeline", + "calls_pipeline": "LoadDataPipeline", + "child_pipeline_urn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.LoadDataPipeline,DEV)", + "child_first_activity": "LoadToDestination" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/MixedOrchestrationPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "name": "ExecuteLoad", + "type": { + "string": "COMMAND" + }, + "flowUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.MixedOrchestrationPipeline,DEV)", + "env": "DEV" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-mixed-deps-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.MixedOrchestrationPipeline,DEV),ExecuteLoad)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Execute Pipeline" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-mixed-deps-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + 
"entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.MixedOrchestrationPipeline,DEV),ExecuteLoad)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "urn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" + }, + { + "id": "complex-data-factory.MixedOrchestrationPipeline", + "urn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.MixedOrchestrationPipeline,DEV)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-mixed-deps-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.LoadDataPipeline,DEV),LoadToDestination)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [], + "outputDatasets": [], + "inputDatajobs": [ + "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.MixedOrchestrationPipeline,DEV),ExecuteLoad)" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-mixed-deps-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ExtractDataPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-mixed-deps-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ExtractDataPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "dataFlowInfo", + "aspect": { + "json": { + "customProperties": { + "azure_resource_id": 
"/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory/pipelines/ExtractDataPipeline", + "factory_name": "complex-data-factory" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/ExtractDataPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "name": "ExtractDataPipeline", + "description": "Child pipeline for extracting data from source", + "env": "DEV" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-mixed-deps-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ExtractDataPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Pipeline" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-mixed-deps-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ExtractDataPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-mixed-deps-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ExtractDataPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "urn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-mixed-deps-test", + 
"lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ExtractDataPipeline,DEV),ExtractFromSource)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-mixed-deps-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ExtractDataPipeline,DEV),ExtractFromSource)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "activity_type": "Copy" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/ExtractDataPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "name": "ExtractFromSource", + "type": { + "string": "COMMAND" + }, + "flowUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ExtractDataPipeline,DEV)", + "env": "DEV" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-mixed-deps-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ExtractDataPipeline,DEV),ExtractFromSource)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Copy Activity" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-mixed-deps-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ExtractDataPipeline,DEV),ExtractFromSource)", + "changeType": "UPSERT", + "aspectName": 
"dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:mssql,Customers,DEV)" + ], + "outputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:azure_blob_storage,staging/customers,DEV)" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-mixed-deps-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ExtractDataPipeline,DEV),ExtractFromSource)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "urn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" + }, + { + "id": "complex-data-factory.ExtractDataPipeline", + "urn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ExtractDataPipeline,DEV)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-mixed-deps-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.LoadDataPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-mixed-deps-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.LoadDataPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "dataFlowInfo", + "aspect": { + "json": { + "customProperties": { + "azure_resource_id": "/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory/pipelines/LoadDataPipeline", + "factory_name": "complex-data-factory" + }, + "externalUrl": 
"https://adf.azure.com/en/authoring/pipeline/LoadDataPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "name": "LoadDataPipeline", + "description": "Child pipeline for loading data to destination", + "env": "DEV" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-mixed-deps-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.LoadDataPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Pipeline" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-mixed-deps-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.LoadDataPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-mixed-deps-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.LoadDataPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "urn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-mixed-deps-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.LoadDataPipeline,DEV),LoadToDestination)", + "changeType": "UPSERT", + "aspectName": 
"dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-mixed-deps-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.LoadDataPipeline,DEV),LoadToDestination)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "activity_type": "Copy" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/LoadDataPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "name": "LoadToDestination", + "type": { + "string": "COMMAND" + }, + "flowUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.LoadDataPipeline,DEV)", + "env": "DEV" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-mixed-deps-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.LoadDataPipeline,DEV),LoadToDestination)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Copy Activity" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-mixed-deps-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.LoadDataPipeline,DEV),LoadToDestination)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:synapse,DimCustomers,DEV)" + ], + "outputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:azure_data_lake,sales_summary,DEV)" + ] + } + }, + 
"systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-mixed-deps-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.LoadDataPipeline,DEV),LoadToDestination)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "urn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" + }, + { + "id": "complex-data-factory.LoadDataPipeline", + "urn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.LoadDataPipeline,DEV)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-mixed-deps-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-mixed-deps-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ExtractDataPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-mixed-deps-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.LoadDataPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-mixed-deps-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": 
"urn:li:dataFlow:(azure-data-factory,complex-data-factory.MixedOrchestrationPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-mixed-deps-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ExtractDataPipeline,DEV),ExtractFromSource)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-mixed-deps-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.LoadDataPipeline,DEV),LoadToDestination)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-mixed-deps-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.MixedOrchestrationPipeline,DEV),ExecuteExtract)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-mixed-deps-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.MixedOrchestrationPipeline,DEV),ExecuteLoad)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-mixed-deps-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": 
"urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.MixedOrchestrationPipeline,DEV),TransformInMain)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-mixed-deps-test", + "lastRunId": "no-run-id-provided" + } +} +] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/azure_data_factory/adf_nested_golden.json b/metadata-ingestion/tests/integration/azure_data_factory/adf_nested_golden.json index 2a7e65206305d5..a1ba40c1aa7a5e 100644 --- a/metadata-ingestion/tests/integration/azure_data_factory/adf_nested_golden.json +++ b/metadata-ingestion/tests/integration/azure_data_factory/adf_nested_golden.json @@ -46,14 +46,14 @@ } }, { - "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ChildDataMovementPipeline,DEV)", + "entityType": "container", + "entityUrn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { "json": { "typeNames": [ - "Pipeline" + "Data Factory" ] } }, @@ -79,79 +79,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ChildDataMovementPipeline,DEV)", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" - } - }, - "systemMetadata": { - "lastObserved": 1705320000000, - "runId": "adf-nested-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "container", - "entityUrn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", - "changeType": "UPSERT", - "aspectName": "subTypes", - "aspect": { - "json": { - "typeNames": [ - "Data Factory" - ] - } - }, - "systemMetadata": { - "lastObserved": 1705320000000, - "runId": "adf-nested-test", - "lastRunId": "no-run-id-provided" - } -}, 
-{ - "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ChildDataMovementPipeline,DEV)", - "changeType": "UPSERT", - "aspectName": "dataPlatformInstance", - "aspect": { - "json": { - "platform": "urn:li:dataPlatform:azure-data-factory" - } - }, - "systemMetadata": { - "lastObserved": 1705320000000, - "runId": "adf-nested-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ChildDataMovementPipeline,DEV)", - "changeType": "UPSERT", - "aspectName": "dataFlowInfo", - "aspect": { - "json": { - "customProperties": { - "azure_resource_id": "/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory/pipelines/ChildDataMovementPipeline", - "factory_name": "complex-data-factory" - }, - "externalUrl": "https://adf.azure.com/en/authoring/pipeline/ChildDataMovementPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", - "name": "ChildDataMovementPipeline", - "description": "Child pipeline for data movement", - "env": "DEV" - } - }, - "systemMetadata": { - "lastObserved": 1705320000000, - "runId": "adf-nested-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "dataFlow", "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ParentOrchestrationPipeline,DEV)", @@ -193,12 +120,14 @@ }, { "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ChildDataMovementPipeline,DEV)", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ParentOrchestrationPipeline,DEV)", "changeType": "UPSERT", - "aspectName": "status", + "aspectName": "subTypes", "aspect": { "json": { - "removed": false + "typeNames": [ + "Pipeline" + ] } }, "systemMetadata": { @@ -209,17 
+138,12 @@ }, { "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ChildDataMovementPipeline,DEV)", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ParentOrchestrationPipeline,DEV)", "changeType": "UPSERT", - "aspectName": "browsePathsV2", + "aspectName": "container", "aspect": { "json": { - "path": [ - { - "id": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", - "urn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" - } - ] + "container": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" } }, "systemMetadata": { @@ -265,24 +189,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ParentOrchestrationPipeline,DEV)", - "changeType": "UPSERT", - "aspectName": "subTypes", - "aspect": { - "json": { - "typeNames": [ - "Pipeline" - ] - } - }, - "systemMetadata": { - "lastObserved": 1705320000000, - "runId": "adf-nested-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ParentOrchestrationPipeline,DEV),ExecuteDataMovement)", @@ -315,13 +221,11 @@ "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ParentOrchestrationPipeline,DEV),ExecuteDataMovement)", "changeType": "UPSERT", - "aspectName": "dataJobInputOutput", + "aspectName": "subTypes", "aspect": { "json": { - "inputDatasets": [], - "outputDatasets": [], - "inputDatajobs": [ - "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ChildDataMovementPipeline,DEV),CopyCustomersToStaging)" + "typeNames": [ + "Execute Pipeline" ] } }, @@ -358,12 +262,16 @@ }, { "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ParentOrchestrationPipeline,DEV),ExecuteTransform)", + "entityUrn": 
"urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ChildDataMovementPipeline,DEV),CopyCustomersToStaging)", "changeType": "UPSERT", - "aspectName": "dataPlatformInstance", + "aspectName": "dataJobInputOutput", "aspect": { "json": { - "platform": "urn:li:dataPlatform:azure-data-factory" + "inputDatasets": [], + "outputDatasets": [], + "inputDatajobs": [ + "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ParentOrchestrationPipeline,DEV),ExecuteDataMovement)" + ] } }, "systemMetadata": { @@ -374,14 +282,12 @@ }, { "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ParentOrchestrationPipeline,DEV),ExecuteDataMovement)", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ParentOrchestrationPipeline,DEV),ExecuteTransform)", "changeType": "UPSERT", - "aspectName": "subTypes", + "aspectName": "dataPlatformInstance", "aspect": { "json": { - "typeNames": [ - "Execute Pipeline" - ] + "platform": "urn:li:dataPlatform:azure-data-factory" } }, "systemMetadata": { @@ -422,13 +328,11 @@ "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ParentOrchestrationPipeline,DEV),ExecuteTransform)", "changeType": "UPSERT", - "aspectName": "dataJobInputOutput", + "aspectName": "subTypes", "aspect": { "json": { - "inputDatasets": [], - "outputDatasets": [], - "inputDatajobs": [ - "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ChildTransformPipeline,DEV),TransformCustomerData)" + "typeNames": [ + "Execute Pipeline" ] } }, @@ -465,13 +369,15 @@ }, { "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ParentOrchestrationPipeline,DEV),ExecuteTransform)", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ChildTransformPipeline,DEV),TransformCustomerData)", 
"changeType": "UPSERT", - "aspectName": "subTypes", + "aspectName": "dataJobInputOutput", "aspect": { "json": { - "typeNames": [ - "Execute Pipeline" + "inputDatasets": [], + "outputDatasets": [], + "inputDatajobs": [ + "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ParentOrchestrationPipeline,DEV),ExecuteTransform)" ] } }, @@ -483,12 +389,12 @@ }, { "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ParentOrchestrationPipeline,DEV)", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ChildDataMovementPipeline,DEV)", "changeType": "UPSERT", - "aspectName": "container", + "aspectName": "dataPlatformInstance", "aspect": { "json": { - "container": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" + "platform": "urn:li:dataPlatform:azure-data-factory" } }, "systemMetadata": { @@ -499,12 +405,19 @@ }, { "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ParentOrchestrationPipeline,DEV)", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ChildDataMovementPipeline,DEV)", "changeType": "UPSERT", - "aspectName": "status", + "aspectName": "dataFlowInfo", "aspect": { "json": { - "removed": false + "customProperties": { + "azure_resource_id": "/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory/pipelines/ChildDataMovementPipeline", + "factory_name": "complex-data-factory" + }, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/ChildDataMovementPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "name": "ChildDataMovementPipeline", + "description": "Child pipeline for data movement", + "env": "DEV" } }, "systemMetadata": { @@ -514,13 +427,15 @@ } }, { - "entityType": "dataJob", - "entityUrn": 
"urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ParentOrchestrationPipeline,DEV),ExecuteDataMovement)", + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ChildDataMovementPipeline,DEV)", "changeType": "UPSERT", - "aspectName": "status", + "aspectName": "subTypes", "aspect": { "json": { - "removed": false + "typeNames": [ + "Pipeline" + ] } }, "systemMetadata": { @@ -530,13 +445,13 @@ } }, { - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ParentOrchestrationPipeline,DEV),ExecuteTransform)", + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ChildDataMovementPipeline,DEV)", "changeType": "UPSERT", - "aspectName": "status", + "aspectName": "container", "aspect": { "json": { - "removed": false + "container": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" } }, "systemMetadata": { @@ -546,13 +461,18 @@ } }, { - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ChildDataMovementPipeline,DEV),CopyCustomersToStaging)", + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ChildDataMovementPipeline,DEV)", "changeType": "UPSERT", - "aspectName": "dataPlatformInstance", + "aspectName": "browsePathsV2", "aspect": { "json": { - "platform": "urn:li:dataPlatform:azure-data-factory" + "path": [ + { + "id": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "urn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" + } + ] } }, "systemMetadata": { @@ -562,15 +482,13 @@ } }, { - "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ChildTransformPipeline,DEV)", + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ChildDataMovementPipeline,DEV),CopyCustomersToStaging)", 
"changeType": "UPSERT", - "aspectName": "subTypes", + "aspectName": "dataPlatformInstance", "aspect": { "json": { - "typeNames": [ - "Pipeline" - ] + "platform": "urn:li:dataPlatform:azure-data-factory" } }, "systemMetadata": { @@ -604,6 +522,24 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ChildDataMovementPipeline,DEV),CopyCustomersToStaging)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Copy Activity" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-nested-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ChildDataMovementPipeline,DEV),CopyCustomersToStaging)", @@ -651,14 +587,53 @@ } }, { - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ChildDataMovementPipeline,DEV),CopyCustomersToStaging)", + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ChildTransformPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:azure-data-factory" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-nested-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ChildTransformPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "dataFlowInfo", + "aspect": { + "json": { + "customProperties": { + "azure_resource_id": "/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory/pipelines/ChildTransformPipeline", + "factory_name": "complex-data-factory" + 
}, + "externalUrl": "https://adf.azure.com/en/authoring/pipeline/ChildTransformPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", + "name": "ChildTransformPipeline", + "description": "Child pipeline for data transformation", + "env": "DEV" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-nested-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ChildTransformPipeline,DEV)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { "json": { "typeNames": [ - "Copy Activity" + "Pipeline" ] } }, @@ -688,6 +663,27 @@ "entityType": "dataFlow", "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ChildTransformPipeline,DEV)", "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", + "urn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-nested-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ChildTransformPipeline,DEV),TransformCustomerData)", + "changeType": "UPSERT", "aspectName": "dataPlatformInstance", "aspect": { "json": { @@ -701,19 +697,21 @@ } }, { - "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ChildTransformPipeline,DEV)", + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ChildTransformPipeline,DEV),TransformCustomerData)", "changeType": "UPSERT", - "aspectName": "dataFlowInfo", + "aspectName": "dataJobInfo", "aspect": { "json": { "customProperties": { - 
"azure_resource_id": "/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory/pipelines/ChildTransformPipeline", - "factory_name": "complex-data-factory" + "activity_type": "ExecuteDataFlow" }, "externalUrl": "https://adf.azure.com/en/authoring/pipeline/ChildTransformPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", - "name": "ChildTransformPipeline", - "description": "Child pipeline for data transformation", + "name": "TransformCustomerData", + "type": { + "string": "COMMAND" + }, + "flowUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ChildTransformPipeline,DEV)", "env": "DEV" } }, @@ -724,13 +722,15 @@ } }, { - "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ChildTransformPipeline,DEV)", + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ChildTransformPipeline,DEV),TransformCustomerData)", "changeType": "UPSERT", - "aspectName": "status", + "aspectName": "subTypes", "aspect": { "json": { - "removed": false + "typeNames": [ + "Data Flow Activity" + ] } }, "systemMetadata": { @@ -740,8 +740,8 @@ } }, { - "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ChildTransformPipeline,DEV)", + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ChildTransformPipeline,DEV),TransformCustomerData)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { @@ -750,6 +750,10 @@ { "id": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", "urn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" + }, + { + "id": "complex-data-factory.ChildTransformPipeline", + "urn": 
"urn:li:dataFlow:(azure-data-factory,complex-data-factory.ChildTransformPipeline,DEV)" } ] } @@ -777,22 +781,13 @@ } }, { - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ChildTransformPipeline,DEV),TransformCustomerData)", + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ChildDataMovementPipeline,DEV)", "changeType": "UPSERT", - "aspectName": "dataJobInfo", + "aspectName": "status", "aspect": { "json": { - "customProperties": { - "activity_type": "ExecuteDataFlow" - }, - "externalUrl": "https://adf.azure.com/en/authoring/pipeline/ChildTransformPipeline?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/complex-test-rg/providers/Microsoft.DataFactory/factories/complex-data-factory", - "name": "TransformCustomerData", - "type": { - "string": "COMMAND" - }, - "flowUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ChildTransformPipeline,DEV)", - "env": "DEV" + "removed": false } }, "systemMetadata": { @@ -802,15 +797,29 @@ } }, { - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ChildTransformPipeline,DEV),TransformCustomerData)", + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ChildTransformPipeline,DEV)", "changeType": "UPSERT", - "aspectName": "subTypes", + "aspectName": "status", "aspect": { "json": { - "typeNames": [ - "Data Flow Activity" - ] + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-nested-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ParentOrchestrationPipeline,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false } }, "systemMetadata": { @@ -821,12 +830,12 @@ }, { 
"entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ChildTransformPipeline,DEV),TransformCustomerData)", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ChildDataMovementPipeline,DEV),CopyCustomersToStaging)", "changeType": "UPSERT", - "aspectName": "dataPlatformInstance", + "aspectName": "status", "aspect": { "json": { - "platform": "urn:li:dataPlatform:azure-data-factory" + "removed": false } }, "systemMetadata": { @@ -853,21 +862,12 @@ }, { "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ChildTransformPipeline,DEV),TransformCustomerData)", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ParentOrchestrationPipeline,DEV),ExecuteDataMovement)", "changeType": "UPSERT", - "aspectName": "browsePathsV2", + "aspectName": "status", "aspect": { "json": { - "path": [ - { - "id": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe", - "urn": "urn:li:container:7f68411fe6779dae7f37506c1e9bacfe" - }, - { - "id": "complex-data-factory.ChildTransformPipeline", - "urn": "urn:li:dataFlow:(azure-data-factory,complex-data-factory.ChildTransformPipeline,DEV)" - } - ] + "removed": false } }, "systemMetadata": { @@ -878,7 +878,7 @@ }, { "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ChildDataMovementPipeline,DEV),CopyCustomersToStaging)", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,complex-data-factory.ParentOrchestrationPipeline,DEV),ExecuteTransform)", "changeType": "UPSERT", "aspectName": "status", "aspect": { diff --git a/metadata-ingestion/tests/integration/azure_data_factory/complex_mocks.py b/metadata-ingestion/tests/integration/azure_data_factory/complex_mocks.py index 42eb5691a98e3c..3e72c96681d9ac 100644 --- a/metadata-ingestion/tests/integration/azure_data_factory/complex_mocks.py 
+++ b/metadata-ingestion/tests/integration/azure_data_factory/complex_mocks.py @@ -1191,6 +1191,174 @@ def create_diverse_activities_scenario() -> Dict[str, Any]: } +# ============================================================================= +# SCENARIO 7: MIXED DEPENDENCIES (Pipeline + Dataset Lineage) +# ============================================================================= + + +def create_mixed_dependencies_scenario() -> Dict[str, Any]: + """Create mock data for mixed pipeline and dataset dependencies. + + This scenario tests both types of lineage in a single orchestration: + 1. Pipeline-to-pipeline lineage (ExecutePipeline activities) + 2. Dataset lineage (Copy activities with inputs/outputs) + + Structure: + - MixedOrchestrationPipeline + └── ExecutePipeline: ExtractDataPipeline (child) + └── Copy: ExtractFromSource (reads SqlCustomersTable, writes BlobStagingCustomers) + └── Copy: TransformInMain (reads BlobStagingCustomers, writes SynapseCustomersDim) + └── ExecutePipeline: LoadDataPipeline (child) + └── Copy: LoadToDestination (reads SynapseCustomersDim, writes DataLakeCuratedData) + + Expected lineage: + - ExecuteExtract -> ExtractFromSource (pipeline lineage) + - TransformInMain -> BlobStagingCustomers (dataset input) + - TransformInMain -> SynapseCustomersDim (dataset output) + - ExecuteLoad -> LoadToDestination (pipeline lineage) + """ + # Child pipeline for extraction + extract_pipeline = { + "id": _base_resource_id("pipelines", "ExtractDataPipeline"), + "name": "ExtractDataPipeline", + "type": "Microsoft.DataFactory/factories/pipelines", + "properties": { + "description": "Child pipeline for extracting data from source", + "activities": [ + { + "name": "ExtractFromSource", + "type": "Copy", + "inputs": [ + { + "referenceName": "SqlCustomersTable", + "type": "DatasetReference", + } + ], + "outputs": [ + { + "referenceName": "BlobStagingCustomers", + "type": "DatasetReference", + } + ], + "typeProperties": { + "source": {"type": 
"AzureSqlSource"}, + "sink": {"type": "DelimitedTextSink"}, + }, + } + ], + }, + } + + # Child pipeline for loading + load_pipeline = { + "id": _base_resource_id("pipelines", "LoadDataPipeline"), + "name": "LoadDataPipeline", + "type": "Microsoft.DataFactory/factories/pipelines", + "properties": { + "description": "Child pipeline for loading data to destination", + "activities": [ + { + "name": "LoadToDestination", + "type": "Copy", + "inputs": [ + { + "referenceName": "SynapseCustomersDim", + "type": "DatasetReference", + } + ], + "outputs": [ + { + "referenceName": "DataLakeCuratedData", + "type": "DatasetReference", + } + ], + "typeProperties": { + "source": {"type": "SqlDWSource"}, + "sink": {"type": "ParquetSink"}, + }, + } + ], + }, + } + + # Main orchestration pipeline with both ExecutePipeline and Copy activities + main_pipeline = { + "id": _base_resource_id("pipelines", "MixedOrchestrationPipeline"), + "name": "MixedOrchestrationPipeline", + "type": "Microsoft.DataFactory/factories/pipelines", + "properties": { + "description": "Pipeline demonstrating both pipeline and dataset dependencies", + "activities": [ + # Step 1: Call child pipeline to extract data + { + "name": "ExecuteExtract", + "type": "ExecutePipeline", + "typeProperties": { + "pipeline": { + "referenceName": "ExtractDataPipeline", + "type": "PipelineReference", + }, + "waitOnCompletion": True, + }, + }, + # Step 2: Transform data in main pipeline (has dataset lineage) + { + "name": "TransformInMain", + "type": "Copy", + "dependsOn": [ + { + "activity": "ExecuteExtract", + "dependencyConditions": ["Succeeded"], + } + ], + "inputs": [ + { + "referenceName": "BlobStagingCustomers", + "type": "DatasetReference", + } + ], + "outputs": [ + { + "referenceName": "SynapseCustomersDim", + "type": "DatasetReference", + } + ], + "typeProperties": { + "source": {"type": "DelimitedTextSource"}, + "sink": {"type": "SqlDWSink"}, + }, + }, + # Step 3: Call child pipeline to load data + { + "name": 
"ExecuteLoad", + "type": "ExecutePipeline", + "dependsOn": [ + { + "activity": "TransformInMain", + "dependencyConditions": ["Succeeded"], + } + ], + "typeProperties": { + "pipeline": { + "referenceName": "LoadDataPipeline", + "type": "PipelineReference", + }, + "waitOnCompletion": True, + }, + }, + ], + }, + } + + return { + "pipelines": [main_pipeline, extract_pipeline, load_pipeline], + "expected_dataflows": 3, # 3 pipelines + "expected_datajobs": 5, # 2 ExecutePipeline + 1 Copy in main + 2 Copy in children + "expected_pipeline_lineage": 2, # 2 ExecutePipeline activities + "expected_dataset_lineage": 3, # TransformInMain (1 in, 1 out) + ExtractFromSource + LoadToDestination + } + + # ============================================================================= # FACTORY HELPER # ============================================================================= diff --git a/metadata-ingestion/tests/integration/azure_data_factory/test_complex_pipelines.py b/metadata-ingestion/tests/integration/azure_data_factory/test_complex_pipelines.py index 06ff70920d608a..b5e3839977fa66 100644 --- a/metadata-ingestion/tests/integration/azure_data_factory/test_complex_pipelines.py +++ b/metadata-ingestion/tests/integration/azure_data_factory/test_complex_pipelines.py @@ -50,6 +50,7 @@ """ import json +from pathlib import Path from typing import Any, Dict, Iterator, List from unittest import mock @@ -1186,23 +1187,292 @@ def test_pipeline_to_pipeline_lineage(tmp_path): f"Expected TransformCustomerData as first activity: {first_activities}" ) - # Verify DataJobInputOutput aspects have inputDatajobs for pipeline-to-pipeline lineage - input_datajobs_found = [] + # Verify DataJobInputOutput aspects create correct lineage direction + # The child's first activity should have the parent ExecutePipeline as inputDatajobs + # This creates lineage: ExecutePipeline -> ChildFirstActivity + child_activity_inputs = {} for io in datajob_io: + entity_urn = io.get("entityUrn", "") input_jobs = 
io.get("aspect", {}).get("json", {}).get("inputDatajobs", []) if input_jobs: - input_datajobs_found.extend(input_jobs) + child_activity_inputs[entity_urn] = input_jobs - # Should have at least 2 inputDatajobs references (one for each ExecutePipeline) - assert len(input_datajobs_found) >= 2, ( - f"Expected at least 2 inputDatajobs for pipeline-to-pipeline lineage, " - f"but found: {len(input_datajobs_found)}" + # Should have at least 2 child activities with inputDatajobs (one for each ExecutePipeline) + assert len(child_activity_inputs) >= 2, ( + f"Expected at least 2 child activities with inputDatajobs lineage, " + f"but found: {len(child_activity_inputs)}" ) - # Verify the inputDatajobs point to child pipeline activities - assert any("CopyCustomersToStaging" in urn for urn in input_datajobs_found), ( - f"Expected inputDatajobs to reference CopyCustomersToStaging: {input_datajobs_found}" + # Verify the child activities have ExecutePipeline as their input (upstream) + # CopyCustomersToStaging should have ExecuteDataMovement as input + # TransformCustomerData should have ExecuteTransform as input + all_inputs = [] + for inputs in child_activity_inputs.values(): + all_inputs.extend(inputs) + + assert any("ExecuteDataMovement" in urn for urn in all_inputs), ( + f"Expected ExecuteDataMovement as upstream of child activity: {all_inputs}" + ) + assert any("ExecuteTransform" in urn for urn in all_inputs), ( + f"Expected ExecuteTransform as upstream of child activity: {all_inputs}" + ) + + +def test_mixed_pipeline_and_dataset_dependencies(tmp_path: Path) -> None: + """Test scenario with both pipeline-to-pipeline and dataset dependencies. + + This test verifies that the connector correctly handles pipelines that have: + 1. ExecutePipeline activities (pipeline-to-pipeline lineage) + 2. 
Copy activities with explicit inputs/outputs (dataset lineage) + + Structure: + - MixedOrchestrationPipeline + └── ExecuteExtract -> ExtractDataPipeline.ExtractFromSource + └── TransformInMain (Copy with dataset I/O) + └── ExecuteLoad -> LoadDataPipeline.LoadToDestination + + Expected results: + - Pipeline lineage: ExecuteExtract -> ExtractFromSource + - Pipeline lineage: ExecuteLoad -> LoadToDestination + - Dataset lineage: TransformInMain reads BlobStagingCustomers + - Dataset lineage: TransformInMain writes SynapseCustomersDim + """ + from tests.integration.azure_data_factory.complex_mocks import ( + create_mixed_dependencies_scenario, + ) + + scenario = create_mixed_dependencies_scenario() + output_file = tmp_path / "mixed_deps_output.json" + + # Create mock client using the existing helper + mock_client = create_mock_client( + pipelines=scenario["pipelines"], + datasets=create_complex_datasets(), + linked_services=create_complex_linked_services(), + ) + + config = { + "run_id": "mixed_deps_test", + "source": { + "type": "azure-data-factory", + "config": { + "subscription_id": SUBSCRIPTION_ID, + "resource_group": RESOURCE_GROUP, + "credential": {"authentication_method": "default"}, + "env": "DEV", + "include_lineage": True, + "include_execution_history": False, + }, + }, + "sink": {"type": "file", "config": {"filename": str(output_file)}}, + } + + with mock.patch( + "datahub.ingestion.source.azure_data_factory.adf_client.DataFactoryManagementClient" + ) as MockClientClass: + MockClientClass.return_value = mock_client + + with mock.patch( + "datahub.ingestion.source.azure.azure_auth.DefaultAzureCredential" + ): + pipeline = Pipeline.create(config) + pipeline.run() + + # Read output + with open(output_file) as f: + mcps = json.load(f) + + # ========================================================================= + # Verify Pipeline-to-Pipeline Lineage + # ========================================================================= + # Find DataJobInfo aspects to 
identify ExecutePipeline activities + datajob_infos = [ + mcp + for mcp in mcps + if mcp.get("entityType") == "dataJob" and mcp.get("aspectName") == "dataJobInfo" + ] + + # Find activities that call child pipelines + execute_pipeline_refs = [] + for info in datajob_infos: + custom_props = ( + info.get("aspect", {}).get("json", {}).get("customProperties", {}) + ) + if "calls_pipeline" in custom_props: + execute_pipeline_refs.append( + { + "activity_urn": info.get("entityUrn", ""), + "calls": custom_props.get("calls_pipeline"), + "child_first_activity": custom_props.get("child_first_activity"), + } + ) + + # Should have 2 ExecutePipeline activities + assert len(execute_pipeline_refs) == 2, ( + f"Expected 2 ExecutePipeline activities, found: {len(execute_pipeline_refs)}" + ) + + # Verify correct child pipelines are referenced + child_pipelines = {ref["calls"] for ref in execute_pipeline_refs} + assert "ExtractDataPipeline" in child_pipelines + assert "LoadDataPipeline" in child_pipelines + + # Verify first activities of child pipelines + first_activities = {ref["child_first_activity"] for ref in execute_pipeline_refs} + assert "ExtractFromSource" in first_activities + assert "LoadToDestination" in first_activities + + # ========================================================================= + # Verify Dataset Lineage + # ========================================================================= + # Find DataJobInputOutput aspects + datajob_io = [ + mcp + for mcp in mcps + if mcp.get("entityType") == "dataJob" + and mcp.get("aspectName") == "dataJobInputOutput" + ] + + # Build a map of entity URN -> (inputDatasets, outputDatasets) + dataset_lineage: dict[str, dict[str, list[str]]] = {} + for io in datajob_io: + entity_urn = io.get("entityUrn", "") + input_datasets = io.get("aspect", {}).get("json", {}).get("inputDatasets", []) + output_datasets = io.get("aspect", {}).get("json", {}).get("outputDatasets", []) + if input_datasets or output_datasets: + 
dataset_lineage[entity_urn] = { + "inputs": input_datasets, + "outputs": output_datasets, + } + + # Find TransformInMain activity's lineage + transform_lineage = None + for urn, lineage in dataset_lineage.items(): + if "TransformInMain" in urn: + transform_lineage = lineage + break + + assert transform_lineage is not None, ( + f"TransformInMain activity should have dataset lineage. " + f"Available URNs: {list(dataset_lineage.keys())}" + ) + + # TransformInMain should read from BlobStagingCustomers (blob storage) + assert len(transform_lineage["inputs"]) >= 1, ( + "TransformInMain should have at least 1 input dataset" + ) + # The URN uses platform and dataset path from typeProperties, not the ADF dataset name + # BlobStagingCustomers maps to azure_blob_storage platform with path staging/customers + assert any( + "azure_blob_storage" in urn or "staging" in urn + for urn in transform_lineage["inputs"] + ), f"TransformInMain should read from blob storage: {transform_lineage['inputs']}" + + # TransformInMain should write to SynapseCustomersDim (synapse) + assert len(transform_lineage["outputs"]) >= 1, ( + "TransformInMain should have at least 1 output dataset" + ) + # SynapseCustomersDim maps to synapse platform with schema Sales.CustomersDim + assert any( + "synapse" in urn or "Customers" in urn for urn in transform_lineage["outputs"] + ), f"TransformInMain should write to synapse: {transform_lineage['outputs']}" + + # ========================================================================= + # Verify Both Lineage Types Coexist + # ========================================================================= + # We should have at least 3 DataJobInputOutput aspects: + # - 2 for child pipelines' first activities (inputDatajobs from pipeline lineage) + # - Several for Copy activities (inputDatasets/outputDatasets) + assert len(datajob_io) >= 3, ( + f"Expected at least 3 DataJobInputOutput aspects for mixed lineage, " + f"found: {len(datajob_io)}" ) - assert 
any("TransformCustomerData" in urn for urn in input_datajobs_found), ( - f"Expected inputDatajobs to reference TransformCustomerData: {input_datajobs_found}" + + # Verify pipeline lineage exists (inputDatajobs) + pipeline_lineage_count = sum( + 1 + for io in datajob_io + if io.get("aspect", {}).get("json", {}).get("inputDatajobs", []) + ) + assert pipeline_lineage_count >= 2, ( + f"Expected at least 2 activities with pipeline lineage (inputDatajobs), " + f"found: {pipeline_lineage_count}" + ) + + # Verify dataset lineage exists (inputDatasets or outputDatasets) + dataset_lineage_count = sum( + 1 + for io in datajob_io + if io.get("aspect", {}).get("json", {}).get("inputDatasets", []) + or io.get("aspect", {}).get("json", {}).get("outputDatasets", []) + ) + assert dataset_lineage_count >= 3, ( + f"Expected at least 3 activities with dataset lineage, " + f"found: {dataset_lineage_count}" + ) + + +@freeze_time(FROZEN_TIME) +@pytest.mark.integration +def test_mixed_dependencies_golden(pytestconfig, tmp_path): + """Golden file test for mixed pipeline and dataset dependencies. + + This golden test validates the complete output when a pipeline has both: + 1. ExecutePipeline activities (pipeline-to-pipeline lineage) + 2. 
Copy activities with dataset inputs/outputs (dataset lineage) + + The golden file captures: + - Container for the factory + - DataFlow entities for all 3 pipelines + - DataJob entities for all 5 activities + - DataJobInputOutput aspects showing both pipeline and dataset lineage + - Browse paths and custom properties + """ + from tests.integration.azure_data_factory.complex_mocks import ( + create_mixed_dependencies_scenario, + ) + + test_resources_dir = pytestconfig.rootpath / "tests/integration/azure_data_factory" + scenario = create_mixed_dependencies_scenario() + + output_file = tmp_path / "adf_mixed_deps_events.json" + golden_file = test_resources_dir / "adf_mixed_deps_golden.json" + + mock_client = create_mock_client( + pipelines=scenario["pipelines"], + datasets=create_complex_datasets(), + linked_services=create_complex_linked_services(), + ) + + config = { + "run_id": "adf-mixed-deps-test", + "source": { + "type": "azure-data-factory", + "config": { + "subscription_id": SUBSCRIPTION_ID, + "resource_group": RESOURCE_GROUP, + "credential": {"authentication_method": "default"}, + "env": "DEV", + "include_lineage": True, + }, + }, + "sink": {"type": "file", "config": {"filename": str(output_file)}}, + } + + with mock.patch( + "datahub.ingestion.source.azure_data_factory.adf_client.DataFactoryManagementClient" + ) as MockClientClass: + MockClientClass.return_value = mock_client + + with mock.patch( + "datahub.ingestion.source.azure.azure_auth.DefaultAzureCredential" + ): + pipeline = Pipeline.create(config) + pipeline.run() + pipeline.raise_from_status() + + mce_helpers.check_golden_file( + pytestconfig, + output_path=str(output_file), + golden_path=str(golden_file), ) From 89aa56b1b82d2e53a492992cd49ade49c0b6ab1f Mon Sep 17 00:00:00 2001 From: Anush Kumar Date: Mon, 8 Dec 2025 17:53:26 -0800 Subject: [PATCH 04/13] feat(azure-data-factory): add comprehensive documentation and example recipes for Azure Data Factory connector - Introduced detailed documentation 
for the Azure Data Factory connector, covering metadata extraction, prerequisites, and configuration options. - Added example recipes to facilitate quick setup and usage of the connector. - Documented various authentication methods and their configurations, enhancing user guidance. - Included information on lineage extraction capabilities and entity mapping for better understanding of the integration. --- .../sources/{azure_data_factory => azure-data-factory}/README.md | 0 .../azure-data-factory_pre.md} | 0 .../azure-data-factory_recipe.yml} | 0 3 files changed, 0 insertions(+), 0 deletions(-) rename metadata-ingestion/docs/sources/{azure_data_factory => azure-data-factory}/README.md (100%) rename metadata-ingestion/docs/sources/{azure_data_factory/azure_data_factory_pre.md => azure-data-factory/azure-data-factory_pre.md} (100%) rename metadata-ingestion/docs/sources/{azure_data_factory/azure_data_factory_recipe.yml => azure-data-factory/azure-data-factory_recipe.yml} (100%) diff --git a/metadata-ingestion/docs/sources/azure_data_factory/README.md b/metadata-ingestion/docs/sources/azure-data-factory/README.md similarity index 100% rename from metadata-ingestion/docs/sources/azure_data_factory/README.md rename to metadata-ingestion/docs/sources/azure-data-factory/README.md diff --git a/metadata-ingestion/docs/sources/azure_data_factory/azure_data_factory_pre.md b/metadata-ingestion/docs/sources/azure-data-factory/azure-data-factory_pre.md similarity index 100% rename from metadata-ingestion/docs/sources/azure_data_factory/azure_data_factory_pre.md rename to metadata-ingestion/docs/sources/azure-data-factory/azure-data-factory_pre.md diff --git a/metadata-ingestion/docs/sources/azure_data_factory/azure_data_factory_recipe.yml b/metadata-ingestion/docs/sources/azure-data-factory/azure-data-factory_recipe.yml similarity index 100% rename from metadata-ingestion/docs/sources/azure_data_factory/azure_data_factory_recipe.yml rename to 
metadata-ingestion/docs/sources/azure-data-factory/azure-data-factory_recipe.yml From 25a3349a3234d942a53ba4fb81af268067dfd896 Mon Sep 17 00:00:00 2001 From: Anush Kumar Date: Mon, 8 Dec 2025 18:07:23 -0800 Subject: [PATCH 05/13] chore(azure-data-factory): add azure-data-factory to capability summary for docgen --- .../autogenerated/capability_summary.json | 34 ++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/metadata-ingestion/src/datahub/ingestion/autogenerated/capability_summary.json b/metadata-ingestion/src/datahub/ingestion/autogenerated/capability_summary.json index e0b59f866091a7..9e864336fc2c49 100644 --- a/metadata-ingestion/src/datahub/ingestion/autogenerated/capability_summary.json +++ b/metadata-ingestion/src/datahub/ingestion/autogenerated/capability_summary.json @@ -1,5 +1,5 @@ { - "generated_at": "2025-11-14T14:26:00.526772+00:00", + "generated_at": "2025-12-09T02:03:28.193633+00:00", "generated_by": "metadata-ingestion/scripts/capability_summary.py", "plugin_details": { "abs": { @@ -136,6 +136,38 @@ "platform_name": "Azure AD", "support_status": "CERTIFIED" }, + "azure-data-factory": { + "capabilities": [ + { + "capability": "CONTAINERS", + "description": "Enabled by default", + "subtype_modifier": null, + "supported": true + }, + { + "capability": "DELETION_DETECTION", + "description": "Enabled by default via stateful ingestion", + "subtype_modifier": null, + "supported": true + }, + { + "capability": "PLATFORM_INSTANCE", + "description": "Enabled by default", + "subtype_modifier": null, + "supported": true + }, + { + "capability": "LINEAGE_COARSE", + "description": "Extracts lineage from activity inputs/outputs", + "subtype_modifier": null, + "supported": true + } + ], + "classname": "datahub.ingestion.source.azure_data_factory.adf_source.AzureDataFactorySource", + "platform_id": "azure-data-factory", + "platform_name": "Azure Data Factory", + "support_status": "INCUBATING" + }, "bigquery": { "capabilities": [ { 
From 0ade97938c82f6e5834ef6a93f3475aeb1438eb5 Mon Sep 17 00:00:00 2001 From: Anush Kumar Date: Mon, 8 Dec 2025 18:10:23 -0800 Subject: [PATCH 06/13] fix(azure-data-factory): use Python 3.9 compatible type annotations in tests - Replace X | Y union syntax with Optional[X] for Python 3.9 compatibility - Add isinstance checks before accessing source.report for proper type narrowing - Add missing type annotation for tmp_path parameter --- .../azure_data_factory/test_adf_source.py | 26 +++++++++---------- .../test_complex_pipelines.py | 23 +++++++++------- 2 files changed, 27 insertions(+), 22 deletions(-) diff --git a/metadata-ingestion/tests/integration/azure_data_factory/test_adf_source.py b/metadata-ingestion/tests/integration/azure_data_factory/test_adf_source.py index 36b9a215bb4eb0..d3a76433dfdaaa 100644 --- a/metadata-ingestion/tests/integration/azure_data_factory/test_adf_source.py +++ b/metadata-ingestion/tests/integration/azure_data_factory/test_adf_source.py @@ -5,7 +5,7 @@ """ from datetime import datetime, timezone -from typing import Any, Dict, Iterator, List +from typing import Any, Dict, Iterator, List, Optional from unittest import mock from unittest.mock import MagicMock @@ -25,7 +25,7 @@ def create_mock_factory( resource_group: str, subscription_id: str, location: str = "eastus", - tags: Dict[str, str] | None = None, + tags: Optional[Dict[str, str]] = None, ) -> Dict[str, Any]: """Create a mock factory response.""" return { @@ -46,8 +46,8 @@ def create_mock_pipeline( factory_name: str, resource_group: str, subscription_id: str, - activities: List[Dict[str, Any]] | None = None, - description: str | None = None, + activities: Optional[List[Dict[str, Any]]] = None, + description: Optional[str] = None, ) -> Dict[str, Any]: """Create a mock pipeline response.""" return { @@ -67,10 +67,10 @@ def create_mock_pipeline( def create_mock_activity( name: str, activity_type: str, - inputs: List[Dict[str, Any]] | None = None, - outputs: List[Dict[str, Any]] | None 
= None, - depends_on: List[Dict[str, Any]] | None = None, - description: str | None = None, + inputs: Optional[List[Dict[str, Any]]] = None, + outputs: Optional[List[Dict[str, Any]]] = None, + depends_on: Optional[List[Dict[str, Any]]] = None, + description: Optional[str] = None, ) -> Dict[str, Any]: """Create a mock activity definition.""" return { @@ -93,7 +93,7 @@ def create_mock_dataset( subscription_id: str, linked_service_name: str, dataset_type: str = "AzureBlobDataset", - type_properties: Dict[str, Any] | None = None, + type_properties: Optional[Dict[str, Any]] = None, ) -> Dict[str, Any]: """Create a mock dataset response.""" return { @@ -139,7 +139,7 @@ def create_mock_trigger( resource_group: str, subscription_id: str, trigger_type: str = "ScheduleTrigger", - pipelines: List[str] | None = None, + pipelines: Optional[List[str]] = None, ) -> Dict[str, Any]: """Create a mock trigger response.""" pipeline_refs = [ @@ -167,8 +167,8 @@ def create_mock_pipeline_run( run_id: str, pipeline_name: str, status: str = "Succeeded", - start_time: datetime | None = None, - end_time: datetime | None = None, + start_time: Optional[datetime] = None, + end_time: Optional[datetime] = None, ) -> Dict[str, Any]: """Create a mock pipeline run response.""" return { @@ -215,7 +215,7 @@ class MockQueryResponse: """Mock class for query responses with continuation token.""" def __init__( - self, items: List[Dict[str, Any]], continuation_token: str | None = None + self, items: List[Dict[str, Any]], continuation_token: Optional[str] = None ): self.value = [MockAzureResource(item) for item in items] self.continuation_token = continuation_token diff --git a/metadata-ingestion/tests/integration/azure_data_factory/test_complex_pipelines.py b/metadata-ingestion/tests/integration/azure_data_factory/test_complex_pipelines.py index b5e3839977fa66..2c09ad3919cfbf 100644 --- a/metadata-ingestion/tests/integration/azure_data_factory/test_complex_pipelines.py +++ 
b/metadata-ingestion/tests/integration/azure_data_factory/test_complex_pipelines.py @@ -51,13 +51,16 @@ import json from pathlib import Path -from typing import Any, Dict, Iterator, List +from typing import Any, Dict, Iterator, List, Optional from unittest import mock import pytest from freezegun import freeze_time from datahub.ingestion.run.pipeline import Pipeline +from datahub.ingestion.source.azure_data_factory.adf_source import ( + AzureDataFactorySource, +) from datahub.testing import mce_helpers from tests.integration.azure_data_factory.complex_mocks import ( RESOURCE_GROUP, @@ -123,7 +126,7 @@ class MockQueryResponse: """ def __init__( - self, items: List[Dict[str, Any]], continuation_token: str | None = None + self, items: List[Dict[str, Any]], continuation_token: Optional[str] = None ): self.value = [MockAzureResource(item) for item in items] self.continuation_token = continuation_token @@ -133,9 +136,9 @@ def create_mock_client( pipelines: List[Dict[str, Any]], datasets: List[Dict[str, Any]], linked_services: List[Dict[str, Any]], - data_flows: List[Dict[str, Any]] | None = None, - triggers: List[Dict[str, Any]] | None = None, - pipeline_runs: List[Dict[str, Any]] | None = None, + data_flows: Optional[List[Dict[str, Any]]] = None, + triggers: Optional[List[Dict[str, Any]]] = None, + pipeline_runs: Optional[List[Dict[str, Any]]] = None, ) -> mock.MagicMock: """Create a mock DataFactoryManagementClient with the given test data. 
@@ -197,12 +200,12 @@ def create_mock_client( def _run_test_pipeline( - tmp_path, + tmp_path: Any, run_id: str, pipelines: List[Dict[str, Any]], - datasets: List[Dict[str, Any]] | None = None, - linked_services: List[Dict[str, Any]] | None = None, - data_flows: List[Dict[str, Any]] | None = None, + datasets: Optional[List[Dict[str, Any]]] = None, + linked_services: Optional[List[Dict[str, Any]]] = None, + data_flows: Optional[List[Dict[str, Any]]] = None, include_lineage: bool = True, ) -> Pipeline: """Helper function to run an ingestion pipeline with mocked Azure data. @@ -314,6 +317,7 @@ def test_nested_pipeline_creates_all_entities(pytestconfig, tmp_path): ) # Verify all pipelines were processed (not filtered out) + assert isinstance(pipeline.source, AzureDataFactorySource) assert pipeline.source.report.pipelines_scanned == len(scenario["pipelines"]) @@ -882,6 +886,7 @@ def test_dataflow_lineage_sources_and_sinks(tmp_path): # Verify that data flows were fetched and processed # This confirms the connector is looking up Data Flow definitions + assert isinstance(pipeline.source, AzureDataFactorySource) assert pipeline.source.report.data_flows_scanned > 0, ( "Expected data flows to be scanned for lineage extraction" ) From fadb18c6d32dcd0f10641e2108c0e2e99c5012a4 Mon Sep 17 00:00:00 2001 From: Anush Kumar Date: Mon, 8 Dec 2025 18:58:05 -0800 Subject: [PATCH 07/13] fix(azure): use StrEnum instead of str,Enum mixin for Python 3.11 compatibility --- .../src/datahub/ingestion/source/azure/azure_auth.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/azure/azure_auth.py b/metadata-ingestion/src/datahub/ingestion/source/azure/azure_auth.py index 9c2427bddaf349..424a796e323fcc 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/azure/azure_auth.py +++ b/metadata-ingestion/src/datahub/ingestion/source/azure/azure_auth.py @@ -10,7 +10,6 @@ - DefaultAzureCredential (auto-detects environment) """ 
-from enum import Enum from typing import Optional from azure.core.credentials import TokenCredential @@ -23,9 +22,10 @@ from pydantic import Field, SecretStr, model_validator from datahub.configuration import ConfigModel +from datahub.utilities.str_enum import StrEnum -class AzureAuthenticationMethod(str, Enum): +class AzureAuthenticationMethod(StrEnum): """Supported Azure authentication methods. - DEFAULT: Uses DefaultAzureCredential which auto-detects credentials from From c894ab7ba3746e946e88313ed30ac12f6d33d87d Mon Sep 17 00:00:00 2001 From: Anush Kumar Date: Tue, 9 Dec 2025 08:56:18 -0800 Subject: [PATCH 08/13] fix(ingestion): add azure-data-factory to test requirements - Add azure-data-factory to full_test_dev_requirements in setup.py - Ensures azure.mgmt.datafactory is installed during test runs - Fixes ModuleNotFoundError in unit/integration tests --- metadata-ingestion/setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index a3e50534b973e0..c27cbe4cd0a2d0 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -787,6 +787,7 @@ dependency for plugin in [ "athena", + "azure-data-factory", "circuit-breaker", "clickhouse", "delta-lake", From 876d828a4d3e41c9936446f9f22fc33c95343d34 Mon Sep 17 00:00:00 2001 From: Anush Kumar Date: Tue, 9 Dec 2025 10:35:17 -0800 Subject: [PATCH 09/13] fix(constants): restore azure-data-factory logo import in ingestion source files --- datahub-web-react/src/app/ingest/source/builder/constants.ts | 2 +- datahub-web-react/src/app/ingestV2/source/builder/constants.ts | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/datahub-web-react/src/app/ingest/source/builder/constants.ts b/datahub-web-react/src/app/ingest/source/builder/constants.ts index c9997484d57ca1..89efd267edfe77 100644 --- a/datahub-web-react/src/app/ingest/source/builder/constants.ts +++ b/datahub-web-react/src/app/ingest/source/builder/constants.ts @@ -1,6 +1,6 
@@ import athenaLogo from '@images/awsathenalogo.png'; -import azureDataFactoryLogo from '@images/azuredatafactorylogo.svg'; import azureLogo from '@images/azure-ad.png'; +import azureDataFactoryLogo from '@images/azuredatafactorylogo.svg'; import bigqueryLogo from '@images/bigquerylogo.png'; import cassandraLogo from '@images/cassandralogo.png'; import clickhouseLogo from '@images/clickhouselogo.png'; diff --git a/datahub-web-react/src/app/ingestV2/source/builder/constants.ts b/datahub-web-react/src/app/ingestV2/source/builder/constants.ts index 695d6a12b5bcd1..bf8d80bc0afa41 100644 --- a/datahub-web-react/src/app/ingestV2/source/builder/constants.ts +++ b/datahub-web-react/src/app/ingestV2/source/builder/constants.ts @@ -1,6 +1,6 @@ import athenaLogo from '@images/awsathenalogo.png'; -import azureDataFactoryLogo from '@images/azuredatafactorylogo.svg'; import azureLogo from '@images/azure-ad.png'; +import azureDataFactoryLogo from '@images/azuredatafactorylogo.svg'; import bigqueryLogo from '@images/bigquerylogo.png'; import cassandraLogo from '@images/cassandralogo.png'; import clickhouseLogo from '@images/clickhouselogo.png'; From 5a9f14869c6cdece24a3c1c0a0ee3d3445b6a729 Mon Sep 17 00:00:00 2001 From: Anush Kumar Date: Thu, 11 Dec 2025 13:40:35 -0800 Subject: [PATCH 10/13] feat(azure-data-factory): enhance metadata ingestion and lineage tracking - Updated linked service mappings to consolidate Azure storage types under a single identifier (`abs`). - Improved configuration options to enable column lineage and execution history extraction by default. - Enhanced lineage reporting to differentiate between dataset, pipeline, and dataflow lineage types. - Refactored API call tracking for better granularity and added support for timing metrics. - Updated documentation to clarify naming rules, uniqueness handling, and case sensitivity in Azure Data Factory. - Adjusted integration tests to reflect changes in platform mappings and lineage extraction logic. 
--- .../azure-data-factory_pre.md | 92 ++++++++++++++++--- .../source/azure_data_factory/adf_config.py | 17 ++-- .../source/azure_data_factory/adf_models.py | 4 +- .../source/azure_data_factory/adf_report.py | 51 +++++++--- .../source/azure_data_factory/adf_source.py | 37 +++++--- .../azure_data_factory/adf_basic_golden.json | 4 +- .../adf_mixed_deps_golden.json | 10 +- .../adf_multisource_golden.json | 16 ++-- .../azure_data_factory/adf_nested_golden.json | 2 +- .../adf_platform_instance_golden.json | 4 +- .../adf_with_runs_golden.json | 4 +- .../test_complex_pipelines.py | 42 ++++----- .../azure_data_factory/test_adf_source.py | 18 ++-- 13 files changed, 201 insertions(+), 100 deletions(-) diff --git a/metadata-ingestion/docs/sources/azure-data-factory/azure-data-factory_pre.md b/metadata-ingestion/docs/sources/azure-data-factory/azure-data-factory_pre.md index 0f77bffcf78d96..0f8ba02229d439 100644 --- a/metadata-ingestion/docs/sources/azure-data-factory/azure-data-factory_pre.md +++ b/metadata-ingestion/docs/sources/azure-data-factory/azure-data-factory_pre.md @@ -91,21 +91,19 @@ The child pipeline's first activity will have the ExecutePipeline as its input/u ### Supported Linked Service Mappings -| ADF Linked Service | DataHub Platform | -| ------------------------------------------ | -------------------- | -| AzureBlobStorage, AzureBlobFS | `azure_blob_storage` | -| AzureDataLakeStore, AzureDataLakeStoreGen2 | `azure_data_lake` | -| AzureSqlDatabase, AzureSqlDW | `mssql` | -| AzureSynapseAnalytics | `synapse` | -| Snowflake | `snowflake` | -| AmazonS3 | `s3` | -| GoogleBigQuery | `bigquery` | -| PostgreSql, AzurePostgreSql | `postgres` | -| MySql, AzureMySql | `mysql` | -| Oracle | `oracle` | -| Salesforce | `salesforce` | -| CosmosDb | `cosmos` | -| AzureDatabricks, DatabricksDeltaLake | `databricks` | +| ADF Linked Service | DataHub Platform | +| --------------------------------------------------- | ---------------- | +| AzureBlobStorage, AzureBlobFS, 
AzureDataLakeStore | `abs` | +| AzureSqlDatabase, AzureSqlDW, AzureSynapseAnalytics | `mssql` | +| Snowflake | `snowflake` | +| AmazonS3 | `s3` | +| GoogleBigQuery | `bigquery` | +| PostgreSql, AzurePostgreSql | `postgres` | +| MySql, AzureMySql | `mysql` | +| Oracle | `oracle` | +| Salesforce | `salesforce` | +| CosmosDb | `cosmosdb` | +| AzureDatabricks, AzureDatabricksDeltaLake | `databricks` | ### Platform Instance Mapping @@ -232,3 +230,67 @@ urn:li:dataFlow:(azure-data-factory,{platform_instance}.{factory_name}.{pipeline ``` Example: `urn:li:dataFlow:(azure-data-factory,production.my-factory.ETL-Pipeline,PROD)` + +## Naming Rules and Uniqueness + +### Azure Naming Rules + +Azure Data Factory enforces specific naming rules documented at [Azure Data Factory naming rules](https://learn.microsoft.com/en-us/azure/data-factory/naming-rules): + +| Resource | Uniqueness | Case Sensitivity | +| --------------- | ---------------------------- | ---------------- | +| Data Factory | Globally unique across Azure | Case-insensitive | +| Pipelines | Unique within a factory | Case-insensitive | +| Datasets | Unique within a factory | Case-insensitive | +| Linked Services | Unique within a factory | Case-insensitive | +| Data Flows | Unique within a factory | Case-insensitive | + +### How DataHub Handles Uniqueness + +The connector constructs URNs using `{factory_name}.{pipeline_name}` format: + +- **Factory names are globally unique** in Azure, preventing collisions within a subscription +- **Pipeline names are unique within a factory**, so the combination is globally unique +- **No additional namespacing needed** for single-subscription deployments + +### Multi-Subscription and Multi-Tenant Scenarios + +:::warning Important +Factory names are globally unique _within Azure_, but different Azure tenants or subscriptions in different regions could have identically-named factories. 
+::: + +| Scenario | Risk | Solution | +| ------------------------------------ | ------------------------------------- | -------------------------------------------------- | +| Single subscription | None | Default URN format works | +| Multiple subscriptions (same tenant) | Low - factory names still unique | Default works, but `platform_instance` recommended | +| Multiple tenants | **High** - same factory name possible | **Must use `platform_instance`** | + +**Example: Multi-Tenant Setup** + +```yaml +# Tenant A +source: + type: azure-data-factory + config: + subscription_id: "tenant-a-sub" + platform_instance: "tenant-a" + +# Tenant B (could have same factory name!) +source: + type: azure-data-factory + config: + subscription_id: "tenant-b-sub" + platform_instance: "tenant-b" +``` + +### Case Sensitivity + +Azure treats names as **case-insensitive** (e.g., `MyFactory` and `myfactory` are the same factory). DataHub URNs are case-sensitive, but this doesn't cause issues because: + +1. Azure prevents creating duplicate names with different casing at the source +2. The connector uses exact names from the Azure API response +3. Consistent casing is maintained throughout ingestion + +:::tip +If you're ingesting from multiple Azure tenants and see unexpected entity overwrites in DataHub, ensure each ingestion recipe uses a unique `platform_instance` value. +::: diff --git a/metadata-ingestion/src/datahub/ingestion/source/azure_data_factory/adf_config.py b/metadata-ingestion/src/datahub/ingestion/source/azure_data_factory/adf_config.py index c48e998ba24e6c..7b00b53eca0742 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/azure_data_factory/adf_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/azure_data_factory/adf_config.py @@ -86,19 +86,19 @@ class AzureDataFactoryConfig( ) include_column_lineage: bool = Field( - default=False, + default=True, description=( "Extract column-level lineage from Data Flow activities. 
" - "Requires parsing Data Flow definitions. " - "Note: This is an advanced feature and may increase ingestion time." + "Requires parsing Data Flow definitions." ), ) include_execution_history: bool = Field( - default=False, + default=True, description=( "Extract pipeline and activity execution history as DataProcessInstance. " - "Includes run status, duration, and parameters." + "Includes run status, duration, and parameters. " + "Enables lineage extraction from parameterized activities using actual runtime values." ), ) @@ -116,8 +116,11 @@ class AzureDataFactoryConfig( include_datasets: bool = Field( default=True, description=( - "Include ADF dataset definitions in the metadata. " - "Datasets are used to resolve lineage to external platforms." + "Extract ADF dataset definitions to enable lineage resolution. " + "When enabled, the connector reads dataset configurations (linked service, " + "table names, file paths) to map ADF datasets to DataHub dataset URNs. " + "This is required for table-level lineage. Disable only if you want to " + "extract just pipeline/activity structure without lineage." ), ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/azure_data_factory/adf_models.py b/metadata-ingestion/src/datahub/ingestion/source/azure_data_factory/adf_models.py index b92f0375595013..4d89df3c583c8c 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/azure_data_factory/adf_models.py +++ b/metadata-ingestion/src/datahub/ingestion/source/azure_data_factory/adf_models.py @@ -17,7 +17,9 @@ JsonPrimitive = Union[str, int, float, bool, None] -# TypedDict for well-known structures in ADF API responses +# TypedDict for well-known structures in ADF API responses. +# These provide type hints for commonly-used nested dictionaries from Azure SDK. +# Using total=False makes all fields optional, matching Azure's inconsistent responses. 
class FolderInfo(TypedDict, total=False): """Folder organization structure used by pipelines, datasets, etc.""" diff --git a/metadata-ingestion/src/datahub/ingestion/source/azure_data_factory/adf_report.py b/metadata-ingestion/src/datahub/ingestion/source/azure_data_factory/adf_report.py index b984b67be74907..bfc9f19842e677 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/azure_data_factory/adf_report.py +++ b/metadata-ingestion/src/datahub/ingestion/source/azure_data_factory/adf_report.py @@ -1,6 +1,7 @@ """Custom report class for Azure Data Factory connector.""" from dataclasses import dataclass, field +from typing import Dict from datahub.ingestion.source.state.stale_entity_removal_handler import ( StaleEntityRemovalSourceReport, @@ -29,8 +30,12 @@ class AzureDataFactorySourceReport(StaleEntityRemovalSourceReport): filtered_factories: LossyList[str] = field(default_factory=LossyList) filtered_pipelines: LossyList[str] = field(default_factory=LossyList) - # Lineage metrics - lineage_edges_extracted: int = 0 + # Lineage metrics - split by type for better visibility + dataset_lineage_extracted: int = 0 # Dataset-to-dataset lineage (Copy activities) + pipeline_lineage_extracted: int = ( + 0 # Pipeline-to-pipeline lineage (ExecutePipeline) + ) + dataflow_lineage_extracted: int = 0 # Data Flow source/sink lineage lineage_extraction_failures: int = 0 datasets_with_lineage: int = 0 datasets_without_platform_mapping: LossyList[str] = field(default_factory=LossyList) @@ -39,9 +44,11 @@ class AzureDataFactorySourceReport(StaleEntityRemovalSourceReport): pipeline_runs_scanned: int = 0 activity_runs_scanned: int = 0 - # API metrics - api_calls: int = 0 - api_errors: int = 0 + # API metrics - granular tracking by endpoint type + api_calls_total_count: int = 0 + api_calls_total_error_count: int = 0 + api_call_counts_by_type: Dict[str, int] = field(default_factory=dict) + total_api_response_time_seconds: float = 0.0 def report_factory_scanned(self) -> None: 
"""Increment factories scanned counter.""" @@ -79,9 +86,18 @@ def report_trigger_scanned(self) -> None: """Increment triggers scanned counter.""" self.triggers_scanned += 1 - def report_lineage_extracted(self) -> None: - """Increment lineage edges counter.""" - self.lineage_edges_extracted += 1 + def report_lineage_extracted(self, lineage_type: str = "dataset") -> None: + """Increment lineage edges counter by type. + + Args: + lineage_type: One of "dataset", "pipeline", or "dataflow" + """ + if lineage_type == "dataset": + self.dataset_lineage_extracted += 1 + elif lineage_type == "pipeline": + self.pipeline_lineage_extracted += 1 + elif lineage_type == "dataflow": + self.dataflow_lineage_extracted += 1 self.datasets_with_lineage += 1 def report_lineage_failed(self, entity_name: str, error: str) -> None: @@ -109,13 +125,24 @@ def report_activity_run_scanned(self) -> None: """Increment activity runs scanned counter.""" self.activity_runs_scanned += 1 - def report_api_call(self) -> None: - """Track an API call.""" - self.api_calls += 1 + def report_api_call( + self, api_type: str = "general", duration_seconds: float = 0.0 + ) -> None: + """Track an API call with timing. 
+ + Args: + api_type: Type of API call (e.g., "factories", "pipelines", "datasets") + duration_seconds: Time taken for the API call + """ + self.api_calls_total_count += 1 + self.total_api_response_time_seconds += duration_seconds + if api_type not in self.api_call_counts_by_type: + self.api_call_counts_by_type[api_type] = 0 + self.api_call_counts_by_type[api_type] += 1 def report_api_error(self, endpoint: str, error: str) -> None: """Record an API error.""" - self.api_errors += 1 + self.api_calls_total_error_count += 1 self.report_warning( title="API Error", message="Failed to call Azure Data Factory API.", diff --git a/metadata-ingestion/src/datahub/ingestion/source/azure_data_factory/adf_source.py b/metadata-ingestion/src/datahub/ingestion/source/azure_data_factory/adf_source.py index bdb96077af00af..aac2367c3d2612 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/azure_data_factory/adf_source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/azure_data_factory/adf_source.py @@ -88,18 +88,20 @@ MAX_RUN_PARAMETERS = 10 # Limit number of parameters to store MAX_PARAMETER_VALUE_LENGTH = 100 # Truncate long parameter values -# Mapping of ADF linked service types to DataHub platforms +# Mapping of ADF linked service types to DataHub platforms. 
+# Platform identifiers must match those defined in: +# metadata-service/configuration/src/main/resources/bootstrap_mcps/data-platforms.yaml LINKED_SERVICE_PLATFORM_MAP: dict[str, str] = { - # Azure Storage - "AzureBlobStorage": "azure_blob_storage", - "AzureBlobFS": "azure_data_lake", - "AzureDataLakeStore": "azure_data_lake", - "AzureDataLakeStoreCosmosStructuredStream": "azure_data_lake", - "AzureFileStorage": "azure_file_storage", - # Azure Databases + # Azure Storage - all Azure storage types map to "abs" (Azure Blob Storage) + "AzureBlobStorage": "abs", + "AzureBlobFS": "abs", # Azure Data Lake Storage Gen2 (uses abfs:// protocol) + "AzureDataLakeStore": "abs", # Azure Data Lake Storage Gen1 + "AzureDataLakeStoreCosmosStructuredStream": "abs", + "AzureFileStorage": "abs", + # Azure Databases - Synapse uses mssql protocol "AzureSqlDatabase": "mssql", - "AzureSqlDW": "synapse", - "AzureSynapseAnalytics": "synapse", + "AzureSqlDW": "mssql", # Azure Synapse (formerly SQL DW) + "AzureSynapseAnalytics": "mssql", # Azure Synapse Analytics "AzureSqlMI": "mssql", "SqlServer": "mssql", "AzurePostgreSql": "postgres", @@ -234,7 +236,12 @@ def __init__(self, config: AzureDataFactoryConfig, ctx: PipelineContext) -> None subscription_id=config.subscription_id, ) - # Cache for datasets, linked services, data flows, pipelines, and triggers (per factory) + # Cache for datasets, linked services, data flows, pipelines, and triggers. 
+ # Structure: {factory_key: {resource_name: resource_object}} + # - factory_key: "{resource_group}/{factory_name}" - uniquely identifies a factory + # - resource_name: Name of the ADF resource (e.g., "MyDataset", "MyPipeline") + # - resource_object: Parsed ADF resource model + # These caches enable resolution of cross-references (e.g., dataset -> linked service) self._datasets_cache: dict[str, dict[str, AdfDataset]] = {} self._linked_services_cache: dict[str, dict[str, LinkedService]] = {} self._data_flows_cache: dict[str, dict[str, AdfDataFlow]] = {} @@ -616,7 +623,7 @@ def _extract_activity_inputs( ) if dataset_urn: inputs.append(str(dataset_urn)) - self.report.report_lineage_extracted() + self.report.report_lineage_extracted("dataset") # Process Data Flow activities - extract sources as inputs if activity.type == "ExecuteDataFlow": @@ -650,7 +657,7 @@ def _extract_activity_outputs( ) if dataset_urn: outputs.append(str(dataset_urn)) - self.report.report_lineage_extracted() + self.report.report_lineage_extracted("dataset") # Process Data Flow activities - extract sinks as outputs if activity.type == "ExecuteDataFlow": @@ -816,7 +823,7 @@ def _extract_data_flow_endpoints( ) if dataset_urn: urns.append(str(dataset_urn)) - self.report.report_lineage_extracted() + self.report.report_lineage_extracted("dataflow") logger.debug( f"Extracted Data Flow {endpoint_label}: {endpoint.name} -> {dataset_urn}" ) @@ -911,7 +918,7 @@ def _emit_pipeline_lineage( current_props["child_first_activity"] = first_activity_name datajob.set_custom_properties(current_props) - self.report.report_lineage_extracted() + self.report.report_lineage_extracted("pipeline") # Emit DataJobInputOutput on the CHILD's first activity, setting ExecutePipeline as upstream # This creates lineage: ExecutePipeline -> ChildFirstActivity diff --git a/metadata-ingestion/tests/integration/azure_data_factory/adf_basic_golden.json b/metadata-ingestion/tests/integration/azure_data_factory/adf_basic_golden.json index 
eab65cfd2c3a9b..9ef535630cce09 100644 --- a/metadata-ingestion/tests/integration/azure_data_factory/adf_basic_golden.json +++ b/metadata-ingestion/tests/integration/azure_data_factory/adf_basic_golden.json @@ -420,7 +420,7 @@ "aspect": { "json": { "inputDatasets": [ - "urn:li:dataset:(urn:li:dataPlatform:azure_blob_storage,raw-data/input/data.csv,PROD)" + "urn:li:dataset:(urn:li:dataPlatform:abs,raw-data/input/data.csv,PROD)" ], "outputDatasets": [ "urn:li:dataset:(urn:li:dataPlatform:mssql,ProcessedData,PROD)" @@ -584,7 +584,7 @@ "aspect": { "json": { "inputDatasets": [ - "urn:li:dataset:(urn:li:dataPlatform:azure_blob_storage,config/settings.json,PROD)" + "urn:li:dataset:(urn:li:dataPlatform:abs,config/settings.json,PROD)" ], "outputDatasets": [] } diff --git a/metadata-ingestion/tests/integration/azure_data_factory/adf_mixed_deps_golden.json b/metadata-ingestion/tests/integration/azure_data_factory/adf_mixed_deps_golden.json index f45d4aafc4121b..a06aee44d61042 100644 --- a/metadata-ingestion/tests/integration/azure_data_factory/adf_mixed_deps_golden.json +++ b/metadata-ingestion/tests/integration/azure_data_factory/adf_mixed_deps_golden.json @@ -347,10 +347,10 @@ "aspect": { "json": { "inputDatasets": [ - "urn:li:dataset:(urn:li:dataPlatform:azure_blob_storage,staging/customers,DEV)" + "urn:li:dataset:(urn:li:dataPlatform:abs,staging/customers,DEV)" ], "outputDatasets": [ - "urn:li:dataset:(urn:li:dataPlatform:synapse,DimCustomers,DEV)" + "urn:li:dataset:(urn:li:dataPlatform:mssql,DimCustomers,DEV)" ] } }, @@ -656,7 +656,7 @@ "urn:li:dataset:(urn:li:dataPlatform:mssql,Customers,DEV)" ], "outputDatasets": [ - "urn:li:dataset:(urn:li:dataPlatform:azure_blob_storage,staging/customers,DEV)" + "urn:li:dataset:(urn:li:dataPlatform:abs,staging/customers,DEV)" ] } }, @@ -852,10 +852,10 @@ "aspect": { "json": { "inputDatasets": [ - "urn:li:dataset:(urn:li:dataPlatform:synapse,DimCustomers,DEV)" + "urn:li:dataset:(urn:li:dataPlatform:mssql,DimCustomers,DEV)" ], 
"outputDatasets": [ - "urn:li:dataset:(urn:li:dataPlatform:azure_data_lake,sales_summary,DEV)" + "urn:li:dataset:(urn:li:dataPlatform:abs,sales_summary,DEV)" ] } }, diff --git a/metadata-ingestion/tests/integration/azure_data_factory/adf_multisource_golden.json b/metadata-ingestion/tests/integration/azure_data_factory/adf_multisource_golden.json index 830c84f0b559cb..4061b36570ef9f 100644 --- a/metadata-ingestion/tests/integration/azure_data_factory/adf_multisource_golden.json +++ b/metadata-ingestion/tests/integration/azure_data_factory/adf_multisource_golden.json @@ -146,10 +146,10 @@ "aspect": { "json": { "inputDatasets": [ - "urn:li:dataset:(urn:li:dataPlatform:azure_blob_storage,staging/customers,DEV)" + "urn:li:dataset:(urn:li:dataPlatform:abs,staging/customers,DEV)" ], "outputDatasets": [ - "urn:li:dataset:(urn:li:dataPlatform:azure_data_lake,sales,DEV)" + "urn:li:dataset:(urn:li:dataPlatform:abs,sales,DEV)" ] } }, @@ -346,7 +346,7 @@ "urn:li:dataset:(urn:li:dataPlatform:mssql,Orders,DEV)" ], "outputDatasets": [ - "urn:li:dataset:(urn:li:dataPlatform:azure_blob_storage,staging/orders,DEV)" + "urn:li:dataset:(urn:li:dataPlatform:abs,staging/orders,DEV)" ] } }, @@ -435,7 +435,7 @@ "urn:li:dataset:(urn:li:dataPlatform:mssql,Customers,DEV)" ], "outputDatasets": [ - "urn:li:dataset:(urn:li:dataPlatform:azure_blob_storage,staging/customers,DEV)" + "urn:li:dataset:(urn:li:dataPlatform:abs,staging/customers,DEV)" ] } }, @@ -571,10 +571,10 @@ "aspect": { "json": { "inputDatasets": [ - "urn:li:dataset:(urn:li:dataPlatform:azure_blob_storage,staging/orders,DEV)" + "urn:li:dataset:(urn:li:dataPlatform:abs,staging/orders,DEV)" ], "outputDatasets": [ - "urn:li:dataset:(urn:li:dataPlatform:synapse,FactOrders,DEV)" + "urn:li:dataset:(urn:li:dataPlatform:mssql,FactOrders,DEV)" ] } }, @@ -708,10 +708,10 @@ "aspect": { "json": { "inputDatasets": [ - "urn:li:dataset:(urn:li:dataPlatform:azure_blob_storage,staging/customers,DEV)" + 
"urn:li:dataset:(urn:li:dataPlatform:abs,staging/customers,DEV)" ], "outputDatasets": [ - "urn:li:dataset:(urn:li:dataPlatform:synapse,DimCustomers,DEV)" + "urn:li:dataset:(urn:li:dataPlatform:mssql,DimCustomers,DEV)" ] } }, diff --git a/metadata-ingestion/tests/integration/azure_data_factory/adf_nested_golden.json b/metadata-ingestion/tests/integration/azure_data_factory/adf_nested_golden.json index a1ba40c1aa7a5e..62ba1ab0e9c1d2 100644 --- a/metadata-ingestion/tests/integration/azure_data_factory/adf_nested_golden.json +++ b/metadata-ingestion/tests/integration/azure_data_factory/adf_nested_golden.json @@ -551,7 +551,7 @@ "urn:li:dataset:(urn:li:dataPlatform:mssql,Customers,DEV)" ], "outputDatasets": [ - "urn:li:dataset:(urn:li:dataPlatform:azure_blob_storage,staging/customers,DEV)" + "urn:li:dataset:(urn:li:dataPlatform:abs,staging/customers,DEV)" ] } }, diff --git a/metadata-ingestion/tests/integration/azure_data_factory/adf_platform_instance_golden.json b/metadata-ingestion/tests/integration/azure_data_factory/adf_platform_instance_golden.json index 5da85bdc49314c..72c59841446d22 100644 --- a/metadata-ingestion/tests/integration/azure_data_factory/adf_platform_instance_golden.json +++ b/metadata-ingestion/tests/integration/azure_data_factory/adf_platform_instance_golden.json @@ -261,7 +261,7 @@ "aspect": { "json": { "inputDatasets": [ - "urn:li:dataset:(urn:li:dataPlatform:azure_blob_storage,raw-data/input/data.csv,DEV)" + "urn:li:dataset:(urn:li:dataPlatform:abs,raw-data/input/data.csv,DEV)" ], "outputDatasets": [ "urn:li:dataset:(urn:li:dataPlatform:mssql,ProcessedData,DEV)" @@ -479,7 +479,7 @@ "aspect": { "json": { "inputDatasets": [ - "urn:li:dataset:(urn:li:dataPlatform:azure_blob_storage,config/settings.json,DEV)" + "urn:li:dataset:(urn:li:dataPlatform:abs,config/settings.json,DEV)" ], "outputDatasets": [] } diff --git a/metadata-ingestion/tests/integration/azure_data_factory/adf_with_runs_golden.json 
b/metadata-ingestion/tests/integration/azure_data_factory/adf_with_runs_golden.json index 4d324a76d50694..8db2216f22c5e5 100644 --- a/metadata-ingestion/tests/integration/azure_data_factory/adf_with_runs_golden.json +++ b/metadata-ingestion/tests/integration/azure_data_factory/adf_with_runs_golden.json @@ -245,7 +245,7 @@ "aspect": { "json": { "inputDatasets": [ - "urn:li:dataset:(urn:li:dataPlatform:azure_blob_storage,raw-data/input/data.csv,PROD)" + "urn:li:dataset:(urn:li:dataPlatform:abs,raw-data/input/data.csv,PROD)" ], "outputDatasets": [ "urn:li:dataset:(urn:li:dataPlatform:mssql,ProcessedData,PROD)" @@ -354,7 +354,7 @@ "aspect": { "json": { "inputDatasets": [ - "urn:li:dataset:(urn:li:dataPlatform:azure_blob_storage,config/settings.json,PROD)" + "urn:li:dataset:(urn:li:dataPlatform:abs,config/settings.json,PROD)" ], "outputDatasets": [] } diff --git a/metadata-ingestion/tests/integration/azure_data_factory/test_complex_pipelines.py b/metadata-ingestion/tests/integration/azure_data_factory/test_complex_pipelines.py index 2c09ad3919cfbf..c1053b62cae15d 100644 --- a/metadata-ingestion/tests/integration/azure_data_factory/test_complex_pipelines.py +++ b/metadata-ingestion/tests/integration/azure_data_factory/test_complex_pipelines.py @@ -30,7 +30,7 @@ 5. 
**Multi-Source ETL Pipelines** - Full ETL chains: SQL → Blob → Synapse → DataLake - Tests end-to-end lineage across multiple hops - - Validates platform mapping (mssql, azure_blob_storage, synapse) + - Validates platform mapping (mssql, abs) Why These Tests Matter: ====================== @@ -173,7 +173,7 @@ def create_mock_client( mock_client.datasets.list_by_factory.return_value = MockPagedIterator(datasets) # Mock linked services - determine the platform type for datasets - # (e.g., AzureSqlDatabase → mssql, AzureBlobStorage → azure_blob_storage) + # (e.g., AzureSqlDatabase → mssql, AzureBlobStorage → abs) mock_client.linked_services.list_by_factory.return_value = MockPagedIterator( linked_services ) @@ -662,9 +662,9 @@ def test_dataflow_pipeline_with_lineage(pytestconfig, tmp_path): # - Multi-hop lineage is captured correctly # - Platform mapping works for different linked services: # - AzureSqlDatabase → mssql -# - AzureBlobStorage → azure_blob_storage -# - AzureSynapseAnalytics → synapse -# - AzureBlobFS → azure_data_lake +# - AzureBlobStorage → abs +# - AzureSynapseAnalytics → mssql +# - AzureBlobFS → abs # - Dependencies between activities are respected # # Why this matters: @@ -686,10 +686,8 @@ def test_multisource_etl_pipeline(pytestconfig, tmp_path): The test verifies: - All Copy activities are captured with correct subtypes - Platform mapping produces correct URNs: - - mssql for SQL datasets - - azure_blob_storage for Blob datasets - - synapse for Synapse datasets - - azure_data_lake for Data Lake datasets + - mssql for SQL and Synapse datasets + - abs for Blob and Data Lake datasets - Activity dependencies are reflected in job order """ test_resources_dir = pytestconfig.rootpath / "tests/integration/azure_data_factory" @@ -755,7 +753,7 @@ def test_multisource_lineage_accuracy(tmp_path): This test programmatically inspects the generated MCPs to verify that: 1. dataJobInputOutput aspects are emitted (lineage is captured) 2. 
SQL sources appear as input datasets with 'mssql' platform - 3. Synapse destinations appear as output datasets with 'synapse' platform + 3. Synapse destinations appear as output datasets with 'mssql' platform (Synapse uses mssql protocol) This complements the golden file test by focusing on specific lineage properties that are critical for data governance use cases. @@ -823,10 +821,11 @@ def test_multisource_lineage_accuracy(tmp_path): assert len(sql_inputs) > 0, "Expected SQL dataset inputs with 'mssql' platform" # Verify Synapse destinations are captured with correct platform - # Synapse outputs should have URNs containing 'synapse' - synapse_outputs = [o for o in all_outputs if "synapse" in o] - assert len(synapse_outputs) > 0, ( - "Expected Synapse dataset outputs with 'synapse' platform" + # Synapse outputs should have URNs containing 'mssql' (Synapse uses mssql protocol) + # Check for output datasets that contain common Synapse table naming patterns + mssql_outputs = [o for o in all_outputs if "mssql" in o] + assert len(mssql_outputs) > 0, ( + "Expected Synapse dataset outputs with 'mssql' platform" ) @@ -1367,20 +1366,21 @@ def test_mixed_pipeline_and_dataset_dependencies(tmp_path: Path) -> None: "TransformInMain should have at least 1 input dataset" ) # The URN uses platform and dataset path from typeProperties, not the ADF dataset name - # BlobStagingCustomers maps to azure_blob_storage platform with path staging/customers + # BlobStagingCustomers maps to abs platform with path staging/customers assert any( - "azure_blob_storage" in urn or "staging" in urn - for urn in transform_lineage["inputs"] + "abs" in urn or "staging" in urn for urn in transform_lineage["inputs"] ), f"TransformInMain should read from blob storage: {transform_lineage['inputs']}" - # TransformInMain should write to SynapseCustomersDim (synapse) + # TransformInMain should write to SynapseSalesTable (mssql platform) assert len(transform_lineage["outputs"]) >= 1, ( "TransformInMain should 
have at least 1 output dataset" ) - # SynapseCustomersDim maps to synapse platform with schema Sales.CustomersDim + # SynapseSalesTable maps to mssql platform (Synapse uses mssql protocol) assert any( - "synapse" in urn or "Customers" in urn for urn in transform_lineage["outputs"] - ), f"TransformInMain should write to synapse: {transform_lineage['outputs']}" + "mssql" in urn or "Sales" in urn for urn in transform_lineage["outputs"] + ), ( + f"TransformInMain should write to Synapse (mssql): {transform_lineage['outputs']}" + ) # ========================================================================= # Verify Both Lineage Types Coexist diff --git a/metadata-ingestion/tests/unit/azure_data_factory/test_adf_source.py b/metadata-ingestion/tests/unit/azure_data_factory/test_adf_source.py index 59ba8ddf00d3ac..8ca3d574053d9a 100644 --- a/metadata-ingestion/tests/unit/azure_data_factory/test_adf_source.py +++ b/metadata-ingestion/tests/unit/azure_data_factory/test_adf_source.py @@ -35,12 +35,12 @@ def test_azure_sql_variants_map_to_mssql(self) -> None: f"{sql_type} should map to 'mssql'" ) - def test_synapse_variants_map_correctly(self) -> None: - """Azure Synapse variants should map to synapse platform.""" + def test_synapse_variants_map_to_mssql(self) -> None: + """Azure Synapse variants should map to mssql platform (same protocol).""" synapse_types = ["AzureSynapseAnalytics", "AzureSqlDW"] for synapse_type in synapse_types: - assert LINKED_SERVICE_PLATFORM_MAP.get(synapse_type) == "synapse", ( - f"{synapse_type} should map to 'synapse'" + assert LINKED_SERVICE_PLATFORM_MAP.get(synapse_type) == "mssql", ( + f"{synapse_type} should map to 'mssql'" ) def test_databricks_variants_map_correctly(self) -> None: @@ -51,11 +51,11 @@ def test_databricks_variants_map_correctly(self) -> None: f"{db_type} should map to 'databricks'" ) - def test_azure_storage_types_map_to_distinct_platforms(self) -> None: - """Different Azure storage types should map to distinct platforms.""" - 
assert LINKED_SERVICE_PLATFORM_MAP["AzureBlobStorage"] == "azure_blob_storage" - assert LINKED_SERVICE_PLATFORM_MAP["AzureBlobFS"] == "azure_data_lake" - assert LINKED_SERVICE_PLATFORM_MAP["AzureDataLakeStore"] == "azure_data_lake" + def test_azure_storage_types_map_to_abs_platform(self) -> None: + """All Azure storage types should map to abs (Azure Blob Storage) platform.""" + assert LINKED_SERVICE_PLATFORM_MAP["AzureBlobStorage"] == "abs" + assert LINKED_SERVICE_PLATFORM_MAP["AzureBlobFS"] == "abs" + assert LINKED_SERVICE_PLATFORM_MAP["AzureDataLakeStore"] == "abs" def test_major_cloud_databases_covered(self) -> None: """Major cloud databases should be mapped.""" From c6943ee55fd8e495ab1f9b3966f199d1533c8ac1 Mon Sep 17 00:00:00 2001 From: Anush Kumar Date: Thu, 11 Dec 2025 14:32:25 -0800 Subject: [PATCH 11/13] refactor(azure-data-factory): streamline configuration options and lineage caching - Removed default inclusion of datasets, linked services, and triggers from the Azure Data Factory configuration. - Updated lineage caching logic to rely on a single `include_lineage` option for better clarity and efficiency. - Adjusted related documentation to reflect the changes in configuration and caching behavior. 
--- .../azure-data-factory_recipe.yml | 3 --- .../source/azure_data_factory/adf_config.py | 27 ------------------- .../source/azure_data_factory/adf_source.py | 24 +++++++---------- 3 files changed, 10 insertions(+), 44 deletions(-) diff --git a/metadata-ingestion/docs/sources/azure-data-factory/azure-data-factory_recipe.yml b/metadata-ingestion/docs/sources/azure-data-factory/azure-data-factory_recipe.yml index 175ecad33e8c27..e07c71556b0649 100644 --- a/metadata-ingestion/docs/sources/azure-data-factory/azure-data-factory_recipe.yml +++ b/metadata-ingestion/docs/sources/azure-data-factory/azure-data-factory_recipe.yml @@ -34,9 +34,6 @@ source: include_column_lineage: false # Advanced: requires Data Flow parsing include_execution_history: false # Set to true for pipeline run history execution_history_days: 7 # Only used when include_execution_history is true - include_datasets: true - include_linked_services: true - include_triggers: true # Optional: Map linked services to platform instances for accurate lineage # platform_instance_map: diff --git a/metadata-ingestion/src/datahub/ingestion/source/azure_data_factory/adf_config.py b/metadata-ingestion/src/datahub/ingestion/source/azure_data_factory/adf_config.py index 7b00b53eca0742..2db88fecd2773d 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/azure_data_factory/adf_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/azure_data_factory/adf_config.py @@ -113,33 +113,6 @@ class AzureDataFactoryConfig( le=90, ) - include_datasets: bool = Field( - default=True, - description=( - "Extract ADF dataset definitions to enable lineage resolution. " - "When enabled, the connector reads dataset configurations (linked service, " - "table names, file paths) to map ADF datasets to DataHub dataset URNs. " - "This is required for table-level lineage. Disable only if you want to " - "extract just pipeline/activity structure without lineage." 
- ), - ) - - include_linked_services: bool = Field( - default=True, - description=( - "Include linked service connection information as custom properties. " - "Sensitive connection strings are not extracted." - ), - ) - - include_triggers: bool = Field( - default=True, - description=( - "Include trigger information as custom properties on pipelines. " - "Shows schedule and event triggers associated with pipelines." - ), - ) - # Platform Mapping platform_instance_map: dict[str, str] = Field( default_factory=dict, diff --git a/metadata-ingestion/src/datahub/ingestion/source/azure_data_factory/adf_source.py b/metadata-ingestion/src/datahub/ingestion/source/azure_data_factory/adf_source.py index aac2367c3d2612..f0a9c9fd281dd2 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/azure_data_factory/adf_source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/azure_data_factory/adf_source.py @@ -313,29 +313,28 @@ def _cache_factory_resources(self, resource_group: str, factory_name: str) -> No """Cache datasets and linked services for a factory.""" factory_key = f"{resource_group}/{factory_name}" - # Cache datasets - if self.config.include_datasets: + # Cache datasets (needed for lineage resolution) + if self.config.include_lineage: self._datasets_cache[factory_key] = {} for dataset in self.client.get_datasets(resource_group, factory_name): self.report.report_api_call() self.report.report_dataset_scanned() self._datasets_cache[factory_key][dataset.name] = dataset - # Cache linked services - if self.config.include_linked_services: + # Cache linked services (needed for lineage resolution - maps datasets to platforms) + if self.config.include_lineage: self._linked_services_cache[factory_key] = {} for ls in self.client.get_linked_services(resource_group, factory_name): self.report.report_api_call() self.report.report_linked_service_scanned() self._linked_services_cache[factory_key][ls.name] = ls - # Cache triggers - if self.config.include_triggers: - 
self._triggers_cache[factory_key] = [] - for trigger in self.client.get_triggers(resource_group, factory_name): - self.report.report_api_call() - self.report.report_trigger_scanned() - self._triggers_cache[factory_key].append(trigger) + # Cache triggers (for custom properties on pipelines) + self._triggers_cache[factory_key] = [] + for trigger in self.client.get_triggers(resource_group, factory_name): + self.report.report_api_call() + self.report.report_trigger_scanned() + self._triggers_cache[factory_key].append(trigger) # Cache data flows (for lineage extraction from Data Flow activities) if self.config.include_lineage: @@ -525,9 +524,6 @@ def _get_pipeline_triggers( self, resource_group: str, factory_name: str, pipeline_name: str ) -> list[str]: """Get trigger names associated with a pipeline.""" - if not self.config.include_triggers: - return [] - factory_key = f"{resource_group}/{factory_name}" triggers = self._triggers_cache.get(factory_key, []) From be67bb57be3e618b324af3555d0e3461e9816352 Mon Sep 17 00:00:00 2001 From: Anush Kumar Date: Thu, 11 Dec 2025 15:25:28 -0800 Subject: [PATCH 12/13] feat(azure-data-factory): implement activity run extraction for enhanced metadata tracking - Added functionality to emit activity runs as DataProcessInstance entities linked to DataJobs, improving the granularity of execution history. - Introduced a new method `_emit_activity_runs` to handle the extraction and mapping of activity run properties, including status, duration, and error handling. - Updated integration tests to validate the extraction of activity runs and their properties, ensuring accurate representation in the DataHub UI. - Enhanced unit tests to cover activity run property extraction and URN mapping, ensuring robustness in handling various scenarios. 
--- .../source/azure_data_factory/adf_source.py | 96 +++ .../adf_with_runs_golden.json | 655 +++++++++++++++++- .../azure_data_factory/test_adf_source.py | 146 +++- .../azure_data_factory/test_adf_source.py | 171 +++++ 4 files changed, 1050 insertions(+), 18 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/azure_data_factory/adf_source.py b/metadata-ingestion/src/datahub/ingestion/source/azure_data_factory/adf_source.py index f0a9c9fd281dd2..96087a5e79a1f1 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/azure_data_factory/adf_source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/azure_data_factory/adf_source.py @@ -1128,6 +1128,9 @@ def _emit_pipeline_run( ): yield mcp.as_workunit() + # Emit activity runs for this pipeline run + yield from self._emit_activity_runs(pipeline_run, factory, resource_group) + def _map_run_status(self, status: str) -> Optional[InstanceRunResult]: """Map ADF run status to DataHub InstanceRunResult.""" status_map = { @@ -1151,6 +1154,99 @@ def _get_pipeline_run_url( f"/providers/Microsoft.DataFactory/factories/{factory.name}" ) + def _emit_activity_runs( + self, + pipeline_run: PipelineRun, + factory: Factory, + resource_group: str, + ) -> Iterable[MetadataWorkUnit]: + """Emit activity runs as DataProcessInstance for each DataJob.""" + try: + for activity_run in self.client.get_activity_runs( + resource_group, + factory.name, + pipeline_run.run_id, + ): + self.report.report_api_call() + self.report.report_activity_run_scanned() + + # Build DataJob URN for the template + flow_name = f"{factory.name}.{activity_run.pipeline_name}" + flow_urn = DataFlowUrn.create_from_ids( + orchestrator=PLATFORM, + flow_id=flow_name, + env=self.config.env, + platform_instance=self.config.platform_instance, + ) + job_urn = DataJobUrn.create_from_ids( + data_flow_urn=str(flow_urn), + job_id=activity_run.activity_name, + ) + + # Map ADF status to InstanceRunResult + result = 
self._map_run_status(activity_run.status) + + # Build custom properties + properties: dict[str, str] = { + "activity_run_id": activity_run.activity_run_id, + "activity_type": activity_run.activity_type, + "pipeline_run_id": activity_run.pipeline_run_id, + "status": activity_run.status, + } + if activity_run.duration_in_ms is not None: + properties["duration_ms"] = str(activity_run.duration_in_ms) + if activity_run.error: + error_msg = str(activity_run.error.get("message", "")) + if error_msg: + properties["error"] = error_msg[:MAX_RUN_MESSAGE_LENGTH] + + # Create DataProcessInstance linked to DataJob + dpi = DataProcessInstance( + id=activity_run.activity_run_id, + orchestrator=PLATFORM, + cluster=self.config.env, + type=DataProcessTypeClass.BATCH_SCHEDULED, + template_urn=job_urn, + properties=properties, + url=self._get_pipeline_run_url( + factory, resource_group, pipeline_run.run_id + ), + data_platform_instance=self.config.platform_instance, + subtype="Activity Run", + ) + + # Emit the instance + for mcp in dpi.generate_mcp( + created_ts_millis=( + int(activity_run.activity_run_start.timestamp() * 1000) + if activity_run.activity_run_start + else None + ), + materialize_iolets=False, + ): + yield mcp.as_workunit() + + # Emit start event + if activity_run.activity_run_start: + start_ts = int(activity_run.activity_run_start.timestamp() * 1000) + for mcp in dpi.start_event_mcp(start_ts): + yield mcp.as_workunit() + + # Emit end event if run is complete + if activity_run.activity_run_end and result: + end_ts = int(activity_run.activity_run_end.timestamp() * 1000) + for mcp in dpi.end_event_mcp( + end_timestamp_millis=end_ts, + result=result, + result_type=activity_run.status, + ): + yield mcp.as_workunit() + + except Exception as e: + logger.warning( + f"Failed to fetch activity runs for pipeline run {pipeline_run.run_id}: {e}" + ) + def get_report(self) -> AzureDataFactorySourceReport: return self.report diff --git 
a/metadata-ingestion/tests/integration/azure_data_factory/adf_with_runs_golden.json b/metadata-ingestion/tests/integration/azure_data_factory/adf_with_runs_golden.json index 8db2216f22c5e5..964ed8b1d83877 100644 --- a/metadata-ingestion/tests/integration/azure_data_factory/adf_with_runs_golden.json +++ b/metadata-ingestion/tests/integration/azure_data_factory/adf_with_runs_golden.json @@ -769,24 +769,465 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:564015ffc3af551e6d8e26a5f4710ea5", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceProperties", + "aspect": { + "json": { + "customProperties": { + "activity_run_id": "act-001-copy", + "activity_type": "Copy", + "pipeline_run_id": "run-001-abc", + "status": "Succeeded", + "duration_ms": "900000" + }, + "externalUrl": "https://adf.azure.com/en/monitoring/pipelineruns/run-001-abc?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/test-resource-group/providers/Microsoft.DataFactory/factories/test-data-factory", + "name": "act-001-copy", + "type": "BATCH_SCHEDULED", + "created": { + "time": 1705305900000, + "actor": "urn:li:corpuser:datahub" + } + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:564015ffc3af551e6d8e26a5f4710ea5", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRelationships", + "aspect": { + "json": { + "parentTemplate": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD),CopyBlobToSQL)", + "upstreamInstances": [] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": 
"urn:li:dataProcessInstance:564015ffc3af551e6d8e26a5f4710ea5", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Activity Run" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:564015ffc3af551e6d8e26a5f4710ea5", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1705305900000, + "partitionSpec": { + "partition": "FULL_TABLE_SNAPSHOT", + "type": "FULL_TABLE" + }, + "status": "STARTED" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:564015ffc3af551e6d8e26a5f4710ea5", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1705306800000, + "partitionSpec": { + "partition": "FULL_TABLE_SNAPSHOT", + "type": "FULL_TABLE" + }, + "status": "COMPLETE", + "result": { + "type": "SUCCESS", + "nativeResultType": "Succeeded" + } + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:aaef305ee765d2fac00af4c0f4d859a7", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceProperties", + "aspect": { + "json": { + "customProperties": { + "activity_run_id": "act-001-lookup", + "activity_type": "Lookup", + "pipeline_run_id": "run-001-abc", + "status": "Succeeded", + "duration_ms": "60000" + }, + "externalUrl": 
"https://adf.azure.com/en/monitoring/pipelineruns/run-001-abc?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/test-resource-group/providers/Microsoft.DataFactory/factories/test-data-factory", + "name": "act-001-lookup", + "type": "BATCH_SCHEDULED", + "created": { + "time": 1705306800000, + "actor": "urn:li:corpuser:datahub" + } + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:aaef305ee765d2fac00af4c0f4d859a7", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRelationships", + "aspect": { + "json": { + "parentTemplate": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD),LookupConfig)", + "upstreamInstances": [] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:aaef305ee765d2fac00af4c0f4d859a7", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Activity Run" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:aaef305ee765d2fac00af4c0f4d859a7", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1705306800000, + "partitionSpec": { + "partition": "FULL_TABLE_SNAPSHOT", + "type": "FULL_TABLE" + }, + "status": "STARTED" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": 
"urn:li:dataProcessInstance:aaef305ee765d2fac00af4c0f4d859a7", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1705306860000, + "partitionSpec": { + "partition": "FULL_TABLE_SNAPSHOT", + "type": "FULL_TABLE" + }, + "status": "COMPLETE", + "result": { + "type": "SUCCESS", + "nativeResultType": "Succeeded" + } + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:be65c76a61bba7d112df51643744bd7d", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceProperties", + "aspect": { + "json": { + "customProperties": { + "activity_run_id": "act-001-transform", + "activity_type": "ExecuteDataFlow", + "pipeline_run_id": "run-001-abc", + "status": "Succeeded", + "duration_ms": "1440000" + }, + "externalUrl": "https://adf.azure.com/en/monitoring/pipelineruns/run-001-abc?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/test-resource-group/providers/Microsoft.DataFactory/factories/test-data-factory", + "name": "act-001-transform", + "type": "BATCH_SCHEDULED", + "created": { + "time": 1705306860000, + "actor": "urn:li:corpuser:datahub" + } + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:be65c76a61bba7d112df51643744bd7d", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRelationships", + "aspect": { + "json": { + "parentTemplate": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD),TransformData)", + "upstreamInstances": [] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + 
"entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:be65c76a61bba7d112df51643744bd7d", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Activity Run" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:be65c76a61bba7d112df51643744bd7d", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1705306860000, + "partitionSpec": { + "partition": "FULL_TABLE_SNAPSHOT", + "type": "FULL_TABLE" + }, + "status": "STARTED" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:be65c76a61bba7d112df51643744bd7d", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1705308300000, + "partitionSpec": { + "partition": "FULL_TABLE_SNAPSHOT", + "type": "FULL_TABLE" + }, + "status": "COMPLETE", + "result": { + "type": "SUCCESS", + "nativeResultType": "Succeeded" + } + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:1f3c62dd3b62b29a83221a57582a9bf3", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceProperties", + "aspect": { + "json": { + "customProperties": { + "run_id": "run-002-def", + "status": "Failed", + "invoked_by": "Manual", + "invoked_by_type": "Manual" + }, + "externalUrl": 
"https://adf.azure.com/en/monitoring/pipelineruns/run-002-def?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/test-resource-group/providers/Microsoft.DataFactory/factories/test-data-factory", + "name": "run-002-def", + "type": "BATCH_SCHEDULED", + "created": { + "time": 1705219200000, + "actor": "urn:li:corpuser:datahub" + } + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:1f3c62dd3b62b29a83221a57582a9bf3", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRelationships", + "aspect": { + "json": { + "parentTemplate": "urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD)", + "upstreamInstances": [] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:1f3c62dd3b62b29a83221a57582a9bf3", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Pipeline Run" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:1f3c62dd3b62b29a83221a57582a9bf3", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1705219200000, + "partitionSpec": { + "partition": "FULL_TABLE_SNAPSHOT", + "type": "FULL_TABLE" + }, + "status": "STARTED" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataProcessInstance", "entityUrn": "urn:li:dataProcessInstance:1f3c62dd3b62b29a83221a57582a9bf3", 
"changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1705220100000, + "partitionSpec": { + "partition": "FULL_TABLE_SNAPSHOT", + "type": "FULL_TABLE" + }, + "status": "COMPLETE", + "result": { + "type": "FAILURE", + "nativeResultType": "Failed" + } + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:c86ba22f37d917ce9c2933e1cc443fd5", + "changeType": "UPSERT", "aspectName": "dataProcessInstanceProperties", "aspect": { "json": { "customProperties": { - "run_id": "run-002-def", + "activity_run_id": "act-002-copy", + "activity_type": "Copy", + "pipeline_run_id": "run-002-def", "status": "Failed", - "invoked_by": "Manual", - "invoked_by_type": "Manual" + "duration_ms": "600000", + "error": "Connection timeout to SQL database" }, "externalUrl": "https://adf.azure.com/en/monitoring/pipelineruns/run-002-def?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/test-resource-group/providers/Microsoft.DataFactory/factories/test-data-factory", - "name": "run-002-def", + "name": "act-002-copy", "type": "BATCH_SCHEDULED", "created": { - "time": 1705219200000, + "time": 1705219500000, "actor": "urn:li:corpuser:datahub" } } @@ -799,12 +1240,12 @@ }, { "entityType": "dataProcessInstance", - "entityUrn": "urn:li:dataProcessInstance:1f3c62dd3b62b29a83221a57582a9bf3", + "entityUrn": "urn:li:dataProcessInstance:c86ba22f37d917ce9c2933e1cc443fd5", "changeType": "UPSERT", "aspectName": "dataProcessInstanceRelationships", "aspect": { "json": { - "parentTemplate": "urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD)", + "parentTemplate": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,test-data-factory.DataIngestionPipeline,PROD),CopyBlobToSQL)", "upstreamInstances": [] } }, @@ -816,13 +1257,13 @@ }, 
{ "entityType": "dataProcessInstance", - "entityUrn": "urn:li:dataProcessInstance:1f3c62dd3b62b29a83221a57582a9bf3", + "entityUrn": "urn:li:dataProcessInstance:c86ba22f37d917ce9c2933e1cc443fd5", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { "json": { "typeNames": [ - "Pipeline Run" + "Activity Run" ] } }, @@ -834,12 +1275,12 @@ }, { "entityType": "dataProcessInstance", - "entityUrn": "urn:li:dataProcessInstance:1f3c62dd3b62b29a83221a57582a9bf3", + "entityUrn": "urn:li:dataProcessInstance:c86ba22f37d917ce9c2933e1cc443fd5", "changeType": "UPSERT", "aspectName": "dataProcessInstanceRunEvent", "aspect": { "json": { - "timestampMillis": 1705219200000, + "timestampMillis": 1705219500000, "partitionSpec": { "partition": "FULL_TABLE_SNAPSHOT", "type": "FULL_TABLE" @@ -855,7 +1296,7 @@ }, { "entityType": "dataProcessInstance", - "entityUrn": "urn:li:dataProcessInstance:1f3c62dd3b62b29a83221a57582a9bf3", + "entityUrn": "urn:li:dataProcessInstance:c86ba22f37d917ce9c2933e1cc443fd5", "changeType": "UPSERT", "aspectName": "dataProcessInstanceRunEvent", "aspect": { @@ -987,6 +1428,116 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:215721aa606120d23beb87553668afb1", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceProperties", + "aspect": { + "json": { + "customProperties": { + "activity_run_id": "act-003-proc", + "activity_type": "SqlServerStoredProcedure", + "pipeline_run_id": "run-003-ghi", + "status": "Succeeded", + "duration_ms": "1500000" + }, + "externalUrl": "https://adf.azure.com/en/monitoring/pipelineruns/run-003-ghi?factory=/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/test-resource-group/providers/Microsoft.DataFactory/factories/test-data-factory", + "name": "act-003-proc", + "type": "BATCH_SCHEDULED", + "created": { + "time": 1705309500000, + "actor": "urn:li:corpuser:datahub" + } + } + }, + "systemMetadata": { + "lastObserved": 
1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:215721aa606120d23beb87553668afb1", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRelationships", + "aspect": { + "json": { + "parentTemplate": "urn:li:dataJob:(urn:li:dataFlow:(azure-data-factory,test-data-factory.DataProcessingPipeline,PROD),CallStoredProc)", + "upstreamInstances": [] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:215721aa606120d23beb87553668afb1", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Activity Run" + ] + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:215721aa606120d23beb87553668afb1", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1705309500000, + "partitionSpec": { + "partition": "FULL_TABLE_SNAPSHOT", + "type": "FULL_TABLE" + }, + "status": "STARTED" + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:215721aa606120d23beb87553668afb1", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1705311000000, + "partitionSpec": { + "partition": "FULL_TABLE_SNAPSHOT", + "type": "FULL_TABLE" + }, + "status": "COMPLETE", + "result": { + "type": "SUCCESS", + "nativeResultType": "Succeeded" + } + } + }, + "systemMetadata": { + "lastObserved": 
1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "container", "entityUrn": "urn:li:container:cdaebfa861d2b2a3853328719496bce5", @@ -1115,6 +1666,38 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:215721aa606120d23beb87553668afb1", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:564015ffc3af551e6d8e26a5f4710ea5", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataProcessInstance", "entityUrn": "urn:li:dataProcessInstance:7aa70b5e31344dc1946c045ef1df4619", @@ -1146,5 +1729,53 @@ "runId": "adf-test-with-runs", "lastRunId": "no-run-id-provided" } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:aaef305ee765d2fac00af4c0f4d859a7", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:be65c76a61bba7d112df51643744bd7d", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": 
"urn:li:dataProcessInstance:c86ba22f37d917ce9c2933e1cc443fd5", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1705320000000, + "runId": "adf-test-with-runs", + "lastRunId": "no-run-id-provided" + } } ] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/azure_data_factory/test_adf_source.py b/metadata-ingestion/tests/integration/azure_data_factory/test_adf_source.py index d3a76433dfdaaa..17462be3a06f65 100644 --- a/metadata-ingestion/tests/integration/azure_data_factory/test_adf_source.py +++ b/metadata-ingestion/tests/integration/azure_data_factory/test_adf_source.py @@ -191,6 +191,39 @@ def create_mock_pipeline_run( } +def create_mock_activity_run( + activity_run_id: str, + activity_name: str, + activity_type: str, + pipeline_run_id: str, + pipeline_name: str, + status: str = "Succeeded", + start_time: Optional[datetime] = None, + end_time: Optional[datetime] = None, + duration_ms: int = 30000, + error: Optional[Dict[str, Any]] = None, +) -> Dict[str, Any]: + """Create a mock activity run response.""" + return { + "activityRunId": activity_run_id, + "activityName": activity_name, + "activityType": activity_type, + "pipelineRunId": pipeline_run_id, + "pipelineName": pipeline_name, + "status": status, + "activityRunStart": ( + start_time or datetime(2024, 1, 15, 10, 5, 0, tzinfo=timezone.utc) + ).isoformat(), + "activityRunEnd": ( + end_time or datetime(2024, 1, 15, 10, 10, 0, tzinfo=timezone.utc) + ).isoformat(), + "durationInMs": duration_ms, + "input": {}, + "output": {}, + "error": error, + } + + class MockAzureResource: """Mock class to simulate Azure SDK resource objects.""" @@ -388,6 +421,76 @@ def get_mock_test_data() -> Dict[str, Any]: ), ] + # Create activity runs for each pipeline run + # Activity runs are linked to DataJobs (activities), not DataFlows (pipelines) + activity_runs = { + "run-001-abc": [ # DataIngestionPipeline - 
Succeeded + create_mock_activity_run( + activity_run_id="act-001-copy", + activity_name="CopyBlobToSQL", + activity_type="Copy", + pipeline_run_id="run-001-abc", + pipeline_name="DataIngestionPipeline", + status="Succeeded", + start_time=datetime(2024, 1, 15, 8, 5, 0, tzinfo=timezone.utc), + end_time=datetime(2024, 1, 15, 8, 20, 0, tzinfo=timezone.utc), + duration_ms=900000, + ), + create_mock_activity_run( + activity_run_id="act-001-lookup", + activity_name="LookupConfig", + activity_type="Lookup", + pipeline_run_id="run-001-abc", + pipeline_name="DataIngestionPipeline", + status="Succeeded", + start_time=datetime(2024, 1, 15, 8, 20, 0, tzinfo=timezone.utc), + end_time=datetime(2024, 1, 15, 8, 21, 0, tzinfo=timezone.utc), + duration_ms=60000, + ), + create_mock_activity_run( + activity_run_id="act-001-transform", + activity_name="TransformData", + activity_type="ExecuteDataFlow", + pipeline_run_id="run-001-abc", + pipeline_name="DataIngestionPipeline", + status="Succeeded", + start_time=datetime(2024, 1, 15, 8, 21, 0, tzinfo=timezone.utc), + end_time=datetime(2024, 1, 15, 8, 45, 0, tzinfo=timezone.utc), + duration_ms=1440000, + ), + ], + "run-002-def": [ # DataIngestionPipeline - Failed + create_mock_activity_run( + activity_run_id="act-002-copy", + activity_name="CopyBlobToSQL", + activity_type="Copy", + pipeline_run_id="run-002-def", + pipeline_name="DataIngestionPipeline", + status="Failed", + start_time=datetime(2024, 1, 14, 8, 5, 0, tzinfo=timezone.utc), + end_time=datetime(2024, 1, 14, 8, 15, 0, tzinfo=timezone.utc), + duration_ms=600000, + error={ + "message": "Connection timeout to SQL database", + "errorCode": "2200", + }, + ), + ], + "run-003-ghi": [ # DataProcessingPipeline - Succeeded + create_mock_activity_run( + activity_run_id="act-003-proc", + activity_name="CallStoredProc", + activity_type="SqlServerStoredProcedure", + pipeline_run_id="run-003-ghi", + pipeline_name="DataProcessingPipeline", + status="Succeeded", + start_time=datetime(2024, 1, 15, 
9, 5, 0, tzinfo=timezone.utc), + end_time=datetime(2024, 1, 15, 9, 30, 0, tzinfo=timezone.utc), + duration_ms=1500000, + ), + ], + } + return { "factories": factories, "pipelines": pipelines, @@ -395,11 +498,20 @@ def get_mock_test_data() -> Dict[str, Any]: "linked_services": linked_services, "triggers": triggers, "pipeline_runs": pipeline_runs, + "activity_runs": activity_runs, } -def create_mock_client(test_data: Dict[str, Any]) -> MagicMock: - """Create a mock DataFactoryManagementClient.""" +def create_mock_client( + test_data: Dict[str, Any], include_activity_runs: bool = False +) -> MagicMock: + """Create a mock DataFactoryManagementClient. + + Args: + test_data: Dictionary containing mock data for factories, pipelines, etc. + include_activity_runs: If True, return activity runs for each pipeline run. + This enables testing of the activity run extraction feature. + """ mock_client = MagicMock() # Mock factories @@ -436,8 +548,22 @@ def create_mock_client(test_data: Dict[str, Any]) -> MagicMock: test_data["pipeline_runs"] ) - # Mock activity runs (empty for basic tests) - mock_client.activity_runs.query_by_pipeline_run.return_value = MockQueryResponse([]) + # Mock activity runs - return based on pipeline run ID if enabled + if include_activity_runs and "activity_runs" in test_data: + activity_runs_by_pipeline = test_data["activity_runs"] + + def get_activity_runs( + resource_group_name: str, factory_name: str, run_id: str, filter_parameters + ) -> MockQueryResponse: + """Return activity runs for the given pipeline run ID.""" + runs = activity_runs_by_pipeline.get(run_id, []) + return MockQueryResponse(runs) + + mock_client.activity_runs.query_by_pipeline_run.side_effect = get_activity_runs + else: + mock_client.activity_runs.query_by_pipeline_run.return_value = ( + MockQueryResponse([]) + ) return mock_client @@ -501,13 +627,21 @@ def test_adf_source_basic(pytestconfig, tmp_path): @freeze_time(FROZEN_TIME) @pytest.mark.integration def 
test_adf_source_with_execution_history(pytestconfig, tmp_path): - """Test ADF metadata extraction with execution history.""" + """Test ADF metadata extraction with execution history. + + This test verifies: + - Pipeline runs are extracted as DataProcessInstance linked to DataFlow + - Activity runs are extracted as DataProcessInstance linked to DataJob + - Run status (Succeeded, Failed) is correctly mapped + - Both start and end events are emitted for completed runs + """ test_resources_dir = pytestconfig.rootpath / "tests/integration/azure_data_factory" output_file = tmp_path / "adf_with_runs_events.json" golden_file = test_resources_dir / "adf_with_runs_golden.json" test_data = get_mock_test_data() - mock_client = create_mock_client(test_data) + # Enable activity runs to test DataJob-level run history + mock_client = create_mock_client(test_data, include_activity_runs=True) with mock.patch( "datahub.ingestion.source.azure_data_factory.adf_client.DataFactoryManagementClient" diff --git a/metadata-ingestion/tests/unit/azure_data_factory/test_adf_source.py b/metadata-ingestion/tests/unit/azure_data_factory/test_adf_source.py index 8ca3d574053d9a..73a1c0f026c0c6 100644 --- a/metadata-ingestion/tests/unit/azure_data_factory/test_adf_source.py +++ b/metadata-ingestion/tests/unit/azure_data_factory/test_adf_source.py @@ -276,3 +276,174 @@ def test_extract_with_complex_resource_group_name(self) -> None: rg_index = parts.index("resourceGroups") extracted = parts[rg_index + 1] assert extracted == expected + + +class TestActivityRunPropertyExtraction: + """Tests for activity run property extraction logic. + + Activity runs create DataProcessInstance entities linked to DataJobs. + These tests verify the property extraction patterns. 
+ """ + + def test_activity_run_properties_extracted(self) -> None: + """Verify essential activity run properties are extracted.""" + activity_run = { + "activityRunId": "act-run-123", + "activityName": "CopyData", + "activityType": "Copy", + "pipelineRunId": "pipe-run-456", + "status": "Succeeded", + "durationInMs": 45000, + } + + # Logic pattern from _emit_activity_runs + properties: dict[str, str] = { + "activity_run_id": activity_run["activityRunId"], + "activity_type": activity_run["activityType"], + "pipeline_run_id": activity_run["pipelineRunId"], + "status": activity_run["status"], + } + + if activity_run.get("durationInMs") is not None: + properties["duration_ms"] = str(activity_run["durationInMs"]) + + assert properties["activity_run_id"] == "act-run-123" + assert properties["activity_type"] == "Copy" + assert properties["pipeline_run_id"] == "pipe-run-456" + assert properties["status"] == "Succeeded" + assert properties["duration_ms"] == "45000" + + def test_activity_run_error_truncated(self) -> None: + """Verify error messages are truncated to prevent oversized properties.""" + MAX_RUN_MESSAGE_LENGTH = 500 + long_error = "E" * 1000 # 1000 character error + + activity_run = { + "activityRunId": "act-run-err", + "error": {"message": long_error}, + } + + # Logic pattern from _emit_activity_runs + error = activity_run.get("error", {}) + if error: + error_msg = str(error.get("message", "")) + if error_msg: + truncated = error_msg[:MAX_RUN_MESSAGE_LENGTH] + + assert len(truncated) == MAX_RUN_MESSAGE_LENGTH + assert len(truncated) < len(long_error) + + def test_activity_run_missing_optional_fields(self) -> None: + """Verify graceful handling of missing optional fields.""" + activity_run = { + "activityRunId": "act-run-minimal", + "activityName": "MinimalActivity", + "activityType": "Copy", + "pipelineRunId": "pipe-run-789", + "status": "Succeeded", + # No durationInMs, error, input, output + } + + properties: dict[str, str] = { + "activity_run_id": 
activity_run["activityRunId"], + "activity_type": activity_run["activityType"], + "pipeline_run_id": activity_run["pipelineRunId"], + "status": activity_run["status"], + } + + # Optional fields should not cause errors + if activity_run.get("durationInMs") is not None: + properties["duration_ms"] = str(activity_run["durationInMs"]) + + error = activity_run.get("error") + if error: + error_msg = str(error.get("message", "")) + if error_msg: + properties["error"] = error_msg[:500] + + assert "duration_ms" not in properties + assert "error" not in properties + assert len(properties) == 4 + + +class TestActivityRunToDataJobUrnMapping: + """Tests for mapping activity runs to DataJob URNs. + + Activity runs must link to DataJob URNs (not DataFlow URNs) so the + Runs tab appears on DataJob pages in the UI. + """ + + def test_datajob_urn_constructed_from_activity_run(self) -> None: + """DataJob URN should use activity name as job_id.""" + from datahub.metadata.urns import DataFlowUrn, DataJobUrn + + factory_name = "my-factory" + pipeline_name = "DataPipeline" + activity_name = "CopyActivity" + env = "PROD" + platform = "azure-data-factory" + + # Logic pattern from _emit_activity_runs + flow_name = f"{factory_name}.{pipeline_name}" + flow_urn = DataFlowUrn.create_from_ids( + orchestrator=platform, + flow_id=flow_name, + env=env, + ) + job_urn = DataJobUrn.create_from_ids( + data_flow_urn=str(flow_urn), + job_id=activity_name, + ) + + # Verify URN structure + assert "dataJob" in str(job_urn) + assert activity_name in str(job_urn) + assert flow_name in str(job_urn) + assert platform in str(job_urn) + + def test_activity_run_links_to_datajob_not_dataflow(self) -> None: + """Verify activity runs link to DataJob, enabling the Runs tab in UI.""" + from datahub.metadata.urns import DataFlowUrn, DataJobUrn + + flow_urn = DataFlowUrn.create_from_ids( + orchestrator="azure-data-factory", + flow_id="factory.pipeline", + env="PROD", + ) + job_urn = DataJobUrn.create_from_ids( + 
data_flow_urn=str(flow_urn), + job_id="MyActivity", + ) + + # The URN type should be dataJob, not dataFlow + assert job_urn.entity_type == "dataJob" + assert flow_urn.entity_type == "dataFlow" + + # The job URN should reference the flow URN + assert str(flow_urn) in str(job_urn) + + def test_multiple_activities_get_unique_urns(self) -> None: + """Each activity in a pipeline should have a unique DataJob URN.""" + from datahub.metadata.urns import DataFlowUrn, DataJobUrn + + flow_urn = DataFlowUrn.create_from_ids( + orchestrator="azure-data-factory", + flow_id="factory.pipeline", + env="PROD", + ) + + activities = ["CopyData", "TransformData", "LoadData"] + job_urns = [ + DataJobUrn.create_from_ids( + data_flow_urn=str(flow_urn), + job_id=activity, + ) + for activity in activities + ] + + # All URNs should be unique + assert len(set(str(u) for u in job_urns)) == len(activities) + + # Each URN should contain its activity name + for activity, urn in zip(activities, job_urns): + assert activity in str(urn) From 135d10a2d06f9545bf5b6d58ee65e1a918d6140c Mon Sep 17 00:00:00 2001 From: Anush Kumar Date: Sat, 13 Dec 2025 23:14:35 -0800 Subject: [PATCH 13/13] feat(azure-data-factory): improve logging for Azure Data Factory ingestion process - Added detailed logging to track the start of ingestion, resource group filtering, lineage resource fetching, and pipeline extraction for better observability. - Updated execution history logging to clarify the fetching process for factory execution history. 
--- .../source/azure_data_factory/adf_source.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/azure_data_factory/adf_source.py b/metadata-ingestion/src/datahub/ingestion/source/azure_data_factory/adf_source.py index 96087a5e79a1f1..13ec2b83a0a837 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/azure_data_factory/adf_source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/azure_data_factory/adf_source.py @@ -265,6 +265,11 @@ def get_workunit_processors(self) -> list[Optional[MetadataWorkUnitProcessor]]: def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: """Generate workunits for all Azure Data Factory resources.""" + logger.info( + f"Starting Azure Data Factory ingestion for subscription: {self.config.subscription_id}" + ) + if self.config.resource_group: + logger.info(f"Filtering to resource group: {self.config.resource_group}") # Iterate over all factories for factory in self.client.get_factories( @@ -285,6 +290,8 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: resource_group = self._extract_resource_group(factory.id) # Cache datasets and linked services for this factory + if self.config.include_lineage: + logger.info(f"Fetching lineage resources for factory: {factory.name}") self._cache_factory_resources(resource_group, factory.name) # Emit factory as container and get the Container object for browse paths @@ -292,6 +299,9 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: yield from container_workunits # Process pipelines, passing the Container for proper browse path hierarchy + logger.info( + f"Extracting pipelines and activities for factory: {factory.name}" + ) yield from self._process_pipelines(factory, resource_group, container) # Process execution history if enabled @@ -1029,7 +1039,7 @@ def _process_execution_history( ) -> Iterable[MetadataWorkUnit]: """Process pipeline execution history for a factory.""" 
logger.info( - f"Processing execution history for factory: {factory.name} " + f"Fetching execution history for factory: {factory.name} " f"(last {self.config.execution_history_days} days)" )