Add Dataplex operators (#20377)
wojsamjan authored Mar 14, 2022
1 parent c1ab8e2 commit 87c1246
Showing 13 changed files with 1,564 additions and 0 deletions.
122 changes: 122 additions & 0 deletions airflow/providers/google/cloud/example_dags/example_dataplex.py
@@ -0,0 +1,122 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

"""
Example Airflow DAG that shows how to use Dataplex.
"""

import datetime
import os

from airflow import models
from airflow.models.baseoperator import chain
from airflow.providers.google.cloud.operators.dataplex import (
DataplexCreateTaskOperator,
DataplexDeleteTaskOperator,
DataplexGetTaskOperator,
DataplexListTasksOperator,
)
from airflow.providers.google.cloud.sensors.dataplex import DataplexTaskStateSensor

PROJECT_ID = os.environ.get("GCP_PROJECT_ID", "INVALID PROJECT ID")
REGION = os.environ.get("GCP_REGION", "INVALID REGION")
LAKE_ID = os.environ.get("GCP_LAKE_ID", "INVALID LAKE ID")
SERVICE_ACC = os.environ.get("GCP_DATAPLEX_SERVICE_ACC", "[email protected]")
BUCKET = os.environ.get("GCP_DATAPLEX_BUCKET", "INVALID BUCKET NAME")
SPARK_FILE_NAME = os.environ.get("SPARK_FILE_NAME", "INVALID FILE NAME")
SPARK_FILE_FULL_PATH = f"gs://{BUCKET}/{SPARK_FILE_NAME}"
DATAPLEX_TASK_ID = "task001"
TRIGGER_SPEC_TYPE = "ON_DEMAND"

# [START howto_dataplex_configuration]
EXAMPLE_TASK_BODY = {
"trigger_spec": {"type_": TRIGGER_SPEC_TYPE},
"execution_spec": {"service_account": SERVICE_ACC},
"spark": {"python_script_file": SPARK_FILE_FULL_PATH},
}
# [END howto_dataplex_configuration]
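# A hedged sketch of the same body with a scheduled trigger. The RECURRING type and its
# cron-style "schedule" field are assumptions based on the Dataplex v1 TriggerSpec message,
# not something this example exercises:
# EXAMPLE_RECURRING_TASK_BODY = {
#     "trigger_spec": {"type_": "RECURRING", "schedule": "0 */4 * * *"},
#     "execution_spec": {"service_account": SERVICE_ACC},
#     "spark": {"python_script_file": SPARK_FILE_FULL_PATH},
# }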

with models.DAG(
"example_dataplex", start_date=datetime.datetime(2021, 1, 1), schedule_interval="@once", catchup=False
) as dag:
# [START howto_dataplex_create_task_operator]
create_dataplex_task = DataplexCreateTaskOperator(
project_id=PROJECT_ID,
region=REGION,
lake_id=LAKE_ID,
body=EXAMPLE_TASK_BODY,
dataplex_task_id=DATAPLEX_TASK_ID,
task_id="create_dataplex_task",
)
# [END howto_dataplex_create_task_operator]

# [START howto_dataplex_async_create_task_operator]
create_dataplex_task_async = DataplexCreateTaskOperator(
project_id=PROJECT_ID,
region=REGION,
lake_id=LAKE_ID,
body=EXAMPLE_TASK_BODY,
dataplex_task_id=DATAPLEX_TASK_ID,
asynchronous=True,
task_id="create_dataplex_task_async",
)
# [END howto_dataplex_async_create_task_operator]
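    # With asynchronous=True the operator only submits the task and returns immediately,
    # leaving state tracking to the DataplexTaskStateSensor wired in at the end of this DAG.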

# [START howto_dataplex_delete_task_operator]
delete_dataplex_task = DataplexDeleteTaskOperator(
project_id=PROJECT_ID,
region=REGION,
lake_id=LAKE_ID,
dataplex_task_id=DATAPLEX_TASK_ID,
task_id="delete_dataplex_task",
)
# [END howto_dataplex_delete_task_operator]

# [START howto_dataplex_list_tasks_operator]
list_dataplex_task = DataplexListTasksOperator(
project_id=PROJECT_ID, region=REGION, lake_id=LAKE_ID, task_id="list_dataplex_task"
)
# [END howto_dataplex_list_tasks_operator]

# [START howto_dataplex_get_task_operator]
get_dataplex_task = DataplexGetTaskOperator(
project_id=PROJECT_ID,
region=REGION,
lake_id=LAKE_ID,
dataplex_task_id=DATAPLEX_TASK_ID,
task_id="get_dataplex_task",
)
# [END howto_dataplex_get_task_operator]

# [START howto_dataplex_task_state_sensor]
dataplex_task_state = DataplexTaskStateSensor(
project_id=PROJECT_ID,
region=REGION,
lake_id=LAKE_ID,
dataplex_task_id=DATAPLEX_TASK_ID,
task_id="dataplex_task_state",
)
# [END howto_dataplex_task_state_sensor]
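    # The sensor repeatedly polls the task's state; based on the sensor added in this
    # commit, it is expected to succeed once the task reports the ACTIVE state.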

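    # chain() sets linear dependencies between the tasks, equivalent to:
    # create_dataplex_task >> get_dataplex_task >> list_dataplex_task \
    #     >> delete_dataplex_task >> create_dataplex_task_async >> dataplex_task_state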
chain(
create_dataplex_task,
get_dataplex_task,
list_dataplex_task,
delete_dataplex_task,
create_dataplex_task_async,
dataplex_task_state,
)
247 changes: 247 additions & 0 deletions airflow/providers/google/cloud/hooks/dataplex.py
@@ -0,0 +1,247 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

"""This module contains Google Dataplex hook."""
from typing import Any, Dict, Optional, Sequence, Tuple, Union

from google.api_core.operation import Operation
from google.api_core.retry import Retry
from google.cloud.dataplex_v1 import DataplexServiceClient
from google.cloud.dataplex_v1.types import Task
from googleapiclient.discovery import Resource

from airflow.exceptions import AirflowException
from airflow.providers.google.common.hooks.base_google import GoogleBaseHook


class DataplexHook(GoogleBaseHook):
"""
Hook for Google Dataplex.
:param api_version: The version of the api that will be requested for example 'v3'.
:param gcp_conn_id: The connection ID to use when fetching connection info.
:param delegate_to: The account to impersonate, if any. For this to work, the service accountmaking the
request must have domain-wide delegation enabled.
:param impersonation_chain: Optional service account to impersonate using short-term
credentials, or chained list of accounts required to get the access_token
of the last account in the list, which will be impersonated in the request.
If set as a string, the account must grant the originating account
the Service Account Token Creator IAM role.
If set as a sequence, the identities from the list must grant
Service Account Token Creator IAM role to the directly preceding identity, with first
account from the list granting this role to the originating account (templated).
"""

_conn = None # type: Optional[Resource]

def __init__(
self,
api_version: str = "v1",
gcp_conn_id: str = "google_cloud_default",
delegate_to: Optional[str] = None,
impersonation_chain: Optional[Union[str, Sequence[str]]] = None,
) -> None:
super().__init__(
gcp_conn_id=gcp_conn_id,
delegate_to=delegate_to,
impersonation_chain=impersonation_chain,
)
self.api_version = api_version

def get_dataplex_client(self) -> DataplexServiceClient:
"""Returns DataplexServiceClient."""
client_options = {'api_endpoint': 'dataplex.googleapis.com:443'}

return DataplexServiceClient(
credentials=self._get_credentials(), client_info=self.client_info, client_options=client_options
)

def wait_for_operation(self, timeout: Optional[float], operation: Operation):
"""Waits for long-lasting operation to complete."""
try:
return operation.result(timeout=timeout)
except Exception:
error = operation.exception(timeout=timeout)
raise AirflowException(error)

@GoogleBaseHook.fallback_to_default_project_id
def create_task(
self,
project_id: str,
region: str,
lake_id: str,
body: Union[Dict[str, Any], Task],
dataplex_task_id: str,
validate_only: Optional[bool] = None,
retry: Optional[Retry] = None,
timeout: Optional[float] = None,
metadata: Sequence[Tuple[str, str]] = (),
) -> Any:
"""
        Creates a task resource within a lake.

:param project_id: Required. The ID of the Google Cloud project that the task belongs to.
:param region: Required. The ID of the Google Cloud region that the task belongs to.
:param lake_id: Required. The ID of the Google Cloud lake that the task belongs to.
        :param body: Required. The request body; an instance of Task, or a dict in the same form.
:param dataplex_task_id: Required. Task identifier.
:param validate_only: Optional. Only validate the request, but do not perform mutations.
The default is false.
:param retry: A retry object used to retry requests. If `None` is specified, requests
will not be retried.
:param timeout: The amount of time, in seconds, to wait for the request to complete.
Note that if `retry` is specified, the timeout applies to each individual attempt.
:param metadata: Additional metadata that is provided to the method.
"""
parent = f'projects/{project_id}/locations/{region}/lakes/{lake_id}'

client = self.get_dataplex_client()
result = client.create_task(
request={
'parent': parent,
'task_id': dataplex_task_id,
'task': body,
},
retry=retry,
timeout=timeout,
metadata=metadata,
)
return result

@GoogleBaseHook.fallback_to_default_project_id
def delete_task(
self,
project_id: str,
region: str,
lake_id: str,
dataplex_task_id: str,
retry: Optional[Retry] = None,
timeout: Optional[float] = None,
metadata: Sequence[Tuple[str, str]] = (),
) -> Any:
"""
        Deletes the task resource.

:param project_id: Required. The ID of the Google Cloud project that the task belongs to.
:param region: Required. The ID of the Google Cloud region that the task belongs to.
:param lake_id: Required. The ID of the Google Cloud lake that the task belongs to.
:param dataplex_task_id: Required. The ID of the Google Cloud task to be deleted.
:param retry: A retry object used to retry requests. If `None` is specified, requests
will not be retried.
:param timeout: The amount of time, in seconds, to wait for the request to complete.
Note that if `retry` is specified, the timeout applies to each individual attempt.
:param metadata: Additional metadata that is provided to the method.
"""
name = f'projects/{project_id}/locations/{region}/lakes/{lake_id}/tasks/{dataplex_task_id}'

client = self.get_dataplex_client()
result = client.delete_task(
request={
'name': name,
},
retry=retry,
timeout=timeout,
metadata=metadata,
)
return result

@GoogleBaseHook.fallback_to_default_project_id
def list_tasks(
self,
project_id: str,
region: str,
lake_id: str,
page_size: Optional[int] = None,
page_token: Optional[str] = None,
filter: Optional[str] = None,
order_by: Optional[str] = None,
retry: Optional[Retry] = None,
timeout: Optional[float] = None,
metadata: Sequence[Tuple[str, str]] = (),
) -> Any:
"""
        Lists tasks under the given lake.

:param project_id: Required. The ID of the Google Cloud project that the task belongs to.
:param region: Required. The ID of the Google Cloud region that the task belongs to.
:param lake_id: Required. The ID of the Google Cloud lake that the task belongs to.
:param page_size: Optional. Maximum number of tasks to return. The service may return fewer than this
value. If unspecified, at most 10 tasks will be returned. The maximum value is 1000;
values above 1000 will be coerced to 1000.
        :param page_token: Optional. Page token received from a previous ListTasks call. Provide this to
            retrieve the subsequent page. When paginating, all other parameters provided to ListTasks must
            match the call that provided the page token.
:param filter: Optional. Filter request.
:param order_by: Optional. Order by fields for the result.
:param retry: A retry object used to retry requests. If `None` is specified, requests
will not be retried.
:param timeout: The amount of time, in seconds, to wait for the request to complete.
Note that if `retry` is specified, the timeout applies to each individual attempt.
:param metadata: Additional metadata that is provided to the method.
"""
parent = f'projects/{project_id}/locations/{region}/lakes/{lake_id}'

client = self.get_dataplex_client()
result = client.list_tasks(
request={
'parent': parent,
'page_size': page_size,
'page_token': page_token,
'filter': filter,
'order_by': order_by,
},
retry=retry,
timeout=timeout,
metadata=metadata,
)
return result

@GoogleBaseHook.fallback_to_default_project_id
def get_task(
self,
project_id: str,
region: str,
lake_id: str,
dataplex_task_id: str,
retry: Optional[Retry] = None,
timeout: Optional[float] = None,
metadata: Sequence[Tuple[str, str]] = (),
) -> Any:
"""
        Gets a task resource.

:param project_id: Required. The ID of the Google Cloud project that the task belongs to.
:param region: Required. The ID of the Google Cloud region that the task belongs to.
:param lake_id: Required. The ID of the Google Cloud lake that the task belongs to.
:param dataplex_task_id: Required. The ID of the Google Cloud task to be retrieved.
:param retry: A retry object used to retry requests. If `None` is specified, requests
will not be retried.
:param timeout: The amount of time, in seconds, to wait for the request to complete.
Note that if `retry` is specified, the timeout applies to each individual attempt.
:param metadata: Additional metadata that is provided to the method.
"""
name = f'projects/{project_id}/locations/{region}/lakes/{lake_id}/tasks/{dataplex_task_id}'
client = self.get_dataplex_client()
result = client.get_task(
request={
'name': name,
},
retry=retry,
timeout=timeout,
metadata=metadata,
)
return result
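
For orientation, a minimal sketch of driving the new hook directly, outside a DAG. The project, region, and lake IDs are hypothetical placeholders, and a configured "google_cloud_default" connection is assumed:

from airflow.providers.google.cloud.hooks.dataplex import DataplexHook

hook = DataplexHook()  # uses the default "google_cloud_default" connection
task = hook.get_task(
    project_id="example-project",  # hypothetical project ID
    region="us-central1",  # hypothetical region
    lake_id="example-lake",  # hypothetical lake ID
    dataplex_task_id="task001",
)
print(task.state)  # Task.state is an enum from google.cloud.dataplex_v1.types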
