Add Dataprep operators (#10304)

Add DataprepGetJobGroupOperator and DataprepRunJobGroupOperator for Dataprep service. Co-authored-by: Tomek Urbaszek <[email protected]>
apache · Sep 1, 2020 · 804548d · 804548d
1 parent f40ac9b
commit 804548d
Show file tree

Hide file tree

Showing 8 changed files with 469 additions and 67 deletions.
diff --git a/airflow/models/connection.py b/airflow/models/connection.py
@@ -51,7 +51,7 @@
     ),
     "cassandra": ("airflow.providers.apache.cassandra.hooks.cassandra.CassandraHook", "cassandra_conn_id"),
     "cloudant": ("airflow.providers.cloudant.hooks.cloudant.CloudantHook", "cloudant_conn_id"),
-    "dataprep": ("airflow.providers.google.cloud.hooks.dataprep.GoogleDataprepHook", "dataprep_conn_id"),
+    "dataprep": ("airflow.providers.google.cloud.hooks.dataprep.GoogleDataprepHook", "dataprep_default"),
     "docker": ("airflow.providers.docker.hooks.docker.DockerHook", "docker_conn_id"),
     "elasticsearch": (
         "airflow.providers.elasticsearch.hooks.elasticsearch.ElasticsearchHook",

diff --git a/airflow/providers/google/cloud/example_dags/example_dataprep.py b/airflow/providers/google/cloud/example_dags/example_dataprep.py
@@ -17,19 +17,56 @@
 """
 Example Airflow DAG that shows how to use Google Dataprep.
 """
+import os
 
 from airflow import models
-from airflow.providers.google.cloud.operators.dataprep import DataprepGetJobsForJobGroupOperator
+from airflow.providers.google.cloud.operators.dataprep import (
+    DataprepGetJobGroupOperator,
+    DataprepGetJobsForJobGroupOperator,
+    DataprepRunJobGroupOperator,
+)
 from airflow.utils import dates
 
-JOB_ID = 6269792
+# Example settings. Each value is read from the environment variable of the
+# same name; the literal second argument is only a placeholder default.
+DATAPREP_JOB_ID = int(os.environ.get('DATAPREP_JOB_ID', 12345677))
+DATAPREP_JOB_RECIPE_ID = int(os.environ.get('DATAPREP_JOB_RECIPE_ID', 12345677))
+DATAPREP_BUCKET = os.environ.get("DATAPREP_BUCKET", "gs://afl-sql/[email protected]")
+
+# Request body handed to DataprepRunJobGroupOperator (``body_request``) below.
+DATA = {
+    # The recipe to run, identified by DATAPREP_JOB_RECIPE_ID.
+    "wrangledDataset": {"id": DATAPREP_JOB_RECIPE_ID},
+    # NOTE(review): the accepted override keys are defined by the Dataprep
+    # runJobGroup API, not by this file - confirm against the API documentation.
+    "overrides": {
+        "execution": "dataflow",
+        "profiler": False,
+        "writesettings": [
+            {
+                "path": DATAPREP_BUCKET,
+                "action": "create",
+                "format": "csv",
+                "compression": "none",
+                "header": False,
+                "asSingleFile": False,
+            }
+        ],
+    },
+}
+
 
 with models.DAG(
-    "example_dataprep", schedule_interval=None, start_date=dates.days_ago(1)  # Override to match your needs
+    "example_dataprep", schedule_interval=None, start_date=dates.days_ago(1),  # Override to match your needs
 ) as dag:
+    # [START how_to_dataprep_run_job_group_operator]
+    run_job_group = DataprepRunJobGroupOperator(task_id="run_job_group", body_request=DATA)
+    # [END how_to_dataprep_run_job_group_operator]
 
     # [START how_to_dataprep_get_jobs_for_job_group_operator]
     get_jobs_for_job_group = DataprepGetJobsForJobGroupOperator(
-        task_id="get_jobs_for_job_group", job_id=JOB_ID
+        task_id="get_jobs_for_job_group", job_id=DATAPREP_JOB_ID
     )
     # [END how_to_dataprep_get_jobs_for_job_group_operator]
+
+    # [START how_to_dataprep_get_job_group_operator]
+    get_job_group = DataprepGetJobGroupOperator(
+        task_id="get_job_group", job_group_id=DATAPREP_JOB_ID, embed="", include_deleted=False,
+    )
+    # [END how_to_dataprep_get_job_group_operator]
+
+    # Start the job group first; the two lookup tasks are downstream of it.
+    run_job_group >> [get_jobs_for_job_group, get_job_group]
diff --git a/airflow/providers/google/cloud/hooks/dataprep.py b/airflow/providers/google/cloud/hooks/dataprep.py
@@ -18,12 +18,14 @@
 """
 This module contains Google Dataprep hook.
 """
+import json
+import os
 from typing import Any, Dict
 
 import requests
+from requests import HTTPError
 from tenacity import retry, stop_after_attempt, wait_exponential
 
-from airflow import AirflowException
 from airflow.hooks.base_hook import BaseHook
 
 
@@ -37,10 +39,13 @@ class GoogleDataprepHook(BaseHook):
 
     """
 
-    def __init__(self, dataprep_conn_id: str = "dataprep_conn_id") -> None:
+    def __init__(self, dataprep_conn_id: str = "dataprep_default") -> None:
         super().__init__()
         self.dataprep_conn_id = dataprep_conn_id
-        self._url = "https://api.clouddataprep.com/v4/jobGroups"
+        # Both settings are read once, here, from the connection's Extra JSON.
+        conn = self.get_connection(self.dataprep_conn_id)
+        extra_dejson = conn.extra_dejson
+        # No validation happens here: ``_token`` is None when the key is absent.
+        self._token = extra_dejson.get("extra__dataprep__token")
+        # Falls back to the public API host when no base URL is configured.
+        self._base_url = extra_dejson.get("extra__dataprep__base_url", "https://api.clouddataprep.com")
 
     @property
     def _headers(self) -> Dict[str, str]:
@@ -50,26 +55,63 @@ def _headers(self) -> Dict[str, str]:
         }
         return headers
 
-    @property
-    def _token(self) -> str:
-        conn = self.get_connection(self.dataprep_conn_id)
-        token = conn.extra_dejson.get("token")
-        if token is None:
-            raise AirflowException(
-                "Dataprep token is missing or has invalid format. "
-                "Please make sure that Dataprep token is added to the Airflow Connections."
-            )
-        return token
-
     @retry(stop=stop_after_attempt(5), wait=wait_exponential(multiplier=1, max=10))
     def get_jobs_for_job_group(self, job_id: int) -> Dict[str, Any]:
         """
         Get information about the batch jobs within a Cloud Dataprep job.
 
-        :param job_id The ID of the job that will be fetched.
+        :param job_id: The ID of the job that will be fetched
         :type job_id: int
+        :return: The decoded JSON body of the API response
+        :rtype: Dict[str, Any]
         """
-        url: str = f"{self._url}/{job_id}/jobs"
+
+        endpoint_path = f"v4/jobGroups/{job_id}/jobs"
+        # Join with "/" explicitly: os.path.join uses the OS path separator,
+        # which would put a backslash into the URL on Windows.
+        url: str = f"{self._base_url.rstrip('/')}/{endpoint_path}"
         response = requests.get(url, headers=self._headers)
-        response.raise_for_status()
+        self._raise_for_status(response)
+        return response.json()
+
+    @retry(stop=stop_after_attempt(5), wait=wait_exponential(multiplier=1, max=10))
+    def get_job_group(self, job_group_id: int, embed: str, include_deleted: bool) -> Dict[str, Any]:
+        """
+        Get the specified job group.
+        A job group is a job that is executed from a specific node in a flow.
+
+        The call is attempted up to five times, with exponential backoff between
+        attempts (see the ``retry`` decorator).
+
+        :param job_group_id: The ID of the job group that will be fetched
+        :type job_group_id: int
+        :param embed: Comma-separated list of objects to pull in as part of the response
+        :type embed: str
+        :param include_deleted: if True, deleted objects are included in the response
+        :type include_deleted: bool
+        :return: The decoded JSON body of the API response
+        :rtype: Dict[str, Any]
+        """
+
+        params: Dict[str, Any] = {"embed": embed, "includeDeleted": include_deleted}
+        endpoint_path = f"v4/jobGroups/{job_group_id}"
+        # Join with "/" explicitly: os.path.join uses the OS path separator,
+        # which would put a backslash into the URL on Windows.
+        url: str = f"{self._base_url.rstrip('/')}/{endpoint_path}"
+        response = requests.get(url, headers=self._headers, params=params)
+        self._raise_for_status(response)
+        return response.json()
+
+    # NOTE(review): this POST is retried like the GET calls; if a response is lost
+    # after the server accepted the request, a retry may launch the job again - confirm.
+    @retry(stop=stop_after_attempt(5), wait=wait_exponential(multiplier=1, max=10))
+    def run_job_group(self, body_request: dict) -> Dict[str, Any]:
+        """
+        Creates a ``jobGroup``, which launches the specified job as the authenticated user.
+        This performs the same action as clicking on the Run Job button in the application.
+        To get recipe_id please follow the Dataprep API documentation
+        https://clouddataprep.com/documentation/api#operation/runJobGroup
+
+        :param body_request: The request body, serialized to JSON and sent as-is.
+            It carries the identifier of the recipe you would like to run.
+        :type body_request: dict
+        :return: The decoded JSON body of the API response
+        :rtype: Dict[str, Any]
+        """
+
+        endpoint_path = "v4/jobGroups"
+        # Join with "/" explicitly: os.path.join uses the OS path separator,
+        # which would put a backslash into the URL on Windows.
+        url: str = f"{self._base_url.rstrip('/')}/{endpoint_path}"
+        response = requests.post(url, headers=self._headers, data=json.dumps(body_request))
+        self._raise_for_status(response)
         return response.json()
+
+    def _raise_for_status(self, response: requests.models.Response) -> None:
+        """
+        Raise ``HTTPError`` for an error response, logging the error details first.
+
+        :param response: The response returned by ``requests``
+        :type response: requests.models.Response
+        :raises HTTPError: re-raised unchanged when ``response.raise_for_status()`` fails
+        """
+        try:
+            response.raise_for_status()
+        except HTTPError:
+            # An error body is not guaranteed to be a JSON object (it may be plain
+            # text or HTML). Decoding problems must not replace the HTTPError, so
+            # fall back to the raw body text for the log entry.
+            try:
+                details = response.json().get('exception')
+            except (ValueError, AttributeError):
+                details = response.text
+            self.log.error(details)
+            raise
diff --git a/airflow/providers/google/cloud/operators/dataprep.py b/airflow/providers/google/cloud/operators/dataprep.py
@@ -35,20 +35,90 @@ class DataprepGetJobsForJobGroupOperator(BaseOperator):
         For more information on how to use this operator, take a look at the guide:
         :ref:`howto/operator:DataprepGetJobsForJobGroupOperator`
 
-
     :param job_id The ID of the job that will be requests
     :type job_id: int
     """
 
     template_fields = ("job_id",)
 
     @apply_defaults
-    def __init__(self, *, job_id: int, **kwargs) -> None:
+    def __init__(self, *, dataprep_conn_id: str = "dataprep_default", job_id: int, **kwargs) -> None:
         super().__init__(**kwargs)
+        # Store the plain connection id; wrapping it in parentheses with a
+        # trailing comma would turn it into a one-element tuple.
+        self.dataprep_conn_id = dataprep_conn_id
         self.job_id = job_id
 
     def execute(self, context: Dict):
         self.log.info("Fetching data for job with id: %d ...", self.job_id)
-        hook = GoogleDataprepHook(dataprep_conn_id="dataprep_conn_id")
+        # Use the connection configured on this operator instead of a fixed id,
+        # so the ``dataprep_conn_id`` argument actually takes effect.
+        hook = GoogleDataprepHook(dataprep_conn_id=self.dataprep_conn_id)
         response = hook.get_jobs_for_job_group(job_id=self.job_id)
         return response
+
+
+class DataprepGetJobGroupOperator(BaseOperator):
+    """
+    Get the specified job group.
+    A job group is a job that is executed from a specific node in a flow.
+    API documentation https://clouddataprep.com/documentation/api#section/Overview
+
+    .. seealso::
+        For more information on how to use this operator, take a look at the guide:
+        :ref:`howto/operator:DataprepGetJobGroupOperator`
+
+    :param dataprep_conn_id: The ID of the Airflow connection passed to the Dataprep hook
+    :type dataprep_conn_id: str
+    :param job_group_id: The ID of the job group that will be requested
+    :type job_group_id: int
+    :param embed: Comma-separated list of objects to pull in as part of the response
+    :type embed: str
+    :param include_deleted: if True, deleted objects are included in the response
+    :type include_deleted: bool
+    """
+
+    template_fields = ("job_group_id", "embed")
+
+    @apply_defaults
+    def __init__(
+        self,
+        *,
+        dataprep_conn_id: str = "dataprep_default",
+        job_group_id: int,
+        embed: str,
+        include_deleted: bool,
+        **kwargs,
+    ) -> None:
+        super().__init__(**kwargs)
+        self.dataprep_conn_id: str = dataprep_conn_id
+        self.job_group_id = job_group_id
+        self.embed = embed
+        self.include_deleted = include_deleted
+
+    def execute(self, context: Dict):
+        """
+        Fetch the job group through ``GoogleDataprepHook`` and return the hook's response.
+        """
+        # job_group_id is a template field, so it may arrive as a rendered string.
+        # %s formats both int and str, whereas %d cannot format a str.
+        self.log.info("Fetching data for job with id: %s ...", self.job_group_id)
+        hook = GoogleDataprepHook(dataprep_conn_id=self.dataprep_conn_id)
+        response = hook.get_job_group(
+            job_group_id=self.job_group_id, embed=self.embed, include_deleted=self.include_deleted,
+        )
+        return response
+
+
+class DataprepRunJobGroupOperator(BaseOperator):
+    """
+    Create a ``jobGroup``, which launches the specified job as the authenticated user.
+    This performs the same action as clicking on the Run Job button in the application.
+    To get recipe_id please follow the Dataprep API documentation
+    https://clouddataprep.com/documentation/api#operation/runJobGroup
+
+    .. seealso::
+        For more information on how to use this operator, take a look at the guide:
+        :ref:`howto/operator:DataprepRunJobGroupOperator`
+
+    :param dataprep_conn_id: The ID of the Airflow connection passed to the Dataprep hook
+    :type dataprep_conn_id: str
+    :param body_request: The request body sent to the API. It carries the identifier
+        of the recipe you would like to run.
+    :type body_request: dict
+    """
+
+    template_fields = ("body_request",)
+
+    # Decorated like the other Dataprep operators so that DAG-level
+    # ``default_args`` are applied to this operator as well.
+    @apply_defaults
+    def __init__(self, *, dataprep_conn_id: str = "dataprep_default", body_request: dict, **kwargs) -> None:
+        super().__init__(**kwargs)
+        self.body_request = body_request
+        self.dataprep_conn_id = dataprep_conn_id
+
+    def execute(self, context: Dict):
+        """
+        Submit ``body_request`` through ``GoogleDataprepHook`` and return the hook's response.
+        """
+        self.log.info("Creating a job...")
+        hook = GoogleDataprepHook(dataprep_conn_id=self.dataprep_conn_id)
+        response = hook.run_job_group(body_request=self.body_request)
+        return response
diff --git a/docs/howto/operator/google/cloud/dataprep.rst b/docs/howto/operator/google/cloud/dataprep.rst
@@ -17,7 +17,30 @@
 
 Google Dataprep Operators
 =========================
-`Google Dataprep API documentation is available here <https://cloud.google.com/dataprep/docs/html/API-Reference_145281441>`__
+Dataprep is the intelligent cloud data service to visually explore, clean, and prepare data for analysis and machine learning.
+The service can be used to explore and transform raw data from disparate and/or large datasets into clean and structured data for further analysis and processing.
+Dataprep Job is an internal object encoding the information necessary to run a part of a Cloud Dataprep job group.
+For more information about the service visit `Google Dataprep API documentation <https://cloud.google.com/dataprep/docs/html/API-Reference_145281441>`_
+
+Before you begin
+^^^^^^^^^^^^^^^^
+Before using Dataprep within Airflow you need to authenticate your account with a Dataprep access token (TOKEN).
+To connect Airflow to Dataprep, generate a token by following the Dataprep `instructions <https://clouddataprep.com/documentation/api#section/Authentication>`_.
+
+The TOKEN must be added to the ``Extra`` field of the Airflow Connection, in JSON format.
+See `how to edit a connection in the UI <https://airflow.readthedocs.io/en/stable/howto/connection/index.html#editing-a-connection-with-the-ui>`_ for details.
+
+The DataprepRunJobGroupOperator runs the specified job. The operator requires a recipe id. To identify the recipe id, see the `API documentation for runJobGroup <https://clouddataprep.com/documentation/api#operation/runJobGroup>`_.
+E.g. if the URL is /flows/10?recipe=7, the recipe id is 7. The recipe cannot be created via this operator. It can be created only via the UI, which is available `here <https://clouddataprep.com/>`_.
+Some parameters can be overridden in the request body passed by the DAG. The example DAG shows how to do it.
+
+See the following example.
+Set values for these fields:
+
+.. code-block::
+
+  Conn Id: "your_conn_id"
+  Extra: {"extra__dataprep__token": "TOKEN",
+          "extra__dataprep__base_url": "https://api.clouddataprep.com"}
 
 .. contents::
   :depth: 1
@@ -28,33 +51,58 @@ Prerequisite Tasks
 
 .. include:: /howto/operator/google/_partials/prerequisite_tasks.rst
 
+.. _howto/operator:DataprepRunJobGroupOperator:
+
+Run Job Group
+^^^^^^^^^^^^^
+
+This operator creates a job group, which launches the specified job as the authenticated user.
+This performs the same action as clicking on the Run Job button in the application.
+
+To create a job group and run the specified job use:
+:class:`~airflow.providers.google.cloud.operators.dataprep.DataprepRunJobGroupOperator`
+
+Example usage:
+
+.. exampleinclude:: /../airflow/providers/google/cloud/example_dags/example_dataprep.py
+    :language: python
+    :dedent: 4
+    :start-after: [START how_to_dataprep_run_job_group_operator]
+    :end-before: [END how_to_dataprep_run_job_group_operator]
+
 .. _howto/operator:DataprepGetJobsForJobGroupOperator:
 
 Get Jobs For Job Group
 ^^^^^^^^^^^^^^^^^^^^^^
 
+This operator gets information about the batch jobs within a Cloud Dataprep job.
+
 To get information about jobs within a Cloud Dataprep job use:
 :class:`~airflow.providers.google.cloud.operators.dataprep.DataprepGetJobsForJobGroupOperator`
 
-To get connection Dataprep with Airflow you need Dataprep token.
-Please follow Dataprep instructions.
-https://clouddataprep.com/documentation/api#section/Authentication
+Example usage:
+
+.. exampleinclude:: /../airflow/providers/google/cloud/example_dags/example_dataprep.py
+    :language: python
+    :dedent: 4
+    :start-after: [START how_to_dataprep_get_jobs_for_job_group_operator]
+    :end-before: [END how_to_dataprep_get_jobs_for_job_group_operator]
+
+.. _howto/operator:DataprepGetJobGroupOperator:
 
-It should be added to the Connection in Airflow in JSON format.
-Her you can check how to do such connection:
-https://airflow.readthedocs.io/en/stable/howto/connection/index.html#editing-a-connection-with-the-ui
+Get Job Group
+^^^^^^^^^^^^^
 
-See following example:
-Set values for these fields:
-.. code-block::
+This operator gets the specified job group.
+A job group is a job that is executed from a specific node in a flow.
 
-  Conn Id: "your_conn_id"
-  Extra: "{\"token\": \"TOKEN\"}
+To get information about a specific job group use:
+:class:`~airflow.providers.google.cloud.operators.dataprep.DataprepGetJobGroupOperator`
 
 Example usage:
 
 .. exampleinclude:: /../airflow/providers/google/cloud/example_dags/example_dataprep.py
     :language: python
     :dedent: 4
-    :start-after: [START how_to_dataprep_get_jobs_for_job_group_operator]
-    :end-before: [END how_to_dataprep_get_jobs_for_job_group_operator]
+    :start-after: [START how_to_dataprep_get_job_group_operator]
+    :end-before: [END how_to_dataprep_get_job_group_operator]