Add BigQueryInsertJobOperator (#8868)
* Add BigQueryInsertJobOperator

* fixup! Add BigQueryInsertJobOperator

* fixup! fixup! Add BigQueryInsertJobOperator

* fixup! fixup! fixup! Add BigQueryInsertJobOperator
turbaszek authored Jun 1, 2020
1 parent 7c0e6ed commit 7898525
Showing 5 changed files with 266 additions and 53 deletions.
@@ -27,7 +27,7 @@
from airflow.providers.google.cloud.operators.bigquery import (
BigQueryCheckOperator, BigQueryCreateEmptyDatasetOperator, BigQueryCreateEmptyTableOperator,
BigQueryDeleteDatasetOperator, BigQueryExecuteQueryOperator, BigQueryGetDataOperator,
BigQueryIntervalCheckOperator, BigQueryValueCheckOperator,
BigQueryInsertJobOperator, BigQueryIntervalCheckOperator, BigQueryValueCheckOperator,
)
from airflow.utils.dates import days_ago

@@ -40,10 +40,10 @@
INSERT_DATE = datetime.now().strftime("%Y-%m-%d")

# [START howto_operator_bigquery_query]
INSERT_ROWS_QUERY = f"""
INSERT INTO {DATASET_NAME}.{TABLE_1} VALUES (42, "monthy python", "{INSERT_DATE}");
INSERT INTO {DATASET_NAME}.{TABLE_1} VALUES (42, "fishy fish", "{INSERT_DATE}");
"""
INSERT_ROWS_QUERY = \
f"INSERT {DATASET_NAME}.{TABLE_1} VALUES " \
f"(42, 'monthy python', '{INSERT_DATE}'), " \
f"(42, 'fishy fish', '{INSERT_DATE}');"
# [END howto_operator_bigquery_query]

SCHEMA = [
@@ -84,13 +84,22 @@
task_id="delete_dataset", dataset_id=DATASET_NAME, delete_contents=True
)

# [START howto_operator_bigquery_execute_query]
# [START howto_operator_bigquery_insert_job]
insert_query_job = BigQueryInsertJobOperator(
task_id="insert_query_job",
configuration={
"query": {
"query": INSERT_ROWS_QUERY,
"useLegacySql": False,
}
},
)
# [END howto_operator_bigquery_insert_job]

execute_insert_query = BigQueryExecuteQueryOperator(
task_id="execute_insert_query", sql=INSERT_ROWS_QUERY, use_legacy_sql=False
)
# [END howto_operator_bigquery_execute_query]

# [START howto_operator_bigquery_execute_query_list]
bigquery_execute_multi_query = BigQueryExecuteQueryOperator(
task_id="execute_multi_query",
sql=[
@@ -99,16 +108,13 @@
],
use_legacy_sql=False,
)
# [END howto_operator_bigquery_execute_query_list]

# [START howto_operator_bigquery_execute_query_save]
execute_query_save = BigQueryExecuteQueryOperator(
task_id="execute_query_save",
sql=f"SELECT * FROM {DATASET_NAME}.{TABLE_1}",
use_legacy_sql=False,
destination_dataset_table=f"{DATASET_NAME}.{TABLE_2}",
)
# [END howto_operator_bigquery_execute_query_save]

# [START howto_operator_bigquery_get_data]
get_data = BigQueryGetDataOperator(
@@ -137,7 +143,7 @@
check_value = BigQueryValueCheckOperator(
task_id="check_value",
sql=f"SELECT COUNT(*) FROM {DATASET_NAME}.{TABLE_1}",
pass_value=2,
pass_value=4,
use_legacy_sql=False,
)
# [END howto_operator_bigquery_value_check]
@@ -152,8 +158,9 @@
)
# [END howto_operator_bigquery_interval_check]

[create_table_1, create_table_2] >> execute_insert_query
[create_table_1, create_table_2] >> insert_query_job

insert_query_job >> execute_insert_query
execute_insert_query >> get_data >> get_data_result >> delete_dataset
execute_insert_query >> execute_query_save >> bigquery_execute_multi_query >> delete_dataset
execute_insert_query >> [check_count, check_value, check_interval] >> delete_dataset
76 changes: 61 additions & 15 deletions airflow/providers/google/cloud/hooks/bigquery.py
@@ -1397,13 +1397,42 @@ def cancel_job(
self.log.info('Waiting for canceled job with id %s to finish.', job_id)
time.sleep(5)

@GoogleBaseHook.fallback_to_default_project_id
def get_job(
self,
job_id: Optional[str] = None,
project_id: Optional[str] = None,
location: Optional[str] = None,
) -> Union[CopyJob, QueryJob, LoadJob, ExtractJob]:
"""
Retrieves a BigQuery job. For more information see:
https://cloud.google.com/bigquery/docs/reference/v2/jobs
:param job_id: The ID of the job. The ID must contain only letters (a-z, A-Z),
numbers (0-9), underscores (_), or dashes (-). The maximum length is 1,024
characters. If not provided then uuid will be generated.
:type job_id: str
:param project_id: Google Cloud Project where the job is running
:type project_id: str
:param location: location the job is running
:type location: str
"""
client = self.get_client(project_id=project_id, location=location)
job = client.get_job(
job_id=job_id,
project=project_id,
location=location
)
return job

@GoogleBaseHook.fallback_to_default_project_id
def insert_job(
self,
configuration: Dict,
job_id: Optional[str] = None,
project_id: Optional[str] = None,
location: Optional[str] = None,
) -> str:
) -> Union[CopyJob, QueryJob, LoadJob, ExtractJob]:
"""
Executes a BigQuery job and returns the job instance.
See here:
@@ -1414,17 +1443,23 @@ def insert_job(
BigQuery's configuration field in the job object. See
https://cloud.google.com/bigquery/docs/reference/v2/jobs for
details.
:type configuration: Dict[str, Any]
:param job_id: The ID of the job. The ID must contain only letters (a-z, A-Z),
numbers (0-9), underscores (_), or dashes (-). The maximum length is 1,024
characters. If not provided then uuid will be generated.
:type job_id: str
:param project_id: Google Cloud Project where the job is running
:type project_id: str
:param location: location the job is running
:type location: str
"""
job_id = job_id or str(uuid.uuid4())
location = location or self.location
client = self.get_client(project_id=project_id, location=location)
job_data = {
"configuration": configuration,
"jobReference": {
"jobId": str(uuid.uuid4()),
"jobId": job_id,
"projectId": project_id,
"location": location
}
@@ -1446,9 +1481,7 @@ def insert_job(
if not job:
raise AirflowException(f"Unknown job type. Supported types: {supported_jobs.keys()}")
job = job.from_api_repr(job_data, client)
# Start the job and wait for it to complete and get the result.
job.result()
return job.job_id
return job

def run_with_configuration(self, configuration: Dict) -> str:
"""
@@ -1467,8 +1500,11 @@ def run_with_configuration(self, configuration: Dict) -> str:
"This method is deprecated. Please use `BigQueryHook.insert_job`",
DeprecationWarning
)
self.running_job_id = self.insert_job(configuration=configuration, project_id=self.project_id)
return self.running_job_id
job = self.insert_job(configuration=configuration, project_id=self.project_id)
# Start the job and wait for it to complete and get the result.
job.result()
self.running_job_id = job.job_id
return job.job_id

def run_load(self, # pylint: disable=too-many-locals,too-many-arguments,invalid-name
destination_project_dataset_table: str,
@@ -1709,8 +1745,11 @@ def run_load(self, # pylint: disable=too-many-locals,too-many-arguments,invalid
if allow_jagged_rows:
configuration['load']['allowJaggedRows'] = allow_jagged_rows

self.running_job_id = self.insert_job(configuration=configuration, project_id=self.project_id)
return self.running_job_id
job = self.insert_job(configuration=configuration, project_id=self.project_id)
# Start the job and wait for it to complete and get the result.
job.result()
self.running_job_id = job.job_id
return job.job_id

def run_copy(self, # pylint: disable=invalid-name
source_project_dataset_tables: Union[List, str],
@@ -1803,8 +1842,11 @@ def run_copy(self, # pylint: disable=invalid-name
"destinationEncryptionConfiguration"
] = encryption_configuration

self.running_job_id = self.insert_job(configuration=configuration, project_id=self.project_id)
return self.running_job_id
job = self.insert_job(configuration=configuration, project_id=self.project_id)
# Start the job and wait for it to complete and get the result.
job.result()
self.running_job_id = job.job_id
return job.job_id

def run_extract(
self,
@@ -1878,8 +1920,9 @@ def run_extract(
configuration['extract']['fieldDelimiter'] = field_delimiter
configuration['extract']['printHeader'] = print_header

self.running_job_id = self.insert_job(configuration=configuration, project_id=self.project_id)
return self.running_job_id
job = self.insert_job(configuration=configuration, project_id=self.project_id)
self.running_job_id = job.job_id
return job.job_id

# pylint: disable=too-many-locals,too-many-arguments, too-many-branches
def run_query(self,
@@ -2123,8 +2166,11 @@ def run_query(self,
"destinationEncryptionConfiguration"
] = encryption_configuration

self.running_job_id = self.insert_job(configuration=configuration, project_id=self.project_id)
return self.running_job_id
job = self.insert_job(configuration=configuration, project_id=self.project_id)
# Start the job and wait for it to complete and get the result.
job.result()
self.running_job_id = job.job_id
return job.job_id


class BigQueryPandasConnector(GbqConnector):
82 changes: 82 additions & 0 deletions airflow/providers/google/cloud/operators/bigquery.py
@@ -23,10 +23,12 @@
import enum
import json
import warnings
from time import sleep
from typing import Any, Dict, Iterable, List, Optional, SupportsAbs, Union

import attr
from google.api_core.exceptions import Conflict
from google.api_core.retry import exponential_sleep_generator
from google.cloud.bigquery import TableReference

from airflow.exceptions import AirflowException
@@ -546,6 +548,11 @@ def __init__(self,
"the gcp_conn_id parameter.", DeprecationWarning, stacklevel=3)
gcp_conn_id = bigquery_conn_id

warnings.warn(
"This operator is deprecated. Please use `BigQueryInsertJobOperator`.",
DeprecationWarning, stacklevel=3,
)

self.sql = sql
self.destination_dataset_table = destination_dataset_table
self.write_disposition = write_disposition
@@ -1570,3 +1577,78 @@ def execute(self, context):
table_resource=self.table_resource,
project_id=self.project_id,
)


class BigQueryInsertJobOperator(BaseOperator):
"""
Executes a BigQuery job. Waits for the job to complete and returns job id.
See here:
https://cloud.google.com/bigquery/docs/reference/v2/jobs
:param configuration: The configuration parameter maps directly to BigQuery's
configuration field in the job object. For more details see
https://cloud.google.com/bigquery/docs/reference/v2/jobs
:type configuration: Dict[str, Any]
:param job_id: The ID of the job. The ID must contain only letters (a-z, A-Z),
numbers (0-9), underscores (_), or dashes (-). The maximum length is 1,024
characters. If not provided then uuid will be generated.
:type job_id: str
:param project_id: Google Cloud Project where the job is running
:type project_id: str
:param location: location the job is running
:type location: str
:param gcp_conn_id: (Optional) The connection ID used to connect to Google Cloud Platform.
:type gcp_conn_id: str
"""

template_fields = ("configuration", "job_id")
ui_color = BigQueryUIColors.QUERY.value

def __init__(
self,
configuration: Dict[str, Any],
project_id: Optional[str] = None,
location: Optional[str] = None,
job_id: Optional[str] = None,
gcp_conn_id: str = 'google_cloud_default',
delegate_to: Optional[str] = None,
*args,
**kwargs,
) -> None:
super().__init__(*args, **kwargs)
self.configuration = configuration
self.location = location
self.job_id = job_id
self.project_id = project_id
self.gcp_conn_id = gcp_conn_id
self.delegate_to = delegate_to

def execute(self, context: Any):
hook = BigQueryHook(
gcp_conn_id=self.gcp_conn_id,
delegate_to=self.delegate_to,
)

try:
job = hook.insert_job(
configuration=self.configuration,
project_id=self.project_id,
location=self.location,
job_id=self.job_id,
)
# Start the job and wait for it to complete and get the result.
job.result()
except Conflict:
job = hook.get_job(
project_id=self.project_id,
location=self.location,
job_id=self.job_id,
)
# Get existing job and wait for it to be ready
for time_to_wait in exponential_sleep_generator(initial=10, maximum=120):
sleep(time_to_wait)
job.reload()
if job.done():
break
return job.job_id
37 changes: 14 additions & 23 deletions docs/howto/operator/gcp/bigquery.rst
@@ -241,10 +241,10 @@ You can also use this operator to delete a view.
:start-after: [START howto_operator_bigquery_delete_view]
:end-before: [END howto_operator_bigquery_delete_view]

.. _howto/operator:BigQueryExecuteQueryOperator:
.. _howto/operator:BigQueryInsertJobOperator:

Execute queries
^^^^^^^^^^^^^^^
Execute BigQuery jobs
^^^^^^^^^^^^^^^^^^^^^

Let's say you would like to execute the following query.

@@ -255,32 +255,23 @@ Let's say you would like to execute the following query.
:end-before: [END howto_operator_bigquery_query]

To execute the SQL query in a specific BigQuery database you can use
:class:`~airflow.providers.google.cloud.operators.bigquery.BigQueryExecuteQueryOperator`.
:class:`~airflow.providers.google.cloud.operators.bigquery.BigQueryInsertJobOperator` with
proper query job configuration.

.. exampleinclude:: ../../../../airflow/providers/google/cloud/example_dags/example_bigquery_queries.py
:language: python
:dedent: 4
:start-after: [START howto_operator_bigquery_execute_query]
:end-before: [END howto_operator_bigquery_execute_query]

``sql`` argument can receive a str representing a sql statement, a list of str
(sql statements), or reference to a template file. Template reference are recognized
by str ending in '.sql'.
:start-after: [START howto_operator_bigquery_insert_job]
:end-before: [END howto_operator_bigquery_insert_job]

.. exampleinclude:: ../../../../airflow/providers/google/cloud/example_dags/example_bigquery_queries.py
:language: python
:dedent: 4
:start-after: [START howto_operator_bigquery_execute_query_list]
:end-before: [END howto_operator_bigquery_execute_query_list]
For more information on the types of BigQuery jobs, please check the
`documentation <https://cloud.google.com/bigquery/docs/reference/v2/jobs>`__.
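
As an illustration, the same operator can submit non-query jobs by supplying the matching
configuration section. The sketch below shows a CSV load job; the bucket, object, project,
dataset, and table names are placeholders and are not part of this commit's example DAG.

.. code-block:: python

    from airflow.providers.google.cloud.operators.bigquery import BigQueryInsertJobOperator

    # A minimal sketch of a load job. All resource names below are placeholders.
    load_csv_job = BigQueryInsertJobOperator(
        task_id="load_csv_job",
        configuration={
            "load": {
                "sourceUris": ["gs://example-bucket/data/example.csv"],
                "destinationTable": {
                    "projectId": "example-project",
                    "datasetId": "example_dataset",
                    "tableId": "example_table",
                },
                "sourceFormat": "CSV",
                "skipLeadingRows": 1,
                "writeDisposition": "WRITE_APPEND",
            }
        },
    )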

You can store the results of the query in a table by specifying
``destination_dataset_table``.

.. exampleinclude:: ../../../../airflow/providers/google/cloud/example_dags/example_bigquery_queries.py
:language: python
:dedent: 4
:start-after: [START howto_operator_bigquery_execute_query_save]
:end-before: [END howto_operator_bigquery_execute_query_save]
Additionally, you can use the ``job_id`` parameter of
:class:`~airflow.providers.google.cloud.operators.bigquery.BigQueryInsertJobOperator` to improve
idempotency. If this parameter is not passed then a UUID will be used as the ``job_id``. If provided,
the operator will try to submit a new job with this ``job_id``. If there is already a job with
that ``job_id``, the operator will reattach to the existing job.
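
For instance, a sketch of a task with a templated ``job_id`` (the id value below is illustrative,
and ``INSERT_ROWS_QUERY`` comes from the example above); a rerun for the same logical date would
then reattach to the previously submitted job instead of starting a new one.

.. code-block:: python

    # Illustrative sketch: job_id is a template field, so it can be rendered per
    # logical date. Resubmitting with the same rendered job_id raises a Conflict,
    # and the operator then reattaches to the existing job.
    insert_query_job = BigQueryInsertJobOperator(
        task_id="insert_query_job",
        configuration={
            "query": {
                "query": INSERT_ROWS_QUERY,
                "useLegacySql": False,
            }
        },
        job_id="insert_query_job_{{ ds_nodash }}",
    )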

Validate data
^^^^^^^^^^^^^