[AIRFLOW-7064] Add CloudFirestoreExportDatabaseOperator (#7725)
mik-laj authored Mar 18, 2020
1 parent e21b2e1 commit 63a3102
Showing 18 changed files with 892 additions and 0 deletions.
16 changes: 16 additions & 0 deletions airflow/providers/google/firebase/__init__.py
@@ -0,0 +1,16 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
16 changes: 16 additions & 0 deletions airflow/providers/google/firebase/example_dags/__init__.py
@@ -0,0 +1,16 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
118 changes: 118 additions & 0 deletions airflow/providers/google/firebase/example_dags/example_firestore.py
@@ -0,0 +1,118 @@
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

"""
Example Airflow DAG that showss interactions with Google Cloud Firestore.
Prerequisites
=============
This example uses two GCP projects:
* ``GCP_PROJECT_ID`` - It contains a bucket and a firestore database.
* ``G_FIRESTORE_PROJECT_ID`` - it contains the Data Warehouse based on the BigQuery service.
Saving in a bucket should be possible from the ``G_FIRESTORE_PROJECT_ID`` project.
Reading from a bucket should be possible from the ``GCP_PROJECT_ID`` project.
The bucket and dataset should be located in the same region.
If you want to run this example, you must do the following:
1. Create GCP project and enable the BigQuery API
2. Create the Firebase project
3. Create a bucket in the same location as the the Firebase project
4. Grant Firebase admin account permissions to manage BigQuery. This is required to create a dataset.
5. Create a bucket in Firebase project and
6. Give read/write access for Firebase admin to bucket to step no. 5.
"""

import os
from urllib.parse import urlparse

from airflow import models
from airflow.providers.google.cloud.operators.bigquery import (
BigQueryCreateEmptyDatasetOperator, BigQueryCreateExternalTableOperator, BigQueryDeleteDatasetOperator,
BigQueryExecuteQueryOperator,
)
from airflow.providers.google.firebase.operators.firestore import CloudFirestoreExportDatabaseOperator
from airflow.utils import dates

GCP_PROJECT_ID = os.environ.get("GCP_PROJECT_ID", "example-gcp-project")
FIRESTORE_PROJECT_ID = os.environ.get("G_FIRESTORE_PROJECT_ID", "example-firebase-project")

EXPORT_DESTINATION_URL = os.environ.get("GCP_FIRESTORE_ARCHIVE_URL", "gs://airflow-firestore/namespace/")
BUCKET_NAME = urlparse(EXPORT_DESTINATION_URL).hostname
EXPORT_PREFIX = urlparse(EXPORT_DESTINATION_URL).path.strip("/")  # strip slashes so object paths are well-formed

EXPORT_COLLECTION_ID = os.environ.get("GCP_FIRESTORE_COLLECTION_ID", "firestore_collection_id")
DATASET_NAME = os.environ.get("GCP_FIRESTORE_DATASET_NAME", "test_firestore_export")
DATASET_LOCATION = os.environ.get("GCP_FIRESTORE_DATASET_LOCATION", "EU")


with models.DAG(
"example_google_firestore",
default_args=dict(start_date=dates.days_ago(1)),
schedule_interval=None,
tags=["example"],
) as dag:
# [START howto_operator_export_database_to_gcs]
export_database_to_gcs = CloudFirestoreExportDatabaseOperator(
task_id="export_database_to_gcs",
project_id=FIRESTORE_PROJECT_ID,
body={"outputUriPrefix": EXPORT_DESTINATION_URL, "collectionIds": [EXPORT_COLLECTION_ID]},
)
# [END howto_operator_export_database_to_gcs]

create_dataset = BigQueryCreateEmptyDatasetOperator(
task_id="create_dataset",
dataset_id=DATASET_NAME,
location=DATASET_LOCATION,
project_id=GCP_PROJECT_ID,
)

delete_dataset = BigQueryDeleteDatasetOperator(
task_id="delete_dataset", dataset_id=DATASET_NAME, project_id=GCP_PROJECT_ID, delete_contents=True
)

# [START howto_operator_create_external_table_multiple_types]
create_external_table_multiple_types = BigQueryCreateExternalTableOperator(
task_id="create_external_table",
bucket=BUCKET_NAME,
source_objects=[
f"{EXPORT_PREFIX}/all_namespaces/kind_{EXPORT_COLLECTION_ID}"
f"/all_namespaces_kind_{EXPORT_COLLECTION_ID}.export_metadata"
],
source_format="DATASTORE_BACKUP",
destination_project_dataset_table=f"{GCP_PROJECT_ID}.{DATASET_NAME}.firestore_data",
)
# [END howto_operator_create_external_table_multiple_types]

read_data_from_gcs_multiple_types = BigQueryExecuteQueryOperator(
task_id="execute_query",
sql=f"SELECT COUNT(*) FROM `{GCP_PROJECT_ID}.{DATASET_NAME}.firestore_data`",
use_legacy_sql=False,
)

# Firestore
export_database_to_gcs >> create_dataset

# BigQuery
create_dataset >> create_external_table_multiple_types
create_external_table_multiple_types >> read_data_from_gcs_multiple_types
read_data_from_gcs_multiple_types >> delete_dataset
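
Prerequisite step 6 in the example's docstring grants the Firebase admin account access to the export bucket. Below is a minimal sketch of that grant using the google-cloud-storage client library; it is not part of this commit, and the project name, bucket name, and service-agent address are placeholders you must replace with your own project's values.

# Hedged sketch of prerequisite step 6; not part of this commit.
from google.cloud import storage

client = storage.Client(project="example-gcp-project")  # placeholder project
bucket = client.get_bucket("airflow-firestore")  # bucket from EXPORT_DESTINATION_URL

policy = bucket.get_iam_policy(requested_policy_version=3)
policy.bindings.append(
    {
        "role": "roles/storage.admin",
        # Placeholder service agent; the real address depends on your project number.
        "members": {"serviceAccount:service-1234567890@gcp-sa-firestore.iam.gserviceaccount.com"},
    }
)
bucket.set_iam_policy(policy)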
16 changes: 16 additions & 0 deletions airflow/providers/google/firebase/hooks/__init__.py
@@ -0,0 +1,16 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
142 changes: 142 additions & 0 deletions airflow/providers/google/firebase/hooks/firestore.py
@@ -0,0 +1,142 @@
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""Hook for Google Cloud Firestore service"""

import time
from typing import Any, Dict, Optional

from googleapiclient.discovery import build, build_from_document

from airflow.exceptions import AirflowException
from airflow.providers.google.cloud.hooks.base import CloudBaseHook

# Time to sleep between active checks of the operation results
TIME_TO_SLEEP_IN_SECONDS = 5


# noinspection PyAbstractClass
class CloudFirestoreHook(CloudBaseHook):
"""
Hook for the Google Firestore APIs.
All the methods in the hook where project_id is used must be called with
keyword arguments rather than positional.
:param api_version: API version used (for example v1 or v1beta1).
:type api_version: str
:param gcp_conn_id: The connection ID to use when fetching connection info.
:type gcp_conn_id: str
:param delegate_to: The account to impersonate, if any.
For this to work, the service account making the request must have
domain-wide delegation enabled.
:type delegate_to: str
"""

_conn = None # type: Optional[Any]

def __init__(
self,
api_version: str = "v1",
gcp_conn_id: str = "google_cloud_default",
delegate_to: Optional[str] = None,
) -> None:
super().__init__(gcp_conn_id, delegate_to)
self.api_version = api_version

def get_conn(self):
"""
Retrieves the connection to Cloud Firestore.
:return: Google Cloud Firestore services object.
"""
if not self._conn:
http_authorized = self._authorize()
            # We cannot use an authorized client to retrieve the discovery document due to an error
            # in the API. When an authorized client sends a request to
            # https://www.googleapis.com/discovery/v1/apis/firestore/v1/rest
            # it receives the message:
            # > Request contains an invalid argument.
            # A non-authorized client, at the same time, has no problems.
non_authorized_conn = build("firestore", self.api_version, cache_discovery=False)
self._conn = build_from_document(
non_authorized_conn._rootDesc, # pylint: disable=protected-access
http=http_authorized
)
return self._conn

@CloudBaseHook.fallback_to_default_project_id
def export_documents(
self, body: Dict, database_id: str = "(default)", project_id: Optional[str] = None
) -> None:
"""
Starts a export with the specified configuration.
:param database_id: The Database ID.
:type database_id: str
:param body: The request body.
See:
https://firebase.google.com/docs/firestore/reference/rest/v1beta1/projects.databases/exportDocuments
:type body: dict
:param project_id: Optional, Google Cloud Project project_id where the database belongs.
If set to None or missing, the default project_id from the GCP connection is used.
:type project_id: str
"""
if not project_id:
raise ValueError("The project_id should be set")
service = self.get_conn()

name = f"projects/{project_id}/databases/{database_id}"

operation = (
service.projects() # pylint: disable=no-member
.databases()
.exportDocuments(name=name, body=body)
.execute(num_retries=self.num_retries)
)

self._wait_for_operation_to_complete(operation["name"])

    def _wait_for_operation_to_complete(self, operation_name: str) -> Optional[Dict]:
        """
        Waits for the named operation to complete by polling the status of the
        asynchronous call.

        :param operation_name: The name of the operation.
        :type operation_name: str
        :return: The response returned by the operation.
        :rtype: dict
        :exception: AirflowException in case an error is returned.
        """
service = self.get_conn()
while True:
operation_response = (
service.projects() # pylint: disable=no-member
.databases()
.operations()
.get(name=operation_name)
.execute(num_retries=self.num_retries)
)
if operation_response.get("done"):
response = operation_response.get("response")
error = operation_response.get("error")
                # Note: per the documentation, exactly one of "response" or "error"
                # is set when "done" == True.
if error:
raise AirflowException(str(error))
return response
time.sleep(TIME_TO_SLEEP_IN_SECONDS)
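
For reference, the hook can also be used on its own, for example from a PythonOperator callable. A minimal usage sketch follows, with placeholder project, bucket, and collection names (keyword arguments are required, as noted in the class docstring):

# Hedged usage sketch for CloudFirestoreHook; not part of this commit.
from airflow.providers.google.firebase.hooks.firestore import CloudFirestoreHook

hook = CloudFirestoreHook(api_version="v1", gcp_conn_id="google_cloud_default")
hook.export_documents(
    project_id="example-firebase-project",  # placeholder project
    database_id="(default)",
    body={
        "outputUriPrefix": "gs://example-bucket/namespace/",  # placeholder bucket
        "collectionIds": ["firestore_collection_id"],
    },
)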
16 changes: 16 additions & 0 deletions airflow/providers/google/firebase/operators/__init__.py
@@ -0,0 +1,16 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
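
The remaining files in this commit, including airflow/providers/google/firebase/operators/firestore.py where CloudFirestoreExportDatabaseOperator is defined, are not expanded in this view. As rough orientation only, here is a speculative sketch of what the operator amounts to, inferred from the hook's export_documents() signature and the operator's usage in the example DAG above; it is not the committed file.

# Speculative sketch, inferred from the hook and the example DAG.
from typing import Any, Dict, Optional

from airflow.models import BaseOperator
from airflow.providers.google.firebase.hooks.firestore import CloudFirestoreHook
from airflow.utils.decorators import apply_defaults


class CloudFirestoreExportDatabaseOperatorSketch(BaseOperator):
    """Exports documents from Google Cloud Firestore to Google Cloud Storage."""

    @apply_defaults
    def __init__(
        self,
        body: Dict,
        database_id: str = "(default)",
        project_id: Optional[str] = None,
        gcp_conn_id: str = "google_cloud_default",
        api_version: str = "v1",
        *args,
        **kwargs,
    ) -> None:
        super().__init__(*args, **kwargs)
        self.body = body
        self.database_id = database_id
        self.project_id = project_id
        self.gcp_conn_id = gcp_conn_id
        self.api_version = api_version

    def execute(self, context) -> Any:
        # Delegate the export (and the wait for completion) to the hook.
        hook = CloudFirestoreHook(api_version=self.api_version, gcp_conn_id=self.gcp_conn_id)
        return hook.export_documents(
            body=self.body, database_id=self.database_id, project_id=self.project_id
        )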