From 5c4cfea8c0f488496c1cbcc4c6c5db13d8210979 Mon Sep 17 00:00:00 2001
From: TobKed
Date: Fri, 15 Nov 2019 14:32:32 +0100
Subject: [PATCH] [AIRFLOW-5718] Add SFTPToGoogleCloudStorageOperator (#6393)

---
 .../cloud/example_dags/example_sftp_to_gcs.py |  79 +++++++
 .../google/cloud/operators/sftp_to_gcs.py     | 178 +++++++++++++++
 docs/howto/operator/gcp/sftp_to_gcs.rst       | 106 +++++++++
 docs/operators-and-hooks-ref.rst              |   5 +
 .../cloud/operators/test_sftp_to_gcs.py       | 216 ++++++++++++++++++
 .../operators/test_sftp_to_gcs_system.py      |  50 ++++
 .../test_sftp_to_gcs_system_helper.py         | 140 ++++++++++++
 7 files changed, 774 insertions(+)
 create mode 100644 airflow/providers/google/cloud/example_dags/example_sftp_to_gcs.py
 create mode 100644 airflow/providers/google/cloud/operators/sftp_to_gcs.py
 create mode 100644 docs/howto/operator/gcp/sftp_to_gcs.rst
 create mode 100644 tests/providers/google/cloud/operators/test_sftp_to_gcs.py
 create mode 100644 tests/providers/google/cloud/operators/test_sftp_to_gcs_system.py
 create mode 100644 tests/providers/google/cloud/operators/test_sftp_to_gcs_system_helper.py

diff --git a/airflow/providers/google/cloud/example_dags/example_sftp_to_gcs.py b/airflow/providers/google/cloud/example_dags/example_sftp_to_gcs.py
new file mode 100644
index 0000000000000..08d621d2b2814
--- /dev/null
+++ b/airflow/providers/google/cloud/example_dags/example_sftp_to_gcs.py
@@ -0,0 +1,79 @@
+# -*- coding: utf-8 -*-
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+Example Airflow DAG for SFTP to Google Cloud Storage transfer operators.
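+
+The DAG reads the destination bucket name from the ``GCP_GCS_BUCKET_1_SRC``
+environment variable (default: ``test-sftp-gcs``) and expects the source
+files to already exist under ``/tmp/tests_sftp_hook_dir`` on the SFTP server;
+the system test helper creates them before the DAG runs.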
+""" + +import os + +import airflow +from airflow import models +from airflow.providers.google.cloud.operators.sftp_to_gcs import SFTPToGoogleCloudStorageOperator + +default_args = {"start_date": airflow.utils.dates.days_ago(1)} + +BUCKET_SRC = os.environ.get("GCP_GCS_BUCKET_1_SRC", "test-sftp-gcs") + +TMP_PATH = "/tmp" +DIR = "tests_sftp_hook_dir" +SUBDIR = "subdir" + +OBJECT_SRC_1 = "parent-1.bin" +OBJECT_SRC_2 = "parent-2.bin" +OBJECT_SRC_3 = "parent-3.txt" + + +with models.DAG( + "example_sftp_to_gcs", default_args=default_args, schedule_interval=None +) as dag: + # [START howto_operator_sftp_to_gcs_copy_single_file] + copy_file_from_sftp_to_gcs = SFTPToGoogleCloudStorageOperator( + task_id="file-copy-sftp-to-gcs", + source_path=os.path.join(TMP_PATH, DIR, OBJECT_SRC_1), + destination_bucket=BUCKET_SRC, + ) + # [END howto_operator_sftp_to_gcs_copy_single_file] + + # [START howto_operator_sftp_to_gcs_move_single_file_destination] + move_file_from_sftp_to_gcs_destination = SFTPToGoogleCloudStorageOperator( + task_id="file-move-sftp-to-gcs-destination", + source_path=os.path.join(TMP_PATH, DIR, OBJECT_SRC_2), + destination_bucket=BUCKET_SRC, + destination_path="destination_dir/destination_filename.bin", + move_object=True, + ) + # [END howto_operator_sftp_to_gcs_move_single_file_destination] + + # [START howto_operator_sftp_to_gcs_copy_directory] + copy_directory_from_sftp_to_gcs = SFTPToGoogleCloudStorageOperator( + task_id="dir-copy-sftp-to-gcs", + source_path=os.path.join(TMP_PATH, DIR, SUBDIR, "*"), + destination_bucket=BUCKET_SRC, + ) + # [END howto_operator_sftp_to_gcs_copy_directory] + + # [START howto_operator_sftp_to_gcs_move_specific_files] + move_specific_files_from_gcs_to_sftp = SFTPToGoogleCloudStorageOperator( + task_id="dir-move-specific-files-sftp-to-gcs", + source_path=os.path.join(TMP_PATH, DIR, SUBDIR, "*.bin"), + destination_bucket=BUCKET_SRC, + destination_path="specific_files/", + move_object=True, + ) + # [END howto_operator_sftp_to_gcs_move_specific_files] diff --git a/airflow/providers/google/cloud/operators/sftp_to_gcs.py b/airflow/providers/google/cloud/operators/sftp_to_gcs.py new file mode 100644 index 0000000000000..478bff1e868ca --- /dev/null +++ b/airflow/providers/google/cloud/operators/sftp_to_gcs.py @@ -0,0 +1,178 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +This module contains SFTP to Google Cloud Storage operator. 
+""" +import os +from tempfile import NamedTemporaryFile +from typing import Optional, Union + +from airflow import AirflowException +from airflow.contrib.hooks.sftp_hook import SFTPHook +from airflow.gcp.hooks.gcs import GoogleCloudStorageHook +from airflow.models import BaseOperator +from airflow.utils.decorators import apply_defaults + +WILDCARD = "*" + + +class SFTPToGoogleCloudStorageOperator(BaseOperator): + """ + Transfer files to Google Cloud Storage from SFTP server. + + .. seealso:: + For more information on how to use this operator, take a look at the guide: + :ref:`howto/operator:SFTPToGoogleCloudStorageOperator` + + :param source_path: The sftp remote path. This is the specified file path + for downloading the single file or multiple files from the SFTP server. + You can use only one wildcard within your path. The wildcard can appear + inside the path or at the end of the path. + :type source_path: str + :param destination_bucket: The bucket to upload to. + :type destination_bucket: str + :param destination_path: The destination name of the object in the + destination Google Cloud Storage bucket. + If destination_path is not provided file/files will be placed in the + main bucket path. + If a wildcard is supplied in the destination_path argument, this is the + prefix that will be prepended to the final destination objects' paths. + :type destination_path: str + :param gcp_conn_id: (Optional) The connection ID used to connect to Google Cloud Platform. + :type gcp_conn_id: str + :param sftp_conn_id: The sftp connection id. The name or identifier for + establishing a connection to the SFTP server. + :type sftp_conn_id: str + :param delegate_to: The account to impersonate, if any + :type delegate_to: str + :param mime_type: The mime-type string + :type mime_type: str + :param gzip: Allows for file to be compressed and uploaded as gzip + :type gzip: bool + :param move_object: When move object is True, the object is moved instead + of copied to the new location. This is the equivalent of a mv command + as opposed to a cp command. + :type move_object: bool + """ + + template_fields = ("source_path", "destination_path", "destination_bucket") + + @apply_defaults + def __init__( + self, + source_path: str, + destination_bucket: str, + destination_path: Optional[str] = None, + gcp_conn_id: str = "google_cloud_default", + sftp_conn_id: str = "ssh_default", + delegate_to: Optional[str] = None, + mime_type: str = "application/octet-stream", + gzip: bool = False, + move_object: bool = False, + *args, + **kwargs + ) -> None: + super().__init__(*args, **kwargs) + + self.source_path = source_path + self.destination_path = self._set_destination_path(destination_path) + self.destination_bucket = self._set_bucket_name(destination_bucket) + self.gcp_conn_id = gcp_conn_id + self.mime_type = mime_type + self.delegate_to = delegate_to + self.gzip = gzip + self.sftp_conn_id = sftp_conn_id + self.move_object = move_object + + def execute(self, context): + gcs_hook = GoogleCloudStorageHook( + gcp_conn_id=self.gcp_conn_id, delegate_to=self.delegate_to + ) + + sftp_hook = SFTPHook(self.sftp_conn_id) + + if WILDCARD in self.source_path: + total_wildcards = self.source_path.count(WILDCARD) + if total_wildcards > 1: + raise AirflowException( + "Only one wildcard '*' is allowed in source_path parameter. 
" + "Found {} in {}.".format(total_wildcards, self.source_path) + ) + + prefix, delimiter = self.source_path.split(WILDCARD, 1) + base_path = os.path.dirname(prefix) + + files, _, _ = sftp_hook.get_tree_map( + base_path, prefix=prefix, delimiter=delimiter + ) + + for file in files: + destination_path = file.replace(base_path, self.destination_path, 1) + self._copy_single_object(gcs_hook, sftp_hook, file, destination_path) + + else: + destination_object = ( + self.destination_path + if self.destination_path + else self.source_path.rsplit("/", 1)[1] + ) + self._copy_single_object( + gcs_hook, sftp_hook, self.source_path, destination_object + ) + + def _copy_single_object( + self, + gcs_hook: GoogleCloudStorageHook, + sftp_hook: SFTPHook, + source_path: str, + destination_object: str, + ) -> None: + """ + Helper function to copy single object. + """ + self.log.info( + "Executing copy of %s to gs://%s/%s", + source_path, + self.destination_bucket, + destination_object, + ) + + with NamedTemporaryFile("w") as tmp: + sftp_hook.retrieve_file(source_path, tmp.name) + + gcs_hook.upload( + bucket_name=self.destination_bucket, + object_name=destination_object, + filename=tmp.name, + mime_type=self.mime_type, + ) + + if self.move_object: + self.log.info("Executing delete of %s", source_path) + sftp_hook.delete_file(source_path) + + @staticmethod + def _set_destination_path(path: Union[str, None]) -> str: + if path is not None: + return path.lstrip("/") if path.startswith("/") else path + return "" + + @staticmethod + def _set_bucket_name(name: str) -> str: + bucket = name if not name.startswith("gs://") else name[5:] + return bucket.strip("/") diff --git a/docs/howto/operator/gcp/sftp_to_gcs.rst b/docs/howto/operator/gcp/sftp_to_gcs.rst new file mode 100644 index 0000000000000..887b045388124 --- /dev/null +++ b/docs/howto/operator/gcp/sftp_to_gcs.rst @@ -0,0 +1,106 @@ + .. Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + .. http://www.apache.org/licenses/LICENSE-2.0 + + .. Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. + + +SFTP to Google Cloud Storage Transfer Operator +============================================== + +Google has a service `Google Cloud Storage `__. This service is +used to store large data from various applications. +SFTP (SSH File Transfer Protocol) is a secure file transfer protocol. +It runs over the SSH protocol. It supports the full security and authentication functionality of the SSH. + + +.. contents:: + :depth: 1 + :local: + +Prerequisite Tasks +^^^^^^^^^^^^^^^^^^ + +.. include:: _partials/prerequisite_tasks.rst + +.. _howto/operator:SFTPToGoogleCloudStorageOperator: + +Operator +^^^^^^^^ + +Transfer files between SFTP and Google Storage is performed with the +:class:`~airflow.providers.google.cloud.operators.sftp_to_gcs.SFTPToGoogleCloudStorageOperator` operator. 
+
+Use :ref:`Jinja templating <jinja-templating>` with
+:template-fields:`airflow.providers.google.cloud.operators.sftp_to_gcs.SFTPToGoogleCloudStorageOperator`
+to define values dynamically.
+
+Copying single files
+--------------------
+
+The following operator copies a single file.
+
+.. exampleinclude:: ../../../../airflow/providers/google/cloud/example_dags/example_sftp_to_gcs.py
+    :language: python
+    :dedent: 4
+    :start-after: [START howto_operator_sftp_to_gcs_copy_single_file]
+    :end-before: [END howto_operator_sftp_to_gcs_copy_single_file]
+
+Moving a single file
+--------------------
+
+To move a file, use the ``move_object`` parameter. Once the file is copied to
+Google Cloud Storage, the original file is deleted from the SFTP server.
+The ``destination_path`` parameter defines the full path of the file in the bucket.
+
+.. exampleinclude:: ../../../../airflow/providers/google/cloud/example_dags/example_sftp_to_gcs.py
+    :language: python
+    :dedent: 4
+    :start-after: [START howto_operator_sftp_to_gcs_move_single_file_destination]
+    :end-before: [END howto_operator_sftp_to_gcs_move_single_file_destination]
+
+
+Copying a directory
+-------------------
+
+Use the wildcard ``*`` in the ``source_path`` parameter to copy a whole directory.
+
+.. exampleinclude:: ../../../../airflow/providers/google/cloud/example_dags/example_sftp_to_gcs.py
+    :language: python
+    :dedent: 4
+    :start-after: [START howto_operator_sftp_to_gcs_copy_directory]
+    :end-before: [END howto_operator_sftp_to_gcs_copy_directory]
+
+Moving specific files
+---------------------
+
+Use the wildcard ``*`` in the ``source_path`` parameter to move specific files.
+You can use only one wildcard within your path.
+The ``destination_path`` defines the prefix that is prepended to all copied files,
+e.g. ``tests_sftp_hook_dir/subdir/parent-1.bin`` is copied to ``specific_files/parent-1.bin``,
+and ``tests_sftp_hook_dir/subdir/parent-2.bin`` is copied to ``specific_files/parent-2.bin``.
+``tests_sftp_hook_dir/subdir/parent-3.txt`` is skipped.
+
+.. exampleinclude:: ../../../../airflow/providers/google/cloud/example_dags/example_sftp_to_gcs.py
+    :language: python
+    :dedent: 4
+    :start-after: [START howto_operator_sftp_to_gcs_move_specific_files]
+    :end-before: [END howto_operator_sftp_to_gcs_move_specific_files]
+
+Reference
+^^^^^^^^^
+
+For more information, see
+
+* `Google Cloud Storage Documentation <https://cloud.google.com/storage/docs/>`__
diff --git a/docs/operators-and-hooks-ref.rst b/docs/operators-and-hooks-ref.rst
index ae3697381b841..3ed9fcee3e3ab 100644
--- a/docs/operators-and-hooks-ref.rst
+++ b/docs/operators-and-hooks-ref.rst
@@ -777,6 +777,11 @@ These integrations allow you to copy data from/to Google Cloud Platform.
      -
      - :mod:`airflow.operators.postgres_to_gcs`
 
+   * - SFTP
+     - `Google Cloud Storage (GCS) <https://cloud.google.com/gcs/>`__
+     - :doc:`How to use <howto/operator/gcp/sftp_to_gcs>`
+     - :mod:`airflow.providers.google.cloud.operators.sftp_to_gcs`
+
    * - SQL
      - `Cloud Storage (GCS) <https://cloud.google.com/gcs/>`__
      -
diff --git a/tests/providers/google/cloud/operators/test_sftp_to_gcs.py b/tests/providers/google/cloud/operators/test_sftp_to_gcs.py
new file mode 100644
index 0000000000000..e79187460eb88
--- /dev/null
+++ b/tests/providers/google/cloud/operators/test_sftp_to_gcs.py
@@ -0,0 +1,216 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import os
+import unittest
+
+from airflow.exceptions import AirflowException
+from airflow.providers.google.cloud.operators.sftp_to_gcs import SFTPToGoogleCloudStorageOperator
+from tests.compat import mock
+
+TASK_ID = "test-sftp-to-gcs-operator"
+GCP_CONN_ID = "GCP_CONN_ID"
+SFTP_CONN_ID = "SFTP_CONN_ID"
+DELEGATE_TO = "DELEGATE_TO"
+
+DEFAULT_MIME_TYPE = "application/octet-stream"
+
+TEST_BUCKET = "test-bucket"
+SOURCE_OBJECT_WILDCARD_FILENAME = "main_dir/test_object*.json"
+SOURCE_OBJECT_NO_WILDCARD = "main_dir/test_object3.json"
+SOURCE_OBJECT_MULTIPLE_WILDCARDS = "main_dir/csv/*/test_*.csv"
+
+SOURCE_FILES_LIST = [
+    "main_dir/test_object1.txt",
+    "main_dir/test_object2.txt",
+    "main_dir/test_object3.json",
+    "main_dir/sub_dir/test_object1.txt",
+    "main_dir/sub_dir/test_object2.txt",
+    "main_dir/sub_dir/test_object3.json",
+]
+
+DESTINATION_PATH_DIR = "destination_dir"
+DESTINATION_PATH_FILE = "destination_dir/copy.txt"
+
+
+# pylint: disable=unused-argument
+class TestSFTPToGoogleCloudStorageOperator(unittest.TestCase):
+    @mock.patch("airflow.providers.google.cloud.operators.sftp_to_gcs.GoogleCloudStorageHook")
+    @mock.patch("airflow.providers.google.cloud.operators.sftp_to_gcs.SFTPHook")
+    def test_execute_copy_single_file(self, sftp_hook, gcs_hook):
+        task = SFTPToGoogleCloudStorageOperator(
+            task_id=TASK_ID,
+            source_path=SOURCE_OBJECT_NO_WILDCARD,
+            destination_bucket=TEST_BUCKET,
+            destination_path=DESTINATION_PATH_FILE,
+            move_object=False,
+            gcp_conn_id=GCP_CONN_ID,
+            sftp_conn_id=SFTP_CONN_ID,
+            delegate_to=DELEGATE_TO,
+        )
+        task.execute(None)
+        gcs_hook.assert_called_once_with(
+            gcp_conn_id=GCP_CONN_ID, delegate_to=DELEGATE_TO
+        )
+        sftp_hook.assert_called_once_with(SFTP_CONN_ID)
+
+        sftp_hook.return_value.retrieve_file.assert_called_once_with(
+            SOURCE_OBJECT_NO_WILDCARD, mock.ANY
+        )
+
+        gcs_hook.return_value.upload.assert_called_once_with(
+            bucket_name=TEST_BUCKET,
+            object_name=DESTINATION_PATH_FILE,
+            filename=mock.ANY,
+            mime_type=DEFAULT_MIME_TYPE,
+            gzip=False,
+        )
+
+        sftp_hook.return_value.delete_file.assert_not_called()
+
+    @mock.patch("airflow.providers.google.cloud.operators.sftp_to_gcs.GoogleCloudStorageHook")
+    @mock.patch("airflow.providers.google.cloud.operators.sftp_to_gcs.SFTPHook")
+    def test_execute_move_single_file(self, sftp_hook, gcs_hook):
+        task = SFTPToGoogleCloudStorageOperator(
+            task_id=TASK_ID,
+            source_path=SOURCE_OBJECT_NO_WILDCARD,
+            destination_bucket=TEST_BUCKET,
+            destination_path=DESTINATION_PATH_FILE,
+            move_object=True,
+            gcp_conn_id=GCP_CONN_ID,
+            sftp_conn_id=SFTP_CONN_ID,
+            delegate_to=DELEGATE_TO,
+        )
+        task.execute(None)
+        gcs_hook.assert_called_once_with(
+            gcp_conn_id=GCP_CONN_ID, delegate_to=DELEGATE_TO
+        )
+        sftp_hook.assert_called_once_with(SFTP_CONN_ID)
+
+        sftp_hook.return_value.retrieve_file.assert_called_once_with(
+            SOURCE_OBJECT_NO_WILDCARD, mock.ANY
+        )
+
+        gcs_hook.return_value.upload.assert_called_once_with(
+            bucket_name=TEST_BUCKET,
+            object_name=DESTINATION_PATH_FILE,
+            filename=mock.ANY,
+            mime_type=DEFAULT_MIME_TYPE,
+            gzip=False,
+        )
+
+        sftp_hook.return_value.delete_file.assert_called_once_with(
+            SOURCE_OBJECT_NO_WILDCARD
+        )
+
+    @mock.patch("airflow.providers.google.cloud.operators.sftp_to_gcs.GoogleCloudStorageHook")
+    @mock.patch("airflow.providers.google.cloud.operators.sftp_to_gcs.SFTPHook")
+    def test_execute_copy_with_wildcard(self, sftp_hook, gcs_hook):
+        sftp_hook.return_value.get_tree_map.return_value = [
+            ["main_dir/test_object3.json", "main_dir/sub_dir/test_object3.json"],
+            [],
+            [],
+        ]
+
+        task = SFTPToGoogleCloudStorageOperator(
+            task_id=TASK_ID,
+            source_path=SOURCE_OBJECT_WILDCARD_FILENAME,
+            destination_bucket=TEST_BUCKET,
+            destination_path=DESTINATION_PATH_DIR,
+            move_object=False,
+            gcp_conn_id=GCP_CONN_ID,
+            sftp_conn_id=SFTP_CONN_ID,
+            delegate_to=DELEGATE_TO,
+        )
+        task.execute(None)
+
+        sftp_hook.return_value.get_tree_map.assert_called_with(
+            "main_dir", prefix="main_dir/test_object", delimiter=".json"
+        )
+
+        sftp_hook.return_value.retrieve_file.assert_has_calls(
+            [
+                mock.call("main_dir/test_object3.json", mock.ANY),
+                mock.call("main_dir/sub_dir/test_object3.json", mock.ANY),
+            ]
+        )
+
+        gcs_hook.return_value.upload.assert_has_calls(
+            [
+                mock.call(
+                    bucket_name=TEST_BUCKET,
+                    object_name="destination_dir/test_object3.json",
+                    mime_type=DEFAULT_MIME_TYPE,
+                    filename=mock.ANY,
+                    gzip=False,
+                ),
+                mock.call(
+                    bucket_name=TEST_BUCKET,
+                    object_name="destination_dir/sub_dir/test_object3.json",
+                    mime_type=DEFAULT_MIME_TYPE,
+                    filename=mock.ANY,
+                    gzip=False,
+                ),
+            ]
+        )
+
+    @mock.patch("airflow.providers.google.cloud.operators.sftp_to_gcs.GoogleCloudStorageHook")
+    @mock.patch("airflow.providers.google.cloud.operators.sftp_to_gcs.SFTPHook")
+    def test_execute_move_with_wildcard(self, sftp_hook, gcs_hook):
+        sftp_hook.return_value.get_tree_map.return_value = [
+            ["main_dir/test_object3.json", "main_dir/sub_dir/test_object3.json"],
+            [],
+            [],
+        ]
+
+        gcs_hook.return_value.list.return_value = SOURCE_FILES_LIST[:2]
+        task = SFTPToGoogleCloudStorageOperator(
+            task_id=TASK_ID,
+            source_path=SOURCE_OBJECT_WILDCARD_FILENAME,
+            destination_bucket=TEST_BUCKET,
+            destination_path=DESTINATION_PATH_DIR,
+            move_object=True,
+            gcp_conn_id=GCP_CONN_ID,
+            sftp_conn_id=SFTP_CONN_ID,
+            delegate_to=DELEGATE_TO,
+        )
+        task.execute(None)
+
+        sftp_hook.return_value.delete_file.assert_has_calls(
+            [
+                mock.call("main_dir/test_object3.json"),
+                mock.call("main_dir/sub_dir/test_object3.json"),
+            ]
+        )
+
+    @mock.patch("airflow.providers.google.cloud.operators.sftp_to_gcs.GoogleCloudStorageHook")
+    @mock.patch("airflow.providers.google.cloud.operators.sftp_to_gcs.SFTPHook")
+    def test_execute_more_than_one_wildcard_exception(self, sftp_hook, gcs_hook):
+        task = SFTPToGoogleCloudStorageOperator(
+            task_id=TASK_ID,
+            source_path=SOURCE_OBJECT_MULTIPLE_WILDCARDS,
+            destination_bucket=TEST_BUCKET,
+            destination_path=DESTINATION_PATH_FILE,
+            move_object=False,
+            gcp_conn_id=GCP_CONN_ID,
+            sftp_conn_id=SFTP_CONN_ID,
+            delegate_to=DELEGATE_TO,
+        )
+        with self.assertRaises(AirflowException) as cm:
+            task.execute(None)
+
+        err = cm.exception
+        self.assertIn("Only one wildcard '*' is allowed in source_path parameter", str(err))
diff --git a/tests/providers/google/cloud/operators/test_sftp_to_gcs_system.py b/tests/providers/google/cloud/operators/test_sftp_to_gcs_system.py
new file mode 100644
index 0000000000000..1d0d2c94e9f60
--- /dev/null
+++ b/tests/providers/google/cloud/operators/test_sftp_to_gcs_system.py
@@ -0,0 +1,50 @@
+# -*- coding: utf-8 -*-
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""System tests for SFTP to Google Cloud Storage transfer operator"""
+
+from tests.gcp.utils.gcp_authenticator import GCP_GCS_KEY
+from tests.providers.google.cloud.operators.test_sftp_to_gcs_system_helper import SFTPtoGcsTestHelper
+from tests.test_utils.gcp_system_helpers import GCP_DAG_FOLDER, provide_gcp_context, skip_gcp_system
+from tests.test_utils.system_tests_class import SystemTest
+
+
+@skip_gcp_system(GCP_GCS_KEY)
+class SFTPToGcsExampleDagsSystemTest(SystemTest):
+    """
+    System tests for the SFTP to Google Cloud Storage transfer operator.
+    It uses a real service.
+    """
+
+    helper = SFTPtoGcsTestHelper()
+
+    @provide_gcp_context(GCP_GCS_KEY)
+    def setUp(self):
+        super().setUp()
+        self.helper.create_buckets()
+        self.helper.create_temp_files()
+
+    @provide_gcp_context(GCP_GCS_KEY)
+    def test_run_example_dag(self):
+        self.run_dag("example_sftp_to_gcs", GCP_DAG_FOLDER)
+
+    @provide_gcp_context(GCP_GCS_KEY)
+    def tearDown(self):
+        self.helper.delete_buckets()
+        self.helper.delete_temp_files()
+        super().tearDown()
diff --git a/tests/providers/google/cloud/operators/test_sftp_to_gcs_system_helper.py b/tests/providers/google/cloud/operators/test_sftp_to_gcs_system_helper.py
new file mode 100644
index 0000000000000..c043644bd1865
--- /dev/null
+++ b/tests/providers/google/cloud/operators/test_sftp_to_gcs_system_helper.py
@@ -0,0 +1,140 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+Helpers to perform system tests for the SFTP to Google Cloud Storage transfer.
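+
+The module can also be run directly to set up or tear down the test fixtures,
+e.g. ``python test_sftp_to_gcs_system_helper.py --action create-buckets``;
+see the argument parser at the bottom of this file for the full list of actions.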
+""" +import argparse +import os +import shutil + +from airflow.providers.google.cloud.example_dags.example_sftp_to_gcs import ( + BUCKET_SRC, DIR, OBJECT_SRC_1, OBJECT_SRC_2, OBJECT_SRC_3, SUBDIR, TMP_PATH, +) +from tests.contrib.utils.logging_command_executor import LoggingCommandExecutor +from tests.gcp.utils.gcp_authenticator import GCP_GCS_KEY, GcpAuthenticator + +GCP_PROJECT_ID = os.environ.get("GCP_PROJECT_ID", "example-project") + + +class SFTPtoGcsTestHelper(LoggingCommandExecutor): + """ + Helper class to perform system tests for the Google Cloud Storage service. + """ + + files_and_dirs = [ + (OBJECT_SRC_1, os.path.join(TMP_PATH, DIR)), + (OBJECT_SRC_2, os.path.join(TMP_PATH, DIR)), + (OBJECT_SRC_3, os.path.join(TMP_PATH, DIR)), + (OBJECT_SRC_1, os.path.join(TMP_PATH, DIR, SUBDIR)), + (OBJECT_SRC_2, os.path.join(TMP_PATH, DIR, SUBDIR)), + (OBJECT_SRC_3, os.path.join(TMP_PATH, DIR, SUBDIR)), + ] + + buckets = [BUCKET_SRC] + + def create_buckets(self): + """Create a bucket in Google Cloud Storage service with sample content.""" + + # 1. Create buckets + for bucket in self.buckets: + self.execute_cmd(["gsutil", "mb", "gs://{}".format(bucket)]) + + def create_temp_files(self): + for filename, dir_path in self.files_and_dirs: + self._create_temp_file(filename, dir_path) + + def delete_temp_files(self): + for filename, dir_path in self.files_and_dirs: + self._delete_temp_file(filename, dir_path) + self._delete_temp_dir(dir_path) + + def delete_buckets(self): + """Delete bucket in Google Cloud Storage service""" + self.execute_cmd(["gsutil", "rm", "gs://{}/**".format(BUCKET_SRC)]) + self.execute_cmd(["gsutil", "rb", "gs://{}".format(BUCKET_SRC)]) + + @staticmethod + def _create_temp_file(filename, dir_path="/tmp"): + os.makedirs(dir_path, exist_ok=True) + + full_path = os.path.join(dir_path, filename) + with open(full_path, "wb") as f: + f.write(os.urandom(1 * 1024 * 1024)) + + @staticmethod + def _delete_temp_file(filename, dir_path): + full_path = os.path.join(dir_path, filename) + try: + os.remove(full_path) + except FileNotFoundError: + pass + if dir_path != "/tmp": + shutil.rmtree(dir_path, ignore_errors=True) + + @staticmethod + def _delete_temp_dir(dir_path): + if dir_path != "/tmp": + shutil.rmtree(dir_path, ignore_errors=True) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Create bucket for system tests.") + parser.add_argument( + "--action", + dest="action", + required=True, + choices=( + "create-buckets", + "delete-buckets", + "before-tests", + "after-tests", + "create-files", + "delete-files", + ), + ) + action = parser.parse_args().action + + helper = SFTPtoGcsTestHelper() + gcp_authenticator = GcpAuthenticator(GCP_GCS_KEY) + helper.log.info("Starting action: %s", action) + + gcp_authenticator.gcp_store_authentication() + try: + gcp_authenticator.gcp_authenticate() + if action == "before-tests": + helper.create_buckets() + helper.create_temp_files() + elif action == "after-tests": + helper.delete_buckets() + helper.delete_temp_files() + elif action == "create-buckets": + helper.create_buckets() + elif action == "delete-buckets": + helper.delete_buckets() + elif action == "create-files": + helper.create_temp_files() + elif action == "delete-files": + helper.delete_temp_files() + else: + raise Exception("Unknown action: {}".format(action)) + finally: + gcp_authenticator.gcp_restore_authentication() + + helper.log.info("Finishing action: %s", action)