Add multiple file upload functionality to GCS hook (#8849)
Co-authored-by: Timothy Healy <[email protected]>
timhealz and Timothy Healy committed Jul 13, 2020
1 parent 383b676 commit 6892590
Showing 2 changed files with 120 additions and 19 deletions.
43 changes: 30 additions & 13 deletions airflow/providers/google/cloud/transfers/local_to_gcs.py
@@ -16,9 +16,11 @@
 # specific language governing permissions and limitations
 # under the License.
 """
-This module contains operator for uploading local file to GCS.
+This module contains operator for uploading local file(s) to GCS.
 """
+import os
 import warnings
+from glob import glob

 from airflow.models import BaseOperator
 from airflow.providers.google.cloud.hooks.gcs import GCSHook
@@ -27,16 +29,19 @@

 class LocalFilesystemToGCSOperator(BaseOperator):
     """
-    Uploads a file to Google Cloud Storage.
+    Uploads a file or list of files to Google Cloud Storage.
     Optionally can compress the file for upload.

     .. seealso::
         For more information on how to use this operator, take a look at the guide:
         :ref:`howto/operator:LocalFilesystemToGCSOperator`

-    :param src: Path to the local file. (templated)
-    :type src: str
-    :param dst: The object name to set when uploading the file. (templated)
+    :param src: Path to the local file, or list of local files. Path can be either absolute
+        (e.g. /path/to/file.ext) or relative (e.g. ../../foo/*/*.csv). (templated)
+    :type src: str or list
+    :param dst: Destination path within the specified bucket on GCS (e.g. /path/to/file.ext).
+        If multiple files are being uploaded, specify the object prefix with a trailing slash
+        (e.g. /path/to/directory/). (templated)
     :type dst: str
     :param bucket: The bucket to upload to. (templated)
     :type bucket: str
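For context, a minimal usage sketch based on the docstring above; the DAG id, task ids, bucket name, and paths are all hypothetical:

```python
from airflow import DAG
from airflow.providers.google.cloud.transfers.local_to_gcs import LocalFilesystemToGCSOperator
from airflow.utils.dates import days_ago

with DAG("example_local_to_gcs", start_date=days_ago(1), schedule_interval=None) as dag:
    # Single file: dst names the target object directly.
    upload_one = LocalFilesystemToGCSOperator(
        task_id="upload_one",
        src="/tmp/report.csv",
        dst="reports/report.csv",
        bucket="my-bucket",
    )

    # Multiple files via wildcard: dst must be an object prefix with a trailing slash.
    upload_many = LocalFilesystemToGCSOperator(
        task_id="upload_many",
        src="/tmp/reports/*.csv",
        dst="reports/",
        bucket="my-bucket",
    )
```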
@@ -84,16 +89,28 @@ def __init__(self,

     def execute(self, context):
         """
-        Uploads the file to Google Cloud Storage
+        Uploads a file or list of files to Google Cloud Storage
         """
         hook = GCSHook(
             google_cloud_storage_conn_id=self.gcp_conn_id,
             delegate_to=self.delegate_to)

-        hook.upload(
-            bucket_name=self.bucket,
-            object_name=self.dst,
-            mime_type=self.mime_type,
-            filename=self.src,
-            gzip=self.gzip,
-        )
+        filepaths = self.src if isinstance(self.src, list) else glob(self.src)
+        if os.path.basename(self.dst):  # path to a file
+            if len(filepaths) > 1:  # multiple file upload
+                raise ValueError("'dst' parameter references filepath. Please specify "
+                                 "a directory (with trailing slash) to upload multiple "
+                                 "files, e.g. /path/to/directory/")
+            object_paths = [self.dst]
+        else:  # directory is provided
+            object_paths = [os.path.join(self.dst, os.path.basename(filepath))
+                            for filepath in filepaths]
+
+        for filepath, object_path in zip(filepaths, object_paths):
+            hook.upload(
+                bucket_name=self.bucket,
+                object_name=object_path,
+                mime_type=self.mime_type,
+                filename=filepath,
+                gzip=self.gzip,
+            )
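A standalone sketch of the resolution rules the new execute() applies, with hypothetical inputs, may make the file-versus-prefix distinction easier to see:

```python
import os
from glob import glob


def resolve_uploads(src, dst):
    """Mirror the operator's dispatch: src is a list of paths or a glob
    pattern; dst is a single object name or a prefix ending in '/'."""
    filepaths = src if isinstance(src, list) else glob(src)
    if os.path.basename(dst):  # dst names a single object
        if len(filepaths) > 1:
            raise ValueError("dst must be a prefix with a trailing slash "
                             "when uploading multiple files")
        return list(zip(filepaths, [dst]))
    # dst is a prefix: keep each source file's basename under it
    return [(fp, os.path.join(dst, os.path.basename(fp))) for fp in filepaths]


print(resolve_uploads(["/tmp/fake1.csv"], "test/test1.csv"))
# [('/tmp/fake1.csv', 'test/test1.csv')]
print(resolve_uploads(["/tmp/fake1.csv", "/tmp/fake2.csv"], "test/"))
# [('/tmp/fake1.csv', 'test/fake1.csv'), ('/tmp/fake2.csv', 'test/fake2.csv')]
```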
96 changes: 90 additions & 6 deletions tests/providers/google/cloud/transfers/test_local_to_gcs.py
@@ -18,9 +18,12 @@
 #

 import datetime
+import os
 import unittest
+from glob import glob

 import mock
+import pytest

 from airflow.models.dag import DAG
 from airflow.providers.google.cloud.transfers.local_to_gcs import LocalFilesystemToGCSOperator
@@ -29,8 +32,6 @@

 class TestFileToGcsOperator(unittest.TestCase):

     _config = {
-        'src': '/tmp/fake.csv',
-        'dst': 'fake.csv',
         'bucket': 'dummy',
         'mime_type': 'application/octet-stream',
         'gzip': False
@@ -42,15 +43,28 @@ def setUp(self):
             'start_date': datetime.datetime(2017, 1, 1)
         }
         self.dag = DAG('test_dag_id', default_args=args)
+        self.testfile1 = '/tmp/fake1.csv'
+        with open(self.testfile1, 'wb') as f:
+            f.write(b"x" * 393216)
+        self.testfile2 = '/tmp/fake2.csv'
+        with open(self.testfile2, 'wb') as f:
+            f.write(b"x" * 393216)
+        self.testfiles = [self.testfile1, self.testfile2]
+
+    def tearDown(self):
+        os.remove(self.testfile1)
+        os.remove(self.testfile2)

     def test_init(self):
         operator = LocalFilesystemToGCSOperator(
             task_id='file_to_gcs_operator',
             dag=self.dag,
+            src=self.testfile1,
+            dst='test/test1.csv',
             **self._config
         )
-        self.assertEqual(operator.src, self._config['src'])
-        self.assertEqual(operator.dst, self._config['dst'])
+        self.assertEqual(operator.src, self.testfile1)
+        self.assertEqual(operator.dst, 'test/test1.csv')
         self.assertEqual(operator.bucket, self._config['bucket'])
         self.assertEqual(operator.mime_type, self._config['mime_type'])
         self.assertEqual(operator.gzip, self._config['gzip'])
@@ -62,13 +76,83 @@ def test_execute(self, mock_hook):
         operator = LocalFilesystemToGCSOperator(
             task_id='gcs_to_file_sensor',
             dag=self.dag,
+            src=self.testfile1,
+            dst='test/test1.csv',
             **self._config
         )
         operator.execute(None)
         mock_instance.upload.assert_called_once_with(
             bucket_name=self._config['bucket'],
-            filename=self._config['src'],
+            filename=self.testfile1,
             gzip=self._config['gzip'],
             mime_type=self._config['mime_type'],
-            object_name=self._config['dst']
+            object_name='test/test1.csv'
         )

+    @mock.patch('airflow.providers.google.cloud.transfers.local_to_gcs.GCSHook',
+                autospec=True)
+    def test_execute_multiple(self, mock_hook):
+        mock_instance = mock_hook.return_value
+        operator = LocalFilesystemToGCSOperator(
+            task_id='gcs_to_file_sensor',
+            dag=self.dag,
+            src=self.testfiles,
+            dst='test/',
+            **self._config
+        )
+        operator.execute(None)
+        files_objects = zip(self.testfiles, ['test/' + os.path.basename(testfile)
+                                             for testfile in self.testfiles])
+        calls = [
+            mock.call(
+                bucket_name=self._config['bucket'],
+                filename=filepath,
+                gzip=self._config['gzip'],
+                mime_type=self._config['mime_type'],
+                object_name=object_name
+            )
+            for filepath, object_name in files_objects
+        ]
+        mock_instance.upload.assert_has_calls(calls)

+    @mock.patch('airflow.providers.google.cloud.transfers.local_to_gcs.GCSHook',
+                autospec=True)
+    def test_execute_wildcard(self, mock_hook):
+        mock_instance = mock_hook.return_value
+        operator = LocalFilesystemToGCSOperator(
+            task_id='gcs_to_file_sensor',
+            dag=self.dag,
+            src='/tmp/fake*.csv',
+            dst='test/',
+            **self._config
+        )
+        operator.execute(None)
+        object_names = ['test/' + os.path.basename(fp) for fp in glob('/tmp/fake*.csv')]
+        files_objects = zip(glob('/tmp/fake*.csv'), object_names)
+        calls = [
+            mock.call(
+                bucket_name=self._config['bucket'],
+                filename=filepath,
+                gzip=self._config['gzip'],
+                mime_type=self._config['mime_type'],
+                object_name=object_name
+            )
+            for filepath, object_name in files_objects
+        ]
+        mock_instance.upload.assert_has_calls(calls)

+    @mock.patch('airflow.providers.google.cloud.transfers.local_to_gcs.GCSHook',
+                autospec=True)
+    def test_execute_negative(self, mock_hook):
+        mock_instance = mock_hook.return_value
+        operator = LocalFilesystemToGCSOperator(
+            task_id='gcs_to_file_sensor',
+            dag=self.dag,
+            src='/tmp/fake*.csv',
+            dst='test/test1.csv',
+            **self._config
+        )
+        with pytest.raises(ValueError):
+            operator.execute(None)
+        mock_instance.upload.assert_not_called()
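The negative test above exercises the new guard in execute(): multiple source files combined with a file-like dst raise before any upload happens. From user code the failure mode looks roughly like this (a sketch; the task id, bucket, and paths are hypothetical):

```python
import pytest

from airflow.providers.google.cloud.transfers.local_to_gcs import LocalFilesystemToGCSOperator

operator = LocalFilesystemToGCSOperator(
    task_id="bad_upload",
    src=["/tmp/a.csv", "/tmp/b.csv"],  # two source files...
    dst="data/a.csv",                  # ...but dst names a single object
    bucket="my-bucket",
)
with pytest.raises(ValueError):
    operator.execute(None)  # raises before any hook.upload() call
```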
