Skip to content

Commit

Permalink
Fix GCSToGCSOperator copy without wildcard and exact_match=True (#32376)
Browse files Browse the repository at this point in the history
  • Loading branch information
moiseenkov committed Jul 6, 2023
1 parent 2b841f7 commit e4757d6
Show file tree
Hide file tree
Showing 2 changed files with 46 additions and 1 deletion.
10 changes: 9 additions & 1 deletion airflow/providers/google/cloud/transfers/gcs_to_gcs.py
Original file line number Diff line number Diff line change
Expand Up @@ -370,6 +370,8 @@ def _copy_source_without_wildcard(self, hook, prefix):
self.source_bucket, prefix=prefix, delimiter=self.delimiter, match_glob=self.match_glob
)

objects = [obj for obj in objects if self._check_exact_match(obj, prefix)]

if not self.replace:
# If we are not replacing, ignore files already existing in source buckets
objects = self._ignore_existing_files(
Expand Down Expand Up @@ -405,7 +407,7 @@ def _copy_file(self, hook, source_object):
def _copy_directory(self, hook, source_objects, prefix):
_prefix = prefix.rstrip("/") + "/"
for source_obj in source_objects:
if self.exact_match and (source_obj != prefix or not source_obj.endswith(prefix)):
if not self._check_exact_match(source_obj, prefix):
continue
if self.destination_object is None:
destination_object = source_obj
Expand All @@ -417,6 +419,12 @@ def _copy_directory(self, hook, source_objects, prefix):
hook=hook, source_object=source_obj, destination_object=destination_object
)

def _check_exact_match(self, source_object: str, prefix: str) -> bool:
"""Checks whether source_object's name matches the prefix according to the exact_match flag."""
if self.exact_match and (source_object != prefix or not source_object.endswith(prefix)):
return False
return True

def _copy_source_with_wildcard(self, hook, prefix):
total_wildcards = prefix.count(WILDCARD)
if total_wildcards > 1:
Expand Down
37 changes: 37 additions & 0 deletions tests/providers/google/cloud/transfers/test_gcs_to_gcs.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,7 @@ def test_execute_no_wildcard_with_replace_flag_false(self, mock_hook):
def test_copy_file_with_exact_match(self, mock_hook):
SOURCE_FILES = [
"test_object.txt",
"test_object.txt.abc",
"test_object.txt.copy/",
"test_object.txt.folder/",
]
Expand All @@ -145,6 +146,42 @@ def test_copy_file_with_exact_match(self, mock_hook):
mock.call(TEST_BUCKET, prefix="test_object.txt", delimiter=None, match_glob=None),
]
mock_hook.return_value.list.assert_has_calls(mock_calls)
mock_hook.return_value.rewrite.assert_has_calls(
[
mock.call(TEST_BUCKET, "test_object.txt", DESTINATION_BUCKET, "test_object.txt"),
]
)

@mock.patch("airflow.providers.google.cloud.transfers.gcs_to_gcs.GCSHook")
def test_copy_file_with_exact_match_destination(self, mock_hook):
SOURCE_FILES = [
"test_object.txt",
"test_object.txt.abc",
"test_object.txt.copy/",
"test_object.txt.folder/",
]
DESTINATION_OBJ = f"{DESTINATION_OBJECT_PREFIX}/test_object.txt"

mock_hook.return_value.list.return_value = SOURCE_FILES
operator = GCSToGCSOperator(
task_id=TASK_ID,
source_bucket=TEST_BUCKET,
source_object=SOURCE_OBJECT_NO_WILDCARD,
destination_bucket=DESTINATION_BUCKET,
destination_object=DESTINATION_OBJ,
exact_match=True,
)

operator.execute(None)
mock_calls = [
mock.call(TEST_BUCKET, prefix="test_object.txt", delimiter=None, match_glob=None),
]
mock_hook.return_value.list.assert_has_calls(mock_calls)

mock_calls_rewrite = [
mock.call(TEST_BUCKET, "test_object.txt", DESTINATION_BUCKET, DESTINATION_OBJ),
]
mock_hook.return_value.rewrite.assert_has_calls(mock_calls_rewrite)

@mock.patch("airflow.providers.google.cloud.transfers.gcs_to_gcs.GCSHook")
def test_execute_prefix_and_suffix(self, mock_hook):
Expand Down

0 comments on commit e4757d6

Please sign in to comment.