Skip to content

Commit

Permalink
Add method to get metadata from GCS blob in GCSHook (#38398)
Browse files Browse the repository at this point in the history
* Adding get metadata to gcs hook

* unit test

* Spelling and rm fstrings

* test for blob not found

* fix pytest raises, add match regex
  • Loading branch information
jalengg committed May 27, 2024
1 parent a12a4a5 commit 23e03db
Show file tree
Hide file tree
Showing 2 changed files with 47 additions and 0 deletions.
21 changes: 21 additions & 0 deletions airflow/providers/google/cloud/hooks/gcs.py
Original file line number Diff line number Diff line change
Expand Up @@ -1010,6 +1010,27 @@ def get_md5hash(self, bucket_name: str, object_name: str) -> str:
self.log.info("The md5Hash of %s is %s", object_name, blob_md5hash)
return blob_md5hash

def get_metadata(self, bucket_name: str, object_name: str) -> dict | None:
"""
Get the metadata of an object in Google Cloud Storage.
:param bucket_name: Name of the Google Cloud Storage bucket where the object is.
:param object_name: The name of the object containing the desired metadata
:return: The metadata associated with the object
"""
self.log.info("Retrieving the metadata dict of object (%s) in bucket (%s)", object_name, bucket_name)
client = self.get_conn()
bucket = client.bucket(bucket_name)
blob = bucket.get_blob(blob_name=object_name)
if blob is None:
raise ValueError("Object (%s) not found in bucket (%s)", object_name, bucket_name)
blob_metadata = blob.metadata
if blob_metadata:
self.log.info("Retrieved metadata of object (%s) with %s fields", object_name, len(blob_metadata))
else:
self.log.info("Metadata of object (%s) is empty or it does not exist", object_name)
return blob_metadata

@GoogleBaseHook.fallback_to_default_project_id
def create_bucket(
self,
Expand Down
26 changes: 26 additions & 0 deletions tests/providers/google/cloud/hooks/test_gcs.py
Original file line number Diff line number Diff line change
Expand Up @@ -565,6 +565,32 @@ def test_object_get_md5hash(self, mock_service):

assert response == returned_file_md5hash

@mock.patch(GCS_STRING.format("GCSHook.get_conn"))
def test_object_get_metadata(self, mock_service):
    """Check that ``get_metadata`` hands back the metadata dict stored on the blob."""
    expected_metadata = {"test_metadata_key": "test_metadata_val"}

    # Wire the mocked connection so client.bucket(...).get_blob(...) yields a
    # blob whose ``metadata`` attribute is the dict we expect to get back.
    fake_blob = mock_service.return_value.bucket.return_value.get_blob.return_value
    fake_blob.metadata = expected_metadata

    actual = self.gcs_hook.get_metadata(bucket_name="test_bucket", object_name="test_object")

    assert actual == expected_metadata

@mock.patch(GCS_STRING.format("GCSHook.get_conn"))
def test_nonexisting_object_get_metadata(self, mock_service):
    """Check that ``get_metadata`` raises ``ValueError`` when ``get_blob`` returns ``None``."""
    # A ``None`` blob is how the mocked bucket signals that the object is missing.
    mock_service.return_value.bucket.return_value.get_blob.return_value = None
    expected_error = r"Object \((.*?)\) not found in bucket \((.*?)\)"

    with pytest.raises(ValueError, match=expected_error):
        self.gcs_hook.get_metadata(bucket_name="test_bucket", object_name="test_object")

@mock.patch("google.cloud.storage.Bucket")
@mock.patch(GCS_STRING.format("GCSHook.get_conn"))
def test_create_bucket(self, mock_service, mock_bucket):
Expand Down

0 comments on commit 23e03db

Please sign in to comment.