
Commit 0433a1c
Authored Jun 19, 2024

feat: bigframes.streaming module for continuous queries (#703)

* feat: bigframes.streaming module for continuous queries
* mypy fix
* mypy fix 2
* mypy fix 3
* ignore mypy, error is in bq library
* address comments from meeting
* fix mypy
* don't use app profile
* address comments
* check job_id
* add bigtable setup script
* further simplify string
* fix bugs
* add str() for consistent clarification

1 parent 0afbcec commit 0433a1c

File tree

5 files changed: +265 -0 lines changed
 

bigframes/streaming/__init__.py (+139 lines)
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Module for BigQuery continuous queries."""

import json
from typing import Optional

from google.cloud import bigquery

import bigframes


def to_bigtable(
    query: str,
    instance: str,
    table: str,
    bq_client: Optional[bigquery.Client] = None,
    app_profile: Optional[str] = None,
    truncate: bool = False,
    overwrite: bool = False,
    auto_create_column_families: bool = False,
    bigtable_options: Optional[dict] = None,
    job_id: Optional[str] = None,
    job_id_prefix: Optional[str] = None,
) -> bigquery.QueryJob:
    """Launches a BigQuery continuous query and returns a
    QueryJob object for some management functionality.

    This method requires an existing Bigtable table preconfigured to
    accept the continuous query export statement. For instructions
    on exporting to Bigtable, see
    https://cloud.google.com/bigquery/docs/export-to-bigtable.

    Args:
        query (str):
            The SQL statement to execute as a continuous query,
            for example: "SELECT * FROM dataset.table". This will
            be wrapped in an EXPORT DATA statement to launch a
            continuous query writing to Bigtable.
        instance (str):
            The name of the Bigtable instance to export to.
        table (str):
            The name of the Bigtable table to export to.
        bq_client (bigquery.Client, default None):
            The Client object to use for the query. This determines
            the project id and location of the query. If None, will
            default to the bigframes global session default client.
        app_profile (str, default None):
            The Bigtable app profile to export to. If None, no app
            profile will be used.
        truncate (bool, default False):
            The export truncate option, see
            https://cloud.google.com/bigquery/docs/reference/standard-sql/other-statements#bigtable_export_option
        overwrite (bool, default False):
            The export overwrite option, see
            https://cloud.google.com/bigquery/docs/reference/standard-sql/other-statements#bigtable_export_option
        auto_create_column_families (bool, default False):
            The auto_create_column_families option, see
            https://cloud.google.com/bigquery/docs/reference/standard-sql/other-statements#bigtable_export_option
        bigtable_options (dict, default None):
            The bigtable options dict, which will be converted to JSON
            using json.dumps, see
            https://cloud.google.com/bigquery/docs/reference/standard-sql/other-statements#bigtable_export_option
            If None, no bigtable_options parameter will be passed.
        job_id (str, default None):
            If specified, replaces the default job id for the query;
            see the job_id parameter of
            https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.client.Client#google_cloud_bigquery_client_Client_query
        job_id_prefix (str, default None):
            If specified, a job id prefix for the query; see the
            job_id_prefix parameter of
            https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.client.Client#google_cloud_bigquery_client_Client_query

    Returns:
        google.cloud.bigquery.QueryJob:
            See https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.job.QueryJob
            The ongoing query job can be managed using this object.
            For example, the job can be cancelled or its error status
            can be examined.
    """
    # get default client if not passed
    if bq_client is None:
        bq_client = bigframes.get_global_session().bqclient

    # build export string from parameters
    project = bq_client.project

    app_profile_url_string = ""
    if app_profile is not None:
        app_profile_url_string = f"appProfiles/{app_profile}/"

    bigtable_options_parameter_string = ""
    if bigtable_options is not None:
        bigtable_options_parameter_string = (
            'bigtable_options = """' + json.dumps(bigtable_options) + '""",\n'
        )

    sql = (
        "EXPORT DATA\n"
        "OPTIONS (\n"
        "format = 'CLOUD_BIGTABLE',\n"
        f"{bigtable_options_parameter_string}"
        f"truncate = {str(truncate)},\n"
        f"overwrite = {str(overwrite)},\n"
        f"auto_create_column_families = {str(auto_create_column_families)},\n"
        f'uri = "https://bigtable.googleapis.com/projects/{project}/instances/{instance}/{app_profile_url_string}tables/{table}"\n'
        ")\n"
        "AS (\n"
        f"{query});"
    )

    # override continuous http parameter
    job_config = bigquery.job.QueryJobConfig()
    job_config_filled = job_config.from_api_repr({"query": {"continuous": True}})

    # begin the query job
    query_job = bq_client.query(
        sql,
        job_config=job_config_filled,  # type: ignore
        # typing error above is in bq client library
        # (should accept abstract job_config, only takes concrete)
        job_id=job_id,
        job_id_prefix=job_id_prefix,
    )

    # return the query job to the user for lifetime management
    return query_job
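
For orientation, here is a minimal usage sketch of the new function. The source table and the instance/table names mirror the system test below and are illustrative, not required names; the commented EXPORT DATA shape is what the string-building code above assembles:

# Minimal usage sketch, assuming a preconfigured Bigtable instance/table
# (names below mirror the system test and are illustrative).
import bigframes.streaming

query_job = bigframes.streaming.to_bigtable(
    "SELECT body_mass_g, island AS rowkey FROM birds.penguins",
    "streaming-testing-instance",
    "table-testing",
    truncate=True,
    overwrite=True,
    auto_create_column_families=True,
    bigtable_options={},
    job_id_prefix="demo_streaming_",
)

# The call wraps the query roughly like:
#   EXPORT DATA OPTIONS (
#     format = 'CLOUD_BIGTABLE',
#     bigtable_options = """{}""",
#     truncate = True, overwrite = True, auto_create_column_families = True,
#     uri = "https://bigtable.googleapis.com/projects/<project>/instances/streaming-testing-instance/tables/table-testing"
#   ) AS (SELECT body_mass_g, island AS rowkey FROM birds.penguins);

# The returned QueryJob manages the job's lifetime: inspect errors,
# check running state, or cancel the continuous query.
print(query_job.job_id, query_job.running())
query_job.cancel()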

scripts/create_bigtable.py (+76 lines)
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# This script creates the Bigtable resources required for
# bigframes.streaming testing if they don't already exist.

import os
import pathlib
import sys

import google.cloud.bigtable as bigtable

REPO_ROOT = pathlib.Path(__file__).parent.parent

PROJECT_ID = os.getenv("GOOGLE_CLOUD_PROJECT")

if not PROJECT_ID:
    print(
        "Please set GOOGLE_CLOUD_PROJECT environment variable before running.",
        file=sys.stderr,
    )
    sys.exit(1)


def create_instance(client):
    instance_name = "streaming-testing-instance"
    instance = bigtable.instance.Instance(
        instance_name,
        client,
    )
    cluster_id = "streaming-testing-instance-c1"
    cluster = instance.cluster(
        cluster_id,
        location_id="us-west1-a",
        serve_nodes=1,
    )
    if not instance.exists():
        operation = instance.create(
            clusters=[cluster],
        )
        operation.result(timeout=480)
        print(f"Created instance {instance_name}")
    return instance


def create_table(instance):
    table_id = "table-testing"
    table = bigtable.table.Table(
        table_id,
        instance,
    )
    if not table.exists():
        table.create()
        print(f"Created table {table_id}")


def main():
    client = bigtable.Client(project=PROJECT_ID, admin=True)

    instance = create_instance(client)
    create_table(instance)


if __name__ == "__main__":
    main()
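
As a quick sanity check after running the script, a sketch like the following (assuming application-default credentials; "my-project" is a placeholder project id) can confirm the resources exist:

# Sketch: verify the test resources created above exist.
# "my-project" is a placeholder; admin=True is needed for existence checks.
import google.cloud.bigtable as bigtable

client = bigtable.Client(project="my-project", admin=True)
instance = client.instance("streaming-testing-instance")
print("instance exists:", instance.exists())
print("table exists:", instance.table("table-testing").exists())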

setup.py (+1 line)

@@ -39,6 +39,7 @@
     "gcsfs >=2023.3.0",
     "geopandas >=0.12.2",
     "google-auth >=2.15.0,<3.0dev",
+    "google-cloud-bigtable >=2.24.0",
     "google-cloud-bigquery[bqstorage,pandas] >=3.16.0",
     "google-cloud-functions >=1.12.0",
     "google-cloud-bigquery-connection >=1.12.0",

testing/constraints-3.9.txt (+1 line)

@@ -4,6 +4,7 @@ fsspec==2023.3.0
 gcsfs==2023.3.0
 geopandas==0.12.2
 google-auth==2.15.0
+google-cloud-bigtable==2.24.0
 google-cloud-bigquery==3.16.0
 google-cloud-functions==1.12.0
 google-cloud-bigquery-connection==1.12.0

tests/system/large/test_streaming.py (+48 lines)
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import time

import bigframes.streaming


def test_streaming_to_bigtable():
    # launch a continuous query
    job_id_prefix = "test_streaming_"
    sql = """SELECT
    body_mass_g, island as rowkey
    FROM birds.penguins"""
    query_job = bigframes.streaming.to_bigtable(
        sql,
        "streaming-testing-instance",
        "table-testing",
        app_profile=None,
        truncate=True,
        overwrite=True,
        auto_create_column_families=True,
        bigtable_options={},
        job_id=None,
        job_id_prefix=job_id_prefix,
    )

    try:
        # wait 100 seconds in order to ensure the query doesn't stop
        # (i.e. it is continuous)
        time.sleep(100)
        assert query_job.error_result is None
        assert query_job.errors is None
        assert query_job.running()
        assert str(query_job.job_id).startswith(job_id_prefix)
    finally:
        query_job.cancel()
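
One variation worth noting: instead of a single fixed sleep, the soak window could periodically refresh the job and assert it stays healthy throughout. A sketch using the same query_job as the test above:

# Sketch: poll the job during the soak window rather than one long sleep.
# reload() refreshes the job state from the BigQuery API.
import time

for _ in range(10):
    time.sleep(10)
    query_job.reload()
    assert query_job.error_result is None
assert query_job.running()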
