Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit 034f71f

Browse files
Genesis929gcf-owl-bot[bot]tswast
authoredDec 13, 2023
feat: Add filters argument to read_gbq for enhanced data querying (#198)
* feat: Add filters argument to read_gbq for enhanced data querying * feat: Add filters argument to read_gbq for enhanced data querying * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md * feat: Add filters and columns arguments to read_gbq for enhanced data querying See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md * feat: Add filters and columns arguments to read_gbq for enhanced data querying See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md * feat: Add filters and columns arguments to read_gbq for enhanced data querying See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md * feat: Add filters and columns arguments to read_gbq for enhanced data querying See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md * feat: Add filters and columns arguments to read_gbq for enhanced data querying See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md * feat: Add filters and columns arguments to read_gbq for enhanced data querying See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md * feat: Add filters and columns arguments to read_gbq for enhanced data querying See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md * feat: Add filters and columns arguments to read_gbq for enhanced data querying See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md * feat: Add filters and columns arguments to read_gbq for enhanced data querying See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md * 🦉 Updates from OwlBot post-processor See 
https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md * feat: Add filters and columns arguments to read_gbq for enhanced data querying See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md * update docstring * remove columns input * make filter_to_query run only when there are filters * remove named input --------- Co-authored-by: Owl Bot <gcf-owl-bot[bot]@users.noreply.github.com> Co-authored-by: Tim Swast <swast@google.com>
1 parent 319a1f2 commit 034f71f

File tree

4 files changed

+165
-1
lines changed

4 files changed

+165
-1
lines changed
 

‎bigframes/pandas/__init__.py

+3
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,7 @@
5959
import third_party.bigframes_vendored.pandas.core.reshape.encoding as vendored_pandas_encoding
6060
import third_party.bigframes_vendored.pandas.core.reshape.merge as vendored_pandas_merge
6161
import third_party.bigframes_vendored.pandas.core.reshape.tile as vendored_pandas_tile
62+
import third_party.bigframes_vendored.pandas.io.gbq as vendored_pandas_gbq
6263

6364

6465
# Include method definition so that the method appears in our docs for
@@ -486,6 +487,7 @@ def read_gbq(
486487
index_col: Iterable[str] | str = (),
487488
col_order: Iterable[str] = (),
488489
max_results: Optional[int] = None,
490+
filters: vendored_pandas_gbq.FiltersType = (),
489491
use_cache: bool = True,
490492
) -> bigframes.dataframe.DataFrame:
491493
_set_default_session_location_if_possible(query_or_table)
@@ -495,6 +497,7 @@ def read_gbq(
495497
index_col=index_col,
496498
col_order=col_order,
497499
max_results=max_results,
500+
filters=filters,
498501
use_cache=use_cache,
499502
)
500503

‎bigframes/session/__init__.py

+77
Original file line numberDiff line numberDiff line change
@@ -233,10 +233,13 @@ def read_gbq(
233233
index_col: Iterable[str] | str = (),
234234
col_order: Iterable[str] = (),
235235
max_results: Optional[int] = None,
236+
filters: third_party_pandas_gbq.FiltersType = (),
236237
use_cache: bool = True,
237238
# Add a verify index argument that fails if the index is not unique.
238239
) -> dataframe.DataFrame:
239240
# TODO(b/281571214): Generate prompt to show the progress of read_gbq.
241+
query_or_table = self._filters_to_query(query_or_table, col_order, filters)
242+
240243
if _is_query(query_or_table):
241244
return self._read_gbq_query(
242245
query_or_table,
@@ -259,6 +262,80 @@ def read_gbq(
259262
use_cache=use_cache,
260263
)
261264

265+
def _filters_to_query(self, query_or_table, columns, filters):
    """Wrap a table or query in a ``SELECT`` statement that applies ``filters``.

    Args:
        query_or_table (str):
            A table ID or a SQL query. A query (as detected by ``_is_query``)
            is parenthesized and used as a subquery; anything else is placed
            in the ``FROM`` clause verbatim.
        columns (Iterable[str]):
            Column names to select. When empty, ``*`` is selected.
        filters (Iterable[Union[Tuple, Iterable[Tuple]]]):
            Either a flat iterable of ``(column, operator, value)`` tuples,
            which are joined with ``AND``, or an iterable of such groups,
            which are joined with ``OR``.

    Returns:
        str: ``query_or_table`` unchanged when ``filters`` is empty, otherwise
        the generated SQL text.

    Raises:
        ValueError: If a group is not iterable or is empty, a condition is not
            a 3-tuple, a column name is not a string, an operator is not
            supported, or the value given to ``in`` / ``not in`` is a bare
            string, not iterable, or empty.
    """
    # Materialize once so that any iterable (not only a sequence) can be
    # checked for emptiness and indexed below.
    filters = list(filters)
    if not filters:
        return query_or_table

    valid_operators = {
        "in": "IN",
        "not in": "NOT IN",
        "==": "=",
        ">": ">",
        "<": "<",
        ">=": ">=",
        "<=": "<=",
        "!=": "!=",
    }

    # A flat sequence of condition tuples is shorthand for one AND group.
    first = filters[0]
    if isinstance(first, tuple) and (not first or not isinstance(first[0], tuple)):
        filters = [filters]

    or_expressions = []
    for group in filters:
        if not isinstance(group, Iterable):
            raise ValueError(
                f"Filter group should be a iterable, {group} is not valid."
            )

        and_expressions = []
        for filter_item in group:
            if not isinstance(filter_item, tuple) or len(filter_item) != 3:
                raise ValueError(
                    f"Filter condition should be a tuple of length 3, {filter_item} is not valid."
                )

            column, operator, value = filter_item

            if not isinstance(column, str):
                raise ValueError(
                    f"Column name should be a string, but received '{column}' of type {type(column).__name__}."
                )

            # The isinstance guard keeps unhashable operators (e.g. a list)
            # on the ValueError path instead of failing the dict lookup.
            if not isinstance(operator, str) or operator not in valid_operators:
                raise ValueError(f"Operator {operator} is not valid.")

            sql_operator = valid_operators[operator]

            # NOTE(review): values are rendered with repr(), i.e. as Python
            # literals. Objects such as None or datetimes would presumably
            # not produce valid SQL this way -- confirm what callers pass.
            if sql_operator in ("IN", "NOT IN"):
                # A bare string is iterable, but iterating it would compare
                # the column against each individual character.
                if isinstance(value, (str, bytes)) or not isinstance(value, Iterable):
                    raise ValueError(
                        f"Value for operator {operator} should be an iterable of values, but received {value!r}."
                    )
                value_list = ", ".join(repr(v) for v in value)
                if not value_list:
                    # "IN ()" is not valid SQL.
                    raise ValueError(
                        f"Value for operator {operator} should contain at least one element."
                    )
                expression = f"`{column}` {sql_operator} ({value_list})"
            else:
                expression = f"`{column}` {sql_operator} {repr(value)}"
            and_expressions.append(expression)

        if not and_expressions:
            # An empty group would leave a dangling "WHERE " or " OR ".
            raise ValueError(
                f"Filter group should contain at least one condition, {group} is not valid."
            )

        or_expressions.append(" AND ".join(and_expressions))

    sub_query = f"({query_or_table})" if _is_query(query_or_table) else query_or_table

    select_clause = "SELECT " + (
        ", ".join(f"`{name}`" for name in columns) if columns else "*"
    )

    # AND binds tighter than OR in SQL, so the groups need no parentheses.
    where_clause = " WHERE " + " OR ".join(or_expressions)

    return f"{select_clause} FROM {sub_query} AS sub{where_clause}"
338+
262339
def _query_to_destination(
263340
self,
264341
query: str,

‎tests/unit/session/test_session.py

+57
Original file line numberDiff line numberDiff line change
@@ -57,3 +57,60 @@ def test_session_init_fails_with_no_project():
5757
credentials=mock.Mock(spec=google.auth.credentials.Credentials)
5858
)
5959
)
60+
61+
62+
@pytest.mark.parametrize(
    "query_or_table, columns, filters, expected_output",
    [
        # A SQL query is parenthesized and filtered as a subquery.
        pytest.param(
            """SELECT
rowindex,
string_col,
FROM `test_table` AS t
""",
            [],
            [("rowindex", "<", 4), ("string_col", "==", "Hello, World!")],
            """SELECT * FROM (SELECT
rowindex,
string_col,
FROM `test_table` AS t
) AS sub WHERE `rowindex` < 4 AND `string_col` = 'Hello, World!'""",
            id="query_input",
        ),
        # A table ID is placed in the FROM clause as-is.
        pytest.param(
            "test_table",
            [],
            [("date_col", ">", "2022-10-20")],
            "SELECT * FROM test_table AS sub WHERE `date_col` > '2022-10-20'",
            id="table_input",
        ),
        # Separate groups are OR-ed together; explicit columns replace "*".
        pytest.param(
            "test_table",
            ["row_index", "string_col"],
            [
                (("rowindex", "not in", [0, 6]),),
                (("string_col", "in", ["Hello, World!", "こんにちは"]),),
            ],
            (
                "SELECT `row_index`, `string_col` FROM test_table AS sub WHERE "
                "`rowindex` NOT IN (0, 6) OR `string_col` IN ('Hello, World!', "
                "'こんにちは')"
            ),
            id="or_operation",
        ),
        # A bare list of strings is not a valid filter and must raise.
        pytest.param(
            "test_table",
            [],
            ["date_col", ">", "2022-10-20"],
            None,
            marks=pytest.mark.xfail(
                raises=ValueError,
            ),
            id="raise_error",
        ),
    ],
)
def test_read_gbq_with_filters(query_or_table, columns, filters, expected_output):
    """Check the SQL text produced by ``Session._filters_to_query``."""
    bq_session = resources.create_bigquery_session()
    generated_sql = bq_session._filters_to_query(query_or_table, columns, filters)
    assert generated_sql == expected_output

‎third_party/bigframes_vendored/pandas/io/gbq.py

+28-1
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,13 @@
33

44
from __future__ import annotations
55

6-
from typing import Iterable, Optional
6+
from typing import Any, Iterable, Literal, Optional, Tuple, Union
77

88
from bigframes import constants
99

10+
# Operators accepted in the middle position of a filter condition.
_FilterOperator = Literal["in", "not in", "<", "<=", "==", "!=", ">=", ">"]

# A single condition: (column name, operator, value to compare against).
FilterType = Tuple[str, _FilterOperator, Any]

# Either a flat iterable of conditions or an iterable of groups of conditions.
FiltersType = Iterable[Union[FilterType, Iterable[FilterType]]]
12+
1013

1114
class GBQIOMixin:
1215
def read_gbq(
@@ -16,6 +19,7 @@ def read_gbq(
1619
index_col: Iterable[str] | str = (),
1720
col_order: Iterable[str] = (),
1821
max_results: Optional[int] = None,
22+
filters: FiltersType = (),
1923
use_cache: bool = True,
2024
):
2125
"""Loads a DataFrame from BigQuery.
@@ -71,6 +75,21 @@ def read_gbq(
7175
<BLANKLINE>
7276
[2 rows x 3 columns]
7377
78+
Reading data with `col_order` and `filters` parameters:
79+
80+
>>> col_order = ['pitcherFirstName', 'pitcherLastName', 'year', 'pitchSpeed']
81+
>>> filters = [('year', '==', 2016), ('pitcherFirstName', 'in', ['John', 'Doe']), ('pitcherLastName', 'in', ['Gant'])]
82+
>>> df = bpd.read_gbq(
83+
... "bigquery-public-data.baseball.games_wide",
84+
... col_order=col_order,
85+
... filters=filters,
86+
... )
87+
>>> df.head(1)
88+
pitcherFirstName pitcherLastName year pitchSpeed
89+
0 John Gant 2016 82
90+
<BLANKLINE>
91+
[1 rows x 4 columns]
92+
7493
Args:
7594
query_or_table (str):
7695
A SQL string to be executed or a BigQuery table to be read. The
@@ -84,6 +103,14 @@ def read_gbq(
84103
max_results (Optional[int], default None):
85104
If set, limit the maximum number of rows to fetch from the
86105
query results.
106+
filters (Iterable[Union[Tuple, Iterable[Tuple]]], default ()): To
107+
select matching rows. Filter syntax: [[(column, op, val), …],…] where
108+
op is [==, >, >=, <, <=, !=, in, not in]. The innermost tuples
109+
are transposed into a set of filters applied through an AND
110+
operation. The outer Iterable combines these sets of filters
111+
through an OR operation. A single Iterable of tuples can also
112+
be used, meaning that no OR operation between sets of filters
113+
is to be conducted.
87114
use_cache (bool, default True):
88115
Whether to cache the query inputs. Default to True.
89116

0 commit comments

Comments
 (0)
Failed to load comments.