Skip to content
Permalink

Comparing changes

Choose two branches to see what’s changed or to start a new pull request. If you need to, you can also or learn more about diff comparisons.

Open a pull request

Create a new pull request by comparing changes across two branches. If you need to, you can also . Learn more about diff comparisons here.
base repository: googleapis/python-bigquery-dataframes
Failed to load repositories. Confirm that selected base ref is valid, then try again.
Loading
base: main
Choose a base ref
...
head repository: googleapis/python-bigquery-dataframes
Failed to load repositories. Confirm that selected head ref is valid, then try again.
Loading
compare: direct_table_scan
Choose a head ref
Can’t automatically merge. Don’t worry, you can still create the pull request.
  • 1 commit
  • 9 files changed
  • 1 contributor

Commits on Apr 9, 2025

  1. Copy the full SHA
    3a9dc4b View commit details
46 changes: 45 additions & 1 deletion bigframes/core/nodes.py
Original file line number Diff line number Diff line change
@@ -20,7 +20,16 @@
import functools
import itertools
import typing
from typing import Callable, cast, Iterable, Mapping, Optional, Sequence, Tuple
from typing import (
AbstractSet,
Callable,
cast,
Iterable,
Mapping,
Optional,
Sequence,
Tuple,
)

import google.cloud.bigquery as bq

@@ -574,6 +583,31 @@ def with_id(self, id: identifiers.ColumnId) -> ScanItem:
class ScanList:
items: typing.Tuple[ScanItem, ...]

def filter(
self,
ids: AbstractSet[identifiers.ColumnId],
) -> ScanList:
result = ScanList(tuple(item for item in self.items if item.id in ids))
if len(result.items) == 0:
# We need to select something, or stuff breaks
result = ScanList(self.items[:1])
return result

def select(
self,
selections: Mapping[identifiers.ColumnId, identifiers.ColumnId],
) -> ScanList:
by_id = {item.id: item for item in self.items}
result = ScanList(
tuple(
by_id[old_id].with_id(new_id) for old_id, new_id in selections.items()
)
)
if len(result.items) == 0:
# We need to select something, or stuff breaks
result = ScanList((self.items[:1]))
return result


@dataclasses.dataclass(frozen=True, eq=False)
class ReadLocalNode(LeafNode):
@@ -677,6 +711,11 @@ def from_table(table: bq.Table, columns: Sequence[str] = ()) -> GbqTable:
else tuple(table.clustering_fields),
)

def get_table_ref(self) -> bq.TableReference:
return bq.TableReference(
bq.DatasetReference(self.project_id, self.dataset_id), self.table_id
)

@property
@functools.cache
def schema_by_id(self):
@@ -1069,6 +1108,11 @@ def variables_introduced(self) -> int:
# This operation only renames variables, doesn't actually create new ones
return 0

@property
def double_references_ids(self) -> bool:
referenced = tuple(ref.ref.id for ref in self.input_output_pairs)
return len(referenced) != len(set(referenced))

# TODO: Reuse parent namespace
# Currently, Selection node allows renaming an reusing existing names, so it must establish a
# new namespace.
2 changes: 2 additions & 0 deletions bigframes/core/rewrite/__init__.py
Original file line number Diff line number Diff line change
@@ -17,6 +17,7 @@
from bigframes.core.rewrite.legacy_align import legacy_join_as_projection
from bigframes.core.rewrite.order import pull_up_order
from bigframes.core.rewrite.pruning import column_pruning
from bigframes.core.rewrite.scan_reduction import try_reduce_to_table_scan
from bigframes.core.rewrite.slices import pullup_limit_from_slice, rewrite_slice
from bigframes.core.rewrite.timedeltas import rewrite_timedelta_expressions
from bigframes.core.rewrite.windows import rewrite_range_rolling
@@ -31,4 +32,5 @@
"pull_up_order",
"column_pruning",
"rewrite_range_rolling",
"try_reduce_to_table_scan",
]
17 changes: 2 additions & 15 deletions bigframes/core/rewrite/pruning.py
Original file line number Diff line number Diff line change
@@ -170,7 +170,7 @@ def prune_readlocal(
node: bigframes.core.nodes.ReadLocalNode,
selection: AbstractSet[identifiers.ColumnId],
) -> bigframes.core.nodes.ReadLocalNode:
new_scan_list = filter_scanlist(node.scan_list, selection)
new_scan_list = node.scan_list.filter(selection)
return dataclasses.replace(
node,
scan_list=new_scan_list,
@@ -183,18 +183,5 @@ def prune_readtable(
node: bigframes.core.nodes.ReadTableNode,
selection: AbstractSet[identifiers.ColumnId],
) -> bigframes.core.nodes.ReadTableNode:
new_scan_list = filter_scanlist(node.scan_list, selection)
new_scan_list = node.scan_list.filter(selection)
return dataclasses.replace(node, scan_list=new_scan_list)


def filter_scanlist(
scanlist: bigframes.core.nodes.ScanList,
ids: AbstractSet[identifiers.ColumnId],
):
result = bigframes.core.nodes.ScanList(
tuple(item for item in scanlist.items if item.id in ids)
)
if len(result.items) == 0:
# We need to select something, or stuff breaks
result = bigframes.core.nodes.ScanList(scanlist.items[:1])
return result
51 changes: 51 additions & 0 deletions bigframes/core/rewrite/scan_reduction.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import dataclasses
import functools
from typing import Optional

from bigframes.core import nodes


def try_reduce_to_table_scan(node: nodes.BigFrameNode) -> Optional[nodes.ReadTableNode]:
if not all(
map(
lambda x: isinstance(x, (nodes.ReadTableNode, nodes.SelectionNode)),
node.unique_nodes(),
)
):
return None
result = node.bottom_up(merge_scan)
if isinstance(result, nodes.ReadTableNode):
return result
return None


@functools.singledispatch
def merge_scan(node: nodes.BigFrameNode) -> nodes.BigFrameNode:
return node


@merge_scan.register
def _(node: nodes.SelectionNode) -> nodes.BigFrameNode:
if not isinstance(node.child, nodes.ReadTableNode):
return node
if node.double_references_ids:
return node

selection = {
aliased_ref.ref.id: aliased_ref.id for aliased_ref in node.input_output_pairs
}
new_scan_list = node.child.scan_list.select(selection)
return dataclasses.replace(node.child, scan_list=new_scan_list)
17 changes: 7 additions & 10 deletions bigframes/session/__init__.py
Original file line number Diff line number Diff line change
@@ -70,11 +70,10 @@
import bigframes.dtypes
import bigframes.functions._function_session as bff_session
import bigframes.functions.function as bff
from bigframes.session import bigquery_session
from bigframes.session import bigquery_session, bq_caching_executor, executor
import bigframes.session._io.bigquery as bf_io_bigquery
import bigframes.session.anonymous_dataset
import bigframes.session.clients
import bigframes.session.executor
import bigframes.session.loader
import bigframes.session.metrics
import bigframes.session.planner
@@ -252,14 +251,12 @@ def __init__(
self._temp_storage_manager = (
self._session_resource_manager or self._anon_dataset_manager
)
self._executor: bigframes.session.executor.Executor = (
bigframes.session.executor.BigQueryCachingExecutor(
bqclient=self._clients_provider.bqclient,
bqstoragereadclient=self._clients_provider.bqstoragereadclient,
storage_manager=self._temp_storage_manager,
strictly_ordered=self._strictly_ordered,
metrics=self._metrics,
)
self._executor: executor.Executor = bq_caching_executor.BigQueryCachingExecutor(
bqclient=self._clients_provider.bqclient,
bqstoragereadclient=self._clients_provider.bqstoragereadclient,
storage_manager=self._temp_storage_manager,
strictly_ordered=self._strictly_ordered,
metrics=self._metrics,
)
self._loader = bigframes.session.loader.GbqDataLoader(
session=self,
Loading