Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit 9e32f57

Browse files
Authored Apr 4, 2024
feat: (Series|DataFrame).explode (#556)
* feat: (Series|DataFrame).explode
* fixing schema and adding tests
* fixing multi-index tests
* add docs and fix tests
1 parent 659a161 commit 9e32f57

File tree

17 files changed

+523
-6
lines changed

17 files changed

+523
-6
lines changed
 

‎bigframes/core/__init__.py

+9
Original file line numberDiff line numberDiff line change
@@ -401,6 +401,15 @@ def join(
401401
return ArrayValue(bigframes.core.rewrite.maybe_rewrite_join(join_node))
402402
return ArrayValue(join_node)
403403

404+
def explode(self, column_ids: typing.Sequence[str]) -> ArrayValue:
    """Build an ExplodeNode that unnests the given array-like columns.

    Args:
        column_ids: Non-empty sequence of ids of array-like value columns.

    Returns:
        A new ArrayValue whose root is an ExplodeNode over this node.

    Raises:
        ValueError: If no column ids are given, or a column is not array-like.
    """
    # Raise explicitly instead of using `assert`: asserts are stripped
    # under `python -O`, which would silently allow invalid ExplodeNodes.
    if not column_ids:
        raise ValueError("explode requires at least one column id")
    for column_id in column_ids:
        if not bigframes.dtypes.is_array_like(self.get_column_type(column_id)):
            raise ValueError(f"Column {column_id} is not array-like")

    return ArrayValue(
        nodes.ExplodeNode(child=self.node, column_ids=tuple(column_ids))
    )
412+
404413
def _uniform_sampling(self, fraction: float) -> ArrayValue:
405414
"""Sampling the table on given fraction.
406415

‎bigframes/core/blocks.py

+30
Original file line numberDiff line numberDiff line change
@@ -1162,6 +1162,36 @@ def calculate_pairwise_metric(self, op=agg_ops.CorrOp()):
11621162
index_labels=self.column_labels.names,
11631163
)
11641164

1165+
def explode(
    self,
    column_ids: typing.Sequence[str],
    ignore_index: Optional[bool],
) -> Block:
    """Explode (unnest) the given array-like value columns of this block.

    Non-array-like ids are silently dropped from the request; if none
    remain, the underlying expression is returned unchanged.
    """
    # Keep only ids whose dtype is actually array-like; others cannot be
    # exploded and are ignored rather than raising.
    column_ids = [
        column_id
        for column_id in column_ids
        if bigframes.dtypes.is_array_like(self.expr.get_column_type(column_id))
    ]
    if len(column_ids) == 0:
        expr = self.expr
    else:
        expr = self.expr.explode(column_ids)

    if ignore_index:
        return Block(
            expr.drop_columns(self.index_columns),
            column_labels=self.column_labels,
            # Initiates default index creation using the block constructor.
            index_columns=[],
        )
    else:
        return Block(
            expr,
            column_labels=self.column_labels,
            index_columns=self.index_columns,
            # NOTE(review): this passes the *column*-axis level names as the
            # index labels; presumably the index's own names were intended
            # (e.g. self.index.names) — TODO confirm against multi-index tests.
            index_labels=self.column_labels.names,
        )
1194+
11651195
def _standard_stats(self, column_id) -> typing.Sequence[agg_ops.UnaryAggregateOp]:
11661196
"""
11671197
Gets a standard set of stats to preemptively fetch for a column if

‎bigframes/core/compile/compiled.py

+118
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
import typing
2121
from typing import Collection, Iterable, Literal, Optional, Sequence
2222

23+
import bigframes_vendored.ibis.expr.operations as vendored_ibis_ops
2324
import ibis
2425
import ibis.backends.bigquery as ibis_bigquery
2526
import ibis.common.deferred # type: ignore
@@ -502,6 +503,51 @@ def _uniform_sampling(self, fraction: float) -> UnorderedIR:
502503
columns=columns,
503504
)
504505

506+
def explode(self, column_ids: typing.Sequence[str]) -> UnorderedIR:
    """Unnest the given array columns, producing one row per array offset.

    All listed columns are exploded in lockstep (pandas-style, not a cross
    product): a single offset array is generated, unnested, and used to
    index each exploded column.
    """
    table = self._to_ibis_expr()

    # The offset array ensures null represents empty arrays after unnesting.
    offset_array_id = bigframes.core.guid.generate_guid("offset_array_")
    # Fix: the original bound a 1-tuple here via a stray trailing comma; it
    # only worked because ibis select() flattens sequences. Bind the
    # expression itself instead.
    offset_array = (
        vendored_ibis_ops.GenerateArray(
            ibis.greatest(
                0,
                ibis.least(
                    *[table[column_id].length() - 1 for column_id in column_ids]
                ),
            )
        )
        .to_expr()
        .name(offset_array_id)
    )
    table_w_offset_array = table.select(
        offset_array,
        *self._column_names,
    )

    unnest_offset_id = bigframes.core.guid.generate_guid("unnest_offset_")
    unnest_offset = (
        table_w_offset_array[offset_array_id].unnest().name(unnest_offset_id)
    )
    table_w_offset = table_w_offset_array.select(
        unnest_offset,
        *self._column_names,
    )

    # Index each exploded column by the unnested offset; pass the other
    # columns through unchanged.
    unnested_columns = [
        table_w_offset[column_id][table_w_offset[unnest_offset_id]].name(column_id)
        if column_id in column_ids
        else table_w_offset[column_id]
        for column_id in self._column_names
    ]
    table_w_unnest = table_w_offset.select(*unnested_columns)

    columns = [table_w_unnest[column_name] for column_name in self._column_names]
    return UnorderedIR(
        table_w_unnest,
        columns=columns,
    )
550+
505551
## Helpers
506552
def _set_or_replace_by_id(
507553
self, id: str, new_value: ibis_types.Value
@@ -719,6 +765,78 @@ def _uniform_sampling(self, fraction: float) -> OrderedIR:
719765
ordering=self._ordering,
720766
)
721767

768+
def explode(self, column_ids: typing.Sequence[str]) -> OrderedIR:
    """Unnest the given array columns while preserving a total ordering.

    Mirrors UnorderedIR.explode, but also threads the hidden ordering
    columns through every projection and appends the unnest offset to the
    ordering so exploded rows keep a deterministic order.
    """
    table = self._to_ibis_expr(ordering_mode="unordered", expose_hidden_cols=True)

    # The offset array ensures null represents empty arrays after unnesting.
    offset_array_id = bigframes.core.guid.generate_guid("offset_array_")
    # Fix: the original bound a 1-tuple here via a stray trailing comma; it
    # only worked because ibis select() flattens sequences. Bind the
    # expression itself instead.
    offset_array = (
        vendored_ibis_ops.GenerateArray(
            ibis.greatest(
                0,
                ibis.least(
                    *[table[column_id].length() - 1 for column_id in column_ids]
                ),
            )
        )
        .to_expr()
        .name(offset_array_id)
    )
    table_w_offset_array = table.select(
        offset_array,
        *self._column_names,
        *self._hidden_ordering_column_names,
    )

    unnest_offset_id = bigframes.core.guid.generate_guid("unnest_offset_")
    unnest_offset = (
        table_w_offset_array[offset_array_id].unnest().name(unnest_offset_id)
    )
    table_w_offset = table_w_offset_array.select(
        unnest_offset,
        *self._column_names,
        *self._hidden_ordering_column_names,
    )

    # Index each exploded column by the unnested offset; pass the other
    # columns through unchanged.
    unnested_columns = [
        table_w_offset[column_id][table_w_offset[unnest_offset_id]].name(column_id)
        if column_id in column_ids
        else table_w_offset[column_id]
        for column_id in self._column_names
    ]

    table_w_unnest = table_w_offset.select(
        table_w_offset[unnest_offset_id],
        *unnested_columns,
        *self._hidden_ordering_column_names,
    )

    columns = [table_w_unnest[column_name] for column_name in self._column_names]
    # The unnest offset becomes a hidden ordering column so sibling elements
    # of the same source row stay in array order.
    hidden_ordering_columns = [
        *[
            table_w_unnest[column_name]
            for column_name in self._hidden_ordering_column_names
        ],
        table_w_unnest[unnest_offset_id],
    ]
    ordering = ExpressionOrdering(
        ordering_value_columns=tuple(
            [
                *self._ordering.ordering_value_columns,
                ascending_over(unnest_offset_id),
            ]
        ),
        total_ordering_columns=frozenset(
            [*self._ordering.total_ordering_columns, unnest_offset_id]
        ),
    )

    return OrderedIR(
        table_w_unnest,
        columns=columns,
        hidden_ordering_columns=hidden_ordering_columns,
        ordering=ordering,
    )
839+
722840
def promote_offsets(self, col_id: str) -> OrderedIR:
723841
"""
724842
Convenience function to promote copy of column offsets to a value column. Can be used to reset index.

‎bigframes/core/compile/compiler.py

+5
Original file line numberDiff line numberDiff line change
@@ -191,6 +191,11 @@ def compile_unpivot(node: nodes.UnpivotNode, ordered: bool = True):
191191
)
192192

193193

194+
@_compile_node.register
def compiler_explode(node: nodes.ExplodeNode, ordered: bool = True):
    # Compile the child (in the requested ordering mode), then delegate the
    # unnesting to the compiled IR's own explode implementation.
    compiled_child = compile_node(node.child, ordered)
    return compiled_child.explode(node.column_ids)
197+
198+
194199
@_compile_node.register
195200
def compiler_random_sample(node: nodes.RandomSampleNode, ordered: bool = True):
196201
return compile_node(node.child, ordered)._uniform_sampling(node.fraction)

‎bigframes/core/nodes.py

+27
Original file line numberDiff line numberDiff line change
@@ -484,3 +484,30 @@ def row_preserving(self) -> bool:
484484

485485
def __hash__(self):
486486
return self._node_hash
487+
488+
489+
@dataclass(frozen=True)
class ExplodeNode(UnaryNode):
    """Plan node that unnests the listed array columns of its child."""

    # Ids of the array-like columns to explode, in lockstep.
    column_ids: typing.Tuple[str, ...]

    @property
    def row_preserving(self) -> bool:
        # Unnesting changes the row count (one row per array element).
        return False

    def __hash__(self):
        return self._node_hash

    @functools.cached_property
    def schema(self) -> schemata.ArraySchema:
        """Child schema with each exploded column demoted to its element dtype."""

        def resolve_item(name: str) -> schemata.SchemaItem:
            if name in self.column_ids:
                # Exploded column: its dtype becomes the list's value type.
                element_dtype = bigframes.dtypes.arrow_dtype_to_bigframes_dtype(
                    self.child.schema.get_type(name).pyarrow_dtype.value_type
                )
                return schemata.SchemaItem(name, element_dtype)
            return schemata.SchemaItem(name, self.child.schema.get_type(name))

        return schemata.ArraySchema(
            tuple(resolve_item(name) for name in self.child.schema.names)
        )

‎bigframes/dataframe.py

+30
Original file line numberDiff line numberDiff line change
@@ -2579,6 +2579,36 @@ def sample(
25792579
)[0]
25802580
)
25812581

2582+
def explode(
    self,
    column: typing.Union[blocks.Label, typing.Sequence[blocks.Label]],
    *,
    ignore_index: Optional[bool] = False,
) -> DataFrame:
    """Transform each element of the given array column(s) into a row.

    Args:
        column: A single column label or a sequence of labels to explode.
        ignore_index: If True, the result uses a fresh default index.

    Raises:
        ValueError: If no columns are given, or labels are duplicated.
        KeyError: If any label does not resolve to an existing column.
    """
    # Normalize the scalar-label form to a tuple of labels.
    if not utils.is_list_like(column):
        column_labels = typing.cast(typing.Sequence[blocks.Label], (column,))
    else:
        column_labels = typing.cast(typing.Sequence[blocks.Label], tuple(column))

    if not column_labels:
        raise ValueError("column must be nonempty")
    if len(column_labels) > len(set(column_labels)):
        raise ValueError("column must be unique")

    column_ids = [self._resolve_label_exact(label) for label in column_labels]
    # Pair labels with their resolved ids directly instead of indexing by
    # position (idiomatic zip over range(len(...))).
    missing = [
        label
        for label, column_id in zip(column_labels, column_ids)
        if column_id is None
    ]
    if missing:
        raise KeyError(f"None of {missing} are in the columns")

    return DataFrame(
        self._block.explode(
            column_ids=typing.cast(typing.Sequence[str], tuple(column_ids)),
            ignore_index=ignore_index,
        )
    )
2611+
25822612
def _split(
25832613
self,
25842614
ns: Iterable[int] = (),

‎bigframes/dtypes.py

+11-4
Original file line numberDiff line numberDiff line change
@@ -129,16 +129,19 @@ def is_string_like(type: ExpressionType) -> bool:
129129

130130

131131
def is_array_like(type: ExpressionType) -> bool:
    """Return True iff *type* is an Arrow-backed list dtype."""
    # Guard clause: only pandas ArrowDtype instances can carry a pyarrow
    # list type; everything else (including None) is not array-like.
    if not isinstance(type, pd.ArrowDtype):
        return False
    return isinstance(type.pyarrow_dtype, pa.ListType)
136135

137136

138137
def is_numeric(type: ExpressionType) -> bool:
139138
return type in NUMERIC_BIGFRAMES_TYPES_PERMISSIVE
140139

141140

141+
def is_iterable(type: ExpressionType) -> bool:
    """Return True for dtypes with element access: string, bytes, or array."""
    if is_array_like(type):
        return True
    return type in (STRING_DTYPE, BYTES_DTYPE)
143+
144+
142145
def is_comparable(type: ExpressionType) -> bool:
143146
return (type is not None) and (type not in UNORDERED_DTYPES)
144147

@@ -348,6 +351,10 @@ def arrow_dtype_to_ibis_dtype(arrow_dtype: pa.DataType) -> ibis_dtypes.DataType:
348351
)
349352

350353

354+
def arrow_dtype_to_bigframes_dtype(arrow_dtype: pa.DataType) -> Dtype:
    """Convert a pyarrow type to a bigframes dtype, going through ibis."""
    # Reuse the existing arrow->ibis and ibis->bigframes mappings rather
    # than maintaining a third, direct mapping.
    ibis_dtype = arrow_dtype_to_ibis_dtype(arrow_dtype)
    return ibis_dtype_to_bigframes_dtype(ibis_dtype)
356+
357+
351358
def bigframes_dtype_to_ibis_dtype(
352359
bigframes_dtype: Union[DtypeString, Dtype, np.dtype[Any]]
353360
) -> ibis_dtypes.DataType:

‎bigframes/operations/__init__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -212,7 +212,7 @@ def create_binary_op(
212212
# `len` accepts any iterable dtype (string, bytes, or array-like — see
# dtypes.is_iterable) and always produces an integer length.
len_op = create_unary_op(
    name="len",
    type_signature=op_typing.FixedOutputType(
        dtypes.is_iterable, dtypes.INT_DTYPE, description="iterable"
    ),
)
218218
reverse_op = create_unary_op(name="reverse", type_signature=op_typing.STRING_TRANSFORM)

‎bigframes/series.py

+7
Original file line numberDiff line numberDiff line change
@@ -1547,6 +1547,13 @@ def sample(
15471547
)[0]
15481548
)
15491549

1550+
def explode(self, *, ignore_index: Optional[bool] = False) -> Series:
    """Transform each element of this array series into its own row.

    Args:
        ignore_index: If True, the result uses a fresh default index.
    """
    # Delegate to the block-level explode on this series' single value column.
    exploded_block = self._block.explode(
        column_ids=[self._value_column], ignore_index=ignore_index
    )
    return Series(exploded_block)
1556+
15501557
def __array_ufunc__(
15511558
self, ufunc: numpy.ufunc, method: str, *inputs, **kwargs
15521559
) -> Series:
There was a problem loading the remainder of the diff.

0 commit comments

Comments
 (0)
Failed to load comments.