|
20 | 20 | import typing
|
21 | 21 | from typing import Collection, Iterable, Literal, Optional, Sequence
|
22 | 22 |
|
| 23 | +import bigframes_vendored.ibis.expr.operations as vendored_ibis_ops |
23 | 24 | import ibis
|
24 | 25 | import ibis.backends.bigquery as ibis_bigquery
|
25 | 26 | import ibis.common.deferred # type: ignore
|
@@ -502,6 +503,51 @@ def _uniform_sampling(self, fraction: float) -> UnorderedIR:
|
502 | 503 | columns=columns,
|
503 | 504 | )
|
504 | 505 |
|
def explode(self, column_ids: typing.Sequence[str]) -> UnorderedIR:
    """Explode the given array columns into one output row per array offset.

    For each input row an array of offsets is generated from the shortest
    of the selected arrays (clamped so the upper bound is never below 0),
    that offset array is unnested, and every column named in
    ``column_ids`` is replaced by its element at the unnested offset. All
    other value columns are carried through unchanged.

    Args:
        column_ids: Ids of the array-typed value columns to explode.

    Returns:
        A new ``UnorderedIR`` exposing the same column names as this one.
    """
    table = self._to_ibis_expr()

    # The offset array ensures null represents empty arrays after unnesting.
    offset_array_id = bigframes.core.guid.generate_guid("offset_array_")
    last_offsets = [table[column_id].length() - 1 for column_id in column_ids]
    # NOTE(review): GenerateArray presumably yields offsets 0..stop inclusive;
    # confirm against the vendored op definition.
    # This is deliberately a single expression, not a 1-tuple (no trailing
    # comma after ``.name(...)``), so ``select`` receives a plain column.
    offset_array = (
        vendored_ibis_ops.GenerateArray(
            ibis.greatest(0, ibis.least(*last_offsets))
        )
        .to_expr()
        .name(offset_array_id)
    )
    table_w_offset_array = table.select(
        offset_array,
        *self._column_names,
    )

    # Unnest the offset array: one row per (input row, offset) pair.
    unnest_offset_id = bigframes.core.guid.generate_guid("unnest_offset_")
    unnest_offset = (
        table_w_offset_array[offset_array_id].unnest().name(unnest_offset_id)
    )
    table_w_offset = table_w_offset_array.select(
        unnest_offset,
        *self._column_names,
    )

    # Build the membership set once instead of scanning the sequence per column.
    exploded_ids = frozenset(column_ids)
    unnested_columns = [
        table_w_offset[column_id][table_w_offset[unnest_offset_id]].name(column_id)
        if column_id in exploded_ids
        else table_w_offset[column_id]
        for column_id in self._column_names
    ]
    # The offset column is not selected here, so it is dropped from the result.
    table_w_unnest = table_w_offset.select(*unnested_columns)

    columns = [table_w_unnest[column_name] for column_name in self._column_names]
    return UnorderedIR(
        table_w_unnest,
        columns=columns,
    )
| 550 | + |
505 | 551 | ## Helpers
|
506 | 552 | def _set_or_replace_by_id(
|
507 | 553 | self, id: str, new_value: ibis_types.Value
|
@@ -719,6 +765,78 @@ def _uniform_sampling(self, fraction: float) -> OrderedIR:
|
719 | 765 | ordering=self._ordering,
|
720 | 766 | )
|
721 | 767 |
|
def explode(self, column_ids: typing.Sequence[str]) -> OrderedIR:
    """Explode the given array columns into one output row per array offset.

    For each input row an array of offsets is generated from the shortest
    of the selected arrays (clamped so the upper bound is never below 0),
    that offset array is unnested, and every column named in
    ``column_ids`` is replaced by its element at the unnested offset. All
    other value columns and the existing hidden ordering columns are
    carried through unchanged.

    The unnested offset is kept as an extra hidden ordering column. It is
    appended, ascending, as the last ordering key and added to the total
    ordering columns.

    Args:
        column_ids: Ids of the array-typed value columns to explode.

    Returns:
        A new ``OrderedIR`` exposing the same value column names as this
        one, with the ordering extended by the unnest offset.
    """
    table = self._to_ibis_expr(ordering_mode="unordered", expose_hidden_cols=True)

    # The offset array ensures null represents empty arrays after unnesting.
    offset_array_id = bigframes.core.guid.generate_guid("offset_array_")
    last_offsets = [table[column_id].length() - 1 for column_id in column_ids]
    # NOTE(review): GenerateArray presumably yields offsets 0..stop inclusive;
    # confirm against the vendored op definition.
    # This is deliberately a single expression, not a 1-tuple (no trailing
    # comma after ``.name(...)``), so ``select`` receives a plain column.
    offset_array = (
        vendored_ibis_ops.GenerateArray(
            ibis.greatest(0, ibis.least(*last_offsets))
        )
        .to_expr()
        .name(offset_array_id)
    )
    table_w_offset_array = table.select(
        offset_array,
        *self._column_names,
        *self._hidden_ordering_column_names,
    )

    # Unnest the offset array: one row per (input row, offset) pair.
    unnest_offset_id = bigframes.core.guid.generate_guid("unnest_offset_")
    unnest_offset = (
        table_w_offset_array[offset_array_id].unnest().name(unnest_offset_id)
    )
    table_w_offset = table_w_offset_array.select(
        unnest_offset,
        *self._column_names,
        *self._hidden_ordering_column_names,
    )

    # Build the membership set once instead of scanning the sequence per column.
    exploded_ids = frozenset(column_ids)
    unnested_columns = [
        table_w_offset[column_id][table_w_offset[unnest_offset_id]].name(column_id)
        if column_id in exploded_ids
        else table_w_offset[column_id]
        for column_id in self._column_names
    ]

    # Unlike the value-only projection, the offset column is retained here
    # because the new ordering refers to it.
    table_w_unnest = table_w_offset.select(
        table_w_offset[unnest_offset_id],
        *unnested_columns,
        *self._hidden_ordering_column_names,
    )

    columns = [table_w_unnest[column_name] for column_name in self._column_names]
    hidden_ordering_columns = [
        *[
            table_w_unnest[column_name]
            for column_name in self._hidden_ordering_column_names
        ],
        table_w_unnest[unnest_offset_id],
    ]
    # Existing ordering keys come first; the offset is the final ordering key
    # among rows produced from the same input row.
    ordering = ExpressionOrdering(
        ordering_value_columns=tuple(
            [
                *self._ordering.ordering_value_columns,
                ascending_over(unnest_offset_id),
            ]
        ),
        total_ordering_columns=frozenset(
            [*self._ordering.total_ordering_columns, unnest_offset_id]
        ),
    )

    return OrderedIR(
        table_w_unnest,
        columns=columns,
        hidden_ordering_columns=hidden_ordering_columns,
        ordering=ordering,
    )
| 839 | + |
722 | 840 | def promote_offsets(self, col_id: str) -> OrderedIR:
|
723 | 841 | """
|
724 | 842 | Convenience function to promote copy of column offsets to a value column. Can be used to reset index.
|
|
0 commit comments