Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Add MultiIndex subclass. #596

Merged
merged 8 commits into from
Apr 10, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions bigframes/core/indexes/__init__.py
Original file line number Diff line number Diff line change
@@ -13,7 +13,9 @@
# limitations under the License.

from bigframes.core.indexes.base import Index
from bigframes.core.indexes.multi import MultiIndex

# Names exported by this package.
__all__ = ["Index", "MultiIndex"]
57 changes: 28 additions & 29 deletions bigframes/core/indexes/base.py
Original file line number Diff line number Diff line change
@@ -42,9 +42,15 @@

class Index(vendored_pandas_index.Index):
__doc__ = vendored_pandas_index.Index.__doc__

def __init__(
self,
_query_job = None
_block: blocks.Block
_linked_frame: Union[
bigframes.dataframe.DataFrame, bigframes.series.Series, None
] = None

# __new__ is overridden so that subclasses can be returned, as pandas does
def __new__(
cls,
data=None,
dtype=None,
*,
@@ -73,18 +79,30 @@ def __init__(
if dtype is not None:
index = index.astype(dtype)
block = index._block
elif isinstance(data, pandas.Index):
pd_df = pandas.DataFrame(index=data)
block = df.DataFrame(pd_df, session=session)._block
else:
pd_index = pandas.Index(data=data, dtype=dtype, name=name)
pd_df = pandas.DataFrame(index=pd_index)
block = df.DataFrame(pd_df, session=session)._block
self._query_job = None
self._block: blocks.Block = block

# TODO: Support more index subtypes
from bigframes.core.indexes.multi import MultiIndex

klass = MultiIndex if len(block._index_columns) > 1 else cls
result = typing.cast(Index, object.__new__(klass))
result._query_job = None
result._block = block
return result

@classmethod
def from_frame(
    cls, frame: Union[bigframes.series.Series, bigframes.dataframe.DataFrame]
) -> Index:
    """Create an index over ``frame``'s block that stays linked to ``frame``.

    Args:
        frame: The Series or DataFrame whose ``_block`` backs the new index.

    Returns:
        Index: An index built from ``frame._block`` with ``_linked_frame``
        set to ``frame``.
    """
    # Go through Index(...) rather than a dedicated FrameIndex subclass; the
    # link back to the frame is carried by the ``_linked_frame`` attribute,
    # which the ``names`` setter consults to relabel the frame as well.
    index = Index(frame._block)
    index._linked_frame = frame
    return index

@property
def name(self) -> blocks.Label:
@@ -107,6 +125,10 @@ def names(self) -> typing.Sequence[blocks.Label]:
@names.setter
def names(self, values: typing.Sequence[blocks.Label]):
    # Relabel this index's own block first; it is only stored at the very end.
    relabeled = self._block.with_index_labels(values)
    frame = self._linked_frame
    if frame is not None:
        # Keep the frame this index was taken from in sync with the new labels.
        frame._set_block(frame._block.with_index_labels(values))
    self._block = relabeled

@property
@@ -452,26 +474,3 @@ def to_numpy(self, dtype=None, **kwargs) -> np.ndarray:

def __len__(self):
    """Return the length of the index, i.e. the first entry of ``self.shape``."""
    return self.shape[0]


# Index that mutates the originating dataframe/series
class FrameIndex(Index):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This will be a breaking change that we may not want to introduce right now. @tswast

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, I am deleting a class, but it was more of an implementation detail of the Index class. This should only affect users if they were somehow depending on the exact type or type name in their code, which would be very strange.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good catch @GarrettWu.

Given that the only method that would return such an object is annotated `-> Index:` and we don't explicitly document `FrameIndex`, I'm okay overlooking this and treating `FrameIndex` as a private implementation detail.

def __init__(
    self,
    series_or_dataframe: typing.Union[
        bigframes.series.Series, bigframes.dataframe.DataFrame
    ],
):
    """Build the index from the frame's block and remember the frame itself.

    Args:
        series_or_dataframe: The Series or DataFrame whose ``_block`` is
            passed to the parent initializer; it is kept on
            ``_whole_frame`` for later use by the ``names`` setter.
    """
    source_block = series_or_dataframe._block
    super().__init__(source_block)
    self._whole_frame = series_or_dataframe

@property
def names(self) -> typing.Sequence[blocks.Label]:
    """Return the index labels stored on this index's block (``_index_labels``)."""
    return self._block._index_labels

@names.setter
def names(self, values: typing.Sequence[blocks.Label]):
    # Relabel the originating frame's block, write it back to the frame, and
    # adopt that same block here so the index and the frame stay in agreement.
    owner = self._whole_frame
    relabeled = owner._get_block().with_index_labels(values)
    owner._set_block(relabeled)
    self._block = relabeled
48 changes: 48 additions & 0 deletions bigframes/core/indexes/multi.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import annotations

from typing import cast, Hashable, Iterable, Sequence

import bigframes_vendored.pandas.core.indexes.multi as vendored_pandas_multindex
import pandas

from bigframes.core.indexes.base import Index


class MultiIndex(Index, vendored_pandas_multindex.MultiIndex):
    __doc__ = vendored_pandas_multindex.MultiIndex.__doc__

    @classmethod
    def from_tuples(
        cls,
        tuples: Iterable[tuple[Hashable, ...]],
        sortorder: int | None = None,
        names: Sequence[Hashable] | Hashable | None = None,
    ) -> MultiIndex:
        # Build the index locally with pandas, then hand it to Index(...);
        # Index.__new__ is expected to detect the multiple levels and
        # instantiate a MultiIndex.
        local_index = pandas.MultiIndex.from_tuples(
            tuples, sortorder=sortorder, names=names
        )
        return cast(MultiIndex, Index(local_index))

    @classmethod
    def from_arrays(
        cls,
        arrays,
        sortorder: int | None = None,
        names=None,
    ) -> MultiIndex:
        # Same route as from_tuples: pandas does the construction and
        # Index.__new__ is expected to pick the MultiIndex subclass.
        local_index = pandas.MultiIndex.from_arrays(
            arrays, sortorder=sortorder, names=names
        )
        return cast(MultiIndex, Index(local_index))
2 changes: 2 additions & 0 deletions bigframes/pandas/__init__.py
Original file line number Diff line number Diff line change
@@ -707,6 +707,7 @@ def to_datetime(
# checking and docstrings.
DataFrame = bigframes.dataframe.DataFrame
Index = bigframes.core.indexes.Index
MultiIndex = bigframes.core.indexes.MultiIndex
Series = bigframes.series.Series

# Other public pandas attributes
@@ -760,6 +761,7 @@ def to_datetime(
# Class aliases
"DataFrame",
"Index",
"MultiIndex",
"Series",
# Other public pandas attributes
"NamedAgg",
25 changes: 25 additions & 0 deletions tests/system/small/test_multiindex.py
Original file line number Diff line number Diff line change
@@ -20,6 +20,31 @@
from tests.system.utils import assert_pandas_df_equal, skip_legacy_pandas


def test_multi_index_from_arrays():
    """bpd.MultiIndex.from_arrays must agree with pandas on names and values."""
    level_names = (" 1index 1", "_1index 2")

    def build_levels():
        # Fresh inputs on every call so the two constructors never share objects.
        return [
            pandas.Index([4, 99], dtype=pandas.Int64Dtype()),
            pandas.Index(
                [" Hello, World!", "_some_new_string"],
                dtype=pandas.StringDtype(storage="pyarrow"),
            ),
        ]

    bf_idx = bpd.MultiIndex.from_arrays(build_levels(), names=list(level_names))
    pd_idx = pandas.MultiIndex.from_arrays(build_levels(), names=list(level_names))

    assert bf_idx.names == pd_idx.names
    pandas.testing.assert_index_equal(bf_idx.to_pandas(), pd_idx)


@skip_legacy_pandas
def test_read_pandas_multi_index_axes():
index = pandas.MultiIndex.from_arrays(
88 changes: 88 additions & 0 deletions third_party/bigframes_vendored/pandas/core/indexes/multi.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
# Contains code from https://github.com/pandas-dev/pandas/blob/main/pandas/core/indexes/multi.py
from __future__ import annotations

from typing import Hashable, Iterable, Sequence

import bigframes_vendored.pandas.core.indexes.base

from bigframes import constants


class MultiIndex(bigframes_vendored.pandas.core.indexes.base.Index):
    """
    A multi-level, or hierarchical, index object for pandas objects.
    """

    @classmethod
    def from_tuples(
        cls,
        tuples: Iterable[tuple[Hashable, ...]],
        sortorder: int | None = None,
        names: Sequence[Hashable] | Hashable | None = None,
    ) -> MultiIndex:
        """
        Convert list of tuples to MultiIndex.

        **Examples:**

        >>> import bigframes.pandas as bpd
        >>> bpd.options.display.progress_bar = None
        >>> tuples = [(1, 'red'), (1, 'blue'),
        ...           (2, 'red'), (2, 'blue')]
        >>> bpd.MultiIndex.from_tuples(tuples, names=('number', 'color'))
        MultiIndex([(1, 'red'),
                    (1, 'blue'),
                    (2, 'red'),
                    (2, 'blue')],
                   names=['number', 'color'])

        Args:
            tuples (list / sequence of tuple-likes):
                Each tuple is the index of one row/column.
            sortorder (int or None):
                Level of sortedness (must be lexicographically sorted by that
                level).
            names (list / sequence of str, optional):
                Names for the levels in the index.

        Returns:
            MultiIndex
        """
        # Documentation-only stub: this vendored class supplies the signature
        # and docstring, and always raises here.
        raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)

    @classmethod
    def from_arrays(
        cls,
        arrays,
        sortorder: int | None = None,
        names=None,
    ) -> MultiIndex:
        """
        Convert arrays to MultiIndex.

        **Examples:**

        >>> import bigframes.pandas as bpd
        >>> bpd.options.display.progress_bar = None
        >>> arrays = [[1, 1, 2, 2], ['red', 'blue', 'red', 'blue']]
        >>> bpd.MultiIndex.from_arrays(arrays, names=('number', 'color'))
        MultiIndex([(1, 'red'),
                    (1, 'blue'),
                    (2, 'red'),
                    (2, 'blue')],
                   names=['number', 'color'])

        Args:
            arrays (list / sequence of array-likes):
                Each array-like gives one level's value for each data point.
                len(arrays) is the number of levels.
            sortorder (int or None):
                Level of sortedness (must be lexicographically sorted by that
                level).
            names (list / sequence of str, optional):
                Names for the levels in the index.

        Returns:
            MultiIndex
        """
        # Documentation-only stub: this vendored class supplies the signature
        # and docstring, and always raises here.
        raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)