Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Add more index methods #54

Merged
merged 2 commits into from
Sep 26, 2023
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 7 additions & 3 deletions bigframes/core/block_transforms.py
Original file line number Diff line number Diff line change
@@ -218,13 +218,17 @@ def rank(
return block.select_columns(rownum_col_ids).with_column_labels(labels)


def dropna(block: blocks.Block, how: typing.Literal["all", "any"] = "any"):
def dropna(
block: blocks.Block,
column_ids: typing.Sequence[str],
how: typing.Literal["all", "any"] = "any",
):
"""
Drop na entries from block
"""
if how == "any":
filtered_block = block
for column in block.value_columns:
for column in column_ids:
filtered_block, result_id = filtered_block.apply_unary_op(
column, ops.notnull_op
)
@@ -234,7 +238,7 @@ def dropna(block: blocks.Block, how: typing.Literal["all", "any"] = "any"):
else: # "all"
filtered_block = block
predicate = None
for column in block.value_columns:
for column in column_ids:
filtered_block, partial_predicate = filtered_block.apply_unary_op(
column, ops.notnull_op
)
118 changes: 103 additions & 15 deletions bigframes/core/indexes/index.py
Original file line number Diff line number Diff line change
@@ -24,8 +24,10 @@

import bigframes.constants as constants
import bigframes.core as core
import bigframes.core.block_transforms as block_ops
import bigframes.core.blocks as blocks
import bigframes.core.joins as joins
import bigframes.core.ordering as order
import bigframes.core.utils as utils
import bigframes.dtypes
import bigframes.dtypes as bf_dtypes
@@ -149,6 +151,27 @@ def has_duplicates(self) -> bool:
def _block(self) -> blocks.Block:
return self._data._get_block()

@property
def T(self) -> Index:
    """Return the transpose of this index by delegating to ``transpose()``."""
    transposed = self.transpose()
    return transposed

def transpose(self) -> Index:
    """Return the transposed index; this implementation hands back ``self``."""
    return self

def sort_values(self, *, ascending: bool = True, na_position: str = "last"):
    """Return a new Index ordered by every index column of the block.

    Args:
        ascending: Use ascending order when True, descending order otherwise.
        na_position: Either ``"first"`` or ``"last"``; forwarded to each
            ordering reference as its ``na_last`` flag.

    Raises:
        ValueError: If ``na_position`` is neither ``"first"`` nor ``"last"``.
    """
    if na_position not in ("first", "last"):
        raise ValueError("Param na_position must be one of 'first' or 'last'")

    if ascending:
        sort_direction = order.OrderingDirection.ASC
    else:
        sort_direction = order.OrderingDirection.DESC
    nulls_last = na_position == "last"

    # One ordering key per index level, all sharing the same direction and
    # null placement.
    sort_keys = []
    for column_id in self._block.index_columns:
        sort_keys.append(
            order.OrderingColumnReference(
                column_id, direction=sort_direction, na_last=nulls_last
            )
        )
    sorted_block = self._block.order_by(sort_keys)
    return Index._from_block(sorted_block)

def astype(
self,
dtype: Union[bigframes.dtypes.DtypeString, bigframes.dtypes.Dtype],
@@ -176,6 +199,57 @@ def max(self) -> typing.Any:
def min(self) -> typing.Any:
    """Return the result of applying the ``min_op`` aggregation to this index."""
    minimum = self._apply_aggregation(agg_ops.min_op)
    return minimum

def argmax(self) -> int:
    """Return the row offset that comes first when the index is sorted descending.

    Row offsets are materialized as a column, the block is ordered by every
    index column in descending direction and then by the offsets column, and
    the offset in the first resulting row is returned.
    """
    block, offsets_id = self._block.promote_offsets()

    descending_keys = [
        order.OrderingColumnReference(
            level_id, direction=order.OrderingDirection.DESC
        )
        for level_id in self._block.index_columns
    ]
    # The offsets column is the final sort key, so rows with equal index
    # values are ordered by position (presumably ascending by default).
    offset_key = order.OrderingColumnReference(offsets_id)
    block = block.order_by(descending_keys + [offset_key])

    # Imported at function scope, as in the rest of this module.
    import bigframes.series as series

    first_offset = series.Series(block.select_column(offsets_id)).iloc[0]
    return typing.cast(int, first_offset)

def argmin(self) -> int:
    """Return the row offset that comes first when the index is sorted.

    Row offsets are materialized as a column, the block is ordered by every
    index column (default direction) and then by the offsets column, and the
    offset in the first resulting row is returned.
    """
    block, offsets_id = self._block.promote_offsets()

    sort_keys = []
    for level_id in self._block.index_columns:
        sort_keys.append(order.OrderingColumnReference(level_id))
    # The offsets column is the final sort key, so rows with equal index
    # values are ordered by position.
    sort_keys.append(order.OrderingColumnReference(offsets_id))
    block = block.order_by(sort_keys)

    # Imported at function scope, as in the rest of this module.
    import bigframes.series as series

    first_offset = series.Series(block.select_column(offsets_id)).iloc[0]
    return typing.cast(int, first_offset)

def value_counts(
    self,
    normalize: bool = False,
    sort: bool = True,
    ascending: bool = False,
    *,
    dropna: bool = True,
):
    """Count occurrences of each distinct index value and return them as a Series.

    Args:
        normalize: Forwarded to ``block_ops.value_counts``.
        sort: Accepted for signature compatibility.
        ascending: Forwarded to ``block_ops.value_counts``.
        dropna: Forwarded to ``block_ops.value_counts``.

    Returns:
        A ``bigframes.series.Series`` built from the counted block.
    """
    # NOTE(review): `sort` is not forwarded to block_ops.value_counts, so
    # passing sort=False has no effect here -- confirm whether the helper
    # accepts a `sort` argument and forward it if so.
    index_column_ids = self._block.index_columns
    counted_block = block_ops.value_counts(
        self._block,
        index_column_ids,
        normalize=normalize,
        ascending=ascending,
        dropna=dropna,
    )

    # Imported at function scope, as in the rest of this module.
    import bigframes.series as series

    return series.Series(counted_block)

def fillna(self, value=None) -> Index:
if self.nlevels > 1:
raise TypeError("Multiindex does not support 'fillna'")
@@ -185,10 +259,7 @@ def rename(self, name: Union[str, Sequence[str]]) -> Index:
names = [name] if isinstance(name, str) else list(name)
if len(names) != self.nlevels:
raise ValueError("'name' must be same length as levels")

import bigframes.dataframe as df

return Index(df.DataFrame(self._block.with_index_labels(names)))
return Index._from_block(self._block.with_index_labels(names))

def drop(
self,
@@ -210,9 +281,28 @@ def drop(
)
block = block.filter(condition_id, keep_null=True)
block = block.drop_columns([condition_id])
import bigframes.dataframe as df
return Index._from_block(block)

def dropna(self, how: str = "any") -> Index:
    """Return an Index with NA entries removed via ``block_ops.dropna``.

    Args:
        how: ``"any"`` or ``"all"``; forwarded to ``block_ops.dropna`` together
            with the block's index columns.

    Raises:
        ValueError: If ``how`` is not ``"any"`` or ``"all"``.
    """
    if how in ("any", "all"):
        filtered_block = block_ops.dropna(self._block, self._block.index_columns, how=how)  # type: ignore
        return Index._from_block(filtered_block)
    raise ValueError("'how' must be one of 'any', 'all'")

def drop_duplicates(self, *, keep: Union[str, bool] = "first") -> Index:
    """Return an Index with duplicate values removed.

    Args:
        keep: ``"first"``, ``"last"`` or ``False``; forwarded to
            ``block_ops.drop_duplicates`` together with the block's index
            columns.

    Raises:
        ValueError: If ``keep`` is not ``"first"``, ``"last"`` or ``False``.
    """
    # Validate eagerly, mirroring the `how` check in `dropna`. `is not False`
    # is used so that other falsy values such as 0 are rejected.
    if keep not in ("first", "last") and keep is not False:
        raise ValueError("'keep' must be one of 'first', 'last', or False")
    block = block_ops.drop_duplicates(self._block, self._block.index_columns, keep)
    return Index._from_block(block)

def isin(self, values) -> Index:
    """Return a boolean Index marking which index values are found in ``values``.

    Args:
        values: A list-like collection of sought values.

    Raises:
        TypeError: If ``values`` is not list-like.
    """
    if not utils.is_list_like(values):
        raise TypeError(
            "only list-like objects are allowed to be passed to "
            f"isin(), you passed a [{type(values).__name__}]"
        )

    # A stray `return Index(df.DataFrame(block.select_columns([])))` used to sit
    # here; it referenced names that do not exist in this method and made the
    # real result below unreachable.
    membership = self._apply_unary_op(ops.IsInOp(values, match_nulls=True))
    # Any nulls left in the membership result are reported as False.
    return membership.fillna(value=False)

def _apply_unary_op(
self,
@@ -226,9 +316,7 @@ def _apply_unary_op(
result_ids.append(result_id)

block = block.set_index(result_ids, index_labels=self._block.index_labels)
import bigframes.dataframe as df

return Index(df.DataFrame(block))
return Index._from_block(block)

def _apply_aggregation(self, op: agg_ops.AggregateOp) -> typing.Any:
if self.nlevels > 1:
@@ -262,6 +350,12 @@ def to_numpy(self, dtype=None, **kwargs) -> np.ndarray:
def __len__(self):
return self.shape[0]

@classmethod
def _from_block(cls, block: blocks.Block) -> Index:
    """Build an Index from ``block`` by wrapping it in a DataFrame first."""
    # Imported at function scope, as in the rest of this module.
    import bigframes.dataframe as df

    frame = df.DataFrame(block)
    return Index(frame)


class IndexValue:
"""An immutable index."""
@@ -356,12 +450,6 @@ def resolve_level_name(self: IndexValue, label: blocks.Label) -> str:
def is_uniquely_named(self: IndexValue):
    """Return True when no two entries of ``self.names`` are equal."""
    distinct_count = len(set(self.names))
    total_count = len(self.names)
    return distinct_count == total_count

def _set_block(self, block: blocks.Block):
self._block = block

def _get_block(self) -> blocks.Block:
return self._block


def join_mono_indexed(
left: IndexValue,
7 changes: 5 additions & 2 deletions bigframes/dataframe.py
Original file line number Diff line number Diff line change
@@ -1440,7 +1440,7 @@ def dropna(
axis_n = utils.get_axis_number(axis)

if axis_n == 0:
result = block_ops.dropna(self._block, how=how) # type: ignore
result = block_ops.dropna(self._block, self._block.value_columns, how=how) # type: ignore
if ignore_index:
result = result.reset_index()
return DataFrame(result)
@@ -1674,7 +1674,10 @@ def pivot(
def stack(self):
    """Stack the underlying block and drop NA rows with ``how="all"``.

    Delegates to ``Block.stack`` and then ``block_ops.dropna`` over the stacked
    block's value columns.

    Returns:
        A ``bigframes.series.Series`` when ``self.columns`` is not a
        ``pandas.MultiIndex``, otherwise a ``DataFrame``.
    """
    # TODO: support 'level' param by simply reordering levels such that selected level is last before passing to Block.stack.
    # TODO: match impl to pandas future_stack as described in pandas 2.1 release notes
    # Stack once and reuse the result: dropna needs the *stacked* block's value
    # columns, and the old single-argument call no longer matches its signature.
    stack_block = self._block.stack()
    result_block = block_ops.dropna(
        stack_block, stack_block.value_columns, how="all"
    )
    if not isinstance(self.columns, pandas.MultiIndex):
        return bigframes.series.Series(result_block)
    return DataFrame(result_block)
6 changes: 3 additions & 3 deletions bigframes/series.py
Original file line number Diff line number Diff line change
@@ -459,7 +459,7 @@ def dropna(
) -> Series:
if inplace:
raise NotImplementedError("'inplace'=True not supported")
result = block_ops.dropna(self._block, how="any")
result = block_ops.dropna(self._block, [self._value_column], how="any")
if ignore_index:
result = result.reset_index()
return Series(result)
@@ -856,7 +856,7 @@ def clip(self, lower, upper):
)
return Series(block.select_column(result_id).with_column_labels([self.name]))

def argmax(self) -> scalars.Scalar:
def argmax(self) -> int:
block, row_nums = self._block.promote_offsets()
block = block.order_by(
[
@@ -870,7 +870,7 @@ def argmax(self) -> scalars.Scalar:
scalars.Scalar, Series(block.select_column(row_nums)).iloc[0]
)

def argmin(self) -> scalars.Scalar:
def argmin(self) -> int:
block, row_nums = self._block.promote_offsets()
block = block.order_by(
[
120 changes: 120 additions & 0 deletions tests/system/small/test_index.py
Original file line number Diff line number Diff line change
@@ -14,6 +14,7 @@

import numpy
import pandas as pd
import pytest

from tests.system.utils import assert_pandas_index_equal_ignore_index_type

@@ -174,3 +175,122 @@ def test_is_monotonic_decreasing(scalars_df_index, scalars_pandas_df_index):
scalars_df_index.index.is_monotonic_increasing
== scalars_pandas_df_index.index.is_monotonic_increasing
)


def test_index_argmin(scalars_df_index, scalars_pandas_df_index):
    """Index.argmin on a two-level index agrees with pandas (skipped on pandas 1.x)."""
    if pd.__version__.startswith("1."):
        pytest.skip("doesn't work in pandas 1.x.")

    bf_index = scalars_df_index.set_index(["int64_too", "rowindex_2"]).index
    actual = bf_index.argmin()

    pd_index = scalars_pandas_df_index.set_index(["int64_too", "rowindex_2"]).index
    expected = pd_index.argmin()

    assert actual == expected


def test_index_argmax(scalars_df_index, scalars_pandas_df_index):
    """Index.argmax on a two-level index agrees with pandas (skipped on pandas 1.x)."""
    if pd.__version__.startswith("1."):
        pytest.skip("doesn't work in pandas 1.x.")

    bf_index = scalars_df_index.set_index(["int64_too", "rowindex_2"]).index
    actual = bf_index.argmax()

    pd_index = scalars_pandas_df_index.set_index(["int64_too", "rowindex_2"]).index
    expected = pd_index.argmax()

    assert actual == expected


@pytest.mark.parametrize(
    ("ascending", "na_position"),
    [
        (True, "first"),
        (True, "last"),
        (False, "first"),
        (False, "last"),
    ],
)
def test_index_sort_values(
    scalars_df_index, scalars_pandas_df_index, ascending, na_position
):
    """Index.sort_values matches pandas for each direction / NA placement pair."""
    # Test needs values to be unique
    bf_index = scalars_df_index.set_index(["int64_too", "rowindex_2"]).index
    bf_sorted = bf_index.sort_values(ascending=ascending, na_position=na_position)
    actual = bf_sorted.to_pandas()

    pd_index = scalars_pandas_df_index.set_index(["int64_too", "rowindex_2"]).index
    expected = pd_index.sort_values(ascending=ascending, na_position=na_position)

    pd.testing.assert_index_equal(actual, expected)


def test_index_value_counts(scalars_df_index, scalars_pandas_df_index):
    """Index.value_counts on a two-level index agrees with pandas, ignoring dtype."""
    if pd.__version__.startswith("1."):
        pytest.skip("value_counts results different in pandas 1.x.")

    bf_index = scalars_df_index.set_index(["int64_too", "rowindex_2"]).index
    actual = bf_index.value_counts().to_pandas()

    pd_index = scalars_pandas_df_index.set_index(["int64_too", "rowindex_2"]).index
    expected = pd_index.value_counts()

    pd.testing.assert_series_equal(actual, expected, check_dtype=False)


@pytest.mark.parametrize("how", ["any", "all"])
def test_index_dropna(scalars_df_index, scalars_pandas_df_index, how):
    """Index.dropna on a two-level index agrees with pandas for each ``how``."""
    bf_index = scalars_df_index.set_index(["int64_col", "float64_col"]).index
    actual = bf_index.dropna(how=how).to_pandas()

    pd_index = scalars_pandas_df_index.set_index(["int64_col", "float64_col"]).index
    expected = pd_index.dropna(how=how)

    pd.testing.assert_index_equal(expected, actual)


@pytest.mark.parametrize("keep", ["first", "last", False])
def test_index_drop_duplicates(scalars_df_index, scalars_pandas_df_index, keep):
    """Index.drop_duplicates agrees with pandas for every supported ``keep``."""
    bf_index = scalars_df_index.set_index("int64_col").index
    actual = bf_index.drop_duplicates(keep=keep).to_pandas()

    pd_index = scalars_pandas_df_index.set_index("int64_col").index
    expected = pd_index.drop_duplicates(keep=keep)

    pd.testing.assert_index_equal(expected, actual)


def test_index_isin(scalars_df_index, scalars_pandas_df_index):
    """Index.isin yields the same booleans as pandas; index names are not compared."""
    bf_index = scalars_df_index.set_index("int64_col").index
    actual = bf_index.isin([2, 55555, 4]).to_pandas()

    pd_index = scalars_pandas_df_index.set_index("int64_col").index
    # The pandas result is wrapped in pd.Index so it can be compared with
    # assert_index_equal.
    expected = pd.Index(pd_index.isin([2, 55555, 4]))

    pd.testing.assert_index_equal(expected, actual, check_names=False)
135 changes: 135 additions & 0 deletions third_party/bigframes_vendored/pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
@@ -53,6 +53,20 @@ def dtypes(self):
"""Return the dtypes as a Series for the underlying MultiIndex."""
raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)

@property
def T(self) -> Index:
    """Return the transpose of the index, which is by definition the index itself."""
    raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)

def transpose(self) -> Index:
    """
    Return the transpose of the index, which is by definition the index itself.

    Returns:
        Index
    """
    raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)

def astype(self, dtype):
"""Create an Index with values cast to dtypes.
@@ -67,6 +81,23 @@ def astype(self, dtype):
"""
raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)

def isin(self, values):
    """
    Return a boolean Index where the index values are in `values`.

    Compute, for each index value, whether it is found in the passed set of
    values. The length of the result matches the length of the index.

    Args:
        values (set or list-like):
            Sought values.

    Returns:
        Index: Index of boolean values.
    """
    raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)

def all(self) -> bool:
"""Return whether all elements are Truthy.
@@ -99,6 +130,30 @@ def max(self):
"""
raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)

def argmin(self) -> int:
    """
    Return int position of the smallest value in the Index.

    If the minimum is achieved in multiple locations,
    the first row position is returned.

    Returns:
        int: Row position of the minimum value.
    """
    raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)

def argmax(self) -> int:
    """
    Return int position of the largest value in the Index.

    If the maximum is achieved in multiple locations,
    the first row position is returned.

    Returns:
        int: Row position of the maximum value.
    """
    raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)

def nunique(self) -> int:
"""Return number of unique elements in the object.
@@ -109,6 +164,57 @@ def nunique(self) -> int:
"""
raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)

def sort_values(
    self, *, ascending: bool = True, na_position: str = "last"
) -> Index:
    """
    Return a sorted copy of the index.

    Args:
        ascending (bool, default True):
            Should the index values be sorted in an ascending order.
        na_position ({'first' or 'last'}, default 'last'):
            Argument 'first' puts NaNs at the beginning, 'last' puts NaNs at
            the end.

    Returns:
        Index: Sorted copy of the index.
    """
    raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)

def value_counts(
    self,
    normalize: bool = False,
    sort: bool = True,
    ascending: bool = False,
    *,
    dropna: bool = True,
):
    """Return a Series containing counts of unique values.

    The resulting object will be in descending order so that the
    first element is the most frequently-occurring element.
    Excludes NA values by default.

    Args:
        normalize (bool, default False):
            If True then the object returned will contain the relative
            frequencies of the unique values.
        sort (bool, default True):
            Sort by frequencies.
        ascending (bool, default False):
            Sort in ascending order.
        dropna (bool, default True):
            Don't include counts of NaN.

    Returns:
        Series
    """
    raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)

def fillna(self, value) -> Index:
"""
Fill NA/NaN values with the specified value.
@@ -151,6 +257,35 @@ def drop(self, labels) -> Index:
"""
raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)

def dropna(self, how: str = "any"):
    """Return an Index with NA/NaN values removed.

    Args:
        how ({'any', 'all'}, default 'any'):
            For a MultiIndex, drop an entry when any ('any') or all ('all')
            of its levels are NaN.

    Returns:
        Index
    """
    raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)

def drop_duplicates(self, *, keep: str = "first"):
    """
    Return an Index with duplicate values removed.

    Args:
        keep ({'first', 'last', ``False``}, default 'first'):
            Which occurrence of a duplicated value to retain:
            'first' : Drop duplicates except for the first occurrence.
            'last' : Drop duplicates except for the last occurrence.
            ``False`` : Drop all duplicates.

    Returns:
        Index
    """
    raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)

def to_numpy(self, dtype):
"""
A NumPy ndarray representing the values in this Series or Index.