diff --git a/CHANGELOG.md b/CHANGELOG.md index 93ebadb56f..845d3634bc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,35 @@ [1]: https://pypi.org/project/bigframes/#history +## [0.12.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v0.11.0...v0.12.0) (2023-11-01) + + +### Features + +* Add `DataFrame.melt` ([#113](https://github.com/googleapis/python-bigquery-dataframes/issues/113)) ([4e4409c](https://github.com/googleapis/python-bigquery-dataframes/commit/4e4409c5b235171f3770aec852193026519948fd)) +* Add `DataFrame.to_pandas_batches()` to download large `DataFrame` objects ([#136](https://github.com/googleapis/python-bigquery-dataframes/issues/136)) ([3afd4a3](https://github.com/googleapis/python-bigquery-dataframes/commit/3afd4a35f4c38dad86dab17ff62444cd418cab88)) +* Add bigframes.options.compute.maximum_bytes_billed option that sets maximum bytes billed on query jobs ([#133](https://github.com/googleapis/python-bigquery-dataframes/issues/133)) ([63c7919](https://github.com/googleapis/python-bigquery-dataframes/commit/63c7919e28d2e0b864142320b47374d807f07c03)) +* Add pandas.qcut ([#104](https://github.com/googleapis/python-bigquery-dataframes/issues/104)) ([8e44518](https://github.com/googleapis/python-bigquery-dataframes/commit/8e4451841ba09099b0ed5433f9102511741dfbed)) +* Add pd.get_dummies ([#149](https://github.com/googleapis/python-bigquery-dataframes/issues/149)) ([d8baad5](https://github.com/googleapis/python-bigquery-dataframes/commit/d8baad5b71ec67a35a0fb6132ee16e4c7418c456)) +* Add unstack to series, add level param ([#115](https://github.com/googleapis/python-bigquery-dataframes/issues/115)) ([5edcd19](https://github.com/googleapis/python-bigquery-dataframes/commit/5edcd19e6200db9b9ebe3d4945816b3ebf1f7bcd)) +* Implement operator `@` for `DataFrame.dot` ([#139](https://github.com/googleapis/python-bigquery-dataframes/issues/139)) ([79a638e](https://github.com/googleapis/python-bigquery-dataframes/commit/79a638eda80c482b640b523426ffd95c42747edc)) +* Populate ibis version in user agent ([#140](https://github.com/googleapis/python-bigquery-dataframes/issues/140)) ([c639a36](https://github.com/googleapis/python-bigquery-dataframes/commit/c639a3657465e2b68a3b93c363bd3ae1e969d2cc)) + + +### Bug Fixes + +* Don't override the global logging config ([#138](https://github.com/googleapis/python-bigquery-dataframes/issues/138)) ([2ddbf74](https://github.com/googleapis/python-bigquery-dataframes/commit/2ddbf743efc2fd8ffb61ae8d3333fc4b98ce4b55)) +* Fix bug with column names under repeated column assignment ([#150](https://github.com/googleapis/python-bigquery-dataframes/issues/150)) ([29032d0](https://github.com/googleapis/python-bigquery-dataframes/commit/29032d06811569121f7be2a7de915740df7daf6e)) +* Resolve plotly rendering issue by using ipython html for job pro… 
([#134](https://github.com/googleapis/python-bigquery-dataframes/issues/134)) ([39df43e](https://github.com/googleapis/python-bigquery-dataframes/commit/39df43e243ac0374d1a1eb2a75779324825afbe9)) +* Use indexee's session for loc listlike cases ([#152](https://github.com/googleapis/python-bigquery-dataframes/issues/152)) ([27c5725](https://github.com/googleapis/python-bigquery-dataframes/commit/27c57255c7fe11e1ef9b9826d988d80fc17442a6)) + + +### Documentation + +* Add artithmetic df sample code ([#153](https://github.com/googleapis/python-bigquery-dataframes/issues/153)) ([ac44ccd](https://github.com/googleapis/python-bigquery-dataframes/commit/ac44ccd3936cdb28755d2bbe16377d489f08d5e5)) +* Fix indentation on `read_gbq_function` code sample ([#163](https://github.com/googleapis/python-bigquery-dataframes/issues/163)) ([0801d96](https://github.com/googleapis/python-bigquery-dataframes/commit/0801d96830dab467232277dea9fd2dacee41055c)) +* Link to ML.EVALUATE BQML page for score() methods ([#137](https://github.com/googleapis/python-bigquery-dataframes/issues/137)) ([45c617f](https://github.com/googleapis/python-bigquery-dataframes/commit/45c617fee7becc42f1c129246ffdc32f3a963f12)) + ## [0.11.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v0.10.0...v0.11.0) (2023-10-26) diff --git a/bigframes/__init__.py b/bigframes/__init__.py index 8f41790072..bd1476957b 100644 --- a/bigframes/__init__.py +++ b/bigframes/__init__.py @@ -14,7 +14,7 @@ """BigQuery DataFrames provides a DataFrame API scaled by the BigQuery engine.""" -from bigframes._config import options +from bigframes._config import option_context, options from bigframes._config.bigquery_options import BigQueryOptions from bigframes.core.global_session import close_session, get_global_session from bigframes.session import connect, Session @@ -28,4 +28,5 @@ "connect", "Session", "__version__", + "option_context", ] diff --git a/bigframes/_config/__init__.py b/bigframes/_config/__init__.py index e26eaf8800..8dcebfce6a 100644 --- a/bigframes/_config/__init__.py +++ b/bigframes/_config/__init__.py @@ -18,8 +18,10 @@ """ import bigframes._config.bigquery_options as bigquery_options +import bigframes._config.compute_options as compute_options import bigframes._config.display_options as display_options import bigframes._config.sampling_options as sampling_options +import third_party.bigframes_vendored.pandas._config.config as pandas_config class Options: @@ -29,6 +31,7 @@ def __init__(self): self._bigquery_options = bigquery_options.BigQueryOptions() self._display_options = display_options.DisplayOptions() self._sampling_options = sampling_options.SamplingOptions() + self._compute_options = compute_options.ComputeOptions() @property def bigquery(self) -> bigquery_options.BigQueryOptions: @@ -49,6 +52,11 @@ def sampling(self) -> sampling_options.SamplingOptions: parameters in specific functions.""" return self._sampling_options + @property + def compute(self) -> compute_options.ComputeOptions: + """Options controlling object computation.""" + return self._compute_options + options = Options() """Global options for default session.""" @@ -58,3 +66,6 @@ def sampling(self) -> 
sampling_options.SamplingOptions: "Options", "options", ) + + +option_context = pandas_config.option_context diff --git a/bigframes/_config/compute_options.py b/bigframes/_config/compute_options.py new file mode 100644 index 0000000000..20c31d3906 --- /dev/null +++ b/bigframes/_config/compute_options.py @@ -0,0 +1,35 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Options for displaying objects.""" + +import dataclasses +from typing import Optional + + +@dataclasses.dataclass +class ComputeOptions: + """ + Encapsulates configuration for compute options. + + Attributes: + maximum_bytes_billed (int, Options): + Limits the bytes billed for query jobs. Queries that will have + bytes billed beyond this limit will fail (without incurring a + charge). If unspecified, this will be set to your project default. + See `maximum_bytes_billed `_. + + """ + + maximum_bytes_billed: Optional[int] = None diff --git a/bigframes/_config/display_options.py b/bigframes/_config/display_options.py index 8bd2743f17..ad3ea3f68c 100644 --- a/bigframes/_config/display_options.py +++ b/bigframes/_config/display_options.py @@ -40,17 +40,12 @@ def pandas_repr(display_options: DisplayOptions): This context manager makes sure we reset the pandas options when we're done so that we don't override pandas behavior. 
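# --- Illustrative sketch (annotation, not committed code): usage of the options surface added above.
# The diff introduces `bigframes.options.compute.maximum_bytes_billed` (queries expected to bill
# beyond the limit fail without incurring a charge, per the ComputeOptions docstring) and re-exports
# the vendored pandas `option_context` helper as `bigframes.option_context`. Setting the attribute
# directly follows the diff/CHANGELOG; the dotted-key call to `option_context` below is an assumption
# about how the vendored helper is addressed and is shown only as a hedged example.

import bigframes

# Cap billing for subsequent query jobs at roughly 10 GB; larger queries fail instead of running.
bigframes.options.compute.maximum_bytes_billed = 10_000_000_000

# Temporarily tighten the limit and restore the previous value on exit, pandas-style.
with bigframes.option_context("compute.maximum_bytes_billed", 1_000_000_000):
    ...  # queries issued inside this block would use the temporary limit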
""" - original_max_cols = pd.options.display.max_columns - original_max_rows = pd.options.display.max_rows - original_show_dimensions = pd.options.display.show_dimensions - - pd.options.display.max_columns = display_options.max_columns - pd.options.display.max_rows = display_options.max_rows - pd.options.display.show_dimensions = True # type: ignore - - try: - yield - finally: - pd.options.display.max_columns = original_max_cols - pd.options.display.max_rows = original_max_rows - pd.options.display.show_dimensions = original_show_dimensions + with pd.option_context( + "display.max_columns", + display_options.max_columns, + "display.max_rows", + display_options.max_rows, + "display.show_dimensions", + True, + ) as pandas_context: + yield (pandas_context) diff --git a/bigframes/clients.py b/bigframes/clients.py index 4ba9d93d69..de2421e499 100644 --- a/bigframes/clients.py +++ b/bigframes/clients.py @@ -24,9 +24,6 @@ from google.cloud import bigquery_connection_v1, resourcemanager_v3 from google.iam.v1 import iam_policy_pb2, policy_pb2 -logging.basicConfig( - level=logging.INFO, format="[%(levelname)s][%(asctime)s][%(name)s] %(message)s" -) logger = logging.getLogger(__name__) diff --git a/bigframes/core/__init__.py b/bigframes/core/__init__.py index 6c78a07f3b..4653f0ab6a 100644 --- a/bigframes/core/__init__.py +++ b/bigframes/core/__init__.py @@ -14,29 +14,21 @@ from __future__ import annotations from dataclasses import dataclass -import functools -import math -import textwrap +import io import typing -from typing import Collection, Iterable, Literal, Optional, Sequence, Tuple +from typing import Iterable, Literal, Optional, Sequence, Tuple from google.cloud import bigquery import ibis -import ibis.expr.datatypes as ibis_dtypes import ibis.expr.types as ibis_types import pandas -import bigframes.constants as constants +import bigframes.core.compile as compiled import bigframes.core.guid -from bigframes.core.ordering import ( - encode_order_string, - ExpressionOrdering, - IntegerEncoding, - OrderingColumnReference, - reencode_order_string, - StringEncoding, -) -import bigframes.core.utils as utils +import bigframes.core.nodes as nodes +from bigframes.core.ordering import OrderingColumnReference +import bigframes.core.ordering as orderings +from bigframes.core.window_spec import WindowSpec import bigframes.dtypes import bigframes.operations as ops import bigframes.operations.aggregations as agg_ops @@ -49,470 +41,190 @@ @dataclass(frozen=True) -class WindowSpec: +class ArrayValue: """ - Specifies a window over which aggregate and analytic function may be applied. - grouping_keys: set of column ids to group on - preceding: Number of preceding rows in the window - following: Number of preceding rows in the window - ordering: List of columns ids and ordering direction to override base ordering + ArrayValue is an immutable type representing a 2D array with per-column types. """ - grouping_keys: typing.Sequence[str] = tuple() - ordering: typing.Sequence[OrderingColumnReference] = tuple() - preceding: typing.Optional[int] = None - following: typing.Optional[int] = None - min_periods: int = 0 - - -# TODO(swast): We might want to move this to it's own sub-module. -class ArrayValue: - """Immutable BigQuery DataFrames expression tree. - - Note: Usage of this class is considered to be private and subject to change - at any time. + node: nodes.BigFrameNode - This class is a wrapper around Ibis expressions. 
Its purpose is to defer - Ibis projection operations to keep generated SQL small and correct when - mixing and matching columns from different versions of a DataFrame. - - Args: - session: - A BigQuery DataFrames session to allow more flexibility in running - queries. - table: An Ibis table expression. - columns: Ibis value expressions that can be projected as columns. - hidden_ordering_columns: Ibis value expressions to store ordering. - ordering: An ordering property of the data frame. - predicates: A list of filters on the data frame. - """ - - def __init__( - self, + @classmethod + def from_ibis( + cls, session: Session, table: ibis_types.Table, columns: Sequence[ibis_types.Value], - hidden_ordering_columns: Optional[Sequence[ibis_types.Value]] = None, - ordering: ExpressionOrdering = ExpressionOrdering(), - predicates: Optional[Collection[ibis_types.BooleanValue]] = None, + hidden_ordering_columns: Sequence[ibis_types.Value], + ordering: orderings.ExpressionOrdering, ): - self._session = session - self._table = table - self._predicates = tuple(predicates) if predicates is not None else () - # TODO: Validate ordering - if not ordering.total_ordering_columns: - raise ValueError("Must have total ordering defined by one or more columns") - self._ordering = ordering - # Allow creating a DataFrame directly from an Ibis table expression. - # TODO(swast): Validate that each column references the same table (or - # no table for literal values). - self._columns = tuple(columns) - - # Meta columns store ordering, or other data that doesn't correspond to dataframe columns - self._hidden_ordering_columns = ( - tuple(hidden_ordering_columns) - if hidden_ordering_columns is not None - else () - ) - - # To allow for more efficient lookup by column name, create a - # dictionary mapping names to column values. - self._column_names = {column.get_name(): column for column in self._columns} - self._hidden_ordering_column_names = { - column.get_name(): column for column in self._hidden_ordering_columns - } - ### Validation - value_col_ids = self._column_names.keys() - hidden_col_ids = self._hidden_ordering_column_names.keys() - - all_columns = value_col_ids | hidden_col_ids - ordering_valid = all( - col.column_id in all_columns for col in ordering.all_ordering_columns + node = nodes.ReadGbqNode( + table=table, + table_session=session, + columns=tuple(columns), + hidden_ordering_columns=tuple(hidden_ordering_columns), + ordering=ordering, ) - if value_col_ids & hidden_col_ids: - raise ValueError( - f"Keys in both hidden and exposed list: {value_col_ids & hidden_col_ids}" - ) - if not ordering_valid: - raise ValueError(f"Illegal ordering keys: {ordering.all_ordering_columns}") + return cls(node) @classmethod - def mem_expr_from_pandas( - cls, - pd_df: pandas.DataFrame, - session: Optional[Session], - ) -> ArrayValue: - """ - Builds an in-memory only (SQL only) expr from a pandas dataframe. + def from_pandas(cls, pd_df: pandas.DataFrame): + iobytes = io.BytesIO() + # Discard row labels and use simple string ids for columns + column_ids = tuple(str(label) for label in pd_df.columns) + pd_df.reset_index(drop=True).set_axis(column_ids, axis=1).to_feather(iobytes) + node = nodes.ReadLocalNode(iobytes.getvalue(), column_ids=column_ids) + return cls(node) - Caution: If session is None, only a subset of expr functionality will - be available (null Session is usually not supported). 
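# --- Illustrative sketch (annotation, not committed code): the hunk above replaces the eager,
# Ibis-backed ArrayValue with a thin immutable wrapper around a tree of operation nodes
# (`nodes.BigFrameNode`) that is only lowered when `compile()` is called. The toy example below
# shows that deferred expression-tree pattern in isolation; the node names and the SQL target are
# invented for illustration and are not the real `bigframes.core.nodes` API.

from __future__ import annotations
from dataclasses import dataclass
from typing import Tuple


@dataclass(frozen=True)
class Node:
    """Base class for immutable plan nodes."""


@dataclass(frozen=True)
class ReadTable(Node):
    table: str
    columns: Tuple[str, ...]


@dataclass(frozen=True)
class Filter(Node):
    child: Node
    predicate: str


def compile_node(node: Node) -> str:
    # Lower the node tree to SQL only when asked, so building and combining
    # expressions stays cheap and side-effect free.
    if isinstance(node, ReadTable):
        return f"SELECT {', '.join(node.columns)} FROM `{node.table}`"
    if isinstance(node, Filter):
        return f"SELECT * FROM ({compile_node(node.child)}) WHERE {node.predicate}"
    raise TypeError(f"unhandled node: {type(node).__name__}")


plan = Filter(ReadTable("proj.ds.t", ("a", "b")), predicate="a > 0")
print(compile_node(plan))  # nested SELECT generated from the immutable plan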
- """ - # We can't include any hidden columns in the ArrayValue constructor, so - # grab the column names before we add the hidden ordering column. - column_names = [str(column) for column in pd_df.columns] - # Make sure column names are all strings. - pd_df = pd_df.set_axis(column_names, axis="columns") - pd_df = pd_df.assign(**{ORDER_ID_COLUMN: range(len(pd_df))}) - - # ibis memtable cannot handle NA, must convert to None - pd_df = pd_df.astype("object") # type: ignore - pd_df = pd_df.where(pandas.notnull(pd_df), None) + @property + def column_ids(self) -> typing.Sequence[str]: + return self.compile().column_ids - # NULL type isn't valid in BigQuery, so retry with an explicit schema in these cases. - keys_memtable = ibis.memtable(pd_df) - schema = keys_memtable.schema() - new_schema = [] - for column_index, column in enumerate(schema): - if column == ORDER_ID_COLUMN: - new_type: ibis_dtypes.DataType = ibis_dtypes.int64 - else: - column_type = schema[column] - # The autodetected type might not be one we can support, such - # as NULL type for empty rows, so convert to a type we do - # support. - new_type = bigframes.dtypes.bigframes_dtype_to_ibis_dtype( - bigframes.dtypes.ibis_dtype_to_bigframes_dtype(column_type) - ) - # TODO(swast): Ibis memtable doesn't use backticks in struct - # field names, so spaces and other characters aren't allowed in - # the memtable context. Blocked by - # https://github.com/ibis-project/ibis/issues/7187 - column = f"col_{column_index}" - new_schema.append((column, new_type)) + @property + def session(self) -> Session: + required_session = self.node.session + from bigframes import get_global_session - # must set non-null column labels. these are not the user-facing labels - pd_df = pd_df.set_axis( - [column for column, _ in new_schema], - axis="columns", - ) - keys_memtable = ibis.memtable(pd_df, schema=ibis.schema(new_schema)) + return self.node.session[0] if required_session else get_global_session() - return cls( - session, # type: ignore # Session cannot normally be none, see "caution" above - keys_memtable, - columns=[ - keys_memtable[f"col_{column_index}"].name(column) - for column_index, column in enumerate(column_names) - ], - ordering=ExpressionOrdering( - ordering_value_columns=[OrderingColumnReference(ORDER_ID_COLUMN)], - total_ordering_columns=frozenset([ORDER_ID_COLUMN]), - ), - hidden_ordering_columns=(keys_memtable[ORDER_ID_COLUMN],), - ) - - @property - def columns(self) -> typing.Tuple[ibis_types.Value, ...]: - return self._columns + def get_column_type(self, key: str) -> bigframes.dtypes.Dtype: + return self.compile().get_column_type(key) - @property - def column_ids(self) -> typing.Sequence[str]: - return tuple(self._column_names.keys()) + def compile(self) -> compiled.CompiledArrayValue: + return compiled.compile_node(self.node) - @property - def _hidden_column_ids(self) -> typing.Sequence[str]: - return tuple(self._hidden_ordering_column_names.keys()) + def shape(self) -> typing.Tuple[int, int]: + """Returns dimensions as (length, width) tuple.""" + width = len(self.compile().columns) + count_expr = self.compile()._to_ibis_expr("unordered").count() - @property - def _reduced_predicate(self) -> typing.Optional[ibis_types.BooleanValue]: - """Returns the frame's predicates as an equivalent boolean value, useful where a single predicate value is preferred.""" - return ( - _reduce_predicate_list(self._predicates).name(PREDICATE_COLUMN) - if self._predicates - else None + # Support in-memory engines for hermetic 
unit tests. + if not self.node.session: + try: + length = ibis.pandas.connect({}).execute(count_expr) + return (length, width) + except Exception: + # Not all cases can be handled by pandas engine + pass + + sql = self.session.ibis_client.compile(count_expr) + row_iterator, _ = self.session._start_query( + sql=sql, + max_results=1, ) + length = next(row_iterator)[0] + return (length, width) - @property - def _ibis_order(self) -> Sequence[ibis_types.Value]: - """Returns a sequence of ibis values which can be directly used to order a table expression. Has direction modifiers applied.""" - return _convert_ordering_to_table_values( - {**self._column_names, **self._hidden_ordering_column_names}, - self._ordering.all_ordering_columns, + def to_sql( + self, + offset_column: typing.Optional[str] = None, + col_id_overrides: typing.Mapping[str, str] = {}, + sorted: bool = False, + ) -> str: + return self.compile().to_sql( + offset_column=offset_column, + col_id_overrides=col_id_overrides, + sorted=sorted, ) - def builder(self) -> ArrayValueBuilder: - """Creates a mutable builder for expressions.""" - # Since ArrayValue is intended to be immutable (immutability offers - # potential opportunities for caching, though we might need to introduce - # more node types for that to be useful), we create a builder class. - return ArrayValueBuilder( - self._session, - self._table, - columns=self._columns, - hidden_ordering_columns=self._hidden_ordering_columns, - ordering=self._ordering, - predicates=self._predicates, + def start_query( + self, + job_config: Optional[bigquery.job.QueryJobConfig] = None, + max_results: Optional[int] = None, + *, + sorted: bool = True, + ) -> Tuple[bigquery.table.RowIterator, bigquery.QueryJob]: + """Execute a query and return metadata about the results.""" + # TODO(swast): Cache the job ID so we can look it up again if they ask + # for the results? We'd need a way to invalidate the cache if DataFrame + # becomes mutable, though. Or move this method to the immutable + # expression class. + # TODO(swast): We might want to move this method to Session and/or + # provide our own minimal metadata class. Tight coupling to the + # BigQuery client library isn't ideal, especially if we want to support + # a LocalSession for unit testing. + # TODO(swast): Add a timeout here? If the query is taking a long time, + # maybe we just print the job metadata that we have so far? 
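# --- Illustrative sketch (annotation, not committed code): `shape()` above computes the row count
# by first trying an in-process engine when the expression has no session (useful for hermetic unit
# tests on purely local data) and falling back to running a COUNT query through the session. A
# stripped-down version of that fallback structure, with the executor callables left hypothetical:

from typing import Callable, Optional


def row_count(
    local_execute: Optional[Callable[[], int]],
    remote_execute: Callable[[str], int],
    count_sql: str,
) -> int:
    if local_execute is not None:
        try:
            return local_execute()  # cheap, in-memory path
        except Exception:
            pass  # not every expression can be handled by the local engine
    return remote_execute(count_sql)  # authoritative path: run the COUNT remotely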
+ sql = self.to_sql(sorted=sorted) # type:ignore + return self.session._start_query( + sql=sql, + job_config=job_config, + max_results=max_results, ) - def drop_columns(self, columns: Iterable[str]) -> ArrayValue: - # Must generate offsets if we are dropping a column that ordering depends on - expr = self - for ordering_column in set(columns).intersection( - [col.column_id for col in self._ordering.ordering_value_columns] - ): - expr = self._hide_column(ordering_column) - - expr_builder = expr.builder() - remain_cols = [ - column for column in expr.columns if column.get_name() not in columns - ] - expr_builder.columns = remain_cols - return expr_builder.build() - - def get_column_type(self, key: str) -> bigframes.dtypes.Dtype: - ibis_type = typing.cast( - bigframes.dtypes.IbisDtype, self._get_any_column(key).type() + def cached(self, cluster_cols: typing.Sequence[str]) -> ArrayValue: + """Write the ArrayValue to a session table and create a new block object that references it.""" + compiled = self.compile() + ibis_expr = compiled._to_ibis_expr("unordered", expose_hidden_cols=True) + destination = self.session._ibis_to_session_table( + ibis_expr, cluster_cols=cluster_cols, api_name="cache" ) - return typing.cast( - bigframes.dtypes.Dtype, - bigframes.dtypes.ibis_dtype_to_bigframes_dtype(ibis_type), + table_expression = self.session.ibis_client.table( + f"{destination.project}.{destination.dataset_id}.{destination.table_id}" + ) + new_columns = [table_expression[column] for column in compiled.column_ids] + new_hidden_columns = [ + table_expression[column] + for column in compiled._hidden_ordering_column_names + ] + return ArrayValue.from_ibis( + self.session, + table_expression, + columns=new_columns, + hidden_ordering_columns=new_hidden_columns, + ordering=compiled._ordering, ) - def _get_ibis_column(self, key: str) -> ibis_types.Value: - """Gets the Ibis expression for a given column.""" - if key not in self.column_ids: - raise ValueError( - "Column name {} not in set of values: {}".format(key, self.column_ids) - ) - return typing.cast(ibis_types.Value, self._column_names[key]) - - def _get_any_column(self, key: str) -> ibis_types.Value: - """Gets the Ibis expression for a given column. 
Will also get hidden columns.""" - all_columns = {**self._column_names, **self._hidden_ordering_column_names} - if key not in all_columns.keys(): - raise ValueError( - "Column name {} not in set of values: {}".format( - key, all_columns.keys() - ) - ) - return typing.cast(ibis_types.Value, all_columns[key]) + # Operations - def _get_hidden_ordering_column(self, key: str) -> ibis_types.Column: - """Gets the Ibis expression for a given hidden column.""" - if key not in self._hidden_ordering_column_names.keys(): - raise ValueError( - "Column name {} not in set of values: {}".format( - key, self._hidden_ordering_column_names.keys() - ) - ) - return typing.cast(ibis_types.Column, self._hidden_ordering_column_names[key]) + def drop_columns(self, columns: Iterable[str]) -> ArrayValue: + return ArrayValue( + nodes.DropColumnsNode(child=self.node, columns=tuple(columns)) + ) def filter(self, predicate_id: str, keep_null: bool = False) -> ArrayValue: """Filter the table on a given expression, the predicate must be a boolean series aligned with the table expression.""" - condition = typing.cast( - ibis_types.BooleanValue, self._get_ibis_column(predicate_id) - ) - if keep_null: - condition = typing.cast( - ibis_types.BooleanValue, - condition.fillna( - typing.cast(ibis_types.BooleanScalar, ibis_types.literal(True)) - ), + return ArrayValue( + nodes.FilterNode( + child=self.node, predicate_id=predicate_id, keep_null=keep_null ) - return self._filter(condition) - - def _filter(self, predicate_value: ibis_types.BooleanValue) -> ArrayValue: - """Filter the table on a given expression, the predicate must be a boolean series aligned with the table expression.""" - expr = self.builder() - expr.ordering = expr.ordering.with_non_sequential() - expr.predicates = [*self._predicates, predicate_value] - return expr.build() + ) def order_by( self, by: Sequence[OrderingColumnReference], stable: bool = False ) -> ArrayValue: - expr_builder = self.builder() - expr_builder.ordering = self._ordering.with_ordering_columns(by, stable=stable) - return expr_builder.build() - - def reversed(self) -> ArrayValue: - expr_builder = self.builder() - expr_builder.ordering = self._ordering.with_reverse() - return expr_builder.build() - - def _uniform_sampling(self, fraction: float) -> ArrayValue: - """Sampling the table on given fraction. - - .. warning:: - The row numbers of result is non-deterministic, avoid to use. - """ - table = self._to_ibis_expr( - "unordered", expose_hidden_cols=True, fraction=fraction - ) - columns = [table[column_name] for column_name in self._column_names] - hidden_ordering_columns = [ - table[column_name] for column_name in self._hidden_ordering_column_names - ] return ArrayValue( - self._session, - table, - columns=columns, - hidden_ordering_columns=hidden_ordering_columns, - ordering=self._ordering, + nodes.OrderByNode(child=self.node, by=tuple(by), stable=stable) ) - @property - def _offsets(self) -> ibis_types.IntegerColumn: - if not self._ordering.is_sequential: - raise ValueError( - "Expression does not have offsets. Generate them first using project_offsets." - ) - if not self._ordering.total_order_col: - raise ValueError( - "Ordering is invalid. Marked as sequential but no total order columns." - ) - column = self._get_any_column(self._ordering.total_order_col.column_id) - return typing.cast(ibis_types.IntegerColumn, column) - - def _project_offsets(self) -> ArrayValue: - """Create a new expression that contains offsets. Should only be executed when offsets are needed for an operations. 
Has no effect on expression semantics.""" - if self._ordering.is_sequential: - return self - # TODO(tbergeron): Enforce total ordering - table = self._to_ibis_expr( - ordering_mode="offset_col", order_col_name=ORDER_ID_COLUMN - ) - columns = [table[column_name] for column_name in self._column_names] - ordering = ExpressionOrdering( - ordering_value_columns=[OrderingColumnReference(ORDER_ID_COLUMN)], - total_ordering_columns=frozenset([ORDER_ID_COLUMN]), - integer_encoding=IntegerEncoding(True, is_sequential=True), - ) - return ArrayValue( - self._session, - table, - columns=columns, - hidden_ordering_columns=[table[ORDER_ID_COLUMN]], - ordering=ordering, - ) - - def _hide_column(self, column_id) -> ArrayValue: - """Pushes columns to hidden columns list. Used to hide ordering columns that have been dropped or destructively mutated.""" - expr_builder = self.builder() - # Need to rename column as caller might be creating a new row with the same name but different values. - # Can avoid this if don't allow callers to determine ids and instead generate unique ones in this class. - new_name = bigframes.core.guid.generate_guid(prefix="bigframes_hidden_") - expr_builder.hidden_ordering_columns = [ - *self._hidden_ordering_columns, - self._get_ibis_column(column_id).name(new_name), - ] - expr_builder.ordering = self._ordering.with_column_remap({column_id: new_name}) - return expr_builder.build() + def reversed(self) -> ArrayValue: + return ArrayValue(nodes.ReversedNode(child=self.node)) def promote_offsets(self, col_id: str) -> ArrayValue: """ Convenience function to promote copy of column offsets to a value column. Can be used to reset index. """ - # Special case: offsets already exist - ordering = self._ordering - - if (not ordering.is_sequential) or (not ordering.total_order_col): - return self._project_offsets().promote_offsets(col_id) - expr_builder = self.builder() - expr_builder.columns = [ - self._get_any_column(ordering.total_order_col.column_id).name(col_id), - *self.columns, - ] - return expr_builder.build() + return ArrayValue(nodes.PromoteOffsetsNode(child=self.node, col_id=col_id)) def select_columns(self, column_ids: typing.Sequence[str]) -> ArrayValue: - return self._projection( - [self._get_ibis_column(col_id) for col_id in column_ids] + return ArrayValue( + nodes.SelectNode(child=self.node, column_ids=tuple(column_ids)) ) - def _projection(self, columns: Iterable[ibis_types.Value]) -> ArrayValue: - """Creates a new expression based on this expression with new columns.""" - # TODO(swast): We might want to do validation here that columns derive - # from the same table expression instead of (in addition to?) at - # construction time. - - expr = self - for ordering_column in set(self.column_ids).intersection( - [col_ref.column_id for col_ref in self._ordering.ordering_value_columns] - ): - # Need to hide ordering columns that are being dropped. Alternatively, could project offsets - expr = expr._hide_column(ordering_column) - builder = expr.builder() - builder.columns = list(columns) - new_expr = builder.build() - return new_expr - - def shape(self) -> typing.Tuple[int, int]: - """Returns dimensions as (length, width) tuple.""" - width = len(self.columns) - count_expr = self._to_ibis_expr("unordered").count() - sql = self._session.ibis_client.compile(count_expr) - - # Support in-memory engines for hermetic unit tests. 
- if not isinstance(sql, str): - length = self._session.ibis_client.execute(count_expr) - else: - row_iterator, _ = self._session._start_query( - sql=sql, - max_results=1, - ) - length = next(row_iterator)[0] - return (length, width) - def concat(self, other: typing.Sequence[ArrayValue]) -> ArrayValue: """Append together multiple ArrayValue objects.""" - if len(other) == 0: - return self - tables = [] - prefix_base = 10 - prefix_size = math.ceil(math.log(len(other) + 1, prefix_base)) - # Must normalize all ids to the same encoding size - max_encoding_size = max( - self._ordering.string_encoding.length, - *[expression._ordering.string_encoding.length for expression in other], - ) - for i, expr in enumerate([self, *other]): - ordering_prefix = str(i).zfill(prefix_size) - table = expr._to_ibis_expr( - ordering_mode="string_encoded", order_col_name=ORDER_ID_COLUMN - ) - # Rename the value columns based on horizontal offset before applying union. - table = table.select( - [ - table[col].name(f"column_{i}") - if col != ORDER_ID_COLUMN - else ( - ordering_prefix - + reencode_order_string( - table[ORDER_ID_COLUMN], max_encoding_size - ) - ).name(ORDER_ID_COLUMN) - for i, col in enumerate(table.columns) - ] - ) - tables.append(table) - combined_table = ibis.union(*tables) - ordering = ExpressionOrdering( - ordering_value_columns=[OrderingColumnReference(ORDER_ID_COLUMN)], - total_ordering_columns=frozenset([ORDER_ID_COLUMN]), - string_encoding=StringEncoding(True, prefix_size + max_encoding_size), - ) return ArrayValue( - self._session, - combined_table, - columns=[ - combined_table[col] - for col in combined_table.columns - if col != ORDER_ID_COLUMN - ], - hidden_ordering_columns=[combined_table[ORDER_ID_COLUMN]], - ordering=ordering, + nodes.ConcatNode(children=tuple([self.node, *[val.node for val in other]])) ) def project_unary_op( self, column_name: str, op: ops.UnaryOp, output_name=None ) -> ArrayValue: """Creates a new expression based on this expression with unary operation applied to one column.""" - value = op._as_ibis(self._get_ibis_column(column_name)).name( - output_name or column_name + return ArrayValue( + nodes.ProjectUnaryOpNode( + child=self.node, input_id=column_name, op=op, output_id=output_name + ) ) - return self._set_or_replace_by_id(output_name or column_name, value) def project_binary_op( self, @@ -522,11 +234,15 @@ def project_binary_op( output_column_id: str, ) -> ArrayValue: """Creates a new expression based on this expression with binary operation applied to two columns.""" - value = op( - self._get_ibis_column(left_column_id), - self._get_ibis_column(right_column_id), - ).name(output_column_id) - return self._set_or_replace_by_id(output_column_id, value) + return ArrayValue( + nodes.ProjectBinaryOpNode( + child=self.node, + left_input_id=left_column_id, + right_input_id=right_column_id, + op=op, + output_id=output_column_id, + ) + ) def project_ternary_op( self, @@ -537,12 +253,16 @@ def project_ternary_op( output_column_id: str, ) -> ArrayValue: """Creates a new expression based on this expression with ternary operation applied to three columns.""" - value = op( - self._get_ibis_column(col_id_1), - self._get_ibis_column(col_id_2), - self._get_ibis_column(col_id_3), - ).name(output_column_id) - return self._set_or_replace_by_id(output_column_id, value) + return ArrayValue( + nodes.ProjectTernaryOpNode( + child=self.node, + input_id1=col_id_1, + input_id2=col_id_2, + input_id3=col_id_3, + op=op, + output_id=output_column_id, + ) + ) def aggregate( self, @@ -557,46 
+277,14 @@ def aggregate( by_column_id: column id of the aggregation key, this is preserved through the transform dropna: whether null keys should be dropped """ - table = self._to_ibis_expr("unordered") - stats = { - col_out: agg_op._as_ibis(table[col_in]) - for col_in, agg_op, col_out in aggregations - } - if by_column_ids: - result = table.group_by(by_column_ids).aggregate(**stats) - # Must have deterministic ordering, so order by the unique "by" column - ordering = ExpressionOrdering( - [ - OrderingColumnReference(column_id=column_id) - for column_id in by_column_ids - ], - total_ordering_columns=frozenset(by_column_ids), - ) - columns = tuple(result[key] for key in result.columns) - expr = ArrayValue(self._session, result, columns=columns, ordering=ordering) - if dropna: - for column_id in by_column_ids: - expr = expr._filter( - ops.notnull_op._as_ibis(expr._get_ibis_column(column_id)) - ) - # Can maybe remove this as Ordering id is redundant as by_column is unique after aggregation - return expr._project_offsets() - else: - aggregates = {**stats, ORDER_ID_COLUMN: ibis_types.literal(0)} - result = table.aggregate(**aggregates) - # Ordering is irrelevant for single-row output, but set ordering id regardless as other ops(join etc.) expect it. - ordering = ExpressionOrdering( - ordering_value_columns=[OrderingColumnReference(ORDER_ID_COLUMN)], - total_ordering_columns=frozenset([ORDER_ID_COLUMN]), - integer_encoding=IntegerEncoding(is_encoded=True, is_sequential=True), - ) - return ArrayValue( - self._session, - result, - columns=[result[col_id] for col_id in [*stats.keys()]], - hidden_ordering_columns=[result[ORDER_ID_COLUMN]], - ordering=ordering, + return ArrayValue( + nodes.AggregateNode( + child=self.node, + aggregations=tuple(aggregations), + by_column_ids=tuple(by_column_ids), + dropna=dropna, ) + ) def corr_aggregate( self, corr_aggregations: typing.Sequence[typing.Tuple[str, str, str]] @@ -607,25 +295,8 @@ def corr_aggregate( Arguments: corr_aggregations: left_column_id, right_column_id, output_column_id tuples """ - table = self._to_ibis_expr("unordered") - stats = { - col_out: table[col_left].corr(table[col_right], how="pop") - for col_left, col_right, col_out in corr_aggregations - } - aggregates = {**stats, ORDER_ID_COLUMN: ibis_types.literal(0)} - result = table.aggregate(**aggregates) - # Ordering is irrelevant for single-row output, but set ordering id regardless as other ops(join etc.) expect it. 
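# --- Illustrative sketch (annotation, not committed code): the rewritten `aggregate` above only
# records an AggregateNode; the grouping semantics described in its docstring are unchanged. A call
# like aggregate([("x", sum_op, "x_sum")], by_column_ids=("k",), dropna=True) keeps the key column,
# applies the aggregate per group, and drops null keys, roughly the same shape as this plain-pandas
# equivalent (illustrative only, not the bigframes execution path):

import pandas as pd

df = pd.DataFrame({"k": ["a", "a", None, "b"], "x": [1, 2, 3, 4]})
out = (
    df.dropna(subset=["k"])           # dropna=True: rows with null grouping keys are removed
    .groupby("k", as_index=False)     # by_column_ids are preserved through the transform
    .agg(x_sum=("x", "sum"))
)
print(out)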
- ordering = ExpressionOrdering( - ordering_value_columns=[OrderingColumnReference(ORDER_ID_COLUMN)], - total_ordering_columns=frozenset([ORDER_ID_COLUMN]), - integer_encoding=IntegerEncoding(is_encoded=True, is_sequential=True), - ) return ArrayValue( - self._session, - result, - columns=[result[col_id] for col_id in [*stats.keys()]], - hidden_ordering_columns=[result[ORDER_ID_COLUMN]], - ordering=ordering, + nodes.CorrNode(child=self.node, corr_aggregations=tuple(corr_aggregations)) ) def project_window_op( @@ -647,231 +318,17 @@ def project_window_op( never_skip_nulls: will disable null skipping for operators that would otherwise do so skip_reproject_unsafe: skips the reprojection step, can be used when performing many non-dependent window operations, user responsible for not nesting window expressions, or using outputs as join, filter or aggregation keys before a reprojection """ - column = typing.cast(ibis_types.Column, self._get_ibis_column(column_name)) - window = self._ibis_window_from_spec(window_spec, allow_ties=op.handles_ties) - - window_op = op._as_ibis(column, window) - - clauses = [] - if op.skips_nulls and not never_skip_nulls: - clauses.append((column.isnull(), ibis.NA)) - if window_spec.min_periods: - if op.skips_nulls: - # Most operations do not count NULL values towards min_periods - observation_count = agg_ops.count_op._as_ibis(column, window) - else: - # Operations like count treat even NULLs as valid observations for the sake of min_periods - # notnull is just used to convert null values to non-null (FALSE) values to be counted - denulled_value = typing.cast(ibis_types.BooleanColumn, column.notnull()) - observation_count = agg_ops.count_op._as_ibis(denulled_value, window) - clauses.append( - ( - observation_count < ibis_types.literal(window_spec.min_periods), - ibis.NA, - ) - ) - if clauses: - case_statement = ibis.case() - for clause in clauses: - case_statement = case_statement.when(clause[0], clause[1]) - case_statement = case_statement.else_(window_op).end() - window_op = case_statement - - result = self._set_or_replace_by_id(output_name or column_name, window_op) - # TODO(tbergeron): Automatically track analytic expression usage and defer reprojection until required for valid query generation. - return result._reproject_to_table() if not skip_reproject_unsafe else result - - def to_sql( - self, - offset_column: typing.Optional[str] = None, - col_id_overrides: typing.Mapping[str, str] = {}, - sorted: bool = False, - ) -> str: - offsets_id = offset_column or ORDER_ID_COLUMN - - sql = self._session.ibis_client.compile( - self._to_ibis_expr( - ordering_mode="offset_col" - if (offset_column or sorted) - else "unordered", - order_col_name=offsets_id, - col_id_overrides=col_id_overrides, - ) - ) - if sorted: - sql = textwrap.dedent( - f""" - SELECT * EXCEPT (`{offsets_id}`) - FROM ({sql}) - ORDER BY `{offsets_id}` - """ - ) - return typing.cast(str, sql) - - def _to_ibis_expr( - self, - ordering_mode: Literal["string_encoded", "offset_col", "unordered"], - order_col_name: Optional[str] = ORDER_ID_COLUMN, - expose_hidden_cols: bool = False, - fraction: Optional[float] = None, - col_id_overrides: typing.Mapping[str, str] = {}, - ): - """ - Creates an Ibis table expression representing the DataFrame. - - ArrayValue objects are sorted, so the following options are available - to reflect this in the ibis expression. - - * "offset_col": Zero-based offsets are generated as a column, this will - not sort the rows however. 
- * "string_encoded": An ordered string column is provided in output table. - * "unordered": No ordering information will be provided in output. Only - value columns are projected. - - For offset or ordered column, order_col_name can be used to assign the - output label for the ordering column. If none is specified, the default - column name will be 'bigframes_ordering_id' - - Args: - ordering_mode: - How to construct the Ibis expression from the ArrayValue. See - above for details. - order_col_name: - If the ordering mode outputs a single ordering or offsets - column, use this as the column name. - expose_hidden_cols: - If True, include the hidden ordering columns in the results. - Only compatible with `order_by` and `unordered` - ``ordering_mode``. - col_id_overrides: - overrides the column ids for the result - Returns: - An ibis expression representing the data help by the ArrayValue object. - """ - assert ordering_mode in ( - "string_encoded", - "offset_col", - "unordered", - ) - if expose_hidden_cols and ordering_mode in ("ordered_col", "offset_col"): - raise ValueError( - f"Cannot expose hidden ordering columns with ordering_mode {ordering_mode}" + return ArrayValue( + nodes.WindowOpNode( + child=self.node, + column_name=column_name, + op=op, + window_spec=window_spec, + output_name=output_name, + never_skip_nulls=never_skip_nulls, + skip_reproject_unsafe=skip_reproject_unsafe, ) - - columns = list(self._columns) - columns_to_drop: list[ - str - ] = [] # Ordering/Filtering columns that will be dropped at end - - if self._reduced_predicate is not None: - columns.append(self._reduced_predicate) - # Usually drop predicate as it is will be all TRUE after filtering - if not expose_hidden_cols: - columns_to_drop.append(self._reduced_predicate.get_name()) - - order_columns = self._create_order_columns( - ordering_mode, order_col_name, expose_hidden_cols ) - columns.extend(order_columns) - - # Special case for empty tables, since we can't create an empty - # projection. - if not columns: - return ibis.memtable([]) - - # Make sure all dtypes are the "canonical" ones for BigFrames. This is - # important for operations like UNION where the schema must match. 
- table = self._table.select( - bigframes.dtypes.ibis_value_to_canonical_type(column) for column in columns - ) - base_table = table - if self._reduced_predicate is not None: - table = table.filter(base_table[PREDICATE_COLUMN]) - table = table.drop(*columns_to_drop) - if col_id_overrides: - table = table.relabel(col_id_overrides) - if fraction is not None: - table = table.filter(ibis.random() < ibis.literal(fraction)) - return table - - def _create_order_columns( - self, - ordering_mode: str, - order_col_name: Optional[str], - expose_hidden_cols: bool, - ) -> typing.Sequence[ibis_types.Value]: - # Generate offsets if current ordering id semantics are not sufficiently strict - if ordering_mode == "offset_col": - return (self._create_offset_column().name(order_col_name),) - elif ordering_mode == "string_encoded": - return (self._create_string_ordering_column().name(order_col_name),) - elif expose_hidden_cols: - return self._hidden_ordering_columns - return () - - def _create_offset_column(self) -> ibis_types.IntegerColumn: - if self._ordering.total_order_col and self._ordering.is_sequential: - offsets = self._get_any_column(self._ordering.total_order_col.column_id) - return typing.cast(ibis_types.IntegerColumn, offsets) - else: - window = ibis.window(order_by=self._ibis_order) - if self._predicates: - window = window.group_by(self._reduced_predicate) - offsets = ibis.row_number().over(window) - return typing.cast(ibis_types.IntegerColumn, offsets) - - def _create_string_ordering_column(self) -> ibis_types.StringColumn: - if self._ordering.total_order_col and self._ordering.is_string_encoded: - string_order_ids = self._get_any_column( - self._ordering.total_order_col.column_id - ) - return typing.cast(ibis_types.StringColumn, string_order_ids) - if ( - self._ordering.total_order_col - and self._ordering.integer_encoding.is_encoded - ): - # Special case: non-negative integer ordering id can be converted directly to string without regenerating row numbers - int_values = self._get_any_column(self._ordering.total_order_col.column_id) - return encode_order_string( - typing.cast(ibis_types.IntegerColumn, int_values), - ) - else: - # Have to build string from scratch - window = ibis.window(order_by=self._ibis_order) - if self._predicates: - window = window.group_by(self._reduced_predicate) - row_nums = typing.cast( - ibis_types.IntegerColumn, ibis.row_number().over(window) - ) - return encode_order_string(row_nums) - - def start_query( - self, - job_config: Optional[bigquery.job.QueryJobConfig] = None, - max_results: Optional[int] = None, - *, - sorted: bool = True, - ) -> Tuple[bigquery.table.RowIterator, bigquery.QueryJob]: - """Execute a query and return metadata about the results.""" - # TODO(swast): Cache the job ID so we can look it up again if they ask - # for the results? We'd need a way to invalidate the cache if DataFrame - # becomes mutable, though. Or move this method to the immutable - # expression class. - # TODO(swast): We might want to move this method to Session and/or - # provide our own minimal metadata class. Tight coupling to the - # BigQuery client library isn't ideal, especially if we want to support - # a LocalSession for unit testing. - # TODO(swast): Add a timeout here? If the query is taking a long time, - # maybe we just print the job metadata that we have so far? 
- sql = self.to_sql(sorted=True) # type:ignore - return self._session._start_query( - sql=sql, - job_config=job_config, - max_results=max_results, - ) - - def _get_table_size(self, destination_table): - return self._session._get_table_size(destination_table) def _reproject_to_table(self) -> ArrayValue: """ @@ -881,74 +338,25 @@ def _reproject_to_table(self) -> ArrayValue: some operations such as window operations that cannot be used recursively in projections. """ - table = self._to_ibis_expr( - "unordered", - expose_hidden_cols=True, - ) - columns = [table[column_name] for column_name in self._column_names] - ordering_col_ids = [ - ref.column_id for ref in self._ordering.all_ordering_columns - ] - hidden_ordering_columns = [ - table[column_name] - for column_name in self._hidden_ordering_column_names - if column_name in ordering_col_ids - ] return ArrayValue( - self._session, - table, - columns=columns, - hidden_ordering_columns=hidden_ordering_columns, - ordering=self._ordering, - ) - - def _ibis_window_from_spec(self, window_spec: WindowSpec, allow_ties: bool = False): - group_by: typing.List[ibis_types.Value] = ( - [ - typing.cast( - ibis_types.Column, _as_identity(self._get_ibis_column(column)) - ) - for column in window_spec.grouping_keys - ] - if window_spec.grouping_keys - else [] - ) - if self._reduced_predicate is not None: - group_by.append(self._reduced_predicate) - if window_spec.ordering: - order_by = _convert_ordering_to_table_values( - {**self._column_names, **self._hidden_ordering_column_names}, - window_spec.ordering, + nodes.ReprojectOpNode( + child=self.node, ) - if not allow_ties: - # Most operator need an unambiguous ordering, so the table's total ordering is appended - order_by = tuple([*order_by, *self._ibis_order]) - elif (window_spec.following is not None) or (window_spec.preceding is not None): - # If window spec has following or preceding bounds, we need to apply an unambiguous ordering. - order_by = tuple(self._ibis_order) - else: - # Unbound grouping window. Suitable for aggregations but not for analytic function application. - order_by = None - return ibis.window( - preceding=window_spec.preceding, - following=window_spec.following, - order_by=order_by, - group_by=group_by, ) def unpivot( self, row_labels: typing.Sequence[typing.Hashable], unpivot_columns: typing.Sequence[ - typing.Tuple[str, typing.Sequence[typing.Optional[str]]] + typing.Tuple[str, typing.Tuple[typing.Optional[str], ...]] ], *, passthrough_columns: typing.Sequence[str] = (), index_col_ids: typing.Sequence[str] = ["index"], dtype: typing.Union[ - bigframes.dtypes.Dtype, typing.Sequence[bigframes.dtypes.Dtype] + bigframes.dtypes.Dtype, typing.Tuple[bigframes.dtypes.Dtype, ...] ] = pandas.Float64Dtype(), - how="left", + how: typing.Literal["left", "right"] = "left", ) -> ArrayValue: """ Unpivot ArrayValue columns. 
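# --- Illustrative sketch (annotation, not committed code): `unpivot` reshapes selected value
# columns into rows keyed by the supplied row labels, the same wide-to-long reshaping that the
# newly added `DataFrame.melt` (see the CHANGELOG hunk at the top of this diff) exposes at the
# DataFrame level. A plain pandas analogue of that semantics (argument names below are pandas',
# not ArrayValue's):

import pandas as pd

wide = pd.DataFrame({"id": [1, 2], "q1": [10, 30], "q2": [20, 40]})
long = wide.melt(
    id_vars=["id"],            # passthrough/index-like columns kept as-is
    value_vars=["q1", "q2"],   # columns turned into rows
    var_name="quarter",
    value_name="sales",
)
print(long)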
@@ -963,133 +371,23 @@ def unpivot( Returns: ArrayValue: The unpivoted ArrayValue """ - if how not in ("left", "right"): - raise ValueError("'how' must be 'left' or 'right'") - table = self._to_ibis_expr("unordered", expose_hidden_cols=True) - row_n = len(row_labels) - hidden_col_ids = self._hidden_ordering_column_names.keys() - if not all( - len(source_columns) == row_n for _, source_columns in unpivot_columns - ): - raise ValueError("Columns and row labels must all be same length.") - - unpivot_offset_id = bigframes.core.guid.generate_guid("unpivot_offsets_") - unpivot_table = table.cross_join( - ibis.memtable({unpivot_offset_id: range(row_n)}) - ) - # Use ibis memtable to infer type of rowlabels (if possible) - # TODO: Allow caller to specify dtype - if isinstance(row_labels[0], tuple): - labels_table = ibis.memtable(row_labels) - labels_ibis_types = [ - labels_table[col].type() for col in labels_table.columns - ] - else: - labels_ibis_types = [ibis.memtable({"col": row_labels})["col"].type()] - labels_dtypes = [ - bigframes.dtypes.ibis_dtype_to_bigframes_dtype(ibis_type) - for ibis_type in labels_ibis_types - ] - - label_columns = [] - for label_part, (col_id, label_dtype) in enumerate( - zip(index_col_ids, labels_dtypes) - ): - # interpret as tuples even if it wasn't originally so can apply same logic for multi-column labels - labels_as_tuples = [ - label if isinstance(label, tuple) else (label,) for label in row_labels - ] - cases = [ - ( - i, - bigframes.dtypes.literal_to_ibis_scalar( - label_tuple[label_part], # type:ignore - force_dtype=label_dtype, # type:ignore - ), - ) - for i, label_tuple in enumerate(labels_as_tuples) - ] - labels_value = ( - typing.cast(ibis_types.IntegerColumn, unpivot_table[unpivot_offset_id]) - .cases(cases, default=None) # type:ignore - .name(col_id) - ) - label_columns.append(labels_value) - - unpivot_values = [] - for j in range(len(unpivot_columns)): - col_dtype = dtype[j] if utils.is_list_like(dtype) else dtype - result_col, source_cols = unpivot_columns[j] - null_value = bigframes.dtypes.literal_to_ibis_scalar( - None, force_dtype=col_dtype - ) - ibis_values = [ - ops.AsTypeOp(col_dtype)._as_ibis(unpivot_table[col]) - if col is not None - else null_value - for col in source_cols - ] - cases = [(i, ibis_values[i]) for i in range(len(ibis_values))] - unpivot_value = typing.cast( - ibis_types.IntegerColumn, unpivot_table[unpivot_offset_id] - ).cases( - cases, default=null_value # type:ignore - ) - unpivot_values.append(unpivot_value.name(result_col)) - - unpivot_table = unpivot_table.select( - passthrough_columns, - *label_columns, - *unpivot_values, - *hidden_col_ids, - unpivot_offset_id, - ) - - # Extend the original ordering using unpivot_offset_id - old_ordering = self._ordering - if how == "left": - new_ordering = ExpressionOrdering( - ordering_value_columns=[ - *old_ordering.ordering_value_columns, - OrderingColumnReference(unpivot_offset_id), - ], - total_ordering_columns=frozenset( - [*old_ordering.total_ordering_columns, unpivot_offset_id] - ), - ) - else: # how=="right" - new_ordering = ExpressionOrdering( - ordering_value_columns=[ - OrderingColumnReference(unpivot_offset_id), - *old_ordering.ordering_value_columns, - ], - total_ordering_columns=frozenset( - [*old_ordering.total_ordering_columns, unpivot_offset_id] - ), - ) - value_columns = [ - unpivot_table[value_col_id] for value_col_id, _ in unpivot_columns - ] - passthrough_values = [unpivot_table[col] for col in passthrough_columns] - hidden_ordering_columns = [ - 
unpivot_table[unpivot_offset_id], - *[unpivot_table[hidden_col] for hidden_col in hidden_col_ids], - ] return ArrayValue( - session=self._session, - table=unpivot_table, - columns=[ - *[unpivot_table[col_id] for col_id in index_col_ids], - *value_columns, - *passthrough_values, - ], - hidden_ordering_columns=hidden_ordering_columns, - ordering=new_ordering, + nodes.UnpivotNode( + child=self.node, + row_labels=tuple(row_labels), + unpivot_columns=tuple(unpivot_columns), + passthrough_columns=tuple(passthrough_columns), + index_col_ids=tuple(index_col_ids), + dtype=dtype, + how=how, + ) ) def assign(self, source_id: str, destination_id: str) -> ArrayValue: - return self._set_or_replace_by_id( - destination_id, self._get_ibis_column(source_id) + return ArrayValue( + nodes.AssignNode( + child=self.node, source_id=source_id, destination_id=destination_id + ) ) def assign_constant( @@ -1098,128 +396,41 @@ def assign_constant( value: typing.Any, dtype: typing.Optional[bigframes.dtypes.Dtype], ) -> ArrayValue: - # TODO(b/281587571): Solve scalar constant aggregation problem w/Ibis. - ibis_value = bigframes.dtypes.literal_to_ibis_scalar(value, dtype) - if ibis_value is None: - raise NotImplementedError( - f"Type not supported as scalar value {type(value)}. {constants.FEEDBACK_LINK}" - ) - expr = self._set_or_replace_by_id(destination_id, ibis_value) - return expr._reproject_to_table() - - def _set_or_replace_by_id(self, id: str, new_value: ibis_types.Value) -> ArrayValue: - """Safely assign by id while maintaining ordering integrity.""" - # TODO: Split into explicit set and replace methods - ordering_col_ids = [ - col_ref.column_id for col_ref in self._ordering.ordering_value_columns - ] - if id in ordering_col_ids: - return self._hide_column(id)._set_or_replace_by_id(id, new_value) - - builder = self.builder() - if id in self.column_ids: - builder.columns = [ - val if (col_id != id) else new_value.name(id) - for col_id, val in zip(self.column_ids, self._columns) - ] - else: - builder.columns = [*self.columns, new_value.name(id)] - return builder.build() - - def cached(self, cluster_cols: typing.Sequence[str]) -> ArrayValue: - """Write the ArrayValue to a session table and create a new block object that references it.""" - ibis_expr = self._to_ibis_expr("unordered", expose_hidden_cols=True) - destination = self._session._ibis_to_session_table( - ibis_expr, cluster_cols=cluster_cols, api_name="cache" - ) - table_expression = self._session.ibis_client.table( - f"{destination.project}.{destination.dataset_id}.{destination.table_id}" - ) - new_columns = [table_expression[column] for column in self.column_ids] - new_hidden_columns = [ - table_expression[column] for column in self._hidden_ordering_column_names - ] return ArrayValue( - self._session, - table_expression, - columns=new_columns, - hidden_ordering_columns=new_hidden_columns, - ordering=self._ordering, + nodes.AssignConstantNode( + child=self.node, destination_id=destination_id, value=value, dtype=dtype + ) ) - -class ArrayValueBuilder: - """Mutable expression class. - Use ArrayValue.builder() to create from a ArrayValue object. 
- """ - - def __init__( + def join( self, - session: Session, - table: ibis_types.Table, - ordering: ExpressionOrdering, - columns: Collection[ibis_types.Value] = (), - hidden_ordering_columns: Collection[ibis_types.Value] = (), - predicates: Optional[Collection[ibis_types.BooleanValue]] = None, + self_column_ids: typing.Sequence[str], + other: ArrayValue, + other_column_ids: typing.Sequence[str], + *, + how: Literal[ + "inner", + "left", + "outer", + "right", + ], + allow_row_identity_join: bool = True, ): - self.session = session - self.table = table - self.columns = list(columns) - self.hidden_ordering_columns = list(hidden_ordering_columns) - self.ordering = ordering - self.predicates = list(predicates) if predicates is not None else None - - def build(self) -> ArrayValue: return ArrayValue( - session=self.session, - table=self.table, - columns=self.columns, - hidden_ordering_columns=self.hidden_ordering_columns, - ordering=self.ordering, - predicates=self.predicates, - ) - - -def _reduce_predicate_list( - predicate_list: typing.Collection[ibis_types.BooleanValue], -) -> ibis_types.BooleanValue: - """Converts a list of predicates BooleanValues into a single BooleanValue.""" - if len(predicate_list) == 0: - raise ValueError("Cannot reduce empty list of predicates") - if len(predicate_list) == 1: - (item,) = predicate_list - return item - return functools.reduce(lambda acc, pred: acc.__and__(pred), predicate_list) - - -def _convert_ordering_to_table_values( - value_lookup: typing.Mapping[str, ibis_types.Value], - ordering_columns: typing.Sequence[OrderingColumnReference], -) -> typing.Sequence[ibis_types.Value]: - column_refs = ordering_columns - ordering_values = [] - for ordering_col in column_refs: - column = typing.cast(ibis_types.Column, value_lookup[ordering_col.column_id]) - ordering_value = ( - ibis.asc(column) - if ordering_col.direction.is_ascending - else ibis.desc(column) + nodes.JoinNode( + left_child=self.node, + right_child=other.node, + left_column_ids=tuple(self_column_ids), + right_column_ids=tuple(other_column_ids), + how=how, + allow_row_identity_join=allow_row_identity_join, + ) ) - # Bigquery SQL considers NULLS to be "smallest" values, but we need to override in these cases. - if (not ordering_col.na_last) and (not ordering_col.direction.is_ascending): - # Force nulls to be first - is_null_val = typing.cast(ibis_types.Column, column.isnull()) - ordering_values.append(ibis.desc(is_null_val)) - elif (ordering_col.na_last) and (ordering_col.direction.is_ascending): - # Force nulls to be last - is_null_val = typing.cast(ibis_types.Column, column.isnull()) - ordering_values.append(ibis.asc(is_null_val)) - ordering_values.append(ordering_value) - return ordering_values + def _uniform_sampling(self, fraction: float) -> ArrayValue: + """Sampling the table on given fraction. -def _as_identity(value: ibis_types.Value): - # Some types need to be converted to string to enable groupby - if value.type().is_float64() or value.type().is_geospatial(): - return value.cast(ibis_dtypes.str) - return value + .. warning:: + The row numbers of result is non-deterministic, avoid to use. 
+ """ + return ArrayValue(nodes.RandomSampleNode(self.node, fraction)) diff --git a/bigframes/core/block_transforms.py b/bigframes/core/block_transforms.py index b0f05f4798..3706bf1681 100644 --- a/bigframes/core/block_transforms.py +++ b/bigframes/core/block_transforms.py @@ -21,6 +21,7 @@ import bigframes.core as core import bigframes.core.blocks as blocks import bigframes.core.ordering as ordering +import bigframes.core.window_spec as windows import bigframes.operations as ops import bigframes.operations.aggregations as agg_ops @@ -68,21 +69,21 @@ def indicate_duplicates( if keep == "first": # Count how many copies occur up to current copy of value # Discard this value if there are copies BEFORE - window_spec = core.WindowSpec( + window_spec = windows.WindowSpec( grouping_keys=tuple(columns), following=0, ) elif keep == "last": # Count how many copies occur up to current copy of values # Discard this value if there are copies AFTER - window_spec = core.WindowSpec( + window_spec = windows.WindowSpec( grouping_keys=tuple(columns), preceding=0, ) else: # keep == False # Count how many copies of the value occur in entire series. # Discard this value if there are copies ANYWHERE - window_spec = core.WindowSpec(grouping_keys=tuple(columns)) + window_spec = windows.WindowSpec(grouping_keys=tuple(columns)) block, dummy = block.create_constant(1) block, val_count_col_id = block.apply_window_op( dummy, @@ -131,7 +132,7 @@ def value_counts( ) count_id = agg_ids[0] if normalize: - unbound_window = core.WindowSpec() + unbound_window = windows.WindowSpec() block, total_count_id = block.apply_window_op( count_id, agg_ops.sum_op, unbound_window ) @@ -153,7 +154,7 @@ def value_counts( def pct_change(block: blocks.Block, periods: int = 1) -> blocks.Block: column_labels = block.column_labels - window_spec = core.WindowSpec( + window_spec = windows.WindowSpec( preceding=periods if periods > 0 else None, following=-periods if periods < 0 else None, ) @@ -195,7 +196,7 @@ def rank( ops.isnull_op, ) nullity_col_ids.append(nullity_col_id) - window = core.WindowSpec( + window = windows.WindowSpec( # BigQuery has syntax to reorder nulls with "NULLS FIRST/LAST", but that is unavailable through ibis presently, so must order on a separate nullity expression first. ordering=( ordering.OrderingColumnReference( @@ -229,7 +230,7 @@ def rank( block, result_id = block.apply_window_op( rownum_col_ids[i], agg_op, - window_spec=core.WindowSpec(grouping_keys=[columns[i]]), + window_spec=windows.WindowSpec(grouping_keys=(columns[i],)), skip_reproject_unsafe=(i < (len(columns) - 1)), ) post_agg_rownum_col_ids.append(result_id) @@ -311,7 +312,7 @@ def nsmallest( block, counter = block.apply_window_op( column_ids[0], agg_ops.rank_op, - window_spec=core.WindowSpec(ordering=order_refs), + window_spec=windows.WindowSpec(ordering=tuple(order_refs)), ) block, condition = block.apply_unary_op( counter, ops.partial_right(ops.le_op, n) @@ -343,7 +344,7 @@ def nlargest( block, counter = block.apply_window_op( column_ids[0], agg_ops.rank_op, - window_spec=core.WindowSpec(ordering=order_refs), + window_spec=windows.WindowSpec(ordering=tuple(order_refs)), ) block, condition = block.apply_unary_op( counter, ops.partial_right(ops.le_op, n) @@ -440,14 +441,14 @@ def _mean_delta_to_power( grouping_column_ids: typing.Sequence[str], ) -> typing.Tuple[blocks.Block, typing.Sequence[str]]: """Calculate (x-mean(x))^n. 
Useful for calculating moment statistics such as skew and kurtosis.""" - window = core.WindowSpec(grouping_keys=grouping_column_ids) + window = windows.WindowSpec(grouping_keys=tuple(grouping_column_ids)) block, mean_ids = block.multi_apply_window_op(column_ids, agg_ops.mean_op, window) delta_ids = [] cube_op = ops.partial_right(ops.pow_op, n_power) for val_id, mean_val_id in zip(column_ids, mean_ids): block, delta_id = block.apply_binary_op(val_id, mean_val_id, ops.sub_op) block, delta_power_id = block.apply_unary_op(delta_id, cube_op) - block = block.drop_columns(delta_id) + block = block.drop_columns([delta_id]) delta_ids.append(delta_power_id) return block, delta_ids @@ -645,7 +646,7 @@ def _idx_extrema( for idx_col in original_block.index_columns ], ] - window_spec = core.WindowSpec(ordering=order_refs) + window_spec = windows.WindowSpec(ordering=tuple(order_refs)) idx_col = original_block.index_columns[0] block, result_col = block.apply_window_op( idx_col, agg_ops.first_op, window_spec diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 046d2b3a44..635e7db865 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -28,17 +28,13 @@ from typing import Iterable, List, Optional, Sequence, Tuple import warnings -import geopandas as gpd # type: ignore import google.cloud.bigquery as bigquery -import numpy import pandas as pd -import pyarrow as pa # type: ignore import bigframes.constants as constants import bigframes.core as core import bigframes.core.guid as guid import bigframes.core.indexes as indexes -import bigframes.core.joins as joins import bigframes.core.joins.name_resolution as join_names import bigframes.core.ordering as ordering import bigframes.core.utils @@ -46,6 +42,7 @@ import bigframes.dtypes import bigframes.operations as ops import bigframes.operations.aggregations as agg_ops +import bigframes.session._io.pandas import third_party.bigframes_vendored.pandas.io.common as vendored_pandas_io_common # Type constraint for wherever column labels are used @@ -69,6 +66,10 @@ _MONOTONIC_DECREASING = "monotonic_decreasing" +LevelType = typing.Union[str, int] +LevelsType = typing.Union[LevelType, typing.Sequence[LevelType]] + + class BlockHolder(typing.Protocol): """Interface for mutable objects with state represented by a block value object.""" @@ -372,34 +373,11 @@ def reorder_levels(self, ids: typing.Sequence[str]): level_names = [self.col_id_to_index_name[index_id] for index_id in ids] return Block(self.expr, ids, self.column_labels, level_names) - @classmethod - def _to_dataframe( - cls, result, schema: typing.Mapping[str, bigframes.dtypes.Dtype] - ) -> pd.DataFrame: + def _to_dataframe(self, result) -> pd.DataFrame: """Convert BigQuery data to pandas DataFrame with specific dtypes.""" - dtypes = bigframes.dtypes.to_pandas_dtypes_overrides(result.schema) - df = result.to_dataframe( - dtypes=dtypes, - bool_dtype=pd.BooleanDtype(), - int_dtype=pd.Int64Dtype(), - float_dtype=pd.Float64Dtype(), - string_dtype=pd.StringDtype(storage="pyarrow"), - date_dtype=pd.ArrowDtype(pa.date32()), - datetime_dtype=pd.ArrowDtype(pa.timestamp("us")), - time_dtype=pd.ArrowDtype(pa.time64("us")), - timestamp_dtype=pd.ArrowDtype(pa.timestamp("us", tz="UTC")), - ) - - # Convert Geography column from StringDType to GeometryDtype. 
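The `keep="first"` / `"last"` / `False` branches above translate duplicate detection into window specs that count copies before, after, or anywhere. As a point of reference, the same semantics in plain pandas (an illustration of the logic only, not the BigQuery DataFrames implementation):

```python
import pandas as pd

df = pd.DataFrame({"key": ["a", "b", "a", "a", "b"]})

# keep="first": a row is a duplicate if an identical key appeared before it.
dup_first = df.groupby("key").cumcount() > 0

# keep="last": a row is a duplicate if an identical key appears after it.
dup_last = df[::-1].groupby("key").cumcount()[::-1] > 0

# keep=False: a row is a duplicate if its key occurs more than once anywhere.
dup_any = df.groupby("key")["key"].transform("size") > 1

print(pd.DataFrame({"first": dup_first, "last": dup_last, "any": dup_any}))
```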
- for column_name, dtype in schema.items(): - if dtype == gpd.array.GeometryDtype(): - df[column_name] = gpd.GeoSeries.from_wkt( - # https://github.com/geopandas/geopandas/issues/1879 - df[column_name].replace({numpy.nan: None}), - # BigQuery geography type is based on the WGS84 reference ellipsoid. - crs="EPSG:4326", - ) - return df + dtypes = dict(zip(self.index_columns, self.index_dtypes)) + dtypes.update(zip(self.value_columns, self.dtypes)) + return self._expr.session._rows_to_dataframe(result, dtypes) def to_pandas( self, @@ -437,6 +415,30 @@ def to_pandas( ) return df, query_job + def to_pandas_batches(self): + """Download results one message at a time.""" + dtypes = dict(zip(self.index_columns, self.index_dtypes)) + dtypes.update(zip(self.value_columns, self.dtypes)) + results_iterator, _ = self._expr.start_query() + for arrow_table in results_iterator.to_arrow_iterable( + bqstorage_client=self._expr.session.bqstoragereadclient + ): + df = bigframes.session._io.pandas.arrow_to_pandas(arrow_table, dtypes) + self._copy_index_to_pandas(df) + yield df + + def _copy_index_to_pandas(self, df: pd.DataFrame): + """Set the index on pandas DataFrame to match this block. + + Warning: This method modifies ``df`` inplace. + """ + if self.index_columns: + df.set_index(list(self.index_columns), inplace=True) + # Pandas names is annotated as list[str] rather than the more + # general Sequence[Label] that BigQuery DataFrames has. + # See: https://github.com/pandas-dev/pandas-stubs/issues/804 + df.index.names = self.index.names # type: ignore + def _compute_and_count( self, value_keys: Optional[Iterable[str]] = None, @@ -451,7 +453,9 @@ def _compute_and_count( results_iterator, query_job = expr.start_query(max_results=max_results) - table_size = expr._get_table_size(query_job.destination) / _BYTES_TO_MEGABYTES + table_size = ( + expr.session._get_table_size(query_job.destination) / _BYTES_TO_MEGABYTES + ) fraction = ( max_download_size / table_size if (max_download_size is not None) and (table_size != 0) @@ -480,8 +484,7 @@ def _compute_and_count( if sampling_method == _HEAD: total_rows = int(results_iterator.total_rows * fraction) results_iterator.max_results = total_rows - schema = dict(zip(self.value_columns, self.dtypes)) - df = self._to_dataframe(results_iterator, schema) + df = self._to_dataframe(results_iterator) if self.index_columns: df.set_index(list(self.index_columns), inplace=True) @@ -510,12 +513,8 @@ def _compute_and_count( ) else: total_rows = results_iterator.total_rows - schema = dict(zip(self.value_columns, self.dtypes)) - df = self._to_dataframe(results_iterator, schema) - - if self.index_columns: - df.set_index(list(self.index_columns), inplace=True) - df.index.names = self.index.names # type: ignore + df = self._to_dataframe(results_iterator) + self._copy_index_to_pandas(df) return df, total_rows, query_job @@ -821,7 +820,9 @@ def aggregate_all_and_stack( axis: int | str = 0, value_col_id: str = "values", dropna: bool = True, - dtype=pd.Float64Dtype(), + dtype: typing.Union[ + bigframes.dtypes.Dtype, typing.Tuple[bigframes.dtypes.Dtype, ...] 
+ ] = pd.Float64Dtype(), ) -> Block: axis_n = utils.get_axis_number(axis) if axis_n == 0: @@ -831,7 +832,7 @@ def aggregate_all_and_stack( result_expr = self.expr.aggregate(aggregations, dropna=dropna).unpivot( row_labels=self.column_labels.to_list(), index_col_ids=["index"], - unpivot_columns=[(value_col_id, self.value_columns)], + unpivot_columns=tuple([(value_col_id, tuple(self.value_columns))]), dtype=dtype, ) return Block(result_expr, index_columns=["index"], column_labels=[None]) @@ -843,7 +844,7 @@ def aggregate_all_and_stack( stacked_expr = expr_with_offsets.unpivot( row_labels=self.column_labels.to_list(), index_col_ids=[guid.generate_guid()], - unpivot_columns=[(value_col_id, self.value_columns)], + unpivot_columns=[(value_col_id, tuple(self.value_columns))], passthrough_columns=[*self.index_columns, offset_col], dtype=dtype, ) @@ -1031,13 +1032,13 @@ def summarize( for col_id in column_ids ] columns = [ - (col_id, [f"{col_id}-{stat.name}" for stat in stats]) + (col_id, tuple(f"{col_id}-{stat.name}" for stat in stats)) for col_id in column_ids ] expr = self.expr.aggregate(aggregations).unpivot( labels, - unpivot_columns=columns, - index_col_ids=[label_col_id], + unpivot_columns=tuple(columns), + index_col_ids=tuple([label_col_id]), ) labels = self._get_labels_for_columns(column_ids) return Block(expr, column_labels=labels, index_columns=[label_col_id]) @@ -1344,7 +1345,7 @@ def stack(self, how="left", levels: int = 1): passthrough_columns=self.index_columns, unpivot_columns=unpivot_columns, index_col_ids=added_index_columns, - dtype=dtypes, + dtype=tuple(dtypes), how=how, ) new_index_level_names = self.column_labels.names[-levels:] @@ -1355,13 +1356,50 @@ def stack(self, how="left", levels: int = 1): index_columns = [*added_index_columns, *self.index_columns] index_labels = [*new_index_level_names, *self._index_labels] - block = Block( + return Block( unpivot_expr, index_columns=index_columns, column_labels=result_index, index_labels=index_labels, ) - return block + + def melt( + self, + id_vars=typing.Sequence[str], + value_vars=typing.Sequence[str], + var_names=typing.Sequence[typing.Hashable], + value_name: typing.Hashable = "value", + ): + # TODO: Implement col_level and ignore_index + unpivot_col_id = guid.generate_guid() + var_col_ids = tuple([guid.generate_guid() for _ in var_names]) + # single unpivot col + unpivot_col = (unpivot_col_id, tuple(value_vars)) + value_labels = [self.col_id_to_label[col_id] for col_id in value_vars] + id_labels = [self.col_id_to_label[col_id] for col_id in id_vars] + + dtype = self._expr.get_column_type(value_vars[0]) + + unpivot_expr = self._expr.unpivot( + row_labels=value_labels, + passthrough_columns=id_vars, + unpivot_columns=(unpivot_col,), + index_col_ids=var_col_ids, + dtype=dtype, + how="right", + ) + index_id = guid.generate_guid() + unpivot_expr = unpivot_expr.promote_offsets(index_id) + # Need to reorder to get id_vars before var_col and unpivot_col + unpivot_expr = unpivot_expr.select_columns( + [index_id, *id_vars, *var_col_ids, unpivot_col_id] + ) + + return Block( + unpivot_expr, + column_labels=[*id_labels, *var_names, value_name], + index_columns=[index_id], + ) def _create_stack_column( self, col_label: typing.Tuple, stack_labels: typing.Sequence[typing.Tuple] @@ -1384,7 +1422,7 @@ def _create_stack_column( dtype = self._column_type(input_id) input_columns.append(input_id) # Input column i is the first one that - return input_columns, dtype or pd.Float64Dtype() + return tuple(input_columns), dtype or pd.Float64Dtype() def 
_column_type(self, col_id: str) -> bigframes.dtypes.Dtype: col_offset = self.value_columns.index(col_id) @@ -1450,9 +1488,7 @@ def _get_unique_values( raise ValueError(f"Too many unique values: {pd_values}") if len(columns) > 1: - return pd.MultiIndex.from_frame( - pd_values.sort_values(by=list(pd_values.columns), na_position="first") - ) + return pd.MultiIndex.from_frame(pd_values) else: return pd.Index(pd_values.squeeze(axis=1).sort_values(na_position="first")) @@ -1501,8 +1537,7 @@ def merge( sort: bool, suffixes: tuple[str, str] = ("_x", "_y"), ) -> Block: - joined_expr = joins.join_by_column( - self.expr, + joined_expr = self.expr.join( left_join_ids, other.expr, right_join_ids, @@ -1638,6 +1673,24 @@ def cached(self) -> Block: index_labels=self.index_labels, ) + def resolve_index_level(self, level: LevelsType) -> typing.Sequence[str]: + if utils.is_list_like(level): + levels = list(level) + else: + levels = [level] + resolved_level_ids = [] + for level_ref in levels: + if isinstance(level_ref, int): + resolved_level_ids.append(self.index_columns[level_ref]) + elif isinstance(level_ref, typing.Hashable): + matching_ids = self.index_name_to_col_id.get(level_ref, []) + if len(matching_ids) != 1: + raise ValueError("level name cannot be found or is ambiguous") + resolved_level_ids.append(matching_ids[0]) + else: + raise ValueError(f"Unexpected level: {level_ref}") + return resolved_level_ids + def _is_monotonic( self, column_ids: typing.Union[str, Sequence[str]], increasing: bool ) -> bool: @@ -1694,7 +1747,7 @@ def _is_monotonic( return result -def block_from_local(data, session=None) -> Block: +def block_from_local(data) -> Block: pd_data = pd.DataFrame(data) columns = pd_data.columns @@ -1716,7 +1769,7 @@ def block_from_local(data, session=None) -> Block: ) index_ids = pd_data.columns[: len(index_labels)] - keys_expr = core.ArrayValue.mem_expr_from_pandas(pd_data, session) + keys_expr = core.ArrayValue.from_pandas(pd_data) return Block( keys_expr, column_labels=columns, diff --git a/bigframes/core/compile/__init__.py b/bigframes/core/compile/__init__.py new file mode 100644 index 0000000000..c86f4463dc --- /dev/null +++ b/bigframes/core/compile/__init__.py @@ -0,0 +1,21 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from bigframes.core.compile.compiled import CompiledArrayValue +from bigframes.core.compile.compiler import compile_node + +__all__ = [ + "compile_node", + "CompiledArrayValue", +] diff --git a/bigframes/core/compile/compiled.py b/bigframes/core/compile/compiled.py new file mode 100644 index 0000000000..1134f1aab0 --- /dev/null +++ b/bigframes/core/compile/compiled.py @@ -0,0 +1,1121 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import annotations + +import functools +import math +import textwrap +import typing +from typing import Collection, Iterable, Literal, Optional, Sequence + +import ibis +import ibis.backends.bigquery as ibis_bigquery +import ibis.expr.datatypes as ibis_dtypes +import ibis.expr.types as ibis_types +import pandas + +import bigframes.constants as constants +import bigframes.core.guid +from bigframes.core.ordering import ( + encode_order_string, + ExpressionOrdering, + IntegerEncoding, + OrderingColumnReference, + reencode_order_string, + StringEncoding, +) +import bigframes.core.utils as utils +from bigframes.core.window_spec import WindowSpec +import bigframes.dtypes +import bigframes.operations as ops +import bigframes.operations.aggregations as agg_ops + +ORDER_ID_COLUMN = "bigframes_ordering_id" +PREDICATE_COLUMN = "bigframes_predicate" + + +class CompiledArrayValue: + """Immutable BigQuery DataFrames expression tree. + + Note: Usage of this class is considered to be private and subject to change + at any time. + + This class is a wrapper around Ibis expressions. Its purpose is to defer + Ibis projection operations to keep generated SQL small and correct when + mixing and matching columns from different versions of a DataFrame. + + Args: + table: An Ibis table expression. + columns: Ibis value expressions that can be projected as columns. + hidden_ordering_columns: Ibis value expressions to store ordering. + ordering: An ordering property of the data frame. + predicates: A list of filters on the data frame. + """ + + def __init__( + self, + table: ibis_types.Table, + columns: Sequence[ibis_types.Value], + hidden_ordering_columns: Optional[Sequence[ibis_types.Value]] = None, + ordering: ExpressionOrdering = ExpressionOrdering(), + predicates: Optional[Collection[ibis_types.BooleanValue]] = None, + ): + self._table = table + self._predicates = tuple(predicates) if predicates is not None else () + # TODO: Validate ordering + if not ordering.total_ordering_columns: + raise ValueError("Must have total ordering defined by one or more columns") + self._ordering = ordering + # Allow creating a DataFrame directly from an Ibis table expression. + # TODO(swast): Validate that each column references the same table (or + # no table for literal values). + self._columns = tuple(columns) + + # Meta columns store ordering, or other data that doesn't correspond to dataframe columns + self._hidden_ordering_columns = ( + tuple(hidden_ordering_columns) + if hidden_ordering_columns is not None + else () + ) + + # To allow for more efficient lookup by column name, create a + # dictionary mapping names to column values. 
+ self._column_names = {column.get_name(): column for column in self._columns} + self._hidden_ordering_column_names = { + column.get_name(): column for column in self._hidden_ordering_columns + } + ### Validation + value_col_ids = self._column_names.keys() + hidden_col_ids = self._hidden_ordering_column_names.keys() + + all_columns = value_col_ids | hidden_col_ids + ordering_valid = all( + col.column_id in all_columns for col in ordering.all_ordering_columns + ) + if value_col_ids & hidden_col_ids: + raise ValueError( + f"Keys in both hidden and exposed list: {value_col_ids & hidden_col_ids}" + ) + if not ordering_valid: + raise ValueError(f"Illegal ordering keys: {ordering.all_ordering_columns}") + + @classmethod + def mem_expr_from_pandas( + cls, + pd_df: pandas.DataFrame, + ) -> CompiledArrayValue: + """ + Builds an in-memory only (SQL only) expr from a pandas dataframe. + """ + # We can't include any hidden columns in the ArrayValue constructor, so + # grab the column names before we add the hidden ordering column. + column_names = [str(column) for column in pd_df.columns] + # Make sure column names are all strings. + pd_df = pd_df.set_axis(column_names, axis="columns") + pd_df = pd_df.assign(**{ORDER_ID_COLUMN: range(len(pd_df))}) + + # ibis memtable cannot handle NA, must convert to None + pd_df = pd_df.astype("object") # type: ignore + pd_df = pd_df.where(pandas.notnull(pd_df), None) + + # NULL type isn't valid in BigQuery, so retry with an explicit schema in these cases. + keys_memtable = ibis.memtable(pd_df) + schema = keys_memtable.schema() + new_schema = [] + for column_index, column in enumerate(schema): + if column == ORDER_ID_COLUMN: + new_type: ibis_dtypes.DataType = ibis_dtypes.int64 + else: + column_type = schema[column] + # The autodetected type might not be one we can support, such + # as NULL type for empty rows, so convert to a type we do + # support. + new_type = bigframes.dtypes.bigframes_dtype_to_ibis_dtype( + bigframes.dtypes.ibis_dtype_to_bigframes_dtype(column_type) + ) + # TODO(swast): Ibis memtable doesn't use backticks in struct + # field names, so spaces and other characters aren't allowed in + # the memtable context. Blocked by + # https://github.com/ibis-project/ibis/issues/7187 + column = f"col_{column_index}" + new_schema.append((column, new_type)) + + # must set non-null column labels. 
these are not the user-facing labels + pd_df = pd_df.set_axis( + [column for column, _ in new_schema], + axis="columns", + ) + keys_memtable = ibis.memtable(pd_df, schema=ibis.schema(new_schema)) + + return cls( + keys_memtable, + columns=[ + keys_memtable[f"col_{column_index}"].name(column) + for column_index, column in enumerate(column_names) + ], + ordering=ExpressionOrdering( + ordering_value_columns=tuple( + [OrderingColumnReference(ORDER_ID_COLUMN)] + ), + total_ordering_columns=frozenset([ORDER_ID_COLUMN]), + ), + hidden_ordering_columns=(keys_memtable[ORDER_ID_COLUMN],), + ) + + @property + def columns(self) -> typing.Tuple[ibis_types.Value, ...]: + return self._columns + + @property + def column_ids(self) -> typing.Sequence[str]: + return tuple(self._column_names.keys()) + + @property + def _hidden_column_ids(self) -> typing.Sequence[str]: + return tuple(self._hidden_ordering_column_names.keys()) + + @property + def _reduced_predicate(self) -> typing.Optional[ibis_types.BooleanValue]: + """Returns the frame's predicates as an equivalent boolean value, useful where a single predicate value is preferred.""" + return ( + _reduce_predicate_list(self._predicates).name(PREDICATE_COLUMN) + if self._predicates + else None + ) + + @property + def _ibis_order(self) -> Sequence[ibis_types.Value]: + """Returns a sequence of ibis values which can be directly used to order a table expression. Has direction modifiers applied.""" + return _convert_ordering_to_table_values( + {**self._column_names, **self._hidden_ordering_column_names}, + self._ordering.all_ordering_columns, + ) + + def builder(self) -> ArrayValueBuilder: + """Creates a mutable builder for expressions.""" + # Since ArrayValue is intended to be immutable (immutability offers + # potential opportunities for caching, though we might need to introduce + # more node types for that to be useful), we create a builder class. + return ArrayValueBuilder( + self._table, + columns=self._columns, + hidden_ordering_columns=self._hidden_ordering_columns, + ordering=self._ordering, + predicates=self._predicates, + ) + + def drop_columns(self, columns: Iterable[str]) -> CompiledArrayValue: + # Must generate offsets if we are dropping a column that ordering depends on + expr = self + for ordering_column in set(columns).intersection( + [col.column_id for col in self._ordering.ordering_value_columns] + ): + expr = self._hide_column(ordering_column) + + expr_builder = expr.builder() + remain_cols = [ + column for column in expr.columns if column.get_name() not in columns + ] + expr_builder.columns = remain_cols + return expr_builder.build() + + def get_column_type(self, key: str) -> bigframes.dtypes.Dtype: + ibis_type = typing.cast( + bigframes.dtypes.IbisDtype, self._get_any_column(key).type() + ) + return typing.cast( + bigframes.dtypes.Dtype, + bigframes.dtypes.ibis_dtype_to_bigframes_dtype(ibis_type), + ) + + def _get_ibis_column(self, key: str) -> ibis_types.Value: + """Gets the Ibis expression for a given column.""" + if key not in self.column_ids: + raise ValueError( + "Column name {} not in set of values: {}".format(key, self.column_ids) + ) + return typing.cast(ibis_types.Value, self._column_names[key]) + + def _get_any_column(self, key: str) -> ibis_types.Value: + """Gets the Ibis expression for a given column. 
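Condensed, the `mem_expr_from_pandas` construction above boils down to: add an explicit ordering column, normalize NA to None, rename columns to safe placeholders, and hand Ibis an explicit schema. A rough standalone sketch of those steps, with made-up column names and types:

```python
import ibis
import pandas as pd

df = pd.DataFrame({"user id": ["a", "b", None]})

# Add an explicit ordering column, mirroring ORDER_ID_COLUMN above.
df = df.assign(bigframes_ordering_id=range(len(df)))

# ibis memtable cannot handle NA, so convert to plain None values.
df = df.astype("object")
df = df.where(pd.notnull(df), None)

# Rename to placeholder names so spaces in labels don't break struct field names.
df.columns = ["col_0", "col_1"]

schema = ibis.schema([("col_0", "string"), ("col_1", "int64")])
memtable = ibis.memtable(df, schema=schema)
print(memtable.schema())
```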
Will also get hidden columns.""" + all_columns = {**self._column_names, **self._hidden_ordering_column_names} + if key not in all_columns.keys(): + raise ValueError( + "Column name {} not in set of values: {}".format( + key, all_columns.keys() + ) + ) + return typing.cast(ibis_types.Value, all_columns[key]) + + def _get_hidden_ordering_column(self, key: str) -> ibis_types.Column: + """Gets the Ibis expression for a given hidden column.""" + if key not in self._hidden_ordering_column_names.keys(): + raise ValueError( + "Column name {} not in set of values: {}".format( + key, self._hidden_ordering_column_names.keys() + ) + ) + return typing.cast(ibis_types.Column, self._hidden_ordering_column_names[key]) + + def filter(self, predicate_id: str, keep_null: bool = False) -> CompiledArrayValue: + """Filter the table on a given expression, the predicate must be a boolean series aligned with the table expression.""" + condition = typing.cast( + ibis_types.BooleanValue, self._get_ibis_column(predicate_id) + ) + if keep_null: + condition = typing.cast( + ibis_types.BooleanValue, + condition.fillna( + typing.cast(ibis_types.BooleanScalar, ibis_types.literal(True)) + ), + ) + return self._filter(condition) + + def _filter(self, predicate_value: ibis_types.BooleanValue) -> CompiledArrayValue: + """Filter the table on a given expression, the predicate must be a boolean series aligned with the table expression.""" + expr = self.builder() + expr.ordering = expr.ordering.with_non_sequential() + expr.predicates = [*self._predicates, predicate_value] + return expr.build() + + def order_by( + self, by: Sequence[OrderingColumnReference], stable: bool = False + ) -> CompiledArrayValue: + expr_builder = self.builder() + expr_builder.ordering = self._ordering.with_ordering_columns(by, stable=stable) + return expr_builder.build() + + def reversed(self) -> CompiledArrayValue: + expr_builder = self.builder() + expr_builder.ordering = self._ordering.with_reverse() + return expr_builder.build() + + def _uniform_sampling(self, fraction: float) -> CompiledArrayValue: + """Sampling the table on given fraction. + + .. warning:: + The row numbers of result is non-deterministic, avoid to use. + """ + table = self._to_ibis_expr( + "unordered", expose_hidden_cols=True, fraction=fraction + ) + columns = [table[column_name] for column_name in self._column_names] + hidden_ordering_columns = [ + table[column_name] for column_name in self._hidden_ordering_column_names + ] + return CompiledArrayValue( + table, + columns=columns, + hidden_ordering_columns=hidden_ordering_columns, + ordering=self._ordering, + ) + + @property + def _offsets(self) -> ibis_types.IntegerColumn: + if not self._ordering.is_sequential: + raise ValueError( + "Expression does not have offsets. Generate them first using project_offsets." + ) + if not self._ordering.total_order_col: + raise ValueError( + "Ordering is invalid. Marked as sequential but no total order columns." + ) + column = self._get_any_column(self._ordering.total_order_col.column_id) + return typing.cast(ibis_types.IntegerColumn, column) + + def _project_offsets(self) -> CompiledArrayValue: + """Create a new expression that contains offsets. Should only be executed when offsets are needed for an operations. 
Has no effect on expression semantics.""" + if self._ordering.is_sequential: + return self + # TODO(tbergeron): Enforce total ordering + table = self._to_ibis_expr( + ordering_mode="offset_col", order_col_name=ORDER_ID_COLUMN + ) + columns = [table[column_name] for column_name in self._column_names] + ordering = ExpressionOrdering( + ordering_value_columns=tuple([OrderingColumnReference(ORDER_ID_COLUMN)]), + total_ordering_columns=frozenset([ORDER_ID_COLUMN]), + integer_encoding=IntegerEncoding(True, is_sequential=True), + ) + return CompiledArrayValue( + table, + columns=columns, + hidden_ordering_columns=[table[ORDER_ID_COLUMN]], + ordering=ordering, + ) + + def _hide_column(self, column_id) -> CompiledArrayValue: + """Pushes columns to hidden columns list. Used to hide ordering columns that have been dropped or destructively mutated.""" + expr_builder = self.builder() + # Need to rename column as caller might be creating a new row with the same name but different values. + # Can avoid this if don't allow callers to determine ids and instead generate unique ones in this class. + new_name = bigframes.core.guid.generate_guid(prefix="bigframes_hidden_") + expr_builder.hidden_ordering_columns = [ + *self._hidden_ordering_columns, + self._get_ibis_column(column_id).name(new_name), + ] + expr_builder.ordering = self._ordering.with_column_remap({column_id: new_name}) + return expr_builder.build() + + def promote_offsets(self, col_id: str) -> CompiledArrayValue: + """ + Convenience function to promote copy of column offsets to a value column. Can be used to reset index. + """ + # Special case: offsets already exist + ordering = self._ordering + + if (not ordering.is_sequential) or (not ordering.total_order_col): + return self._project_offsets().promote_offsets(col_id) + expr_builder = self.builder() + expr_builder.columns = [ + self._get_any_column(ordering.total_order_col.column_id).name(col_id), + *self.columns, + ] + return expr_builder.build() + + def select_columns(self, column_ids: typing.Sequence[str]) -> CompiledArrayValue: + """Creates a new expression based on this expression with new columns.""" + columns = [self._get_ibis_column(col_id) for col_id in column_ids] + expr = self + for ordering_column in set(self.column_ids).intersection( + [col_ref.column_id for col_ref in self._ordering.ordering_value_columns] + ): + # Need to hide ordering columns that are being dropped. Alternatively, could project offsets + expr = expr._hide_column(ordering_column) + builder = expr.builder() + builder.columns = list(columns) + new_expr = builder.build() + return new_expr + + def concat(self, other: typing.Sequence[CompiledArrayValue]) -> CompiledArrayValue: + """Append together multiple ArrayValue objects.""" + if len(other) == 0: + return self + tables = [] + prefix_base = 10 + prefix_size = math.ceil(math.log(len(other) + 1, prefix_base)) + # Must normalize all ids to the same encoding size + max_encoding_size = max( + self._ordering.string_encoding.length, + *[expression._ordering.string_encoding.length for expression in other], + ) + for i, expr in enumerate([self, *other]): + ordering_prefix = str(i).zfill(prefix_size) + table = expr._to_ibis_expr( + ordering_mode="string_encoded", order_col_name=ORDER_ID_COLUMN + ) + # Rename the value columns based on horizontal offset before applying union. 
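The `concat` path above keeps a total ordering across inputs by prefixing each table's order key with a zero-padded table index and re-encoding all keys to a common width. A small standalone illustration of why that preserves lexicographic order (simplified relative to the string encoding used here):

```python
import math


def prefixed_order_keys(table_sizes: list[int]) -> list[str]:
    """Build '<table prefix><row key>' strings that sort the concatenation correctly."""
    prefix_size = math.ceil(math.log(len(table_sizes), 10)) if len(table_sizes) > 1 else 1
    # Pad every per-row key to the same width so string comparison matches numeric order.
    key_width = len(str(max(table_sizes) - 1))
    keys = []
    for table_index, n_rows in enumerate(table_sizes):
        prefix = str(table_index).zfill(prefix_size)
        keys.extend(prefix + str(row).zfill(key_width) for row in range(n_rows))
    return keys


keys = prefixed_order_keys([3, 12, 2])
assert keys == sorted(keys)  # lexicographic order == concatenation order
print(keys[:5])
```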
+ table = table.select( + [ + table[col].name(f"column_{i}") + if col != ORDER_ID_COLUMN + else ( + ordering_prefix + + reencode_order_string( + table[ORDER_ID_COLUMN], max_encoding_size + ) + ).name(ORDER_ID_COLUMN) + for i, col in enumerate(table.columns) + ] + ) + tables.append(table) + combined_table = ibis.union(*tables) + ordering = ExpressionOrdering( + ordering_value_columns=tuple([OrderingColumnReference(ORDER_ID_COLUMN)]), + total_ordering_columns=frozenset([ORDER_ID_COLUMN]), + string_encoding=StringEncoding(True, prefix_size + max_encoding_size), + ) + return CompiledArrayValue( + combined_table, + columns=[ + combined_table[col] + for col in combined_table.columns + if col != ORDER_ID_COLUMN + ], + hidden_ordering_columns=[combined_table[ORDER_ID_COLUMN]], + ordering=ordering, + ) + + def project_unary_op( + self, column_name: str, op: ops.UnaryOp, output_name=None + ) -> CompiledArrayValue: + """Creates a new expression based on this expression with unary operation applied to one column.""" + value = op._as_ibis(self._get_ibis_column(column_name)).name( + output_name or column_name + ) + return self._set_or_replace_by_id(output_name or column_name, value) + + def project_binary_op( + self, + left_column_id: str, + right_column_id: str, + op: ops.BinaryOp, + output_column_id: str, + ) -> CompiledArrayValue: + """Creates a new expression based on this expression with binary operation applied to two columns.""" + value = op( + self._get_ibis_column(left_column_id), + self._get_ibis_column(right_column_id), + ).name(output_column_id) + return self._set_or_replace_by_id(output_column_id, value) + + def project_ternary_op( + self, + col_id_1: str, + col_id_2: str, + col_id_3: str, + op: ops.TernaryOp, + output_column_id: str, + ) -> CompiledArrayValue: + """Creates a new expression based on this expression with ternary operation applied to three columns.""" + value = op( + self._get_ibis_column(col_id_1), + self._get_ibis_column(col_id_2), + self._get_ibis_column(col_id_3), + ).name(output_column_id) + return self._set_or_replace_by_id(output_column_id, value) + + def aggregate( + self, + aggregations: typing.Sequence[typing.Tuple[str, agg_ops.AggregateOp, str]], + by_column_ids: typing.Sequence[str] = (), + dropna: bool = True, + ) -> CompiledArrayValue: + """ + Apply aggregations to the expression. 
+ Arguments: + aggregations: input_column_id, operation, output_column_id tuples + by_column_id: column id of the aggregation key, this is preserved through the transform + dropna: whether null keys should be dropped + """ + table = self._to_ibis_expr("unordered") + stats = { + col_out: agg_op._as_ibis(table[col_in]) + for col_in, agg_op, col_out in aggregations + } + if by_column_ids: + result = table.group_by(by_column_ids).aggregate(**stats) + # Must have deterministic ordering, so order by the unique "by" column + ordering = ExpressionOrdering( + tuple( + [ + OrderingColumnReference(column_id=column_id) + for column_id in by_column_ids + ] + ), + total_ordering_columns=frozenset(by_column_ids), + ) + columns = tuple(result[key] for key in result.columns) + expr = CompiledArrayValue(result, columns=columns, ordering=ordering) + if dropna: + for column_id in by_column_ids: + expr = expr._filter( + ops.notnull_op._as_ibis(expr._get_ibis_column(column_id)) + ) + # Can maybe remove this as Ordering id is redundant as by_column is unique after aggregation + return expr._project_offsets() + else: + aggregates = {**stats, ORDER_ID_COLUMN: ibis_types.literal(0)} + result = table.aggregate(**aggregates) + # Ordering is irrelevant for single-row output, but set ordering id regardless as other ops(join etc.) expect it. + ordering = ExpressionOrdering( + ordering_value_columns=tuple( + [OrderingColumnReference(ORDER_ID_COLUMN)] + ), + total_ordering_columns=frozenset([ORDER_ID_COLUMN]), + integer_encoding=IntegerEncoding(is_encoded=True, is_sequential=True), + ) + return CompiledArrayValue( + result, + columns=[result[col_id] for col_id in [*stats.keys()]], + hidden_ordering_columns=[result[ORDER_ID_COLUMN]], + ordering=ordering, + ) + + def corr_aggregate( + self, corr_aggregations: typing.Sequence[typing.Tuple[str, str, str]] + ) -> CompiledArrayValue: + """ + Get correlations between each lef_column_id and right_column_id, stored in the respective output_column_id. + This uses BigQuery's CORR under the hood, and thus only Pearson's method is used. + Arguments: + corr_aggregations: left_column_id, right_column_id, output_column_id tuples + """ + table = self._to_ibis_expr("unordered") + stats = { + col_out: table[col_left].corr(table[col_right], how="pop") + for col_left, col_right, col_out in corr_aggregations + } + aggregates = {**stats, ORDER_ID_COLUMN: ibis_types.literal(0)} + result = table.aggregate(**aggregates) + # Ordering is irrelevant for single-row output, but set ordering id regardless as other ops(join etc.) expect it. + ordering = ExpressionOrdering( + ordering_value_columns=tuple([OrderingColumnReference(ORDER_ID_COLUMN)]), + total_ordering_columns=frozenset([ORDER_ID_COLUMN]), + integer_encoding=IntegerEncoding(is_encoded=True, is_sequential=True), + ) + return CompiledArrayValue( + result, + columns=[result[col_id] for col_id in [*stats.keys()]], + hidden_ordering_columns=[result[ORDER_ID_COLUMN]], + ordering=ordering, + ) + + def project_window_op( + self, + column_name: str, + op: agg_ops.WindowOp, + window_spec: WindowSpec, + output_name=None, + *, + never_skip_nulls=False, + skip_reproject_unsafe: bool = False, + ) -> CompiledArrayValue: + """ + Creates a new expression based on this expression with unary operation applied to one column. 
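For the grouped branch above, the compiled expression is essentially an Ibis `group_by(...).aggregate(...)` ordered by the grouping keys, while the ungrouped branch attaches a literal ordering column so the single output row still has a total ordering. A toy equivalent against a stand-in memtable:

```python
import ibis
import pandas as pd

t = ibis.memtable(pd.DataFrame({"k": ["a", "a", "b"], "v": [1, 2, 3]}))

# Grouped path: one row per key, ordered deterministically by the grouping key.
grouped = t.group_by("k").aggregate(v_sum=t["v"].sum()).order_by("k")

# Ungrouped path: a single output row, with a constant ordering column attached
# (mirroring the literal(0) ORDER_ID above) so downstream ops still see an ordering.
single_row = t.aggregate(v_sum=t["v"].sum(), order_id=ibis.literal(0))

print(grouped.columns)     # e.g. ['k', 'v_sum']
print(single_row.columns)  # e.g. ['v_sum', 'order_id']
```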
+ column_name: the id of the input column present in the expression + op: the windowable operator to apply to the input column + window_spec: a specification of the window over which to apply the operator + output_name: the id to assign to the output of the operator, by default will replace input col if distinct output id not provided + never_skip_nulls: will disable null skipping for operators that would otherwise do so + skip_reproject_unsafe: skips the reprojection step, can be used when performing many non-dependent window operations, user responsible for not nesting window expressions, or using outputs as join, filter or aggregation keys before a reprojection + """ + column = typing.cast(ibis_types.Column, self._get_ibis_column(column_name)) + window = self._ibis_window_from_spec(window_spec, allow_ties=op.handles_ties) + + window_op = op._as_ibis(column, window) + + clauses = [] + if op.skips_nulls and not never_skip_nulls: + clauses.append((column.isnull(), ibis.NA)) + if window_spec.min_periods: + if op.skips_nulls: + # Most operations do not count NULL values towards min_periods + observation_count = agg_ops.count_op._as_ibis(column, window) + else: + # Operations like count treat even NULLs as valid observations for the sake of min_periods + # notnull is just used to convert null values to non-null (FALSE) values to be counted + denulled_value = typing.cast(ibis_types.BooleanColumn, column.notnull()) + observation_count = agg_ops.count_op._as_ibis(denulled_value, window) + clauses.append( + ( + observation_count < ibis_types.literal(window_spec.min_periods), + ibis.NA, + ) + ) + if clauses: + case_statement = ibis.case() + for clause in clauses: + case_statement = case_statement.when(clause[0], clause[1]) + case_statement = case_statement.else_(window_op).end() + window_op = case_statement + + result = self._set_or_replace_by_id(output_name or column_name, window_op) + # TODO(tbergeron): Automatically track analytic expression usage and defer reprojection until required for valid query generation. + return result._reproject_to_table() if not skip_reproject_unsafe else result + + def to_sql( + self, + offset_column: typing.Optional[str] = None, + col_id_overrides: typing.Mapping[str, str] = {}, + sorted: bool = False, + ) -> str: + offsets_id = offset_column or ORDER_ID_COLUMN + + sql = ibis_bigquery.Backend().compile( + self._to_ibis_expr( + ordering_mode="offset_col" + if (offset_column or sorted) + else "unordered", + order_col_name=offsets_id, + col_id_overrides=col_id_overrides, + ) + ) + if sorted: + sql = textwrap.dedent( + f""" + SELECT * EXCEPT (`{offsets_id}`) + FROM ({sql}) + ORDER BY `{offsets_id}` + """ + ) + return typing.cast(str, sql) + + def _to_ibis_expr( + self, + ordering_mode: Literal["string_encoded", "offset_col", "unordered"], + order_col_name: Optional[str] = ORDER_ID_COLUMN, + expose_hidden_cols: bool = False, + fraction: Optional[float] = None, + col_id_overrides: typing.Mapping[str, str] = {}, + ): + """ + Creates an Ibis table expression representing the DataFrame. + + ArrayValue objects are sorted, so the following options are available + to reflect this in the ibis expression. + + * "offset_col": Zero-based offsets are generated as a column, this will + not sort the rows however. + * "string_encoded": An ordered string column is provided in output table. + * "unordered": No ordering information will be provided in output. Only + value columns are projected. 
+ + For offset or ordered column, order_col_name can be used to assign the + output label for the ordering column. If none is specified, the default + column name will be 'bigframes_ordering_id' + + Args: + ordering_mode: + How to construct the Ibis expression from the ArrayValue. See + above for details. + order_col_name: + If the ordering mode outputs a single ordering or offsets + column, use this as the column name. + expose_hidden_cols: + If True, include the hidden ordering columns in the results. + Only compatible with `order_by` and `unordered` + ``ordering_mode``. + col_id_overrides: + overrides the column ids for the result + Returns: + An ibis expression representing the data help by the ArrayValue object. + """ + assert ordering_mode in ( + "string_encoded", + "offset_col", + "unordered", + ) + if expose_hidden_cols and ordering_mode in ("ordered_col", "offset_col"): + raise ValueError( + f"Cannot expose hidden ordering columns with ordering_mode {ordering_mode}" + ) + + columns = list(self._columns) + columns_to_drop: list[ + str + ] = [] # Ordering/Filtering columns that will be dropped at end + + if self._reduced_predicate is not None: + columns.append(self._reduced_predicate) + # Usually drop predicate as it is will be all TRUE after filtering + if not expose_hidden_cols: + columns_to_drop.append(self._reduced_predicate.get_name()) + + order_columns = self._create_order_columns( + ordering_mode, order_col_name, expose_hidden_cols + ) + columns.extend(order_columns) + + # Special case for empty tables, since we can't create an empty + # projection. + if not columns: + return ibis.memtable([]) + + # Make sure all dtypes are the "canonical" ones for BigFrames. This is + # important for operations like UNION where the schema must match. 
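The `sorted=True` branch of `to_sql` above only wraps the compiled query so the hidden offsets column drives the final `ORDER BY` and is then dropped from the output. Shown standalone, with the default offsets column name used in this file:

```python
import textwrap


def wrap_sorted_sql(inner_sql: str, offsets_id: str = "bigframes_ordering_id") -> str:
    """Order the final result by the hidden offsets column, then drop it."""
    return textwrap.dedent(
        f"""
        SELECT * EXCEPT (`{offsets_id}`)
        FROM ({inner_sql})
        ORDER BY `{offsets_id}`
        """
    )


print(wrap_sorted_sql("SELECT 1 AS x, 0 AS bigframes_ordering_id"))
```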
+ table = self._table.select( + bigframes.dtypes.ibis_value_to_canonical_type(column) for column in columns + ) + base_table = table + if self._reduced_predicate is not None: + table = table.filter(base_table[PREDICATE_COLUMN]) + table = table.drop(*columns_to_drop) + if col_id_overrides: + table = table.relabel(col_id_overrides) + if fraction is not None: + table = table.filter(ibis.random() < ibis.literal(fraction)) + return table + + def _create_order_columns( + self, + ordering_mode: str, + order_col_name: Optional[str], + expose_hidden_cols: bool, + ) -> typing.Sequence[ibis_types.Value]: + # Generate offsets if current ordering id semantics are not sufficiently strict + if ordering_mode == "offset_col": + return (self._create_offset_column().name(order_col_name),) + elif ordering_mode == "string_encoded": + return (self._create_string_ordering_column().name(order_col_name),) + elif expose_hidden_cols: + return self._hidden_ordering_columns + return () + + def _create_offset_column(self) -> ibis_types.IntegerColumn: + if self._ordering.total_order_col and self._ordering.is_sequential: + offsets = self._get_any_column(self._ordering.total_order_col.column_id) + return typing.cast(ibis_types.IntegerColumn, offsets) + else: + window = ibis.window(order_by=self._ibis_order) + if self._predicates: + window = window.group_by(self._reduced_predicate) + offsets = ibis.row_number().over(window) + return typing.cast(ibis_types.IntegerColumn, offsets) + + def _create_string_ordering_column(self) -> ibis_types.StringColumn: + if self._ordering.total_order_col and self._ordering.is_string_encoded: + string_order_ids = self._get_any_column( + self._ordering.total_order_col.column_id + ) + return typing.cast(ibis_types.StringColumn, string_order_ids) + if ( + self._ordering.total_order_col + and self._ordering.integer_encoding.is_encoded + ): + # Special case: non-negative integer ordering id can be converted directly to string without regenerating row numbers + int_values = self._get_any_column(self._ordering.total_order_col.column_id) + return encode_order_string( + typing.cast(ibis_types.IntegerColumn, int_values), + ) + else: + # Have to build string from scratch + window = ibis.window(order_by=self._ibis_order) + if self._predicates: + window = window.group_by(self._reduced_predicate) + row_nums = typing.cast( + ibis_types.IntegerColumn, ibis.row_number().over(window) + ) + return encode_order_string(row_nums) + + def _reproject_to_table(self) -> CompiledArrayValue: + """ + Internal operators that projects the internal representation into a + new ibis table expression where each value column is a direct + reference to a column in that table expression. Needed after + some operations such as window operations that cannot be used + recursively in projections. 
+ """ + table = self._to_ibis_expr( + "unordered", + expose_hidden_cols=True, + ) + columns = [table[column_name] for column_name in self._column_names] + ordering_col_ids = [ + ref.column_id for ref in self._ordering.all_ordering_columns + ] + hidden_ordering_columns = [ + table[column_name] + for column_name in self._hidden_ordering_column_names + if column_name in ordering_col_ids + ] + return CompiledArrayValue( + table, + columns=columns, + hidden_ordering_columns=hidden_ordering_columns, + ordering=self._ordering, + ) + + def _ibis_window_from_spec(self, window_spec: WindowSpec, allow_ties: bool = False): + group_by: typing.List[ibis_types.Value] = ( + [ + typing.cast( + ibis_types.Column, _as_identity(self._get_ibis_column(column)) + ) + for column in window_spec.grouping_keys + ] + if window_spec.grouping_keys + else [] + ) + if self._reduced_predicate is not None: + group_by.append(self._reduced_predicate) + if window_spec.ordering: + order_by = _convert_ordering_to_table_values( + {**self._column_names, **self._hidden_ordering_column_names}, + window_spec.ordering, + ) + if not allow_ties: + # Most operator need an unambiguous ordering, so the table's total ordering is appended + order_by = tuple([*order_by, *self._ibis_order]) + elif (window_spec.following is not None) or (window_spec.preceding is not None): + # If window spec has following or preceding bounds, we need to apply an unambiguous ordering. + order_by = tuple(self._ibis_order) + else: + # Unbound grouping window. Suitable for aggregations but not for analytic function application. + order_by = None + return ibis.window( + preceding=window_spec.preceding, + following=window_spec.following, + order_by=order_by, + group_by=group_by, + ) + + def unpivot( + self, + row_labels: typing.Sequence[typing.Hashable], + unpivot_columns: typing.Sequence[ + typing.Tuple[str, typing.Sequence[typing.Optional[str]]] + ], + *, + passthrough_columns: typing.Sequence[str] = (), + index_col_ids: typing.Sequence[str] = ["index"], + dtype: typing.Union[ + bigframes.dtypes.Dtype, typing.Sequence[bigframes.dtypes.Dtype] + ] = pandas.Float64Dtype(), + how="left", + ) -> CompiledArrayValue: + """ + Unpivot ArrayValue columns. + + Args: + row_labels: Identifies the source of the row. Must be equal to length to source column list in unpivot_columns argument. + unpivot_columns: Mapping of column id to list of input column ids. Lists of input columns may use None. + passthrough_columns: Columns that will not be unpivoted. Column id will be preserved. + index_col_id (str): The column id to be used for the row labels. + dtype (dtype or list of dtype): Dtype to use for the unpivot columns. If list, must be equal in number to unpivot_columns. 
+ + Returns: + ArrayValue: The unpivoted ArrayValue + """ + if how not in ("left", "right"): + raise ValueError("'how' must be 'left' or 'right'") + table = self._to_ibis_expr("unordered", expose_hidden_cols=True) + row_n = len(row_labels) + hidden_col_ids = self._hidden_ordering_column_names.keys() + if not all( + len(source_columns) == row_n for _, source_columns in unpivot_columns + ): + raise ValueError("Columns and row labels must all be same length.") + + unpivot_offset_id = bigframes.core.guid.generate_guid("unpivot_offsets_") + unpivot_table = table.cross_join( + ibis.memtable({unpivot_offset_id: range(row_n)}) + ) + # Use ibis memtable to infer type of rowlabels (if possible) + # TODO: Allow caller to specify dtype + if isinstance(row_labels[0], tuple): + labels_table = ibis.memtable(row_labels) + labels_ibis_types = [ + labels_table[col].type() for col in labels_table.columns + ] + else: + labels_ibis_types = [ibis.memtable({"col": row_labels})["col"].type()] + labels_dtypes = [ + bigframes.dtypes.ibis_dtype_to_bigframes_dtype(ibis_type) + for ibis_type in labels_ibis_types + ] + + label_columns = [] + for label_part, (col_id, label_dtype) in enumerate( + zip(index_col_ids, labels_dtypes) + ): + # interpret as tuples even if it wasn't originally so can apply same logic for multi-column labels + labels_as_tuples = [ + label if isinstance(label, tuple) else (label,) for label in row_labels + ] + cases = [ + ( + i, + bigframes.dtypes.literal_to_ibis_scalar( + label_tuple[label_part], # type:ignore + force_dtype=label_dtype, # type:ignore + ), + ) + for i, label_tuple in enumerate(labels_as_tuples) + ] + labels_value = ( + typing.cast(ibis_types.IntegerColumn, unpivot_table[unpivot_offset_id]) + .cases(cases, default=None) # type:ignore + .name(col_id) + ) + label_columns.append(labels_value) + + unpivot_values = [] + for j in range(len(unpivot_columns)): + col_dtype = dtype[j] if utils.is_list_like(dtype) else dtype + result_col, source_cols = unpivot_columns[j] + null_value = bigframes.dtypes.literal_to_ibis_scalar( + None, force_dtype=col_dtype + ) + ibis_values = [ + ops.AsTypeOp(col_dtype)._as_ibis(unpivot_table[col]) + if col is not None + else null_value + for col in source_cols + ] + cases = [(i, ibis_values[i]) for i in range(len(ibis_values))] + unpivot_value = typing.cast( + ibis_types.IntegerColumn, unpivot_table[unpivot_offset_id] + ).cases( + cases, default=null_value # type:ignore + ) + unpivot_values.append(unpivot_value.name(result_col)) + + unpivot_table = unpivot_table.select( + passthrough_columns, + *label_columns, + *unpivot_values, + *hidden_col_ids, + unpivot_offset_id, + ) + + # Extend the original ordering using unpivot_offset_id + old_ordering = self._ordering + if how == "left": + new_ordering = ExpressionOrdering( + ordering_value_columns=tuple( + [ + *old_ordering.ordering_value_columns, + OrderingColumnReference(unpivot_offset_id), + ] + ), + total_ordering_columns=frozenset( + [*old_ordering.total_ordering_columns, unpivot_offset_id] + ), + ) + else: # how=="right" + new_ordering = ExpressionOrdering( + ordering_value_columns=tuple( + [ + OrderingColumnReference(unpivot_offset_id), + *old_ordering.ordering_value_columns, + ] + ), + total_ordering_columns=frozenset( + [*old_ordering.total_ordering_columns, unpivot_offset_id] + ), + ) + value_columns = [ + unpivot_table[value_col_id] for value_col_id, _ in unpivot_columns + ] + passthrough_values = [unpivot_table[col] for col in passthrough_columns] + hidden_ordering_columns = [ + 
unpivot_table[unpivot_offset_id], + *[unpivot_table[hidden_col] for hidden_col in hidden_col_ids], + ] + return CompiledArrayValue( + table=unpivot_table, + columns=[ + *[unpivot_table[col_id] for col_id in index_col_ids], + *value_columns, + *passthrough_values, + ], + hidden_ordering_columns=hidden_ordering_columns, + ordering=new_ordering, + ) + + def assign(self, source_id: str, destination_id: str) -> CompiledArrayValue: + return self._set_or_replace_by_id( + destination_id, self._get_ibis_column(source_id) + ) + + def assign_constant( + self, + destination_id: str, + value: typing.Any, + dtype: typing.Optional[bigframes.dtypes.Dtype], + ) -> CompiledArrayValue: + # TODO(b/281587571): Solve scalar constant aggregation problem w/Ibis. + ibis_value = bigframes.dtypes.literal_to_ibis_scalar(value, dtype) + if ibis_value is None: + raise NotImplementedError( + f"Type not supported as scalar value {type(value)}. {constants.FEEDBACK_LINK}" + ) + expr = self._set_or_replace_by_id(destination_id, ibis_value) + return expr._reproject_to_table() + + def _set_or_replace_by_id( + self, id: str, new_value: ibis_types.Value + ) -> CompiledArrayValue: + """Safely assign by id while maintaining ordering integrity.""" + # TODO: Split into explicit set and replace methods + ordering_col_ids = [ + col_ref.column_id for col_ref in self._ordering.ordering_value_columns + ] + if id in ordering_col_ids: + return self._hide_column(id)._set_or_replace_by_id(id, new_value) + + builder = self.builder() + if id in self.column_ids: + builder.columns = [ + val if (col_id != id) else new_value.name(id) + for col_id, val in zip(self.column_ids, self._columns) + ] + else: + builder.columns = [*self.columns, new_value.name(id)] + return builder.build() + + +class ArrayValueBuilder: + """Mutable expression class. + Use ArrayValue.builder() to create from a ArrayValue object. 
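The unpivot implementation above works by cross-joining against a tiny offsets table and then selecting the label and value for each offset with `CASE`-style expressions. The same shape in plain pandas, purely as an illustration of the approach:

```python
import pandas as pd

df = pd.DataFrame({"id": [1, 2], "a": [10, 20], "b": [30, 40]})
value_vars = ["a", "b"]

# Cross join each row against an offsets table, one offset per unpivoted column...
offsets = pd.DataFrame({"offset": range(len(value_vars))})
crossed = df.merge(offsets, how="cross")

# ...then pick the label and the value for that offset, like the CASE expressions above.
crossed["variable"] = crossed["offset"].map(dict(enumerate(value_vars)))
crossed["value"] = crossed.apply(lambda row: row[value_vars[row["offset"]]], axis=1)

print(crossed[["id", "variable", "value"]])
```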
+ """ + + def __init__( + self, + table: ibis_types.Table, + ordering: ExpressionOrdering, + columns: Collection[ibis_types.Value] = (), + hidden_ordering_columns: Collection[ibis_types.Value] = (), + predicates: Optional[Collection[ibis_types.BooleanValue]] = None, + ): + self.table = table + self.columns = list(columns) + self.hidden_ordering_columns = list(hidden_ordering_columns) + self.ordering = ordering + self.predicates = list(predicates) if predicates is not None else None + + def build(self) -> CompiledArrayValue: + return CompiledArrayValue( + table=self.table, + columns=self.columns, + hidden_ordering_columns=self.hidden_ordering_columns, + ordering=self.ordering, + predicates=self.predicates, + ) + + +def _reduce_predicate_list( + predicate_list: typing.Collection[ibis_types.BooleanValue], +) -> ibis_types.BooleanValue: + """Converts a list of predicates BooleanValues into a single BooleanValue.""" + if len(predicate_list) == 0: + raise ValueError("Cannot reduce empty list of predicates") + if len(predicate_list) == 1: + (item,) = predicate_list + return item + return functools.reduce(lambda acc, pred: acc.__and__(pred), predicate_list) + + +def _convert_ordering_to_table_values( + value_lookup: typing.Mapping[str, ibis_types.Value], + ordering_columns: typing.Sequence[OrderingColumnReference], +) -> typing.Sequence[ibis_types.Value]: + column_refs = ordering_columns + ordering_values = [] + for ordering_col in column_refs: + column = typing.cast(ibis_types.Column, value_lookup[ordering_col.column_id]) + ordering_value = ( + ibis.asc(column) + if ordering_col.direction.is_ascending + else ibis.desc(column) + ) + # Bigquery SQL considers NULLS to be "smallest" values, but we need to override in these cases. + if (not ordering_col.na_last) and (not ordering_col.direction.is_ascending): + # Force nulls to be first + is_null_val = typing.cast(ibis_types.Column, column.isnull()) + ordering_values.append(ibis.desc(is_null_val)) + elif (ordering_col.na_last) and (ordering_col.direction.is_ascending): + # Force nulls to be last + is_null_val = typing.cast(ibis_types.Column, column.isnull()) + ordering_values.append(ibis.asc(is_null_val)) + ordering_values.append(ordering_value) + return ordering_values + + +def _as_identity(value: ibis_types.Value): + # Some types need to be converted to string to enable groupby + if value.type().is_float64() or value.type().is_geospatial(): + return value.cast(ibis_dtypes.str) + return value diff --git a/bigframes/core/compile/compiler.py b/bigframes/core/compile/compiler.py new file mode 100644 index 0000000000..195d830122 --- /dev/null +++ b/bigframes/core/compile/compiler.py @@ -0,0 +1,185 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
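The null handling in `_convert_ordering_to_table_values` above exists because BigQuery sorts NULLs as the smallest values; to get `NULLS LAST` on an ascending sort (or `NULLS FIRST` on a descending one), an extra is-null sort key is placed ahead of the value key. The same trick on plain Python lists:

```python
values = [3, None, 1, None, 2]

# Ascending with NULLS LAST: sort by "is null" first, then by the value itself,
# mirroring the extra isnull() ordering key added above.
asc_nulls_last = sorted(values, key=lambda v: (v is None, v if v is not None else 0))

# Descending with NULLS FIRST: sort by "is not null" first, then by the value descending.
desc_nulls_first = sorted(values, key=lambda v: (v is not None, -(v or 0)))

print(asc_nulls_last)    # [1, 2, 3, None, None]
print(desc_nulls_first)  # [None, None, 3, 2, 1]
```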
+from __future__ import annotations + +import functools +import io +import typing + +import pandas as pd + +import bigframes.core.compile as compiled +import bigframes.core.compile.single_column +import bigframes.core.nodes as nodes + +if typing.TYPE_CHECKING: + import bigframes.core + import bigframes.session + + +@functools.cache +def compile_node(node: nodes.BigFrameNode) -> compiled.CompiledArrayValue: + """Compile node into CompileArrayValue. Caches result.""" + return _compile_node(node) + + +@functools.singledispatch +def _compile_node(node: nodes.BigFrameNode) -> compiled.CompiledArrayValue: + """Defines transformation but isn't cached, always use compile_node instead""" + raise ValueError(f"Can't compile unnrecognized node: {node}") + + +@_compile_node.register +def compile_join(node: nodes.JoinNode): + compiled_left = compile_node(node.left_child) + compiled_right = compile_node(node.right_child) + return bigframes.core.compile.single_column.join_by_column( + compiled_left, + node.left_column_ids, + compiled_right, + node.right_column_ids, + how=node.how, + allow_row_identity_join=node.allow_row_identity_join, + ) + + +@_compile_node.register +def compile_select(node: nodes.SelectNode): + return compile_node(node.child).select_columns(node.column_ids) + + +@_compile_node.register +def compile_drop(node: nodes.DropColumnsNode): + return compile_node(node.child).drop_columns(node.columns) + + +@_compile_node.register +def compile_readlocal(node: nodes.ReadLocalNode): + array_as_pd = pd.read_feather(io.BytesIO(node.feather_bytes)) + return compiled.CompiledArrayValue.mem_expr_from_pandas(array_as_pd) + + +@_compile_node.register +def compile_readgbq(node: nodes.ReadGbqNode): + return compiled.CompiledArrayValue( + node.table, + node.columns, + node.hidden_ordering_columns, + node.ordering, + ) + + +@_compile_node.register +def compile_promote_offsets(node: nodes.PromoteOffsetsNode): + return compile_node(node.child).promote_offsets(node.col_id) + + +@_compile_node.register +def compile_filter(node: nodes.FilterNode): + return compile_node(node.child).filter(node.predicate_id, node.keep_null) + + +@_compile_node.register +def compile_orderby(node: nodes.OrderByNode): + return compile_node(node.child).order_by(node.by, node.stable) + + +@_compile_node.register +def compile_reversed(node: nodes.ReversedNode): + return compile_node(node.child).reversed() + + +@_compile_node.register +def compile_project_unary(node: nodes.ProjectUnaryOpNode): + return compile_node(node.child).project_unary_op( + node.input_id, node.op, node.output_id + ) + + +@_compile_node.register +def compile_project_binary(node: nodes.ProjectBinaryOpNode): + return compile_node(node.child).project_binary_op( + node.left_input_id, node.right_input_id, node.op, node.output_id + ) + + +@_compile_node.register +def compile_project_ternary(node: nodes.ProjectTernaryOpNode): + return compile_node(node.child).project_ternary_op( + node.input_id1, node.input_id2, node.input_id3, node.op, node.output_id + ) + + +@_compile_node.register +def compile_concat(node: nodes.ConcatNode): + compiled_nodes = [compile_node(node) for node in node.children] + return compiled_nodes[0].concat(compiled_nodes[1:]) + + +@_compile_node.register +def compile_aggregate(node: nodes.AggregateNode): + return compile_node(node.child).aggregate( + node.aggregations, node.by_column_ids, node.dropna + ) + + +@_compile_node.register +def compile_corr(node: nodes.CorrNode): + return compile_node(node.child).corr_aggregate(node.corr_aggregations) + + 
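The new compiler module above pairs `functools.cache` with `functools.singledispatch`: the cached `compile_node` is the single entry point, while per-node-type handlers are registered on the uncached `_compile_node`. Below is a minimal, self-contained sketch of that pattern; `LeafNode`, `AddNode`, and the integer "compilation" result are illustrative stand-ins, not the real bigframes node or array-value types.

```python
from __future__ import annotations

import functools
from dataclasses import dataclass


@dataclass(frozen=True)
class Node:
    """Base node; frozen dataclasses are hashable, so compiled results can be cached."""


@dataclass(frozen=True)
class LeafNode(Node):
    value: int


@dataclass(frozen=True)
class AddNode(Node):
    left: Node
    right: Node


@functools.cache
def compile_node(node: Node) -> int:
    # Cached entry point; recursion goes through here so equal subtrees compile once.
    return _compile_node(node)


@functools.singledispatch
def _compile_node(node: Node) -> int:
    raise ValueError(f"Can't compile unrecognized node: {node}")


@_compile_node.register
def _(node: LeafNode) -> int:
    return node.value


@_compile_node.register
def _(node: AddNode) -> int:
    return compile_node(node.left) + compile_node(node.right)


leaf = LeafNode(2)
assert compile_node(AddNode(leaf, leaf)) == 4  # the shared leaf is compiled only once
```

Because the nodes are frozen dataclasses, structurally equal subtrees hash and compare equal, so the cache reuses one compiled result for all of them.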
+@_compile_node.register +def compile_window(node: nodes.WindowOpNode): + return compile_node(node.child).project_window_op( + node.column_name, + node.op, + node.window_spec, + node.output_name, + never_skip_nulls=node.never_skip_nulls, + skip_reproject_unsafe=node.skip_reproject_unsafe, + ) + + +@_compile_node.register +def compile_reproject(node: nodes.ReprojectOpNode): + return compile_node(node.child)._reproject_to_table() + + +@_compile_node.register +def compile_unpivot(node: nodes.UnpivotNode): + return compile_node(node.child).unpivot( + node.row_labels, + node.unpivot_columns, + passthrough_columns=node.passthrough_columns, + index_col_ids=node.index_col_ids, + dtype=node.dtype, + how=node.how, + ) + + +@_compile_node.register +def compile_assign(node: nodes.AssignNode): + return compile_node(node.child).assign(node.source_id, node.destination_id) + + +@_compile_node.register +def compile_assign_constant(node: nodes.AssignConstantNode): + return compile_node(node.child).assign_constant( + node.destination_id, node.value, node.dtype + ) + + +@_compile_node.register +def compiler_random_sample(node: nodes.RandomSampleNode): + return compile_node(node.child)._uniform_sampling(node.fraction) diff --git a/bigframes/core/joins/row_identity.py b/bigframes/core/compile/row_identity.py similarity index 94% rename from bigframes/core/joins/row_identity.py rename to bigframes/core/compile/row_identity.py index 76e456ec94..2e9bc0527c 100644 --- a/bigframes/core/joins/row_identity.py +++ b/bigframes/core/compile/row_identity.py @@ -23,15 +23,16 @@ import ibis.expr.types as ibis_types import bigframes.constants as constants -import bigframes.core as core +import bigframes.core.compile as compiled import bigframes.core.joins.name_resolution as naming +import bigframes.core.ordering as orderings SUPPORTED_ROW_IDENTITY_HOW = {"outer", "left", "inner"} def join_by_row_identity( - left: core.ArrayValue, right: core.ArrayValue, *, how: str -) -> core.ArrayValue: + left: compiled.CompiledArrayValue, right: compiled.CompiledArrayValue, *, how: str +) -> compiled.CompiledArrayValue: """Compute join when we are joining by row identity not a specific column.""" if how not in SUPPORTED_ROW_IDENTITY_HOW: raise NotImplementedError( @@ -101,8 +102,8 @@ def join_by_row_identity( ) # Assume that left ordering is sufficient since 1:1 join over same base table join_total_order_cols = left_total_order_cols - new_ordering = core.ExpressionOrdering( - ordering_columns, total_ordering_columns=join_total_order_cols + new_ordering = orderings.ExpressionOrdering( + tuple(ordering_columns), total_ordering_columns=join_total_order_cols ) hidden_ordering_columns = [ @@ -117,8 +118,7 @@ def join_by_row_identity( if key.column_id in right._hidden_ordering_column_names.keys() ] - joined_expr = core.ArrayValue( - left._session, + joined_expr = compiled.CompiledArrayValue( left._table, columns=joined_columns, hidden_ordering_columns=hidden_ordering_columns, diff --git a/bigframes/core/joins/single_column.py b/bigframes/core/compile/single_column.py similarity index 87% rename from bigframes/core/joins/single_column.py rename to bigframes/core/compile/single_column.py index 0c0e2008b5..b992aa1d1d 100644 --- a/bigframes/core/joins/single_column.py +++ b/bigframes/core/compile/single_column.py @@ -23,16 +23,16 @@ import ibis.expr.datatypes as ibis_dtypes import ibis.expr.types as ibis_types -import bigframes.core as core -import bigframes.core.joins.name_resolution as naming -import bigframes.core.joins.row_identity -import 
bigframes.core.ordering +import bigframes.core.compile as compiled +import bigframes.core.compile.row_identity +import bigframes.core.joins as joining +import bigframes.core.ordering as orderings def join_by_column( - left: core.ArrayValue, + left: compiled.CompiledArrayValue, left_column_ids: typing.Sequence[str], - right: core.ArrayValue, + right: compiled.CompiledArrayValue, right_column_ids: typing.Sequence[str], *, how: Literal[ @@ -42,7 +42,7 @@ def join_by_column( "right", ], allow_row_identity_join: bool = True, -) -> core.ArrayValue: +) -> compiled.CompiledArrayValue: """Join two expressions by column equality. Arguments: @@ -61,7 +61,7 @@ def join_by_column( """ if ( allow_row_identity_join - and how in bigframes.core.joins.row_identity.SUPPORTED_ROW_IDENTITY_HOW + and how in bigframes.core.compile.row_identity.SUPPORTED_ROW_IDENTITY_HOW and left._table.equals(right._table) # Make sure we're joining on exactly the same column(s), at least with # regards to value its possible that they both have the same names but @@ -73,15 +73,15 @@ def join_by_column( for lcol, rcol in zip(left_column_ids, right_column_ids) ) ): - return bigframes.core.joins.row_identity.join_by_row_identity( + return bigframes.core.compile.row_identity.join_by_row_identity( left, right, how=how ) else: # Value column mapping must use JOIN_NAME_REMAPPER to stay in sync with consumers of join result - l_public_mapping, r_public_mapping = naming.JOIN_NAME_REMAPPER( + l_public_mapping, r_public_mapping = joining.JOIN_NAME_REMAPPER( left.column_ids, right.column_ids ) - l_hidden_mapping, r_hidden_mapping = naming.JoinNameRemapper( + l_hidden_mapping, r_hidden_mapping = joining.JoinNameRemapper( namespace="hidden" )(left._hidden_column_ids, right._hidden_column_ids) l_mapping = {**l_public_mapping, **l_hidden_mapping} @@ -134,8 +134,7 @@ def join_by_column( for col in right._hidden_ordering_columns ], ] - return core.ArrayValue( - left._session, + return compiled.CompiledArrayValue( combined_table, columns=columns, hidden_ordering_columns=hidden_ordering_columns, @@ -151,12 +150,12 @@ def value_to_join_key(value: ibis_types.Value): def join_orderings( - left: core.ExpressionOrdering, - right: core.ExpressionOrdering, + left: orderings.ExpressionOrdering, + right: orderings.ExpressionOrdering, left_id_mapping: Mapping[str, str], right_id_mapping: Mapping[str, str], left_order_dominates: bool = True, -) -> core.ExpressionOrdering: +) -> orderings.ExpressionOrdering: left_ordering_refs = [ ref.with_name(left_id_mapping[ref.column_id]) for ref in left.all_ordering_columns @@ -176,7 +175,7 @@ def join_orderings( right_total_order_cols = frozenset( [right_id_mapping[id] for id in right.total_ordering_columns] ) - return core.ExpressionOrdering( - ordering_value_columns=joined_refs, + return orderings.ExpressionOrdering( + ordering_value_columns=tuple(joined_refs), total_ordering_columns=left_total_order_cols | right_total_order_cols, ) diff --git a/bigframes/core/groupby/__init__.py b/bigframes/core/groupby/__init__.py index db0843fcbc..2a19a83dd5 100644 --- a/bigframes/core/groupby/__init__.py +++ b/bigframes/core/groupby/__init__.py @@ -193,7 +193,7 @@ def cumprod(self, *args, **kwargs) -> df.DataFrame: def shift(self, periods=1) -> series.Series: window = core.WindowSpec( - grouping_keys=self._by_col_ids, + grouping_keys=tuple(self._by_col_ids), preceding=periods if periods > 0 else None, following=-periods if periods < 0 else None, ) @@ -201,7 +201,7 @@ def shift(self, periods=1) -> series.Series: def diff(self, 
periods=1) -> series.Series: window = core.WindowSpec( - grouping_keys=self._by_col_ids, + grouping_keys=tuple(self._by_col_ids), preceding=periods if periods > 0 else None, following=-periods if periods < 0 else None, ) @@ -210,7 +210,7 @@ def diff(self, periods=1) -> series.Series: def rolling(self, window: int, min_periods=None) -> windows.Window: # To get n size window, need current row and n-1 preceding rows. window_spec = core.WindowSpec( - grouping_keys=self._by_col_ids, + grouping_keys=tuple(self._by_col_ids), preceding=window - 1, following=0, min_periods=min_periods or window, @@ -225,7 +225,7 @@ def rolling(self, window: int, min_periods=None) -> windows.Window: def expanding(self, min_periods: int = 1) -> windows.Window: window_spec = core.WindowSpec( - grouping_keys=self._by_col_ids, + grouping_keys=tuple(self._by_col_ids), following=0, min_periods=min_periods, ) @@ -389,7 +389,7 @@ def _apply_window_op( ): """Apply window op to groupby. Defaults to grouped cumulative window.""" window_spec = window or core.WindowSpec( - grouping_keys=self._by_col_ids, following=0 + grouping_keys=tuple(self._by_col_ids), following=0 ) columns = self._aggregated_columns(numeric_only=numeric_only) block, result_ids = self._block.multi_apply_window_op( @@ -528,7 +528,7 @@ def cumcount(self, *args, **kwargs) -> series.Series: def shift(self, periods=1) -> series.Series: """Shift index by desired number of periods.""" window = core.WindowSpec( - grouping_keys=self._by_col_ids, + grouping_keys=tuple(self._by_col_ids), preceding=periods if periods > 0 else None, following=-periods if periods < 0 else None, ) @@ -536,7 +536,7 @@ def shift(self, periods=1) -> series.Series: def diff(self, periods=1) -> series.Series: window = core.WindowSpec( - grouping_keys=self._by_col_ids, + grouping_keys=tuple(self._by_col_ids), preceding=periods if periods > 0 else None, following=-periods if periods < 0 else None, ) @@ -545,7 +545,7 @@ def diff(self, periods=1) -> series.Series: def rolling(self, window: int, min_periods=None) -> windows.Window: # To get n size window, need current row and n-1 preceding rows. window_spec = core.WindowSpec( - grouping_keys=self._by_col_ids, + grouping_keys=tuple(self._by_col_ids), preceding=window - 1, following=0, min_periods=min_periods or window, @@ -564,7 +564,7 @@ def rolling(self, window: int, min_periods=None) -> windows.Window: def expanding(self, min_periods: int = 1) -> windows.Window: window_spec = core.WindowSpec( - grouping_keys=self._by_col_ids, + grouping_keys=tuple(self._by_col_ids), following=0, min_periods=min_periods, ) @@ -597,7 +597,7 @@ def _apply_window_op( ): """Apply window op to groupby. 
Defaults to grouped cumulative window.""" window_spec = window or core.WindowSpec( - grouping_keys=self._by_col_ids, following=0 + grouping_keys=tuple(self._by_col_ids), following=0 ) label = self._value_name if not discard_name else None diff --git a/bigframes/core/indexers.py b/bigframes/core/indexers.py index 4f5a9471b9..f6ce084714 100644 --- a/bigframes/core/indexers.py +++ b/bigframes/core/indexers.py @@ -310,7 +310,9 @@ def _loc_getitem_series_or_dataframe( index_name = temporary_index_names[i] values = [entry[i] for entry in key] index_cols_dict[index_name] = values - keys_df = bigframes.dataframe.DataFrame(index_cols_dict) + keys_df = bigframes.dataframe.DataFrame( + index_cols_dict, session=series_or_dataframe._get_block().expr.session + ) keys_df = keys_df.set_index(temporary_index_names, drop=True) keys_df = keys_df.rename_axis(original_index_names) else: @@ -320,7 +322,10 @@ def _loc_getitem_series_or_dataframe( index_name_is_none = index_name is None if index_name_is_none: index_name = "unnamed_col" - keys_df = bigframes.dataframe.DataFrame({index_name: key}) + keys_df = bigframes.dataframe.DataFrame( + {index_name: key}, + session=series_or_dataframe._get_block().expr.session, + ) keys_df = keys_df.set_index(index_name, drop=True) if index_name_is_none: keys_df.index.name = None @@ -338,7 +343,7 @@ def _loc_getitem_series_or_dataframe( elif pd.api.types.is_scalar(key): index_name = "unnamed_col" keys_df = bigframes.dataframe.DataFrame( - {index_name: [key]}, session=series_or_dataframe._get_block().expr._session + {index_name: [key]}, session=series_or_dataframe._get_block().expr.session ) keys_df = keys_df.set_index(index_name, drop=True) keys_df.index.name = None diff --git a/bigframes/core/indexes/index.py b/bigframes/core/indexes/index.py index 677bb8529c..6c66c36062 100644 --- a/bigframes/core/indexes/index.py +++ b/bigframes/core/indexes/index.py @@ -26,8 +26,7 @@ import bigframes.core as core import bigframes.core.block_transforms as block_ops import bigframes.core.blocks as blocks -import bigframes.core.joins as joins -import bigframes.core.joins.name_resolution as join_names +import bigframes.core.joins as joining import bigframes.core.ordering as order import bigframes.core.utils as utils import bigframes.dtypes @@ -399,9 +398,10 @@ def to_pandas(self) -> pandas.Index: """Executes deferred operations and downloads the results.""" # Project down to only the index column. So the query can be cached to visualize other data. 
index_columns = list(self._block.index_columns) + dtypes = dict(zip(index_columns, self.dtypes)) expr = self._expr.select_columns(index_columns) results, _ = expr.start_query() - df = expr._session._rows_to_dataframe(results) + df = expr.session._rows_to_dataframe(results, dtypes) df = df.set_index(index_columns) index = df.index index.names = list(self._block._index_labels) @@ -460,11 +460,10 @@ def join_mono_indexed( ) -> Tuple[IndexValue, Tuple[Mapping[str, str], Mapping[str, str]],]: left_expr = left._block.expr right_expr = right._block.expr - get_column_left, get_column_right = join_names.JOIN_NAME_REMAPPER( + get_column_left, get_column_right = joining.JOIN_NAME_REMAPPER( left_expr.column_ids, right_expr.column_ids ) - combined_expr = joins.join_by_column( - left._block.expr, + combined_expr = left._block.expr.join( left._block.index_columns, right._block.expr, right._block.index_columns, @@ -519,12 +518,11 @@ def join_multi_indexed( left_expr = left._block.expr right_expr = right._block.expr - get_column_left, get_column_right = join_names.JOIN_NAME_REMAPPER( + get_column_left, get_column_right = joining.JOIN_NAME_REMAPPER( left_expr.column_ids, right_expr.column_ids ) - combined_expr = joins.join_by_column( - left_expr, + combined_expr = left_expr.join( left_join_ids, right_expr, right_join_ids, diff --git a/bigframes/core/joins/__init__.py b/bigframes/core/joins/__init__.py index 3f9447aef0..5d407ec22b 100644 --- a/bigframes/core/joins/__init__.py +++ b/bigframes/core/joins/__init__.py @@ -15,11 +15,6 @@ """Helpers to join ArrayValue objects.""" from bigframes.core.joins.merge import merge -from bigframes.core.joins.row_identity import join_by_row_identity -from bigframes.core.joins.single_column import join_by_column +from bigframes.core.joins.name_resolution import JOIN_NAME_REMAPPER, JoinNameRemapper -__all__ = ( - "join_by_row_identity", - "join_by_column", - "merge", -) +__all__ = ("merge", "JoinNameRemapper", "JOIN_NAME_REMAPPER") diff --git a/bigframes/core/nodes.py b/bigframes/core/nodes.py new file mode 100644 index 0000000000..7b252b164f --- /dev/null +++ b/bigframes/core/nodes.py @@ -0,0 +1,245 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import annotations + +from dataclasses import dataclass, field +import functools +import typing +from typing import Optional, Tuple + +import pandas + +import bigframes.core.guid +from bigframes.core.ordering import OrderingColumnReference +import bigframes.core.window_spec as window +import bigframes.dtypes +import bigframes.operations as ops +import bigframes.operations.aggregations as agg_ops + +if typing.TYPE_CHECKING: + import ibis.expr.types as ibis_types + + import bigframes.core.ordering as orderings + import bigframes.session + + +@dataclass(frozen=True) +class BigFrameNode: + """ + Immutable node for representing 2D typed array as a tree of operators. + + All subclasses must be hashable so as to be usable as caching key. 
+ """ + + @property + def deterministic(self) -> bool: + """Whether this node will evaluates deterministically.""" + return True + + @property + def child_nodes(self) -> typing.Sequence[BigFrameNode]: + """Direct children of this node""" + return tuple([]) + + @functools.cached_property + def session(self): + sessions = [] + for child in self.child_nodes: + if child.session is not None: + sessions.append(child.session) + unique_sessions = len(set(sessions)) + if unique_sessions > 1: + raise ValueError("Cannot use combine sources from multiple sessions.") + elif unique_sessions == 1: + return sessions[0] + return None + + +@dataclass(frozen=True) +class UnaryNode(BigFrameNode): + child: BigFrameNode + + @property + def child_nodes(self) -> typing.Sequence[BigFrameNode]: + return (self.child,) + + +@dataclass(frozen=True) +class JoinNode(BigFrameNode): + left_child: BigFrameNode + right_child: BigFrameNode + left_column_ids: typing.Tuple[str, ...] + right_column_ids: typing.Tuple[str, ...] + how: typing.Literal[ + "inner", + "left", + "outer", + "right", + ] + allow_row_identity_join: bool = True + + @property + def child_nodes(self) -> typing.Sequence[BigFrameNode]: + return (self.left_child, self.right_child) + + +@dataclass(frozen=True) +class ConcatNode(BigFrameNode): + children: Tuple[BigFrameNode, ...] + + @property + def child_nodes(self) -> typing.Sequence[BigFrameNode]: + return self.children + + +# Input Nodex +@dataclass(frozen=True) +class ReadLocalNode(BigFrameNode): + feather_bytes: bytes + column_ids: typing.Tuple[str, ...] + + +# TODO: Refactor to take raw gbq object reference +@dataclass(frozen=True) +class ReadGbqNode(BigFrameNode): + table: ibis_types.Table = field() + table_session: bigframes.session.Session = field() + columns: Tuple[ibis_types.Value, ...] = field() + hidden_ordering_columns: Tuple[ibis_types.Value, ...] = field() + ordering: orderings.ExpressionOrdering = field() + + @property + def session(self): + return (self.table_session,) + + +# Unary nodes +@dataclass(frozen=True) +class DropColumnsNode(UnaryNode): + columns: Tuple[str, ...] + + +@dataclass(frozen=True) +class PromoteOffsetsNode(UnaryNode): + col_id: str + + +@dataclass(frozen=True) +class FilterNode(UnaryNode): + predicate_id: str + keep_null: bool = False + + +@dataclass(frozen=True) +class OrderByNode(UnaryNode): + by: Tuple[OrderingColumnReference, ...] + stable: bool = False + + +@dataclass(frozen=True) +class ReversedNode(UnaryNode): + pass + + +@dataclass(frozen=True) +class SelectNode(UnaryNode): + column_ids: typing.Tuple[str, ...] + + +@dataclass(frozen=True) +class ProjectUnaryOpNode(UnaryNode): + input_id: str + op: ops.UnaryOp + output_id: Optional[str] = None + + +@dataclass(frozen=True) +class ProjectBinaryOpNode(UnaryNode): + left_input_id: str + right_input_id: str + op: ops.BinaryOp + output_id: str + + +@dataclass(frozen=True) +class ProjectTernaryOpNode(UnaryNode): + input_id1: str + input_id2: str + input_id3: str + op: ops.TernaryOp + output_id: str + + +@dataclass(frozen=True) +class AggregateNode(UnaryNode): + aggregations: typing.Tuple[typing.Tuple[str, agg_ops.AggregateOp, str], ...] + by_column_ids: typing.Tuple[str, ...] = tuple([]) + dropna: bool = True + + +# TODO: Unify into aggregate +@dataclass(frozen=True) +class CorrNode(UnaryNode): + corr_aggregations: typing.Tuple[typing.Tuple[str, str, str], ...] 
+ + +@dataclass(frozen=True) +class WindowOpNode(UnaryNode): + column_name: str + op: agg_ops.WindowOp + window_spec: window.WindowSpec + output_name: typing.Optional[str] = None + never_skip_nulls: bool = False + skip_reproject_unsafe: bool = False + + +@dataclass(frozen=True) +class ReprojectOpNode(UnaryNode): + pass + + +@dataclass(frozen=True) +class UnpivotNode(UnaryNode): + row_labels: typing.Tuple[typing.Hashable, ...] + unpivot_columns: typing.Tuple[ + typing.Tuple[str, typing.Tuple[typing.Optional[str], ...]], ... + ] + passthrough_columns: typing.Tuple[str, ...] = () + index_col_ids: typing.Tuple[str, ...] = ("index",) + dtype: typing.Union[ + bigframes.dtypes.Dtype, typing.Tuple[bigframes.dtypes.Dtype, ...] + ] = (pandas.Float64Dtype(),) + how: typing.Literal["left", "right"] = "left" + + +@dataclass(frozen=True) +class AssignNode(UnaryNode): + source_id: str + destination_id: str + + +@dataclass(frozen=True) +class AssignConstantNode(UnaryNode): + destination_id: str + value: typing.Hashable + dtype: typing.Optional[bigframes.dtypes.Dtype] + + +@dataclass(frozen=True) +class RandomSampleNode(UnaryNode): + fraction: float + + @property + def deterministic(self) -> bool: + return False diff --git a/bigframes/core/ordering.py b/bigframes/core/ordering.py index d5f07ecf91..2cecd2fe7b 100644 --- a/bigframes/core/ordering.py +++ b/bigframes/core/ordering.py @@ -86,7 +86,7 @@ class IntegerEncoding: class ExpressionOrdering: """Immutable object that holds information about the ordering of rows in a ArrayValue object.""" - ordering_value_columns: Sequence[OrderingColumnReference] = () + ordering_value_columns: typing.Tuple[OrderingColumnReference, ...] = () integer_encoding: IntegerEncoding = IntegerEncoding(False) string_encoding: StringEncoding = StringEncoding(False) # A table has a total ordering defined by the identities of a set of 1 or more columns. @@ -170,7 +170,7 @@ def with_column_remap(self, mapping: typing.Mapping[str, str]): mapping.get(col_id, col_id) for col_id in self.total_ordering_columns ) return ExpressionOrdering( - new_value_columns, + tuple(new_value_columns), integer_encoding=self.integer_encoding, string_encoding=self.string_encoding, total_ordering_columns=new_total_order, diff --git a/bigframes/core/reshape/__init__.py b/bigframes/core/reshape/__init__.py index 339ce7466a..dc61c3baad 100644 --- a/bigframes/core/reshape/__init__.py +++ b/bigframes/core/reshape/__init__.py @@ -20,6 +20,7 @@ import bigframes.core as core import bigframes.core.utils as utils import bigframes.dataframe +import bigframes.operations as ops import bigframes.operations.aggregations as agg_ops import bigframes.series @@ -118,3 +119,35 @@ def cut( f"Only labels=False is supported in BigQuery DataFrames so far. {constants.FEEDBACK_LINK}" ) return x._apply_window_op(agg_ops.CutOp(bins), window_spec=core.WindowSpec()) + + +def qcut( + x: bigframes.series.Series, + q: typing.Union[int, typing.Sequence[float]], + *, + labels: Optional[bool] = None, + duplicates: typing.Literal["drop", "error"] = "error", +) -> bigframes.series.Series: + if isinstance(q, int) and q <= 0: + raise ValueError("`q` should be a positive integer.") + + if labels is not False: + raise NotImplementedError( + f"Only labels=False is supported in BigQuery DataFrames so far. {constants.FEEDBACK_LINK}" + ) + if duplicates != "drop": + raise NotImplementedError( + f"Only duplicates='drop' is supported in BigQuery DataFrames so far. 
{constants.FEEDBACK_LINK}" + ) + block = x._block + label = block.col_id_to_label[x._value_column] + block, nullity_id = block.apply_unary_op(x._value_column, ops.notnull_op) + block, result = block.apply_window_op( + x._value_column, + agg_ops.QcutOp(q), + window_spec=core.WindowSpec(grouping_keys=(nullity_id,)), + ) + block, result = block.apply_binary_op( + result, nullity_id, ops.partial_arg3(ops.where_op, None), result_label=label + ) + return bigframes.series.Series(block.select_column(result)) diff --git a/bigframes/core/window_spec.py b/bigframes/core/window_spec.py new file mode 100644 index 0000000000..3458bfb1b8 --- /dev/null +++ b/bigframes/core/window_spec.py @@ -0,0 +1,35 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass +import typing + +import bigframes.core.ordering as orderings + + +@dataclass(frozen=True) +class WindowSpec: + """ + Specifies a window over which aggregate and analytic function may be applied. + grouping_keys: set of column ids to group on + preceding: Number of preceding rows in the window + following: Number of preceding rows in the window + ordering: List of columns ids and ordering direction to override base ordering + """ + + grouping_keys: typing.Tuple[str, ...] = tuple() + ordering: typing.Tuple[orderings.OrderingColumnReference, ...] 
= tuple() + preceding: typing.Optional[int] = None + following: typing.Optional[int] = None + min_periods: int = 0 diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 5c0d9b78e1..3369fb4868 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -170,9 +170,7 @@ def __init__( if isinstance(dt, pandas.ArrowDtype) ) ): - self._block = blocks.block_from_local( - pd_dataframe, session or bigframes.pandas.get_global_session() - ) + self._block = blocks.block_from_local(pd_dataframe) elif session: self._block = session.read_pandas(pd_dataframe)._get_block() else: @@ -299,7 +297,7 @@ def values(self) -> numpy.ndarray: @property def _session(self) -> bigframes.Session: - return self._get_block().expr._session + return self._get_block().expr.session def __len__(self): rows, _ = self.shape @@ -893,6 +891,10 @@ def to_pandas( self._set_internal_query_job(query_job) return df.set_axis(self._block.column_labels, axis=1, copy=False) + def to_pandas_batches(self) -> Iterable[pandas.DataFrame]: + """Stream DataFrame results to an iterable of pandas DataFrame""" + return self._block.to_pandas_batches() + def _compute_dry_run(self) -> bigquery.QueryJob: return self._block._compute_dry_run() @@ -1038,22 +1040,7 @@ def reorder_levels(self, order: LevelsType, axis: int | str = 0): raise ValueError("Columns must be a multiindex to reorder levels.") def _resolve_levels(self, level: LevelsType) -> typing.Sequence[str]: - if utils.is_list_like(level): - levels = list(level) - else: - levels = [level] - resolved_level_ids = [] - for level_ref in levels: - if isinstance(level_ref, int): - resolved_level_ids.append(self._block.index_columns[level_ref]) - elif isinstance(level_ref, typing.Hashable): - matching_ids = self._block.index_name_to_col_id.get(level_ref, []) - if len(matching_ids) != 1: - raise ValueError("level name cannot be found or is ambiguous") - resolved_level_ids.append(matching_ids[0]) - else: - raise ValueError(f"Unexpected level: {level_ref}") - return resolved_level_ids + return self._block.resolve_index_level(level) def rename(self, *, columns: Mapping[blocks.Label, blocks.Label]) -> DataFrame: block = self._block.rename(columns=columns) @@ -1118,24 +1105,23 @@ def _assign_single_item( ) local_df = bigframes.dataframe.DataFrame( - {k: v}, session=self._get_block().expr._session + {k: v}, session=self._get_block().expr.session ) # local_df is likely (but not guarunteed) to be cached locally # since the original list came from memory and so is probably < MAX_INLINE_DF_SIZE - this_offsets_col_id = bigframes.core.guid.generate_guid() - this_expr = self._get_block()._expr.promote_offsets(this_offsets_col_id) - block = blocks.Block( - expr=this_expr, - index_labels=self.index.names, - index_columns=self._block.index_columns, - column_labels=[this_offsets_col_id] + list(self._block.value_columns), - ) # offsets are temporarily the first value column, label set to id - this_df_with_offsets = DataFrame(data=block) - join_result = this_df_with_offsets.join( - other=local_df, on=this_offsets_col_id, how="left" + new_column_block = local_df._block + original_index_column_ids = self._block.index_columns + self_block = self._block.reset_index(drop=False) + result_index, (get_column_left, get_column_right) = self_block.index.join( + new_column_block.index, how="left", block_identity_join=True + ) + result_block = result_index._block + result_block = result_block.set_index( + [get_column_left[col_id] for col_id in original_index_column_ids], + index_labels=self._block.index_labels, ) - 
return join_result.drop(columns=[this_offsets_col_id]) + return DataFrame(result_block) else: return self._assign_scalar(k, v) @@ -1687,6 +1673,44 @@ def idxmin(self) -> bigframes.series.Series: def idxmax(self) -> bigframes.series.Series: return bigframes.series.Series(block_ops.idxmax(self._block)) + def melt( + self, + id_vars: typing.Optional[typing.Iterable[typing.Hashable]] = None, + value_vars: typing.Optional[typing.Iterable[typing.Hashable]] = None, + var_name: typing.Union[ + typing.Hashable, typing.Sequence[typing.Hashable] + ] = None, + value_name: typing.Hashable = "value", + ): + if var_name is None: + # Determine default var_name. Attempt to use column labels if they are unique + if self.columns.nlevels > 1: + if len(set(self.columns.names)) == len(self.columns.names): + var_name = self.columns.names + else: + var_name = [f"variable_{i}" for i in range(len(self.columns.names))] + else: + var_name = self.columns.name or "variable" + + var_name = tuple(var_name) if utils.is_list_like(var_name) else (var_name,) + + if id_vars is not None: + id_col_ids = [self._resolve_label_exact(col) for col in id_vars] + else: + id_col_ids = [] + if value_vars is not None: + val_col_ids = [self._resolve_label_exact(col) for col in value_vars] + else: + val_col_ids = [ + col_id + for col_id in self._block.value_columns + if col_id not in id_col_ids + ] + + return DataFrame( + self._block.melt(id_col_ids, val_col_ids, var_name, value_name) + ) + def describe(self) -> DataFrame: df_numeric = self._drop_non_numeric(keep_bool=False) if len(df_numeric.columns) == 0: @@ -1802,20 +1826,25 @@ def _stack_multi(self, level: LevelsType = -1): block = block.stack(levels=len(level)) return DataFrame(block) - def unstack(self): + def unstack(self, level: LevelsType = -1): + if isinstance(level, int) or isinstance(level, str): + level = [level] + block = self._block # Special case, unstack with mono-index transpose into a series if self.index.nlevels == 1: block = block.stack(how="right", levels=self.columns.nlevels) return bigframes.series.Series(block) - # Pivot by last level of index - index_ids = block.index_columns + # Pivot by index levels + unstack_ids = self._resolve_levels(level) block = block.reset_index(drop=False) - block = block.set_index(index_ids[:-1]) + block = block.set_index( + [col for col in self._block.index_columns if col not in unstack_ids] + ) pivot_block = block.pivot( - columns=[index_ids[-1]], + columns=unstack_ids, values=self._block.value_columns, values_in_index=True, ) @@ -2209,7 +2238,7 @@ def to_csv( field_delimiter=sep, header=header, ) - _, query_job = self._block.expr._session._start_query(export_data_statement) + _, query_job = self._block.expr.session._start_query(export_data_statement) self._set_internal_query_job(query_job) def to_json( @@ -2251,7 +2280,7 @@ def to_json( format="JSON", export_options={}, ) - _, query_job = self._block.expr._session._start_query(export_data_statement) + _, query_job = self._block.expr.session._start_query(export_data_statement) self._set_internal_query_job(query_job) def to_gbq( @@ -2280,7 +2309,7 @@ def to_gbq( write_disposition=dispositions[if_exists], destination=bigquery.table.TableReference.from_string( destination_table, - default_project=self._block.expr._session.bqclient.project, + default_project=self._block.expr.session.bqclient.project, ), ) @@ -2327,7 +2356,7 @@ def to_parquet( format="PARQUET", export_options=export_options, ) - _, query_job = self._block.expr._session._start_query(export_data_statement) + _, query_job = 
self._block.expr.session._start_query(export_data_statement) self._set_internal_query_job(query_job) def to_dict( @@ -2470,7 +2499,7 @@ def _run_io_query( """Executes a query job presenting this dataframe and returns the destination table.""" expr = self._block.expr - session = expr._session + session = expr.session sql = self._create_io_query(index=index, ordering_id=ordering_id) _, query_job = session._start_query( sql=sql, job_config=job_config # type: ignore @@ -2678,3 +2707,5 @@ def get_right_id(id): result = result[other.name].rename() return result + + __matmul__ = dot diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index da221a95ac..079f0cc27a 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -169,6 +169,10 @@ def ibis_dtype_to_bigframes_dtype( if isinstance(ibis_dtype, ibis_dtypes.Struct): return pd.ArrowDtype(ibis_dtype_to_arrow_dtype(ibis_dtype)) + # BigQuery only supports integers of size 64 bits. + if isinstance(ibis_dtype, ibis_dtypes.Integer): + return pd.Int64Dtype() + if ibis_dtype in IBIS_TO_BIGFRAMES: return IBIS_TO_BIGFRAMES[ibis_dtype] elif isinstance(ibis_dtype, ibis_dtypes.Null): @@ -372,6 +376,8 @@ def cast_ibis_value( ibis_dtypes.float64: (ibis_dtypes.string, ibis_dtypes.int64), ibis_dtypes.string: (ibis_dtypes.int64, ibis_dtypes.float64), ibis_dtypes.date: (), + ibis_dtypes.Decimal(precision=38, scale=9): (ibis_dtypes.float64,), + ibis_dtypes.Decimal(precision=76, scale=38): (ibis_dtypes.float64,), ibis_dtypes.time: (), ibis_dtypes.timestamp: (ibis_dtypes.Timestamp(timezone="UTC"),), ibis_dtypes.Timestamp(timezone="UTC"): (ibis_dtypes.timestamp,), diff --git a/bigframes/formatting_helpers.py b/bigframes/formatting_helpers.py index 6851bdd2bd..752aeb7a10 100644 --- a/bigframes/formatting_helpers.py +++ b/bigframes/formatting_helpers.py @@ -16,6 +16,7 @@ # TODO(orrbradford): cleanup up typings and documenttion in this file import datetime +import random from typing import Any, Optional, Union import google.api_core.exceptions as api_core_exceptions @@ -57,9 +58,9 @@ def repr_query_job_html(query_job: Optional[bigquery.QueryJob]): Pywidget html table. """ if query_job is None: - return widgets.HTML("No job information available") + return display.HTML("No job information available") if query_job.dry_run: - return widgets.HTML( + return display.HTML( f"Computation deferred. Computation will process {get_formatted_bytes(query_job.total_bytes_processed)}" ) table_html = "" @@ -125,16 +126,20 @@ def wait_for_query_job( Returns: A row iterator over the query results. """ - loading_bar = widgets.HTML(get_query_job_loading_html(query_job)) if progress_bar == "auto": progress_bar = "notebook" if in_ipython() else "terminal" try: if progress_bar == "notebook": - display.display(loading_bar) + display_id = str(random.random()) + loading_bar = display.HTML(get_query_job_loading_html(query_job)) + display.display(loading_bar, display_id=display_id) query_result = query_job.result(max_results=max_results) query_job.reload() - loading_bar.value = get_query_job_loading_html(query_job) + display.update_display( + display.HTML(get_query_job_loading_html(query_job)), + display_id=display_id, + ) elif progress_bar == "terminal": initial_loading_bar = get_query_job_loading_string(query_job) print(initial_loading_bar) @@ -171,16 +176,19 @@ def wait_for_job(job: GenericJob, progress_bar: Optional[str] = None): progress_bar (str, Optional): Which progress bar to show. 
""" - loading_bar = widgets.HTML(get_base_job_loading_html(job)) if progress_bar == "auto": progress_bar = "notebook" if in_ipython() else "terminal" try: if progress_bar == "notebook": - display.display(loading_bar) + display_id = str(random.random()) + loading_bar = display.HTML(get_base_job_loading_html(job)) + display.display(loading_bar, display_id=display_id) job.result() job.reload() - loading_bar.value = get_base_job_loading_html(job) + display.update_display( + display.HTML(get_base_job_loading_html(job)), display_id=display_id + ) elif progress_bar == "terminal": inital_loading_bar = get_base_job_loading_string(job) print(inital_loading_bar) diff --git a/bigframes/ml/ensemble.py b/bigframes/ml/ensemble.py index 113ad872b5..19ca8608ff 100644 --- a/bigframes/ml/ensemble.py +++ b/bigframes/ml/ensemble.py @@ -507,6 +507,12 @@ def score( ): """Calculate evaluation metrics of the model. + .. note:: + + Output matches that of the BigQuery ML.EVALUTE function. + See: https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-evaluate#regression_models + for the outputs relevant to this model type. + Args: X (bigframes.dataframe.DataFrame or bigframes.series.Series): A BigQuery DataFrame as evaluation data. @@ -676,6 +682,12 @@ def score( ): """Calculate evaluation metrics of the model. + .. note:: + + Output matches that of the BigQuery ML.EVALUTE function. + See: https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-evaluate#classification_models + for the outputs relevant to this model type. + Args: X (bigframes.dataframe.DataFrame or bigframes.series.Series): A BigQuery DataFrame as evaluation data. diff --git a/bigframes/ml/forecasting.py b/bigframes/ml/forecasting.py index 8a6de1dd81..8e309d5e73 100644 --- a/bigframes/ml/forecasting.py +++ b/bigframes/ml/forecasting.py @@ -112,6 +112,12 @@ def score( ) -> bpd.DataFrame: """Calculate evaluation metrics of the model. + .. note:: + + Output matches that of the BigQuery ML.EVALUTE function. + See: https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-evaluate#time_series_models + for the outputs relevant to this model type. 
+ Args: X (bigframes.dataframe.DataFrame or bigframes.series.Series): A BigQuery DataFrame only contains 1 column as diff --git a/bigframes/ml/metrics.py b/bigframes/ml/metrics.py index 3bcb621f74..5731b946ca 100644 --- a/bigframes/ml/metrics.py +++ b/bigframes/ml/metrics.py @@ -96,7 +96,7 @@ def roc_curve( y_true_series, y_score_series = utils.convert_to_series(y_true, y_score) - session = y_true_series._block.expr._session + session = y_true_series._block.expr.session # We operate on rows, so, remove the index if there is one # TODO(bmil): check that the indexes are equivalent before removing diff --git a/bigframes/operations/aggregations.py b/bigframes/operations/aggregations.py index 23271e8220..465d188724 100644 --- a/bigframes/operations/aggregations.py +++ b/bigframes/operations/aggregations.py @@ -254,6 +254,53 @@ def handles_ties(self): return True +class QcutOp(WindowOp): + def __init__(self, quantiles: typing.Union[int, typing.Sequence[float]]): + self.name = f"qcut-{quantiles}" + self._quantiles = quantiles + + @numeric_op + def _as_ibis( + self, column: ibis_types.Column, window=None + ) -> ibis_types.IntegerValue: + if isinstance(self._quantiles, int): + quantiles_ibis = dtypes.literal_to_ibis_scalar(self._quantiles) + percent_ranks = typing.cast( + ibis_types.FloatingColumn, + _apply_window_if_present(column.percent_rank(), window), + ) + float_bucket = typing.cast( + ibis_types.FloatingColumn, (percent_ranks * quantiles_ibis) + ) + return float_bucket.ceil().clip(lower=_ibis_num(1)) - _ibis_num(1) + else: + percent_ranks = typing.cast( + ibis_types.FloatingColumn, + _apply_window_if_present(column.percent_rank(), window), + ) + out = ibis.case() + first_ibis_quantile = dtypes.literal_to_ibis_scalar(self._quantiles[0]) + out = out.when(percent_ranks < first_ibis_quantile, None) + for bucket_n in range(len(self._quantiles) - 1): + ibis_quantile = dtypes.literal_to_ibis_scalar( + self._quantiles[bucket_n + 1] + ) + out = out.when( + percent_ranks <= ibis_quantile, + dtypes.literal_to_ibis_scalar(bucket_n, force_dtype=Int64Dtype()), + ) + out = out.else_(None) + return out.end() + + @property + def skips_nulls(self): + return False + + @property + def handles_ties(self): + return True + + class NuniqueOp(AggregateOp): name = "nunique" @@ -491,3 +538,7 @@ def lookup_agg_func(key: str) -> AggregateOp: return _AGGREGATIONS_LOOKUP[key] else: raise ValueError(f"Unrecognize aggregate function: {key}") + + +def _ibis_num(number: float): + return typing.cast(ibis_types.NumericValue, ibis_types.literal(number)) diff --git a/bigframes/operations/base.py b/bigframes/operations/base.py index b9abb2cc03..d33befe4da 100644 --- a/bigframes/operations/base.py +++ b/bigframes/operations/base.py @@ -94,9 +94,7 @@ def __init__( if isinstance(dt, pd.ArrowDtype) ) ): - self._block = blocks.block_from_local( - pd_dataframe, session or bigframes.pandas.get_global_session() - ) + self._block = blocks.block_from_local(pd_dataframe) elif session: self._block = session.read_pandas(pd_dataframe)._get_block() else: diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py index 5c1928e6f0..1c52b103fb 100644 --- a/bigframes/pandas/__init__.py +++ b/bigframes/pandas/__init__.py @@ -45,14 +45,18 @@ ) import bigframes._config as config +import bigframes.constants as constants +import bigframes.core.blocks import bigframes.core.global_session as global_session import bigframes.core.indexes import bigframes.core.reshape import bigframes.dataframe +import bigframes.operations as ops import 
bigframes.series import bigframes.session import bigframes.session.clients import third_party.bigframes_vendored.pandas.core.reshape.concat as vendored_pandas_concat +import third_party.bigframes_vendored.pandas.core.reshape.encoding as vendored_pandas_encoding import third_party.bigframes_vendored.pandas.core.reshape.merge as vendored_pandas_merge import third_party.bigframes_vendored.pandas.core.reshape.tile as vendored_pandas_tile @@ -134,6 +138,192 @@ def cut( cut.__doc__ = vendored_pandas_tile.cut.__doc__ +def get_dummies( + data: Union[DataFrame, Series], + prefix: Union[List, dict, str, None] = None, + prefix_sep: Union[List, dict, str, None] = "_", + dummy_na: bool = False, + columns: Optional[List] = None, + drop_first: bool = False, + dtype: Any = None, +) -> DataFrame: + # simplify input parameters into per-input-label lists + # also raise errors for invalid parameters + column_labels, prefixes, prefix_seps = _standardize_get_dummies_params( + data, prefix, prefix_sep, columns, dtype + ) + + # combine prefixes into per-column-id list + full_columns_prefixes, columns_ids = _determine_get_dummies_columns_from_labels( + data, column_labels, prefix is not None, prefixes, prefix_seps + ) + + # run queries to compute unique values + block = data._block + max_unique_value = ( + bigframes.core.blocks._BQ_MAX_COLUMNS + - len(block.value_columns) + - len(block.index_columns) + - 1 + ) // len(column_labels) + columns_values = [ + block._get_unique_values([col_id], max_unique_value) for col_id in columns_ids + ] + + # for each dummified column, add the content of the output columns via block operations + intermediate_col_ids = [] + for i in range(len(columns_values)): + level = columns_values[i].get_level_values(0).sort_values().dropna() + if drop_first: + level = level[1:] + column_label = full_columns_prefixes[i] + column_id = columns_ids[i] + block, new_intermediate_col_ids = _perform_get_dummies_block_operations( + block, level, column_label, column_id, dummy_na + ) + intermediate_col_ids.extend(new_intermediate_col_ids) + + # drop dummified columns (and the intermediate columns we added) + block = block.drop_columns(columns_ids + intermediate_col_ids) + return DataFrame(block) + + +get_dummies.__doc__ = vendored_pandas_encoding.get_dummies.__doc__ + + +def _standardize_get_dummies_params( + data: Union[DataFrame, Series], + prefix: Union[List, dict, str, None], + prefix_sep: Union[List, dict, str, None], + columns: Optional[List], + dtype: Any, +) -> Tuple[List, List[str], List[str]]: + block = data._block + + if isinstance(data, Series): + columns = [block.column_labels[0]] + if columns is not None and not pandas.api.types.is_list_like(columns): + raise TypeError("Input must be a list-like for parameter `columns`") + if dtype is not None and dtype not in [ + pandas.BooleanDtype, + bool, + "Boolean", + "boolean", + "bool", + ]: + raise NotImplementedError( + f"Only Boolean dtype is currently supported. 
{constants.FEEDBACK_LINK}" + ) + + if columns is None: + default_dummy_types = [pandas.StringDtype, "string[pyarrow]"] + columns = [] + columns_set = set() + for col_id in block.value_columns: + label = block.col_id_to_label[col_id] + if ( + label not in columns_set + and block.expr.get_column_type(col_id) in default_dummy_types + ): + columns.append(label) + columns_set.add(label) + + column_labels: List = typing.cast(List, columns) + + def parse_prefix_kwarg(kwarg, kwarg_name) -> Optional[List[str]]: + if kwarg is None: + return None + if isinstance(kwarg, str): + return [kwarg] * len(column_labels) + if isinstance(kwarg, dict): + return [kwarg[column] for column in column_labels] + kwarg = typing.cast(List, kwarg) + if pandas.api.types.is_list_like(kwarg) and len(kwarg) != len(column_labels): + raise ValueError( + f"Length of '{kwarg_name}' ({len(kwarg)}) did not match " + f"the length of the columns being encoded ({len(column_labels)})." + ) + if pandas.api.types.is_list_like(kwarg): + return list(map(str, kwarg)) + raise TypeError(f"{kwarg_name} kwarg must be a string, list, or dictionary") + + prefix_seps = parse_prefix_kwarg(prefix_sep or "_", "prefix_sep") + prefix_seps = typing.cast(List, prefix_seps) + prefixes = parse_prefix_kwarg(prefix, "prefix") + if prefixes is None: + prefixes = column_labels + prefixes = typing.cast(List, prefixes) + + return column_labels, prefixes, prefix_seps + + +def _determine_get_dummies_columns_from_labels( + data: Union[DataFrame, Series], + column_labels: List, + prefix_given: bool, + prefixes: List[str], + prefix_seps: List[str], +) -> Tuple[List[str], List[str]]: + block = data._block + + columns_ids = [] + columns_prefixes = [] + for i in range(len(column_labels)): + label = column_labels[i] + empty_prefix = label is None or (isinstance(data, Series) and not prefix_given) + full_prefix = "" if empty_prefix else prefixes[i] + prefix_seps[i] + + for col_id in block.label_to_col_id[label]: + columns_ids.append(col_id) + columns_prefixes.append(full_prefix) + + return columns_prefixes, columns_ids + + +def _perform_get_dummies_block_operations( + block: bigframes.core.blocks.Block, + level: pandas.Index, + column_label: str, + column_id: str, + dummy_na: bool, +) -> Tuple[bigframes.core.blocks.Block, List[str]]: + intermediate_col_ids = [] + for value in level: + new_column_label = f"{column_label}{value}" + if column_label == "": + new_column_label = value + new_block, new_id = block.apply_unary_op( + column_id, ops.BinopPartialLeft(ops.eq_op, value) + ) + intermediate_col_ids.append(new_id) + block, _ = new_block.apply_unary_op( + new_id, + ops.BinopPartialRight(ops.fillna_op, False), + result_label=new_column_label, + ) + if dummy_na: + # dummy column name for na depends on the dtype + na_string = str(pandas.Index([None], dtype=level.dtype)[0]) + new_column_label = f"{column_label}{na_string}" + block, _ = block.apply_unary_op( + column_id, ops.isnull_op, result_label=new_column_label + ) + return block, intermediate_col_ids + + +def qcut( + x: bigframes.series.Series, + q: int, + *, + labels: Optional[bool] = None, + duplicates: typing.Literal["drop", "error"] = "error", +) -> bigframes.series.Series: + return bigframes.core.reshape.qcut(x, q, labels=labels, duplicates=duplicates) + + +qcut.__doc__ = vendored_pandas_tile.qcut.__doc__ + + def merge( left: DataFrame, right: DataFrame, @@ -449,6 +639,9 @@ def read_gbq_function(function_name: str): options = config.options """Global :class:`~bigframes._config.Options` to configure BigQuery 
DataFrames.""" +option_context = config.option_context +"""Global :class:`~bigframes._config.option_context` to configure BigQuery DataFrames.""" + # Session management APIs get_global_session = global_session.get_global_session close_session = global_session.close_session @@ -481,6 +674,7 @@ def read_gbq_function(function_name: str): # Other public pandas attributes "NamedAgg", "options", + "option_context", # Session management APIs "get_global_session", "close_session", diff --git a/bigframes/remote_function.py b/bigframes/remote_function.py index c82ba84056..a39cd033f6 100644 --- a/bigframes/remote_function.py +++ b/bigframes/remote_function.py @@ -53,11 +53,6 @@ from bigframes import clients import bigframes.constants as constants -# TODO(shobs): Change the min log level to INFO after the development stabilizes -# before June 2023 -logging.basicConfig( - level=logging.INFO, format="[%(levelname)s][%(asctime)s][%(name)s] %(message)s" -) logger = logging.getLogger(__name__) # Protocol version 4 is available in python version 3.4 and above diff --git a/bigframes/series.py b/bigframes/series.py index 49df8ab61e..37d00d16f3 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -29,7 +29,6 @@ import bigframes.constants as constants import bigframes.core -from bigframes.core import WindowSpec import bigframes.core.block_transforms as block_ops import bigframes.core.blocks as blocks import bigframes.core.groupby as groupby @@ -43,6 +42,7 @@ import bigframes.core.scalar as scalars import bigframes.core.utils as utils import bigframes.core.window +import bigframes.core.window_spec import bigframes.dataframe import bigframes.dtypes import bigframes.formatting_helpers as formatter @@ -352,22 +352,7 @@ def reorder_levels(self, order: LevelsType, axis: int | str = 0): return Series(self._block.reorder_levels(resolved_level_ids)) def _resolve_levels(self, level: LevelsType) -> typing.Sequence[str]: - if _is_list_like(level): - levels = list(level) - else: - levels = [level] - resolved_level_ids = [] - for level_ref in levels: - if isinstance(level_ref, int): - resolved_level_ids.append(self._block.index_columns[level_ref]) - elif isinstance(level_ref, typing.Hashable): - matching_ids = self._block.index_name_to_col_id.get(level_ref, []) - if len(matching_ids) != 1: - raise ValueError("level name cannot be found or is ambiguous") - resolved_level_ids.append(matching_ids[0]) - else: - raise ValueError(f"Unexpected level: {level_ref}") - return resolved_level_ids + return self._block.resolve_index_level(level) def between(self, left, right, inclusive="both"): if inclusive not in ["both", "neither", "left", "right"]: @@ -382,43 +367,43 @@ def between(self, left, right, inclusive="both"): def cumsum(self) -> Series: return self._apply_window_op( - agg_ops.sum_op, bigframes.core.WindowSpec(following=0) + agg_ops.sum_op, bigframes.core.window_spec.WindowSpec(following=0) ) def ffill(self, *, limit: typing.Optional[int] = None) -> Series: - window = bigframes.core.WindowSpec(preceding=limit, following=0) + window = bigframes.core.window_spec.WindowSpec(preceding=limit, following=0) return self._apply_window_op(agg_ops.LastNonNullOp(), window) pad = ffill def bfill(self, *, limit: typing.Optional[int] = None) -> Series: - window = bigframes.core.WindowSpec(preceding=0, following=limit) + window = bigframes.core.window_spec.WindowSpec(preceding=0, following=limit) return self._apply_window_op(agg_ops.FirstNonNullOp(), window) def cummax(self) -> Series: return self._apply_window_op( - agg_ops.max_op, 
bigframes.core.WindowSpec(following=0) + agg_ops.max_op, bigframes.core.window_spec.WindowSpec(following=0) ) def cummin(self) -> Series: return self._apply_window_op( - agg_ops.min_op, bigframes.core.WindowSpec(following=0) + agg_ops.min_op, bigframes.core.window_spec.WindowSpec(following=0) ) def cumprod(self) -> Series: return self._apply_window_op( - agg_ops.product_op, bigframes.core.WindowSpec(following=0) + agg_ops.product_op, bigframes.core.window_spec.WindowSpec(following=0) ) def shift(self, periods: int = 1) -> Series: - window = bigframes.core.WindowSpec( + window = bigframes.core.window_spec.WindowSpec( preceding=periods if periods > 0 else None, following=-periods if periods < 0 else None, ) return self._apply_window_op(agg_ops.ShiftOp(periods), window) def diff(self, periods: int = 1) -> Series: - window = bigframes.core.WindowSpec( + window = bigframes.core.window_spec.WindowSpec( preceding=periods if periods > 0 else None, following=-periods if periods < 0 else None, ) @@ -820,7 +805,7 @@ def mode(self) -> Series: block, max_value_count_col_id = block.apply_window_op( value_count_col_id, agg_ops.max_op, - window_spec=WindowSpec(), + window_spec=bigframes.core.window_spec.WindowSpec(), ) block, is_mode_col_id = block.apply_binary_op( value_count_col_id, @@ -918,6 +903,29 @@ def argmin(self) -> int: scalars.Scalar, Series(block.select_column(row_nums)).iloc[0] ) + def unstack(self, level: LevelsType = -1): + if isinstance(level, int) or isinstance(level, str): + level = [level] + + block = self._block + + if self.index.nlevels == 1: + raise ValueError("Series must have multi-index to unstack") + + # Pivot by index levels + unstack_ids = self._resolve_levels(level) + block = block.reset_index(drop=False) + block = block.set_index( + [col for col in self._block.index_columns if col not in unstack_ids] + ) + + pivot_block = block.pivot( + columns=unstack_ids, + values=self._block.value_columns, + values_in_index=False, + ) + return bigframes.dataframe.DataFrame(pivot_block) + def idxmax(self) -> blocks.Label: block = self._block.order_by( [ @@ -1001,9 +1009,7 @@ def _apply_aggregation(self, op: agg_ops.AggregateOp) -> Any: return self._block.get_stat(self._value_column, op) def _apply_window_op( - self, - op: agg_ops.WindowOp, - window_spec: bigframes.core.WindowSpec, + self, op: agg_ops.WindowOp, window_spec: bigframes.core.window_spec.WindowSpec ): block = self._block block, result_id = block.apply_window_op( @@ -1062,7 +1068,7 @@ def sort_index(self, *, axis=0, ascending=True, na_position="last") -> Series: def rolling(self, window: int, min_periods=None) -> bigframes.core.window.Window: # To get n size window, need current row and n-1 preceding rows. 
- window_spec = WindowSpec( + window_spec = bigframes.core.window_spec.WindowSpec( preceding=window - 1, following=0, min_periods=min_periods or window ) return bigframes.core.window.Window( @@ -1070,7 +1076,9 @@ def rolling(self, window: int, min_periods=None) -> bigframes.core.window.Window ) def expanding(self, min_periods: int = 1) -> bigframes.core.window.Window: - window_spec = WindowSpec(following=0, min_periods=min_periods) + window_spec = bigframes.core.window_spec.WindowSpec( + following=0, min_periods=min_periods + ) return bigframes.core.window.Window( self._block, window_spec, self._block.value_columns, is_series=True ) @@ -1243,7 +1251,7 @@ def reindex(self, index=None, *, validate: typing.Optional[bool] = None): "Cannot reindex with index with different nlevels" ) new_indexer = bigframes.dataframe.DataFrame( - index=index, session=self._get_block().expr._session + index=index, session=self._get_block().expr.session )[[]] # multiindex join is senstive to index names, so we will set all these result = new_indexer.rename_axis(range(new_indexer.index.nlevels)).join( @@ -1407,7 +1415,7 @@ def map( elif isinstance(arg, Mapping): map_df = bigframes.dataframe.DataFrame( {"keys": list(arg.keys()), self.name: list(arg.values())}, - session=self._get_block().expr._session, + session=self._get_block().expr.session, ) map_df = map_df.set_index("keys") elif callable(arg): diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index db9c5a353c..5a61ed534f 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -68,6 +68,7 @@ import bigframes.core.blocks as blocks import bigframes.core.guid as guid from bigframes.core.ordering import IntegerEncoding, OrderingColumnReference +import bigframes.core.ordering as orderings import bigframes.core.utils as utils import bigframes.dataframe as dataframe import bigframes.formatting_helpers as formatting_helpers @@ -206,6 +207,10 @@ def _session_dataset_id(self): def _project(self): return self.bqclient.project + def __hash__(self): + # Stable hash needed to use in expression tree + return hash(self._session_id) + def _create_and_bind_bq_session(self): """Create a BQ session and bind the session id with clients to capture BQ activities: go/bigframes-transient-data""" @@ -347,7 +352,7 @@ def read_gbq_query( >>> import bigframes.pandas as bpd >>> bpd.options.display.progress_bar = None - Simple query input: + Simple query input: >>> df = bpd.read_gbq_query(''' ... SELECT @@ -363,7 +368,7 @@ def read_gbq_query( [2 rows x 3 columns] - Preserve ordering in a query input. + Preserve ordering in a query input. >>> df = bpd.read_gbq_query(''' ... SELECT @@ -592,11 +597,13 @@ def _read_gbq_table( # primary key(s) are set on a table. The query engine assumes such # columns are unique, even if not enforced. 
is_total_ordering = True - ordering = core.ExpressionOrdering( - ordering_value_columns=[ - core.OrderingColumnReference(column_id) - for column_id in total_ordering_cols - ], + ordering = orderings.ExpressionOrdering( + ordering_value_columns=tuple( + [ + core.OrderingColumnReference(column_id) + for column_id in total_ordering_cols + ] + ), total_ordering_columns=frozenset(total_ordering_cols), ) @@ -634,10 +641,13 @@ def _read_gbq_table( distinct_count = row["distinct_count"] is_total_ordering = total_count == distinct_count - ordering = core.ExpressionOrdering( - ordering_value_columns=[ - core.OrderingColumnReference(column_id) for column_id in index_cols - ], + ordering = orderings.ExpressionOrdering( + ordering_value_columns=tuple( + [ + core.OrderingColumnReference(column_id) + for column_id in index_cols + ] + ), total_ordering_columns=frozenset(index_cols), ) @@ -713,7 +723,7 @@ def _read_gbq_with_ordering( index_cols: Iterable[str] = (), index_labels: Iterable[Optional[str]] = (), hidden_cols: Iterable[str] = (), - ordering: core.ExpressionOrdering, + ordering: orderings.ExpressionOrdering, is_total_ordering: bool = False, api_name: str, ) -> dataframe.DataFrame: @@ -826,7 +836,7 @@ def _read_ibis( index_labels: Iterable[blocks.Label], column_keys: Iterable[str], column_labels: Iterable[blocks.Label], - ordering: core.ExpressionOrdering, + ordering: orderings.ExpressionOrdering, ) -> dataframe.DataFrame: """Turns a table expression (plus index column) into a DataFrame.""" @@ -843,7 +853,7 @@ def _read_ibis( hidden_ordering_columns.append(table_expression[ref.column_id]) block = blocks.Block( - core.ArrayValue( + core.ArrayValue.from_ibis( self, table_expression, columns, hidden_ordering_columns, ordering ), index_columns=[index_col.get_name() for index_col in index_cols], @@ -959,8 +969,8 @@ def _read_pandas( ) self._start_generic_job(load_job) - ordering = core.ExpressionOrdering( - ordering_value_columns=[OrderingColumnReference(ordering_col)], + ordering = orderings.ExpressionOrdering( + ordering_value_columns=tuple([OrderingColumnReference(ordering_col)]), total_ordering_columns=frozenset([ordering_col]), integer_encoding=IntegerEncoding(True, is_sequential=True), ) @@ -1303,7 +1313,7 @@ def _create_sequential_ordering( table: ibis_types.Table, index_cols: Iterable[str] = (), api_name: str = "", - ) -> Tuple[ibis_types.Table, core.ExpressionOrdering]: + ) -> Tuple[ibis_types.Table, orderings.ExpressionOrdering]: # Since this might also be used as the index, don't use the default # "ordering ID" name. 
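
A plain-Python sketch of the hashability distinction that the tuple(...) wrapping of ordering_value_columns (and the new Session.__hash__ above, whose comment asks for a stable hash in the expression tree) appears to rely on; the column ids are hypothetical and the motivation is an assumption:

    # Tuples are immutable and hashable, so they can sit inside hashed
    # expression-tree values; lists cannot.
    ordering_value_columns = ("col_a", "col_b")
    hash(ordering_value_columns)  # works

    try:
        hash(["col_a", "col_b"])
    except TypeError:
        pass  # lists are unhashable, hence the tuple(...) wrapping in this patch
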
default_ordering_name = guid.generate_guid("bigframes_ordering_") @@ -1320,8 +1330,8 @@ def _create_sequential_ordering( f"{table_ref.project}.{table_ref.dataset_id}.{table_ref.table_id}" ) ordering_reference = core.OrderingColumnReference(default_ordering_name) - ordering = core.ExpressionOrdering( - ordering_value_columns=[ordering_reference], + ordering = orderings.ExpressionOrdering( + ordering_value_columns=tuple([ordering_reference]), total_ordering_columns=frozenset([default_ordering_name]), integer_encoding=IntegerEncoding(is_encoded=True, is_sequential=True), ) @@ -1457,13 +1467,13 @@ def read_gbq_function( **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None - >>> function_name = "bqutil.fn.cw_lower_case_ascii_only" - >>> func = bpd.read_gbq_function(function_name=function_name) - >>> func.bigframes_remote_function - 'bqutil.fn.cw_lower_case_ascii_only' + >>> function_name = "bqutil.fn.cw_lower_case_ascii_only" + >>> func = bpd.read_gbq_function(function_name=function_name) + >>> func.bigframes_remote_function + 'bqutil.fn.cw_lower_case_ascii_only' Args: function_name (str): @@ -1494,12 +1504,10 @@ def _start_query( max_results: Optional[int] = None, ) -> Tuple[bigquery.table.RowIterator, bigquery.QueryJob]: """ - Starts query job and waits for results + Starts query job and waits for results. """ - if job_config is not None: - query_job = self.bqclient.query(sql, job_config=job_config) - else: - query_job = self.bqclient.query(sql) + job_config = self._prepare_job_config(job_config) + query_job = self.bqclient.query(sql, job_config=job_config) opts = bigframes.options.display if opts.progress_bar is not None and not query_job.configuration.dry_run: @@ -1515,14 +1523,10 @@ def _get_table_size(self, destination_table): return table.num_bytes def _rows_to_dataframe( - self, row_iterator: bigquery.table.RowIterator + self, row_iterator: bigquery.table.RowIterator, dtypes: Dict ) -> pandas.DataFrame: - return row_iterator.to_dataframe( - bool_dtype=pandas.BooleanDtype(), - int_dtype=pandas.Int64Dtype(), - float_dtype=pandas.Float64Dtype(), - string_dtype=pandas.StringDtype(storage="pyarrow"), - ) + arrow_table = row_iterator.to_arrow() + return bigframes.session._io.pandas.arrow_to_pandas(arrow_table, dtypes) def _start_generic_job(self, job: formatting_helpers.GenericJob): if bigframes.options.display.progress_bar is not None: @@ -1532,6 +1536,17 @@ def _start_generic_job(self, job: formatting_helpers.GenericJob): else: job.result() + def _prepare_job_config( + self, job_config: Optional[bigquery.QueryJobConfig] = None + ) -> bigquery.QueryJobConfig: + if job_config is None: + job_config = self.bqclient.default_query_job_config + if bigframes.options.compute.maximum_bytes_billed is not None: + job_config.maximum_bytes_billed = ( + bigframes.options.compute.maximum_bytes_billed + ) + return job_config + def connect(context: Optional[bigquery_options.BigQueryOptions] = None) -> Session: return Session(context) diff --git a/bigframes/session/_io/pandas.py b/bigframes/session/_io/pandas.py new file mode 100644 index 0000000000..1af00a2d01 --- /dev/null +++ b/bigframes/session/_io/pandas.py @@ -0,0 +1,89 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Dict, Union + +import geopandas # type: ignore +import pandas +import pandas.arrays +import pyarrow # type: ignore +import pyarrow.compute # type: ignore + +import bigframes.constants + + +def arrow_to_pandas( + arrow_table: Union[pyarrow.Table, pyarrow.RecordBatch], dtypes: Dict +): + if len(dtypes) != arrow_table.num_columns: + raise ValueError( + f"Number of types {len(dtypes)} doesn't match number of columns " + f"{arrow_table.num_columns}. {bigframes.constants.FEEDBACK_LINK}" + ) + + serieses = {} + for field, column in zip(arrow_table.schema, arrow_table): + dtype = dtypes[field.name] + + if dtype == geopandas.array.GeometryDtype(): + series = geopandas.GeoSeries.from_wkt( + column, + # BigQuery geography type is based on the WGS84 reference ellipsoid. + crs="EPSG:4326", + ) + elif dtype == pandas.Float64Dtype(): + # Preserve NA/NaN distinction. Note: This is currently needed, even if we use + # nullable Float64Dtype in the types_mapper. See: + # https://github.com/pandas-dev/pandas/issues/55668 + mask = pyarrow.compute.is_null(column) + nonnull = pyarrow.compute.fill_null(column, float("nan")) + # Regarding type: ignore, this class has been public at this + # location since pandas 1.2.0. See: + # https://pandas.pydata.org/docs/dev/reference/api/pandas.arrays.FloatingArray.html + pd_array = pandas.arrays.FloatingArray( # type: ignore + nonnull.to_numpy() + if isinstance(nonnull, pyarrow.ChunkedArray) + else nonnull.to_numpy(zero_copy_only=False), + mask.to_numpy() + if isinstance(mask, pyarrow.ChunkedArray) + else mask.to_numpy(zero_copy_only=False), + ) + series = pandas.Series(pd_array, dtype=dtype) + elif dtype == pandas.Int64Dtype(): + # Avoid out-of-bounds errors in Pandas 1.5.x, which incorrectly + # casts to float64 in an intermediate step. + mask = pyarrow.compute.is_null(column) + nonnull = pyarrow.compute.fill_null(column, 0) + pd_array = pandas.arrays.IntegerArray( + nonnull.to_numpy() + if isinstance(nonnull, pyarrow.ChunkedArray) + else nonnull.to_numpy(zero_copy_only=False), + mask.to_numpy() + if isinstance(mask, pyarrow.ChunkedArray) + else mask.to_numpy(zero_copy_only=False), + ) + series = pandas.Series(pd_array, dtype=dtype) + elif isinstance(dtype, pandas.ArrowDtype): + # Avoid conversion logic if we are backing the pandas Series by the + # arrow array. 
+ series = pandas.Series( + pandas.arrays.ArrowExtensionArray(column), # type: ignore + dtype=dtype, + ) + else: + series = column.to_pandas(types_mapper=lambda _: dtype) + + serieses[field.name] = series + + return pandas.DataFrame(serieses) diff --git a/bigframes/session/clients.py b/bigframes/session/clients.py index 544f74265f..e33413002f 100644 --- a/bigframes/session/clients.py +++ b/bigframes/session/clients.py @@ -28,12 +28,13 @@ import google.cloud.bigquery_storage_v1 import google.cloud.functions_v2 import google.cloud.resourcemanager_v3 +import ibis import pydata_google_auth import bigframes.version _ENV_DEFAULT_PROJECT = "GOOGLE_CLOUD_PROJECT" -_APPLICATION_NAME = f"bigframes/{bigframes.version.__version__}" +_APPLICATION_NAME = f"bigframes/{bigframes.version.__version__} ibis/{ibis.__version__}" _SCOPES = ["https://www.googleapis.com/auth/cloud-platform"] # BigQuery is a REST API, which requires the protocol as part of the URL. diff --git a/bigframes/version.py b/bigframes/version.py index 18edfa5615..b324ed7234 100644 --- a/bigframes/version.py +++ b/bigframes/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "0.11.0" +__version__ = "0.12.0" diff --git a/docs/reference/bigframes/options.rst b/docs/reference/bigframes/options.rst index d831a519fe..991399eb88 100644 --- a/docs/reference/bigframes/options.rst +++ b/docs/reference/bigframes/options.rst @@ -12,3 +12,5 @@ Options and settings .. autoclass:: bigframes._config.display_options.DisplayOptions .. autoclass:: bigframes._config.sampling_options.SamplingOptions + +.. autoclass:: bigframes._config.compute_options.ComputeOptions diff --git a/docs/templates/toc.yml b/docs/templates/toc.yml index 4fe2ec1a6a..9879721d28 100644 --- a/docs/templates/toc.yml +++ b/docs/templates/toc.yml @@ -13,6 +13,8 @@ uid: bigframes._config.display_options.DisplayOptions - name: SamplingOptions uid: bigframes._config.sampling_options.SamplingOptions + - name: ComputeOptions + uid: bigframes._config.compute_options.ComputeOptions name: Options and settings - items: - name: Session diff --git a/noxfile.py b/noxfile.py index 1864da9fe7..d0bbda80fd 100644 --- a/noxfile.py +++ b/noxfile.py @@ -305,8 +305,10 @@ def run_system( "py.test", "--quiet", "-n=20", - # Any individual test taking longer than 10 mins will be terminated. + # Any individual test taking longer than 15 mins will be terminated. "--timeout=900", + # Log 20 slowest tests + "--durations=20", f"--junitxml={prefix_name}_{session.python}_sponge_log.xml", ] if print_duration: diff --git a/samples/snippets/load_data_from_biquery_job_test.py b/samples/snippets/load_data_from_biquery_job_test.py new file mode 100644 index 0000000000..5271574a49 --- /dev/null +++ b/samples/snippets/load_data_from_biquery_job_test.py @@ -0,0 +1,51 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
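
A minimal usage sketch for the arrow_to_pandas helper added in bigframes/session/_io/pandas.py above, with hypothetical input data; dtypes is keyed by column name and must cover every column of the table:

    import pandas
    import pyarrow

    from bigframes.session._io.pandas import arrow_to_pandas

    table = pyarrow.table({"x": [1, None, 3]})
    df = arrow_to_pandas(table, {"x": pandas.Int64Dtype()})
    # Nulls survive as pd.NA in the nullable Int64 column.
    assert str(df["x"].dtype) == "Int64"
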
+ + +def test_bigquery_dataframes_load_data_from_bigquery_job(): + from google.cloud import bigquery + + # Construct a BigQuery client object. + client = bigquery.Client(project="bigframes-dev", location="us") + + query = """ + SELECT * + FROM `bigquery-public-data.ml_datasets.penguins` + LIMIT 20 + """ + query_job = client.query(query) + JOB_ID = query_job.job_id + your_project_id = "bigframes-dev" + + # [START bigquery_dataframes_load_data_from_bigquery_job] + from google.cloud import bigquery + + import bigframes.pandas as bpd + + # Project ID inserted based on the query results selected to explore + project = your_project_id + # Location inserted based on the query results selected to explore + location = "us" + client = bigquery.Client(project=project, location=location) + + # Job ID inserted based on the query results selcted to explore + job_id = JOB_ID + job = client.get_job(job_id) + destination = str(job.destination) + + # Load data from a BigQuery table using BigFrames DataFrames: + bq_df = bpd.read_gbq_table(destination) + + # [END bigquery_dataframes_load_data_from_bigquery_job] + assert bq_df is not None diff --git a/tests/system/conftest.py b/tests/system/conftest.py index cb664302a8..f9f69c6c8e 100644 --- a/tests/system/conftest.py +++ b/tests/system/conftest.py @@ -400,7 +400,11 @@ def hockey_df( hockey_table_id: str, session: bigframes.Session ) -> bigframes.dataframe.DataFrame: """DataFrame pointing at test data.""" - return session.read_gbq(hockey_table_id) + return ( + session.read_gbq(hockey_table_id) + .set_index(["player_name", "season"]) + .sort_index() + ) @pytest.fixture(scope="session") @@ -419,7 +423,7 @@ def hockey_pandas_df() -> pd.DataFrame: "season": pd.Int64Dtype(), }, ) - df.index = df.index.astype("Int64") + df = df.set_index(["player_name", "season"]).sort_index() return df @@ -894,13 +898,6 @@ def usa_names_grouped_table( return session.bqclient.get_table(table_id) -@pytest.fixture() -def deferred_repr(): - bigframes.options.display.repr_mode = "deferred" - yield - bigframes.options.display.repr_mode = "head" - - @pytest.fixture() def restore_sampling_settings(): enable_downsampling = bigframes.options.sampling.enable_downsampling diff --git a/tests/system/large/test_remote_function.py b/tests/system/large/test_remote_function.py index 730a1dbde4..c8f8f66eba 100644 --- a/tests/system/large/test_remote_function.py +++ b/tests/system/large/test_remote_function.py @@ -27,6 +27,7 @@ import pytest import test_utils.prefixer +import bigframes from bigframes.remote_function import ( get_cloud_function_name, get_remote_function_locations, @@ -1120,3 +1121,92 @@ def plusone(x): ) for dir_ in dirs_to_cleanup: shutil.rmtree(dir_) + + +@pytest.mark.flaky(retries=2, delay=120) +def test_remote_function_via_session_context_connection_setter( + scalars_dfs, dataset_id, bq_cf_connection +): + # Creating a session scoped only to this test as we would be setting a + # property in it + context = bigframes.BigQueryOptions() + context.bq_connection = bq_cf_connection + session = bigframes.connect(context) + + try: + # Without an explicit bigquery connection, the one present in Session, + # set via context setter would be used. Without an explicit `reuse` the + # default behavior of reuse=True will take effect. Please note that the + # udf is same as the one used in other tests in this file so the underlying + # cloud function would be common with reuse=True. 
Since we are using a + # unique dataset_id, even though the cloud function would be reused, the bq + # remote function would still be created, making use of the bq connection + # set in the BigQueryOptions above. + @session.remote_function([int], int, dataset=dataset_id) + def square(x): + return x * x + + scalars_df, scalars_pandas_df = scalars_dfs + + bf_int64_col = scalars_df["int64_col"] + bf_int64_col_filter = bf_int64_col.notnull() + bf_int64_col_filtered = bf_int64_col[bf_int64_col_filter] + bf_result_col = bf_int64_col_filtered.apply(square) + bf_result = ( + bf_int64_col_filtered.to_frame().assign(result=bf_result_col).to_pandas() + ) + + pd_int64_col = scalars_pandas_df["int64_col"] + pd_int64_col_filter = pd_int64_col.notnull() + pd_int64_col_filtered = pd_int64_col[pd_int64_col_filter] + pd_result_col = pd_int64_col_filtered.apply(lambda x: x * x) + # TODO(shobs): Figure why pandas .apply() changes the dtype, i.e. + # pd_int64_col_filtered.dtype is Int64Dtype() + # pd_int64_col_filtered.apply(lambda x: x * x).dtype is int64. + # For this test let's force the pandas dtype to be same as bigframes' dtype. + pd_result_col = pd_result_col.astype(pandas.Int64Dtype()) + pd_result = pd_int64_col_filtered.to_frame().assign(result=pd_result_col) + + assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + finally: + # clean up the gcp assets created for the remote function + cleanup_remote_function_assets( + session.bqclient, session.cloudfunctionsclient, square + ) + + +@pytest.mark.flaky(retries=2, delay=120) +def test_remote_function_default_connection(session, scalars_dfs, dataset_id): + try: + + @session.remote_function([int], int, dataset=dataset_id) + def square(x): + return x * x + + scalars_df, scalars_pandas_df = scalars_dfs + + bf_int64_col = scalars_df["int64_col"] + bf_int64_col_filter = bf_int64_col.notnull() + bf_int64_col_filtered = bf_int64_col[bf_int64_col_filter] + bf_result_col = bf_int64_col_filtered.apply(square) + bf_result = ( + bf_int64_col_filtered.to_frame().assign(result=bf_result_col).to_pandas() + ) + + pd_int64_col = scalars_pandas_df["int64_col"] + pd_int64_col_filter = pd_int64_col.notnull() + pd_int64_col_filtered = pd_int64_col[pd_int64_col_filter] + pd_result_col = pd_int64_col_filtered.apply(lambda x: x * x) + # TODO(shobs): Figure why pandas .apply() changes the dtype, i.e. + # pd_int64_col_filtered.dtype is Int64Dtype() + # pd_int64_col_filtered.apply(lambda x: x * x).dtype is int64. + # For this test let's force the pandas dtype to be same as bigframes' dtype. + pd_result_col = pd_result_col.astype(pandas.Int64Dtype()) + pd_result = pd_int64_col_filtered.to_frame().assign(result=pd_result_col) + + assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + finally: + # clean up the gcp assets created for the remote function + cleanup_remote_function_assets( + session.bqclient, session.cloudfunctionsclient, square + ) diff --git a/tests/system/small/ml/test_llm.py b/tests/system/small/ml/test_llm.py index a801c36c83..b7257dde1b 100644 --- a/tests/system/small/ml/test_llm.py +++ b/tests/system/small/ml/test_llm.py @@ -26,9 +26,6 @@ def test_create_text_generator_model(palm2_text_generator_model): assert palm2_text_generator_model._bqml_model is not None -@pytest.mark.skip( - reason="Temporarily disable to validate the hypothesis that LLM capacity is causing the presubmit tests to take long to run." 
-) @pytest.mark.flaky(retries=2, delay=120) def test_create_text_generator_model_default_session(bq_connection, llm_text_pandas_df): import bigframes.pandas as bpd @@ -51,9 +48,6 @@ def test_create_text_generator_model_default_session(bq_connection, llm_text_pan assert all(series.str.len() > 20) -@pytest.mark.skip( - reason="Temporarily disable to validate the hypothesis that LLM capacity is causing the presubmit tests to take long to run." -) @pytest.mark.flaky(retries=2, delay=120) def test_create_text_generator_model_default_connection(llm_text_pandas_df): from bigframes import _config @@ -80,9 +74,6 @@ def test_create_text_generator_model_default_connection(llm_text_pandas_df): # Marked as flaky only because BQML LLM is in preview, the service only has limited capacity, not stable enough. -@pytest.mark.skip( - reason="Temporarily disable to validate the hypothesis that LLM capacity is causing the presubmit tests to take long to run." -) @pytest.mark.flaky(retries=2, delay=120) def test_text_generator_predict_default_params_success( palm2_text_generator_model, llm_text_df @@ -94,9 +85,6 @@ def test_text_generator_predict_default_params_success( assert all(series.str.len() > 20) -@pytest.mark.skip( - reason="Temporarily disable to validate the hypothesis that LLM capacity is causing the presubmit tests to take long to run." -) @pytest.mark.flaky(retries=2, delay=120) def test_text_generator_predict_series_default_params_success( palm2_text_generator_model, llm_text_df @@ -108,9 +96,6 @@ def test_text_generator_predict_series_default_params_success( assert all(series.str.len() > 20) -@pytest.mark.skip( - reason="Temporarily disable to validate the hypothesis that LLM capacity is causing the presubmit tests to take long to run." -) @pytest.mark.flaky(retries=2, delay=120) def test_text_generator_predict_arbitrary_col_label_success( palm2_text_generator_model, llm_text_df @@ -123,9 +108,6 @@ def test_text_generator_predict_arbitrary_col_label_success( assert all(series.str.len() > 20) -@pytest.mark.skip( - reason="Temporarily disable to validate the hypothesis that LLM capacity is causing the presubmit tests to take long to run." -) @pytest.mark.flaky(retries=2, delay=120) def test_text_generator_predict_with_params_success( palm2_text_generator_model, llm_text_df @@ -157,9 +139,6 @@ def test_create_text_embedding_generator_model_defaults(bq_connection): assert model._bqml_model is not None -@pytest.mark.skip( - reason="Temporarily disable to validate the hypothesis that LLM capacity is causing the presubmit tests to take long to run." -) @pytest.mark.flaky(retries=2, delay=120) def test_embedding_generator_predict_success( palm2_embedding_generator_model, llm_text_df @@ -173,9 +152,6 @@ def test_embedding_generator_predict_success( assert value.size == 768 -@pytest.mark.skip( - reason="Temporarily disable to validate the hypothesis that LLM capacity is causing the presubmit tests to take long to run." 
-) @pytest.mark.flaky(retries=2, delay=120) def test_embedding_generator_predict_series_success( palm2_embedding_generator_model, llm_text_df diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 19e50eb06d..c96faa3526 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -505,14 +505,32 @@ def test_assign_new_column_w_setitem_list(scalars_dfs): pd.testing.assert_frame_equal(bf_result, pd_result) +def test_assign_new_column_w_setitem_list_repeated(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_df = scalars_df.copy() + pd_df = scalars_pandas_df.copy() + bf_df["new_col"] = [9, 8, 7, 6, 5, 4, 3, 2, 1] + pd_df["new_col"] = [9, 8, 7, 6, 5, 4, 3, 2, 1] + bf_df["new_col_2"] = [1, 3, 2, 5, 4, 7, 6, 9, 8] + pd_df["new_col_2"] = [1, 3, 2, 5, 4, 7, 6, 9, 8] + bf_result = bf_df.to_pandas() + pd_result = pd_df + + # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes. + pd_result["new_col"] = pd_result["new_col"].astype("Int64") + pd_result["new_col_2"] = pd_result["new_col_2"].astype("Int64") + + pd.testing.assert_frame_equal(bf_result, pd_result) + + def test_assign_new_column_w_setitem_list_custom_index(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs bf_df = scalars_df.copy() pd_df = scalars_pandas_df.copy() # set the custom index - pd_df = pd_df.set_index("string_col") - bf_df = bf_df.set_index("string_col") + pd_df = pd_df.set_index(["string_col", "int64_col"]) + bf_df = bf_df.set_index(["string_col", "int64_col"]) bf_df["new_col"] = [9, 8, 7, 6, 5, 4, 3, 2, 1] pd_df["new_col"] = [9, 8, 7, 6, 5, 4, 3, 2, 1] @@ -1901,6 +1919,49 @@ def test_df_stack(scalars_dfs): pd.testing.assert_series_equal(bf_result, pd_result, check_dtype=False) +def test_df_melt_default(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + # To match bigquery dataframes + scalars_pandas_df = scalars_pandas_df.copy() + scalars_pandas_df.columns = scalars_pandas_df.columns.astype("string[pyarrow]") + # Can only stack identically-typed columns + columns = ["int64_col", "int64_too", "rowindex_2"] + + bf_result = scalars_df[columns].melt().to_pandas() + pd_result = scalars_pandas_df[columns].melt() + + # Pandas produces int64 index, Bigframes produces Int64 (nullable) + pd.testing.assert_frame_equal( + bf_result, pd_result, check_index_type=False, check_dtype=False + ) + + +def test_df_melt_parameterized(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + # To match bigquery dataframes + scalars_pandas_df = scalars_pandas_df.copy() + scalars_pandas_df.columns = scalars_pandas_df.columns.astype("string[pyarrow]") + # Can only stack identically-typed columns + + bf_result = scalars_df.melt( + var_name="alice", + value_name="bob", + id_vars=["string_col"], + value_vars=["int64_col", "int64_too"], + ).to_pandas() + pd_result = scalars_pandas_df.melt( + var_name="alice", + value_name="bob", + id_vars=["string_col"], + value_vars=["int64_col", "int64_too"], + ) + + # Pandas produces int64 index, Bigframes produces Int64 (nullable) + pd.testing.assert_frame_equal( + bf_result, pd_result, check_index_type=False, check_dtype=False + ) + + def test_df_unstack(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs # To match bigquery dataframes @@ -1949,8 +2010,14 @@ def test_df_pivot(scalars_dfs, values, index, columns): ], ) def test_df_pivot_hockey(hockey_df, hockey_pandas_df, values, index, columns): - bf_result = hockey_df.pivot(values=values, index=index, columns=columns).to_pandas() - 
pd_result = hockey_pandas_df.pivot(values=values, index=index, columns=columns) + bf_result = ( + hockey_df.reset_index() + .pivot(values=values, index=index, columns=columns) + .to_pandas() + ) + pd_result = hockey_pandas_df.reset_index().pivot( + values=values, index=index, columns=columns + ) # Pandas produces NaN, where bq dataframes produces pd.NA pd.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False) @@ -2046,16 +2113,6 @@ def test__dir__with_rename(scalars_dfs): def test_iloc_slice(scalars_df_index, scalars_pandas_df_index, start, stop, step): bf_result = scalars_df_index.iloc[start:stop:step].to_pandas() pd_result = scalars_pandas_df_index.iloc[start:stop:step] - - # Pandas may assign non-object dtype to empty series and series index - # dtypes of empty columns are a known area of divergence from pandas - for column in pd_result.columns: - if ( - pd_result[column].empty and column != "geography_col" - ): # for empty geography_col, bigframes assigns non-object dtype - pd_result[column] = pd_result[column].astype("object") - pd_result.index = pd_result.index.astype("object") - pd.testing.assert_frame_equal( bf_result, pd_result, @@ -3207,6 +3264,23 @@ def test_df_dot( ) +def test_df_dot_operator( + matrix_2by3_df, matrix_2by3_pandas_df, matrix_3by4_df, matrix_3by4_pandas_df +): + bf_result = (matrix_2by3_df @ matrix_3by4_df).to_pandas() + pd_result = matrix_2by3_pandas_df @ matrix_3by4_pandas_df + + # Patch pandas dtypes for testing parity + # Pandas result is object instead of Int64 (nullable) dtype. + for name in pd_result.columns: + pd_result[name] = pd_result[name].astype(pd.Int64Dtype()) + + pd.testing.assert_frame_equal( + bf_result, + pd_result, + ) + + def test_df_dot_series( matrix_2by3_df, matrix_2by3_pandas_df, matrix_3by4_df, matrix_3by4_pandas_df ): @@ -3221,3 +3295,19 @@ def test_df_dot_series( bf_result, pd_result, ) + + +def test_df_dot_operator_series( + matrix_2by3_df, matrix_2by3_pandas_df, matrix_3by4_df, matrix_3by4_pandas_df +): + bf_result = (matrix_2by3_df @ matrix_3by4_df["x"]).to_pandas() + pd_result = matrix_2by3_pandas_df @ matrix_3by4_pandas_df["x"] + + # Patch pandas dtypes for testing parity + # Pandas result is object instead of Int64 (nullable) dtype. 
+ pd_result = pd_result.astype(pd.Int64Dtype()) + + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) diff --git a/tests/system/small/test_dataframe_io.py b/tests/system/small/test_dataframe_io.py index d60083a837..8f5d706f62 100644 --- a/tests/system/small/test_dataframe_io.py +++ b/tests/system/small/test_dataframe_io.py @@ -83,6 +83,14 @@ def test_to_pandas_array_struct_correct_result(session): ) +def test_to_pandas_batches_w_correct_dtypes(scalars_df_default_index): + """Verify to_pandas_batches() APIs returns the expected dtypes.""" + expected = scalars_df_default_index.dtypes + for df in scalars_df_default_index.to_pandas_batches(): + actual = df.dtypes + pd.testing.assert_series_equal(actual, expected) + + @pytest.mark.parametrize( ("index"), [True, False], diff --git a/tests/system/small/test_multiindex.py b/tests/system/small/test_multiindex.py index b5c78de69c..bc35f633fd 100644 --- a/tests/system/small/test_multiindex.py +++ b/tests/system/small/test_multiindex.py @@ -752,6 +752,34 @@ def test_column_multi_index_stack(level): ) +def test_column_multi_index_melt(): + if pandas.__version__.startswith("1.") or pandas.__version__.startswith("2.0"): + pytest.skip("pandas <2.1 uses different stack implementation") + + level1 = pandas.Index(["b", "a", "b"]) + level2 = pandas.Index(["a", "b", "b"]) + level3 = pandas.Index(["b", "b", "a"]) + + multi_columns = pandas.MultiIndex.from_arrays( + [level1, level2, level3], names=["l1", "l2", "l3"] + ) + pd_df = pandas.DataFrame( + [[1, 2, 3], [4, 5, 6], [7, 8, 9]], + index=[5, 2, None], + columns=multi_columns, + dtype="Int64", + ) + bf_df = bpd.DataFrame(pd_df) + + bf_result = bf_df.melt().to_pandas() + pd_result = pd_df.melt() + + # BigFrames uses different string and int types, but values are identical + pandas.testing.assert_frame_equal( + bf_result, pd_result, check_index_type=False, check_dtype=False + ) + + def test_column_multi_index_unstack(scalars_df_index, scalars_pandas_df_index): columns = ["int64_too", "int64_col", "rowindex_2"] level1 = pandas.Index(["b", "a", "b"], dtype="string[pyarrow]") @@ -909,13 +937,36 @@ def test_column_multi_index_reorder_levels(scalars_df_index, scalars_pandas_df_i pandas.testing.assert_frame_equal(bf_result, pd_result) -def test_multi_index_unstack(hockey_df, hockey_pandas_df): +@pytest.mark.parametrize( + ("level",), + [(["position", "team_name"],), ([-2, -1],), (["position"],), ("season",), (-3,)], +) +def test_df_multi_index_unstack(hockey_df, hockey_pandas_df, level): bf_result = ( - hockey_df.set_index(["team_name", "season", "position"]).unstack().to_pandas() + hockey_df.set_index(["team_name", "position"], append=True) + .unstack(level=level) + .to_pandas() ) pd_result = hockey_pandas_df.set_index( - ["team_name", "season", "position"] - ).unstack() + ["team_name", "position"], append=True + ).unstack(level=level) + + pandas.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False) + + +@pytest.mark.parametrize( + ("level",), + [(["position", "team_name"],), ([-2, -1],), (["position"],), ("season",), (-3,)], +) +def test_series_multi_index_unstack(hockey_df, hockey_pandas_df, level): + bf_result = ( + hockey_df.set_index(["team_name", "position"], append=True)["number"] + .unstack(level=level) + .to_pandas() + ) + pd_result = hockey_pandas_df.set_index(["team_name", "position"], append=True)[ + "number" + ].unstack(level=level) pandas.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False) @@ -947,6 +998,9 @@ def test_df_multi_index_dot_not_supported(): with 
pytest.raises(NotImplementedError, match="Multi-index input is not supported"): bf1.dot(bf2) + with pytest.raises(NotImplementedError, match="Multi-index input is not supported"): + bf1 @ bf2 + # right multi-index right_index = pandas.MultiIndex.from_tuples([("a", "aa"), ("a", "ab"), ("b", "bb")]) bf1 = bpd.DataFrame(left_matrix) @@ -954,6 +1008,9 @@ def test_df_multi_index_dot_not_supported(): with pytest.raises(NotImplementedError, match="Multi-index input is not supported"): bf1.dot(bf2) + with pytest.raises(NotImplementedError, match="Multi-index input is not supported"): + bf1 @ bf2 + def test_column_multi_index_dot_not_supported(): left_matrix = [[1, 2, 3], [2, 5, 7]] @@ -971,6 +1028,11 @@ def test_column_multi_index_dot_not_supported(): ): bf1.dot(bf2) + with pytest.raises( + NotImplementedError, match="Multi-level column input is not supported" + ): + bf1 @ bf2 + # right multi-columns bf1 = bpd.DataFrame(left_matrix) bf2 = bpd.DataFrame(right_matrix, columns=multi_level_columns) @@ -978,3 +1040,8 @@ def test_column_multi_index_dot_not_supported(): NotImplementedError, match="Multi-level column input is not supported" ): bf1.dot(bf2) + + with pytest.raises( + NotImplementedError, match="Multi-level column input is not supported" + ): + bf1 @ bf2 diff --git a/tests/system/small/test_pandas.py b/tests/system/small/test_pandas.py index a429c6551d..0292ebd206 100644 --- a/tests/system/small/test_pandas.py +++ b/tests/system/small/test_pandas.py @@ -45,6 +45,118 @@ def test_concat_series(scalars_dfs): pd.testing.assert_series_equal(bf_result, pd_result) +@pytest.mark.parametrize( + ("kwargs"), + [ + { + "prefix": ["prefix1", "prefix2"], + "prefix_sep": "_", + "dummy_na": None, + "columns": ["bool_col", "int64_col"], + "drop_first": False, + }, + { + "prefix": "prefix", + "prefix_sep": ["_", ","], + "dummy_na": False, + "columns": ["int64_too", "string_col"], + "drop_first": False, + }, + { + "prefix": None, + "prefix_sep": ".", + "dummy_na": True, + "columns": ["time_col", "float64_col"], + "drop_first": True, + }, + ], +) +def test_get_dummies_dataframe(scalars_dfs, kwargs): + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = bpd.get_dummies(scalars_df, **kwargs, dtype=bool) + pd_result = pd.get_dummies(scalars_pandas_df, **kwargs, dtype=bool) + # dtype argument above is needed for pandas v1 only + + # adjust for expected dtype differences + for (column_name, type_name) in zip(pd_result.columns, pd_result.dtypes): + if type_name == "bool": + pd_result[column_name] = pd_result[column_name].astype("boolean") + + pd.testing.assert_frame_equal(bf_result.to_pandas(), pd_result) + + +def test_get_dummies_dataframe_duplicate_labels(scalars_dfs): + if pd.__version__.startswith("1."): + pytest.skip("pandas has different behavior in 1.x") + + scalars_df, scalars_pandas_df = scalars_dfs + + scalars_renamed_df = scalars_df.rename( + columns={"int64_too": "int64_col", "float64_col": None, "string_col": None} + ) + scalars_renamed_pandas_df = scalars_pandas_df.rename( + columns={"int64_too": "int64_col", "float64_col": None, "string_col": None} + ) + + bf_result = bpd.get_dummies( + scalars_renamed_df, columns=["int64_col", None], dtype=bool + ) + pd_result = pd.get_dummies( + scalars_renamed_pandas_df, columns=["int64_col", None], dtype=bool + ) + # dtype argument above is needed for pandas v1 only + + # adjust for expected dtype differences + for (column_name, type_name) in zip(pd_result.columns, pd_result.dtypes): + if type_name == "bool": + pd_result[column_name] = 
pd_result[column_name].astype("boolean") + + pd.testing.assert_frame_equal(bf_result.to_pandas(), pd_result) + + +def test_get_dummies_series(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_series = scalars_df.date_col + pd_series = scalars_pandas_df.date_col + + bf_result = bpd.get_dummies(bf_series, dtype=bool) + pd_result = pd.get_dummies(pd_series, dtype=bool) + # dtype argument above is needed for pandas v1 only + + # adjust for expected dtype differences + for (column_name, type_name) in zip(pd_result.columns, pd_result.dtypes): + if type_name == "bool": + pd_result[column_name] = pd_result[column_name].astype("boolean") + pd_result.columns = pd_result.columns.astype(object) + + pd.testing.assert_frame_equal( + bf_result.to_pandas(), + pd_result, + ) + + +def test_get_dummies_series_nameless(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_series = scalars_df.date_col.rename(None) + pd_series = scalars_pandas_df.date_col.rename(None) + + bf_result = bpd.get_dummies(bf_series, dtype=bool) + pd_result = pd.get_dummies(pd_series, dtype=bool) + # dtype argument above is needed for pandas v1 only + + # adjust for expected dtype differences + for (column_name, type_name) in zip(pd_result.columns, pd_result.dtypes): + if type_name == "bool": + pd_result[column_name] = pd_result[column_name].astype("boolean") + pd_result.columns = pd_result.columns.astype(object) + + pd.testing.assert_frame_equal( + bf_result.to_pandas(), + pd_result, + ) + + @pytest.mark.parametrize( ("how"), [ @@ -223,3 +335,28 @@ def test_cut(scalars_dfs): bf_result = bf_result.to_pandas() pd_result = pd_result.astype("Int64") pd.testing.assert_series_equal(bf_result, pd_result) + + +@pytest.mark.parametrize( + ("q",), + [ + (1,), + (2,), + (7,), + (32,), + ([0, 0.1, 0.3, 0.4, 0.9, 1.0],), + ([0.5, 0.9],), + ], +) +def test_qcut(scalars_dfs, q): + scalars_df, scalars_pandas_df = scalars_dfs + + pd_result = pd.qcut( + scalars_pandas_df["float64_col"], q, labels=False, duplicates="drop" + ) + bf_result = bpd.qcut(scalars_df["float64_col"], q, labels=False, duplicates="drop") + + bf_result = bf_result.to_pandas() + pd_result = pd_result.astype("Int64") + + pd.testing.assert_series_equal(bf_result, pd_result) diff --git a/tests/system/small/test_progress_bar.py b/tests/system/small/test_progress_bar.py index 00380c2639..30ea63b483 100644 --- a/tests/system/small/test_progress_bar.py +++ b/tests/system/small/test_progress_bar.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import re import tempfile import pandas as pd @@ -19,95 +20,85 @@ import bigframes as bf import bigframes.formatting_helpers as formatting_helpers +job_load_message_regex = r"\w+ job [\w-]+ is \w+\." 
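
The kind of line the job_load_message_regex pattern above is meant to match; the literal message text here is illustrative only, not taken from the library:

    import re

    assert re.match(r"\w+ job [\w-]+ is \w+\.", "Query job abc123-def is DONE.")
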
+ def test_progress_bar_dataframe( penguins_df_default_index: bf.dataframe.DataFrame, capsys ): - bf.options.display.progress_bar = "notebook" + bf.options.display.progress_bar = "terminal" + capsys.readouterr() # clear output penguins_df_default_index.to_pandas() - html_check = "HTML(value=" - open_job_check = "Open Job" - lines = capsys.readouterr().out.split("\n") - lines = [line for line in lines if len(line) > 0] - assert len(lines) > 0 + + assert_loading_msg_exist(capsys.readouterr().out) assert penguins_df_default_index.query_job is not None - for line in lines: - assert html_check in line and open_job_check in line def test_progress_bar_series(penguins_df_default_index: bf.dataframe.DataFrame, capsys): - bf.options.display.progress_bar = "notebook" + bf.options.display.progress_bar = "terminal" series = penguins_df_default_index["body_mass_g"].head(10) + capsys.readouterr() # clear output series.to_pandas() - html_check = "HTML(value=" - open_job_check = "Open Job" - lines = capsys.readouterr().out.split("\n") - lines = [line for line in lines if len(line) > 0] - assert len(lines) > 0 + + assert_loading_msg_exist(capsys.readouterr().out) assert series.query_job is not None - for line in lines: - assert html_check in line and open_job_check in line def test_progress_bar_scalar(penguins_df_default_index: bf.dataframe.DataFrame, capsys): - bf.options.display.progress_bar = "notebook" + bf.options.display.progress_bar = "terminal" + capsys.readouterr() # clear output penguins_df_default_index["body_mass_g"].head(10).mean() - html_check = "HTML(value=" - open_job_check = "Open Job" - lines = capsys.readouterr().out.split("\n") - lines = [line for line in lines if len(line) > 0] - assert len(lines) > 0 - for line in lines: - assert html_check in line and open_job_check in line + + assert_loading_msg_exist(capsys.readouterr().out) def test_progress_bar_read_gbq(session: bf.Session, penguins_table_id: str, capsys): - bf.options.display.progress_bar = "notebook" + bf.options.display.progress_bar = "terminal" + capsys.readouterr() # clear output session.read_gbq(penguins_table_id) - html_check = "HTML(value=" - open_job_check = "Open Job" - lines = capsys.readouterr().out.split("\n") - lines = [line for line in lines if len(line) > 0] - assert len(lines) > 0 - for line in lines: - assert html_check in line and open_job_check in line + + assert_loading_msg_exist(capsys.readouterr().out) def test_progress_bar_extract_jobs( penguins_df_default_index: bf.dataframe.DataFrame, gcs_folder, capsys ): - bf.options.display.progress_bar = "notebook" + bf.options.display.progress_bar = "terminal" path = gcs_folder + "test_read_csv_progress_bar*.csv" + capsys.readouterr() # clear output penguins_df_default_index.to_csv(path) - html_check = "HTML(value=" - open_job_check = "Open Job" - lines = capsys.readouterr().out.split("\n") - lines = [line for line in lines if len(line) > 0] - assert len(lines) > 0 - for line in lines: - assert html_check in line and open_job_check in line + + assert_loading_msg_exist(capsys.readouterr().out) def test_progress_bar_load_jobs( session: bf.Session, penguins_pandas_df_default_index: pd.DataFrame, capsys ): - bf.options.display.progress_bar = "notebook" + bf.options.display.progress_bar = "terminal" with tempfile.TemporaryDirectory() as dir: path = dir + "/test_read_csv_progress_bar*.csv" penguins_pandas_df_default_index.to_csv(path, index=False) + capsys.readouterr() # clear output session.read_csv(path) - html_check = "HTML(value=" - open_job_check = "Open Job" - lines 
= capsys.readouterr().out.split("\n") + + assert_loading_msg_exist(capsys.readouterr().out) + + +def assert_loading_msg_exist(capystOut: str, pattern=job_load_message_regex): + numLoadingMsg = 0 + lines = capystOut.split("\n") lines = [line for line in lines if len(line) > 0] + assert len(lines) > 0 for line in lines: - assert html_check in line and open_job_check in line + if re.match(pattern, line) is not None: + numLoadingMsg += 1 + assert numLoadingMsg > 0 def test_query_job_repr_html(penguins_df_default_index: bf.dataframe.DataFrame): - bf.options.display.progress_bar = "notebook" - penguins_df_default_index._block._expr._session.bqclient.default_query_job_config.use_query_cache = ( + bf.options.display.progress_bar = "terminal" + penguins_df_default_index._block._expr.session.bqclient.default_query_job_config.use_query_cache = ( False ) penguins_df_default_index.to_pandas() @@ -126,7 +117,7 @@ def test_query_job_repr_html(penguins_df_default_index: bf.dataframe.DataFrame): def test_query_job_repr(penguins_df_default_index: bf.dataframe.DataFrame): - penguins_df_default_index._block._expr._session.bqclient.default_query_job_config.use_query_cache = ( + penguins_df_default_index._block._expr.session.bqclient.default_query_job_config.use_query_cache = ( False ) penguins_df_default_index.to_pandas() @@ -144,12 +135,11 @@ def test_query_job_repr(penguins_df_default_index: bf.dataframe.DataFrame): assert string in query_job_repr -def test_query_job_dry_run( - penguins_df_default_index: bf.dataframe.DataFrame, capsys, deferred_repr -): - repr(penguins_df_default_index) - repr(penguins_df_default_index["body_mass_g"]) - lines = capsys.readouterr().out.split("\n") - lines = filter(None, lines) - for line in lines: - assert "Computation deferred. Computation will process" in line +def test_query_job_dry_run(penguins_df_default_index: bf.dataframe.DataFrame, capsys): + with bf.option_context("display.repr_mode", "deferred"): + repr(penguins_df_default_index) + repr(penguins_df_default_index["body_mass_g"]) + lines = capsys.readouterr().out.split("\n") + lines = filter(None, lines) + for line in lines: + assert "Computation deferred. Computation will process" in line diff --git a/tests/system/small/test_remote_function.py b/tests/system/small/test_remote_function.py index d024a57ded..89907a53df 100644 --- a/tests/system/small/test_remote_function.py +++ b/tests/system/small/test_remote_function.py @@ -13,14 +13,11 @@ # limitations under the License. 
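
A minimal usage sketch for the newly exported bigframes.option_context, mirroring the call in test_query_job_dry_run above; the previous option value is restored automatically when the block exits:

    import bigframes as bf

    with bf.option_context("display.repr_mode", "deferred"):
        pass  # DataFrame repr defers computation inside this block
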
from google.cloud import bigquery -from ibis.backends.bigquery import datatypes as bq_types -from ibis.expr import datatypes as ibis_types import pandas as pd import pytest import bigframes from bigframes import remote_function as rf -import bigframes.pandas as bpd from tests.system.utils import assert_pandas_df_equal_ignore_ordering @@ -65,45 +62,14 @@ def bq_cf_connection_location_project_mismatched() -> str: @pytest.fixture(scope="module") -def session_with_bq_connection(bq_cf_connection) -> bigframes.Session: - return bigframes.Session(bigframes.BigQueryOptions(bq_connection=bq_cf_connection)) - - -@pytest.fixture(scope="module") -def session_with_bq_connection_location_specified( - bq_cf_connection_location, -) -> bigframes.Session: - return bigframes.Session( - bigframes.BigQueryOptions(bq_connection=bq_cf_connection_location) - ) - - -@pytest.fixture(scope="module") -def session_with_bq_connection_location_mistached( - bq_cf_connection_location_mistached, -) -> bigframes.Session: - return bigframes.Session( - bigframes.BigQueryOptions(bq_connection=bq_cf_connection_location_mistached) - ) - - -@pytest.fixture(scope="module") -def session_with_bq_connection_location_project_specified( - bq_cf_connection_location_project, +def session_with_bq_connection_and_permanent_dataset( + bq_cf_connection, dataset_id_permanent ) -> bigframes.Session: - return bigframes.Session( - bigframes.BigQueryOptions(bq_connection=bq_cf_connection_location_project) + session = bigframes.Session( + bigframes.BigQueryOptions(bq_connection=bq_cf_connection) ) - - -def test_supported_types_correspond(): - # The same types should be representable by the supported Python and BigQuery types. - ibis_types_from_python = {ibis_types.dtype(t) for t in rf.SUPPORTED_IO_PYTHON_TYPES} - ibis_types_from_bigquery = { - bq_types.BigQueryType.to_ibis(tk) for tk in rf.SUPPORTED_IO_BIGQUERY_TYPEKINDS - } - - assert ibis_types_from_python == ibis_types_from_bigquery + session._session_dataset = bigquery.Dataset(dataset_id_permanent) + return session @pytest.mark.flaky(retries=2, delay=120) @@ -311,11 +277,13 @@ def square(x): @pytest.mark.flaky(retries=2, delay=120) -def test_remote_function_direct_session_param(session_with_bq_connection, scalars_dfs): +def test_remote_function_direct_session_param( + session_with_bq_connection_and_permanent_dataset, scalars_dfs +): @rf.remote_function( [int], int, - session=session_with_bq_connection, + session=session_with_bq_connection_and_permanent_dataset, ) def square(x): return x * x @@ -345,7 +313,9 @@ def square(x): @pytest.mark.flaky(retries=2, delay=120) -def test_remote_function_via_session_default(session_with_bq_connection, scalars_dfs): +def test_remote_function_via_session_default( + session_with_bq_connection_and_permanent_dataset, scalars_dfs +): # Session has bigquery connection initialized via context. Without an # explicit dataset the default dataset from the session would be used. # Without an explicit bigquery connection, the one present in Session set @@ -353,7 +323,7 @@ def test_remote_function_via_session_default(session_with_bq_connection, scalars # the default behavior of reuse=True will take effect. Please note that the # udf is same as the one used in other tests in this file so the underlying # cloud function would be common and quickly reused. 
- @session_with_bq_connection.remote_function([int], int) + @session_with_bq_connection_and_permanent_dataset.remote_function([int], int) def square(x): return x * x @@ -421,87 +391,15 @@ def square(x): @pytest.mark.flaky(retries=2, delay=120) -def test_remote_function_via_session_context_connection_setter( - scalars_dfs, dataset_id, bq_cf_connection +def test_dataframe_applymap( + session_with_bq_connection_and_permanent_dataset, scalars_dfs ): - # Creating a session scoped only to this test as we would be setting a - # property in it - context = bigframes.BigQueryOptions() - context.bq_connection = bq_cf_connection - session = bigframes.connect(context) - - # Without an explicit bigquery connection, the one present in Session, - # set via context setter would be used. Without an explicit `reuse` the - # default behavior of reuse=True will take effect. Please note that the - # udf is same as the one used in other tests in this file so the underlying - # cloud function would be common with reuse=True. Since we are using a - # unique dataset_id, even though the cloud function would be reused, the bq - # remote function would still be created, making use of the bq connection - # set in the BigQueryOptions above. - @session.remote_function([int], int, dataset=dataset_id) - def square(x): - return x * x - - scalars_df, scalars_pandas_df = scalars_dfs - - bf_int64_col = scalars_df["int64_col"] - bf_int64_col_filter = bf_int64_col.notnull() - bf_int64_col_filtered = bf_int64_col[bf_int64_col_filter] - bf_result_col = bf_int64_col_filtered.apply(square) - bf_result = ( - bf_int64_col_filtered.to_frame().assign(result=bf_result_col).to_pandas() - ) - - pd_int64_col = scalars_pandas_df["int64_col"] - pd_int64_col_filter = pd_int64_col.notnull() - pd_int64_col_filtered = pd_int64_col[pd_int64_col_filter] - pd_result_col = pd_int64_col_filtered.apply(lambda x: x * x) - # TODO(shobs): Figure why pandas .apply() changes the dtype, i.e. - # pd_int64_col_filtered.dtype is Int64Dtype() - # pd_int64_col_filtered.apply(lambda x: x * x).dtype is int64. - # For this test let's force the pandas dtype to be same as bigframes' dtype. - pd_result_col = pd_result_col.astype(pd.Int64Dtype()) - pd_result = pd_int64_col_filtered.to_frame().assign(result=pd_result_col) - - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) - - -@pytest.mark.flaky(retries=2, delay=120) -def test_remote_function_default_connection(scalars_dfs, dataset_id): - @bpd.remote_function([int], int, dataset=dataset_id) - def square(x): - return x * x - - scalars_df, scalars_pandas_df = scalars_dfs - - bf_int64_col = scalars_df["int64_col"] - bf_int64_col_filter = bf_int64_col.notnull() - bf_int64_col_filtered = bf_int64_col[bf_int64_col_filter] - bf_result_col = bf_int64_col_filtered.apply(square) - bf_result = ( - bf_int64_col_filtered.to_frame().assign(result=bf_result_col).to_pandas() - ) - - pd_int64_col = scalars_pandas_df["int64_col"] - pd_int64_col_filter = pd_int64_col.notnull() - pd_int64_col_filtered = pd_int64_col[pd_int64_col_filter] - pd_result_col = pd_int64_col_filtered.apply(lambda x: x * x) - # TODO(shobs): Figure why pandas .apply() changes the dtype, i.e. - # pd_int64_col_filtered.dtype is Int64Dtype() - # pd_int64_col_filtered.apply(lambda x: x * x).dtype is int64. - # For this test let's force the pandas dtype to be same as bigframes' dtype. 
- pd_result_col = pd_result_col.astype(pd.Int64Dtype()) - pd_result = pd_int64_col_filtered.to_frame().assign(result=pd_result_col) - - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) - - -@pytest.mark.flaky(retries=2, delay=120) -def test_dataframe_applymap(session_with_bq_connection, scalars_dfs): def add_one(x): return x + 1 - remote_add_one = session_with_bq_connection.remote_function([int], int)(add_one) + remote_add_one = session_with_bq_connection_and_permanent_dataset.remote_function( + [int], int + )(add_one) scalars_df, scalars_pandas_df = scalars_dfs int64_cols = ["int64_col", "int64_too"] @@ -524,11 +422,15 @@ def add_one(x): @pytest.mark.flaky(retries=2, delay=120) -def test_dataframe_applymap_na_ignore(session_with_bq_connection, scalars_dfs): +def test_dataframe_applymap_na_ignore( + session_with_bq_connection_and_permanent_dataset, scalars_dfs +): def add_one(x): return x + 1 - remote_add_one = session_with_bq_connection.remote_function([int], int)(add_one) + remote_add_one = session_with_bq_connection_and_permanent_dataset.remote_function( + [int], int + )(add_one) scalars_df, scalars_pandas_df = scalars_dfs int64_cols = ["int64_col", "int64_too"] @@ -549,11 +451,13 @@ def add_one(x): @pytest.mark.flaky(retries=2, delay=120) -def test_series_map(session_with_bq_connection, scalars_dfs): +def test_series_map(session_with_bq_connection_and_permanent_dataset, scalars_dfs): def add_one(x): return x + 1 - remote_add_one = session_with_bq_connection.remote_function([int], int)(add_one) + remote_add_one = session_with_bq_connection_and_permanent_dataset.remote_function( + [int], int + )(add_one) scalars_df, scalars_pandas_df = scalars_dfs @@ -635,7 +539,7 @@ def square1(x): @pytest.mark.flaky(retries=2, delay=120) -def test_read_gbq_function_reads_udfs(bigquery_client, scalars_dfs, dataset_id): +def test_read_gbq_function_reads_udfs(bigquery_client, dataset_id): dataset_ref = bigquery.DatasetReference.from_string(dataset_id) arg = bigquery.RoutineArgument( name="x", diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index bd9edbb1ca..05d8b84185 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -575,7 +575,15 @@ def test_series_int_int_operators_series(scalars_dfs, operator): ) def test_mods(scalars_dfs, col_x, col_y, method): scalars_df, scalars_pandas_df = scalars_dfs - bf_result = getattr(scalars_df[col_x], method)(scalars_df[col_y]).to_pandas() + x_bf = scalars_df[col_x] + y_bf = scalars_df[col_y] + bf_series = getattr(x_bf, method)(y_bf) + # BigQuery's mod functions return [BIG]NUMERIC values unless both arguments are integers. + # https://cloud.google.com/bigquery/docs/reference/standard-sql/mathematical_functions#mod + if x_bf.dtype == pd.Int64Dtype() and y_bf.dtype == pd.Int64Dtype(): + bf_result = bf_series.to_pandas() + else: + bf_result = bf_series.astype("Float64").to_pandas() pd_result = getattr(scalars_pandas_df[col_x], method)(scalars_pandas_df[col_y]) pd.testing.assert_series_equal(pd_result, bf_result) @@ -620,8 +628,20 @@ def test_divmods_series(scalars_dfs, col_x, col_y, method): pd_div_result, pd_mod_result = getattr(scalars_pandas_df[col_x], method)( scalars_pandas_df[col_y] ) - pd.testing.assert_series_equal(pd_div_result, bf_div_result.to_pandas()) - pd.testing.assert_series_equal(pd_mod_result, bf_mod_result.to_pandas()) + # BigQuery's mod functions return NUMERIC values for non-INT64 inputs. 
+ if bf_div_result.dtype == pd.Int64Dtype(): + pd.testing.assert_series_equal(pd_div_result, bf_div_result.to_pandas()) + else: + pd.testing.assert_series_equal( + pd_div_result, bf_div_result.astype("Float64").to_pandas() + ) + + if bf_mod_result.dtype == pd.Int64Dtype(): + pd.testing.assert_series_equal(pd_mod_result, bf_mod_result.to_pandas()) + else: + pd.testing.assert_series_equal( + pd_mod_result, bf_mod_result.astype("Float64").to_pandas() + ) @pytest.mark.parametrize( @@ -649,8 +669,20 @@ def test_divmods_scalars(scalars_dfs, col_x, other, method): scalars_df, scalars_pandas_df = scalars_dfs bf_div_result, bf_mod_result = getattr(scalars_df[col_x], method)(other) pd_div_result, pd_mod_result = getattr(scalars_pandas_df[col_x], method)(other) - pd.testing.assert_series_equal(pd_div_result, bf_div_result.to_pandas()) - pd.testing.assert_series_equal(pd_mod_result, bf_mod_result.to_pandas()) + # BigQuery's mod functions return NUMERIC values for non-INT64 inputs. + if bf_div_result.dtype == pd.Int64Dtype(): + pd.testing.assert_series_equal(pd_div_result, bf_div_result.to_pandas()) + else: + pd.testing.assert_series_equal( + pd_div_result, bf_div_result.astype("Float64").to_pandas() + ) + + if bf_mod_result.dtype == pd.Int64Dtype(): + pd.testing.assert_series_equal(pd_mod_result, bf_mod_result.to_pandas()) + else: + pd.testing.assert_series_equal( + pd_mod_result, bf_mod_result.astype("Float64").to_pandas() + ) @pytest.mark.parametrize( @@ -1941,12 +1973,6 @@ def test_iloc_nested(scalars_df_index, scalars_pandas_df_index): def test_series_iloc(scalars_df_index, scalars_pandas_df_index, start, stop, step): bf_result = scalars_df_index["string_col"].iloc[start:stop:step].to_pandas() pd_result = scalars_pandas_df_index["string_col"].iloc[start:stop:step] - - # Pandas may assign non-object dtype to empty series and series index - if pd_result.empty: - pd_result = pd_result.astype("object") - pd_result.index = pd_result.index.astype("object") - pd.testing.assert_series_equal( bf_result, pd_result, @@ -2832,7 +2858,7 @@ def test_map_series_input(scalars_dfs): pd_map_series = scalars_pandas_df.string_col.iloc[0 : len(new_index)] pd_map_series.index = new_index bf_map_series = series.Series( - pd_map_series, session=scalars_df._get_block().expr._session + pd_map_series, session=scalars_df._get_block().expr.session ) pd_result = scalars_pandas_df.int64_too.map(pd_map_series) @@ -2851,7 +2877,7 @@ def test_map_series_input_duplicates_error(scalars_dfs): pd_map_series = scalars_pandas_df.string_col.iloc[0 : len(new_index)] pd_map_series.index = new_index bf_map_series = series.Series( - pd_map_series, session=scalars_df._get_block().expr._session + pd_map_series, session=scalars_df._get_block().expr.session ) with pytest.raises(pd.errors.InvalidIndexError): diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py index 127a88a760..bf72e444eb 100644 --- a/tests/system/small/test_session.py +++ b/tests/system/small/test_session.py @@ -318,7 +318,6 @@ def test_read_pandas(session, scalars_dfs): _, scalars_pandas_df = scalars_dfs df = session.read_pandas(scalars_pandas_df) - assert df._block._expr._ordering is not None result = df.to_pandas() expected = scalars_pandas_df @@ -350,9 +349,8 @@ def test_read_pandas_rowid_exists_adds_suffix(session, scalars_pandas_df_default pandas_df = scalars_pandas_df_default_index.copy() pandas_df["rowid"] = np.arange(pandas_df.shape[0]) - df = session.read_pandas(pandas_df) - total_order_col = df._block._expr._ordering.total_order_col - 
assert total_order_col and total_order_col.column_id == "rowid_2" + df_roundtrip = session.read_pandas(pandas_df).to_pandas() + pd.testing.assert_frame_equal(df_roundtrip, pandas_df, check_dtype=False) def test_read_pandas_tokyo( @@ -385,7 +383,6 @@ def test_read_csv_gcs_default_engine(session, scalars_dfs, gcs_folder): # Convert default pandas dtypes to match BigQuery DataFrames dtypes. dtype=dtype, ) - assert df._block._expr._ordering is not None # TODO(chelsealin): If we serialize the index, can more easily compare values. pd.testing.assert_index_equal(df.columns, scalars_df.columns) @@ -441,7 +438,6 @@ def test_read_csv_local_default_engine(session, scalars_dfs, sep): # Convert default pandas dtypes to match BigQuery DataFrames dtypes. dtype=dtype, ) - assert df._block._expr._ordering is not None # TODO(chelsealin): If we serialize the index, can more easily compare values. pd.testing.assert_index_equal(df.columns, scalars_df.columns) @@ -976,7 +972,6 @@ def test_read_json_gcs_default_engine(session, scalars_dfs, gcs_folder): orient="records", ) - assert df._block._expr._ordering is not None pd.testing.assert_index_equal(df.columns, scalars_df.columns) # The auto detects of BigQuery load job have restrictions to detect the bytes, diff --git a/tests/unit/core/test_blocks.py b/tests/unit/core/test_blocks.py index a7e9b5a84b..86715d090c 100644 --- a/tests/unit/core/test_blocks.py +++ b/tests/unit/core/test_blocks.py @@ -18,8 +18,6 @@ import bigframes.core.blocks as blocks -from .. import resources - @pytest.mark.parametrize( ("data",), @@ -76,9 +74,8 @@ ) def test_block_from_local(data): expected = pandas.DataFrame(data) - session = resources.create_pandas_session({}) - block = blocks.block_from_local(data, session=session) + block = blocks.block_from_local(data) pandas.testing.assert_index_equal(block.column_labels, expected.columns) assert tuple(block.index_labels) == tuple(expected.index.names) diff --git a/tests/unit/resources.py b/tests/unit/resources.py index 0a68600a35..f660d774f0 100644 --- a/tests/unit/resources.py +++ b/tests/unit/resources.py @@ -22,6 +22,7 @@ import bigframes import bigframes.core as core +import bigframes.core.ordering import bigframes.session.clients """Utilities for creating test resources.""" @@ -61,14 +62,20 @@ def create_pandas_session(tables: Dict[str, pandas.DataFrame]) -> bigframes.Sess def create_arrayvalue( df: pandas.DataFrame, total_ordering_columns: List[str] -) -> bigframes.core.ArrayValue: +) -> core.ArrayValue: session = create_pandas_session({"test_table": df}) ibis_table = session.ibis_client.table("test_table") columns = tuple(ibis_table[key] for key in ibis_table.columns) - ordering = core.ExpressionOrdering( - [core.OrderingColumnReference(column) for column in total_ordering_columns], + ordering = bigframes.core.ordering.ExpressionOrdering( + tuple( + [core.OrderingColumnReference(column) for column in total_ordering_columns] + ), total_ordering_columns=frozenset(total_ordering_columns), ) - return core.ArrayValue( - session=session, table=ibis_table, columns=columns, ordering=ordering + return core.ArrayValue.from_ibis( + session=session, + table=ibis_table, + columns=columns, + hidden_ordering_columns=(), + ordering=ordering, ) diff --git a/tests/unit/session/test_io_pandas.py b/tests/unit/session/test_io_pandas.py new file mode 100644 index 0000000000..0f6f5dae03 --- /dev/null +++ b/tests/unit/session/test_io_pandas.py @@ -0,0 +1,352 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); 
+# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import datetime +from typing import Dict, Union + +import geopandas # type: ignore +import numpy +import pandas +import pandas.arrays +import pandas.testing +import pyarrow # type: ignore +import pytest + +import bigframes.session._io.pandas + + +@pytest.mark.parametrize( + ("arrow_table", "dtypes", "expected"), + ( + pytest.param( + pyarrow.Table.from_pydict({}), + {}, + pandas.DataFrame(), + id="empty-df", + ), + pytest.param( + pyarrow.Table.from_pydict( + { + "bool": pyarrow.array([None, None, None], type=pyarrow.bool_()), + "float": pyarrow.array([None, None, None], type=pyarrow.float64()), + "int": pyarrow.array([None, None, None], type=pyarrow.int64()), + "string": pyarrow.array([None, None, None], type=pyarrow.string()), + "time": pyarrow.array( + [None, None, None], type=pyarrow.time64("us") + ), + } + ), + { + "bool": "boolean", + "float": pandas.Float64Dtype(), + "int": pandas.Int64Dtype(), + "string": "string[pyarrow]", + "time": pandas.ArrowDtype(pyarrow.time64("us")), + }, + pandas.DataFrame( + { + "bool": pandas.Series([None, None, None], dtype="boolean"), + "float": pandas.Series( + pandas.arrays.FloatingArray( # type: ignore + numpy.array( + [float("nan"), float("nan"), float("nan")], + dtype="float64", + ), + numpy.array([True, True, True], dtype="bool"), + ), + dtype=pandas.Float64Dtype(), + ), + "int": pandas.Series( + [None, None, None], + dtype=pandas.Int64Dtype(), + ), + "string": pandas.Series( + [None, None, None], dtype="string[pyarrow]" + ), + "time": pandas.Series( + [ + None, + None, + None, + ], + dtype=pandas.ArrowDtype(pyarrow.time64("us")), + ), + } + ), + id="nulls-df", + ), + pytest.param( + pyarrow.Table.from_pydict( + { + "date": pyarrow.array( + [ + datetime.date(2023, 8, 29), + None, + datetime.date(2024, 4, 9), + datetime.date(1, 1, 1), + ], + type=pyarrow.date32(), + ), + "datetime": pyarrow.array( + [ + datetime.datetime(2023, 8, 29), + None, + datetime.datetime(2024, 4, 9, 23, 59, 59), + datetime.datetime(1, 1, 1, 0, 0, 0, 1), + ], + type=pyarrow.timestamp("us"), + ), + "string": ["123", None, "abc", "xyz"], + "time": pyarrow.array( + [ + datetime.time(0, 0, 0, 1), + datetime.time(12, 0, 0), + None, + datetime.time(23, 59, 59, 999999), + ], + type=pyarrow.time64("us"), + ), + "timestamp": pyarrow.array( + [ + datetime.datetime(2023, 8, 29), + datetime.datetime(1, 1, 1, 0, 0, 0, 1), + None, + datetime.datetime(2024, 4, 9, 23, 59, 59), + ], + type=pyarrow.timestamp("us", datetime.timezone.utc), + ), + } + ), + { + "date": pandas.ArrowDtype(pyarrow.date32()), + "datetime": pandas.ArrowDtype(pyarrow.timestamp("us")), + "string": "string[pyarrow]", + "time": pandas.ArrowDtype(pyarrow.time64("us")), + "timestamp": pandas.ArrowDtype( + pyarrow.timestamp("us", datetime.timezone.utc) + ), + }, + pandas.DataFrame( + { + "date": pandas.Series( + [ + datetime.date(2023, 8, 29), + None, + datetime.date(2024, 4, 9), + datetime.date(1, 1, 1), + ], + dtype=pandas.ArrowDtype(pyarrow.date32()), + ), + "datetime": pandas.Series( + [ + datetime.datetime(2023, 
8, 29), + None, + datetime.datetime(2024, 4, 9, 23, 59, 59), + datetime.datetime(1, 1, 1, 0, 0, 0, 1), + ], + dtype=pandas.ArrowDtype(pyarrow.timestamp("us")), + ), + "string": pandas.Series( + ["123", None, "abc", "xyz"], dtype="string[pyarrow]" + ), + "time": pandas.Series( + [ + datetime.time(0, 0, 0, 1), + datetime.time(12, 0, 0), + None, + datetime.time(23, 59, 59, 999999), + ], + dtype=pandas.ArrowDtype(pyarrow.time64("us")), + ), + "timestamp": pandas.Series( + [ + datetime.datetime(2023, 8, 29), + datetime.datetime(1, 1, 1, 0, 0, 0, 1), + None, + datetime.datetime(2024, 4, 9, 23, 59, 59), + ], + dtype=pandas.ArrowDtype( + pyarrow.timestamp("us", datetime.timezone.utc) + ), + ), + } + ), + id="arrow-dtypes", + ), + pytest.param( + pyarrow.Table.from_pydict( + { + "bool": [True, None, True, False], + "bytes": [b"123", None, b"abc", b"xyz"], + "float": pyarrow.array( + [1.0, None, float("nan"), -1.0], + type=pyarrow.float64(), + ), + "int": pyarrow.array( + [1, None, -1, 2**63 - 1], + type=pyarrow.int64(), + ), + "string": ["123", None, "abc", "xyz"], + } + ), + { + "bool": "boolean", + "bytes": "object", + "float": pandas.Float64Dtype(), + "int": pandas.Int64Dtype(), + "string": "string[pyarrow]", + }, + pandas.DataFrame( + { + "bool": pandas.Series([True, None, True, False], dtype="boolean"), + "bytes": [b"123", None, b"abc", b"xyz"], + "float": pandas.Series( + pandas.arrays.FloatingArray( # type: ignore + numpy.array( + [1.0, float("nan"), float("nan"), -1.0], dtype="float64" + ), + numpy.array([False, True, False, False], dtype="bool"), + ), + dtype=pandas.Float64Dtype(), + ), + "int": pandas.Series( + [1, None, -1, 2**63 - 1], + dtype=pandas.Int64Dtype(), + ), + "string": pandas.Series( + ["123", None, "abc", "xyz"], dtype="string[pyarrow]" + ), + } + ), + id="scalar-dtypes", + ), + pytest.param( + pyarrow.Table.from_pydict( + { + "bool": pyarrow.chunked_array( + [[True, None], [True, False]], + type=pyarrow.bool_(), + ), + "bytes": pyarrow.chunked_array( + [[b"123", None], [b"abc", b"xyz"]], + type=pyarrow.binary(), + ), + "float": pyarrow.chunked_array( + [[1.0, None], [float("nan"), -1.0]], + type=pyarrow.float64(), + ), + "int": pyarrow.chunked_array( + [[1, None], [-1, 2**63 - 1]], + type=pyarrow.int64(), + ), + "string": pyarrow.chunked_array( + [["123", None], ["abc", "xyz"]], + type=pyarrow.string(), + ), + } + ), + { + "bool": "boolean", + "bytes": "object", + "float": pandas.Float64Dtype(), + "int": pandas.Int64Dtype(), + "string": "string[pyarrow]", + }, + pandas.DataFrame( + { + "bool": pandas.Series([True, None, True, False], dtype="boolean"), + "bytes": [b"123", None, b"abc", b"xyz"], + "float": pandas.Series( + pandas.arrays.FloatingArray( # type: ignore + numpy.array( + [1.0, float("nan"), float("nan"), -1.0], dtype="float64" + ), + numpy.array([False, True, False, False], dtype="bool"), + ), + dtype=pandas.Float64Dtype(), + ), + "int": pandas.Series( + [1, None, -1, 2**63 - 1], + dtype=pandas.Int64Dtype(), + ), + "string": pandas.Series( + ["123", None, "abc", "xyz"], dtype="string[pyarrow]" + ), + } + ), + id="scalar-dtypes-chunked_array", + ), + pytest.param( + pyarrow.Table.from_pydict( + { + "geocol": [ + "POINT(32 210)", + None, + "LINESTRING(1 1, 2 1, 3.1 2.88, 3 -3)", + ] + } + ), + {"geocol": geopandas.array.GeometryDtype()}, + pandas.DataFrame( + { + "geocol": geopandas.GeoSeries.from_wkt( + ["POINT(32 210)", None, "LINESTRING(1 1, 2 1, 3.1 2.88, 3 -3)"], + crs="EPSG:4326", + ), + } + ), + id="geography-dtype", + ), + ), +) +def test_arrow_to_pandas( + 
arrow_table: Union[pyarrow.Table, pyarrow.RecordBatch], + dtypes: Dict, + expected: pandas.DataFrame, +): + actual = bigframes.session._io.pandas.arrow_to_pandas(arrow_table, dtypes) + pandas.testing.assert_series_equal(actual.dtypes, expected.dtypes) + + # assert_frame_equal is converting to numpy internally, which causes some + # loss of precision with the extreme values in this test. + for column in actual.columns: + assert tuple( + (index, value) if (value is pandas.NA or value == value) else (index, "nan") + for index, value in actual[column].items() + ) == tuple( + (index, value) if (value is pandas.NA or value == value) else (index, "nan") + for index, value in expected[column].items() + ) + + +@pytest.mark.parametrize( + ("arrow_table", "dtypes"), + ( + pytest.param( + pyarrow.Table.from_pydict({"col1": [1], "col2": [2]}), + {"col1": "Int64"}, + id="too-few-dtypes", + ), + pytest.param( + pyarrow.RecordBatch.from_pydict({"col1": [1]}), + {"col1": "Int64", "col2": "string[pyarrow]"}, + id="too-many-dtypes", + ), + ), +) +def test_arrow_to_pandas_wrong_size_dtypes( + arrow_table: Union[pyarrow.Table, pyarrow.RecordBatch], dtypes: Dict +): + with pytest.raises(ValueError, match=f"Number of types {len(dtypes)}"): + bigframes.session._io.pandas.arrow_to_pandas(arrow_table, dtypes) diff --git a/tests/unit/test_compute_options.py b/tests/unit/test_compute_options.py new file mode 100644 index 0000000000..499a0a5fef --- /dev/null +++ b/tests/unit/test_compute_options.py @@ -0,0 +1,30 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import bigframes as bf + +from . 
import resources + + +def test_maximum_bytes_option(): + session = resources.create_bigquery_session() + num_query_calls = 0 + with bf.option_context("compute.maximum_bytes_billed", 10000): + # clear initial method calls + session.bqclient.method_calls = [] + session._start_query("query") + for call in session.bqclient.method_calls: + _, _, kwargs = call + num_query_calls += 1 + assert kwargs["job_config"].maximum_bytes_billed == 10000 + assert num_query_calls > 0 diff --git a/tests/unit/test_core.py b/tests/unit/test_core.py index 69b9e79807..d9672b2635 100644 --- a/tests/unit/test_core.py +++ b/tests/unit/test_core.py @@ -16,6 +16,7 @@ import pandas import bigframes.core as core +import bigframes.core.ordering import bigframes.operations as ops import bigframes.operations.aggregations as agg_ops @@ -37,15 +38,19 @@ def test_arrayvalue_constructor_from_ibis_table_adds_all_columns(): ) ibis_table = session.ibis_client.table("test_table") columns = (ibis_table["col1"], ibis_table["col2"], ibis_table["col3"]) - ordering = core.ExpressionOrdering( - [core.OrderingColumnReference("col1")], + ordering = bigframes.core.ordering.ExpressionOrdering( + tuple([core.OrderingColumnReference("col1")]), total_ordering_columns=frozenset(["col1"]), ) - actual = core.ArrayValue( - session=session, table=ibis_table, columns=columns, ordering=ordering + actual = core.ArrayValue.from_ibis( + session=session, + table=ibis_table, + columns=columns, + ordering=ordering, + hidden_ordering_columns=(), ) - assert actual._table is ibis_table - assert len(actual.columns) == 3 + assert actual.compile()._table is ibis_table + assert len(actual.column_ids) == 3 def test_arrayvalue_with_get_column_type(): @@ -78,7 +83,7 @@ def test_arrayvalue_with_get_column(): ), total_ordering_columns=["col1"], ) - col1 = value._get_ibis_column("col1") + col1 = value.compile()._get_ibis_column("col1") assert isinstance(col1, ibis_types.Value) assert col1.get_name() == "col1" assert col1.type().is_int64() @@ -95,7 +100,7 @@ def test_arrayvalues_to_ibis_expr_with_get_column(): ), total_ordering_columns=["col1"], ) - expr = value._get_ibis_column("col1") + expr = value.compile()._get_ibis_column("col1") assert expr.get_name() == "col1" assert expr.type().is_int64() @@ -112,7 +117,7 @@ def test_arrayvalues_to_ibis_expr_with_concat(): total_ordering_columns=["col1"], ) expr = value.concat([value]) - actual = expr._to_ibis_expr("unordered") + actual = expr.compile()._to_ibis_expr("unordered") assert len(actual.columns) == 3 # TODO(ashleyxu, b/299631930): test out the union expression assert actual.columns[0] == "column_0" @@ -131,8 +136,8 @@ def test_arrayvalues_to_ibis_expr_with_project_unary_op(): ), total_ordering_columns=["col1"], ) - expr = value.project_unary_op("col1", ops.AsTypeOp("string")) - assert value.columns[0].type().is_int64() + expr = value.project_unary_op("col1", ops.AsTypeOp("string")).compile() + assert value.compile().columns[0].type().is_int64() assert expr.columns[0].type().is_string() @@ -147,7 +152,7 @@ def test_arrayvalues_to_ibis_expr_with_project_binary_op(): ), total_ordering_columns=["col1"], ) - expr = value.project_binary_op("col2", "col3", ops.add_op, "col4") + expr = value.project_binary_op("col2", "col3", ops.add_op, "col4").compile() assert expr.columns[3].type().is_float64() actual = expr._to_ibis_expr("unordered") assert len(expr.columns) == 4 @@ -166,7 +171,9 @@ def test_arrayvalues_to_ibis_expr_with_project_ternary_op(): ), total_ordering_columns=["col1"], ) - expr = value.project_ternary_op("col2", 
"col3", "col4", ops.where_op, "col5") + expr = value.project_ternary_op( + "col2", "col3", "col4", ops.where_op, "col5" + ).compile() assert expr.columns[4].type().is_float64() actual = expr._to_ibis_expr("unordered") assert len(expr.columns) == 5 @@ -188,7 +195,7 @@ def test_arrayvalue_to_ibis_expr_with_aggregate(): aggregations=(("col1", agg_ops.sum_op, "col4"),), by_column_ids=["col1"], dropna=False, - ) + ).compile() actual = expr._to_ibis_expr("unordered") assert len(expr.columns) == 2 assert actual.columns[0] == "col1" @@ -207,7 +214,7 @@ def test_arrayvalue_to_ibis_expr_with_corr_aggregate(): ), total_ordering_columns=["col1"], ) - expr = value.corr_aggregate(corr_aggregations=[("col1", "col3", "col4")]) + expr = value.corr_aggregate(corr_aggregations=[("col1", "col3", "col4")]).compile() actual = expr._to_ibis_expr("unordered") assert len(expr.columns) == 1 assert actual.columns[0] == "col4" diff --git a/tests/unit/test_dtypes.py b/tests/unit/test_dtypes.py index 3baff2e1f5..6ceaaf911b 100644 --- a/tests/unit/test_dtypes.py +++ b/tests/unit/test_dtypes.py @@ -29,41 +29,42 @@ # TODO(bmil): Add ARRAY, INTERVAL, STRUCT to cover all the standard # BigQuery data types as they appear in Ibis: # https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types - (ibis_dtypes.Decimal(precision=76, scale=38, nullable=True), np.dtype("O")), - (ibis_dtypes.boolean, pd.BooleanDtype()), - (ibis_dtypes.binary, np.dtype("O")), - (ibis_dtypes.date, pd.ArrowDtype(pa.date32())), - (ibis_dtypes.Timestamp(), pd.ArrowDtype(pa.timestamp("us"))), - (ibis_dtypes.float64, pd.Float64Dtype()), - ( + pytest.param( + ibis_dtypes.Decimal(precision=76, scale=38, nullable=True), + np.dtype("O"), + id="bignumeric", + ), + pytest.param(ibis_dtypes.boolean, pd.BooleanDtype(), id="bool"), + pytest.param(ibis_dtypes.binary, np.dtype("O"), id="bytes"), + pytest.param(ibis_dtypes.date, pd.ArrowDtype(pa.date32()), id="date"), + pytest.param( + ibis_dtypes.Timestamp(), pd.ArrowDtype(pa.timestamp("us")), id="datetime" + ), + pytest.param(ibis_dtypes.float64, pd.Float64Dtype(), id="float"), + pytest.param( ibis_dtypes.GeoSpatial(geotype="geography", srid=4326, nullable=True), gpd.array.GeometryDtype(), + id="geography", ), - (ibis_dtypes.int64, pd.Int64Dtype()), - (ibis_dtypes.json, np.dtype("O")), - (ibis_dtypes.Decimal(precision=38, scale=9, nullable=True), np.dtype("O")), - (ibis_dtypes.string, pd.StringDtype(storage="pyarrow")), - (ibis_dtypes.time, pd.ArrowDtype(pa.time64("us"))), - ( + pytest.param(ibis_dtypes.int8, pd.Int64Dtype(), id="int8-as-int64"), + pytest.param(ibis_dtypes.int64, pd.Int64Dtype(), id="int64"), + # TODO(tswast): custom dtype (or at least string dtype) for JSON objects + pytest.param(ibis_dtypes.json, np.dtype("O"), id="json"), + pytest.param( + ibis_dtypes.Decimal(precision=38, scale=9, nullable=True), + np.dtype("O"), + id="numeric", + ), + pytest.param( + ibis_dtypes.string, pd.StringDtype(storage="pyarrow"), id="string" + ), + pytest.param(ibis_dtypes.time, pd.ArrowDtype(pa.time64("us")), id="time"), + pytest.param( ibis_dtypes.Timestamp(timezone="UTC"), pd.ArrowDtype(pa.timestamp("us", tz="UTC")), # type: ignore + id="timestamp", ), ], - ids=[ - "bignumeric", - "bool", - "bytes", - "date", - "datetime", - "float", - "geography", - "int64", - "json", - "numeric", - "string", - "time", - "timestamp", - ], ) def test_ibis_dtype_converts(ibis_dtype, bigframes_dtype): """Test all the Ibis data types needed to read BigQuery tables""" diff --git 
a/tests/unit/test_remote_function.py b/tests/unit/test_remote_function.py new file mode 100644 index 0000000000..540f4020d3 --- /dev/null +++ b/tests/unit/test_remote_function.py @@ -0,0 +1,28 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ibis.backends.bigquery import datatypes as bq_types +from ibis.expr import datatypes as ibis_types + +from bigframes import remote_function as rf + + +def test_supported_types_correspond(): + # The same types should be representable by the supported Python and BigQuery types. + ibis_types_from_python = {ibis_types.dtype(t) for t in rf.SUPPORTED_IO_PYTHON_TYPES} + ibis_types_from_bigquery = { + bq_types.BigQueryType.to_ibis(tk) for tk in rf.SUPPORTED_IO_BIGQUERY_TYPEKINDS + } + + assert ibis_types_from_python == ibis_types_from_bigquery diff --git a/third_party/bigframes_vendored/pandas/_config/config.py b/third_party/bigframes_vendored/pandas/_config/config.py new file mode 100644 index 0000000000..8abaca76c7 --- /dev/null +++ b/third_party/bigframes_vendored/pandas/_config/config.py @@ -0,0 +1,45 @@ +# Contains code from https://github.com/pandas-dev/pandas/blob/main/pandas/_config/config.py +import contextlib +import operator + +import bigframes + + +class option_context(contextlib.ContextDecorator): + """ + Context manager to temporarily set options in the `with` statement context. + + You need to invoke as ``option_context(pat, val, [(pat, val), ...])``. + + Examples + -------- + >>> import bigframes + >>> with bigframes.option_context('display.max_rows', 10, 'display.max_columns', 5): + ... pass + """ + + def __init__(self, *args) -> None: + if len(args) % 2 != 0 or len(args) < 2: + raise ValueError( + "Need to invoke as option_context(pat, val, [(pat, val), ...])." + ) + + self.ops = list(zip(args[::2], args[1::2])) + + def __enter__(self) -> None: + self.undo = [ + (pat, operator.attrgetter(pat)(bigframes.options)) for pat, val in self.ops + ] + + for pat, val in self.ops: + self._set_option(pat, val) + + def __exit__(self, *args) -> None: + if self.undo: + for pat, val in self.undo: + self._set_option(pat, val) + + def _set_option(self, pat, val): + root, attr = pat.rsplit(".", 1) + parent = operator.attrgetter(root)(bigframes.options) + setattr(parent, attr, val) diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index 13a81b4645..013d170114 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -697,6 +697,7 @@ def align( Join method is specified for each axis Index. + Args: other (DataFrame or Series): join ({{'outer', 'inner', 'left', 'right'}}, default 'outer'): @@ -978,9 +979,9 @@ def sort_values( Sort ascending vs. descending. Specify list for multiple sort orders. If this is a list of bools, must match the length of the by. - kind (str, default `quicksort`): - Choice of sorting algorithm. 
Accepts 'quicksort’, ‘mergesort’, - ‘heapsort’, ‘stable’. Ignored except when determining whether to + kind (str, default 'quicksort'): + Choice of sorting algorithm. Accepts 'quicksort', 'mergesort', + 'heapsort', 'stable'. Ignored except when determining whether to sort stably. 'mergesort' or 'stable' will result in stable reorder. na_position ({'first', 'last'}, default `last`): ``{'first', 'last'}``, default 'last' Puts NaNs at the beginning @@ -1014,6 +1015,29 @@ def eq(self, other, axis: str | int = "columns") -> DataFrame: Equivalent to `==`, `!=`, `<=`, `<`, `>=`, `>` with support to choose axis (rows or columns) and level for comparison. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + You can use method name: + + >>> df = bpd.DataFrame({'angles': [0, 3, 4], + ... 'degrees': [360, 180, 360]}, + ... index=['circle', 'triangle', 'rectangle']) + >>> df["degrees"].eq(360) + circle True + triangle False + rectangle True + Name: degrees, dtype: boolean + + You can also use arithmetic operator ``==``: + >>> df["degrees"] == 360 + circle True + triangle False + rectangle True + Name: degrees, dtype: boolean + Args: other (scalar, sequence, Series, or DataFrame): Any single or multiple element data structure, or list-like object. @@ -1036,6 +1060,30 @@ def ne(self, other, axis: str | int = "columns") -> DataFrame: Equivalent to `==`, `!=`, `<=`, `<`, `>=`, `>` with support to choose axis (rows or columns) and level for comparison. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + You can use method name: + + >>> df = bpd.DataFrame({'angles': [0, 3, 4], + ... 'degrees': [360, 180, 360]}, + ... index=['circle', 'triangle', 'rectangle']) + >>> df["degrees"].ne(360) + circle False + triangle True + rectangle False + Name: degrees, dtype: boolean + + You can also use arithmetic operator ``!=``: + + >>> df["degrees"] != 360 + circle False + triangle True + rectangle False + Name: degrees, dtype: boolean + Args: other (scalar, sequence, Series, or DataFrame): Any single or multiple element data structure, or list-like object. @@ -1061,6 +1109,30 @@ def le(self, other, axis: str | int = "columns") -> DataFrame: floating point columns are considered different (i.e. `NaN` != `NaN`). + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + You can use method name: + + >>> df = bpd.DataFrame({'angles': [0, 3, 4], + ... 'degrees': [360, 180, 360]}, + ... index=['circle', 'triangle', 'rectangle']) + >>> df["degrees"].le(180) + circle False + triangle True + rectangle False + Name: degrees, dtype: boolean + + You can also use arithmetic operator ``<=``: + + >>> df["degrees"] <= 180 + circle False + triangle True + rectangle False + Name: degrees, dtype: boolean + Args: other (scalar, sequence, Series, or DataFrame): Any single or multiple element data structure, or list-like object. @@ -1087,6 +1159,30 @@ def lt(self, other, axis: str | int = "columns") -> DataFrame: floating point columns are considered different (i.e. `NaN` != `NaN`). + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + You can use method name: + + >>> df = bpd.DataFrame({'angles': [0, 3, 4], + ... 'degrees': [360, 180, 360]}, + ... 
index=['circle', 'triangle', 'rectangle']) + >>> df["degrees"].lt(180) + circle False + triangle False + rectangle False + Name: degrees, dtype: boolean + + You can also use arithmetic operator ``<``: + + >>> df["degrees"] < 180 + circle False + triangle False + rectangle False + Name: degrees, dtype: boolean + Args: other (scalar, sequence, Series, or DataFrame): Any single or multiple element data structure, or list-like object. @@ -1113,6 +1209,30 @@ def ge(self, other, axis: str | int = "columns") -> DataFrame: floating point columns are considered different (i.e. `NaN` != `NaN`). + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + You can use method name: + + >>> df = bpd.DataFrame({'angles': [0, 3, 4], + ... 'degrees': [360, 180, 360]}, + ... index=['circle', 'triangle', 'rectangle']) + >>> df["degrees"].ge(360) + circle True + triangle False + rectangle True + Name: degrees, dtype: boolean + + You can also use arithmetic operator ``>=``: + + >>> df["degrees"] >= 360 + circle True + triangle False + rectangle True + Name: degrees, dtype: boolean + Args: other (scalar, sequence, Series, or DataFrame): Any single or multiple element data structure, or list-like object. @@ -1139,6 +1259,28 @@ def gt(self, other, axis: str | int = "columns") -> DataFrame: floating point columns are considered different (i.e. `NaN` != `NaN`). + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({'angles': [0, 3, 4], + ... 'degrees': [360, 180, 360]}, + ... index=['circle', 'triangle', 'rectangle']) + >>> df["degrees"].gt(360) + circle False + triangle False + rectangle False + Name: degrees, dtype: boolean + + You can also use arithmetic operator ``>``: + + >>> df["degrees"] > 360 + circle False + triangle False + rectangle False + Name: degrees, dtype: boolean + Args: other (scalar, sequence, Series, or DataFrame): Any single or multiple element data structure, or list-like object. @@ -1162,6 +1304,32 @@ def add(self, other, axis: str | int = "columns") -> DataFrame: .. note:: Mismatched indices will be unioned together. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({ + ... 'A': [1, 2, 3], + ... 'B': [4, 5, 6], + ... }) + + You can use method name: + + >>> df['A'].add(df['B']) + 0 5 + 1 7 + 2 9 + dtype: Int64 + + You can also use arithmetic operator ``+``: + + >>> df['A'] + (df['B']) + 0 5 + 1 7 + 2 9 + dtype: Int64 + Args: other (float, int, or Series): Any single or multiple element data structure, or list-like object. @@ -1185,6 +1353,32 @@ def sub(self, other, axis: str | int = "columns") -> DataFrame: .. note:: Mismatched indices will be unioned together. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({ + ... 'A': [1, 2, 3], + ... 'B': [4, 5, 6], + ... }) + + You can use method name: + + >>> df['A'].sub(df['B']) + 0 -3 + 1 -3 + 2 -3 + dtype: Int64 + + You can also use arithmetic operator ``-``: + + >>> df['A'] - (df['B']) + 0 -3 + 1 -3 + 2 -3 + dtype: Int64 + Args: other (float, int, or Series): Any single or multiple element data structure, or list-like object. @@ -1208,6 +1402,29 @@ def rsub(self, other, axis: str | int = "columns") -> DataFrame: .. note:: Mismatched indices will be unioned together. 
+ **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({ + ... 'A': [1, 2, 3], + ... 'B': [4, 5, 6], + ... }) + >>> df['A'].rsub(df['B']) + 0 3 + 1 3 + 2 3 + dtype: Int64 + + It's equivalent to using arithmetic operator: ``-``: + + >>> df['B'] - (df['A']) + 0 3 + 1 3 + 2 3 + dtype: Int64 + Args: other (float, int, or Series): Any single or multiple element data structure, or list-like object. @@ -1231,6 +1448,32 @@ def mul(self, other, axis: str | int = "columns") -> DataFrame: .. note:: Mismatched indices will be unioned together. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({ + ... 'A': [1, 2, 3], + ... 'B': [4, 5, 6], + ... }) + + You can use method name: + + >>> df['A'].mul(df['B']) + 0 4 + 1 10 + 2 18 + dtype: Int64 + + You can also use arithmetic operator ``*``: + + >>> df['A'] * (df['B']) + 0 4 + 1 10 + 2 18 + dtype: Int64 + Args: other (float, int, or Series): Any single or multiple element data structure, or list-like object. @@ -1254,6 +1497,32 @@ def truediv(self, other, axis: str | int = "columns") -> DataFrame: .. note:: Mismatched indices will be unioned together. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({ + ... 'A': [1, 2, 3], + ... 'B': [4, 5, 6], + ... }) + + You can use method name: + + >>> df['A'].truediv(df['B']) + 0 0.25 + 1 0.4 + 2 0.5 + dtype: Float64 + + You can also use arithmetic operator ``/``: + + >>> df['A'] / (df['B']) + 0 0.25 + 1 0.4 + 2 0.5 + dtype: Float64 + Args: other (float, int, or Series): Any single or multiple element data structure, or list-like object. @@ -1277,6 +1546,29 @@ def rtruediv(self, other, axis: str | int = "columns") -> DataFrame: .. note:: Mismatched indices will be unioned together. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({ + ... 'A': [1, 2, 3], + ... 'B': [4, 5, 6], + ... }) + >>> df['A'].rtruediv(df['B']) + 0 4.0 + 1 2.5 + 2 2.0 + dtype: Float64 + + It's equivalent to using arithmetic operator: ``/``: + + >>> df['B'] / (df['A']) + 0 4.0 + 1 2.5 + 2 2.0 + dtype: Float64 + Args: other (float, int, or Series): Any single or multiple element data structure, or list-like object. @@ -1300,6 +1592,32 @@ def floordiv(self, other, axis: str | int = "columns") -> DataFrame: .. note:: Mismatched indices will be unioned together. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({ + ... 'A': [1, 2, 3], + ... 'B': [4, 5, 6], + ... }) + + You can use method name: + + >>> df['A'].floordiv(df['B']) + 0 0 + 1 0 + 2 0 + dtype: Int64 + + You can also use arithmetic operator ``//``: + + >>> df['A'] // (df['B']) + 0 0 + 1 0 + 2 0 + dtype: Int64 + Args: other (float, int, or Series): Any single or multiple element data structure, or list-like object. @@ -1323,6 +1641,29 @@ def rfloordiv(self, other, axis: str | int = "columns") -> DataFrame: .. note:: Mismatched indices will be unioned together. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({ + ... 'A': [1, 2, 3], + ... 'B': [4, 5, 6], + ... 
}) + >>> df['A'].rfloordiv(df['B']) + 0 4 + 1 2 + 2 2 + dtype: Int64 + + It's equivalent to using arithmetic operator: ``//``: + + >>> df['B'] // (df['A']) + 0 4 + 1 2 + 2 2 + dtype: Int64 + Args: other (float, int, or Series): Any single or multiple element data structure, or list-like object. @@ -1346,6 +1687,32 @@ def mod(self, other, axis: str | int = "columns") -> DataFrame: .. note:: Mismatched indices will be unioned together. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({ + ... 'A': [1, 2, 3], + ... 'B': [4, 5, 6], + ... }) + + You can use method name: + + >>> df['A'].mod(df['B']) + 0 1 + 1 2 + 2 3 + dtype: Int64 + + You can also use arithmetic operator ``%``: + + >>> df['A'] % (df['B']) + 0 1 + 1 2 + 2 3 + dtype: Int64 + Args: other: Any single or multiple element data structure, or list-like object. @@ -1369,6 +1736,29 @@ def rmod(self, other, axis: str | int = "columns") -> DataFrame: .. note:: Mismatched indices will be unioned together. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({ + ... 'A': [1, 2, 3], + ... 'B': [4, 5, 6], + ... }) + >>> df['A'].rmod(df['B']) + 0 0 + 1 1 + 2 0 + dtype: Int64 + + It's equivalent to using arithmetic operator: ``%``: + + >>> df['B'] % (df['A']) + 0 0 + 1 1 + 2 0 + dtype: Int64 + Args: other (float, int, or Series): Any single or multiple element data structure, or list-like object. @@ -1382,7 +1772,7 @@ def rmod(self, other, axis: str | int = "columns") -> DataFrame: raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def pow(self, other, axis: str | int = "columns") -> DataFrame: - """Get Exponential power of dataframe and other, element-wise (binary operator `pow`). + """Get Exponential power of dataframe and other, element-wise (binary operator `**`). Equivalent to ``dataframe ** other``, but with support to substitute a fill_value for missing data in one of the inputs. With reverse version, `rpow`. @@ -1393,6 +1783,32 @@ def pow(self, other, axis: str | int = "columns") -> DataFrame: .. note:: Mismatched indices will be unioned together. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({ + ... 'A': [1, 2, 3], + ... 'B': [4, 5, 6], + ... }) + + You can use method name: + + >>> df['A'].pow(df['B']) + 0 1 + 1 32 + 2 729 + dtype: Int64 + + You can also use arithmetic operator ``**``: + + >>> df['A'] ** (df['B']) + 0 1 + 1 32 + 2 729 + dtype: Int64 + Args: other (float, int, or Series): Any single or multiple element data structure, or list-like object. @@ -1417,6 +1833,29 @@ def rpow(self, other, axis: str | int = "columns") -> DataFrame: .. note:: Mismatched indices will be unioned together. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({ + ... 'A': [1, 2, 3], + ... 'B': [4, 5, 6], + ... }) + >>> df['A'].rpow(df['B']) + 0 4 + 1 25 + 2 216 + dtype: Int64 + + It's equivalent to using arithmetic operator: ``**``: + + >>> df['B'] ** (df['A']) + 0 4 + 1 25 + 2 216 + dtype: Int64 + Args: other (float, int, or Series): Any single or multiple element data structure, or list-like object. @@ -1438,6 +1877,21 @@ def combine( to element-wise combine columns. The row and column indexes of the resulting DataFrame will be the union of the two. 
+ **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df1 = bpd.DataFrame({'A': [0, 0], 'B': [4, 4]}) + >>> df2 = bpd.DataFrame({'A': [1, 1], 'B': [3, 3]}) + >>> take_smaller = lambda s1, s2: s1 if s1.sum() < s2.sum() else s2 + >>> df1.combine(df2, take_smaller) + A B + 0 0 3 + 1 0 3 + + [2 rows x 2 columns] + Args: other (DataFrame): The DataFrame to merge column-wise. @@ -1468,6 +1922,20 @@ def combine_first(self, other) -> DataFrame: second.loc[index, col] are not missing values, upon calling first.combine_first(second). + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df1 = bpd.DataFrame({'A': [None, 0], 'B': [None, 4]}) + >>> df2 = bpd.DataFrame({'A': [1, 1], 'B': [3, 3]}) + >>> df1.combine_first(df2) + A B + 0 1.0 3.0 + 1 0.0 4.0 + + [2 rows x 2 columns] + Args: other (DataFrame): Provided DataFrame to use to fill null values. @@ -1485,6 +1953,24 @@ def update( Aligns on indices. There is no return value. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({'A': [1, 2, 3], + ... 'B': [400, 500, 600]}) + >>> new_df = bpd.DataFrame({'B': [4, 5, 6], + ... 'C': [7, 8, 9]}) + >>> df.update(new_df) + >>> df + A B + 0 1 4 + 1 2 5 + 2 3 6 + + [3 rows x 2 columns] + Args: other (DataFrame, or object coercible into a DataFrame): Should have at least one matching index/column label @@ -2010,6 +2496,34 @@ def idxmax(self): """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def melt(self, id_vars, value_vars, var_name, value_name): + """ + Unpivot a DataFrame from wide to long format, optionally leaving identifiers set. + + This function is useful to massage a DataFrame into a format where one + or more columns are identifier variables (`id_vars`), while all other + columns, considered measured variables (`value_vars`), are "unpivoted" to + the row axis, leaving just two non-identifier columns, 'variable' and + 'value'. + + Parameters + ---------- + id_vars (tuple, list, or ndarray, optional): + Column(s) to use as identifier variables. + value_vars (tuple, list, or ndarray, optional): + Column(s) to unpivot. If not specified, uses all columns that + are not set as `id_vars`. + var_name (scalar): + Name to use for the 'variable' column. If None it uses + ``frame.columns.name`` or 'variable'. + value_name (scalar, default 'value'): + Name to use for the 'value' column. + + Returns: + DataFrame: Unpivoted DataFrame. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def nunique(self): """ Count number of distinct elements in specified axis. diff --git a/third_party/bigframes_vendored/pandas/core/reshape/concat.py b/third_party/bigframes_vendored/pandas/core/reshape/concat.py index 6e6d2d8b5c..b0472c524a 100644 --- a/third_party/bigframes_vendored/pandas/core/reshape/concat.py +++ b/third_party/bigframes_vendored/pandas/core/reshape/concat.py @@ -1,6 +1,6 @@ # Contains code from https://github.com/pandas-dev/pandas/blob/main/pandas/core/reshape/concat.py """ -Concat routines. 
+Concat routines """ from __future__ import annotations diff --git a/third_party/bigframes_vendored/pandas/core/reshape/encoding.py b/third_party/bigframes_vendored/pandas/core/reshape/encoding.py new file mode 100644 index 0000000000..da92b58f50 --- /dev/null +++ b/third_party/bigframes_vendored/pandas/core/reshape/encoding.py @@ -0,0 +1,119 @@ +# Contains code from https://github.com/pandas-dev/pandas/blob/main/pandas/core/reshape/encoding.py +""" +Encoding routines +""" +from __future__ import annotations + +from bigframes import constants + + +def get_dummies( + data, + prefix=None, + prefix_sep="_", + dummy_na=False, + columns=None, + drop_first=False, + dtype=None, +): + """ + Convert categorical variable into dummy/indicator variables. + + Each variable is converted in as many 0/1 variables as there are + different values. Columns in the output are each named after a value; + if the input is a DataFrame, the name of the original variable is + prepended to the value. + + **Examples:** + >>> import bigframes.pandas as pd + >>> pd.options.display.progress_bar = None + >>> s = pd.Series(list('abca')) + >>> pd.get_dummies(s) + a b c + 0 True False False + 1 False True False + 2 False False True + 3 True False False + + [4 rows x 3 columns] + + >>> s1 = pd.Series(['a', 'b', None]) + >>> pd.get_dummies(s1) + a b + 0 True False + 1 False True + 2 False False + + [3 rows x 2 columns] + + >>> pd.get_dummies(s1, dummy_na=True) + a b + 0 True False False + 1 False True False + 2 False False True + + [3 rows x 3 columns] + + >>> df = pd.DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'a', 'c'], 'C': [1, 2, 3]}) + >>> pd.get_dummies(df, prefix=['col1', 'col2']) + C col1_a col1_b col2_a col2_b col2_c + 0 1 True False False True False + 1 2 False True True False False + 2 3 True False False False True + + [3 rows x 6 columns] + + >>> pd.get_dummies(pd.Series(list('abcaa'))) + a b c + 0 True False False + 1 False True False + 2 False False True + 3 True False False + 4 True False False + + [5 rows x 3 columns] + + >>> pd.get_dummies(pd.Series(list('abcaa')), drop_first=True) + b c + 0 False False + 1 True False + 2 False True + 3 False False + 4 False False + + [5 rows x 2 columns] + + Args: + data (Series or DataFrame): + Data of which to get dummy indicators. + + prefix (str, list of str, or dict of str, default None): + String to append DataFrame column names. Pass a list with length + equal to the number of columns when calling get_dummies on a + DataFrame. Alternatively, prefix can be a dictionary mapping column + names to prefixes. + + prefix_sep (str, list of str, or dict of str, default '_'): + Separator/delimiter to use, appended to prefix. Or pass a list or + dictionary as with prefix. + + dummy_na (bool, default False): + Add a column to indicate NaNs, if False NaNs are ignored. + + columns (list-like, default None): + Column names in the DataFrame to be encoded. If columns is None + then only the columns with string dtype will be converted. + + drop_first (bool, default False): + Whether to get k-1 dummies out of k categorical levels by removing the + first level. + + dtype (dtype, default bool): + Data type for new columns. Only a single dtype is allowed. + + Returns: + DataFrame: Dummy-coded data. If data contains other columns than the + dummy-coded one(s), these will be prepended, unaltered, to the + result. 
+ """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) diff --git a/third_party/bigframes_vendored/pandas/core/reshape/merge.py b/third_party/bigframes_vendored/pandas/core/reshape/merge.py index cc81de405b..b03f366fca 100644 --- a/third_party/bigframes_vendored/pandas/core/reshape/merge.py +++ b/third_party/bigframes_vendored/pandas/core/reshape/merge.py @@ -16,7 +16,6 @@ def merge( sort=False, suffixes=("_x", "_y"), ): - """ Merge DataFrame objects with a database-style join. diff --git a/third_party/bigframes_vendored/pandas/core/reshape/tile.py b/third_party/bigframes_vendored/pandas/core/reshape/tile.py index 4f5f2efef0..d4471ed68e 100644 --- a/third_party/bigframes_vendored/pandas/core/reshape/tile.py +++ b/third_party/bigframes_vendored/pandas/core/reshape/tile.py @@ -1,6 +1,6 @@ # Contains code from https://github.com/pandas-dev/pandas/blob/main/pandas/core/reshape/tile.py """ -Quantilization functions and related stuff +Quantilization functions and related routines """ from __future__ import annotations @@ -65,3 +65,33 @@ def cut( False : returns an ndarray of integers. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + +def qcut(x, q, *, labels=None, duplicates="error"): + """ + Quantile-based discretization function. + + Discretize variable into equal-sized buckets based on rank or based + on sample quantiles. For example 1000 values for 10 quantiles would + produce a Categorical object indicating quantile membership for each data point. + + Args: + x (Series): + The input Series to be binned. Must be 1-dimensional. + q (int or list-like of float): + Number of quantiles. 10 for deciles, 4 for quartiles, etc. Alternately + array of quantiles, e.g. [0, .25, .5, .75, 1.] for quartiles. + labels (None): + Used as labels for the resulting bins. Must be of the same length as + the resulting bins. If False, return only integer indicators of the + bins. If True, raises an error. + duplicates ({default 'raise', 'drop'}, optional): + If bin edges are not unique, raise ValueError or drop non-uniques. + + Returns: + Series: Categorical or Series of integers if labels is False + The return type (Categorical or Series) depends on the input: a Series + of type category if input is a Series else Categorical. Bins are + represented as categories when categorical data is returned. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index bd1f9a9a18..f0e13e16f5 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -1654,6 +1654,19 @@ def clip(self): """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def unstack(self, level): + """ + Unstack, also known as pivot, Series with MultiIndex to produce DataFrame. + + Args: + level (int, str, or list of these, default last level): + Level(s) to unstack, can pass level name. + + Returns: + DataFrame: Unstacked Series. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def argmax(self): """ Return int position of the smallest value in the Series. 
diff --git a/third_party/bigframes_vendored/sklearn/base.py b/third_party/bigframes_vendored/sklearn/base.py
index 42868ce51f..768328e552 100644
--- a/third_party/bigframes_vendored/sklearn/base.py
+++ b/third_party/bigframes_vendored/sklearn/base.py
@@ -85,6 +85,12 @@ def score(self, X, y):
         which is a harsh metric since you require for each sample that
         each label set be correctly predicted.
 
+        .. note::
+
+            Output matches that of the BigQuery ML.EVALUATE function.
+            See: https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-evaluate#classification_models
+            for the outputs relevant to this model type.
+
         Args:
             X (bigframes.dataframe.DataFrame or bigframes.series.Series):
                 DataFrame of shape (n_samples, n_features). Test samples.
@@ -105,7 +111,13 @@ class RegressorMixin:
     _estimator_type = "regressor"
 
     def score(self, X, y):
-        """Return the evaluation metrics of the model.
+        """Calculate evaluation metrics of the model.
+
+        .. note::
+
+            Output matches that of the BigQuery ML.EVALUATE function.
+            See: https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-evaluate#regression_models
+            for the outputs relevant to this model type.
 
         Args:
             X (bigframes.dataframe.DataFrame or bigframes.series.Series):
diff --git a/third_party/bigframes_vendored/sklearn/cluster/_kmeans.py b/third_party/bigframes_vendored/sklearn/cluster/_kmeans.py
index ece62dc147..5369d3662d 100644
--- a/third_party/bigframes_vendored/sklearn/cluster/_kmeans.py
+++ b/third_party/bigframes_vendored/sklearn/cluster/_kmeans.py
@@ -12,7 +12,6 @@
 # License: BSD 3 clause
 
 from abc import ABC
-from typing import List, Optional
 
 from bigframes import constants
 from third_party.bigframes_vendored.sklearn.base import BaseEstimator
@@ -83,7 +82,13 @@ def score(
         X,
         y=None,
     ):
-        """Metrics of the model.
+        """Calculate evaluation metrics of the model.
+
+        .. note::
+
+            Output matches that of the BigQuery ML.EVALUATE function.
+            See: https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-evaluate#k-means_models
+            for the outputs relevant to this model type.
 
         Args:
             X (bigframes.dataframe.DataFrame or bigframes.series.Series):
diff --git a/third_party/bigframes_vendored/sklearn/decomposition/_pca.py b/third_party/bigframes_vendored/sklearn/decomposition/_pca.py
index 97fee5a501..011ecc06dd 100644
--- a/third_party/bigframes_vendored/sklearn/decomposition/_pca.py
+++ b/third_party/bigframes_vendored/sklearn/decomposition/_pca.py
@@ -55,7 +55,13 @@ def fit(self, X, y=None):
         raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
 
     def score(self, X=None, y=None):
-        """Return the metrics of the model.
+        """Calculate evaluation metrics of the model.
+
+        .. note::
+
+            Output matches that of the BigQuery ML.EVALUATE function.
+            See: https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-evaluate#pca_models
+            for the outputs relevant to this model type.
 
         Args:
             X (default None):