diff --git a/.github/.OwlBot.lock.yaml b/.github/.OwlBot.lock.yaml
index 7f291dbd5f..ec696b558c 100644
--- a/.github/.OwlBot.lock.yaml
+++ b/.github/.OwlBot.lock.yaml
@@ -13,5 +13,5 @@
 # limitations under the License.
 docker:
   image: gcr.io/cloud-devrel-public-resources/owlbot-python:latest
-  digest: sha256:4f9b3b106ad0beafc2c8a415e3f62c1a0cc23cabea115dbe841b848f581cfe99
-# created: 2023-10-18T20:26:37.410353675Z
+  digest: sha256:30470597773378105e239b59fce8eb27cc97375580d592699206d17d117143d0
+# created: 2023-11-03T00:57:07.335914631Z
diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
index e97d89e484..221806cedf 100644
--- a/.github/workflows/docs.yml
+++ b/.github/workflows/docs.yml
@@ -28,7 +28,7 @@ jobs:
       - name: Setup Python
         uses: actions/setup-python@v4
         with:
-          python-version: "3.9"
+          python-version: "3.10"
       - name: Install nox
         run: |
           python -m pip install --upgrade setuptools pip wheel
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 845d3634bc..fc327b2e96 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,6 +4,22 @@
 
 [1]: https://pypi.org/project/bigframes/#history
 
+## [0.13.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v0.12.0...v0.13.0) (2023-11-07)
+
+
+### Features
+
+* `to_gbq` without a destination table writes to a temporary table ([#158](https://github.com/googleapis/python-bigquery-dataframes/issues/158)) ([e1817c9](https://github.com/googleapis/python-bigquery-dataframes/commit/e1817c9201ba4ea7fd2f8b6f4a667b010a6fec1b))
+* Add `DataFrame.__iter__`, `DataFrame.iterrows`, `DataFrame.itertuples`, and `DataFrame.keys` methods ([#164](https://github.com/googleapis/python-bigquery-dataframes/issues/164)) ([c065071](https://github.com/googleapis/python-bigquery-dataframes/commit/c065071028c2f4ac80ee7f84dbeb1df385c2a512))
+* Add `Series.__iter__` method ([#164](https://github.com/googleapis/python-bigquery-dataframes/issues/164)) ([c065071](https://github.com/googleapis/python-bigquery-dataframes/commit/c065071028c2f4ac80ee7f84dbeb1df385c2a512))
+* Add interpolate() to series and dataframe ([#157](https://github.com/googleapis/python-bigquery-dataframes/issues/157)) ([b9cb55c](https://github.com/googleapis/python-bigquery-dataframes/commit/b9cb55c5b9354f9ff60de0aad66fe60049876055))
+* Support 32k text-generation and multilingual embedding models ([#161](https://github.com/googleapis/python-bigquery-dataframes/issues/161)) ([5f0ea37](https://github.com/googleapis/python-bigquery-dataframes/commit/5f0ea37fffff792fc3fbed65e6ace846d8ef6a06))
+
+
+### Bug Fixes
+
+* Update default temp table expiration to 7 days ([#174](https://github.com/googleapis/python-bigquery-dataframes/issues/174)) ([4ff26cd](https://github.com/googleapis/python-bigquery-dataframes/commit/4ff26cdf862e9f9b91a3a1d2abfa7fbdf0af9c5b))
+
 ## [0.12.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v0.11.0...v0.12.0) (2023-11-01)
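The headline 0.13.0 change is that `to_gbq` no longer requires a destination: it writes to a session-owned temporary table and returns the table ID. A minimal usage sketch based on the docstrings later in this diff (assumes BigQuery credentials and a default session; the sketch itself is not part of the diff):

```python
import bigframes.pandas as bpd

df = bpd.DataFrame({"col1": [1, 2], "col2": [3, 4]})

# With no destination_table, a temp table is created in the session's
# anonymous dataset (expiring after DEFAULT_EXPIRATION, i.e. 7 days)
# and its fully-qualified ID is returned.
destination = df.to_gbq()

# The returned ID is an ordinary table reference and can be read back.
round_tripped = bpd.read_gbq(destination)
```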
diff --git a/bigframes/constants.py b/bigframes/constants.py
index 90837c79eb..a1ffd2b755 100644
--- a/bigframes/constants.py
+++ b/bigframes/constants.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import datetime
+
 """Constants used across BigQuery DataFrames.
 
 This module should not depend on any others in the package.
 """
@@ -23,3 +25,5 @@
 )
 
 ABSTRACT_METHOD_ERROR_MESSAGE = f"Abstract method. You have likely encountered a bug. Please share this stacktrace and how you reached it with the BigQuery DataFrames team. {FEEDBACK_LINK}"
+
+DEFAULT_EXPIRATION = datetime.timedelta(days=7)
diff --git a/bigframes/core/block_transforms.py b/bigframes/core/block_transforms.py
index 3706bf1681..917edac0de 100644
--- a/bigframes/core/block_transforms.py
+++ b/bigframes/core/block_transforms.py
@@ -105,6 +105,97 @@ def indicate_duplicates(
     )
 
 
+def interpolate(block: blocks.Block, method: str = "linear") -> blocks.Block:
+    if method != "linear":
+        raise NotImplementedError(
+            f"Only 'linear' interpolate method supported. {constants.FEEDBACK_LINK}"
+        )
+    backwards_window = windows.WindowSpec(following=0)
+    forwards_window = windows.WindowSpec(preceding=0)
+
+    output_column_ids = []
+
+    original_columns = block.value_columns
+    original_labels = block.column_labels
+    block, offsets = block.promote_offsets()
+    for column in original_columns:
+        # Masked offsets (below) are null in the same places the column is null
+        should_interpolate = block._column_type(column) in [
+            pd.Float64Dtype(),
+            pd.Int64Dtype(),
+        ]
+        if should_interpolate:
+            block, notnull = block.apply_unary_op(column, ops.notnull_op)
+            block, masked_offsets = block.apply_binary_op(
+                offsets, notnull, ops.partial_arg3(ops.where_op, None)
+            )
+
+            block, previous_value = block.apply_window_op(
+                column, agg_ops.LastNonNullOp(), backwards_window
+            )
+            block, next_value = block.apply_window_op(
+                column, agg_ops.FirstNonNullOp(), forwards_window
+            )
+            block, previous_value_offset = block.apply_window_op(
+                masked_offsets,
+                agg_ops.LastNonNullOp(),
+                backwards_window,
+                skip_reproject_unsafe=True,
+            )
+            block, next_value_offset = block.apply_window_op(
+                masked_offsets,
+                agg_ops.FirstNonNullOp(),
+                forwards_window,
+                skip_reproject_unsafe=True,
+            )
+
+            block, prediction_id = _interpolate(
+                block,
+                previous_value_offset,
+                previous_value,
+                next_value_offset,
+                next_value,
+                offsets,
+            )
+
+            block, interpolated_column = block.apply_binary_op(
+                column, prediction_id, ops.fillna_op
+            )
+            # Pandas performs ffill-like behavior to extrapolate forwards
+            block, interpolated_and_ffilled = block.apply_binary_op(
+                interpolated_column, previous_value, ops.fillna_op
+            )
+
+            output_column_ids.append(interpolated_and_ffilled)
+        else:
+            output_column_ids.append(column)
+
+    # Force reproject since `skip_reproject_unsafe` was used previously
+    block = block.select_columns(output_column_ids)._force_reproject()
+    return block.with_column_labels(original_labels)
+
+
+def _interpolate(
+    block: blocks.Block,
+    x0_id: str,
+    y0_id: str,
+    x1_id: str,
+    y1_id: str,
+    xpredict_id: str,
+) -> typing.Tuple[blocks.Block, str]:
+    """Applies linear interpolation equation to predict y values for xpredict."""
+    block, x1x0diff = block.apply_binary_op(x1_id, x0_id, ops.sub_op)
+    block, y1y0diff = block.apply_binary_op(y1_id, y0_id, ops.sub_op)
+    block, xpredictx0diff = block.apply_binary_op(xpredict_id, x0_id, ops.sub_op)
+
+    block, y1_weight = block.apply_binary_op(y1y0diff, x1x0diff, ops.div_op)
+    block, y1_part = block.apply_binary_op(xpredictx0diff, y1_weight, ops.mul_op)
+
+    block, prediction_id = block.apply_binary_op(y0_id, y1_part, ops.add_op)
+    block = block.drop_columns([x1x0diff, y1y0diff, xpredictx0diff, y1_weight, y1_part])
+    return block, prediction_id
+
+
 def drop_duplicates(
     block: blocks.Block, columns: typing.Sequence[str], keep: str = "first"
 ) -> blocks.Block:
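`_interpolate` above is the standard two-point linear interpolation formula, y = y0 + (x − x0) · (y1 − y0) / (x1 − x0), built out of column-wise sub/div/mul/add ops. A plain-pandas sketch of the same arithmetic (hypothetical values, for illustration only):

```python
import pandas as pd

# Known neighbors: last non-null (x0, y0) and next non-null (x1, y1),
# where x values are row offsets and y values are the column's data.
x0, y0 = 1.0, 2.0
x1, y1 = 4.0, 8.0
xpredict = pd.Series([2.0, 3.0])  # row offsets of the null cells

y1_weight = (y1 - y0) / (x1 - x0)              # the div_op step (slope)
prediction = y0 + (xpredict - x0) * y1_weight  # the mul_op/add_op steps
print(prediction.tolist())  # [4.0, 6.0]
```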
diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py
index 3369fb4868..40f12671ae 100644
--- a/bigframes/dataframe.py
+++ b/bigframes/dataframe.py
@@ -16,6 +16,7 @@
 from __future__ import annotations
 
+import datetime
 import re
 import textwrap
 import typing
@@ -303,6 +304,9 @@ def __len__(self):
         rows, _ = self.shape
         return rows
 
+    def __iter__(self):
+        return iter(self.columns)
+
     def astype(
         self,
         dtype: Union[bigframes.dtypes.DtypeString, bigframes.dtypes.Dtype],
@@ -1434,6 +1438,10 @@ def _reindex_columns(self, columns):
     def reindex_like(self, other: DataFrame, *, validate: typing.Optional[bool] = None):
         return self.reindex(index=other.index, columns=other.columns, validate=validate)
 
+    def interpolate(self, method: str = "linear") -> DataFrame:
+        result = block_ops.interpolate(self._block, method)
+        return DataFrame(result)
+
     def fillna(self, value=None) -> DataFrame:
         return self._apply_binop(value, ops.fillna_op, how="left")
 
@@ -1472,12 +1480,27 @@ def isin(self, values) -> DataFrame:
             f"isin(), you passed a [{type(values).__name__}]"
         )
 
+    def keys(self) -> pandas.Index:
+        return self.columns
+
     def items(self):
         column_ids = self._block.value_columns
         column_labels = self._block.column_labels
         for col_id, col_label in zip(column_ids, column_labels):
             yield col_label, bigframes.series.Series(self._block.select_column(col_id))
 
+    def iterrows(self) -> Iterable[tuple[typing.Any, pandas.Series]]:
+        for df in self.to_pandas_batches():
+            for item in df.iterrows():
+                yield item
+
+    def itertuples(
+        self, index: bool = True, name: typing.Optional[str] = "Pandas"
+    ) -> Iterable[tuple[typing.Any, ...]]:
+        for df in self.to_pandas_batches():
+            for item in df.itertuples(index=index, name=name):
+                yield item
+
     def dropna(
         self,
         *,
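The new iteration methods all stream batches via `to_pandas_batches()` and then defer to the pandas iterators, so rows are pulled incrementally rather than materializing the whole frame at once. A quick sketch of the resulting surface (toy data, mirroring the vendored docstrings further down):

```python
import bigframes.pandas as bpd

df = bpd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})

list(df)   # __iter__ yields column labels: ['A', 'B']
df.keys()  # the same info axis, as a pandas Index

for index, row in df.iterrows():       # (index, pandas.Series) pairs
    print(index, row["A"])

for tup in df.itertuples(name="Row"):  # namedtuples, one batch at a time
    print(tup.Index, tup.A, tup.B)
```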
@@ -2285,25 +2308,52 @@ def to_json(
     def to_gbq(
         self,
-        destination_table: str,
+        destination_table: Optional[str] = None,
         *,
-        if_exists: Optional[Literal["fail", "replace", "append"]] = "fail",
+        if_exists: Optional[Literal["fail", "replace", "append"]] = None,
         index: bool = True,
         ordering_id: Optional[str] = None,
-    ) -> None:
-        if "." not in destination_table:
-            raise ValueError(
-                "Invalid Table Name. Should be of the form 'datasetId.tableId' or "
-                "'projectId.datasetId.tableId'"
-            )
-
+    ) -> str:
         dispositions = {
             "fail": bigquery.WriteDisposition.WRITE_EMPTY,
             "replace": bigquery.WriteDisposition.WRITE_TRUNCATE,
             "append": bigquery.WriteDisposition.WRITE_APPEND,
         }
+
+        if destination_table is None:
+            # TODO(swast): If there have been no modifications to the DataFrame
+            # since the last time it was written (cached), then return that.
+            # For `read_gbq` nodes, return the underlying table clone.
+            destination_table = bigframes.session._io.bigquery.create_temp_table(
+                self._session.bqclient,
+                self._session._anonymous_dataset,
+                # TODO(swast): allow custom expiration times, probably via session configuration.
+                datetime.datetime.now(datetime.timezone.utc)
+                + constants.DEFAULT_EXPIRATION,
+            )
+
+            if if_exists is not None and if_exists != "replace":
+                raise ValueError(
+                    f"Got invalid value {repr(if_exists)} for if_exists. "
+                    "When no destination table is specified, a new table is always created. "
+                    "None or 'replace' are the only valid options in this case."
+                )
+            if_exists = "replace"
+
+        if "." not in destination_table:
+            raise ValueError(
+                f"Got invalid value for destination_table {repr(destination_table)}. "
+                "Should be of the form 'datasetId.tableId' or 'projectId.datasetId.tableId'."
+            )
+
+        if if_exists is None:
+            if_exists = "fail"
+
         if if_exists not in dispositions:
-            raise ValueError("'{0}' is not valid for if_exists".format(if_exists))
+            raise ValueError(
+                f"Got invalid value {repr(if_exists)} for if_exists. "
+                f"Valid options include None or one of {dispositions.keys()}."
+            )
 
         job_config = bigquery.QueryJobConfig(
             write_disposition=dispositions[if_exists],
@@ -2314,6 +2364,7 @@ def to_gbq(
         )
 
         self._run_io_query(index=index, ordering_id=ordering_id, job_config=job_config)
+        return destination_table
 
     def to_numpy(
         self, dtype=None, copy=False, na_value=None, **kwargs
diff --git a/bigframes/ml/llm.py b/bigframes/ml/llm.py
index d78f467537..2e5a9a1e5e 100644
--- a/bigframes/ml/llm.py
+++ b/bigframes/ml/llm.py
@@ -16,7 +16,7 @@
 from __future__ import annotations
 
-from typing import cast, Optional, Union
+from typing import cast, Literal, Optional, Union
 
 import bigframes
 from bigframes import clients, constants
@@ -25,9 +25,11 @@
 import bigframes.pandas as bpd
 
 _REMOTE_TEXT_GENERATOR_MODEL_CODE = "CLOUD_AI_LARGE_LANGUAGE_MODEL_V1"
+_REMOTE_TEXT_GENERATOR_32K_MODEL_CODE = "text-bison-32k"
 _TEXT_GENERATE_RESULT_COLUMN = "ml_generate_text_llm_result"
 
 _REMOTE_EMBEDDING_GENERATOR_MODEL_CODE = "CLOUD_AI_TEXT_EMBEDDING_MODEL_V1"
+_REMOTE_EMBEDDING_GENERATOR_MULTILINGUAL_MODEL_CODE = "textembedding-gecko-multilingual"
 _EMBED_TEXT_RESULT_COLUMN = "text_embedding"
 
 
@@ -35,19 +37,25 @@ class PaLM2TextGenerator(base.Predictor):
     """PaLM2 text generator LLM model.
 
     Args:
+        model_name (str, default "text-bison"):
+            The model for natural language tasks. "text-bison" returns a model fine-tuned to follow natural language instructions
+            and is suitable for a variety of language tasks. "text-bison-32k" supports up to 32k tokens per request.
+            Defaults to "text-bison".
        session (bigframes.Session or None):
            BQ session to create the model. If None, use the global default session.
        connection_name (str or None):
-           connection to connect with remote service. str of the format <PROJECT_NUMBER/PROJECT_ID>.<LOCATION>.<CONNECTION_ID>.
+           Connection to connect with remote service. str of the format <PROJECT_NUMBER/PROJECT_ID>.<LOCATION>.<CONNECTION_ID>.
            If None, use default connection in session context. BigQuery DataFrame will try to create the connection and attach
            permission if the connection isn't fully set up.
    """

    def __init__(
        self,
+        model_name: Literal["text-bison", "text-bison-32k"] = "text-bison",
        session: Optional[bigframes.Session] = None,
        connection_name: Optional[str] = None,
    ):
+        self.model_name = model_name
        self.session = session or bpd.get_global_session()
        self._bq_connection_manager = clients.BqConnectionManager(
            self.session.bqconnectionclient, self.session.resourcemanagerclient
@@ -80,11 +88,14 @@ def _create_bqml_model(self):
            connection_id=connection_name_parts[2],
            iam_role="aiplatform.user",
        )
-
-        options = {
-            "remote_service_type": _REMOTE_TEXT_GENERATOR_MODEL_CODE,
-        }
-
+        if self.model_name == "text-bison":
+            options = {
+                "remote_service_type": _REMOTE_TEXT_GENERATOR_MODEL_CODE,
+            }
+        else:
+            options = {
+                "endpoint": _REMOTE_TEXT_GENERATOR_32K_MODEL_CODE,
+            }
         return self._bqml_model_factory.create_remote_model(
             session=self.session, connection_name=self.connection_name, options=options
         )
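In other words, the default model name keeps the old `remote_service_type` CREATE MODEL option, while `"text-bison-32k"` is routed through an `endpoint` option instead. Construction looks like this (a sketch assuming the session and connection defaults are already configured, as the system tests below do via `bpd.options`):

```python
from bigframes.ml.llm import PaLM2TextGenerator

# Falls back to the global session and default connection when omitted.
model = PaLM2TextGenerator(model_name="text-bison-32k")
```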
@@ -118,7 +129,7 @@ def predict(
             top_k (int, default 40):
                 Top-k changes how the model selects tokens for output. A top-k of 1 means the selected token is the most probable among all tokens
-                in the model’s vocabulary (also called greedy decoding), while a top-k of 3 means that the next token is selected from among the 3 most probable tokens (using temperature).
+                in the model's vocabulary (also called greedy decoding), while a top-k of 3 means that the next token is selected from among the 3 most probable tokens (using temperature).
                 For each token selection step, the top K tokens with the highest probabilities are sampled. Then tokens are further filtered based on topP with the final token selected using temperature sampling. Specify a lower value for less random responses and a higher value for more random responses.
                 Default 40. Possible values [1, 40].
@@ -175,6 +186,10 @@ class PaLM2TextEmbeddingGenerator(base.Predictor):
     """PaLM2 text embedding generator LLM model.
 
     Args:
+        model_name (str, default "textembedding-gecko"):
+            The model for text embedding. "textembedding-gecko" returns model embeddings for text inputs.
+            "textembedding-gecko-multilingual" returns model embeddings for text inputs which support over 100 languages.
+            Defaults to "textembedding-gecko".
         session (bigframes.Session or None):
             BQ session to create the model. If None, use the global default session.
         connection_name (str or None):
@@ -184,9 +199,13 @@
     def __init__(
         self,
+        model_name: Literal[
+            "textembedding-gecko", "textembedding-gecko-multilingual"
+        ] = "textembedding-gecko",
         session: Optional[bigframes.Session] = None,
         connection_name: Optional[str] = None,
     ):
+        self.model_name = model_name
         self.session = session or bpd.get_global_session()
         self._bq_connection_manager = clients.BqConnectionManager(
             self.session.bqconnectionclient, self.session.resourcemanagerclient
@@ -219,10 +238,14 @@ def _create_bqml_model(self):
             connection_id=connection_name_parts[2],
             iam_role="aiplatform.user",
         )
-
-        options = {
-            "remote_service_type": _REMOTE_EMBEDDING_GENERATOR_MODEL_CODE,
-        }
+        if self.model_name == "textembedding-gecko":
+            options = {
+                "remote_service_type": _REMOTE_EMBEDDING_GENERATOR_MODEL_CODE,
+            }
+        else:
+            options = {
+                "endpoint": _REMOTE_EMBEDDING_GENERATOR_MULTILINGUAL_MODEL_CODE,
+            }
 
         return self._bqml_model_factory.create_remote_model(
             session=self.session, connection_name=self.connection_name, options=options
diff --git a/bigframes/series.py b/bigframes/series.py
index 37d00d16f3..032bdf6c42 100644
--- a/bigframes/series.py
+++ b/bigframes/series.py
@@ -16,6 +16,7 @@
 from __future__ import annotations
 
+import itertools
 import numbers
 import textwrap
 import typing
@@ -148,6 +149,11 @@ def _set_internal_query_job(self, query_job: bigquery.QueryJob):
     def __len__(self):
         return self.shape[0]
 
+    def __iter__(self) -> typing.Iterator:
+        return itertools.chain.from_iterable(
+            map(lambda x: x.index, self._block.to_pandas_batches())
+        )
+
     def copy(self) -> Series:
         return Series(self._block)
 
@@ -468,6 +474,10 @@ def replace(
         )
         return Series(block.select_column(result_col))
 
+    def interpolate(self, method: str = "linear") -> Series:
+        result = block_ops.interpolate(self._block, method)
+        return Series(result)
+
     def dropna(
         self,
         *,
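`Series.__iter__` chains the per-batch indexes from `to_pandas_batches()`, so iteration never holds more than one downloaded batch at a time. The chaining pattern itself, on illustrative local batches (not part of the diff):

```python
import itertools

import pandas as pd

batches = [pd.Series(["a", "b"], index=[10, 20]), pd.Series(["c"], index=[30])]
print(list(itertools.chain.from_iterable(b.index for b in batches)))
# [10, 20, 30] -- matches the generic.py docstring example below
```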
diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py
index 5a61ed534f..2537e81e19 100644
--- a/bigframes/session/__init__.py
+++ b/bigframes/session/__init__.py
@@ -223,6 +223,17 @@ def _create_and_bind_bq_session(self):
         query_job.result()  # blocks until finished
         self._session_id = query_job.session_info.session_id
 
+        # The anonymous dataset is used by BigQuery to write query results and
+        # session tables. BigQuery DataFrames also writes temp tables directly
+        # to the dataset, no BigQuery Session required. Note: there is a
+        # different anonymous dataset per location. See:
+        # https://cloud.google.com/bigquery/docs/cached-results#how_cached_results_are_stored
+        query_destination = query_job.destination
+        self._anonymous_dataset = bigquery.DatasetReference(
+            query_destination.project,
+            query_destination.dataset_id,
+        )
+
         self.bqclient.default_query_job_config = bigquery.QueryJobConfig(
             connection_properties=[
                 bigquery.ConnectionProperty("session_id", self._session_id)
@@ -419,7 +430,9 @@ def _read_gbq_query(
         index_cols = list(index_col)
 
         destination, query_job = self._query_to_destination(
-            query, index_cols, api_name="read_gbq_query"
+            query,
+            index_cols,
+            api_name=api_name,
         )
 
         # If there was no destination table, that means the query must have
diff --git a/bigframes/session/_io/bigquery.py b/bigframes/session/_io/bigquery.py
index d47efbdddc..06d240fec6 100644
--- a/bigframes/session/_io/bigquery.py
+++ b/bigframes/session/_io/bigquery.py
@@ -14,14 +14,18 @@
 
 """Private module: Helpers for I/O operations."""
 
+from __future__ import annotations
+
 import datetime
 import textwrap
 import types
 from typing import Dict, Iterable, Union
+import uuid
 
 import google.cloud.bigquery as bigquery
 
 IO_ORDERING_ID = "bqdf_row_nums"
+TEMP_TABLE_PREFIX = "bqdf{date}_{random_id}"
 
 
 def create_export_csv_statement(
@@ -67,6 +71,29 @@ def create_export_data_statement(
     )
 
 
+def random_table(dataset: bigquery.DatasetReference) -> bigquery.TableReference:
+    """Generate a random table ID with BigQuery DataFrames prefix.
+
+    Args:
+        dataset (google.cloud.bigquery.DatasetReference):
+            The dataset to make the table reference in. Usually the anonymous
+            dataset for the session.
+
+    Returns:
+        google.cloud.bigquery.TableReference:
+            Fully qualified table ID of a table that doesn't exist.
+    """
+    now = datetime.datetime.now(datetime.timezone.utc)
+    random_id = uuid.uuid4().hex
+    table_id = TEMP_TABLE_PREFIX.format(
+        date=now.strftime("%Y%m%d"), random_id=random_id
+    )
+    return dataset.table(table_id)
+
+
+def table_ref_to_sql(table: bigquery.TableReference) -> str:
+    """Format a table reference as escaped SQL."""
+    return f"`{table.project}`.`{table.dataset_id}`.`{table.table_id}`"
+
+
 def create_snapshot_sql(
     table_ref: bigquery.TableReference, current_timestamp: datetime.datetime
 ) -> str:
@@ -90,6 +117,19 @@ def create_snapshot_sql(
     )
 
 
+def create_temp_table(
+    bqclient: bigquery.Client,
+    dataset: bigquery.DatasetReference,
+    expiration: datetime.datetime,
+) -> str:
+    """Create an empty table with an expiration in the desired dataset."""
+    table_ref = random_table(dataset)
+    destination = bigquery.Table(table_ref)
+    destination.expires = expiration
+    bqclient.create_table(destination)
+    return f"{table_ref.project}.{table_ref.dataset_id}.{table_ref.table_id}"
+
+
 # BigQuery REST API returns types in Legacy SQL format
 # https://cloud.google.com/bigquery/docs/data-types but we use Standard SQL
 # names
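A sketch of how these helpers compose, with placeholder names (`my-project`/`my_dataset` are illustrative, and the client assumes default credentials):

```python
import datetime

import google.cloud.bigquery as bigquery

from bigframes.session._io.bigquery import (
    create_temp_table, random_table, table_ref_to_sql,
)

bqclient = bigquery.Client()  # assumes default credentials
dataset = bigquery.DatasetReference("my-project", "my_dataset")

table_ref = random_table(dataset)     # table_id like bqdf20231107_<32 hex chars>
sql_id = table_ref_to_sql(table_ref)  # `my-project`.`my_dataset`.`bqdf...`

# create_temp_table writes an empty table that BigQuery drops at expiration;
# to_gbq passes now + constants.DEFAULT_EXPIRATION (7 days) as that deadline.
expiration = datetime.datetime.now(datetime.timezone.utc) + datetime.timedelta(days=7)
table_id = create_temp_table(bqclient, dataset, expiration)
```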
-__version__ = "0.12.0" +__version__ = "0.13.0" diff --git a/noxfile.py b/noxfile.py index d0bbda80fd..34b055de44 100644 --- a/noxfile.py +++ b/noxfile.py @@ -451,9 +451,7 @@ def docs(session): ) -# docfx doesn't yet support Python 3.10. -# https://github.com/googleapis/sphinx-docfx-yaml/issues/305 -@nox.session(python="3.9") +@nox.session(python=DEFAULT_PYTHON_VERSION) def docfx(session): """Build the docfx yaml files for this library.""" diff --git a/samples/snippets/clustering_model_test.py b/samples/snippets/clustering_model_test.py new file mode 100644 index 0000000000..a407fc7805 --- /dev/null +++ b/samples/snippets/clustering_model_test.py @@ -0,0 +1,35 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +def test_clustering_model(): + # [START bigquery_dataframes_clustering_model] + from bigframes.ml.cluster import KMeans + import bigframes.pandas as bpd + + # Load data from BigQuery + query_or_table = "bigquery-public-data.ml_datasets.penguins" + bq_df = bpd.read_gbq(query_or_table) + + # Create the KMeans model + cluster_model = KMeans(n_clusters=10) + cluster_model.fit(bq_df["culmen_length_mm"], bq_df["sex"]) + + # Predict using the model + result = cluster_model.predict(bq_df) + # Score the model + score = cluster_model.score(bq_df) + # [END bigquery_dataframes_clustering_model] + assert result is not None + assert score is not None diff --git a/samples/snippets/gen_ai_model_test.py b/samples/snippets/gen_ai_model_test.py new file mode 100644 index 0000000000..7cbc90d4c0 --- /dev/null +++ b/samples/snippets/gen_ai_model_test.py @@ -0,0 +1,39 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +def test_llm_model(): + PROJECT_ID = "bigframes-dev" + REGION = "us" + CONN_NAME = "bigframes-ml" + # [START bigquery_dataframes_gen_ai_model] + from bigframes.ml.llm import PaLM2TextGenerator + import bigframes.pandas as bpd + + # Create the LLM model + session = bpd.get_global_session() + connection = f"{PROJECT_ID}.{REGION}.{CONN_NAME}" + model = PaLM2TextGenerator(session=session, connection_name=connection) + + df_api = bpd.read_csv("gs://cloud-samples-data/vertex-ai/bigframe/df.csv") + + # Prepare the prompts and send them to the LLM model for prediction + df_prompt_prefix = "Generate Pandas sample code for DataFrame." 
+    df_prompt = df_prompt_prefix + df_api["API"]
+
+    # Predict using the model
+    df_pred = model.predict(df_prompt.to_frame(), max_output_tokens=1024)
+    # [END bigquery_dataframes_gen_ai_model]
+    assert df_pred["ml_generate_text_llm_result"] is not None
+    assert df_pred["ml_generate_text_llm_result"].iloc[0] is not None
diff --git a/samples/snippets/regression_model_test.py b/samples/snippets/regression_model_test.py
new file mode 100644
index 0000000000..7d1bde689c
--- /dev/null
+++ b/samples/snippets/regression_model_test.py
@@ -0,0 +1,57 @@
+# Copyright 2023 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+def test_regression_model():
+    # [START bigquery_dataframes_regression_model]
+    from bigframes.ml.linear_model import LinearRegression
+    import bigframes.pandas as bpd
+
+    # Load data from BigQuery
+    query_or_table = "bigquery-public-data.ml_datasets.penguins"
+    bq_df = bpd.read_gbq(query_or_table)
+
+    # Filter down to the data for the Adelie Penguin species
+    adelie_data = bq_df[bq_df.species == "Adelie Penguin (Pygoscelis adeliae)"]
+
+    # Drop the species column
+    adelie_data = adelie_data.drop(columns=["species"])
+
+    # Drop rows with nulls to get training data
+    training_data = adelie_data.dropna()
+
+    # Specify your feature (or input) columns and the label (or output) column:
+    feature_columns = training_data[
+        ["island", "culmen_length_mm", "culmen_depth_mm", "flipper_length_mm", "sex"]
+    ]
+    label_columns = training_data[["body_mass_g"]]
+
+    test_data = adelie_data[adelie_data.body_mass_g.isnull()]
+
+    # Create the linear model
+    model = LinearRegression()
+    model.fit(feature_columns, label_columns)
+
+    # Score the model
+    score = model.score(feature_columns, label_columns)
+
+    # Predict using the model
+    result = model.predict(test_data)
+    # [END bigquery_dataframes_regression_model]
+    assert test_data is not None
+    assert feature_columns is not None
+    assert label_columns is not None
+    assert model is not None
+    assert score is not None
+    assert result is not None
diff --git a/tests/system/small/ml/conftest.py b/tests/system/small/ml/conftest.py
index 1dd1c813b8..c11445b79a 100644
--- a/tests/system/small/ml/conftest.py
+++ b/tests/system/small/ml/conftest.py
@@ -213,6 +213,13 @@ def palm2_text_generator_model(session, bq_connection) -> llm.PaLM2TextGenerator:
     return llm.PaLM2TextGenerator(session=session, connection_name=bq_connection)
 
 
+@pytest.fixture(scope="session")
+def palm2_text_generator_32k_model(session, bq_connection) -> llm.PaLM2TextGenerator:
+    return llm.PaLM2TextGenerator(
+        model_name="text-bison-32k", session=session, connection_name=bq_connection
+    )
+
+
 @pytest.fixture(scope="function")
 def ephemera_palm2_text_generator_model(
     session, bq_connection
@@ -229,6 +236,17 @@ def palm2_embedding_generator_model(
     session, bq_connection
 ) -> llm.PaLM2TextEmbeddingGenerator:
     )
 
 
+@pytest.fixture(scope="session")
+def palm2_embedding_generator_multilingual_model(
+    session, bq_connection
+) -> llm.PaLM2TextEmbeddingGenerator:
+    return llm.PaLM2TextEmbeddingGenerator(
model_name="textembedding-gecko-multilingual", + session=session, + connection_name=bq_connection, + ) + + @pytest.fixture(scope="session") def time_series_bqml_arima_plus_model( session, time_series_arima_plus_model_name diff --git a/tests/system/small/ml/test_llm.py b/tests/system/small/ml/test_llm.py index b7257dde1b..79d3c40317 100644 --- a/tests/system/small/ml/test_llm.py +++ b/tests/system/small/ml/test_llm.py @@ -26,6 +26,12 @@ def test_create_text_generator_model(palm2_text_generator_model): assert palm2_text_generator_model._bqml_model is not None +def test_create_text_generator_32k_model(palm2_text_generator_32k_model): + # Model creation doesn't return error + assert palm2_text_generator_32k_model is not None + assert palm2_text_generator_32k_model._bqml_model is not None + + @pytest.mark.flaky(retries=2, delay=120) def test_create_text_generator_model_default_session(bq_connection, llm_text_pandas_df): import bigframes.pandas as bpd @@ -48,6 +54,30 @@ def test_create_text_generator_model_default_session(bq_connection, llm_text_pan assert all(series.str.len() > 20) +@pytest.mark.flaky(retries=2, delay=120) +def test_create_text_generator_32k_model_default_session( + bq_connection, llm_text_pandas_df +): + import bigframes.pandas as bpd + + bpd.close_session() + bpd.options.bigquery.bq_connection = bq_connection + bpd.options.bigquery.location = "us" + + model = llm.PaLM2TextGenerator(model_name="text-bison-32k") + assert model is not None + assert model._bqml_model is not None + assert model.connection_name.casefold() == "bigframes-dev.us.bigframes-rf-conn" + + llm_text_df = bpd.read_pandas(llm_text_pandas_df) + + df = model.predict(llm_text_df).to_pandas() + TestCase().assertSequenceEqual(df.shape, (3, 1)) + assert "ml_generate_text_llm_result" in df.columns + series = df["ml_generate_text_llm_result"] + assert all(series.str.len() > 20) + + @pytest.mark.flaky(retries=2, delay=120) def test_create_text_generator_model_default_connection(llm_text_pandas_df): from bigframes import _config @@ -127,6 +157,14 @@ def test_create_embedding_generator_model(palm2_embedding_generator_model): assert palm2_embedding_generator_model._bqml_model is not None +def test_create_embedding_generator_multilingual_model( + palm2_embedding_generator_multilingual_model, +): + # Model creation doesn't return error + assert palm2_embedding_generator_multilingual_model is not None + assert palm2_embedding_generator_multilingual_model._bqml_model is not None + + def test_create_text_embedding_generator_model_defaults(bq_connection): import bigframes.pandas as bpd @@ -139,6 +177,20 @@ def test_create_text_embedding_generator_model_defaults(bq_connection): assert model._bqml_model is not None +def test_create_text_embedding_generator_multilingual_model_defaults(bq_connection): + import bigframes.pandas as bpd + + bpd.close_session() + bpd.options.bigquery.bq_connection = bq_connection + bpd.options.bigquery.location = "us" + + model = llm.PaLM2TextEmbeddingGenerator( + model_name="textembedding-gecko-multilingual" + ) + assert model is not None + assert model._bqml_model is not None + + @pytest.mark.flaky(retries=2, delay=120) def test_embedding_generator_predict_success( palm2_embedding_generator_model, llm_text_df @@ -152,6 +204,19 @@ def test_embedding_generator_predict_success( assert value.size == 768 +@pytest.mark.flaky(retries=2, delay=120) +def test_embedding_generator_multilingual_predict_success( + palm2_embedding_generator_multilingual_model, llm_text_df +): + df = 
+    df = palm2_embedding_generator_multilingual_model.predict(llm_text_df).to_pandas()
+    TestCase().assertSequenceEqual(df.shape, (3, 1))
+    assert "text_embedding" in df.columns
+    series = df["text_embedding"]
+    value = series[0]
+    assert isinstance(value, np.ndarray)
+    assert value.size == 768
+
+
 @pytest.mark.flaky(retries=2, delay=120)
 def test_embedding_generator_predict_series_success(
     palm2_embedding_generator_model, llm_text_df
diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py
index c96faa3526..bd5930e508 100644
--- a/tests/system/small/test_dataframe.py
+++ b/tests/system/small/test_dataframe.py
@@ -711,6 +711,22 @@ def test_df_dropna(scalars_dfs, axis, how, ignore_index):
     pandas.testing.assert_frame_equal(bf_result, pd_result)
 
 
+def test_df_interpolate(scalars_dfs):
+    scalars_df, scalars_pandas_df = scalars_dfs
+    columns = ["int64_col", "int64_too", "float64_col"]
+    bf_result = scalars_df[columns].interpolate().to_pandas()
+    # Pandas can only interpolate on "float64" columns
+    # https://github.com/pandas-dev/pandas/issues/40252
+    pd_result = scalars_pandas_df[columns].astype("float64").interpolate()
+
+    pandas.testing.assert_frame_equal(
+        bf_result,
+        pd_result,
+        check_index_type=False,
+        check_dtype=False,
+    )
+
+
 def test_df_fillna(scalars_dfs):
     scalars_df, scalars_pandas_df = scalars_dfs
     df = scalars_df[["int64_col", "float64_col"]].fillna(3)
@@ -787,6 +803,55 @@ def test_apply_series_scalar_callable(
     pandas.testing.assert_series_equal(bf_result, pd_result)
 
 
+def test_df_keys(
+    scalars_df_index,
+    scalars_pandas_df_index,
+):
+    pandas.testing.assert_index_equal(
+        scalars_df_index.keys(), scalars_pandas_df_index.keys()
+    )
+
+
+def test_df_iter(
+    scalars_df_index,
+    scalars_pandas_df_index,
+):
+    for bf_i, df_i in zip(scalars_df_index, scalars_pandas_df_index):
+        assert bf_i == df_i
+
+
+def test_iterrows(
+    scalars_df_index,
+    scalars_pandas_df_index,
+):
+    for (bf_index, bf_series), (pd_index, pd_series) in zip(
+        scalars_df_index.iterrows(), scalars_pandas_df_index.iterrows()
+    ):
+        assert bf_index == pd_index
+        pandas.testing.assert_series_equal(bf_series, pd_series)
+
+
+@pytest.mark.parametrize(
+    (
+        "index",
+        "name",
+    ),
+    [
+        (
+            True,
+            "my_df",
+        ),
+        (False, None),
+    ],
+)
+def test_itertuples(scalars_df_index, index, name):
+    # Numeric has slightly different representation as a result of conversions.
+    bf_tuples = scalars_df_index.itertuples(index, name)
+    pd_tuples = scalars_df_index.to_pandas().itertuples(index, name)
+    for bf_tuple, pd_tuple in zip(bf_tuples, pd_tuples):
+        assert bf_tuple == pd_tuple
+
+
 def test_df_isin_list(scalars_dfs):
     scalars_df, scalars_pandas_df = scalars_dfs
     values = ["Hello, World!", 55555, 2.51, pd.NA, True]
diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py
index 05d8b84185..183ba01c0e 100644
--- a/tests/system/small/test_series.py
+++ b/tests/system/small/test_series.py
@@ -272,6 +272,32 @@ def test_series_replace_list_scalar(scalars_dfs):
     )
 
 
+@pytest.mark.parametrize(
+    ("values",),
+    (
+        ([None, 1, 2, None, None, 16, None],),
+        ([None, None, 3.6, None],),
+        ([403.2, None, 352.1, None, None, 111.9],),
+    ),
+)
+def test_series_interpolate(values):
+    pd_series = pd.Series(values)
+    bf_series = series.Series(pd_series)
+
+    # Pandas can only interpolate on "float64" columns
+    # https://github.com/pandas-dev/pandas/issues/40252
+    pd_result = pd_series.astype("float64").interpolate()
+    bf_result = bf_series.interpolate().to_pandas()
+
+    # pd uses non-null types, while bf uses nullable types
+    pd.testing.assert_series_equal(
+        pd_result,
+        bf_result,
+        check_index_type=False,
+        check_dtype=False,
+    )
+
+
 @pytest.mark.parametrize(
     ("ignore_index",),
     (
diff --git a/tests/unit/resources.py b/tests/unit/resources.py
index f660d774f0..8fc8acd175 100644
--- a/tests/unit/resources.py
+++ b/tests/unit/resources.py
@@ -19,17 +19,21 @@
 import google.cloud.bigquery
 import ibis
 import pandas
+import pytest
 
 import bigframes
 import bigframes.core as core
 import bigframes.core.ordering
+import bigframes.dataframe
 import bigframes.session.clients
 
 """Utilities for creating test resources."""
 
 
 def create_bigquery_session(
-    bqclient: Optional[google.cloud.bigquery.Client] = None, session_id: str = "abcxyz"
+    bqclient: Optional[mock.Mock] = None,
+    session_id: str = "abcxyz",
+    anonymous_dataset: Optional[google.cloud.bigquery.DatasetReference] = None,
 ) -> bigframes.Session:
     credentials = mock.create_autospec(
         google.auth.credentials.Credentials, instance=True
@@ -39,6 +43,21 @@ def create_bigquery_session(
         bqclient = mock.create_autospec(google.cloud.bigquery.Client, instance=True)
         bqclient.project = "test-project"
 
+    if anonymous_dataset is None:
+        anonymous_dataset = google.cloud.bigquery.DatasetReference(
+            "test-project",
+            "test_dataset",
+        )
+
+    query_job = mock.create_autospec(google.cloud.bigquery.QueryJob)
+    type(query_job).destination = mock.PropertyMock(
+        return_value=anonymous_dataset.table("test_table"),
+    )
+    type(query_job).session_info = google.cloud.bigquery.SessionInfo(
+        {"sessionInfo": {"sessionId": session_id}},
+    )
+    bqclient.query.return_value = query_job
+
     clients_provider = mock.create_autospec(bigframes.session.clients.ClientsProvider)
     type(clients_provider).bqclient = mock.PropertyMock(return_value=bqclient)
     clients_provider._credentials = credentials
@@ -51,6 +70,19 @@ def create_bigquery_session(
     return session
 
 
+def create_dataframe(
+    monkeypatch: pytest.MonkeyPatch, session: Optional[bigframes.Session] = None
+) -> bigframes.dataframe.DataFrame:
+    if session is None:
+        session = create_bigquery_session()
+
+    # Since this may create a ReadLocalNode, the session we explicitly pass in
+    # might not actually be used. Mock out the global session, too.
+    monkeypatch.setattr(bigframes.core.global_session, "_global_session", session)
+    bigframes.options.bigquery._session_started = True
+    return bigframes.dataframe.DataFrame({}, session=session)
+
+
 def create_pandas_session(tables: Dict[str, pandas.DataFrame]) -> bigframes.Session:
     # TODO(tswast): Refactor to make helper available for all tests. Consider
     # providing a proper "local Session" for use by downstream developers.
diff --git a/tests/unit/session/test_io_bigquery.py b/tests/unit/session/test_io_bigquery.py
index d2255d5edf..03470208e4 100644
--- a/tests/unit/session/test_io_bigquery.py
+++ b/tests/unit/session/test_io_bigquery.py
@@ -14,6 +14,7 @@
 
 import datetime
 from typing import Iterable
+import unittest.mock as mock
 
 import google.cloud.bigquery as bigquery
 import pytest
@@ -37,7 +38,7 @@ def test_create_snapshot_sql_doesnt_timetravel_anonymous_datasets():
     assert "`my-test-project`.`_e8166e0cdb`.`anonbb92cd`" in sql
 
 
-def test_create_snapshot_sql_doesnt_timetravel_session_datasets():
+def test_create_snapshot_sql_doesnt_timetravel_session_tables():
     table_ref = bigquery.TableReference.from_string("my-test-project._session.abcdefg")
 
     sql = bigframes.session._io.bigquery.create_snapshot_sql(
@@ -51,6 +52,29 @@ def test_create_snapshot_sql_doesnt_timetravel_session_tables():
     assert "my-test-project" not in sql
 
 
+def test_create_temp_table_default_expiration():
+    """Make sure the created table has an expiration."""
+    bqclient = mock.create_autospec(bigquery.Client)
+    dataset = bigquery.DatasetReference("test-project", "test_dataset")
+    expiration = datetime.datetime(
+        2023, 11, 2, 13, 44, 55, 678901, datetime.timezone.utc
+    )
+
+    bigframes.session._io.bigquery.create_temp_table(bqclient, dataset, expiration)
+
+    bqclient.create_table.assert_called_once()
+    call_args = bqclient.create_table.call_args
+    table = call_args.args[0]
+    assert table.project == "test-project"
+    assert table.dataset_id == "test_dataset"
+    assert table.table_id.startswith("bqdf")
+    assert (
+        (expiration - datetime.timedelta(minutes=1))
+        < table.expires
+        < (expiration + datetime.timedelta(minutes=1))
+    )
+
+
 @pytest.mark.parametrize(
     ("schema", "expected"),
     (
diff --git a/tests/unit/test_dataframe.py b/tests/unit/test_dataframe.py
new file mode 100644
index 0000000000..17a8290889
--- /dev/null
+++ b/tests/unit/test_dataframe.py
@@ -0,0 +1,59 @@
+# Copyright 2023 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import google.cloud.bigquery
+import pytest
+
+from . import resources
+
+
+def test_dataframe_to_gbq_invalid_destination(monkeypatch: pytest.MonkeyPatch):
+    dataframe = resources.create_dataframe(monkeypatch)
+
+    with pytest.raises(ValueError, match="no_dataset_or_project"):
+        dataframe.to_gbq("no_dataset_or_project")
+
+
+def test_dataframe_to_gbq_invalid_if_exists(monkeypatch: pytest.MonkeyPatch):
+    dataframe = resources.create_dataframe(monkeypatch)
+
+    with pytest.raises(ValueError, match="notreallyanoption"):
+        # Even though the type is annotated with the literals we accept, users
+        # might not be using a type checker, especially not in an interactive
+        # notebook.
+        dataframe.to_gbq(if_exists="notreallyanoption")  # type: ignore
+
+
+def test_dataframe_to_gbq_invalid_if_exists_no_destination(
+    monkeypatch: pytest.MonkeyPatch,
+):
+    dataframe = resources.create_dataframe(monkeypatch)
+
+    with pytest.raises(ValueError, match="append"):
+        dataframe.to_gbq(if_exists="append")
+
+
+def test_dataframe_to_gbq_writes_to_anonymous_dataset(
+    monkeypatch: pytest.MonkeyPatch,
+):
+    anonymous_dataset_id = "my-anonymous-project.my_anonymous_dataset"
+    anonymous_dataset = google.cloud.bigquery.DatasetReference.from_string(
+        anonymous_dataset_id
+    )
+    session = resources.create_bigquery_session(anonymous_dataset=anonymous_dataset)
+    dataframe = resources.create_dataframe(monkeypatch, session=session)
+
+    destination = dataframe.to_gbq()
+
+    assert destination.startswith(anonymous_dataset_id)
diff --git a/tests/unit/test_pandas.py b/tests/unit/test_pandas.py
index 5d4f69c7c0..70c5441c68 100644
--- a/tests/unit/test_pandas.py
+++ b/tests/unit/test_pandas.py
@@ -116,7 +116,7 @@ def test_pandas_attribute():
     assert bpd.ArrowDtype is pd.ArrowDtype
 
 
-def test_close_session_after_bq_session_ended(monkeypatch):
+def test_close_session_after_bq_session_ended(monkeypatch: pytest.MonkeyPatch):
     bqclient = mock.create_autospec(google.cloud.bigquery.Client, instance=True)
     bqclient.project = "test-project"
 
     session = resources.create_bigquery_session(
@@ -141,7 +141,7 @@ def test_close_session_after_bq_session_ended(monkeypatch):
         google.api_core.exceptions.BadRequest,
         match="Session JUST_A_TEST has expired and is no longer available.",
     ):
-        bpd.read_gbq("SELECT 1")
+        bpd.read_gbq("SELECT 'ABC'")
 
     # Even though the query to stop the session raises an exception, we should
     # still be able to close it without raising an error to the user.
diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py
index 013d170114..6f4f6be35d 100644
--- a/third_party/bigframes_vendored/pandas/core/frame.py
+++ b/third_party/bigframes_vendored/pandas/core/frame.py
@@ -125,12 +125,12 @@ def to_numpy(self, dtype=None, copy=False, na_value=None, **kwargs) -> np.ndarra
 
     def to_gbq(
         self,
-        destination_table: str,
+        destination_table: Optional[str],
         *,
-        if_exists: Optional[Literal["fail", "replace", "append"]] = "fail",
+        if_exists: Optional[Literal["fail", "replace", "append"]] = None,
         index: bool = True,
         ordering_id: Optional[str] = None,
-    ) -> None:
+    ) -> str:
         """Write a DataFrame to a BigQuery table.
 
         **Examples:**
 
@@ -138,17 +140,40 @@
             >>> import bigframes.pandas as bpd
             >>> bpd.options.display.progress_bar = None
 
+        Write a DataFrame to a BigQuery table.
+
             >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
             >>> # destination_table = PROJECT_ID + "." + DATASET_ID + "." + TABLE_NAME
             >>> df.to_gbq("bigframes-dev.birds.test-numbers", if_exists="replace")
+            'bigframes-dev.birds.test-numbers'
+
+        Write a DataFrame to a temporary BigQuery table in the anonymous dataset.
+
+            >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
+            >>> destination = df.to_gbq(ordering_id="ordering_id")
+            >>> # The table created can be read outside of the current session.
+            >>> bpd.close_session()  # For demonstration only.
+            >>> bpd.read_gbq(destination, index_col="ordering_id")
+                         col1  col2
+            ordering_id
+            0               1     3
+            1               2     4
+
+            [2 rows x 2 columns]
 
         Args:
-            destination_table (str):
+            destination_table (Optional[str]):
                 Name of table to be written, in the form ``dataset.tablename``
                 or ``project.dataset.tablename``.
 
-            if_exists (str, default 'fail'):
-                Behavior when the destination table exists. Value can be one of:
+                If no ``destination_table`` is set, a new temporary table is
+                created in the BigQuery anonymous dataset.
+
+            if_exists (Optional[str]):
+                Behavior when the destination table exists. When
+                ``destination_table`` is set, this defaults to ``'fail'``. When
+                ``destination_table`` is not set, this field is not applicable.
+                A new table is always created. Value can be one of:
 
                 ``'fail'``
                     If table exists raise pandas_gbq.gbq.TableCreationError.
@@ -163,6 +186,11 @@
             ordering_id (Optional[str], default None):
                 If set, write the ordering of the DataFrame as a column in the
                 result table with this name.
+
+        Returns:
+            str:
+                The fully-qualified ID for the written table, in the form
+                ``project.dataset.tablename``.
         """
         raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
 
@@ -947,6 +975,85 @@ def isin(self, values):
         """
         raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
 
+    def keys(self):
+        """
+        Get the 'info axis'.
+
+        This is index for Series, columns for DataFrame.
+
+        Returns:
+            Index: Info axis.
+
+        **Examples:**
+
+            >>> import bigframes.pandas as bpd
+            >>> bpd.options.display.progress_bar = None
+
+            >>> df = bpd.DataFrame({
+            ...     'A': [1, 2, 3],
+            ...     'B': [4, 5, 6],
+            ...     })
+            >>> df.keys()
+            Index(['A', 'B'], dtype='object')
+        """
+        raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
+
+    def iterrows(self):
+        """
+        Iterate over DataFrame rows as (index, Series) pairs.
+
+        Yields:
+            a tuple (index, data) where data contains row values as a Series
+
+        **Examples:**
+
+            >>> import bigframes.pandas as bpd
+            >>> bpd.options.display.progress_bar = None
+            >>> df = bpd.DataFrame({
+            ...     'A': [1, 2, 3],
+            ...     'B': [4, 5, 6],
+            ...     })
+            >>> index, row = next(df.iterrows())
+            >>> index
+            0
+            >>> row
+            A    1
+            B    4
+            Name: 0, dtype: object
+        """
+        raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
+
+    def itertuples(self, index: bool = True, name: str | None = "Pandas"):
+        """
+        Iterate over DataFrame rows as namedtuples.
+
+        Args:
+            index (bool, default True):
+                If True, return the index as the first element of the tuple.
+            name (str or None, default "Pandas"):
+                The name of the returned namedtuples or None to return regular
+                tuples.
+
+        Returns:
+            iterator:
+                An object to iterate over namedtuples for each row in the
+                DataFrame with the first field possibly being the index and
+                following fields being the column values.
+
+        **Examples:**
+
+            >>> import bigframes.pandas as bpd
+            >>> bpd.options.display.progress_bar = None
+            >>> df = bpd.DataFrame({
+            ...     'A': [1, 2, 3],
+            ...     'B': [4, 5, 6],
+            ...     })
+            >>> next(df.itertuples(name="Pair"))
+            Pair(Index=0, A=1, B=4)
+        """
+        raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
+
     def items(self):
         """
         Iterate over (column name, Series) pairs.
@@ -2756,6 +2863,43 @@ def value_counts(
         """
         raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
 
+    def interpolate(self, method: str = "linear"):
+        """
+        Fill NaN values using an interpolation method.
+
+        Args:
+            method (str, default 'linear'):
+                Interpolation technique to use. Only 'linear' supported.
+                'linear': Ignore the index and treat the values as equally spaced.
+                This is the only method supported on MultiIndexes.
+
+        Returns:
+            DataFrame:
+                Returns the same object type as the caller, interpolated at
+                some or all ``NaN`` values
+
+        **Examples:**
+
+            >>> import bigframes.pandas as bpd
+            >>> bpd.options.display.progress_bar = None
+
+            >>> df = bpd.DataFrame({
+            ...     'A': [1, 2, 3, None, None, 6],
+            ...     'B': [None, 6, None, 2, None, 3],
+            ...     })
+            >>> df.interpolate()
+                 A     B
+            0  1.0  <NA>
+            1  2.0   6.0
+            2  3.0   4.0
+            3  4.0   2.0
+            4  5.0   2.5
+            5  6.0   3.0
+
+            [6 rows x 2 columns]
+        """
+        raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
+
     def fillna(self, value):
         """
         Fill NA/NaN values using the specified method.
diff --git a/third_party/bigframes_vendored/pandas/core/generic.py b/third_party/bigframes_vendored/pandas/core/generic.py
index 27d2e84537..127efe6a3d 100644
--- a/third_party/bigframes_vendored/pandas/core/generic.py
+++ b/third_party/bigframes_vendored/pandas/core/generic.py
@@ -1,7 +1,7 @@
 # Contains code from https://github.com/pandas-dev/pandas/blob/main/pandas/core/generic.py
 from __future__ import annotations
 
-from typing import Literal, Optional
+from typing import Iterator, Literal, Optional
 
 from bigframes import constants
 from third_party.bigframes_vendored.pandas.core import indexing
@@ -35,6 +35,35 @@ def size(self) -> int:
         """
         raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
 
+    def __iter__(self) -> Iterator:
+        """
+        Iterate over info axis.
+
+        Returns:
+            iterator: Info axis as iterator.
+
+        **Examples:**
+
+            >>> import bigframes.pandas as bpd
+            >>> bpd.options.display.progress_bar = None
+
+            >>> df = bpd.DataFrame({
+            ...     'A': [1, 2, 3],
+            ...     'B': [4, 5, 6],
+            ...     })
+            >>> for x in df:
+            ...     print(x)
+            A
+            B
+
+            >>> series = bpd.Series(["a", "b", "c"], index=[10, 20, 30])
+            >>> for x in series:
+            ...     print(x)
+            10
+            20
+            30
+        """
+        raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
+
     # -------------------------------------------------------------------------
     # Unary Methods
 
diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py
index f0e13e16f5..b569e5699c 100644
--- a/third_party/bigframes_vendored/pandas/core/series.py
+++ b/third_party/bigframes_vendored/pandas/core/series.py
@@ -916,6 +916,38 @@ def droplevel(self, level, axis):
         """
         raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
 
+    def interpolate(self, method: str = "linear"):
+        """
+        Fill NaN values using an interpolation method.
+
+        Args:
+            method (str, default 'linear'):
+                Interpolation technique to use. Only 'linear' supported.
+                'linear': Ignore the index and treat the values as equally spaced.
+                This is the only method supported on MultiIndexes.
+
+        Returns:
+            Series:
+                Returns the same object type as the caller, interpolated at
+                some or all ``NaN`` values
+
+        **Examples:**
+
+            >>> import bigframes.pandas as bpd
+            >>> bpd.options.display.progress_bar = None
+
+            >>> series = bpd.Series([1, 2, 3, None, None, 6])
+            >>> series.interpolate()
+            0    1.0
+            1    2.0
+            2    3.0
+            3    4.0
+            4    5.0
+            5    6.0
+            dtype: Float64
+        """
+        raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
+
     def fillna(
         self,
         value=None,