From 5a7b1c9a0d1607a46823863a6a7bc861f9c9b5af Mon Sep 17 00:00:00 2001 From: Garrett Wu <6505921+GarrettWu@users.noreply.github.com> Date: Tue, 7 May 2024 11:44:07 -0700 Subject: [PATCH 01/17] refactor: ml model load read from class type hints (#656) * refactor: ml model load read from class type hints * exclude unrelated files * fix NoneType * fix tests * fix tests * fix param mappings * fix tests --- bigframes/ml/cluster.py | 28 +++--- bigframes/ml/decomposition.py | 23 ++--- bigframes/ml/ensemble.py | 119 ++++++++++--------------- bigframes/ml/forecasting.py | 42 +++------ bigframes/ml/imported.py | 32 +++---- bigframes/ml/linear_model.py | 61 ++++--------- bigframes/ml/llm.py | 71 +++++++-------- bigframes/ml/loader.py | 4 +- bigframes/ml/utils.py | 39 +++++++- tests/system/large/ml/test_ensemble.py | 4 +- 10 files changed, 192 insertions(+), 231 deletions(-) diff --git a/bigframes/ml/cluster.py b/bigframes/ml/cluster.py index e572bb3bfb..43cfbdd424 100644 --- a/bigframes/ml/cluster.py +++ b/bigframes/ml/cluster.py @@ -34,6 +34,7 @@ "distance_type": "distanceType", "max_iter": "maxIterations", "tol": "minRelativeProgress", + "warm_start": "warmStart", } @@ -67,27 +68,18 @@ def __init__( self._bqml_model_factory = globals.bqml_model_factory() @classmethod - def _from_bq(cls, session: bigframes.Session, model: bigquery.Model) -> KMeans: - assert model.model_type == "KMEANS" + def _from_bq(cls, session: bigframes.Session, bq_model: bigquery.Model) -> KMeans: + assert bq_model.model_type == "KMEANS" kwargs: dict = {} - # See https://cloud.google.com/bigquery/docs/reference/rest/v2/models#trainingrun - last_fitting = model.training_runs[-1]["trainingOptions"] - dummy_kmeans = cls() - for bf_param, bf_value in dummy_kmeans.__dict__.items(): - bqml_param = _BQML_PARAMS_MAPPING.get(bf_param) - if bqml_param in last_fitting: - # Convert types - kwargs[bf_param] = ( - str(last_fitting[bqml_param]) - if bf_param in ["init"] - else type(bf_value)(last_fitting[bqml_param]) - ) - - new_kmeans = cls(**kwargs) - new_kmeans._bqml_model = core.BqmlModel(session, model) - return new_kmeans + kwargs = utils.retrieve_params_from_bq_model( + cls, bq_model, _BQML_PARAMS_MAPPING + ) + + model = cls(**kwargs) + model._bqml_model = core.BqmlModel(session, bq_model) + return model @property def _bqml_options(self) -> dict: diff --git a/bigframes/ml/decomposition.py b/bigframes/ml/decomposition.py index 01b1fda628..ad0bce481f 100644 --- a/bigframes/ml/decomposition.py +++ b/bigframes/ml/decomposition.py @@ -27,6 +27,8 @@ from bigframes.ml import base, core, globals, utils import bigframes.pandas as bpd +_BQML_PARAMS_MAPPING = {"svd_solver": "pcaSolver"} + @log_adapter.class_logger class PCA( @@ -47,23 +49,22 @@ def __init__( self._bqml_model_factory = globals.bqml_model_factory() @classmethod - def _from_bq(cls, session: bigframes.Session, model: bigquery.Model) -> PCA: - assert model.model_type == "PCA" + def _from_bq(cls, session: bigframes.Session, bq_model: bigquery.Model) -> PCA: + assert bq_model.model_type == "PCA" - kwargs: dict = {} + kwargs = utils.retrieve_params_from_bq_model( + cls, bq_model, _BQML_PARAMS_MAPPING + ) - # See https://cloud.google.com/bigquery/docs/reference/rest/v2/models#trainingrun - last_fitting = model.training_runs[-1]["trainingOptions"] + last_fitting = bq_model.training_runs[-1]["trainingOptions"] if "numPrincipalComponents" in last_fitting: kwargs["n_components"] = int(last_fitting["numPrincipalComponents"]) - if 
"pcaExplainedVarianceRatio" in last_fitting: + elif "pcaExplainedVarianceRatio" in last_fitting: kwargs["n_components"] = float(last_fitting["pcaExplainedVarianceRatio"]) - if "pcaSolver" in last_fitting: - kwargs["svd_solver"] = str(last_fitting["pcaSolver"]) - new_pca = cls(**kwargs) - new_pca._bqml_model = core.BqmlModel(session, model) - return new_pca + model = cls(**kwargs) + model._bqml_model = core.BqmlModel(session, bq_model) + return model @property def _bqml_options(self) -> dict: diff --git a/bigframes/ml/ensemble.py b/bigframes/ml/ensemble.py index b248c295f4..8fc1e22146 100644 --- a/bigframes/ml/ensemble.py +++ b/bigframes/ml/ensemble.py @@ -30,9 +30,10 @@ _BQML_PARAMS_MAPPING = { "booster": "boosterType", + "dart_normalized_type": "dartNormalizeType", "tree_method": "treeMethod", - "colsample_bytree": "colsampleBylevel", - "colsample_bylevel": "colsampleBytree", + "colsample_bytree": "colsampleBytree", + "colsample_bylevel": "colsampleBylevel", "colsample_bynode": "colsampleBynode", "gamma": "minSplitLoss", "subsample": "subsample", @@ -44,6 +45,8 @@ "min_tree_child_weight": "minTreeChildWeight", "max_depth": "maxTreeDepth", "max_iterations": "maxIterations", + "enable_global_explain": "enableGlobalExplain", + "xgboost_version": "xgboostVersion", } @@ -99,24 +102,17 @@ def __init__( @classmethod def _from_bq( - cls, session: bigframes.Session, model: bigquery.Model + cls, session: bigframes.Session, bq_model: bigquery.Model ) -> XGBRegressor: - assert model.model_type == "BOOSTED_TREE_REGRESSOR" + assert bq_model.model_type == "BOOSTED_TREE_REGRESSOR" - kwargs = {} - - # See https://cloud.google.com/bigquery/docs/reference/rest/v2/models#trainingrun - last_fitting = model.training_runs[-1]["trainingOptions"] - - dummy_regressor = cls() - for bf_param, bf_value in dummy_regressor.__dict__.items(): - bqml_param = _BQML_PARAMS_MAPPING.get(bf_param) - if bqml_param in last_fitting: - kwargs[bf_param] = type(bf_value)(last_fitting[bqml_param]) + kwargs = utils.retrieve_params_from_bq_model( + cls, bq_model, _BQML_PARAMS_MAPPING + ) - new_xgb_regressor = cls(**kwargs) - new_xgb_regressor._bqml_model = core.BqmlModel(session, model) - return new_xgb_regressor + model = cls(**kwargs) + model._bqml_model = core.BqmlModel(session, bq_model) + return model @property def _bqml_options(self) -> Dict[str, str | int | bool | float | List[str]]: @@ -255,24 +251,17 @@ def __init__( @classmethod def _from_bq( - cls, session: bigframes.Session, model: bigquery.Model + cls, session: bigframes.Session, bq_model: bigquery.Model ) -> XGBClassifier: - assert model.model_type == "BOOSTED_TREE_CLASSIFIER" + assert bq_model.model_type == "BOOSTED_TREE_CLASSIFIER" - kwargs = {} - - # See https://cloud.google.com/bigquery/docs/reference/rest/v2/models#trainingrun - last_fitting = model.training_runs[-1]["trainingOptions"] - - dummy_classifier = XGBClassifier() - for bf_param, bf_value in dummy_classifier.__dict__.items(): - bqml_param = _BQML_PARAMS_MAPPING.get(bf_param) - if bqml_param is not None: - kwargs[bf_param] = type(bf_value)(last_fitting[bqml_param]) + kwargs = utils.retrieve_params_from_bq_model( + cls, bq_model, _BQML_PARAMS_MAPPING + ) - new_xgb_classifier = cls(**kwargs) - new_xgb_classifier._bqml_model = core.BqmlModel(session, model) - return new_xgb_classifier + model = cls(**kwargs) + model._bqml_model = core.BqmlModel(session, bq_model) + return model @property def _bqml_options(self) -> Dict[str, str | int | bool | float | 
List[str]]: @@ -370,16 +359,16 @@ def __init__( *, tree_method: Literal["auto", "exact", "approx", "hist"] = "auto", min_tree_child_weight: int = 1, - colsample_bytree=1.0, - colsample_bylevel=1.0, - colsample_bynode=0.8, - gamma=0.00, + colsample_bytree: float = 1.0, + colsample_bylevel: float = 1.0, + colsample_bynode: float = 0.8, + gamma: float = 0.0, max_depth: int = 15, - subsample=0.8, - reg_alpha=0.0, - reg_lambda=1.0, - tol=0.01, - enable_global_explain=False, + subsample: float = 0.8, + reg_alpha: float = 0.0, + reg_lambda: float = 1.0, + tol: float = 0.01, + enable_global_explain: bool = False, xgboost_version: Literal["0.9", "1.1"] = "0.9", ): self.n_estimators = n_estimators @@ -401,24 +390,17 @@ def __init__( @classmethod def _from_bq( - cls, session: bigframes.Session, model: bigquery.Model + cls, session: bigframes.Session, bq_model: bigquery.Model ) -> RandomForestRegressor: - assert model.model_type == "RANDOM_FOREST_REGRESSOR" - - kwargs = {} - - # See https://cloud.google.com/bigquery/docs/reference/rest/v2/models#trainingrun - last_fitting = model.training_runs[-1]["trainingOptions"] + assert bq_model.model_type == "RANDOM_FOREST_REGRESSOR" - dummy_model = cls() - for bf_param, bf_value in dummy_model.__dict__.items(): - bqml_param = _BQML_PARAMS_MAPPING.get(bf_param) - if bqml_param in last_fitting: - kwargs[bf_param] = type(bf_value)(last_fitting[bqml_param]) + kwargs = utils.retrieve_params_from_bq_model( + cls, bq_model, _BQML_PARAMS_MAPPING + ) - new_random_forest_regressor = cls(**kwargs) - new_random_forest_regressor._bqml_model = core.BqmlModel(session, model) - return new_random_forest_regressor + model = cls(**kwargs) + model._bqml_model = core.BqmlModel(session, bq_model) + return model @property def _bqml_options(self) -> Dict[str, str | int | bool | float | List[str]]: @@ -542,7 +524,7 @@ def __init__( reg_alpha: float = 0.0, reg_lambda: float = 1.0, tol: float = 0.01, - enable_global_explain=False, + enable_global_explain: bool = False, xgboost_version: Literal["0.9", "1.1"] = "0.9", ): self.n_estimators = n_estimators @@ -564,24 +546,17 @@ def __init__( @classmethod def _from_bq( - cls, session: bigframes.Session, model: bigquery.Model + cls, session: bigframes.Session, bq_model: bigquery.Model ) -> RandomForestClassifier: - assert model.model_type == "RANDOM_FOREST_CLASSIFIER" - - kwargs = {} + assert bq_model.model_type == "RANDOM_FOREST_CLASSIFIER" - # See https://cloud.google.com/bigquery/docs/reference/rest/v2/models#trainingrun - last_fitting = model.training_runs[-1]["trainingOptions"] - - dummy_model = RandomForestClassifier() - for bf_param, bf_value in dummy_model.__dict__.items(): - bqml_param = _BQML_PARAMS_MAPPING.get(bf_param) - if bqml_param is not None: - kwargs[bf_param] = type(bf_value)(last_fitting[bqml_param]) + kwargs = utils.retrieve_params_from_bq_model( + cls, bq_model, _BQML_PARAMS_MAPPING + ) - new_random_forest_classifier = cls(**kwargs) - new_random_forest_classifier._bqml_model = core.BqmlModel(session, model) - return new_random_forest_classifier + model = cls(**kwargs) + model._bqml_model = core.BqmlModel(session, bq_model) + return model @property def _bqml_options(self) -> Dict[str, str | int | bool | float | List[str]]: diff --git a/bigframes/ml/forecasting.py b/bigframes/ml/forecasting.py index 783e7741b8..a87df61801 100644 --- a/bigframes/ml/forecasting.py +++ b/bigframes/ml/forecasting.py @@ -32,6 +32,7 @@ "auto_arima_min_order": "autoArimaMinOrder", 
"order": "nonSeasonalOrder", "data_frequency": "dataFrequency", + "include_drift": "includeDrift", "holiday_region": "holidayRegion", "clean_spikes_and_dips": "cleanSpikesAndDips", "adjust_step_changes": "adjustStepChanges", @@ -131,35 +132,18 @@ def __init__( self._bqml_model_factory = globals.bqml_model_factory() @classmethod - def _from_bq(cls, session: bigframes.Session, model: bigquery.Model) -> ARIMAPlus: - assert model.model_type == "ARIMA_PLUS" - - kwargs: dict = {} - last_fitting = model.training_runs[-1]["trainingOptions"] - - dummy_arima = cls() - for bf_param, bf_value in dummy_arima.__dict__.items(): - bqml_param = _BQML_PARAMS_MAPPING.get(bf_param) - if bqml_param in last_fitting: - # Convert types - if bf_param in ["time_series_length_fraction"]: - kwargs[bf_param] = float(last_fitting[bqml_param]) - elif bf_param in [ - "auto_arima_max_order", - "auto_arima_min_order", - "min_time_series_length", - "max_time_series_length", - "trend_smoothing_window_size", - ]: - kwargs[bf_param] = int(last_fitting[bqml_param]) - elif bf_param in ["holiday_region"]: - kwargs[bf_param] = str(last_fitting[bqml_param]) - else: - kwargs[bf_param] = type(bf_value)(last_fitting[bqml_param]) - - new_arima_plus = cls(**kwargs) - new_arima_plus._bqml_model = core.BqmlModel(session, model) - return new_arima_plus + def _from_bq( + cls, session: bigframes.Session, bq_model: bigquery.Model + ) -> ARIMAPlus: + assert bq_model.model_type == "ARIMA_PLUS" + + kwargs = utils.retrieve_params_from_bq_model( + cls, bq_model, _BQML_PARAMS_MAPPING + ) + + model = cls(**kwargs) + model._bqml_model = core.BqmlModel(session, bq_model) + return model @property def _bqml_options(self) -> dict: diff --git a/bigframes/ml/imported.py b/bigframes/ml/imported.py index a642fae74d..cb8fe7a96e 100644 --- a/bigframes/ml/imported.py +++ b/bigframes/ml/imported.py @@ -56,13 +56,13 @@ def _create_bqml_model(self): @classmethod def _from_bq( - cls, session: bigframes.Session, model: bigquery.Model + cls, session: bigframes.Session, bq_model: bigquery.Model ) -> TensorFlowModel: - assert model.model_type == "TENSORFLOW" + assert bq_model.model_type == "TENSORFLOW" - tf_model = cls(session=session, model_path="") - tf_model._bqml_model = core.BqmlModel(session, model) - return tf_model + model = cls(session=session, model_path="") + model._bqml_model = core.BqmlModel(session, bq_model) + return model def predict(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: """Predict the result from input DataFrame. @@ -134,12 +134,14 @@ def _create_bqml_model(self): ) @classmethod - def _from_bq(cls, session: bigframes.Session, model: bigquery.Model) -> ONNXModel: - assert model.model_type == "ONNX" + def _from_bq( + cls, session: bigframes.Session, bq_model: bigquery.Model + ) -> ONNXModel: + assert bq_model.model_type == "ONNX" - onnx_model = cls(session=session, model_path="") - onnx_model._bqml_model = core.BqmlModel(session, model) - return onnx_model + model = cls(session=session, model_path="") + model._bqml_model = core.BqmlModel(session, bq_model) + return model def predict(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: """Predict the result from input DataFrame. 
@@ -249,13 +251,13 @@ def _create_bqml_model(self): @classmethod def _from_bq( - cls, session: bigframes.Session, model: bigquery.Model + cls, session: bigframes.Session, bq_model: bigquery.Model ) -> XGBoostModel: - assert model.model_type == "XGBOOST" + assert bq_model.model_type == "XGBOOST" - xgboost_model = cls(session=session, model_path="") - xgboost_model._bqml_model = core.BqmlModel(session, model) - return xgboost_model + model = cls(session=session, model_path="") + model._bqml_model = core.BqmlModel(session, bq_model) + return model def predict(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: """Predict the result from input DataFrame. diff --git a/bigframes/ml/linear_model.py b/bigframes/ml/linear_model.py index 0c76a39a1c..32168e9a34 100644 --- a/bigframes/ml/linear_model.py +++ b/bigframes/ml/linear_model.py @@ -42,7 +42,6 @@ "warm_start": "warmStart", "calculate_p_values": "calculatePValues", "enable_global_explain": "enableGlobalExplain", - "category_encoding_method": "categoryEncodingMethod", } @@ -88,30 +87,17 @@ def __init__( @classmethod def _from_bq( - cls, session: bigframes.Session, model: bigquery.Model + cls, session: bigframes.Session, bq_model: bigquery.Model ) -> LinearRegression: - assert model.model_type == "LINEAR_REGRESSION" + assert bq_model.model_type == "LINEAR_REGRESSION" - # TODO(bmil): construct a standard way to extract these properties - kwargs = {} - - # See https://cloud.google.com/bigquery/docs/reference/rest/v2/models#trainingrun - last_fitting = model.training_runs[-1]["trainingOptions"] - - dummy_linear = cls() - for bf_param, bf_value in dummy_linear.__dict__.items(): - bqml_param = _BQML_PARAMS_MAPPING.get(bf_param) - if bqml_param in last_fitting: - # Convert types - kwargs[bf_param] = ( - float(last_fitting[bqml_param]) - if bf_param in ["l1_reg", "learning_rate", "ls_init_learning_rate"] - else type(bf_value)(last_fitting[bqml_param]) - ) + kwargs = utils.retrieve_params_from_bq_model( + cls, bq_model, _BQML_PARAMS_MAPPING + ) - new_linear_regression = cls(**kwargs) - new_linear_regression._bqml_model = core.BqmlModel(session, model) - return new_linear_regression + model = cls(**kwargs) + model._bqml_model = core.BqmlModel(session, bq_model) + return model @property def _bqml_options(self) -> dict: @@ -243,33 +229,24 @@ def __init__( @classmethod def _from_bq( - cls, session: bigframes.Session, model: bigquery.Model + cls, session: bigframes.Session, bq_model: bigquery.Model ) -> LogisticRegression: - assert model.model_type == "LOGISTIC_REGRESSION" - - kwargs = {} - - # See https://cloud.google.com/bigquery/docs/reference/rest/v2/models#trainingrun - last_fitting = model.training_runs[-1]["trainingOptions"] - dummy_logistic = cls() - for bf_param, bf_value in dummy_logistic.__dict__.items(): - bqml_param = _BQML_PARAMS_MAPPING.get(bf_param) - if bqml_param in last_fitting: - # Convert types - kwargs[bf_param] = ( - float(last_fitting[bqml_param]) - if bf_param in ["l1_reg", "learning_rate", "ls_init_learning_rate"] - else type(bf_value)(last_fitting[bqml_param]) - ) + assert bq_model.model_type == "LOGISTIC_REGRESSION" + + kwargs = utils.retrieve_params_from_bq_model( + cls, bq_model, _BQML_PARAMS_MAPPING + ) + + last_fitting = bq_model.training_runs[-1]["trainingOptions"] if last_fitting["autoClassWeights"]: kwargs["class_weight"] = "balanced" # TODO(ashleyxu) support class_weight in the constructor. 
# if "labelClassWeights" in last_fitting: # kwargs["class_weight"] = last_fitting["labelClassWeights"] - new_logistic_regression = cls(**kwargs) - new_logistic_regression._bqml_model = core.BqmlModel(session, model) - return new_logistic_regression + model = cls(**kwargs) + model._bqml_model = core.BqmlModel(session, bq_model) + return model @property def _bqml_options(self) -> dict: diff --git a/bigframes/ml/llm.py b/bigframes/ml/llm.py index 4b07524194..77dc1d2b0f 100644 --- a/bigframes/ml/llm.py +++ b/bigframes/ml/llm.py @@ -128,38 +128,30 @@ def _create_bqml_model(self): @classmethod def _from_bq( - cls, session: bigframes.Session, model: bigquery.Model + cls, session: bigframes.Session, bq_model: bigquery.Model ) -> PaLM2TextGenerator: - assert model.model_type == "MODEL_TYPE_UNSPECIFIED" - assert "remoteModelInfo" in model._properties - assert "endpoint" in model._properties["remoteModelInfo"] - assert "connection" in model._properties["remoteModelInfo"] + assert bq_model.model_type == "MODEL_TYPE_UNSPECIFIED" + assert "remoteModelInfo" in bq_model._properties + assert "endpoint" in bq_model._properties["remoteModelInfo"] + assert "connection" in bq_model._properties["remoteModelInfo"] # Parse the remote model endpoint - bqml_endpoint = model._properties["remoteModelInfo"]["endpoint"] - model_connection = model._properties["remoteModelInfo"]["connection"] + bqml_endpoint = bq_model._properties["remoteModelInfo"]["endpoint"] + model_connection = bq_model._properties["remoteModelInfo"]["connection"] model_endpoint = bqml_endpoint.split("/")[-1] - # Get the optional params - kwargs: dict = {} - last_fitting = model.training_runs[-1]["trainingOptions"] - - dummy_text_generator = cls(session=session) - for bf_param, _ in dummy_text_generator.__dict__.items(): - bqml_param = _BQML_PARAMS_MAPPING.get(bf_param) - if bqml_param in last_fitting: - # Convert types - if bf_param in ["max_iterations"]: - kwargs[bf_param] = int(last_fitting[bqml_param]) + kwargs = utils.retrieve_params_from_bq_model( + cls, bq_model, _BQML_PARAMS_MAPPING + ) - text_generator_model = cls( + model = cls( **kwargs, session=session, model_name=model_endpoint, connection_name=model_connection, ) - text_generator_model._bqml_model = core.BqmlModel(session, model) - return text_generator_model + model._bqml_model = core.BqmlModel(session, bq_model) + return model @property def _bqml_options(self) -> dict: @@ -464,29 +456,30 @@ def _create_bqml_model(self): @classmethod def _from_bq( - cls, session: bigframes.Session, model: bigquery.Model + cls, session: bigframes.Session, bq_model: bigquery.Model ) -> PaLM2TextEmbeddingGenerator: - assert model.model_type == "MODEL_TYPE_UNSPECIFIED" - assert "remoteModelInfo" in model._properties - assert "endpoint" in model._properties["remoteModelInfo"] - assert "connection" in model._properties["remoteModelInfo"] + assert bq_model.model_type == "MODEL_TYPE_UNSPECIFIED" + assert "remoteModelInfo" in bq_model._properties + assert "endpoint" in bq_model._properties["remoteModelInfo"] + assert "connection" in bq_model._properties["remoteModelInfo"] # Parse the remote model endpoint - bqml_endpoint = model._properties["remoteModelInfo"]["endpoint"] - model_connection = model._properties["remoteModelInfo"]["connection"] + bqml_endpoint = bq_model._properties["remoteModelInfo"]["endpoint"] + model_connection = bq_model._properties["remoteModelInfo"]["connection"] model_endpoint = bqml_endpoint.split("/")[-1] model_name, version = utils.parse_model_endpoint(model_endpoint) - 
embedding_generator_model = cls( + model = cls( session=session, # str to literals model_name=model_name, # type: ignore version=version, connection_name=model_connection, ) - embedding_generator_model._bqml_model = core.BqmlModel(session, model) - return embedding_generator_model + + model._bqml_model = core.BqmlModel(session, bq_model) + return model def predict(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: """Predict the result from input DataFrame. @@ -616,18 +609,18 @@ def _create_bqml_model(self): @classmethod def _from_bq( - cls, session: bigframes.Session, model: bigquery.Model + cls, session: bigframes.Session, bq_model: bigquery.Model ) -> GeminiTextGenerator: - assert model.model_type == "MODEL_TYPE_UNSPECIFIED" - assert "remoteModelInfo" in model._properties - assert "connection" in model._properties["remoteModelInfo"] + assert bq_model.model_type == "MODEL_TYPE_UNSPECIFIED" + assert "remoteModelInfo" in bq_model._properties + assert "connection" in bq_model._properties["remoteModelInfo"] # Parse the remote model endpoint - model_connection = model._properties["remoteModelInfo"]["connection"] + model_connection = bq_model._properties["remoteModelInfo"]["connection"] - text_generator_model = cls(session=session, connection_name=model_connection) - text_generator_model._bqml_model = core.BqmlModel(session, model) - return text_generator_model + model = cls(session=session, connection_name=model_connection) + model._bqml_model = core.BqmlModel(session, bq_model) + return model def predict( self, diff --git a/bigframes/ml/loader.py b/bigframes/ml/loader.py index c6e38e6534..916949077f 100644 --- a/bigframes/ml/loader.py +++ b/bigframes/ml/loader.py @@ -121,7 +121,7 @@ def _transformer_from_bq(session: bigframes.Session, bq_model: bigquery.Model): def _model_from_bq(session: bigframes.Session, bq_model: bigquery.Model): if bq_model.model_type in _BQML_MODEL_TYPE_MAPPING: return _BQML_MODEL_TYPE_MAPPING[bq_model.model_type]._from_bq( # type: ignore - session=session, model=bq_model + session=session, bq_model=bq_model ) if _is_bq_model_remote(bq_model): # Parse the remote model endpoint @@ -130,7 +130,7 @@ def _model_from_bq(session: bigframes.Session, bq_model: bigquery.Model): model_name, _ = utils.parse_model_endpoint(model_endpoint) return _BQML_ENDPOINT_TYPE_MAPPING[model_name]._from_bq( # type: ignore - session=session, model=bq_model + session=session, bq_model=bq_model ) raise NotImplementedError( diff --git a/bigframes/ml/utils.py b/bigframes/ml/utils.py index 364fb5e88d..75dfb916f6 100644 --- a/bigframes/ml/utils.py +++ b/bigframes/ml/utils.py @@ -13,7 +13,9 @@ # limitations under the License. import typing -from typing import Iterable, Optional, Union +from typing import Any, Iterable, Literal, Mapping, Optional, Union + +from google.cloud import bigquery import bigframes.constants as constants from bigframes.core import blocks @@ -69,3 +71,38 @@ def parse_model_endpoint(model_endpoint: str) -> tuple[str, Optional[str]]: model_name = model_endpoint[:at_idx] return model_name, version + + +def _resolve_param_type(t: type) -> type: + def is_optional(t): + return typing.get_origin(t) is Union and type(None) in typing.get_args(t) + + # Optional[type] to type + if is_optional(t): + union_set = set(typing.get_args(t)) + union_set.remove(type(None)) + t = Union[tuple(union_set)] # type: ignore + + # Literal[value0, value1...] 
to type(value0) + if typing.get_origin(t) is Literal: + return type(typing.get_args(t)[0]) + + return t + + +def retrieve_params_from_bq_model( + cls, bq_model: bigquery.Model, params_mapping: Mapping[str, str] +) -> dict[str, Any]: + """Retrieve parameters of class constructor from BQ model. params_mapping specifies the names mapping param_name -> bqml_name. Params couldn't be found will be ignored.""" + kwargs = {} + + # See https://cloud.google.com/bigquery/docs/reference/rest/v2/models#trainingrun + last_fitting = bq_model.training_runs[-1]["trainingOptions"] + + for bf_param, bf_param_type in typing.get_type_hints(cls.__init__).items(): + bqml_param = params_mapping.get(bf_param) + if bqml_param in last_fitting: + bf_param_type = _resolve_param_type(bf_param_type) + kwargs[bf_param] = bf_param_type(last_fitting[bqml_param]) + + return kwargs diff --git a/tests/system/large/ml/test_ensemble.py b/tests/system/large/ml/test_ensemble.py index 2260e7bbce..3d1fcaf41c 100644 --- a/tests/system/large/ml/test_ensemble.py +++ b/tests/system/large/ml/test_ensemble.py @@ -123,7 +123,7 @@ def test_xgbregressor_dart_booster_multiple_params( in reloaded_model._bqml_model.model_name ) assert reloaded_model.booster == "DART" - assert reloaded_model.dart_normalized_type == "tree" + assert reloaded_model.dart_normalized_type == "TREE" assert reloaded_model.tree_method == "AUTO" assert reloaded_model.colsample_bytree == 0.95 assert reloaded_model.colsample_bylevel == 0.95 @@ -236,7 +236,7 @@ def test_xgbclassifier_dart_booster_multiple_params( in reloaded_model._bqml_model.model_name ) assert reloaded_model.booster == "DART" - assert reloaded_model.dart_normalized_type == "tree" + assert reloaded_model.dart_normalized_type == "TREE" assert reloaded_model.tree_method == "AUTO" assert reloaded_model.colsample_bytree == 0.95 assert reloaded_model.colsample_bylevel == 0.95 From c6c487fb3e39a980a05ff2dab5fb2b528d44016a Mon Sep 17 00:00:00 2001 From: Ashley Xu <139821907+ashleyxuu@users.noreply.github.com> Date: Tue, 7 May 2024 13:24:17 -0700 Subject: [PATCH 02/17] feat: add `strategy="quantile"` in KBinsDiscretizer (#654) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! 
That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes internal #310685445 🦕 --- bigframes/ml/compose.py | 1 + bigframes/ml/preprocessing.py | 51 +++++++++++----- bigframes/ml/sql.py | 11 +++- tests/system/small/ml/test_preprocessing.py | 58 +++++++++++++++++++ tests/unit/ml/test_sql.py | 7 +++ .../sklearn/preprocessing/_discretization.py | 2 +- 6 files changed, 112 insertions(+), 18 deletions(-) diff --git a/bigframes/ml/compose.py b/bigframes/ml/compose.py index 89969f23e7..77bfd76bde 100644 --- a/bigframes/ml/compose.py +++ b/bigframes/ml/compose.py @@ -38,6 +38,7 @@ "ML.MAX_ABS_SCALER": preprocessing.MaxAbsScaler, "ML.MIN_MAX_SCALER": preprocessing.MinMaxScaler, "ML.BUCKETIZE": preprocessing.KBinsDiscretizer, + "ML.QUANTILE_BUCKETIZE": preprocessing.KBinsDiscretizer, "ML.LABEL_ENCODER": preprocessing.LabelEncoder, } ) diff --git a/bigframes/ml/preprocessing.py b/bigframes/ml/preprocessing.py index 673ee27db0..954d5adff0 100644 --- a/bigframes/ml/preprocessing.py +++ b/bigframes/ml/preprocessing.py @@ -290,10 +290,6 @@ def __init__( n_bins: int = 5, strategy: Literal["uniform", "quantile"] = "quantile", ): - if strategy != "uniform": - raise NotImplementedError( - f"Only strategy = 'uniform' is supported now, input is {strategy}." - ) if n_bins < 2: raise ValueError( f"n_bins has to be larger than or equal to 2, input is {n_bins}." @@ -337,30 +333,53 @@ def _compile_to_sql( min_value + i * bin_size for i in range(self.n_bins - 1) ] - return [ - ( - self._base_sql_generator.ml_bucketize( - column, array_split_points[column], f"kbinsdiscretizer_{column}" - ), - f"kbinsdiscretizer_{column}", + return [ + ( + self._base_sql_generator.ml_bucketize( + column, array_split_points[column], f"kbinsdiscretizer_{column}" + ), + f"kbinsdiscretizer_{column}", + ) + for column in columns + ] + + elif self.strategy == "quantile": + + return [ + ( + self._base_sql_generator.ml_quantile_bucketize( + column, self.n_bins, f"kbinsdiscretizer_{column}" + ), + f"kbinsdiscretizer_{column}", + ) + for column in columns + ] + + else: + raise ValueError( + f"strategy should be set 'quantile' or 'uniform', but your input is {self.strategy}." ) - for column in columns - ] @classmethod def _parse_from_sql(cls, sql: str) -> tuple[KBinsDiscretizer, str]: """Parse SQL to tuple(KBinsDiscretizer, column_label). 
Args: - sql: SQL string of format "ML.BUCKETIZE({col_label}, array_split_points, FALSE) OVER()" + sql: SQL string of format "ML.BUCKETIZE({col_label}, array_split_points, FALSE)" + or ML.QUANTILE_BUCKETIZE({col_label}, num_bucket) OVER()" Returns: tuple(KBinsDiscretizer, column_label)""" s = sql[sql.find("(") + 1 : sql.find(")")] - array_split_points = s[s.find("[") + 1 : s.find("]")] col_label = s[: s.find(",")] - n_bins = array_split_points.count(",") + 2 - return cls(n_bins, "uniform"), col_label + + if sql.startswith("ML.QUANTILE_BUCKETIZE"): + num_bins = s.split(",")[1] + return cls(int(num_bins), "quantile"), col_label + else: + array_split_points = s[s.find("[") + 1 : s.find("]")] + n_bins = array_split_points.count(",") + 2 + return cls(n_bins, "uniform"), col_label def fit( self, diff --git a/bigframes/ml/sql.py b/bigframes/ml/sql.py index ea693e3437..b701ab301c 100644 --- a/bigframes/ml/sql.py +++ b/bigframes/ml/sql.py @@ -109,9 +109,18 @@ def ml_bucketize( array_split_points: Iterable[Union[int, float]], name: str, ) -> str: - """Encode ML.MIN_MAX_SCALER for BQML""" + """Encode ML.BUCKETIZE for BQML""" return f"""ML.BUCKETIZE({numeric_expr_sql}, {array_split_points}, FALSE) AS {name}""" + def ml_quantile_bucketize( + self, + numeric_expr_sql: str, + num_bucket: int, + name: str, + ) -> str: + """Encode ML.QUANTILE_BUCKETIZE for BQML""" + return f"""ML.QUANTILE_BUCKETIZE({numeric_expr_sql}, {num_bucket}) OVER() AS {name}""" + def ml_one_hot_encoder( self, numeric_expr_sql: str, diff --git a/tests/system/small/ml/test_preprocessing.py b/tests/system/small/ml/test_preprocessing.py index faa0cd7bbd..5b457cc9c0 100644 --- a/tests/system/small/ml/test_preprocessing.py +++ b/tests/system/small/ml/test_preprocessing.py @@ -373,6 +373,27 @@ def test_k_bins_discretizer_normalized_fit_transform_default_params(new_penguins pd.testing.assert_frame_equal(result, expected, rtol=0.1) +def test_k_bins_discretizer_normalized_fit_transform_default_params_quantile( + new_penguins_df, +): + discretizer = preprocessing.KBinsDiscretizer(strategy="quantile") + result = discretizer.fit_transform( + new_penguins_df[["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"]] + ).to_pandas() + + expected = pd.DataFrame( + { + "kbinsdiscretizer_culmen_length_mm": ["bin_2", "bin_2", "bin_1"], + "kbinsdiscretizer_culmen_depth_mm": ["bin_2", "bin_1", "bin_2"], + "kbinsdiscretizer_flipper_length_mm": ["bin_2", "bin_1", "bin_2"], + }, + dtype="string[pyarrow]", + index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), + ) + + pd.testing.assert_frame_equal(result, expected, rtol=0.1) + + def test_k_bins_discretizer_series_normalizes( penguins_df_default_index, new_penguins_df ): @@ -395,6 +416,28 @@ def test_k_bins_discretizer_series_normalizes( pd.testing.assert_frame_equal(result, expected, rtol=0.1) +def test_k_bins_discretizer_series_normalizes_quantile( + penguins_df_default_index, new_penguins_df +): + discretizer = preprocessing.KBinsDiscretizer(strategy="quantile") + discretizer.fit(penguins_df_default_index["culmen_length_mm"]) + + result = discretizer.transform( + penguins_df_default_index["culmen_length_mm"] + ).to_pandas() + result = discretizer.transform(new_penguins_df).to_pandas() + + expected = pd.DataFrame( + { + "kbinsdiscretizer_culmen_length_mm": ["bin_2", "bin_2", "bin_1"], + }, + dtype="string[pyarrow]", + index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), + ) + + pd.testing.assert_frame_equal(result, expected, rtol=0.1) + + def 
test_k_bins_discretizer_normalizes(penguins_df_default_index, new_penguins_df): # TODO(http://b/292431644): add a second test that compares output to sklearn.preprocessing.KBinsDiscretizer, when BQML's change is in prod. discretizer = preprocessing.KBinsDiscretizer(strategy="uniform") @@ -488,6 +531,21 @@ def test_k_bins_discretizer_save_load(new_penguins_df, dataset_id): pd.testing.assert_frame_equal(result, expected, rtol=0.1) +def test_k_bins_discretizer_save_load_quantile(new_penguins_df, dataset_id): + transformer = preprocessing.KBinsDiscretizer(n_bins=6, strategy="quantile") + transformer.fit( + new_penguins_df[["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"]] + ) + + reloaded_transformer = transformer.to_gbq( + f"{dataset_id}.temp_configured_model", replace=True + ) + assert isinstance(reloaded_transformer, preprocessing.KBinsDiscretizer) + assert reloaded_transformer.n_bins == transformer.n_bins + assert reloaded_transformer.strategy == transformer.strategy + assert reloaded_transformer._bqml_model is not None + + def test_one_hot_encoder_default_params(new_penguins_df): encoder = preprocessing.OneHotEncoder() encoder.fit(new_penguins_df[["species", "sex"]]) diff --git a/tests/unit/ml/test_sql.py b/tests/unit/ml/test_sql.py index 4dd90b2c4a..07b247fb41 100644 --- a/tests/unit/ml/test_sql.py +++ b/tests/unit/ml/test_sql.py @@ -113,6 +113,13 @@ def test_k_bins_discretizer_correct( assert sql == "ML.BUCKETIZE(col_a, [1, 2, 3, 4], FALSE) AS scaled_col_a" +def test_k_bins_discretizer_quantile_correct( + base_sql_generator: ml_sql.BaseSqlGenerator, +): + sql = base_sql_generator.ml_quantile_bucketize("col_a", 5, "scaled_col_a") + assert sql == "ML.QUANTILE_BUCKETIZE(col_a, 5) OVER() AS scaled_col_a" + + def test_one_hot_encoder_correct( base_sql_generator: ml_sql.BaseSqlGenerator, ): diff --git a/third_party/bigframes_vendored/sklearn/preprocessing/_discretization.py b/third_party/bigframes_vendored/sklearn/preprocessing/_discretization.py index 98b9d0371f..54c81af71d 100644 --- a/third_party/bigframes_vendored/sklearn/preprocessing/_discretization.py +++ b/third_party/bigframes_vendored/sklearn/preprocessing/_discretization.py @@ -18,7 +18,7 @@ class KBinsDiscretizer(TransformerMixin, BaseEstimator): strategy ({'uniform', 'quantile'}, default='quantile'): Strategy used to define the widths of the bins. 'uniform': All bins in each feature have identical widths. 'quantile': All bins in each - feature have the same number of points. Only `uniform` is supported. + feature have the same number of points. 
""" def fit(self, X, y=None): From 4fc89644e47a6da9367b54826b25c6abbe97327b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Wed, 8 May 2024 12:03:54 -0500 Subject: [PATCH 03/17] docs: address lint errors in code samples (#665) Towards internal issue 332735129 test: move samples tests to their own kokoro jobs --- .kokoro/continuous/e2e.cfg | 2 +- .kokoro/presubmit/e2e-gerrit.cfg | 2 +- .kokoro/presubmit/e2e.cfg | 2 +- noxfile.py | 20 -- owlbot.py | 6 +- samples/snippets/bqml_getting_started_test.py | 2 +- samples/snippets/clustering_model_test.py | 2 +- samples/snippets/conftest.py | 2 +- samples/snippets/create_kmeans_model_test.py | 8 +- ...e_multiple_timeseries_forecasting_model.py | 2 +- ...ingle_timeseries_forecasting_model_test.py | 3 +- samples/snippets/explore_query_result_test.py | 2 +- samples/snippets/gemini_model_test.py | 2 +- samples/snippets/gen_ai_model_test.py | 2 +- .../snippets/load_data_from_bigquery_test.py | 2 +- .../load_data_from_biquery_job_test.py | 2 +- samples/snippets/load_data_from_csv_test.py | 2 +- samples/snippets/noxfile.py | 292 ++++++++++++++++++ samples/snippets/noxfile_config.py | 42 +++ samples/snippets/pandas_methods_test.py | 2 +- samples/snippets/quickstart.py | 2 +- samples/snippets/regression_model_test.py | 2 +- samples/snippets/remote_function.py | 6 +- samples/snippets/remote_function_test.py | 31 ++ samples/snippets/requirements-test.txt | 3 + samples/snippets/requirements.txt | 2 + samples/snippets/set_options_test.py | 2 +- 27 files changed, 401 insertions(+), 46 deletions(-) create mode 100644 samples/snippets/noxfile.py create mode 100644 samples/snippets/noxfile_config.py create mode 100644 samples/snippets/requirements-test.txt create mode 100644 samples/snippets/requirements.txt diff --git a/.kokoro/continuous/e2e.cfg b/.kokoro/continuous/e2e.cfg index 7479346590..774b63313e 100644 --- a/.kokoro/continuous/e2e.cfg +++ b/.kokoro/continuous/e2e.cfg @@ -3,7 +3,7 @@ # Only run this nox session. env_vars: { key: "NOX_SESSION" - value: "unit_prerelease system_prerelease system_noextras e2e notebook samples" + value: "unit_prerelease system_prerelease system_noextras e2e notebook" } env_vars: { diff --git a/.kokoro/presubmit/e2e-gerrit.cfg b/.kokoro/presubmit/e2e-gerrit.cfg index d875f36060..19913344b6 100644 --- a/.kokoro/presubmit/e2e-gerrit.cfg +++ b/.kokoro/presubmit/e2e-gerrit.cfg @@ -3,5 +3,5 @@ # Only run this nox session. env_vars: { key: "NOX_SESSION" - value: "system_noextras e2e notebook samples" + value: "system_noextras e2e notebook" } diff --git a/.kokoro/presubmit/e2e.cfg b/.kokoro/presubmit/e2e.cfg index 7479346590..774b63313e 100644 --- a/.kokoro/presubmit/e2e.cfg +++ b/.kokoro/presubmit/e2e.cfg @@ -3,7 +3,7 @@ # Only run this nox session. env_vars: { key: "NOX_SESSION" - value: "unit_prerelease system_prerelease system_noextras e2e notebook samples" + value: "unit_prerelease system_prerelease system_noextras e2e notebook" } env_vars: { diff --git a/noxfile.py b/noxfile.py index 91ad6bc0e6..af73495a7f 100644 --- a/noxfile.py +++ b/noxfile.py @@ -402,26 +402,6 @@ def load(session: nox.sessions.Session): ) -@nox.session(python=SYSTEM_TEST_PYTHON_VERSIONS) -def samples(session): - """Run the samples test suite.""" - - constraints_path = str( - CURRENT_DIRECTORY / "testing" / f"constraints-{session.python}.txt" - ) - - # TODO(b/332735129): Remove this session and use python_samples templates - # where each samples directory has its own noxfile.py file, instead. 
- install_test_extra = True - install_systemtest_dependencies(session, install_test_extra, "-c", constraints_path) - - session.run( - "py.test", - "samples", - *session.posargs, - ) - - @nox.session(python=DEFAULT_PYTHON_VERSION) def cover(session): """Run the final coverage report. diff --git a/owlbot.py b/owlbot.py index f804859689..ddc578c3a2 100644 --- a/owlbot.py +++ b/owlbot.py @@ -74,7 +74,7 @@ import pandas import pyarrow import sqlglot - + print(f"Python: {sys.version}") print(f"bigframes=={bigframes.__version__}") print(f"google-cloud-bigquery=={google.cloud.bigquery.__version__}") @@ -83,7 +83,7 @@ print(f"pyarrow=={pyarrow.__version__}") print(f"sqlglot=={sqlglot.__version__}") ``` - + #### Steps to reproduce """, ), @@ -148,3 +148,5 @@ # ---------------------------------------------------------------------------- s.shell.run(["nox", "-s", "format"], hide_output=False) +for noxfile in REPO_ROOT.glob("samples/**/noxfile.py"): + s.shell.run(["nox", "-s", "format"], cwd=noxfile.parent, hide_output=False) diff --git a/samples/snippets/bqml_getting_started_test.py b/samples/snippets/bqml_getting_started_test.py index d9f9135faa..d249ca4ff3 100644 --- a/samples/snippets/bqml_getting_started_test.py +++ b/samples/snippets/bqml_getting_started_test.py @@ -13,7 +13,7 @@ # limitations under the License. -def test_bqml_getting_started(random_model_id): +def test_bqml_getting_started(random_model_id: str) -> None: your_model_id = random_model_id # for example: bqml_tutorial.sample_model # [START bigquery_dataframes_bqml_getting_started_tutorial] diff --git a/samples/snippets/clustering_model_test.py b/samples/snippets/clustering_model_test.py index a407fc7805..fec4bbcefd 100644 --- a/samples/snippets/clustering_model_test.py +++ b/samples/snippets/clustering_model_test.py @@ -13,7 +13,7 @@ # limitations under the License. -def test_clustering_model(): +def test_clustering_model() -> None: # [START bigquery_dataframes_clustering_model] from bigframes.ml.cluster import KMeans import bigframes.pandas as bpd diff --git a/samples/snippets/conftest.py b/samples/snippets/conftest.py index d34837b3e2..9171ac78a4 100644 --- a/samples/snippets/conftest.py +++ b/samples/snippets/conftest.py @@ -46,7 +46,7 @@ def project_id(bigquery_client: bigquery.Client) -> str: @pytest.fixture(autouse=True) -def reset_session(): +def reset_session() -> None: """An autouse fixture ensuring each sample runs in a fresh session. This allows us to have samples that query data in different locations. diff --git a/samples/snippets/create_kmeans_model_test.py b/samples/snippets/create_kmeans_model_test.py index 2429060d09..32ebc60a69 100644 --- a/samples/snippets/create_kmeans_model_test.py +++ b/samples/snippets/create_kmeans_model_test.py @@ -13,12 +13,14 @@ # limitations under the License. 
-def test_kmeans_sample(project_id: str, random_model_id_eu: str): +def test_kmeans_sample(project_id: str, random_model_id_eu: str) -> None: your_gcp_project_id = project_id your_model_id = random_model_id_eu # [START bigquery_dataframes_bqml_kmeans] import datetime + import pandas as pd + import bigframes import bigframes.pandas as bpd @@ -92,7 +94,9 @@ def test_kmeans_sample(project_id: str, random_model_id_eu: str): stationstats = merged_df.groupby(["station_name", "isweekday"]).agg( {"duration": ["mean", "count"], "distance_from_city_center": "max"} ) - stationstats.columns = ["duration", "num_trips", "distance_from_city_center"] + stationstats.columns = pd.Index( + ["duration", "num_trips", "distance_from_city_center"] + ) stationstats = stationstats.sort_values( by="distance_from_city_center", ascending=True ).reset_index() diff --git a/samples/snippets/create_multiple_timeseries_forecasting_model.py b/samples/snippets/create_multiple_timeseries_forecasting_model.py index 26fc15595f..b749c37d50 100644 --- a/samples/snippets/create_multiple_timeseries_forecasting_model.py +++ b/samples/snippets/create_multiple_timeseries_forecasting_model.py @@ -13,7 +13,7 @@ # limitations under the License. -def test_multiple_timeseries_forecasting_model(random_model_id): +def test_multiple_timeseries_forecasting_model(random_model_id: str) -> None: your_model_id = random_model_id # [START bigquery_dataframes_bqml_arima_multiple_step_2_visualize] diff --git a/samples/snippets/create_single_timeseries_forecasting_model_test.py b/samples/snippets/create_single_timeseries_forecasting_model_test.py index 5750933713..0c694de2dc 100644 --- a/samples/snippets/create_single_timeseries_forecasting_model_test.py +++ b/samples/snippets/create_single_timeseries_forecasting_model_test.py @@ -13,8 +13,7 @@ # limitations under the License. -def test_create_single_timeseries(): - +def test_create_single_timeseries() -> None: # [START bigquery_dataframes_single_timeseries_forecasting_model_tutorial] import bigframes.pandas as bpd diff --git a/samples/snippets/explore_query_result_test.py b/samples/snippets/explore_query_result_test.py index 5f0ec7d9b6..42f48fd94e 100644 --- a/samples/snippets/explore_query_result_test.py +++ b/samples/snippets/explore_query_result_test.py @@ -13,7 +13,7 @@ # limitations under the License. -def test_bigquery_dataframes_explore_query_result(): +def test_bigquery_dataframes_explore_query_result() -> None: import bigframes.pandas as bpd # [START bigquery_dataframes_explore_query_result] diff --git a/samples/snippets/gemini_model_test.py b/samples/snippets/gemini_model_test.py index 89212875ae..24b4e7d26d 100644 --- a/samples/snippets/gemini_model_test.py +++ b/samples/snippets/gemini_model_test.py @@ -13,7 +13,7 @@ # limitations under the License. -def test_gemini_text_generator_model(): +def test_gemini_text_generator_model() -> None: # Determine project id, in this case prefer the one set in the environment # variable GOOGLE_CLOUD_PROJECT (if any) import os diff --git a/samples/snippets/gen_ai_model_test.py b/samples/snippets/gen_ai_model_test.py index e4bead0e46..5cdcd6d3a7 100644 --- a/samples/snippets/gen_ai_model_test.py +++ b/samples/snippets/gen_ai_model_test.py @@ -13,7 +13,7 @@ # limitations under the License. 
-def test_llm_model(): +def test_llm_model() -> None: # Determine project id, in this case prefer the one set in the environment # variable GOOGLE_CLOUD_PROJECT (if any) import os diff --git a/samples/snippets/load_data_from_bigquery_test.py b/samples/snippets/load_data_from_bigquery_test.py index e4c65688bd..4523eece97 100644 --- a/samples/snippets/load_data_from_bigquery_test.py +++ b/samples/snippets/load_data_from_bigquery_test.py @@ -13,7 +13,7 @@ # limitations under the License. -def test_bigquery_dataframes_load_data_from_bigquery(): +def test_bigquery_dataframes_load_data_from_bigquery() -> None: # [START bigquery_dataframes_load_data_from_bigquery] # Create a DataFrame from a BigQuery table: import bigframes.pandas as bpd diff --git a/samples/snippets/load_data_from_biquery_job_test.py b/samples/snippets/load_data_from_biquery_job_test.py index 9a7793a7e5..4f1ddc062f 100644 --- a/samples/snippets/load_data_from_biquery_job_test.py +++ b/samples/snippets/load_data_from_biquery_job_test.py @@ -13,7 +13,7 @@ # limitations under the License. -def test_bigquery_dataframes_load_data_from_bigquery_job(): +def test_bigquery_dataframes_load_data_from_bigquery_job() -> None: # Determine project id, in this case prefer the one set in the environment # variable GOOGLE_CLOUD_PROJECT (if any) import os diff --git a/samples/snippets/load_data_from_csv_test.py b/samples/snippets/load_data_from_csv_test.py index 31ab9255bf..cc96b92fb8 100644 --- a/samples/snippets/load_data_from_csv_test.py +++ b/samples/snippets/load_data_from_csv_test.py @@ -13,7 +13,7 @@ # limitations under the License. -def test_bigquery_dataframes_load_data_from_csv(): +def test_bigquery_dataframes_load_data_from_csv() -> None: # [START bigquery_dataframes_load_data_from_csv] import bigframes.pandas as bpd diff --git a/samples/snippets/noxfile.py b/samples/snippets/noxfile.py new file mode 100644 index 0000000000..c36d5f2d81 --- /dev/null +++ b/samples/snippets/noxfile.py @@ -0,0 +1,292 @@ +# Copyright 2019 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import glob +import os +from pathlib import Path +import sys +from typing import Callable, Dict, Optional + +import nox + +# WARNING - WARNING - WARNING - WARNING - WARNING +# WARNING - WARNING - WARNING - WARNING - WARNING +# DO NOT EDIT THIS FILE EVER! +# WARNING - WARNING - WARNING - WARNING - WARNING +# WARNING - WARNING - WARNING - WARNING - WARNING + +BLACK_VERSION = "black==22.3.0" +ISORT_VERSION = "isort==5.10.1" + +# Copy `noxfile_config.py` to your directory and modify it instead. + +# `TEST_CONFIG` dict is a configuration hook that allows users to +# modify the test configurations. The values here should be in sync +# with `noxfile_config.py`. Users will copy `noxfile_config.py` into +# their directory and modify it. + +TEST_CONFIG = { + # You can opt out from the test for specific Python versions. 
+ "ignored_versions": [], + # Old samples are opted out of enforcing Python type hints + # All new samples should feature them + "enforce_type_hints": False, + # An envvar key for determining the project id to use. Change it + # to 'BUILD_SPECIFIC_GCLOUD_PROJECT' if you want to opt in using a + # build specific Cloud project. You can also use your own string + # to use your own Cloud project. + "gcloud_project_env": "GOOGLE_CLOUD_PROJECT", + # 'gcloud_project_env': 'BUILD_SPECIFIC_GCLOUD_PROJECT', + # If you need to use a specific version of pip, + # change pip_version_override to the string representation + # of the version number, for example, "20.2.4" + "pip_version_override": None, + # A dictionary you want to inject into your test. Don't put any + # secrets here. These values will override predefined values. + "envs": {}, +} + + +try: + # Ensure we can import noxfile_config in the project's directory. + sys.path.append(".") + from noxfile_config import TEST_CONFIG_OVERRIDE +except ImportError as e: + print("No user noxfile_config found: detail: {}".format(e)) + TEST_CONFIG_OVERRIDE = {} + +# Update the TEST_CONFIG with the user supplied values. +TEST_CONFIG.update(TEST_CONFIG_OVERRIDE) + + +def get_pytest_env_vars() -> Dict[str, str]: + """Returns a dict for pytest invocation.""" + ret = {} + + # Override the GCLOUD_PROJECT and the alias. + env_key = TEST_CONFIG["gcloud_project_env"] + # This should error out if not set. + ret["GOOGLE_CLOUD_PROJECT"] = os.environ[env_key] + + # Apply user supplied envs. + ret.update(TEST_CONFIG["envs"]) + return ret + + +# DO NOT EDIT - automatically generated. +# All versions used to test samples. +ALL_VERSIONS = ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12"] + +# Any default versions that should be ignored. +IGNORED_VERSIONS = TEST_CONFIG["ignored_versions"] + +TESTED_VERSIONS = sorted([v for v in ALL_VERSIONS if v not in IGNORED_VERSIONS]) + +INSTALL_LIBRARY_FROM_SOURCE = os.environ.get("INSTALL_LIBRARY_FROM_SOURCE", False) in ( + "True", + "true", +) + +# Error if a python version is missing +nox.options.error_on_missing_interpreters = True + +# +# Style Checks +# + + +# Linting with flake8. +# +# We ignore the following rules: +# E203: whitespace before ‘:’ +# E266: too many leading ‘#’ for block comment +# E501: line too long +# I202: Additional newline in a section of imports +# +# We also need to specify the rules which are ignored by default: +# ['E226', 'W504', 'E126', 'E123', 'W503', 'E24', 'E704', 'E121'] +FLAKE8_COMMON_ARGS = [ + "--show-source", + "--builtin=gettext", + "--max-complexity=20", + "--exclude=.nox,.cache,env,lib,generated_pb2,*_pb2.py,*_pb2_grpc.py", + "--ignore=E121,E123,E126,E203,E226,E24,E266,E501,E704,W503,W504,I202", + "--max-line-length=88", +] + + +@nox.session +def lint(session: nox.sessions.Session) -> None: + if not TEST_CONFIG["enforce_type_hints"]: + session.install("flake8") + else: + session.install("flake8", "flake8-annotations") + + args = FLAKE8_COMMON_ARGS + [ + ".", + ] + session.run("flake8", *args) + + +# +# Black +# + + +@nox.session +def blacken(session: nox.sessions.Session) -> None: + """Run black. Format code to uniform standard.""" + session.install(BLACK_VERSION) + python_files = [path for path in os.listdir(".") if path.endswith(".py")] + + session.run("black", *python_files) + + +# +# format = isort + black +# + + +@nox.session +def format(session: nox.sessions.Session) -> None: + """ + Run isort to sort imports. Then run black + to format code to uniform standard. 
+ """ + session.install(BLACK_VERSION, ISORT_VERSION) + python_files = [path for path in os.listdir(".") if path.endswith(".py")] + + # Use the --fss option to sort imports using strict alphabetical order. + # See https://pycqa.github.io/isort/docs/configuration/options.html#force-sort-within-sections + session.run("isort", "--fss", *python_files) + session.run("black", *python_files) + + +# +# Sample Tests +# + + +PYTEST_COMMON_ARGS = ["--junitxml=sponge_log.xml"] + + +def _session_tests( + session: nox.sessions.Session, post_install: Callable = None +) -> None: + # check for presence of tests + test_list = glob.glob("**/*_test.py", recursive=True) + glob.glob( + "**/test_*.py", recursive=True + ) + test_list.extend(glob.glob("**/tests", recursive=True)) + + if len(test_list) == 0: + print("No tests found, skipping directory.") + return + + if TEST_CONFIG["pip_version_override"]: + pip_version = TEST_CONFIG["pip_version_override"] + session.install(f"pip=={pip_version}") + """Runs py.test for a particular project.""" + concurrent_args = [] + if os.path.exists("requirements.txt"): + if os.path.exists("constraints.txt"): + session.install("-r", "requirements.txt", "-c", "constraints.txt") + else: + session.install("-r", "requirements.txt") + with open("requirements.txt") as rfile: + packages = rfile.read() + + if os.path.exists("requirements-test.txt"): + if os.path.exists("constraints-test.txt"): + session.install("-r", "requirements-test.txt", "-c", "constraints-test.txt") + else: + session.install("-r", "requirements-test.txt") + with open("requirements-test.txt") as rtfile: + packages += rtfile.read() + + if INSTALL_LIBRARY_FROM_SOURCE: + session.install("-e", _get_repo_root()) + + if post_install: + post_install(session) + + if "pytest-parallel" in packages: + concurrent_args.extend(["--workers", "auto", "--tests-per-worker", "auto"]) + elif "pytest-xdist" in packages: + concurrent_args.extend(["-n", "auto"]) + + session.run( + "pytest", + *(PYTEST_COMMON_ARGS + session.posargs + concurrent_args), + # Pytest will return 5 when no tests are collected. This can happen + # on travis where slow and flaky tests are excluded. + # See http://doc.pytest.org/en/latest/_modules/_pytest/main.html + success_codes=[0, 5], + env=get_pytest_env_vars(), + ) + + +@nox.session(python=ALL_VERSIONS) +def py(session: nox.sessions.Session) -> None: + """Runs py.test for a sample using the specified version of Python.""" + if session.python in TESTED_VERSIONS: + _session_tests(session) + else: + session.skip( + "SKIPPED: {} tests are disabled for this sample.".format(session.python) + ) + + +# +# Readmegen +# + + +def _get_repo_root() -> Optional[str]: + """Returns the root folder of the project.""" + # Get root of this repository. Assume we don't have directories nested deeper than 10 items. 
+ p = Path(os.getcwd()) + for i in range(10): + if p is None: + break + if Path(p / ".git").exists(): + return str(p) + # .git is not available in repos cloned via Cloud Build + # setup.py is always in the library's root, so use that instead + # https://github.com/googleapis/synthtool/issues/792 + if Path(p / "setup.py").exists(): + return str(p) + p = p.parent + raise Exception("Unable to detect repository root.") + + +GENERATED_READMES = sorted([x for x in Path(".").rglob("*.rst.in")]) + + +@nox.session +@nox.parametrize("path", GENERATED_READMES) +def readmegen(session: nox.sessions.Session, path: str) -> None: + """(Re-)generates the readme for a sample.""" + session.install("jinja2", "pyyaml") + dir_ = os.path.dirname(path) + + if os.path.exists(os.path.join(dir_, "requirements.txt")): + session.install("-r", os.path.join(dir_, "requirements.txt")) + + in_file = os.path.join(dir_, "README.rst.in") + session.run( + "python", _get_repo_root() + "/scripts/readme-gen/readme_gen.py", in_file + ) diff --git a/samples/snippets/noxfile_config.py b/samples/snippets/noxfile_config.py new file mode 100644 index 0000000000..211d6974b9 --- /dev/null +++ b/samples/snippets/noxfile_config.py @@ -0,0 +1,42 @@ +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Default TEST_CONFIG_OVERRIDE for python repos. + +# You can copy this file into your directory, then it will be inported from +# the noxfile.py. + +# The source of truth: +# https://github.com/GoogleCloudPlatform/python-docs-samples/blob/master/noxfile_config.py + +TEST_CONFIG_OVERRIDE = { + # You can opt out from the test for specific Python versions. + "ignored_versions": ["2.7", "3.7", "3.8"], + # Old samples are opted out of enforcing Python type hints + # All new samples should feature them + "enforce_type_hints": True, + # An envvar key for determining the project id to use. Change it + # to 'BUILD_SPECIFIC_GCLOUD_PROJECT' if you want to opt in using a + # build specific Cloud project. You can also use your own string + # to use your own Cloud project. + "gcloud_project_env": "GOOGLE_CLOUD_PROJECT", + # "gcloud_project_env": "BUILD_SPECIFIC_GCLOUD_PROJECT", + # If you need to use a specific version of pip, + # change pip_version_override to the string representation + # of the version number, for example, "20.2.4" + "pip_version_override": None, + # A dictionary you want to inject into your test. Don't put any + # secrets here. These values will override predefined values. + "envs": {}, +} diff --git a/samples/snippets/pandas_methods_test.py b/samples/snippets/pandas_methods_test.py index bd8e29c003..0f128f9e6a 100644 --- a/samples/snippets/pandas_methods_test.py +++ b/samples/snippets/pandas_methods_test.py @@ -13,7 +13,7 @@ # limitations under the License. 
-def test_bigquery_dataframes_pandas_methods(): +def test_bigquery_dataframes_pandas_methods() -> None: # [START bigquery_dataframes_pandas_methods] import bigframes.pandas as bpd diff --git a/samples/snippets/quickstart.py b/samples/snippets/quickstart.py index ae3a934004..c26c6f4442 100644 --- a/samples/snippets/quickstart.py +++ b/samples/snippets/quickstart.py @@ -13,7 +13,7 @@ # limitations under the License. -def run_quickstart(project_id: str): +def run_quickstart(project_id: str) -> None: import bigframes session_options = bigframes.BigQueryOptions() diff --git a/samples/snippets/regression_model_test.py b/samples/snippets/regression_model_test.py index 7d1bde689c..43cdabacb4 100644 --- a/samples/snippets/regression_model_test.py +++ b/samples/snippets/regression_model_test.py @@ -13,7 +13,7 @@ # limitations under the License. -def test_regression_model(): +def test_regression_model() -> None: # [START bigquery_dataframes_regression_model] from bigframes.ml.linear_model import LinearRegression import bigframes.pandas as bpd diff --git a/samples/snippets/remote_function.py b/samples/snippets/remote_function.py index 4db4e67619..c35daf35fc 100644 --- a/samples/snippets/remote_function.py +++ b/samples/snippets/remote_function.py @@ -13,7 +13,7 @@ # limitations under the License. -def run_remote_function_and_read_gbq_function(project_id: str): +def run_remote_function_and_read_gbq_function(project_id: str) -> None: your_gcp_project_id = project_id # [START bigquery_dataframes_remote_function] @@ -51,7 +51,7 @@ def run_remote_function_and_read_gbq_function(project_id: str): str, reuse=False, ) - def get_bucket(num): + def get_bucket(num: float) -> str: if not num: return "NA" boundary = 4000 @@ -96,7 +96,7 @@ def get_bucket(num): reuse=False, packages=["cryptography"], ) - def get_hash(input): + def get_hash(input: str) -> str: from cryptography.fernet import Fernet # handle missing value diff --git a/samples/snippets/remote_function_test.py b/samples/snippets/remote_function_test.py index 8f891274de..24bc7e854e 100644 --- a/samples/snippets/remote_function_test.py +++ b/samples/snippets/remote_function_test.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +import google.api_core.exceptions +import google.cloud.bigquery_connection_v1 import pytest import bigframes.pandas @@ -19,6 +21,35 @@ from . import remote_function +# TODO(tswast): Once the connections are cleaned up in the sample test project +# and https://github.com/GoogleCloudPlatform/python-docs-samples/issues/11720 +# is closed, we shouldn't need this because AFAIK we only use one BQ connection +# in this sample. +@pytest.fixture(autouse=True) +def cleanup_connections() -> None: + client = google.cloud.bigquery_connection_v1.ConnectionServiceClient() + + for conn in client.list_connections( + parent="projects/python-docs-samples-tests/locations/us" + ): + try: + int(conn.name.split("/")[-1].split("-")[0], base=16) + except ValueError: + print(f"Couldn't parse {conn.name}") + continue + + try: + print(f"removing {conn.name}") + client.delete_connection( + google.cloud.bigquery_connection_v1.DeleteConnectionRequest( + {"name": conn.name}, + ) + ) + except google.api_core.exceptions.GoogleAPIError: + # We did as much clean up as we can. 
+ break + + def test_remote_function_and_read_gbq_function( capsys: pytest.CaptureFixture[str], ) -> None: diff --git a/samples/snippets/requirements-test.txt b/samples/snippets/requirements-test.txt new file mode 100644 index 0000000000..62b0c02e79 --- /dev/null +++ b/samples/snippets/requirements-test.txt @@ -0,0 +1,3 @@ +# samples/snippets should be runnable with no "extras" +google-cloud-testutils==1.4.0 +pytest==8.1.1 diff --git a/samples/snippets/requirements.txt b/samples/snippets/requirements.txt new file mode 100644 index 0000000000..1da77c1715 --- /dev/null +++ b/samples/snippets/requirements.txt @@ -0,0 +1,2 @@ +# samples/snippets should be runnable with no "extras" +bigframes==1.4.0 diff --git a/samples/snippets/set_options_test.py b/samples/snippets/set_options_test.py index f981009e9a..3dea524a17 100644 --- a/samples/snippets/set_options_test.py +++ b/samples/snippets/set_options_test.py @@ -13,7 +13,7 @@ # limitations under the License. -def test_bigquery_dataframes_set_options(): +def test_bigquery_dataframes_set_options() -> None: # Close the session before resetting the options import bigframes.pandas as bpd From bcc054b90b7f84f79e127b27fd41ab5125f6c496 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Wed, 8 May 2024 15:29:01 -0500 Subject: [PATCH 04/17] chore: remove outdated `NoDefaultIndexError` info from CHANGELOG.md (#668) * chore: remove outdated `NoDefaultIndexError` info from CHANGELOG.md We don't actually raise an error, just a `DefaultIndexWarning ` warning. * remove redundant line --- CHANGELOG.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f73d4b5750..4457c2e443 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,7 +15,7 @@ * Add the `bigframes.bigquery` sub-package with a `bigframes.bigquery.array_length` function ([#630](https://github.com/googleapis/python-bigquery-dataframes/issues/630)) ([9963f85](https://github.com/googleapis/python-bigquery-dataframes/commit/9963f85b84c3b3c681447ab79e22ac93ac48349c)) * Always do a query dry run when `option.repr_mode == "deferred"` ([#652](https://github.com/googleapis/python-bigquery-dataframes/issues/652)) ([651fd7d](https://github.com/googleapis/python-bigquery-dataframes/commit/651fd7daf14273f172c6c55e5d6c374eb590a22d)) * Custom query labels for compute options ([#638](https://github.com/googleapis/python-bigquery-dataframes/issues/638)) ([f561799](https://github.com/googleapis/python-bigquery-dataframes/commit/f5617994bc136de5caa72719b8c3c297c512cb36)) -* Raise `NoDefaultIndexError` from `read_gbq` on clustered/partitioned tables with no `index_col` or `filters` set ([#631](https://github.com/googleapis/python-bigquery-dataframes/issues/631)) ([73064dd](https://github.com/googleapis/python-bigquery-dataframes/commit/73064dd2aa1ece5de8f5849a0fd337d0ba677404)) +* Warn with `DefaultIndexWarning` from `read_gbq` on clustered/partitioned tables with no `index_col` or `filters` set ([#631](https://github.com/googleapis/python-bigquery-dataframes/issues/631), [#658](https://github.com/googleapis/python-bigquery-dataframes/issues/658)) 
([2715d2b](https://github.com/googleapis/python-bigquery-dataframes/commit/2715d2b4a353710175a66a4f6149356f583f2c45), [73064dd](https://github.com/googleapis/python-bigquery-dataframes/commit/73064dd2aa1ece5de8f5849a0fd337d0ba677404)) * Support `index_col=False` in `read_csv` and `engine="bigquery"` ([73064dd](https://github.com/googleapis/python-bigquery-dataframes/commit/73064dd2aa1ece5de8f5849a0fd337d0ba677404)) * Support gcf max instance count in `remote_function` ([#657](https://github.com/googleapis/python-bigquery-dataframes/issues/657)) ([36578ab](https://github.com/googleapis/python-bigquery-dataframes/commit/36578ab431119f71dda746de415d0c6417bb4de2)) @@ -23,7 +23,6 @@ ### Bug Fixes * Don't raise UnknownLocationWarning for US or EU multi-regions ([#653](https://github.com/googleapis/python-bigquery-dataframes/issues/653)) ([8e4616b](https://github.com/googleapis/python-bigquery-dataframes/commit/8e4616b896f4e0d13d8bb0424c89335d3a1fe697)) -* Downgrade NoDefaultIndexError to DefaultIndexWarning ([#658](https://github.com/googleapis/python-bigquery-dataframes/issues/658)) ([2715d2b](https://github.com/googleapis/python-bigquery-dataframes/commit/2715d2b4a353710175a66a4f6149356f583f2c45)) * Fix bug with na in the column labels in stack ([#659](https://github.com/googleapis/python-bigquery-dataframes/issues/659)) ([4a34293](https://github.com/googleapis/python-bigquery-dataframes/commit/4a342933559fba417fe42e2bd386838defdb2778)) * Use explicit session in `PaLM2TextGenerator` ([#651](https://github.com/googleapis/python-bigquery-dataframes/issues/651)) ([e4f13c3](https://github.com/googleapis/python-bigquery-dataframes/commit/e4f13c3633b90e32d3171976d8b27ed10049882f)) From e084e54557addff78522bbd710637ecb4b46d23e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Wed, 8 May 2024 17:57:41 -0500 Subject: [PATCH 05/17] fix: include `index_col` when selecting `columns` and `filters` in `read_gbq_table` (#648) * fix: include `index_col` when selecting `columns` and `filters` in `read_gbq_table` Fixes internal issue 339430305 * feat: warn with a more specific `DefaultLocationWarning` category when no location can be detected (#648) test: refactor `read_gbq` / `read_gbq_table` tests to test with all parameters combined (#648) refactor: move query generation code to BigQuery I/O module (#648) --- bigframes/exceptions.py | 6 + bigframes/pandas/__init__.py | 3 +- bigframes/session/__init__.py | 158 +++++++-------------- bigframes/session/_io/bigquery/__init__.py | 97 ++++++++++++- tests/system/small/test_pandas_options.py | 26 ++-- tests/system/small/test_session.py | 94 +++++++++--- tests/unit/session/test_io_bigquery.py | 106 ++++++++++++++ tests/unit/session/test_session.py | 82 ----------- 8 files changed, 349 insertions(+), 223 deletions(-) diff --git a/bigframes/exceptions.py b/bigframes/exceptions.py index 5caf2aa1df..3ca6d8e1af 100644 --- a/bigframes/exceptions.py +++ b/bigframes/exceptions.py @@ -17,6 +17,12 @@ # NOTE: This module should not depend on any others in the package. +# Uses UserWarning for backwards compatibility with warning without a category +# set. 
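+# Callers can opt in to stricter handling of this category, e.g. (an
+# illustrative sketch mirroring the system tests touched in this commit;
+# the DataFrame name is hypothetical):
+#
+#   warnings.simplefilter("error", bigframes.exceptions.DefaultLocationWarning)
+#   with pytest.warns(bigframes.exceptions.DefaultLocationWarning):
+#       bpd.read_pandas(pandas_df)  # falls back to the default location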
+class DefaultLocationWarning(UserWarning): + """No location was specified, so using a default one.""" + + class UnknownLocationWarning(Warning): """The location is set to an unknown value.""" diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py index 2200fd6aa4..1d6da46fae 100644 --- a/bigframes/pandas/__init__.py +++ b/bigframes/pandas/__init__.py @@ -67,6 +67,7 @@ import bigframes.operations as ops import bigframes.series import bigframes.session +import bigframes.session._io.bigquery import bigframes.session.clients @@ -391,7 +392,7 @@ def _set_default_session_location_if_possible(query): bqclient = clients_provider.bqclient - if bigframes.session._is_query(query): + if bigframes.session._io.bigquery.is_query(query): job = bqclient.query(query, bigquery.QueryJobConfig(dry_run=True)) options.bigquery.location = job.location else: diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 7c7d93541c..89845bb842 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -20,7 +20,6 @@ import datetime import logging import os -import re import secrets import typing from typing import ( @@ -86,10 +85,11 @@ import bigframes.core.tree_properties as tree_properties import bigframes.core.utils as utils import bigframes.dtypes +import bigframes.exceptions import bigframes.formatting_helpers as formatting_helpers from bigframes.functions.remote_function import read_gbq_function as bigframes_rgf from bigframes.functions.remote_function import remote_function as bigframes_rf -import bigframes.session._io.bigquery as bigframes_io +import bigframes.session._io.bigquery as bf_io_bigquery import bigframes.session._io.bigquery.read_gbq_table as bf_read_gbq_table import bigframes.session.clients import bigframes.version @@ -145,14 +145,18 @@ ) -def _is_query(query_or_table: str) -> bool: - """Determine if `query_or_table` is a table ID or a SQL string""" - return re.search(r"\s", query_or_table.strip(), re.MULTILINE) is not None +def _to_index_cols( + index_col: Iterable[str] | str | bigframes.enums.DefaultIndexKind = (), +) -> List[str]: + """Convert index_col into a list of column names.""" + if isinstance(index_col, bigframes.enums.DefaultIndexKind): + index_cols: List[str] = [] + elif isinstance(index_col, str): + index_cols = [index_col] + else: + index_cols = list(index_col) - -def _is_table_with_wildcard_suffix(query_or_table: str) -> bool: - """Determine if `query_or_table` is a table and contains a wildcard suffix.""" - return not _is_query(query_or_table) and query_or_table.endswith("*") + return index_cols class Session( @@ -181,12 +185,26 @@ def __init__( if context is None: context = bigquery_options.BigQueryOptions() - # TODO(swast): Get location from the environment. if context.location is None: self._location = "US" warnings.warn( f"No explicit location is set, so using location {self._location} for the session.", - stacklevel=2, + # User's code + # -> get_global_session() + # -> connect() + # -> Session() + # + # Note: We could also have: + # User's code + # -> read_gbq() + # -> with_default_session() + # -> get_global_session() + # -> connect() + # -> Session() + # but we currently have no way to disambiguate these + # situations. 
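+ # With stacklevel=4 the warning is attributed to the user's frame at the
+ # top of the first chain sketched above
+ # (user's code -> get_global_session() -> connect() -> Session()).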
+ stacklevel=4, + category=bigframes.exceptions.DefaultLocationWarning, ) else: self._location = context.location @@ -322,13 +340,19 @@ def read_gbq( columns = col_order filters = list(filters) - if len(filters) != 0 or _is_table_with_wildcard_suffix(query_or_table): + if len(filters) != 0 or bf_io_bigquery.is_table_with_wildcard_suffix( + query_or_table + ): # TODO(b/338111344): This appears to be missing index_cols, which # are necessary to be selected. - # TODO(b/338039517): Also, need to account for primary keys. - query_or_table = self._to_query(query_or_table, columns, filters) + # TODO(b/338039517): Refactor this to be called inside both + # _read_gbq_query and _read_gbq_table (after detecting primary keys) + # so we can make sure index_col/index_cols reflects primary keys. + query_or_table = bf_io_bigquery.to_query( + query_or_table, _to_index_cols(index_col), columns, filters + ) - if _is_query(query_or_table): + if bf_io_bigquery.is_query(query_or_table): return self._read_gbq_query( query_or_table, index_col=index_col, @@ -355,85 +379,6 @@ def read_gbq( use_cache=use_cache if use_cache is not None else True, ) - def _to_query( - self, - query_or_table: str, - columns: Iterable[str], - filters: third_party_pandas_gbq.FiltersType, - ) -> str: - """Compile query_or_table with conditions(filters, wildcards) to query.""" - filters = list(filters) - sub_query = ( - f"({query_or_table})" - if _is_query(query_or_table) - else f"`{query_or_table}`" - ) - - # TODO(b/338111344): Generate an index based on DefaultIndexKind if we - # don't have index columns specified. - select_clause = "SELECT " + ( - ", ".join(f"`{column}`" for column in columns) if columns else "*" - ) - - where_clause = "" - if filters: - valid_operators: Mapping[third_party_pandas_gbq.FilterOps, str] = { - "in": "IN", - "not in": "NOT IN", - "LIKE": "LIKE", - "==": "=", - ">": ">", - "<": "<", - ">=": ">=", - "<=": "<=", - "!=": "!=", - } - - # If single layer filter, add another pseudo layer. So the single layer represents "and" logic. - if isinstance(filters[0], tuple) and ( - len(filters[0]) == 0 or not isinstance(list(filters[0])[0], tuple) - ): - filters = typing.cast(third_party_pandas_gbq.FiltersType, [filters]) - - or_expressions = [] - for group in filters: - if not isinstance(group, Iterable): - group = [group] - - and_expressions = [] - for filter_item in group: - if not isinstance(filter_item, tuple) or (len(filter_item) != 3): - raise ValueError( - f"Filter condition should be a tuple of length 3, {filter_item} is not valid." - ) - - column, operator, value = filter_item - - if not isinstance(column, str): - raise ValueError( - f"Column name should be a string, but received '{column}' of type {type(column).__name__}." 
- ) - - if operator not in valid_operators: - raise ValueError(f"Operator {operator} is not valid.") - - operator_str = valid_operators[operator] - - if operator_str in ["IN", "NOT IN"]: - value_list = ", ".join([repr(v) for v in value]) - expression = f"`{column}` {operator_str} ({value_list})" - else: - expression = f"`{column}` {operator_str} {repr(value)}" - and_expressions.append(expression) - - or_expressions.append(" AND ".join(and_expressions)) - - if or_expressions: - where_clause = " WHERE " + " OR ".join(or_expressions) - - full_query = f"{select_clause} FROM {sub_query} AS sub{where_clause}" - return full_query - def _query_to_destination( self, query: str, @@ -610,12 +555,7 @@ def _read_gbq_query( True if use_cache is None else use_cache ) - if isinstance(index_col, bigframes.enums.DefaultIndexKind): - index_cols = [] - elif isinstance(index_col, str): - index_cols = [index_col] - else: - index_cols = list(index_col) + index_cols = _to_index_cols(index_col) destination, query_job = self._query_to_destination( query, @@ -682,8 +622,13 @@ def read_gbq_table( columns = col_order filters = list(filters) - if len(filters) != 0 or _is_table_with_wildcard_suffix(query): - query = self._to_query(query, columns, filters) + if len(filters) != 0 or bf_io_bigquery.is_table_with_wildcard_suffix(query): + # TODO(b/338039517): Refactor this to be called inside both + # _read_gbq_query and _read_gbq_table (after detecting primary keys) + # so we can make sure index_col/index_cols reflects primary keys. + query = bf_io_bigquery.to_query( + query, _to_index_cols(index_col), columns, filters + ) return self._read_gbq_query( query, @@ -838,12 +783,7 @@ def _read_bigquery_load_job( index_col: Iterable[str] | str | bigframes.enums.DefaultIndexKind = (), columns: Iterable[str] = (), ) -> dataframe.DataFrame: - if isinstance(index_col, bigframes.enums.DefaultIndexKind): - index_cols = [] - elif isinstance(index_col, str): - index_cols = [index_col] - else: - index_cols = list(index_col) + index_cols = _to_index_cols(index_col) if not job_config.clustering_fields and index_cols: job_config.clustering_fields = index_cols[:_MAX_CLUSTER_COLUMNS] @@ -1430,7 +1370,7 @@ def _create_empty_temp_table( datetime.datetime.now(datetime.timezone.utc) + constants.DEFAULT_EXPIRATION ) - table = bigframes_io.create_temp_table( + table = bf_io_bigquery.create_temp_table( self, expiration, schema=schema, diff --git a/bigframes/session/_io/bigquery/__init__.py b/bigframes/session/_io/bigquery/__init__.py index 79108c71a2..98e0dac1e8 100644 --- a/bigframes/session/_io/bigquery/__init__.py +++ b/bigframes/session/_io/bigquery/__init__.py @@ -19,10 +19,13 @@ import datetime import itertools import os +import re import textwrap import types -from typing import Dict, Iterable, Optional, Sequence, Tuple, Union +import typing +from typing import Dict, Iterable, Mapping, Optional, Sequence, Tuple, Union +import bigframes_vendored.pandas.io.gbq as third_party_pandas_gbq import google.api_core.exceptions import google.cloud.bigquery as bigquery @@ -311,3 +314,95 @@ def create_bq_dataset_reference( query_destination.project, query_destination.dataset_id, ) + + +def is_query(query_or_table: str) -> bool: + """Determine if `query_or_table` is a table ID or a SQL string""" + return re.search(r"\s", query_or_table.strip(), re.MULTILINE) is not None + + +def is_table_with_wildcard_suffix(query_or_table: str) -> bool: + """Determine if `query_or_table` is a table and contains a wildcard suffix.""" + return not is_query(query_or_table) 
and query_or_table.endswith("*") + + +def to_query( + query_or_table: str, + index_cols: Iterable[str], + columns: Iterable[str], + filters: third_party_pandas_gbq.FiltersType, +) -> str: + """Compile query_or_table with conditions(filters, wildcards) to query.""" + filters = list(filters) + sub_query = ( + f"({query_or_table})" if is_query(query_or_table) else f"`{query_or_table}`" + ) + + # TODO(b/338111344): Generate an index based on DefaultIndexKind if we + # don't have index columns specified. + if columns: + # We only reduce the selection if columns is set, but we always + # want to make sure index_cols is also included. + all_columns = itertools.chain(index_cols, columns) + select_clause = "SELECT " + ", ".join(f"`{column}`" for column in all_columns) + else: + select_clause = "SELECT *" + + where_clause = "" + if filters: + valid_operators: Mapping[third_party_pandas_gbq.FilterOps, str] = { + "in": "IN", + "not in": "NOT IN", + "LIKE": "LIKE", + "==": "=", + ">": ">", + "<": "<", + ">=": ">=", + "<=": "<=", + "!=": "!=", + } + + # If single layer filter, add another pseudo layer. So the single layer represents "and" logic. + if isinstance(filters[0], tuple) and ( + len(filters[0]) == 0 or not isinstance(list(filters[0])[0], tuple) + ): + filters = typing.cast(third_party_pandas_gbq.FiltersType, [filters]) + + or_expressions = [] + for group in filters: + if not isinstance(group, Iterable): + group = [group] + + and_expressions = [] + for filter_item in group: + if not isinstance(filter_item, tuple) or (len(filter_item) != 3): + raise ValueError( + f"Filter condition should be a tuple of length 3, {filter_item} is not valid." + ) + + column, operator, value = filter_item + + if not isinstance(column, str): + raise ValueError( + f"Column name should be a string, but received '{column}' of type {type(column).__name__}." + ) + + if operator not in valid_operators: + raise ValueError(f"Operator {operator} is not valid.") + + operator_str = valid_operators[operator] + + if operator_str in ["IN", "NOT IN"]: + value_list = ", ".join([repr(v) for v in value]) + expression = f"`{column}` {operator_str} ({value_list})" + else: + expression = f"`{column}` {operator_str} {repr(value)}" + and_expressions.append(expression) + + or_expressions.append(" AND ".join(and_expressions)) + + if or_expressions: + where_clause = " WHERE " + " OR ".join(or_expressions) + + full_query = f"{select_clause} FROM {sub_query} AS sub{where_clause}" + return full_query diff --git a/tests/system/small/test_pandas_options.py b/tests/system/small/test_pandas_options.py index afb75c65e3..c580f926c9 100644 --- a/tests/system/small/test_pandas_options.py +++ b/tests/system/small/test_pandas_options.py @@ -13,6 +13,7 @@ # limitations under the License. import datetime +import re from unittest import mock import warnings @@ -69,8 +70,12 @@ def test_read_gbq_start_sets_session_location( assert not bpd.options.bigquery.location # Starting user journey with read_gbq* should work for a table in any - # location, in this case tokyo - df = read_method(query_tokyo) + # location, in this case tokyo. + with warnings.catch_warnings(): + # Since the query refers to a specific location, no warning should be + # raised. 
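+ # simplefilter("error", ...) promotes the category to an exception, so
+ # this test fails loudly if a DefaultLocationWarning is emitted here.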
+ warnings.simplefilter("error", bigframes.exceptions.DefaultLocationWarning) + df = read_method(query_tokyo) assert df is not None # Now bigquery options location should be set to tokyo @@ -146,7 +151,11 @@ def test_read_gbq_after_session_start_must_comply_with_default_location( # Starting user journey with anything other than read_gbq*, such as # read_pandas would bind the session to default location US - df = bpd.read_pandas(scalars_pandas_df_index) + with pytest.warns( + bigframes.exceptions.DefaultLocationWarning, + match=re.escape("using location US for the session"), + ): + df = bpd.read_pandas(scalars_pandas_df_index) assert df is not None # Doing read_gbq* from a table in another location should fail @@ -262,17 +271,18 @@ def test_read_gbq_must_comply_with_set_location_non_US( def test_credentials_need_reauthentication(monkeypatch): # Use a simple test query to verify that default session works to interact - # with BQ + # with BQ. test_query = "SELECT 1" - # Confirm that default session has BQ client with valid credentials - session = bpd.get_global_session() - assert session.bqclient._credentials.valid - # Confirm that default session works as usual df = bpd.read_gbq(test_query) assert df is not None + # Call get_global_session() *after* read_gbq so that our location detection + # has a chance to work. + session = bpd.get_global_session() + assert session.bqclient._credentials.valid + with monkeypatch.context() as m: # Simulate expired credentials to trigger the credential refresh flow m.setattr(session.bqclient._credentials, "expiry", datetime.datetime.utcnow()) diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py index 6b2d7df50d..5daa01ad38 100644 --- a/tests/system/small/test_session.py +++ b/tests/system/small/test_session.py @@ -18,7 +18,7 @@ import textwrap import time import typing -from typing import List +from typing import List, Sequence import google import google.cloud.bigquery as bigquery @@ -338,30 +338,80 @@ def test_read_gbq_table_clustered_with_filter(session: bigframes.Session): assert "OLI_TIRS" in sensors.index -def test_read_gbq_wildcard(session: bigframes.Session): - df = session.read_gbq("bigquery-public-data.noaa_gsod.gsod193*") - assert df.shape == (348485, 32) +_GSOD_ALL_TABLES = "bigquery-public-data.noaa_gsod.gsod*" +_GSOD_1930S = "bigquery-public-data.noaa_gsod.gsod193*" -def test_read_gbq_wildcard_with_filter(session: bigframes.Session): - df = session.read_gbq( - "bigquery-public-data.noaa_gsod.gsod19*", - filters=[("_table_suffix", ">=", "30"), ("_table_suffix", "<=", "39")], # type: ignore - ) - assert df.shape == (348485, 32) - - -def test_read_gbq_table_wildcard(session: bigframes.Session): - df = session.read_gbq_table("bigquery-public-data.noaa_gsod.gsod193*") - assert df.shape == (348485, 32) - - -def test_read_gbq_table_wildcard_with_filter(session: bigframes.Session): - df = session.read_gbq_table( - "bigquery-public-data.noaa_gsod.gsod19*", - filters=[("_table_suffix", ">=", "30"), ("_table_suffix", "<=", "39")], # type: ignore +@pytest.mark.parametrize( + "api_method", + # Test that both methods work as there's a risk that read_gbq / + # read_gbq_table makes for an infinite loop. Table reads can convert to + # queries and read_gbq reads from tables. 
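+ # Illustrative call shape exercised by the cases below (parameters vary
+ # per case; see the parametrized filters/index_col/columns):
+ #   session.read_gbq_table(
+ #       "bigquery-public-data.noaa_gsod.gsod193*",
+ #       index_col=["stn", "wban", "year", "mo", "da"],
+ #       columns=["temp", "max", "min"],
+ #   )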
+ ["read_gbq", "read_gbq_table"], +) +@pytest.mark.parametrize( + ("filters", "table_id", "index_col", "columns"), + [ + pytest.param( + [("_table_suffix", ">=", "1930"), ("_table_suffix", "<=", "1939")], + _GSOD_ALL_TABLES, + ["stn", "wban", "year", "mo", "da"], + ["temp", "max", "min"], + id="all", + ), + pytest.param( + (), # filters + _GSOD_1930S, + (), # index_col + ["temp", "max", "min"], + id="columns", + ), + pytest.param( + [("_table_suffix", ">=", "1930"), ("_table_suffix", "<=", "1939")], + _GSOD_ALL_TABLES, + (), # index_col, + (), # columns + id="filters", + ), + pytest.param( + (), # filters + _GSOD_1930S, + ["stn", "wban", "year", "mo", "da"], + (), # columns + id="index_col", + ), + ], +) +def test_read_gbq_wildcard( + session: bigframes.Session, + api_method: str, + filters, + table_id: str, + index_col: Sequence[str], + columns: Sequence[str], +): + table_metadata = session.bqclient.get_table(table_id) + method = getattr(session, api_method) + df = method(table_id, filters=filters, index_col=index_col, columns=columns) + num_rows, num_columns = df.shape + + if index_col: + assert list(df.index.names) == list(index_col) + else: + assert df.index.name is None + + expected_columns = ( + columns + if columns + else [ + field.name + for field in table_metadata.schema + if field.name not in index_col and field.name not in columns + ] ) - assert df.shape == (348485, 32) + assert list(df.columns) == expected_columns + assert num_rows > 0 + assert num_columns == len(expected_columns) @pytest.mark.parametrize( diff --git a/tests/unit/session/test_io_bigquery.py b/tests/unit/session/test_io_bigquery.py index 43865fc2c8..9da085e824 100644 --- a/tests/unit/session/test_io_bigquery.py +++ b/tests/unit/session/test_io_bigquery.py @@ -210,3 +210,109 @@ def test_create_temp_table_default_expiration(): def test_bq_schema_to_sql(schema: Iterable[bigquery.SchemaField], expected: str): sql = io_bq.bq_schema_to_sql(schema) assert sql == expected + + +@pytest.mark.parametrize( + ("query_or_table", "index_cols", "columns", "filters", "expected_output"), + [ + pytest.param( + "test_table", + [], + [], + ["date_col", ">", "2022-10-20"], + None, + marks=pytest.mark.xfail( + raises=ValueError, + ), + id="raise_error", + ), + pytest.param( + "test_table", + ["row_index"], + ["string_col"], + [ + (("rowindex", "not in", [0, 6]),), + (("string_col", "in", ["Hello, World!", "こんにちは"]),), + ], + ( + "SELECT `row_index`, `string_col` FROM `test_table` AS sub WHERE " + "`rowindex` NOT IN (0, 6) OR `string_col` IN ('Hello, World!', " + "'こんにちは')" + ), + id="table-all_params-filter_or_operation", + ), + pytest.param( + """SELECT + rowindex, + string_col, + FROM `test_table` AS t + """, + ["rowindex"], + ["string_col"], + [ + ("rowindex", "<", 4), + ("string_col", "==", "Hello, World!"), + ], + """SELECT `rowindex`, `string_col` FROM (SELECT + rowindex, + string_col, + FROM `test_table` AS t + ) AS sub WHERE `rowindex` < 4 AND `string_col` = 'Hello, World!'""", + id="subquery-all_params-filter_and_operation", + ), + pytest.param( + "test_table", + [], + ["col_a", "col_b"], + [], + "SELECT `col_a`, `col_b` FROM `test_table` AS sub", + id="table-columns", + ), + pytest.param( + "test_table", + [], + [], + [("date_col", ">", "2022-10-20")], + "SELECT * FROM `test_table` AS sub WHERE `date_col` > '2022-10-20'", + id="table-filter", + ), + pytest.param( + "test_table*", + [], + [], + [], + "SELECT * FROM `test_table*` AS sub", + id="wildcard-no_params", + ), + pytest.param( + "test_table*", + [], + [], + 
[("_TABLE_SUFFIX", ">", "2022-10-20")], + "SELECT * FROM `test_table*` AS sub WHERE `_TABLE_SUFFIX` > '2022-10-20'", + id="wildcard-filter", + ), + ], +) +def test_to_query(query_or_table, index_cols, columns, filters, expected_output): + query = io_bq.to_query( + query_or_table, + index_cols, + columns, + filters, + ) + assert query == expected_output + + +@pytest.mark.parametrize( + ("query_or_table", "filters", "expected_output"), + [], +) +def test_to_query_with_wildcard_table(query_or_table, filters, expected_output): + query = io_bq.to_query( + query_or_table, + (), # index_cols + (), # columns + filters, + ) + assert query == expected_output diff --git a/tests/unit/session/test_session.py b/tests/unit/session/test_session.py index a161c2df76..bea858e037 100644 --- a/tests/unit/session/test_session.py +++ b/tests/unit/session/test_session.py @@ -398,85 +398,3 @@ def test_session_init_fails_with_no_project(): credentials=mock.Mock(spec=google.auth.credentials.Credentials) ) ) - - -@pytest.mark.parametrize( - ("query_or_table", "columns", "filters", "expected_output"), - [ - pytest.param( - """SELECT - rowindex, - string_col, - FROM `test_table` AS t - """, - [], - [("rowindex", "<", 4), ("string_col", "==", "Hello, World!")], - """SELECT * FROM (SELECT - rowindex, - string_col, - FROM `test_table` AS t - ) AS sub WHERE `rowindex` < 4 AND `string_col` = 'Hello, World!'""", - id="query_input", - ), - pytest.param( - "test_table", - [], - [("date_col", ">", "2022-10-20")], - "SELECT * FROM `test_table` AS sub WHERE `date_col` > '2022-10-20'", - id="table_input", - ), - pytest.param( - "test_table", - ["row_index", "string_col"], - [ - (("rowindex", "not in", [0, 6]),), - (("string_col", "in", ["Hello, World!", "こんにちは"]),), - ], - ( - "SELECT `row_index`, `string_col` FROM `test_table` AS sub WHERE " - "`rowindex` NOT IN (0, 6) OR `string_col` IN ('Hello, World!', " - "'こんにちは')" - ), - id="or_operation", - ), - pytest.param( - "test_table", - [], - ["date_col", ">", "2022-10-20"], - None, - marks=pytest.mark.xfail( - raises=ValueError, - ), - id="raise_error", - ), - ], -) -def test_read_gbq_with_filters(query_or_table, columns, filters, expected_output): - session = resources.create_bigquery_session() - query = session._to_query(query_or_table, columns, filters) - assert query == expected_output - - -@pytest.mark.parametrize( - ("query_or_table", "columns", "filters", "expected_output"), - [ - pytest.param( - "test_table*", - [], - [], - "SELECT * FROM `test_table*` AS sub", - id="wildcard_table_input", - ), - pytest.param( - "test_table*", - [], - [("_TABLE_SUFFIX", ">", "2022-10-20")], - "SELECT * FROM `test_table*` AS sub WHERE `_TABLE_SUFFIX` > '2022-10-20'", - id="wildcard_table_input_with_filter", - ), - ], -) -def test_read_gbq_wildcard(query_or_table, columns, filters, expected_output): - session = resources.create_bigquery_session() - query = session._to_query(query_or_table, columns, filters) - assert query == expected_output From 306953aaae69e57c7c2f5eefb88d55a35bdcca9d Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Wed, 8 May 2024 22:58:29 +0000 Subject: [PATCH 06/17] docs: document inlining of small data in `read_*` APIs (#670) * docs: document inlining of small data in `read_*` APIs * mention that threshold is in memory size * non-bigquery instead of non-"bigquery" --- bigframes/session/__init__.py | 5 +++++ third_party/bigframes_vendored/pandas/io/parquet.py | 5 +++++ .../bigframes_vendored/pandas/io/parsers/readers.py | 10 ++++++++++ 
third_party/bigframes_vendored/pandas/io/pickle.py | 5 +++++ 4 files changed, 25 insertions(+) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 89845bb842..5f70fd77f9 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -874,6 +874,11 @@ def read_pandas( The pandas DataFrame will be persisted as a temporary BigQuery table, which can be automatically recycled after the Session is closed. + .. note:: + Data is inlined in the query SQL if it is small enough (roughly 5MB + or less in memory). Larger size data is loaded to a BigQuery table + instead. + **Examples:** >>> import bigframes.pandas as bpd diff --git a/third_party/bigframes_vendored/pandas/io/parquet.py b/third_party/bigframes_vendored/pandas/io/parquet.py index 877a384b6d..1f5563c962 100644 --- a/third_party/bigframes_vendored/pandas/io/parquet.py +++ b/third_party/bigframes_vendored/pandas/io/parquet.py @@ -19,6 +19,11 @@ def read_parquet( Instead, set a serialized index column as the index and sort by that in the resulting DataFrame. + .. note:: + For non-"bigquery" engine, data is inlined in the query SQL if it is + small enough (roughly 5MB or less in memory). Larger size data is + loaded to a BigQuery table instead. + **Examples:** >>> import bigframes.pandas as bpd diff --git a/third_party/bigframes_vendored/pandas/io/parsers/readers.py b/third_party/bigframes_vendored/pandas/io/parsers/readers.py index d147abfd22..248cf8e0fe 100644 --- a/third_party/bigframes_vendored/pandas/io/parsers/readers.py +++ b/third_party/bigframes_vendored/pandas/io/parsers/readers.py @@ -62,6 +62,11 @@ def read_csv( file. Instead, set a serialized index column as the index and sort by that in the resulting DataFrame. + .. note:: + For non-bigquery engine, data is inlined in the query SQL if it is + small enough (roughly 5MB or less in memory). Larger size data is + loaded to a BigQuery table instead. + **Examples:** >>> import bigframes.pandas as bpd @@ -167,6 +172,11 @@ def read_json( file. Instead, set a serialized index column as the index and sort by that in the resulting DataFrame. + .. note:: + For non-bigquery engine, data is inlined in the query SQL if it is + small enough (roughly 5MB or less in memory). Larger size data is + loaded to a BigQuery table instead. + **Examples:** >>> import bigframes.pandas as bpd diff --git a/third_party/bigframes_vendored/pandas/io/pickle.py b/third_party/bigframes_vendored/pandas/io/pickle.py index 096d9b13d6..88684309f9 100644 --- a/third_party/bigframes_vendored/pandas/io/pickle.py +++ b/third_party/bigframes_vendored/pandas/io/pickle.py @@ -25,6 +25,11 @@ def read_pickle( If the content of the pickle file is a Series and its name attribute is None, the name will be set to '0' by default. + .. note:: + Data is inlined in the query SQL if it is small enough (roughly 5MB + or less in memory). Larger size data is loaded to a BigQuery table + instead. 
+ **Examples:** >>> import bigframes.pandas as bpd From 57ccabcd1402b7938e2c7068e5b4880ef018f39c Mon Sep 17 00:00:00 2001 From: Stephanie A <129541811+DevStephanie@users.noreply.github.com> Date: Thu, 9 May 2024 09:38:28 -0500 Subject: [PATCH 07/17] feat: suggest correct options in bpd.options.bigquery.location (#666) * feat: suggest correct options in bpd.options.bigquery.location deps: add jellyfish as a dependency for spelling correction --- bigframes/_config/bigquery_options.py | 11 +++++++++-- setup.py | 2 ++ testing/constraints-3.9.txt | 1 + tests/unit/_config/test_bigquery_options.py | 11 ++++++----- 4 files changed, 18 insertions(+), 7 deletions(-) diff --git a/bigframes/_config/bigquery_options.py b/bigframes/_config/bigquery_options.py index 74561e6f24..6f841a36b3 100644 --- a/bigframes/_config/bigquery_options.py +++ b/bigframes/_config/bigquery_options.py @@ -21,6 +21,7 @@ import google.api_core.exceptions import google.auth.credentials +import jellyfish import bigframes.constants import bigframes.exceptions @@ -30,7 +31,8 @@ "Call bigframes.pandas.close_session() first, if you are using the bigframes.pandas API." ) -UNKNOWN_LOCATION_MESSAGE = "The location '{location}' is set to an unknown value." + +UNKNOWN_LOCATION_MESSAGE = "The location '{location}' is set to an unknown value. Did you mean '{possibility}'?" def _validate_location(value: Optional[str]): @@ -39,8 +41,13 @@ def _validate_location(value: Optional[str]): return if value not in bigframes.constants.ALL_BIGQUERY_LOCATIONS: + location = str(value) + possibility = min( + bigframes.constants.ALL_BIGQUERY_LOCATIONS, + key=lambda item: jellyfish.levenshtein_distance(location, item), + ) warnings.warn( - UNKNOWN_LOCATION_MESSAGE.format(location=value), + UNKNOWN_LOCATION_MESSAGE.format(location=location, possibility=possibility), # There are many layers before we get to (possibly) the user's code: # -> bpd.options.bigquery.location = "us-central-1" # -> location.setter diff --git a/setup.py b/setup.py index 2ccf63259c..d5d282d11a 100644 --- a/setup.py +++ b/setup.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. + import io import itertools import os @@ -45,6 +46,7 @@ "google-cloud-resource-manager >=1.10.3", "google-cloud-storage >=2.0.0", "ibis-framework[bigquery] >=8.0.0,<9.0.0dev", + "jellyfish >=0.8.9", # TODO: Relax upper bound once we have fixed `system_prerelease` tests. "pandas >=1.5.0", "pyarrow >=8.0.0", diff --git a/testing/constraints-3.9.txt b/testing/constraints-3.9.txt index f5007ed564..3c51668655 100644 --- a/testing/constraints-3.9.txt +++ b/testing/constraints-3.9.txt @@ -11,6 +11,7 @@ google-cloud-iam==2.12.1 google-cloud-resource-manager==1.10.3 google-cloud-storage==2.0.0 ibis-framework==8.0.0 +jellyfish==0.8.9 pandas==1.5.0 pyarrow==8.0.0 pydata-google-auth==1.8.2 diff --git a/tests/unit/_config/test_bigquery_options.py b/tests/unit/_config/test_bigquery_options.py index 7d9a452f42..b827b0723d 100644 --- a/tests/unit/_config/test_bigquery_options.py +++ b/tests/unit/_config/test_bigquery_options.py @@ -108,24 +108,25 @@ def test_location_set_to_valid_no_warning(valid_location): @pytest.mark.parametrize( [ "invalid_location", + "possibility", ], [ # Test with common mistakes, see article. 
# https://en.wikipedia.org/wiki/Edit_distance#Formal_definition_and_properties # Substitution - ("us-wist-3",), + ("us-wist3", "us-west3"), # Insertion - ("us-central-1",), + ("us-central-1", "us-central1"), # Deletion - ("asia-suth2",), + ("asia-suth2", "asia-south2"), ], ) -def test_location_set_to_invalid_warning(invalid_location): +def test_location_set_to_invalid_warning(invalid_location, possibility): options = bigquery_options.BigQueryOptions() with pytest.warns( bigframes.exceptions.UnknownLocationWarning, match=re.escape( - f"The location '{invalid_location}' is set to an unknown value." + f"The location '{invalid_location}' is set to an unknown value. Did you mean '{possibility}'?" ), ): options.location = invalid_location From 93416ed2f8353c12eb162e21e9bf155312b0ed8c Mon Sep 17 00:00:00 2001 From: Ashley Xu <139821907+ashleyxuu@users.noreply.github.com> Date: Thu, 9 May 2024 13:20:20 -0700 Subject: [PATCH 08/17] docs: add code snippets for llm text generatiion (#669) * docs: add code snippets for llm text generatiion --- samples/snippets/text_generation_test.py | 68 ++++++++++++++++++++++++ 1 file changed, 68 insertions(+) create mode 100644 samples/snippets/text_generation_test.py diff --git a/samples/snippets/text_generation_test.py b/samples/snippets/text_generation_test.py new file mode 100644 index 0000000000..c4df1dde3b --- /dev/null +++ b/samples/snippets/text_generation_test.py @@ -0,0 +1,68 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +def test_llm_text_generation() -> None: + # Determine project id, in this case prefer the one set in the environment + # variable GOOGLE_CLOUD_PROJECT (if any) + import os + + PROJECT_ID = os.getenv("GOOGLE_CLOUD_PROJECT", "bigframes-dev") + LOCATION = "US" + + # [START bigquery_dataframes_generate_text_tutorial_create_remote_model] + import bigframes + from bigframes.ml.llm import PaLM2TextGenerator + + bigframes.options.bigquery.project = PROJECT_ID + bigframes.options.bigquery.location = LOCATION + + model = PaLM2TextGenerator() + # [END bigquery_dataframes_generate_text_tutorial_create_remote_model] + assert model is not None + + # [START bigquery_dataframes_generate_text_tutorial_perform_keyword_extraction] + import bigframes.pandas as bpd + + df = bpd.read_gbq("bigquery-public-data.imdb.reviews", max_results=5) + df_prompt_prefix = "Extract the key words from the text below: " + df_prompt = df_prompt_prefix + df["review"] + + # Predict using the model + df_pred = model.predict(df_prompt, temperature=0.2, max_output_tokens=100) + df_pred.peek(5) + # [END bigquery_dataframes_generate_text_tutorial_perform_keyword_extraction] + # peek() is used to show a preview of the results. If the output + # of this sample changes, also update the screenshot for the associated + # tutorial on cloud.google.com. 
+ assert df_pred["ml_generate_text_llm_result"] is not None + assert df_pred["ml_generate_text_llm_result"].iloc[0] is not None + + # [START bigquery_dataframes_generate_text_tutorial_perform_sentiment_analysis] + import bigframes.pandas as bpd + + df = bpd.read_gbq("bigquery-public-data.imdb.reviews", max_results=5) + df_prompt_prefix = "perform sentiment analysis on the following text, return one the following categories: positive, negative: " + df_prompt = df_prompt_prefix + df["review"] + + # Predict using the model + df_pred = model.predict(df_prompt, temperature=0.2, max_output_tokens=100) + df_pred.peek(5) + # [END bigquery_dataframes_generate_text_tutorial_perform_sentiment_analysis] + # peek() is used to show a preview of the results. If the output + # of this sample changes, also update the screenshot for the associated + # tutorial on cloud.google.com. + + assert df_pred["ml_generate_text_llm_result"] is not None + assert df_pred["ml_generate_text_llm_result"].iloc[0] is not None From 2218c21b5bb0f9e54a365ba1ada0203cbc4c9efc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Thu, 9 May 2024 21:17:57 -0500 Subject: [PATCH 09/17] feat: add `Series.case_when()` (#673) * feat: add `Series.case_when()` * rename to ScalarOp * rename to exprs * add type annotations feat: add `DataFrame.__delitem__` (#673) docs: add logistic regression samples (#673) --- bigframes/core/__init__.py | 17 ++- bigframes/core/blocks.py | 9 ++ bigframes/dataframe.py | 4 + bigframes/operations/__init__.py | 51 ++++--- bigframes/operations/base.py | 23 ++- bigframes/series.py | 19 +++ .../logistic_regression_prediction_test.py | 137 ++++++++++++++++++ tests/system/small/test_series.py | 30 ++++ .../bigframes_vendored/pandas/core/series.py | 55 +++++++ 9 files changed, 311 insertions(+), 34 deletions(-) create mode 100644 samples/snippets/logistic_regression_prediction_test.py diff --git a/bigframes/core/__init__.py b/bigframes/core/__init__.py index eef0efcf83..79c6bb6495 100644 --- a/bigframes/core/__init__.py +++ b/bigframes/core/__init__.py @@ -16,6 +16,7 @@ from dataclasses import dataclass import functools import io +import itertools import typing from typing import Iterable, Sequence @@ -370,14 +371,16 @@ def unpivot( for col_id, input_ids in unpivot_columns: # row explode offset used to choose the input column # we use offset instead of label as labels are not necessarily unique - cases = tuple( - ( - ops.eq_op.as_expr(explode_offsets_id, ex.const(i)), - ex.free_var(id_or_null) - if (id_or_null is not None) - else ex.const(None), + cases = itertools.chain( + *( + ( + ops.eq_op.as_expr(explode_offsets_id, ex.const(i)), + ex.free_var(id_or_null) + if (id_or_null is not None) + else ex.const(None), + ) + for i, id_or_null in enumerate(input_ids) ) - for i, id_or_null in enumerate(input_ids) ) col_expr = ops.case_when_op.as_expr(*cases) unpivot_exprs.append((col_expr, col_id)) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 402581eb6f..277409f3a3 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -803,6 +803,15 @@ def apply_ternary_op( expr = op.as_expr(col_id_1, col_id_2, col_id_3) return self.project_expr(expr, result_label) + def apply_nary_op( + self, + columns: Iterable[str], + op: ops.NaryOp, + result_label: Label = None, + ) -> typing.Tuple[Block, str]: + expr = op.as_expr(*columns) + return self.project_expr(expr, result_label) + def multi_apply_window_op( self, columns: typing.Sequence[str], diff --git a/bigframes/dataframe.py 
b/bigframes/dataframe.py index 1f1fb5467f..47730630e3 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -655,6 +655,10 @@ def _repr_html_(self) -> str: html_string += f"[{row_count} rows x {column_count} columns in total]" return html_string + def __delitem__(self, key: str): + df = self.drop(columns=[key]) + self._set_block(df._get_block()) + def __setitem__(self, key: str, value: SingleItemValue): df = self._assign_single_item(key, value) self._set_block(df._get_block()) diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py index a7c385a2b8..e52f488d38 100644 --- a/bigframes/operations/__init__.py +++ b/bigframes/operations/__init__.py @@ -17,7 +17,7 @@ import dataclasses import functools import typing -from typing import Tuple, Union +from typing import Union import numpy as np import pandas as pd @@ -46,7 +46,7 @@ def order_preserving(self) -> bool: @dataclasses.dataclass(frozen=True) -class NaryOp: +class ScalarOp: @property def name(self) -> str: raise NotImplementedError("RowOp abstract base class has no implementation") @@ -60,10 +60,30 @@ def order_preserving(self) -> bool: return False +@dataclasses.dataclass(frozen=True) +class NaryOp(ScalarOp): + def as_expr( + self, + *exprs: Union[str | bigframes.core.expression.Expression], + ) -> bigframes.core.expression.Expression: + import bigframes.core.expression + + # Keep this in sync with output_type and compilers + inputs: list[bigframes.core.expression.Expression] = [] + + for expr in exprs: + inputs.append(_convert_expr_input(expr)) + + return bigframes.core.expression.OpExpression( + self, + tuple(inputs), + ) + + # These classes can be used to create simple ops that don't take local parameters # All is needed is a unique name, and to register an implementation in ibis_mappings.py @dataclasses.dataclass(frozen=True) -class UnaryOp(NaryOp): +class UnaryOp(ScalarOp): @property def arguments(self) -> int: return 1 @@ -79,7 +99,7 @@ def as_expr( @dataclasses.dataclass(frozen=True) -class BinaryOp(NaryOp): +class BinaryOp(ScalarOp): @property def arguments(self) -> int: return 2 @@ -101,7 +121,7 @@ def as_expr( @dataclasses.dataclass(frozen=True) -class TernaryOp(NaryOp): +class TernaryOp(ScalarOp): @property def arguments(self) -> int: return 3 @@ -655,27 +675,6 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT output_expr_types, ) - def as_expr( - self, - *case_output_pairs: Tuple[ - Union[str | bigframes.core.expression.Expression], - Union[str | bigframes.core.expression.Expression], - ], - ) -> bigframes.core.expression.Expression: - import bigframes.core.expression - - # Keep this in sync with output_type and compilers - inputs: list[bigframes.core.expression.Expression] = [] - - for case, output in case_output_pairs: - inputs.append(_convert_expr_input(case)) - inputs.append(_convert_expr_input(output)) - - return bigframes.core.expression.OpExpression( - self, - tuple(inputs), - ) - case_when_op = CaseWhenOp() diff --git a/bigframes/operations/base.py b/bigframes/operations/base.py index b003ce59cc..75d14f3fbc 100644 --- a/bigframes/operations/base.py +++ b/bigframes/operations/base.py @@ -15,6 +15,7 @@ from __future__ import annotations import typing +from typing import List, Sequence import bigframes_vendored.pandas.pandas._typing as vendored_pandas_typing import numpy @@ -205,6 +206,21 @@ def _apply_binary_op( block, result_id = self._block.project_expr(expr, name) return series.Series(block.select_column(result_id)) + def _apply_nary_op( + 
self, + op: ops.NaryOp, + others: Sequence[typing.Union[series.Series, scalars.Scalar]], + ignore_self=False, + ): + """Applies an n-ary operator to the series and others.""" + values, block = self._align_n(others, ignore_self=ignore_self) + block, result_id = block.apply_nary_op( + values, + op, + self._name, + ) + return series.Series(block.select_column(result_id)) + def _apply_binary_aggregation( self, other: series.Series, stat: agg_ops.BinaryAggregateOp ) -> float: @@ -226,8 +242,13 @@ def _align_n( self, others: typing.Sequence[typing.Union[series.Series, scalars.Scalar]], how="outer", + ignore_self=False, ) -> tuple[typing.Sequence[str], blocks.Block]: - value_ids = [self._value_column] + if ignore_self: + value_ids: List[str] = [] + else: + value_ids = [self._value_column] + block = self._block for other in others: if isinstance(other, series.Series): diff --git a/bigframes/series.py b/bigframes/series.py index aea3d60ff5..ce13d205bd 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -410,6 +410,25 @@ def between(self, left, right, inclusive="both"): self._apply_binary_op(right, right_op) ) + def case_when(self, caselist) -> Series: + return self._apply_nary_op( + ops.case_when_op, + tuple( + itertools.chain( + itertools.chain(*caselist), + # Fallback to current value if no other matches. + ( + # We make a Series with a constant value to avoid casts to + # types other than boolean. + Series(True, index=self.index, dtype=pandas.BooleanDtype()), + self, + ), + ), + ), + # Self is already included in "others". + ignore_self=True, + ) + def cumsum(self) -> Series: return self._apply_window_op( agg_ops.sum_op, bigframes.core.window_spec.WindowSpec(following=0) diff --git a/samples/snippets/logistic_regression_prediction_test.py b/samples/snippets/logistic_regression_prediction_test.py new file mode 100644 index 0000000000..6a40369ba8 --- /dev/null +++ b/samples/snippets/logistic_regression_prediction_test.py @@ -0,0 +1,137 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""BigQuery DataFrames code samples for +https://cloud.google.com/bigquery/docs/logistic-regression-prediction. 
+""" + + +def test_logistic_regression_prediction(random_model_id: str) -> None: + your_model_id = random_model_id + + # [START bigquery_dataframes_logistic_regression_prediction_examine] + import bigframes.pandas as bpd + + df = bpd.read_gbq( + "bigquery-public-data.ml_datasets.census_adult_income", + columns=( + "age", + "workclass", + "marital_status", + "education_num", + "occupation", + "hours_per_week", + "income_bracket", + "functional_weight", + ), + max_results=100, + ) + df.peek() + # Output: + # age workclass marital_status education_num occupation hours_per_week income_bracket functional_weight + # 47 Local-gov Married-civ-spouse 13 Prof-specialty 40 >50K 198660 + # 56 Private Never-married 9 Adm-clerical 40 <=50K 85018 + # 40 Private Married-civ-spouse 12 Tech-support 40 >50K 285787 + # 34 Self-emp-inc Married-civ-spouse 9 Craft-repair 54 >50K 207668 + # 23 Private Married-civ-spouse 10 Handlers-cleaners 40 <=50K 40060 + # [END bigquery_dataframes_logistic_regression_prediction_examine] + + # [START bigquery_dataframes_logistic_regression_prediction_prepare] + import bigframes.pandas as bpd + + input_data = bpd.read_gbq( + "bigquery-public-data.ml_datasets.census_adult_income", + columns=( + "age", + "workclass", + "marital_status", + "education_num", + "occupation", + "hours_per_week", + "income_bracket", + "functional_weight", + ), + ) + input_data["dataframe"] = bpd.Series("training", index=input_data.index,).case_when( + [ + (((input_data["functional_weight"] % 10) == 8), "evaluation"), + (((input_data["functional_weight"] % 10) == 9), "prediction"), + ] + ) + del input_data["functional_weight"] + # [END bigquery_dataframes_logistic_regression_prediction_prepare] + + # [START bigquery_dataframes_logistic_regression_prediction_create_model] + import bigframes.ml.linear_model + + # input_data is defined in an earlier step. + training_data = input_data[input_data["dataframe"] == "training"] + X = training_data.drop(columns=["income_bracket", "dataframe"]) + y = training_data["income_bracket"] + + census_model = bigframes.ml.linear_model.LogisticRegression() + census_model.fit(X, y) + + census_model.to_gbq( + your_model_id, # For example: "your-project.census.census_model" + replace=True, + ) + # [END bigquery_dataframes_logistic_regression_prediction_create_model] + + # [START bigquery_dataframes_logistic_regression_prediction_evaluate_model] + # Select model you'll use for predictions. `read_gbq_model` loads model + # data from BigQuery, but you could also use the `census_model` object + # from previous steps. + census_model = bpd.read_gbq_model( + your_model_id, # For example: "your-project.census.census_model" + ) + + # input_data is defined in an earlier step. + evaluation_data = input_data[input_data["dataframe"] == "evaluation"] + X = evaluation_data.drop(columns=["income_bracket", "dataframe"]) + y = evaluation_data["income_bracket"] + + # The score() method evaluates how the model performs compared to the + # actual data. Output DataFrame matches that of ML.EVALUATE(). + score = census_model.score(X, y) + score.peek() + # Output: + # precision recall accuracy f1_score log_loss roc_auc + # 0 0.685764 0.536685 0.83819 0.602134 0.350417 0.882953 + # [END bigquery_dataframes_logistic_regression_prediction_evaluate_model] + + # [START bigquery_dataframes_logistic_regression_prediction_predict_income_bracket] + # Select model you'll use for predictions. 
`read_gbq_model` loads model + # data from BigQuery, but you could also use the `census_model` object + # from previous steps. + census_model = bpd.read_gbq_model( + your_model_id, # For example: "your-project.census.census_model" + ) + + # input_data is defined in an earlier step. + prediction_data = input_data[input_data["dataframe"] == "prediction"] + + predictions = census_model.predict(prediction_data) + predictions.peek() + # Output: + # predicted_income_bracket predicted_income_bracket_probs age workclass ... occupation hours_per_week income_bracket dataframe + # 18004 <=50K [{'label': ' >50K', 'prob': 0.0763305999358786... 75 ? ... ? 6 <=50K prediction + # 18886 <=50K [{'label': ' >50K', 'prob': 0.0448866871906495... 73 ? ... ? 22 >50K prediction + # 31024 <=50K [{'label': ' >50K', 'prob': 0.0362982319421936... 69 ? ... ? 1 <=50K prediction + # 31022 <=50K [{'label': ' >50K', 'prob': 0.0787836112058324... 75 ? ... ? 5 <=50K prediction + # 23295 <=50K [{'label': ' >50K', 'prob': 0.3385373037905673... 78 ? ... ? 32 <=50K prediction + # [END bigquery_dataframes_logistic_regression_prediction_predict_income_bracket] + + # TODO(tswast): Implement ML.EXPLAIN_PREDICT() and corresponding sample. + # TODO(tswast): Implement ML.GLOBAL_EXPLAIN() and corresponding sample. diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index 38aed19f05..beb99b1ada 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -2565,6 +2565,36 @@ def test_between(scalars_df_index, scalars_pandas_df_index, left, right, inclusi ) +def test_case_when(scalars_df_index, scalars_pandas_df_index): + pytest.importorskip( + "pandas", + minversion="2.2.0", + reason="case_when added in pandas 2.2.0", + ) + + bf_series = scalars_df_index["int64_col"] + pd_series = scalars_pandas_df_index["int64_col"] + + # TODO(tswast): pandas case_when appears to assume True when a value is + # null. I suspect this should be considered a bug in pandas. + bf_result = bf_series.case_when( + [ + ((bf_series > 100).fillna(True), 1000), + ((bf_series < -100).fillna(True), -1000), + ] + ).to_pandas() + pd_result = pd_series.case_when( + [ + (pd_series > 100, 1000), + (pd_series < -100, -1000), + ] + ) + pd.testing.assert_series_equal( + bf_result, + pd_result.astype(pd.Int64Dtype()), + ) + + def test_to_frame(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index 4833c41ff7..e155fb073a 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -6,10 +6,12 @@ from typing import ( Hashable, IO, + List, Literal, Mapping, Optional, Sequence, + Tuple, TYPE_CHECKING, Union, ) @@ -1937,6 +1939,59 @@ def between( """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def case_when( + self, + caselist: List[Tuple[Series, Series]], + ) -> Series: + """Replace values where the conditions are True. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> c = bpd.Series([6, 7, 8, 9], name="c") + >>> a = bpd.Series([0, 0, 1, 2]) + >>> b = bpd.Series([0, 3, 4, 5]) + + >>> c.case_when( + ... caselist=[ + ... (a.gt(0), a), # condition, replacement + ... (b.gt(0), b), + ... ] + ... ) + 0 6 + 1 3 + 2 1 + 3 2 + Name: c, dtype: Int64 + + **See also:** + + - :func:`bigframes.series.Series.mask` : Replace values where the condition is True. 
+ + Args: + caselist: + A list of tuples of conditions and expected replacements + Takes the form: ``(condition0, replacement0)``, + ``(condition1, replacement1)``, ... . + ``condition`` should be a 1-D boolean array-like object + or a callable. If ``condition`` is a callable, + it is computed on the Series + and should return a boolean Series or array. + The callable must not change the input Series + (though pandas doesn`t check it). ``replacement`` should be a + 1-D array-like object, a scalar or a callable. + If ``replacement`` is a callable, it is computed on the Series + and should return a scalar or Series. The callable + must not change the input Series + (though pandas doesn`t check it). + + Returns: + bigframes.series.Series + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def cumprod(self): """ Return cumulative product over a DataFrame or Series axis. From f2ed29cba8866508d1c68e45818c275b99732333 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Fri, 10 May 2024 09:21:52 -0700 Subject: [PATCH 10/17] refactor: Distinguish between range and row windows (#672) --- bigframes/core/block_transforms.py | 49 +++++----- bigframes/core/blocks.py | 7 +- bigframes/core/compile/compiled.py | 42 ++++++--- bigframes/core/groupby/__init__.py | 27 +++--- bigframes/core/reshape/__init__.py | 6 +- bigframes/core/window_spec.py | 129 +++++++++++++++++++++++++-- bigframes/dataframe.py | 27 +++--- bigframes/operations/aggregations.py | 44 +++++---- bigframes/series.py | 24 ++--- 9 files changed, 249 insertions(+), 106 deletions(-) diff --git a/bigframes/core/block_transforms.py b/bigframes/core/block_transforms.py index a221b343a5..e12e6bf054 100644 --- a/bigframes/core/block_transforms.py +++ b/bigframes/core/block_transforms.py @@ -71,21 +71,19 @@ def indicate_duplicates( if keep == "first": # Count how many copies occur up to current copy of value # Discard this value if there are copies BEFORE - window_spec = windows.WindowSpec( + window_spec = windows.cumulative_rows( grouping_keys=tuple(columns), - following=0, ) elif keep == "last": # Count how many copies occur up to current copy of values # Discard this value if there are copies AFTER - window_spec = windows.WindowSpec( + window_spec = windows.inverse_cumulative_rows( grouping_keys=tuple(columns), - preceding=0, ) else: # keep == False # Count how many copies of the value occur in entire series. 
# Discard this value if there are copies ANYWHERE - window_spec = windows.WindowSpec(grouping_keys=tuple(columns)) + window_spec = windows.unbound(grouping_keys=tuple(columns)) block, dummy = block.create_constant(1) block, val_count_col_id = block.apply_window_op( dummy, @@ -114,7 +112,7 @@ def quantile( dropna: bool = False, ) -> blocks.Block: # TODO: handle windowing and more interpolation methods - window = core.WindowSpec( + window = windows.unbound( grouping_keys=tuple(grouping_column_ids), ) quantile_cols = [] @@ -212,8 +210,8 @@ def _interpolate_column( if interpolate_method not in ["linear", "nearest", "ffill"]: raise ValueError("interpolate method not supported") window_ordering = (ordering.OrderingExpression(ex.free_var(x_values)),) - backwards_window = windows.WindowSpec(following=0, ordering=window_ordering) - forwards_window = windows.WindowSpec(preceding=0, ordering=window_ordering) + backwards_window = windows.rows(following=0, ordering=window_ordering) + forwards_window = windows.rows(preceding=0, ordering=window_ordering) # Note, this method may block, notnull = block.apply_unary_op(column, ops.notnull_op) @@ -364,7 +362,7 @@ def value_counts( ) count_id = agg_ids[0] if normalize: - unbound_window = windows.WindowSpec() + unbound_window = windows.unbound() block, total_count_id = block.apply_window_op( count_id, agg_ops.sum_op, unbound_window ) @@ -388,7 +386,7 @@ def value_counts( def pct_change(block: blocks.Block, periods: int = 1) -> blocks.Block: column_labels = block.column_labels - window_spec = windows.WindowSpec( + window_spec = windows.rows( preceding=periods if periods > 0 else None, following=-periods if periods < 0 else None, ) @@ -430,23 +428,22 @@ def rank( ops.isnull_op, ) nullity_col_ids.append(nullity_col_id) - window = windows.WindowSpec( - # BigQuery has syntax to reorder nulls with "NULLS FIRST/LAST", but that is unavailable through ibis presently, so must order on a separate nullity expression first. 
- ordering=( - ordering.OrderingExpression( - ex.free_var(col), - ordering.OrderingDirection.ASC - if ascending - else ordering.OrderingDirection.DESC, - na_last=(na_option in ["bottom", "keep"]), - ), + window_ordering = ( + ordering.OrderingExpression( + ex.free_var(col), + ordering.OrderingDirection.ASC + if ascending + else ordering.OrderingDirection.DESC, + na_last=(na_option in ["bottom", "keep"]), ), ) # Count_op ignores nulls, so if na_option is "top" or "bottom", we instead count the nullity columns, where nulls have been mapped to bools block, rownum_id = block.apply_window_op( col if na_option == "keep" else nullity_col_id, agg_ops.dense_rank_op if method == "dense" else agg_ops.count_op, - window_spec=window, + window_spec=windows.unbound(ordering=window_ordering) + if method == "dense" + else windows.rows(following=0, ordering=window_ordering), skip_reproject_unsafe=(col != columns[-1]), ) rownum_col_ids.append(rownum_id) @@ -464,7 +461,7 @@ def rank( block, result_id = block.apply_window_op( rownum_col_ids[i], agg_op, - window_spec=windows.WindowSpec(grouping_keys=(columns[i],)), + window_spec=windows.unbound(grouping_keys=(columns[i],)), skip_reproject_unsafe=(i < (len(columns) - 1)), ) post_agg_rownum_col_ids.append(result_id) @@ -528,7 +525,7 @@ def nsmallest( block, counter = block.apply_window_op( column_ids[0], agg_ops.rank_op, - window_spec=windows.WindowSpec(ordering=tuple(order_refs)), + window_spec=windows.unbound(ordering=tuple(order_refs)), ) block, condition = block.project_expr(ops.le_op.as_expr(counter, ex.const(n))) block = block.filter_by_id(condition) @@ -558,7 +555,7 @@ def nlargest( block, counter = block.apply_window_op( column_ids[0], agg_ops.rank_op, - window_spec=windows.WindowSpec(ordering=tuple(order_refs)), + window_spec=windows.unbound(ordering=tuple(order_refs)), ) block, condition = block.project_expr(ops.le_op.as_expr(counter, ex.const(n))) block = block.filter_by_id(condition) @@ -653,7 +650,7 @@ def _mean_delta_to_power( grouping_column_ids: typing.Sequence[str], ) -> typing.Tuple[blocks.Block, typing.Sequence[str]]: """Calculate (x-mean(x))^n. 
Useful for calculating moment statistics such as skew and kurtosis.""" - window = windows.WindowSpec(grouping_keys=tuple(grouping_column_ids)) + window = windows.unbound(grouping_keys=tuple(grouping_column_ids)) block, mean_ids = block.multi_apply_window_op(column_ids, agg_ops.mean_op, window) delta_ids = [] for val_id, mean_val_id in zip(column_ids, mean_ids): @@ -845,7 +842,7 @@ def _idx_extrema( for idx_col in original_block.index_columns ], ] - window_spec = windows.WindowSpec(ordering=tuple(order_refs)) + window_spec = windows.unbound(ordering=tuple(order_refs)) idx_col = original_block.index_columns[0] block, result_col = block.apply_window_op( idx_col, agg_ops.first_op, window_spec diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 277409f3a3..2b2803b649 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -47,6 +47,7 @@ import bigframes.core.tree_properties as tree_properties import bigframes.core.utils import bigframes.core.utils as utils +import bigframes.core.window_spec as window_specs import bigframes.dtypes import bigframes.features import bigframes.operations as ops @@ -816,7 +817,7 @@ def multi_apply_window_op( self, columns: typing.Sequence[str], op: agg_ops.WindowOp, - window_spec: core.WindowSpec, + window_spec: window_specs.WindowSpec, *, skip_null_groups: bool = False, never_skip_nulls: bool = False, @@ -875,7 +876,7 @@ def apply_window_op( self, column: str, op: agg_ops.WindowOp, - window_spec: core.WindowSpec, + window_spec: window_specs.WindowSpec, *, result_label: Label = None, skip_null_groups: bool = False, @@ -2029,7 +2030,7 @@ def _is_monotonic( return self._stats_cache[column_name][op_name] period = 1 - window = bigframes.core.WindowSpec( + window = window_specs.rows( preceding=period, following=None, ) diff --git a/bigframes/core/compile/compiled.py b/bigframes/core/compile/compiled.py index d14a5d3241..cc1d6baaa1 100644 --- a/bigframes/core/compile/compiled.py +++ b/bigframes/core/compile/compiled.py @@ -40,7 +40,7 @@ OrderingExpression, ) import bigframes.core.schema as schemata -from bigframes.core.window_spec import WindowSpec +from bigframes.core.window_spec import RangeWindowBounds, RowsWindowBounds, WindowSpec import bigframes.dtypes import bigframes.operations.aggregations as agg_ops @@ -735,7 +735,9 @@ def project_window_op( skip_reproject_unsafe: skips the reprojection step, can be used when performing many non-dependent window operations, user responsible for not nesting window expressions, or using outputs as join, filter or aggregation keys before a reprojection """ column = typing.cast(ibis_types.Column, self._get_ibis_column(column_name)) - window = self._ibis_window_from_spec(window_spec, allow_ties=op.handles_ties) + window = self._ibis_window_from_spec( + window_spec, require_total_order=op.uses_total_row_ordering + ) bindings = {col: self._get_ibis_column(col) for col in self.column_ids} window_op = agg_compiler.compile_analytic( @@ -1162,7 +1164,9 @@ def _create_string_ordering_column(self) -> ibis_types.StringColumn: def _compile_expression(self, expr: ex.Expression): return op_compiler.compile_expression(expr, self._ibis_bindings) - def _ibis_window_from_spec(self, window_spec: WindowSpec, allow_ties: bool = False): + def _ibis_window_from_spec( + self, window_spec: WindowSpec, require_total_order: bool + ): group_by: typing.List[ibis_types.Value] = ( [ typing.cast( @@ -1175,26 +1179,40 @@ def _ibis_window_from_spec(self, window_spec: WindowSpec, allow_ties: bool = Fal ) if self._reduced_predicate 
is not None: group_by.append(self._reduced_predicate) + + # Construct ordering. There are basically 3 main cases + # 1. Order-independent op (aggregation, cut, rank) with unbound window - no ordering clause needed + # 2. Order-independent op (aggregation, cut, rank) with range window - use ordering clause, ties allowed + # 3. Order-depedenpent op (navigation functions, array_agg) or rows bounds - use total row order to break ties. if window_spec.ordering: order_by = _convert_ordering_to_table_values( {**self._column_names, **self._hidden_ordering_column_names}, window_spec.ordering, ) - if not allow_ties: - # Most operator need an unambiguous ordering, so the table's total ordering is appended + if require_total_order or isinstance(window_spec.bounds, RowsWindowBounds): + # Some operators need an unambiguous ordering, so the table's total ordering is appended order_by = tuple([*order_by, *self._ibis_order]) - elif (window_spec.following is not None) or (window_spec.preceding is not None): + elif isinstance(window_spec.bounds, RowsWindowBounds): # If window spec has following or preceding bounds, we need to apply an unambiguous ordering. order_by = tuple(self._ibis_order) else: # Unbound grouping window. Suitable for aggregations but not for analytic function application. order_by = None - return ibis.window( - preceding=window_spec.preceding, - following=window_spec.following, - order_by=order_by, - group_by=group_by, - ) + + bounds = window_spec.bounds + window = ibis.window(order_by=order_by, group_by=group_by) + if bounds is not None: + if isinstance(bounds, RangeWindowBounds): + window = window.preceding_following( + bounds.preceding, bounds.following, how="range" + ) + if isinstance(bounds, RowsWindowBounds): + window = window.preceding_following( + bounds.preceding, bounds.following, how="rows" + ) + else: + raise ValueError(f"unrecognized window bounds {bounds}") + return window class Builder: def __init__( diff --git a/bigframes/core/groupby/__init__.py b/bigframes/core/groupby/__init__.py index 05b1cc7f41..41d0750030 100644 --- a/bigframes/core/groupby/__init__.py +++ b/bigframes/core/groupby/__init__.py @@ -28,6 +28,7 @@ import bigframes.core.ordering as order import bigframes.core.utils as utils import bigframes.core.window as windows +import bigframes.core.window_spec as window_specs import bigframes.dataframe as df import bigframes.dtypes as dtypes import bigframes.operations.aggregations as agg_ops @@ -217,7 +218,7 @@ def cumprod(self, *args, **kwargs) -> df.DataFrame: return self._apply_window_op(agg_ops.product_op, numeric_only=True) def shift(self, periods=1) -> series.Series: - window = core.WindowSpec( + window = window_specs.rows( grouping_keys=tuple(self._by_col_ids), preceding=periods if periods > 0 else None, following=-periods if periods < 0 else None, @@ -225,7 +226,7 @@ def shift(self, periods=1) -> series.Series: return self._apply_window_op(agg_ops.ShiftOp(periods), window=window) def diff(self, periods=1) -> series.Series: - window = core.WindowSpec( + window = window_specs.rows( grouping_keys=tuple(self._by_col_ids), preceding=periods if periods > 0 else None, following=-periods if periods < 0 else None, @@ -234,7 +235,7 @@ def diff(self, periods=1) -> series.Series: def rolling(self, window: int, min_periods=None) -> windows.Window: # To get n size window, need current row and n-1 preceding rows. 
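        # For orientation, the three WindowSpec factory shapes this file now
        # relies on (all defined in bigframes/core/window_spec.py in this
        # patch) are, roughly:
        #
        #     window_specs.rows(preceding=window - 1, following=0)   # fixed-size rolling frame
        #     window_specs.cumulative_rows(min_periods=min_periods)  # all preceding rows plus the current one
        #     window_specs.unbound()                                 # whole group, no frame bounds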
- window_spec = core.WindowSpec( + window_spec = window_specs.rows( grouping_keys=tuple(self._by_col_ids), preceding=window - 1, following=0, @@ -248,9 +249,8 @@ def rolling(self, window: int, min_periods=None) -> windows.Window: ) def expanding(self, min_periods: int = 1) -> windows.Window: - window_spec = core.WindowSpec( + window_spec = window_specs.cumulative_rows( grouping_keys=tuple(self._by_col_ids), - following=0, min_periods=min_periods, ) block = self._block.order_by( @@ -424,8 +424,8 @@ def _apply_window_op( numeric_only: bool = False, ): """Apply window op to groupby. Defaults to grouped cumulative window.""" - window_spec = window or core.WindowSpec( - grouping_keys=tuple(self._by_col_ids), following=0 + window_spec = window or window_specs.cumulative_rows( + grouping_keys=tuple(self._by_col_ids) ) columns = self._aggregated_columns(numeric_only=numeric_only) block, result_ids = self._block.multi_apply_window_op( @@ -594,7 +594,7 @@ def cumcount(self, *args, **kwargs) -> series.Series: def shift(self, periods=1) -> series.Series: """Shift index by desired number of periods.""" - window = core.WindowSpec( + window = window_specs.rows( grouping_keys=tuple(self._by_col_ids), preceding=periods if periods > 0 else None, following=-periods if periods < 0 else None, @@ -602,7 +602,7 @@ def shift(self, periods=1) -> series.Series: return self._apply_window_op(agg_ops.ShiftOp(periods), window=window) def diff(self, periods=1) -> series.Series: - window = core.WindowSpec( + window = window_specs.rows( grouping_keys=tuple(self._by_col_ids), preceding=periods if periods > 0 else None, following=-periods if periods < 0 else None, @@ -611,7 +611,7 @@ def diff(self, periods=1) -> series.Series: def rolling(self, window: int, min_periods=None) -> windows.Window: # To get n size window, need current row and n-1 preceding rows. - window_spec = core.WindowSpec( + window_spec = window_specs.rows( grouping_keys=tuple(self._by_col_ids), preceding=window - 1, following=0, @@ -629,9 +629,8 @@ def rolling(self, window: int, min_periods=None) -> windows.Window: ) def expanding(self, min_periods: int = 1) -> windows.Window: - window_spec = core.WindowSpec( + window_spec = window_specs.cumulative_rows( grouping_keys=tuple(self._by_col_ids), - following=0, min_periods=min_periods, ) block = self._block.order_by( @@ -661,8 +660,8 @@ def _apply_window_op( window: typing.Optional[core.WindowSpec] = None, ): """Apply window op to groupby. 
Defaults to grouped cumulative window.""" - window_spec = window or core.WindowSpec( - grouping_keys=tuple(self._by_col_ids), following=0 + window_spec = window or window_specs.cumulative_rows( + grouping_keys=tuple(self._by_col_ids) ) label = self._value_name if not discard_name else None diff --git a/bigframes/core/reshape/__init__.py b/bigframes/core/reshape/__init__.py index 6bcc25319b..05cb5c7e94 100644 --- a/bigframes/core/reshape/__init__.py +++ b/bigframes/core/reshape/__init__.py @@ -19,10 +19,10 @@ import pandas as pd import bigframes.constants as constants -import bigframes.core as core import bigframes.core.expression as ex import bigframes.core.ordering as order import bigframes.core.utils as utils +import bigframes.core.window_spec as window_specs import bigframes.dataframe import bigframes.operations as ops import bigframes.operations.aggregations as agg_ops @@ -159,7 +159,7 @@ def cut( ) return x._apply_window_op( - agg_ops.CutOp(bins, labels=labels), window_spec=core.WindowSpec() + agg_ops.CutOp(bins, labels=labels), window_spec=window_specs.unbound() ) @@ -189,7 +189,7 @@ def qcut( block, result = block.apply_window_op( x._value_column, agg_ops.QcutOp(q), # type: ignore - window_spec=core.WindowSpec( + window_spec=window_specs.unbound( grouping_keys=(nullity_id,), ordering=(order.ascending_over(x._value_column),), ), diff --git a/bigframes/core/window_spec.py b/bigframes/core/window_spec.py index b02f13d333..71e88a4c3d 100644 --- a/bigframes/core/window_spec.py +++ b/bigframes/core/window_spec.py @@ -11,13 +11,133 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import annotations from dataclasses import dataclass -import typing +from typing import Optional, Tuple, Union import bigframes.core.ordering as orderings +# Unbound Windows +def unbound( + grouping_keys: Tuple[str, ...] = (), + min_periods: int = 0, + ordering: Tuple[orderings.OrderingExpression, ...] = (), +) -> WindowSpec: + """ + Create an unbound window. + + Args: + grouping_keys: + Columns ids of grouping keys + min_periods (int, default 0): + Minimum number of input rows to generate output. + ordering: + Orders the rows within the window. + + Returns: + WindowSpec + """ + return WindowSpec( + grouping_keys=grouping_keys, min_periods=min_periods, ordering=ordering + ) + + +### Rows-based Windows +def rows( + grouping_keys: Tuple[str, ...] = (), + preceding: Optional[int] = None, + following: Optional[int] = None, + min_periods: int = 0, + ordering: Tuple[orderings.OrderingExpression, ...] = (), +) -> WindowSpec: + """ + Create a row-bounded window. + + Args: + grouping_keys: + Columns ids of grouping keys + preceding: + number of preceding rows to include. If None, include all preceding rows + following: + number of following rows to include. If None, include all following rows + min_periods (int, default 0): + Minimum number of input rows to generate output. + ordering: + Ordering to apply on top of based dataframe ordering + Returns: + WindowSpec + """ + assert (preceding is not None) or (following is not None) + bounds = RowsWindowBounds(preceding=preceding, following=following) + return WindowSpec( + grouping_keys=grouping_keys, + bounds=bounds, + min_periods=min_periods, + ordering=ordering, + ) + + +def cumulative_rows( + grouping_keys: Tuple[str, ...] 
= (), min_periods: int = 0 +) -> WindowSpec: + """ + Create a expanding window that includes all preceding rows + + Args: + grouping_keys: + Columns ids of grouping keys + min_periods (int, default 0): + Minimum number of input rows to generate output. + Returns: + WindowSpec + """ + bounds = RowsWindowBounds(following=0) + return WindowSpec( + grouping_keys=grouping_keys, bounds=bounds, min_periods=min_periods + ) + + +def inverse_cumulative_rows( + grouping_keys: Tuple[str, ...] = (), min_periods: int = 0 +) -> WindowSpec: + """ + Create a shrinking window that includes all following rows + + Args: + grouping_keys: + Columns ids of grouping keys + min_periods (int, default 0): + Minimum number of input rows to generate output. + Returns: + WindowSpec + """ + bounds = RowsWindowBounds(preceding=0) + return WindowSpec( + grouping_keys=grouping_keys, bounds=bounds, min_periods=min_periods + ) + + +### Struct Classes + + +@dataclass(frozen=True) +class RowsWindowBounds: + preceding: Optional[int] = None + following: Optional[int] = None + + +# TODO: Expand to datetime offsets +OffsetType = Union[float, int] + + +@dataclass(frozen=True) +class RangeWindowBounds: + preceding: Optional[OffsetType] = None + following: Optional[OffsetType] = None + + @dataclass(frozen=True) class WindowSpec: """ @@ -28,8 +148,7 @@ class WindowSpec: ordering: List of columns ids and ordering direction to override base ordering """ - grouping_keys: typing.Tuple[str, ...] = tuple() - ordering: typing.Tuple[orderings.OrderingExpression, ...] = tuple() - preceding: typing.Optional[int] = None - following: typing.Optional[int] = None + grouping_keys: Tuple[str, ...] = tuple() + ordering: Tuple[orderings.OrderingExpression, ...] = tuple() + bounds: Union[RowsWindowBounds, RangeWindowBounds, None] = None min_periods: int = 0 diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 47730630e3..5be28acf53 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -59,6 +59,7 @@ import bigframes.core.ordering as order import bigframes.core.utils as utils import bigframes.core.window +import bigframes.core.window_spec as window_spec import bigframes.dtypes import bigframes.formatting_helpers as formatter import bigframes.operations as ops @@ -1874,11 +1875,11 @@ def replace( ) def ffill(self, *, limit: typing.Optional[int] = None) -> DataFrame: - window = bigframes.core.WindowSpec(preceding=limit, following=0) + window = window_spec.rows(preceding=limit, following=0) return self._apply_window_op(agg_ops.LastNonNullOp(), window) def bfill(self, *, limit: typing.Optional[int] = None) -> DataFrame: - window = bigframes.core.WindowSpec(preceding=0, following=limit) + window = window_spec.rows(preceding=0, following=limit) return self._apply_window_op(agg_ops.FirstNonNullOp(), window) def isin(self, values) -> DataFrame: @@ -2574,17 +2575,17 @@ def _perform_join_by_index( def rolling(self, window: int, min_periods=None) -> bigframes.core.window.Window: # To get n size window, need current row and n-1 preceding rows. 
- window_spec = bigframes.core.WindowSpec( + window_def = window_spec.rows( preceding=window - 1, following=0, min_periods=min_periods or window ) return bigframes.core.window.Window( - self._block, window_spec, self._block.value_columns + self._block, window_def, self._block.value_columns ) def expanding(self, min_periods: int = 1) -> bigframes.core.window.Window: - window_spec = bigframes.core.WindowSpec(following=0, min_periods=min_periods) + window = window_spec.cumulative_rows(min_periods=min_periods) return bigframes.core.window.Window( - self._block, window_spec, self._block.value_columns + self._block, window, self._block.value_columns ) def groupby( @@ -2691,7 +2692,7 @@ def cumsum(self): raise ValueError("All values must be numeric to apply cumsum.") return self._apply_window_op( agg_ops.sum_op, - bigframes.core.WindowSpec(following=0), + window_spec.cumulative_rows(), ) def cumprod(self) -> DataFrame: @@ -2703,30 +2704,30 @@ def cumprod(self) -> DataFrame: raise ValueError("All values must be numeric to apply cumsum.") return self._apply_window_op( agg_ops.product_op, - bigframes.core.WindowSpec(following=0), + window_spec.cumulative_rows(), ) def cummin(self) -> DataFrame: return self._apply_window_op( agg_ops.min_op, - bigframes.core.WindowSpec(following=0), + window_spec.cumulative_rows(), ) def cummax(self) -> DataFrame: return self._apply_window_op( agg_ops.max_op, - bigframes.core.WindowSpec(following=0), + window_spec.cumulative_rows(), ) def shift(self, periods: int = 1) -> DataFrame: - window = bigframes.core.WindowSpec( + window = window_spec.rows( preceding=periods if periods > 0 else None, following=-periods if periods < 0 else None, ) return self._apply_window_op(agg_ops.ShiftOp(periods), window) def diff(self, periods: int = 1) -> DataFrame: - window = bigframes.core.WindowSpec( + window = window_spec.rows( preceding=periods if periods > 0 else None, following=-periods if periods < 0 else None, ) @@ -2740,7 +2741,7 @@ def pct_change(self, periods: int = 1) -> DataFrame: def _apply_window_op( self, op: agg_ops.WindowOp, - window_spec: bigframes.core.WindowSpec, + window_spec: window_spec.WindowSpec, ): block, result_ids = self._block.multi_apply_window_op( self._block.value_columns, diff --git a/bigframes/operations/aggregations.py b/bigframes/operations/aggregations.py index 0d27d1d75d..c57fac4112 100644 --- a/bigframes/operations/aggregations.py +++ b/bigframes/operations/aggregations.py @@ -34,8 +34,8 @@ def skips_nulls(self): return True @property - def handles_ties(self): - """Whether the operator can handle ties without nondeterministic output. (eg. rank operator can handle ties but not the count operator)""" + def uses_total_row_ordering(self): + """Whether the operator needs total row ordering. (eg. 
lead, lag, array_agg)""" return False @abc.abstractmethod @@ -232,10 +232,6 @@ class CutOp(UnaryWindowOp): def skips_nulls(self): return False - @property - def handles_ties(self): - return True - def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: if isinstance(self.bins, int) and (self.labels is False): return dtypes.INT_DTYPE @@ -267,10 +263,6 @@ def name(self): def skips_nulls(self): return False - @property - def handles_ties(self): - return True - def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: return signatures.FixedOutputType( dtypes.is_orderable, dtypes.INT_DTYPE, "orderable" @@ -308,10 +300,6 @@ class RankOp(UnaryWindowOp): def skips_nulls(self): return False - @property - def handles_ties(self): - return True - def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: return signatures.FixedOutputType( dtypes.is_orderable, dtypes.INT_DTYPE, "orderable" @@ -324,10 +312,6 @@ class DenseRankOp(UnaryWindowOp): def skips_nulls(self): return False - @property - def handles_ties(self): - return True - def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: return signatures.FixedOutputType( dtypes.is_orderable, dtypes.INT_DTYPE, "orderable" @@ -338,9 +322,17 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT class FirstOp(UnaryWindowOp): name: ClassVar[str] = "first" + @property + def uses_total_row_ordering(self): + return True + @dataclasses.dataclass(frozen=True) class FirstNonNullOp(UnaryWindowOp): + @property + def uses_total_row_ordering(self): + return True + @property def skips_nulls(self): return False @@ -350,9 +342,17 @@ def skips_nulls(self): class LastOp(UnaryWindowOp): name: ClassVar[str] = "last" + @property + def uses_total_row_ordering(self): + return True + @dataclasses.dataclass(frozen=True) class LastNonNullOp(UnaryWindowOp): + @property + def uses_total_row_ordering(self): + return True + @property def skips_nulls(self): return False @@ -362,6 +362,10 @@ def skips_nulls(self): class ShiftOp(UnaryWindowOp): periods: int + @property + def uses_total_row_ordering(self): + return True + @property def skips_nulls(self): return False @@ -371,6 +375,10 @@ def skips_nulls(self): class DiffOp(UnaryWindowOp): periods: int + @property + def uses_total_row_ordering(self): + return True + @property def skips_nulls(self): return False diff --git a/bigframes/series.py b/bigframes/series.py index ce13d205bd..313380e4a4 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -431,44 +431,44 @@ def case_when(self, caselist) -> Series: def cumsum(self) -> Series: return self._apply_window_op( - agg_ops.sum_op, bigframes.core.window_spec.WindowSpec(following=0) + agg_ops.sum_op, bigframes.core.window_spec.cumulative_rows() ) def ffill(self, *, limit: typing.Optional[int] = None) -> Series: - window = bigframes.core.window_spec.WindowSpec(preceding=limit, following=0) + window = bigframes.core.window_spec.rows(preceding=limit, following=0) return self._apply_window_op(agg_ops.LastNonNullOp(), window) pad = ffill pad.__doc__ = inspect.getdoc(vendored_pandas_series.Series.ffill) def bfill(self, *, limit: typing.Optional[int] = None) -> Series: - window = bigframes.core.window_spec.WindowSpec(preceding=0, following=limit) + window = bigframes.core.window_spec.rows(preceding=0, following=limit) return self._apply_window_op(agg_ops.FirstNonNullOp(), window) def cummax(self) -> Series: return self._apply_window_op( - 
agg_ops.max_op, bigframes.core.window_spec.WindowSpec(following=0) + agg_ops.max_op, bigframes.core.window_spec.cumulative_rows() ) def cummin(self) -> Series: return self._apply_window_op( - agg_ops.min_op, bigframes.core.window_spec.WindowSpec(following=0) + agg_ops.min_op, bigframes.core.window_spec.cumulative_rows() ) def cumprod(self) -> Series: return self._apply_window_op( - agg_ops.product_op, bigframes.core.window_spec.WindowSpec(following=0) + agg_ops.product_op, bigframes.core.window_spec.cumulative_rows() ) def shift(self, periods: int = 1) -> Series: - window = bigframes.core.window_spec.WindowSpec( + window = bigframes.core.window_spec.rows( preceding=periods if periods > 0 else None, following=-periods if periods < 0 else None, ) return self._apply_window_op(agg_ops.ShiftOp(periods), window) def diff(self, periods: int = 1) -> Series: - window = bigframes.core.window_spec.WindowSpec( + window = bigframes.core.window_spec.rows( preceding=periods if periods > 0 else None, following=-periods if periods < 0 else None, ) @@ -955,7 +955,7 @@ def mode(self) -> Series: block, max_value_count_col_id = block.apply_window_op( value_count_col_id, agg_ops.max_op, - window_spec=bigframes.core.window_spec.WindowSpec(), + window_spec=bigframes.core.window_spec.unbound(), ) block, is_mode_col_id = block.apply_binary_op( value_count_col_id, @@ -1226,7 +1226,7 @@ def sort_index(self, *, axis=0, ascending=True, na_position="last") -> Series: def rolling(self, window: int, min_periods=None) -> bigframes.core.window.Window: # To get n size window, need current row and n-1 preceding rows. - window_spec = bigframes.core.window_spec.WindowSpec( + window_spec = bigframes.core.window_spec.rows( preceding=window - 1, following=0, min_periods=min_periods or window ) return bigframes.core.window.Window( @@ -1234,8 +1234,8 @@ def rolling(self, window: int, min_periods=None) -> bigframes.core.window.Window ) def expanding(self, min_periods: int = 1) -> bigframes.core.window.Window: - window_spec = bigframes.core.window_spec.WindowSpec( - following=0, min_periods=min_periods + window_spec = bigframes.core.window_spec.cumulative_rows( + min_periods=min_periods ) return bigframes.core.window.Window( self._block, window_spec, self._block.value_columns, is_series=True From f6bdc4aeb3f81a1e0b955521c04ac0dd22981c76 Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Fri, 10 May 2024 19:50:35 +0000 Subject: [PATCH 11/17] feat: Support `axis=1` in `df.apply` for scalar outputs (#629) * feat: Support `axis=1` in `df.apply` for scalar outputs * avoid mixing other changes in the input_types param * use guid instead of hard coded column name * check_exact=False to avoid failing system_prerelease * handle index in remote function, add large system tests * make the test case more robust * handle non-string column names, add unsupported dtype tests * fix import * use `_cached` in df.apply to catch any rf execution errors early * add test for row aggregates * add row dtype information, also test * preserve the order of input in the output * absorb to_numpy() disparity in prerelease tests * add tests for column multiindex and non remote function * add preview note for row processing * add warning for input_types="row" and axis=1 * introduce early check on the supported dtypes * asjust test after early dtype handling * address review comments * user NameError for column name parsing issue, address test coverage failure * address nan return handling in the gcf code * handle (nan, inf, -inf) * replace "row" by bpd.Series for 
input types * make the bq parity assert more readable * fix the series name before assert * fix docstring for args * move more low level string logic in sql module * raise explicit error when a column name cannot be supported * keep literal_eval check on the serialization side to match deserialization --- bigframes/core/blocks.py | 101 ++++++- bigframes/core/sql.py | 59 +++++ bigframes/dataframe.py | 56 +++- bigframes/exceptions.py | 4 + bigframes/functions/remote_function.py | 191 ++++++++++--- bigframes/session/__init__.py | 8 +- tests/system/large/test_remote_function.py | 250 ++++++++++++++++++ tests/system/small/test_remote_function.py | 146 ++++++++++ .../bigframes_vendored/pandas/core/frame.py | 29 +- 9 files changed, 792 insertions(+), 52 deletions(-) create mode 100644 bigframes/core/sql.py diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 2b2803b649..58b8515418 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -21,11 +21,13 @@ from __future__ import annotations +import ast import dataclasses import functools import itertools import os import random +import textwrap import typing from typing import Iterable, List, Literal, Mapping, Optional, Sequence, Tuple, Union import warnings @@ -44,8 +46,8 @@ import bigframes.core.join_def as join_defs import bigframes.core.ordering as ordering import bigframes.core.schema as bf_schema +import bigframes.core.sql as sql import bigframes.core.tree_properties as tree_properties -import bigframes.core.utils import bigframes.core.utils as utils import bigframes.core.window_spec as window_specs import bigframes.dtypes @@ -1437,9 +1439,7 @@ def promote_offsets(self, label: Label = None) -> typing.Tuple[Block, str]: ) def add_prefix(self, prefix: str, axis: str | int | None = None) -> Block: - axis_number = bigframes.core.utils.get_axis_number( - "rows" if (axis is None) else axis - ) + axis_number = utils.get_axis_number("rows" if (axis is None) else axis) if axis_number == 0: expr = self._expr for index_col in self._index_columns: @@ -1460,9 +1460,7 @@ def add_prefix(self, prefix: str, axis: str | int | None = None) -> Block: return self.rename(columns=lambda label: f"{prefix}{label}") def add_suffix(self, suffix: str, axis: str | int | None = None) -> Block: - axis_number = bigframes.core.utils.get_axis_number( - "rows" if (axis is None) else axis - ) + axis_number = utils.get_axis_number("rows" if (axis is None) else axis) if axis_number == 0: expr = self._expr for index_col in self._index_columns: @@ -2072,6 +2070,95 @@ def _is_monotonic( self._stats_cache[column_name].update({op_name: result}) return result + def _get_rows_as_json_values(self) -> Block: + # We want to preserve any ordering currently present before turning to + # direct SQL manipulation. We will restore the ordering when we rebuild + # expression. + # TODO(shobs): Replace direct SQL manipulation by structured expression + # manipulation + ordering_column_name = guid.generate_guid() + expr = self.session._cache_with_offsets(self.expr) + expr = expr.promote_offsets(ordering_column_name) + expr_sql = self.session._to_sql(expr) + + # Names of the columns to serialize for the row. + # We will use the repr-eval pattern to serialize a value here and + # deserialize in the cloud function. Let's make sure that would work. 
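        # A minimal sketch of the round trip assumed here: repr() on this side
        # and ast.literal_eval() in the cloud function reconstruct literal-style
        # column names exactly, e.g.
        #
        #     import ast
        #     for name in ["2", 2, (3, 4), (5.0, "six", 7)]:
        #         assert ast.literal_eval(repr(name)) == name
        #
        # A name whose repr() is not a Python literal (for example a
        # datetime.datetime column label) fails ast.literal_eval and is
        # rejected with the NameError raised below.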
+ column_names = [] + for col in list(self.index_columns) + [col for col in self.column_labels]: + serialized_column_name = repr(col) + try: + ast.literal_eval(serialized_column_name) + except Exception: + raise NameError( + f"Column name type '{type(col).__name__}' is not supported for row serialization." + " Please consider using a name for which literal_eval(repr(name)) works." + ) + + column_names.append(serialized_column_name) + column_names_csv = sql.csv(column_names, quoted=True) + + # index columns count + index_columns_count = len(self.index_columns) + + # column references to form the array of values for the row + column_references_csv = sql.csv( + [sql.cast_as_string(col) for col in self.expr.column_ids] + ) + + # types of the columns to serialize for the row + column_types = list(self.index.dtypes) + list(self.dtypes) + column_types_csv = sql.csv([str(typ) for typ in column_types], quoted=True) + + # row dtype to use for deserializing the row as pandas series + pandas_row_dtype = bigframes.dtypes.lcd_type(*column_types) + if pandas_row_dtype is None: + pandas_row_dtype = "object" + pandas_row_dtype = sql.quote(str(pandas_row_dtype)) + + # create a json column representing row through SQL manipulation + row_json_column_name = guid.generate_guid() + select_columns = ( + [ordering_column_name] + list(self.index_columns) + [row_json_column_name] + ) + select_columns_csv = sql.csv( + [sql.column_reference(col) for col in select_columns] + ) + json_sql = f"""\ +With T0 AS ( +{textwrap.indent(expr_sql, " ")} +), +T1 AS ( + SELECT *, + JSON_OBJECT( + "names", [{column_names_csv}], + "types", [{column_types_csv}], + "values", [{column_references_csv}], + "indexlength", {index_columns_count}, + "dtype", {pandas_row_dtype} + ) AS {row_json_column_name} FROM T0 +) +SELECT {select_columns_csv} FROM T1 +""" + ibis_table = self.session.ibis_client.sql(json_sql) + order_for_ibis_table = ordering.ExpressionOrdering.from_offset_col( + ordering_column_name + ) + expr = core.ArrayValue.from_ibis( + self.session, + ibis_table, + [ibis_table[col] for col in select_columns if col != ordering_column_name], + hidden_ordering_columns=[ibis_table[ordering_column_name]], + ordering=order_for_ibis_table, + ) + block = Block( + expr, + index_columns=self.index_columns, + column_labels=[row_json_column_name], + index_labels=self._index_labels, + ) + return block + class BlockIndexProperties: """Accessor for the index-related block properties.""" diff --git a/bigframes/core/sql.py b/bigframes/core/sql.py new file mode 100644 index 0000000000..31ee5f9064 --- /dev/null +++ b/bigframes/core/sql.py @@ -0,0 +1,59 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Utility functions for SQL construction. +""" + +from typing import Iterable + + +def quote(value: str): + """Return quoted input string.""" + + # Let's use repr which also escapes any special characters + # + # >>> for val in [ + # ... "123", + # ... "str with no special chars", + # ... 
"str with special chars.,'\"/\\" + # ... ]: + # ... print(f"{val} -> {repr(val)}") + # ... + # 123 -> '123' + # str with no special chars -> 'str with no special chars' + # str with special chars.,'"/\ -> 'str with special chars.,\'"/\\' + + return repr(value) + + +def column_reference(column_name: str): + """Return a string representing column reference in a SQL.""" + + return f"`{column_name}`" + + +def cast_as_string(column_name: str): + """Return a string representing string casting of a column.""" + + return f"CAST({column_reference(column_name)} AS STRING)" + + +def csv(values: Iterable[str], quoted=False): + """Return a string of comma separated values.""" + + if quoted: + values = [quote(val) for val in values] + + return ", ".join(values) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 5be28acf53..d3fd39afa7 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -34,6 +34,7 @@ Tuple, Union, ) +import warnings import bigframes_vendored.pandas.core.frame as vendored_pandas_frame import bigframes_vendored.pandas.pandas._typing as vendored_pandas_typing @@ -61,6 +62,7 @@ import bigframes.core.window import bigframes.core.window_spec as window_spec import bigframes.dtypes +import bigframes.exceptions import bigframes.formatting_helpers as formatter import bigframes.operations as ops import bigframes.operations.aggregations as agg_ops @@ -3308,7 +3310,59 @@ def map(self, func, na_action: Optional[str] = None) -> DataFrame: ops.RemoteFunctionOp(func=func, apply_on_null=(na_action is None)) ) - def apply(self, func, *, args: typing.Tuple = (), **kwargs): + def apply(self, func, *, axis=0, args: typing.Tuple = (), **kwargs): + if utils.get_axis_number(axis) == 1: + warnings.warn( + "axis=1 scenario is in preview.", + category=bigframes.exceptions.PreviewWarning, + ) + + # Early check whether the dataframe dtypes are currently supported + # in the remote function + # NOTE: Keep in sync with the value converters used in the gcf code + # generated in generate_cloud_function_main_code in remote_function.py + remote_function_supported_dtypes = ( + bigframes.dtypes.INT_DTYPE, + bigframes.dtypes.FLOAT_DTYPE, + bigframes.dtypes.BOOL_DTYPE, + bigframes.dtypes.STRING_DTYPE, + ) + supported_dtypes_types = tuple( + type(dtype) for dtype in remote_function_supported_dtypes + ) + supported_dtypes_hints = tuple( + str(dtype) for dtype in remote_function_supported_dtypes + ) + + for dtype in self.dtypes: + if not isinstance(dtype, supported_dtypes_types): + raise NotImplementedError( + f"DataFrame has a column of dtype '{dtype}' which is not supported with axis=1." + f" Supported dtypes are {supported_dtypes_hints}." 
+ ) + + # Check if the function is a remote function + if not hasattr(func, "bigframes_remote_function"): + raise ValueError("For axis=1 a remote function must be used.") + + # Serialize the rows as json values + block = self._get_block() + rows_as_json_series = bigframes.series.Series( + block._get_rows_as_json_values() + ) + + # Apply the function + result_series = rows_as_json_series._apply_unary_op( + ops.RemoteFunctionOp(func=func, apply_on_null=True) + ) + result_series.name = None + + # Return Series with materialized result so that any error in the remote + # function is caught early + materialized_series = result_series.cache() + return materialized_series + + # Per-column apply results = {name: func(col, *args, **kwargs) for name, col in self.items()} if all( [ diff --git a/bigframes/exceptions.py b/bigframes/exceptions.py index 3ca6d8e1af..eae021b4cd 100644 --- a/bigframes/exceptions.py +++ b/bigframes/exceptions.py @@ -33,3 +33,7 @@ class CleanupFailedWarning(Warning): class DefaultIndexWarning(Warning): """Default index may cause unexpected costs.""" + + +class PreviewWarning(Warning): + """The feature is in preview.""" diff --git a/bigframes/functions/remote_function.py b/bigframes/functions/remote_function.py index 9d826d0fa1..6e42ca9f48 100644 --- a/bigframes/functions/remote_function.py +++ b/bigframes/functions/remote_function.py @@ -25,8 +25,10 @@ import tempfile import textwrap from typing import List, NamedTuple, Optional, Sequence, TYPE_CHECKING, Union +import warnings import ibis +import pandas import requests if TYPE_CHECKING: @@ -262,7 +264,7 @@ def generate_udf_code(self, def_, dir): return udf_code_file_name, udf_bytecode_file_name - def generate_cloud_function_main_code(self, def_, dir): + def generate_cloud_function_main_code(self, def_, dir, is_row_processor=False): """Get main.py code for the cloud function for the given user defined function.""" # Pickle the udf with all its dependencies @@ -285,38 +287,120 @@ def generate_cloud_function_main_code(self, def_, dir): # ... 
# } # https://cloud.google.com/bigquery/docs/reference/standard-sql/remote-functions#input_format - code_template = textwrap.dedent( - """\ - import cloudpickle - import functions_framework - from flask import jsonify - import json - - # original udf code is in {udf_code_file} - # serialized udf code is in {udf_bytecode_file} - with open("{udf_bytecode_file}", "rb") as f: - udf = cloudpickle.load(f) - - def {handler_func_name}(request): - try: - request_json = request.get_json(silent=True) - calls = request_json["calls"] - replies = [] - for call in calls: - reply = udf(*call) - replies.append(reply) - return_json = json.dumps({{"replies" : replies}}) - return return_json - except Exception as e: - return jsonify( {{ "errorMessage": str(e) }} ), 400 - """ - ) - - code = code_template.format( - udf_code_file=udf_code_file, - udf_bytecode_file=udf_bytecode_file, - handler_func_name=handler_func_name, - ) + code = """\ +import cloudpickle +import functions_framework +from flask import jsonify +import json +""" + if is_row_processor: + code += """\ +import ast +import math +import pandas as pd + +def get_pd_series(row): + row_json = json.loads(row) + col_names = row_json["names"] + col_types = row_json["types"] + col_values = row_json["values"] + index_length = row_json["indexlength"] + dtype = row_json["dtype"] + + # At this point we are assuming that col_names, col_types and col_values are + # arrays of the same length, representing column names, types and values for + # one row of data + + # column names are not necessarily strings + # they are serialized as repr(name) at source + evaluated_col_names = [] + for col_name in col_names: + try: + col_name = ast.literal_eval(col_name) + except Exception as ex: + raise NameError(f"Failed to evaluate column name from '{col_name}': {ex}") + evaluated_col_names.append(col_name) + col_names = evaluated_col_names + + # Supported converters for pandas to python types + value_converters = { + "boolean": lambda val: val == "true", + "Int64": int, + "Float64": float, + "string": str, + } + + def convert_value(value, value_type): + value_converter = value_converters.get(value_type) + if value_converter is None: + raise ValueError(f"Don't know how to handle type '{value_type}'") + if value is None: + return None + return value_converter(value) + + index_values = [ + pd.Series([convert_value(col_values[i], col_types[i])], dtype=col_types[i])[0] + for i in range(index_length) + ] + + data_col_names = col_names[index_length:] + data_col_types = col_types[index_length:] + data_col_values = col_values[index_length:] + data_col_values = [ + pd.Series([convert_value(a, data_col_types[i])], dtype=data_col_types[i])[0] + for i, a in enumerate(data_col_values) + ] + + row_index = index_values[0] if len(index_values) == 1 else tuple(index_values) + row_series = pd.Series(data_col_values, index=data_col_names, name=row_index, dtype=dtype) + return row_series +""" + code += f"""\ + +# original udf code is in {udf_code_file} +# serialized udf code is in {udf_bytecode_file} +with open("{udf_bytecode_file}", "rb") as f: + udf = cloudpickle.load(f) + +def {handler_func_name}(request): + try: + request_json = request.get_json(silent=True) + calls = request_json["calls"] + replies = [] + for call in calls: +""" + + if is_row_processor: + code += """\ + reply = udf(get_pd_series(call[0])) + if isinstance(reply, float) and (math.isnan(reply) or math.isinf(reply)): + # json serialization of the special float values (nan, inf, -inf) + # is not in 
strict compliance of the JSON specification + # https://docs.python.org/3/library/json.html#basic-usage. + # Let's convert them to a quoted string representation ("NaN", + # "Infinity", "-Infinity" respectively) which is handled by + # BigQuery + reply = json.dumps(reply) + elif pd.isna(reply): + # Pandas N/A values are not json serializable, so use a python + # equivalent instead + reply = None + elif hasattr(reply, "item"): + # Numpy types are not json serializable, so use its Python + # value instead + reply = reply.item() +""" + else: + code += """\ + reply = udf(*call) +""" + code += """\ + replies.append(reply) + return_json = json.dumps({"replies" : replies}) + return return_json + except Exception as e: + return jsonify( { "errorMessage": str(e) } ), 400 +""" main_py = os.path.join(dir, "main.py") with open(main_py, "w") as f: @@ -325,11 +409,17 @@ def {handler_func_name}(request): return handler_func_name - def generate_cloud_function_code(self, def_, dir, package_requirements=None): + def generate_cloud_function_code( + self, def_, dir, package_requirements=None, is_row_processor=False + ): """Generate the cloud function code for a given user defined function.""" # requirements.txt requirements = ["cloudpickle >= 2.1.0"] + if is_row_processor: + # bigframes remote function will send an entire row of data as json, + # which would be converted to a pandas series and processed + requirements.append(f"pandas=={pandas.__version__}") if package_requirements: requirements.extend(package_requirements) requirements = sorted(requirements) @@ -338,7 +428,9 @@ def generate_cloud_function_code(self, def_, dir, package_requirements=None): f.write("\n".join(requirements)) # main.py - entry_point = self.generate_cloud_function_main_code(def_, dir) + entry_point = self.generate_cloud_function_main_code( + def_, dir, is_row_processor + ) return entry_point def create_cloud_function( @@ -348,13 +440,14 @@ def create_cloud_function( package_requirements=None, timeout_seconds=600, max_instance_count=None, + is_row_processor=False, ): """Create a cloud function from the given user defined function.""" # Build and deploy folder structure containing cloud function with tempfile.TemporaryDirectory() as dir: entry_point = self.generate_cloud_function_code( - def_, dir, package_requirements + def_, dir, package_requirements, is_row_processor ) archive_path = shutil.make_archive(dir, "zip", dir) @@ -474,6 +567,7 @@ def provision_bq_remote_function( max_batching_rows, cloud_function_timeout, cloud_function_max_instance_count, + is_row_processor, ): """Provision a BigQuery remote function.""" # If reuse of any existing function with the same name (indicated by the @@ -500,6 +594,7 @@ def provision_bq_remote_function( package_requirements, cloud_function_timeout, cloud_function_max_instance_count, + is_row_processor, ) else: logger.info(f"Cloud function {cloud_function_name} already exists.") @@ -700,8 +795,9 @@ def remote_function( Args: input_types (type or sequence(type)): - Input data type, or sequence of input data types in the user - defined function. + For scalar user defined function it should be the input type or + sequence of input types. For row processing user defined function, + type `Series` should be specified. output_type (type): Data type of the output in the user defined function. session (bigframes.Session, Optional): @@ -800,9 +896,25 @@ def remote_function( function's default setting applies. 
For more details see https://cloud.google.com/functions/docs/configuring/max-instances """ - if isinstance(input_types, type): + is_row_processor = False + + import bigframes.series + + if input_types == bigframes.series.Series: + warnings.warn( + "input_types=Series scenario is in preview.", + stacklevel=1, + category=bigframes.exceptions.PreviewWarning, + ) + + # we will model the row as a json serialized string containing the data + # and the metadata representing the row + input_types = [str] + is_row_processor = True + elif isinstance(input_types, type): input_types = [input_types] + # Some defaults may be used from the session if not provided otherwise import bigframes.pandas as bpd session = session or bpd.get_global_session() @@ -928,6 +1040,7 @@ def wrapper(f): max_batching_rows, cloud_function_timeout, cloud_function_max_instances, + is_row_processor, ) # TODO: Move ibis logic to compiler step diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 5f70fd77f9..473fc4f098 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -1416,6 +1416,9 @@ def remote_function( """Decorator to turn a user defined function into a BigQuery remote function. Check out the code samples at: https://cloud.google.com/bigquery/docs/remote-functions#bigquery-dataframes. + .. note:: + ``input_types=Series`` scenario is in preview. + .. note:: Please make sure following is setup before using this API: @@ -1455,8 +1458,9 @@ def remote_function( Args: input_types (type or sequence(type)): - Input data type, or sequence of input data types in the user - defined function. + For scalar user defined function it should be the input type or + sequence of input types. For row processing user defined function, + type `Series` should be specified. output_type (type): Data type of the output in the user defined function. dataset (str, Optional): diff --git a/tests/system/large/test_remote_function.py b/tests/system/large/test_remote_function.py index eb2a0884fe..e086903d03 100644 --- a/tests/system/large/test_remote_function.py +++ b/tests/system/large/test_remote_function.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from datetime import datetime import importlib.util import inspect import math # must keep this at top level to test udf referring global import @@ -28,6 +29,7 @@ import bigframes from bigframes.functions.remote_function import get_cloud_function_name +import bigframes.series from tests.system.utils import ( assert_pandas_df_equal, delete_cloud_function, @@ -1454,3 +1456,251 @@ def square(x): cleanup_remote_function_assets( session.bqclient, session.cloudfunctionsclient, square_remote ) + + +@pytest.mark.flaky(retries=2, delay=120) +def test_df_apply_axis_1(session, scalars_dfs): + columns = ["bool_col", "int64_col", "int64_too", "float64_col", "string_col"] + scalars_df, scalars_pandas_df = scalars_dfs + try: + + def serialize_row(row): + custom = { + "name": row.name, + "index": [idx for idx in row.index], + "values": [ + val.item() if hasattr(val, "item") else val for val in row.values + ], + } + + return str( + { + "default": row.to_json(), + "split": row.to_json(orient="split"), + "records": row.to_json(orient="records"), + "index": row.to_json(orient="index"), + "table": row.to_json(orient="table"), + "custom": custom, + } + ) + + serialize_row_remote = session.remote_function( + bigframes.series.Series, str, reuse=False + )(serialize_row) + + bf_result = scalars_df[columns].apply(serialize_row_remote, axis=1).to_pandas() + pd_result = scalars_pandas_df[columns].apply(serialize_row, axis=1) + + # bf_result.dtype is 'string[pyarrow]' while pd_result.dtype is 'object' + # , ignore this mismatch by using check_dtype=False. + pandas.testing.assert_series_equal(pd_result, bf_result, check_dtype=False) + finally: + # clean up the gcp assets created for the remote function + cleanup_remote_function_assets( + session.bqclient, session.cloudfunctionsclient, serialize_row_remote + ) + + +@pytest.mark.flaky(retries=2, delay=120) +def test_df_apply_axis_1_aggregates(session, scalars_dfs): + columns = ["int64_col", "int64_too", "float64_col"] + scalars_df, scalars_pandas_df = scalars_dfs + + try: + + def analyze(row): + return str( + { + "dtype": row.dtype, + "count": row.count(), + "min": row.max(), + "max": row.max(), + "mean": row.mean(), + "std": row.std(), + "var": row.var(), + } + ) + + analyze_remote = session.remote_function(bigframes.series.Series, str)(analyze) + + bf_result = ( + scalars_df[columns].dropna().apply(analyze_remote, axis=1).to_pandas() + ) + pd_result = scalars_pandas_df[columns].dropna().apply(analyze, axis=1) + + # bf_result.dtype is 'string[pyarrow]' while pd_result.dtype is 'object' + # , ignore this mismatch by using check_dtype=False. + pandas.testing.assert_series_equal(pd_result, bf_result, check_dtype=False) + finally: + # clean up the gcp assets created for the remote function + cleanup_remote_function_assets( + session.bqclient, session.cloudfunctionsclient, analyze_remote + ) + + +@pytest.mark.parametrize( + ("pd_df"), + [ + pytest.param( + pandas.DataFrame( + { + "2": [1, 2, 3], + 2: [1.5, 3.75, 5], + "name, [with. 
special'- chars\")/\\": [10, 20, 30], + (3, 4): ["pq", "rs", "tu"], + (5.0, "six", 7): [8, 9, 10], + 'raise Exception("hacked!")': [11, 12, 13], + } + ), + id="all-kinds-of-column-names", + ), + pytest.param( + pandas.DataFrame( + { + "x": [1, 2, 3], + "y": [1.5, 3.75, 5], + "z": ["pq", "rs", "tu"], + }, + index=pandas.MultiIndex.from_tuples( + [ + ("a", 100), + ("a", 200), + ("b", 300), + ] + ), + ), + id="multiindex", + ), + pytest.param( + pandas.DataFrame( + [ + [10, 1.5, "pq"], + [20, 3.75, "rs"], + [30, 8.0, "tu"], + ], + columns=pandas.MultiIndex.from_arrays( + [ + ["first", "last_two", "last_two"], + [1, 2, 3], + ] + ), + ), + id="column-multiindex", + ), + pytest.param( + pandas.DataFrame( + { + datetime.now(): [1, 2, 3], + } + ), + id="column-name-not-supported", + marks=pytest.mark.xfail(raises=NameError), + ), + ], +) +@pytest.mark.flaky(retries=2, delay=120) +def test_df_apply_axis_1_complex(session, pd_df): + bf_df = session.read_pandas(pd_df) + + try: + + def serialize_row(row): + custom = { + "name": row.name, + "index": [idx for idx in row.index], + "values": [ + val.item() if hasattr(val, "item") else val for val in row.values + ], + } + return str( + { + "default": row.to_json(), + "split": row.to_json(orient="split"), + "records": row.to_json(orient="records"), + "index": row.to_json(orient="index"), + "custom": custom, + } + ) + + serialize_row_remote = session.remote_function( + bigframes.series.Series, str, reuse=False + )(serialize_row) + + bf_result = bf_df.apply(serialize_row_remote, axis=1).to_pandas() + pd_result = pd_df.apply(serialize_row, axis=1) + + # bf_result.dtype is 'string[pyarrow]' while pd_result.dtype is 'object' + # , ignore this mismatch by using check_dtype=False. + # + # bf_result.index[0].dtype is 'string[pyarrow]' while + # pd_result.index[0].dtype is 'object', ignore this mismatch by using + # check_index_type=False. + pandas.testing.assert_series_equal( + pd_result, bf_result, check_dtype=False, check_index_type=False + ) + finally: + # clean up the gcp assets created for the remote function + cleanup_remote_function_assets( + session.bqclient, session.cloudfunctionsclient, serialize_row_remote + ) + + +@pytest.mark.flaky(retries=2, delay=120) +def test_df_apply_axis_1_na_nan_inf(session): + """This test is for special cases of float values, to make sure any (nan, + inf, -inf) produced by user code is honored. + """ + bf_df = session.read_gbq( + """\ +SELECT "1" AS text, 1 AS num +UNION ALL +SELECT "2.5" AS text, 2.5 AS num +UNION ALL +SELECT "nan" AS text, IEEE_DIVIDE(0, 0) AS num +UNION ALL +SELECT "inf" AS text, IEEE_DIVIDE(1, 0) AS num +UNION ALL +SELECT "-inf" AS text, IEEE_DIVIDE(-1, 0) AS num +UNION ALL +SELECT "numpy nan" AS text, IEEE_DIVIDE(0, 0) AS num +UNION ALL +SELECT "pandas na" AS text, NULL AS num + """ + ) + + pd_df = bf_df.to_pandas() + + try: + + def float_parser(row): + import numpy as mynp + import pandas as mypd + + if row["text"] == "pandas na": + return mypd.NA + if row["text"] == "numpy nan": + return mynp.nan + return float(row["text"]) + + float_parser_remote = session.remote_function( + bigframes.series.Series, float, reuse=False + )(float_parser) + + pd_result = pd_df.apply(float_parser, axis=1) + bf_result = bf_df.apply(float_parser_remote, axis=1).to_pandas() + + # bf_result.dtype is 'Float64' while pd_result.dtype is 'object' + # , ignore this mismatch by using check_dtype=False. 
+ pandas.testing.assert_series_equal(pd_result, bf_result, check_dtype=False) + + # Let's also assert that the data is consistent in this round trip + # (BQ -> BigFrames -> BQ -> GCF -> BQ -> BigFrames) w.r.t. their + # expected values in BQ + bq_result = bf_df["num"].to_pandas() + bq_result.name = None + pandas.testing.assert_series_equal(bq_result, bf_result) + finally: + # clean up the gcp assets created for the remote function + cleanup_remote_function_assets( + session.bqclient, session.cloudfunctionsclient, float_parser_remote + ) diff --git a/tests/system/small/test_remote_function.py b/tests/system/small/test_remote_function.py index 106638cef3..9c60c821a7 100644 --- a/tests/system/small/test_remote_function.py +++ b/tests/system/small/test_remote_function.py @@ -12,12 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. +import re + import google.api_core.exceptions from google.cloud import bigquery import pandas as pd import pytest import bigframes +import bigframes.exceptions from bigframes.functions import remote_function as rf from tests.system.utils import assert_pandas_df_equal @@ -685,3 +688,146 @@ def test_read_gbq_function_enforces_explicit_types(bigquery_client, dataset_id): rf.read_gbq_function( str(neither_type_specified.reference), bigquery_client=bigquery_client ) + + +@pytest.mark.flaky(retries=2, delay=120) +def test_df_apply_axis_1(session, scalars_dfs): + columns = ["bool_col", "int64_col", "int64_too", "float64_col", "string_col"] + scalars_df, scalars_pandas_df = scalars_dfs + + def add_ints(row): + return row["int64_col"] + row["int64_too"] + + with pytest.warns( + bigframes.exceptions.PreviewWarning, + match="input_types=Series scenario is in preview.", + ): + add_ints_remote = session.remote_function(bigframes.series.Series, int)( + add_ints + ) + + with pytest.warns( + bigframes.exceptions.PreviewWarning, match="axis=1 scenario is in preview." + ): + bf_result = scalars_df[columns].apply(add_ints_remote, axis=1).to_pandas() + + pd_result = scalars_pandas_df[columns].apply(add_ints, axis=1) + + # bf_result.dtype is 'Int64' while pd_result.dtype is 'object', ignore this + # mismatch by using check_dtype=False. + # + # bf_result.to_numpy() produces an array of numpy.float64's + # (in system_prerelease tests), while pd_result.to_numpy() produces an + # array of ints, ignore this mismatch by using check_exact=False. + pd.testing.assert_series_equal( + pd_result, bf_result, check_dtype=False, check_exact=False + ) + + +@pytest.mark.flaky(retries=2, delay=120) +def test_df_apply_axis_1_ordering(session, scalars_dfs): + columns = ["bool_col", "int64_col", "int64_too", "float64_col", "string_col"] + ordering_columns = ["bool_col", "int64_col"] + scalars_df, scalars_pandas_df = scalars_dfs + + def add_ints(row): + return row["int64_col"] + row["int64_too"] + + add_ints_remote = session.remote_function(bigframes.series.Series, int)(add_ints) + + bf_result = ( + scalars_df[columns] + .sort_values(ordering_columns) + .apply(add_ints_remote, axis=1) + .to_pandas() + ) + pd_result = ( + scalars_pandas_df[columns].sort_values(ordering_columns).apply(add_ints, axis=1) + ) + + # bf_result.dtype is 'Int64' while pd_result.dtype is 'object', ignore this + # mismatch by using check_dtype=False. + # + # bf_result.to_numpy() produces an array of numpy.float64's + # (in system_prerelease tests), while pd_result.to_numpy() produces an + # array of ints, ignore this mismatch by using check_exact=False. 
+ pd.testing.assert_series_equal( + pd_result, bf_result, check_dtype=False, check_exact=False + ) + + +@pytest.mark.flaky(retries=2, delay=120) +def test_df_apply_axis_1_multiindex(session): + pd_df = pd.DataFrame( + {"x": [1, 2, 3], "y": [1.5, 3.75, 5], "z": ["pq", "rs", "tu"]}, + index=pd.MultiIndex.from_tuples([("a", 100), ("a", 200), ("b", 300)]), + ) + bf_df = session.read_pandas(pd_df) + + def add_numbers(row): + return row["x"] + row["y"] + + add_numbers_remote = session.remote_function(bigframes.series.Series, float)( + add_numbers + ) + + bf_result = bf_df.apply(add_numbers_remote, axis=1).to_pandas() + pd_result = pd_df.apply(add_numbers, axis=1) + + # bf_result.dtype is 'Float64' while pd_result.dtype is 'float64', ignore this + # mismatch by using check_dtype=False. + # + # bf_result.index[0].dtype is 'string[pyarrow]' while + # pd_result.index[0].dtype is 'object', ignore this mismatch by using + # check_index_type=False. + pd.testing.assert_series_equal( + pd_result, bf_result, check_dtype=False, check_index_type=False + ) + + +def test_df_apply_axis_1_unsupported_callable(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + columns = ["bool_col", "int64_col", "int64_too", "float64_col", "string_col"] + + def add_ints(row): + return row["int64_col"] + row["int64_too"] + + # pandas works + scalars_pandas_df.apply(add_ints, axis=1) + + with pytest.raises(ValueError, match="For axis=1 a remote function must be used."): + scalars_df[columns].apply(add_ints, axis=1) + + +@pytest.mark.parametrize( + ("column"), + [ + pytest.param("bytes_col"), + pytest.param("date_col"), + pytest.param("datetime_col"), + pytest.param("geography_col"), + pytest.param("numeric_col"), + pytest.param("time_col"), + pytest.param("timestamp_col"), + ], +) +def test_df_apply_axis_1_unsupported_dtype(scalars_dfs, column): + scalars_df, scalars_pandas_df = scalars_dfs + + # It doesn't matter if it is a remote function or not, the dtype check + # is done even before the function type check with axis=1 + def echo(row): + return row[column] + + # pandas works + scalars_pandas_df[[column]].apply(echo, axis=1) + + dtype = scalars_df[column].dtype + + with pytest.raises( + NotImplementedError, + match=re.escape( + f"DataFrame has a column of dtype '{dtype}' which is not supported with axis=1. Supported dtypes are ('Int64', 'Float64', 'boolean', 'string')." + ), + ): + scalars_df[[column]].apply(echo, axis=1) diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index 4e17bca54d..31d5e88c7e 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -4200,12 +4200,16 @@ def merge( """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) - def apply(self, func, *, args=(), **kwargs): + def apply(self, func, *, axis=0, args=(), **kwargs): """Apply a function along an axis of the DataFrame. Objects passed to the function are Series objects whose index is - the DataFrame's index (``axis=0``) the final return type - is inferred from the return type of the applied function. + the DataFrame's index (``axis=0``) or the DataFrame's columns (``axis=1``). + The final return type is inferred from the return type of the applied + function. + + .. note:: + ``axis=1`` scenario is in preview. 
**Examples:** @@ -4230,9 +4234,28 @@ def apply(self, func, *, args=(), **kwargs): [2 rows x 2 columns] + You could apply a user defined function to every row of the DataFrame by + creating a remote function out of it, and using it with `axis=1`. + + >>> @bpd.remote_function(bpd.Series, int, reuse=False) + ... def foo(row): + ... result = 1 + ... result += row["col1"] + ... result += row["col2"]*row["col2"] + ... return result + + >>> df.apply(foo, axis=1) + 0 11 + 1 19 + dtype: Int64 + Args: func (function): Function to apply to each column or row. + axis ({index (0), columns (1)}): + Axis along which the function is applied. Specify 0 or 'index' + to apply function to each column. Specify 1 or 'columns' to + apply function to each row. args (tuple): Positional arguments to pass to `func` in addition to the array/series. From 21bd3e4f0be64c20ed8774f1a6e0b79fe40bba5a Mon Sep 17 00:00:00 2001 From: Huan Chen <142538604+Genesis929@users.noreply.github.com> Date: Fri, 10 May 2024 14:40:40 -0700 Subject: [PATCH 12/17] chore: log and labels update (#674) * chore: log and labels update * remove unused logic * Update unit test. * fixes for mypy * lint update --- bigframes/core/log_adapter.py | 16 +++++++++-- bigframes/session/_io/bigquery/__init__.py | 17 +++++++---- .../session/_io/bigquery/read_gbq_table.py | 4 --- tests/unit/session/test_io_bigquery.py | 28 ++++++------------- 4 files changed, 33 insertions(+), 32 deletions(-) diff --git a/bigframes/core/log_adapter.py b/bigframes/core/log_adapter.py index 860d394cd2..b5afafbe7c 100644 --- a/bigframes/core/log_adapter.py +++ b/bigframes/core/log_adapter.py @@ -21,6 +21,9 @@ _api_methods: List = [] _excluded_methods = ["__setattr__", "__getattr__"] +# Stack to track method calls +_call_stack: List = [] + def class_logger(decorated_cls): """Decorator that adds logging functionality to each method of the class.""" @@ -38,10 +41,17 @@ def wrapper(*args, **kwargs): class_name = decorated_cls.__name__ # Access decorated class name api_method_name = str(method.__name__) full_method_name = f"{class_name.lower()}-{api_method_name}" - # Track regular and "dunder" methods - if api_method_name.startswith("__") or not api_method_name.startswith("_"): + + # Track directly called methods + if len(_call_stack) == 0: add_api_method(full_method_name) - return method(*args, **kwargs) + + _call_stack.append(full_method_name) + + try: + return method(*args, **kwargs) + finally: + _call_stack.pop() return wrapper diff --git a/bigframes/session/_io/bigquery/__init__.py b/bigframes/session/_io/bigquery/__init__.py index 98e0dac1e8..ed1bd39ada 100644 --- a/bigframes/session/_io/bigquery/__init__.py +++ b/bigframes/session/_io/bigquery/__init__.py @@ -23,7 +23,7 @@ import textwrap import types import typing -from typing import Dict, Iterable, Mapping, Optional, Sequence, Tuple, Union +from typing import Dict, Iterable, Mapping, Optional, Tuple, Union import bigframes_vendored.pandas.io.gbq as third_party_pandas_gbq import google.api_core.exceptions @@ -43,11 +43,15 @@ def create_job_configs_labels( job_configs_labels: Optional[Dict[str, str]], - api_methods: Sequence[str], + api_methods: typing.List[str], ) -> Dict[str, str]: if job_configs_labels is None: job_configs_labels = {} + if api_methods: + job_configs_labels["bigframes-api"] = api_methods[0] + del api_methods[0] + labels = list( itertools.chain( job_configs_labels.keys(), @@ -198,10 +202,11 @@ def start_query_with_client( """ Starts query job and waits for results. 
""" - api_methods = log_adapter.get_and_reset_api_methods() - job_config.labels = create_job_configs_labels( - job_configs_labels=job_config.labels, api_methods=api_methods - ) + if not job_config.dry_run: + api_methods = log_adapter.get_and_reset_api_methods() + job_config.labels = create_job_configs_labels( + job_configs_labels=job_config.labels, api_methods=api_methods + ) try: query_job = bq_client.query(sql, job_config=job_config, timeout=timeout) diff --git a/bigframes/session/_io/bigquery/read_gbq_table.py b/bigframes/session/_io/bigquery/read_gbq_table.py index f6c1463e6c..0f6a3dadd2 100644 --- a/bigframes/session/_io/bigquery/read_gbq_table.py +++ b/bigframes/session/_io/bigquery/read_gbq_table.py @@ -112,8 +112,6 @@ def get_table_metadata( # atomically. table = bqclient.get_table(table_ref) - # TODO(b/336521938): Refactor to make sure we set the "bigframes-api" - # whereever we execute a query. job_config = bigquery.QueryJobConfig() job_config.labels["bigframes-api"] = api_name snapshot_timestamp = list( @@ -344,8 +342,6 @@ def get_time_travel_datetime_and_table_metadata( # atomically. table = bqclient.get_table(table_ref) - # TODO(b/336521938): Refactor to make sure we set the "bigframes-api" - # whereever we execute a query. job_config = bigquery.QueryJobConfig() job_config.labels["bigframes-api"] = api_name snapshot_timestamp = list( diff --git a/tests/unit/session/test_io_bigquery.py b/tests/unit/session/test_io_bigquery.py index 9da085e824..5f4072e9c2 100644 --- a/tests/unit/session/test_io_bigquery.py +++ b/tests/unit/session/test_io_bigquery.py @@ -30,17 +30,13 @@ def test_create_job_configs_labels_is_none(): labels = io_bq.create_job_configs_labels( job_configs_labels=None, api_methods=api_methods ) - expected_dict = { - "recent-bigframes-api-0": "agg", - "recent-bigframes-api-1": "series-mode", - } + expected_dict = {"bigframes-api": "agg", "recent-bigframes-api-0": "series-mode"} assert labels is not None assert labels == expected_dict def test_create_job_configs_labels_length_limit_not_met(): cur_labels = { - "bigframes-api": "read_pandas", "source": "bigquery-dataframes-temp", } api_methods = ["agg", "series-mode"] @@ -48,20 +44,18 @@ def test_create_job_configs_labels_length_limit_not_met(): job_configs_labels=cur_labels, api_methods=api_methods ) expected_dict = { - "bigframes-api": "read_pandas", "source": "bigquery-dataframes-temp", - "recent-bigframes-api-0": "agg", - "recent-bigframes-api-1": "series-mode", + "bigframes-api": "agg", + "recent-bigframes-api-0": "series-mode", } assert labels is not None - assert len(labels) == 4 + assert len(labels) == 3 assert labels == expected_dict def test_create_job_configs_labels_log_adaptor_call_method_under_length_limit(): log_adapter.get_and_reset_api_methods() cur_labels = { - "bigframes-api": "read_pandas", "source": "bigquery-dataframes-temp", } df = bpd.DataFrame( @@ -76,14 +70,10 @@ def test_create_job_configs_labels_log_adaptor_call_method_under_length_limit(): job_configs_labels=cur_labels, api_methods=api_methods ) expected_dict = { - "bigframes-api": "read_pandas", "source": "bigquery-dataframes-temp", - "recent-bigframes-api-0": "series-__init__", - "recent-bigframes-api-1": "dataframe-max", - "recent-bigframes-api-2": "dataframe-__init__", - "recent-bigframes-api-3": "dataframe-head", - "recent-bigframes-api-4": "dataframe-__init__", - "recent-bigframes-api-5": "dataframe-__init__", + "bigframes-api": "dataframe-max", + "recent-bigframes-api-0": "dataframe-head", + "recent-bigframes-api-1": 
"dataframe-__init__", } assert labels == expected_dict @@ -94,7 +84,7 @@ def test_create_job_configs_labels_length_limit_met_and_labels_is_none(): {"col1": [1, 2], "col2": [3, 4]}, session=resources.create_bigquery_session() ) # Test running methods more than the labels' length limit - for i in range(66): + for i in range(100): df.head() api_methods = log_adapter._api_methods @@ -112,7 +102,7 @@ def test_create_job_configs_labels_length_limit_met(): "bigframes-api": "read_pandas", "source": "bigquery-dataframes-temp", } - for i in range(60): + for i in range(100): key = f"bigframes-api-test-{i}" value = f"test{i}" cur_labels[key] = value From 9ca92d09e9c56db408350b35ec698152c13954ed Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Fri, 10 May 2024 23:54:16 +0000 Subject: [PATCH 13/17] feat: support gcf vpc connector in `remote_function` (#677) --- bigframes/functions/remote_function.py | 14 +++- bigframes/pandas/__init__.py | 2 + bigframes/session/__init__.py | 9 ++- tests/system/large/test_remote_function.py | 75 +++++++++++++++++++++- 4 files changed, 97 insertions(+), 3 deletions(-) diff --git a/bigframes/functions/remote_function.py b/bigframes/functions/remote_function.py index 6e42ca9f48..2a7a900779 100644 --- a/bigframes/functions/remote_function.py +++ b/bigframes/functions/remote_function.py @@ -441,6 +441,7 @@ def create_cloud_function( timeout_seconds=600, max_instance_count=None, is_row_processor=False, + vpc_connector=None, ): """Create a cloud function from the given user defined function.""" @@ -519,6 +520,8 @@ def create_cloud_function( function.service_config.timeout_seconds = timeout_seconds if max_instance_count is not None: function.service_config.max_instance_count = max_instance_count + if vpc_connector is not None: + function.service_config.vpc_connector = vpc_connector function.service_config.service_account_email = ( self._cloud_function_service_account ) @@ -568,6 +571,7 @@ def provision_bq_remote_function( cloud_function_timeout, cloud_function_max_instance_count, is_row_processor, + cloud_function_vpc_connector, ): """Provision a BigQuery remote function.""" # If reuse of any existing function with the same name (indicated by the @@ -595,6 +599,7 @@ def provision_bq_remote_function( cloud_function_timeout, cloud_function_max_instance_count, is_row_processor, + cloud_function_vpc_connector, ) else: logger.info(f"Cloud function {cloud_function_name} already exists.") @@ -750,6 +755,7 @@ def remote_function( max_batching_rows: Optional[int] = 1000, cloud_function_timeout: Optional[int] = 600, cloud_function_max_instances: Optional[int] = None, + cloud_function_vpc_connector: Optional[str] = None, ): """Decorator to turn a user defined function into a BigQuery remote function. @@ -894,7 +900,12 @@ def remote_function( control the spike in the billing. Higher setting can help support processing larger scale data. When not specified, cloud function's default setting applies. For more details see - https://cloud.google.com/functions/docs/configuring/max-instances + https://cloud.google.com/functions/docs/configuring/max-instances. + cloud_function_vpc_connector (str, Optional): + The VPC connector you would like to configure for your cloud + function. This is useful if your code needs access to data or + service(s) that are on a VPC network. See for more details + https://cloud.google.com/functions/docs/networking/connecting-vpc. 
""" is_row_processor = False @@ -1041,6 +1052,7 @@ def wrapper(f): cloud_function_timeout, cloud_function_max_instances, is_row_processor, + cloud_function_vpc_connector, ) # TODO: Move ibis logic to compiler step diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py index 1d6da46fae..8d2c0b148c 100644 --- a/bigframes/pandas/__init__.py +++ b/bigframes/pandas/__init__.py @@ -654,6 +654,7 @@ def remote_function( max_batching_rows: Optional[int] = 1000, cloud_function_timeout: Optional[int] = 600, cloud_function_max_instances: Optional[int] = None, + cloud_function_vpc_connector: Optional[str] = None, ): return global_session.with_default_session( bigframes.session.Session.remote_function, @@ -670,6 +671,7 @@ def remote_function( max_batching_rows=max_batching_rows, cloud_function_timeout=cloud_function_timeout, cloud_function_max_instances=cloud_function_max_instances, + cloud_function_vpc_connector=cloud_function_vpc_connector, ) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 473fc4f098..727269e7ee 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -1412,6 +1412,7 @@ def remote_function( max_batching_rows: Optional[int] = 1000, cloud_function_timeout: Optional[int] = 600, cloud_function_max_instances: Optional[int] = None, + cloud_function_vpc_connector: Optional[str] = None, ): """Decorator to turn a user defined function into a BigQuery remote function. Check out the code samples at: https://cloud.google.com/bigquery/docs/remote-functions#bigquery-dataframes. @@ -1537,7 +1538,12 @@ def remote_function( control the spike in the billing. Higher setting can help support processing larger scale data. When not specified, cloud function's default setting applies. For more details see - https://cloud.google.com/functions/docs/configuring/max-instances + https://cloud.google.com/functions/docs/configuring/max-instances. + cloud_function_vpc_connector (str, Optional): + The VPC connector you would like to configure for your cloud + function. This is useful if your code needs access to data or + service(s) that are on a VPC network. See for more details + https://cloud.google.com/functions/docs/networking/connecting-vpc. Returns: callable: A remote function object pointing to the cloud assets created in the background to support the remote execution. The cloud assets can be @@ -1562,6 +1568,7 @@ def remote_function( max_batching_rows=max_batching_rows, cloud_function_timeout=cloud_function_timeout, cloud_function_max_instances=cloud_function_max_instances, + cloud_function_vpc_connector=cloud_function_vpc_connector, ) def read_gbq_function( diff --git a/tests/system/large/test_remote_function.py b/tests/system/large/test_remote_function.py index e086903d03..b7d99ea36c 100644 --- a/tests/system/large/test_remote_function.py +++ b/tests/system/large/test_remote_function.py @@ -21,7 +21,7 @@ import tempfile import textwrap -from google.api_core.exceptions import BadRequest, NotFound +from google.api_core.exceptions import BadRequest, InvalidArgument, NotFound from google.cloud import bigquery, storage import pandas import pytest @@ -1333,6 +1333,79 @@ def square_num(x): ) +@pytest.mark.flaky(retries=2, delay=120) +def test_remote_function_via_session_vpc(scalars_dfs): + # TODO(shobs): Automate the following set-up during testing in the test project. 
+ # + # For upfront convenience, the following set up has been statically created + # in the project bigfrmames-dev-perf via cloud console: + # + # 1. Create a vpc connector as per + # https://cloud.google.com/vpc/docs/configure-serverless-vpc-access#gcloud + # + # $ gcloud compute networks vpc-access connectors create bigframes-vpc --project=bigframes-dev-perf --region=us-central1 --range 10.8.0.0/28 + # Create request issued for: [bigframes-vpc] + # Waiting for operation [projects/bigframes-dev-perf/locations/us-central1/operations/f9f90df6-7cf4-4420-8c2f-b3952775dcfb] to complete...done. + # Created connector [bigframes-vpc]. + # + # $ gcloud compute networks vpc-access connectors list --project=bigframes-dev-perf --region=us-central1 + # CONNECTOR_ID REGION NETWORK IP_CIDR_RANGE SUBNET SUBNET_PROJECT MACHINE_TYPE MIN_INSTANCES MAX_INSTANCES MIN_THROUGHPUT MAX_THROUGHPUT STATE + # bigframes-vpc us-central1 default 10.8.0.0/28 e2-micro 2 10 200 1000 READY + + project = "bigframes-dev-perf" + gcf_vpc_connector = "bigframes-vpc" + + rf_session = bigframes.Session(context=bigframes.BigQueryOptions(project=project)) + + try: + + def square_num(x): + if x is None: + return x + return x * x + + square_num_remote = rf_session.remote_function( + [int], int, reuse=False, cloud_function_vpc_connector=gcf_vpc_connector + )(square_num) + + scalars_df, scalars_pandas_df = scalars_dfs + + bf_int64_col = scalars_df["int64_col"] + bf_result_col = bf_int64_col.apply(square_num_remote) + bf_result = bf_int64_col.to_frame().assign(result=bf_result_col).to_pandas() + + pd_int64_col = scalars_pandas_df["int64_col"] + pd_result_col = pd_int64_col.apply(square_num) + pd_result = pd_int64_col.to_frame().assign(result=pd_result_col) + + assert_pandas_df_equal(bf_result, pd_result, check_dtype=False) + + # Assert that the GCF is created with the intended vpc connector + gcf = rf_session.cloudfunctionsclient.get_function( + name=square_num_remote.bigframes_cloud_function + ) + assert gcf.service_config.vpc_connector == gcf_vpc_connector + finally: + # clean up the gcp assets created for the remote function + cleanup_remote_function_assets( + rf_session.bqclient, rf_session.cloudfunctionsclient, square_num_remote + ) + + +def test_remote_function_via_session_vpc_invalid(session): + with pytest.raises( + InvalidArgument, match="400.*Serverless VPC Access connector is not found" + ): + + @session.remote_function( + [int], int, reuse=False, cloud_function_vpc_connector="does-not-exist" + ) + def square_num(x): + if x is None: + return x + return x * x + + @pytest.mark.parametrize( ("max_batching_rows"), [ From 2fd1b8117bda0dee5d8fc0924c80ce257fa9e3f1 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Fri, 10 May 2024 18:22:01 -0700 Subject: [PATCH 14/17] feat: Add Series.combine (#680) --- bigframes/core/compile/scalar_op_compiler.py | 22 ++++++-- bigframes/operations/__init__.py | 16 +++++- bigframes/series.py | 36 +++++++++++- tests/system/large/test_remote_function.py | 35 ++++++++++++ tests/system/small/test_numpy.py | 53 ++++++++---------- tests/system/small/test_series.py | 35 ++++++++++++ .../bigframes_vendored/pandas/core/series.py | 56 +++++++++++++++++++ 7 files changed, 214 insertions(+), 39 deletions(-) diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py index a65ff6fe0c..90025b3994 100644 --- a/bigframes/core/compile/scalar_op_compiler.py +++ b/bigframes/core/compile/scalar_op_compiler.py @@ -1298,22 +1298,36 @@ def 
coalesce_impl( return ibis.coalesce(x, y) -@scalar_op_compiler.register_binary_op(ops.cliplower_op) -def clip_lower( +@scalar_op_compiler.register_binary_op(ops.maximum_op) +def maximum_impl( value: ibis_types.Value, lower: ibis_types.Value, ): + # Note: propagates nulls return ibis.case().when(lower.isnull() | (value < lower), lower).else_(value).end() -@scalar_op_compiler.register_binary_op(ops.clipupper_op) -def clip_upper( +@scalar_op_compiler.register_binary_op(ops.minimum_op) +def minimum_impl( value: ibis_types.Value, upper: ibis_types.Value, ): + # Note: propagates nulls return ibis.case().when(upper.isnull() | (value > upper), upper).else_(value).end() +@scalar_op_compiler.register_binary_op(ops.BinaryRemoteFunctionOp, pass_op=True) +def binary_remote_function_op_impl( + x: ibis_types.Value, y: ibis_types.Value, op: ops.BinaryRemoteFunctionOp +): + if not hasattr(op.func, "bigframes_remote_function"): + raise TypeError( + f"only a bigframes remote function is supported as a callable. {constants.FEEDBACK_LINK}" + ) + x_transformed = op.func(x, y) + return x_transformed + + # Ternary Operations @scalar_op_compiler.register_ternary_op(ops.where_op) def where_op( diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py index e52f488d38..6f99f71013 100644 --- a/bigframes/operations/__init__.py +++ b/bigframes/operations/__init__.py @@ -545,8 +545,8 @@ def output_type(self, *input_types): # Binary Ops fillna_op = create_binary_op(name="fillna", type_signature=op_typing.COERCE) -cliplower_op = create_binary_op(name="clip_lower", type_signature=op_typing.COERCE) -clipupper_op = create_binary_op(name="clip_upper", type_signature=op_typing.COERCE) +maximum_op = create_binary_op(name="maximum", type_signature=op_typing.COERCE) +minimum_op = create_binary_op(name="minimum", type_signature=op_typing.COERCE) coalesce_op = create_binary_op(name="coalesce", type_signature=op_typing.COERCE) @@ -587,6 +587,16 @@ def output_type(self, *input_types): raise TypeError(f"Cannot subtract dtypes {left_type} and {right_type}") +@dataclasses.dataclass(frozen=True) +class BinaryRemoteFunctionOp(BinaryOp): + name: typing.ClassVar[str] = "binary_remote_function" + func: typing.Callable + + def output_type(self, *input_types): + # This property should be set to a valid Dtype by the @remote_function decorator or read_gbq_function method + return self.func.output_dtype + + add_op = AddOp() sub_op = SubOp() mul_op = create_binary_op(name="mul", type_signature=op_typing.BINARY_NUMERIC) @@ -713,4 +723,6 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT np.divide: div_op, np.power: pow_op, np.arctan2: arctan2_op, + np.maximum: maximum_op, + np.minimum: minimum_op, } diff --git a/bigframes/series.py b/bigframes/series.py index 313380e4a4..d1fb0d679b 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -1031,9 +1031,9 @@ def clip(self, lower, upper): if lower is None and upper is None: return self if lower is None: - return self._apply_binary_op(upper, ops.clipupper_op, alignment="left") + return self._apply_binary_op(upper, ops.minimum_op, alignment="left") if upper is None: - return self._apply_binary_op(lower, ops.cliplower_op, alignment="left") + return self._apply_binary_op(lower, ops.maximum_op, alignment="left") value_id, lower_id, upper_id, block = self._align3(lower, upper) block, result_id = block.apply_ternary_op( value_id, lower_id, upper_id, ops.clip_op @@ -1374,6 +1374,38 @@ def apply( materialized_series = result_series._cached() return 
materialized_series + def combine( + self, + other, + func, + ) -> Series: + if not callable(func): + raise ValueError( + "Only a ufunc (a function that applies to the entire Series) or a remote function that only works on single values are supported." + ) + + if not hasattr(func, "bigframes_remote_function"): + # Keep this in sync with .apply + try: + return func(self, other) + except Exception as ex: + # This could happen if any of the operators in func is not + # supported on a Series. Let's guide the customer to use a + # remote function instead + if hasattr(ex, "message"): + ex.message += f"\n{_remote_function_recommendation_message}" + raise + + reprojected_series = Series(self._block._force_reproject()) + result_series = reprojected_series._apply_binary_op( + other, ops.BinaryRemoteFunctionOp(func=func) + ) + + # return Series with materialized result so that any error in the remote + # function is caught early + materialized_series = result_series._cached() + return materialized_series + def add_prefix(self, prefix: str, axis: int | str | None = None) -> Series: return Series(self._get_block().add_prefix(prefix)) diff --git a/tests/system/large/test_remote_function.py b/tests/system/large/test_remote_function.py index b7d99ea36c..0fa1d90e8b 100644 --- a/tests/system/large/test_remote_function.py +++ b/tests/system/large/test_remote_function.py @@ -221,6 +221,41 @@ def stringify(x): ) +# @pytest.mark.flaky(retries=2, delay=120) +def test_remote_function_binop(session, scalars_dfs, dataset_id, bq_cf_connection): + try: + + def func(x, y): + return x * abs(y % 4) + + remote_func = session.remote_function( + [str, int], + str, + dataset_id, + bq_cf_connection, + reuse=False, + )(func) + + scalars_df, scalars_pandas_df = scalars_dfs + + scalars_df = scalars_df.dropna() + scalars_pandas_df = scalars_pandas_df.dropna() + bf_result = ( + scalars_df["string_col"] + .combine(scalars_df["int64_col"], remote_func) + .to_pandas() + ) + pd_result = scalars_pandas_df["string_col"].combine( + scalars_pandas_df["int64_col"], func + ) + pandas.testing.assert_series_equal(bf_result, pd_result) + finally: + # clean up the gcp assets created for the remote function + cleanup_remote_function_assets( + session.bqclient, session.cloudfunctionsclient, remote_func + ) + + @pytest.mark.flaky(retries=2, delay=120) def test_remote_function_decorator_with_bigframes_series( session, scalars_dfs, dataset_id, bq_cf_connection diff --git a/tests/system/small/test_numpy.py b/tests/system/small/test_numpy.py index 8e349e472a..8f62d9628c 100644 --- a/tests/system/small/test_numpy.py +++ b/tests/system/small/test_numpy.py @@ -73,27 +73,6 @@ def test_df_ufuncs(scalars_dfs, opname): pd.testing.assert_frame_equal(bf_result, pd_result) -@pytest.mark.parametrize( - ("opname",), - [ - ("add",), - ("subtract",), - ("multiply",), - ("divide",), - ("power",), - ("arctan2",), - ], -) -def test_series_binary_ufuncs(floats_product_pd, floats_product_bf, opname): - bf_result = getattr(np, opname)( - floats_product_bf.float64_col_x, floats_product_bf.float64_col_y - ).to_pandas() - pd_result = getattr(np, opname)( - floats_product_pd.float64_col_x, floats_product_pd.float64_col_y - ) - pd.testing.assert_series_equal(bf_result, pd_result) - - @pytest.mark.parametrize( ("opname",), [ @@ -106,17 +85,16 @@ def test_series_binary_ufuncs(floats_product_pd, floats_product_bf, opname): ) def test_df_binary_ufuncs(scalars_dfs, opname): scalars_df, scalars_pandas_df = scalars_dfs + op = getattr(np, opname) - bf_result = getattr(np, opname)( - 
scalars_df[["float64_col", "int64_col"]], 5.1 - ).to_pandas() - pd_result = getattr(np, opname)( - scalars_pandas_df[["float64_col", "int64_col"]], 5.1 - ) + bf_result = op(scalars_df[["float64_col", "int64_col"]], 5.1).to_pandas() + pd_result = op(scalars_pandas_df[["float64_col", "int64_col"]], 5.1) pd.testing.assert_frame_equal(bf_result, pd_result) +# Operations tested here don't work on full dataframe in numpy+pandas +# Maybe because of nullable dtypes? @pytest.mark.parametrize( ("x", "y"), [ @@ -124,12 +102,25 @@ def test_df_binary_ufuncs(scalars_dfs, opname): ("float64_col", "int64_col"), ], ) -def test_series_atan2(scalars_dfs, x, y): - # Test atan2 separately as pandas errors when passing entire df as input, so pass only series +@pytest.mark.parametrize( + ("opname",), + [ + ("add",), + ("subtract",), + ("multiply",), + ("divide",), + ("arctan2",), + ("minimum",), + ("maximum",), + ], +) +def test_series_binary_ufuncs(scalars_dfs, x, y, opname): scalars_df, scalars_pandas_df = scalars_dfs - bf_result = np.arctan2(scalars_df[x], scalars_df[y]).to_pandas() - pd_result = np.arctan2(scalars_pandas_df[x], scalars_pandas_df[y]) + op = getattr(np, opname) + + bf_result = op(scalars_df[x], scalars_df[y]).to_pandas() + pd_result = op(scalars_pandas_df[x], scalars_pandas_df[y]) pd.testing.assert_series_equal(bf_result, pd_result) diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index beb99b1ada..fa514784c0 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -3509,6 +3509,41 @@ def test_apply_numpy_ufunc(scalars_dfs, ufunc): assert_series_equal(bf_result, pd_result) +@pytest.mark.parametrize( + ("ufunc",), + [ + pytest.param(numpy.add), + pytest.param(numpy.divide), + ], + ids=[ + "add", + "divide", + ], +) +def test_combine_series_ufunc(scalars_dfs, ufunc): + scalars_df, scalars_pandas_df = scalars_dfs + + bf_col = scalars_df["int64_col"].dropna() + bf_result = bf_col.combine(bf_col, ufunc).to_pandas() + + pd_col = scalars_pandas_df["int64_col"].dropna() + pd_result = pd_col.combine(pd_col, ufunc) + + assert_series_equal(bf_result, pd_result, check_dtype=False) + + +def test_combine_scalar_ufunc(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + bf_col = scalars_df["int64_col"].dropna() + bf_result = bf_col.combine(2.5, numpy.add).to_pandas() + + pd_col = scalars_pandas_df["int64_col"].dropna() + pd_result = pd_col.combine(2.5, numpy.add) + + assert_series_equal(bf_result, pd_result, check_dtype=False) + + def test_apply_simple_udf(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index e155fb073a..585e20275c 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -1279,6 +1279,62 @@ def apply( """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def combine( + self, + other: Series | Hashable, + func, + ) -> Series: + """ + Combine the Series with a Series or scalar according to `func`. + + Combine the Series and `other` using `func` to perform elementwise + selection for combined Series. + `fill_value` is assumed when value is missing at some index + from one of the two objects being combined. 
+ + **Examples:** + + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + + Consider 2 Datasets ``s1`` and ``s2`` containing + highest clocked speeds of different birds. + + >>> s1 = bpd.Series({'falcon': 330.0, 'eagle': 160.0}) + >>> s1 + falcon 330.0 + eagle 160.0 + dtype: Float64 + >>> s2 = bpd.Series({'falcon': 345.0, 'eagle': 200.0, 'duck': 30.0}) + >>> s2 + falcon 345.0 + eagle 200.0 + duck 30.0 + dtype: Float64 + + Now, to combine the two datasets and view the highest speeds + of the birds across the two datasets + + >>> s1.combine(s2, np.maximum) + falcon 345.0 + eagle 200.0 + duck + dtype: Float64 + + Args: + other (Series or scalar): + The value(s) to be combined with the `Series`. + func (function): + BigFrames DataFrames ``remote_function`` to apply. + Takes two scalars as inputs and returns an element. + Also accepts some numpy binary functions. + + Returns: + Series: The result of combining the Series with the other object. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def groupby( self, by=None, From 6eb19a7288155b093aa7cc9bcbc710b31e7dc87a Mon Sep 17 00:00:00 2001 From: Chelsea Lin <124939984+chelsea-lin@users.noreply.github.com> Date: Mon, 13 May 2024 10:13:08 -0700 Subject: [PATCH 15/17] feat: Series.str.split (#675) * feat: Series.str.split * add more tests * format fix --- bigframes/core/compile/scalar_op_compiler.py | 5 ++ bigframes/dtypes.py | 6 +++ bigframes/operations/__init__.py | 17 +++++-- bigframes/operations/strings.py | 12 +++++ tests/system/small/operations/test_strings.py | 31 ++++++++++++ .../pandas/core/strings/accessor.py | 48 +++++++++++++++++++ 6 files changed, 116 insertions(+), 3 deletions(-) diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py index 90025b3994..8a44844fba 100644 --- a/bigframes/core/compile/scalar_op_compiler.py +++ b/bigframes/core/compile/scalar_op_compiler.py @@ -588,6 +588,11 @@ def endswith_op_impl(x: ibis_types.Value, op: ops.EndsWithOp): return any_match if any_match is not None else ibis_types.literal(False) +@scalar_op_compiler.register_unary_op(ops.StringSplitOp, pass_op=True) +def stringsplit_op_impl(x: ibis_types.Value, op: ops.StringSplitOp): + return typing.cast(ibis_types.StringValue, x).split(op.pat) + + @scalar_op_compiler.register_unary_op(ops.ZfillOp, pass_op=True) def zfill_op_impl(x: ibis_types.Value, op: ops.ZfillOp): str_value = typing.cast(ibis_types.StringValue, x) diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index d2dc210e0d..2a344aff2d 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -405,6 +405,12 @@ def bigframes_dtype_to_ibis_dtype( return BIGFRAMES_TO_IBIS[bigframes_dtype] +def bigframes_dtype_to_arrow_dtype( + bigframes_dtype: Union[DtypeString, Dtype, np.dtype[Any]] +) -> pa.DataType: + return ibis_dtype_to_arrow_dtype(bigframes_dtype_to_ibis_dtype(bigframes_dtype)) + + def literal_to_ibis_scalar( literal, force_dtype: typing.Optional[Dtype] = None, validate: bool = True ): diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py index 6f99f71013..929ccaecc5 100644 --- a/bigframes/operations/__init__.py +++ b/bigframes/operations/__init__.py @@ -386,6 +386,19 @@ def output_type(self, *input_types): return op_typing.STRING_PREDICATE.output_type(input_types[0]) +@dataclasses.dataclass(frozen=True) +class StringSplitOp(UnaryOp): + name: typing.ClassVar[str] = "str_split" + pat: typing.Sequence[str] + + def 
output_type(self, *input_types): + input_type = input_types[0] + if not isinstance(input_type, pd.StringDtype): + raise TypeError("field accessor input must be a string type") + arrow_type = dtypes.bigframes_dtype_to_arrow_dtype(input_type) + return pd.ArrowDtype(pa.list_(arrow_type)) + + @dataclasses.dataclass(frozen=True) class EndsWithOp(UnaryOp): name: typing.ClassVar[str] = "str_endswith" @@ -463,9 +476,7 @@ def output_type(self, *input_types): raise TypeError("field accessor input must be a struct type") pa_result_type = pa_type[self.name_or_index].type - # TODO: Directly convert from arrow to pandas type - ibis_result_type = dtypes.arrow_dtype_to_ibis_dtype(pa_result_type) - return dtypes.ibis_dtype_to_bigframes_dtype(ibis_result_type) + return dtypes.arrow_dtype_to_bigframes_dtype(pa_result_type) @dataclasses.dataclass(frozen=True) diff --git a/bigframes/operations/strings.py b/bigframes/operations/strings.py index 883d19a1e3..22c325d7e0 100644 --- a/bigframes/operations/strings.py +++ b/bigframes/operations/strings.py @@ -247,6 +247,18 @@ def endswith( pat = (pat,) return self._apply_unary_op(ops.EndsWithOp(pat=pat)) + def split( + self, + pat: str = " ", + regex: Union[bool, None] = None, + ) -> series.Series: + if regex is True or (regex is None and len(pat) > 1): + raise NotImplementedError( + "Regular expressions aren't currently supported. Please set " + + f"`regex=False` and try again. {constants.FEEDBACK_LINK}" + ) + return self._apply_unary_op(ops.StringSplitOp(pat=pat)) + def zfill(self, width: int) -> series.Series: return self._apply_unary_op(ops.ZfillOp(width=width)) diff --git a/tests/system/small/operations/test_strings.py b/tests/system/small/operations/test_strings.py index 9654c77ec4..b8a8ad2d1e 100644 --- a/tests/system/small/operations/test_strings.py +++ b/tests/system/small/operations/test_strings.py @@ -531,3 +531,34 @@ def test_str_rjust(scalars_dfs): pd_result, bf_result, ) + + +@pytest.mark.parametrize( + ("pat", "regex"), + [ + pytest.param(" ", None, id="one_char"), + pytest.param("ll", False, id="two_chars"), + pytest.param( + " ", + True, + id="one_char_reg", + marks=pytest.mark.xfail(raises=NotImplementedError), + ), + pytest.param( + "ll", + None, + id="two_chars_reg", + marks=pytest.mark.xfail(raises=NotImplementedError), + ), + ], +) +def test_str_split_raise_errors(scalars_dfs, pat, regex): + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "string_col" + bf_result = scalars_df[col_name].str.split(pat=pat, regex=regex).to_pandas() + pd_result = scalars_pandas_df[col_name].str.split(pat=pat, regex=regex) + + # TODO(b/336880368): Allow for NULL values for ARRAY columns in BigQuery. + pd_result = pd_result.apply(lambda x: [] if pd.isnull(x) is True else x) + + assert_series_equal(pd_result, bf_result, check_dtype=False) diff --git a/third_party/bigframes_vendored/pandas/core/strings/accessor.py b/third_party/bigframes_vendored/pandas/core/strings/accessor.py index 5bb69dc1f2..b02c23f945 100644 --- a/third_party/bigframes_vendored/pandas/core/strings/accessor.py +++ b/third_party/bigframes_vendored/pandas/core/strings/accessor.py @@ -940,6 +940,54 @@ def endswith( """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def split( + self, + pat: str = " ", + regex: typing.Union[bool, None] = None, + ): + """ + Split strings around given separator/delimiter. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series( + ... [ + ... 
"a regular sentence", + ... "https://docs.python.org/index.html", + ... np.nan + ... ] + ... ) + >>> s.str.split() + 0 ['a' 'regular' 'sentence'] + 1 ['https://docs.python.org/index.html'] + 2 [] + dtype: list[pyarrow] + + The pat parameter can be used to split by other characters. + + >>> s.str.split("//", regex=False) + 0 ['a regular sentence'] + 1 ['https:' 'docs.python.org/index.html'] + 2 [] + dtype: list[pyarrow] + + Args: + pat (str, default " "): + String to split on. If not specified, split on whitespace. + regex (bool, default None): + Determines if the passed-in pattern is a regular expression. Regular + expressions aren't currently supported. Please set `regex=False` when + `pat` length is not 1. + + Returns: + bigframes.series.Series: Type matches caller. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def match(self, pat: str, case: bool = True, flags: int = 0): """ Determine if each string starts with a match of a regular expression. From c7e0eadfaa33ae2fde6e6c666b6c670258b5d643 Mon Sep 17 00:00:00 2001 From: Huan Chen <142538604+Genesis929@users.noreply.github.com> Date: Mon, 13 May 2024 10:53:20 -0700 Subject: [PATCH 16/17] chore: add logger support for properties. (#683) * chore: add logger support for properties. * update function * update label format --- bigframes/core/log_adapter.py | 33 ++++++++++++++++++++++++++ tests/unit/session/test_io_bigquery.py | 8 ++++--- 2 files changed, 38 insertions(+), 3 deletions(-) diff --git a/bigframes/core/log_adapter.py b/bigframes/core/log_adapter.py index b5afafbe7c..877e4a9fa1 100644 --- a/bigframes/core/log_adapter.py +++ b/bigframes/core/log_adapter.py @@ -30,6 +30,10 @@ def class_logger(decorated_cls): for attr_name, attr_value in decorated_cls.__dict__.items(): if callable(attr_value) and (attr_name not in _excluded_methods): setattr(decorated_cls, attr_name, method_logger(attr_value, decorated_cls)) + elif isinstance(attr_value, property): + setattr( + decorated_cls, attr_name, property_logger(attr_value, decorated_cls) + ) return decorated_cls @@ -56,6 +60,35 @@ def wrapper(*args, **kwargs): return wrapper +def property_logger(prop, decorated_cls): + """Decorator that adds logging functionality to a property.""" + + def shared_wrapper(f): + @functools.wraps(f) + def wrapped(*args, **kwargs): + class_name = decorated_cls.__name__ + property_name = f.__name__ + full_property_name = f"{class_name.lower()}-{property_name.lower()}" + + if len(_call_stack) == 0: + add_api_method(full_property_name) + + _call_stack.append(full_property_name) + try: + return f(*args, **kwargs) + finally: + _call_stack.pop() + + return wrapped + + # Apply the wrapper to the getter, setter, and deleter + return property( + shared_wrapper(prop.fget), + shared_wrapper(prop.fset) if prop.fset else None, + shared_wrapper(prop.fdel) if prop.fdel else None, + ) + + def add_api_method(api_method_name): global _lock global _api_methods diff --git a/tests/unit/session/test_io_bigquery.py b/tests/unit/session/test_io_bigquery.py index 5f4072e9c2..5a3470e883 100644 --- a/tests/unit/session/test_io_bigquery.py +++ b/tests/unit/session/test_io_bigquery.py @@ -64,6 +64,7 @@ def test_create_job_configs_labels_log_adaptor_call_method_under_length_limit(): # Test running two methods df.head() df.max() + df.columns api_methods = log_adapter._api_methods labels = io_bq.create_job_configs_labels( @@ -71,9 +72,10 @@ def 
test_create_job_configs_labels_log_adaptor_call_method_under_length_limit(): ) expected_dict = { "source": "bigquery-dataframes-temp", - "bigframes-api": "dataframe-max", - "recent-bigframes-api-0": "dataframe-head", - "recent-bigframes-api-1": "dataframe-__init__", + "bigframes-api": "dataframe-columns", + "recent-bigframes-api-0": "dataframe-max", + "recent-bigframes-api-1": "dataframe-head", + "recent-bigframes-api-2": "dataframe-__init__", } assert labels == expected_dict From 0b8b82761f94b240ea671b20732144a5481899aa Mon Sep 17 00:00:00 2001 From: "release-please[bot]" <55107282+release-please[bot]@users.noreply.github.com> Date: Mon, 13 May 2024 14:27:20 -0700 Subject: [PATCH 17/17] chore(main): release 1.6.0 (#667) Co-authored-by: release-please[bot] <55107282+release-please[bot]@users.noreply.github.com> Co-authored-by: Huan Chen <142538604+Genesis929@users.noreply.github.com> --- CHANGELOG.md | 33 +++++++++++++++++++++++++++++++++ bigframes/version.py | 2 +- 2 files changed, 34 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4457c2e443..568efa68b4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,39 @@ [1]: https://pypi.org/project/bigframes/#history +## [1.6.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v1.5.0...v1.6.0) (2024-05-13) + + +### Features + +* Add `DataFrame.__delitem__` ([#673](https://github.com/googleapis/python-bigquery-dataframes/issues/673)) ([2218c21](https://github.com/googleapis/python-bigquery-dataframes/commit/2218c21b5bb0f9e54a365ba1ada0203cbc4c9efc)) +* Add `Series.case_when()` ([#673](https://github.com/googleapis/python-bigquery-dataframes/issues/673)) ([2218c21](https://github.com/googleapis/python-bigquery-dataframes/commit/2218c21b5bb0f9e54a365ba1ada0203cbc4c9efc)) +* Add `strategy="quantile"` in KBinsDiscretizer ([#654](https://github.com/googleapis/python-bigquery-dataframes/issues/654)) ([c6c487f](https://github.com/googleapis/python-bigquery-dataframes/commit/c6c487fb3e39a980a05ff2dab5fb2b528d44016a)) +* Add Series.combine ([#680](https://github.com/googleapis/python-bigquery-dataframes/issues/680)) ([2fd1b81](https://github.com/googleapis/python-bigquery-dataframes/commit/2fd1b8117bda0dee5d8fc0924c80ce257fa9e3f1)) +* Series.str.split ([#675](https://github.com/googleapis/python-bigquery-dataframes/issues/675)) ([6eb19a7](https://github.com/googleapis/python-bigquery-dataframes/commit/6eb19a7288155b093aa7cc9bcbc710b31e7dc87a)) +* Suggest correct options in bpd.options.bigquery.location ([#666](https://github.com/googleapis/python-bigquery-dataframes/issues/666)) ([57ccabc](https://github.com/googleapis/python-bigquery-dataframes/commit/57ccabcd1402b7938e2c7068e5b4880ef018f39c)) +* Support `axis=1` in `df.apply` for scalar outputs ([#629](https://github.com/googleapis/python-bigquery-dataframes/issues/629)) ([f6bdc4a](https://github.com/googleapis/python-bigquery-dataframes/commit/f6bdc4aeb3f81a1e0b955521c04ac0dd22981c76)) +* Support gcf vpc connector in `remote_function` 
([#677](https://github.com/googleapis/python-bigquery-dataframes/issues/677)) ([9ca92d0](https://github.com/googleapis/python-bigquery-dataframes/commit/9ca92d09e9c56db408350b35ec698152c13954ed)) +* Warn with a more specific `DefaultLocationWarning` category when no location can be detected ([#648](https://github.com/googleapis/python-bigquery-dataframes/issues/648)) ([e084e54](https://github.com/googleapis/python-bigquery-dataframes/commit/e084e54557addff78522bbd710637ecb4b46d23e)) + + +### Bug Fixes + +* Include `index_col` when selecting `columns` and `filters` in `read_gbq_table` ([#648](https://github.com/googleapis/python-bigquery-dataframes/issues/648)) ([e084e54](https://github.com/googleapis/python-bigquery-dataframes/commit/e084e54557addff78522bbd710637ecb4b46d23e)) + + +### Dependencies + +* Add jellyfish as a dependency for spelling correction ([57ccabc](https://github.com/googleapis/python-bigquery-dataframes/commit/57ccabcd1402b7938e2c7068e5b4880ef018f39c)) + + +### Documentation + +* Add code snippets for llm text generatiion ([#669](https://github.com/googleapis/python-bigquery-dataframes/issues/669)) ([93416ed](https://github.com/googleapis/python-bigquery-dataframes/commit/93416ed2f8353c12eb162e21e9bf155312b0ed8c)) +* Add logistic regression samples ([#673](https://github.com/googleapis/python-bigquery-dataframes/issues/673)) ([2218c21](https://github.com/googleapis/python-bigquery-dataframes/commit/2218c21b5bb0f9e54a365ba1ada0203cbc4c9efc)) +* Address lint errors in code samples ([#665](https://github.com/googleapis/python-bigquery-dataframes/issues/665)) ([4fc8964](https://github.com/googleapis/python-bigquery-dataframes/commit/4fc89644e47a6da9367b54826b25c6abbe97327b)) +* Document inlining of small data in `read_*` APIs ([#670](https://github.com/googleapis/python-bigquery-dataframes/issues/670)) ([306953a](https://github.com/googleapis/python-bigquery-dataframes/commit/306953aaae69e57c7c2f5eefb88d55a35bdcca9d)) + ## [1.5.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v1.4.0...v1.5.0) (2024-05-07) diff --git a/bigframes/version.py b/bigframes/version.py index 5f56ef9c61..e139eaa89e 100644 --- a/bigframes/version.py +++ b/bigframes/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "1.5.0" +__version__ = "1.6.0"
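
The user-facing features landed in this series — row-wise `apply` with `axis=1` over remote functions (#629), `Series.combine` (#680), `Series.str.split` (#675), and the `cloud_function_vpc_connector` option on `remote_function` (#677) — compose naturally. The following sketch is illustrative only and mirrors the signatures added in the patches above; it assumes a configured GCP project with BigQuery access, a BigQuery connection, and permission to deploy Cloud Functions for the remote function, and the project and VPC connector names are placeholders rather than values taken from any patch.

    # Illustrative sketch (not part of any patch above): exercises the new
    # user-facing APIs from this series together. "my-gcp-project" and
    # "my-vpc-connector" are placeholder names, not real resources.
    import numpy as np

    import bigframes.pandas as bpd

    bpd.options.bigquery.project = "my-gcp-project"  # placeholder project

    df = bpd.DataFrame({"col1": [1, 2], "col2": [3, 4]})

    # Series.str.split (#675): literal separators only; regex patterns are
    # not yet supported and raise NotImplementedError.
    parts = bpd.Series(["a regular sentence"]).str.split(" ")

    # Series.combine (#680): accepts numpy binary ufuncs such as np.maximum,
    # or a remote function for arbitrary element-wise logic.
    highest = df["col1"].combine(df["col2"], np.maximum)

    # remote_function with a VPC connector (#677) plus axis=1 apply (#629):
    # the wrapped function receives each row as a Series and returns a scalar.
    @bpd.remote_function(
        bpd.Series,
        int,
        reuse=False,
        cloud_function_vpc_connector="my-vpc-connector",  # placeholder connector
    )
    def row_total(row):
        return row["col1"] + row["col2"]

    totals = df.apply(row_total, axis=1)
    print(totals.to_pandas())

The VPC connector argument only matters when the deployed cloud function needs to reach data or services on a private network; when it is omitted, `remote_function` behaves exactly as it did before this series.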