From 5a7b1c9a0d1607a46823863a6a7bc861f9c9b5af Mon Sep 17 00:00:00 2001 From: Garrett Wu <6505921+GarrettWu@users.noreply.github.com> Date: Tue, 7 May 2024 11:44:07 -0700 Subject: [PATCH 01/17] refactor: ml model load read from class type hints (#656) * refactor: ml model load read from class type hints * exclude unrelated files * fix NoneType * fix tests * fix tests * fix param mappings * fix tests --- bigframes/ml/cluster.py | 28 +++--- bigframes/ml/decomposition.py | 23 ++--- bigframes/ml/ensemble.py | 119 ++++++++++--------------- bigframes/ml/forecasting.py | 42 +++------ bigframes/ml/imported.py | 32 +++---- bigframes/ml/linear_model.py | 61 ++++--------- bigframes/ml/llm.py | 71 +++++++-------- bigframes/ml/loader.py | 4 +- bigframes/ml/utils.py | 39 +++++++- tests/system/large/ml/test_ensemble.py | 4 +- 10 files changed, 192 insertions(+), 231 deletions(-) diff --git a/bigframes/ml/cluster.py b/bigframes/ml/cluster.py index e572bb3bfb..43cfbdd424 100644 --- a/bigframes/ml/cluster.py +++ b/bigframes/ml/cluster.py @@ -34,6 +34,7 @@ "distance_type": "distanceType", "max_iter": "maxIterations", "tol": "minRelativeProgress", + "warm_start": "warmStart", } @@ -67,27 +68,18 @@ def __init__( self._bqml_model_factory = globals.bqml_model_factory() @classmethod - def _from_bq(cls, session: bigframes.Session, model: bigquery.Model) -> KMeans: - assert model.model_type == "KMEANS" + def _from_bq(cls, session: bigframes.Session, bq_model: bigquery.Model) -> KMeans: + assert bq_model.model_type == "KMEANS" kwargs: dict = {} - # See https://cloud.google.com/bigquery/docs/reference/rest/v2/models#trainingrun - last_fitting = model.training_runs[-1]["trainingOptions"] - dummy_kmeans = cls() - for bf_param, bf_value in dummy_kmeans.__dict__.items(): - bqml_param = _BQML_PARAMS_MAPPING.get(bf_param) - if bqml_param in last_fitting: - # Convert types - kwargs[bf_param] = ( - str(last_fitting[bqml_param]) - if bf_param in ["init"] - else type(bf_value)(last_fitting[bqml_param]) - ) - - new_kmeans = cls(**kwargs) - new_kmeans._bqml_model = core.BqmlModel(session, model) - return new_kmeans + kwargs = utils.retrieve_params_from_bq_model( + cls, bq_model, _BQML_PARAMS_MAPPING + ) + + model = cls(**kwargs) + model._bqml_model = core.BqmlModel(session, bq_model) + return model @property def _bqml_options(self) -> dict: diff --git a/bigframes/ml/decomposition.py b/bigframes/ml/decomposition.py index 01b1fda628..ad0bce481f 100644 --- a/bigframes/ml/decomposition.py +++ b/bigframes/ml/decomposition.py @@ -27,6 +27,8 @@ from bigframes.ml import base, core, globals, utils import bigframes.pandas as bpd +_BQML_PARAMS_MAPPING = {"svd_solver": "pcaSolver"} + @log_adapter.class_logger class PCA( @@ -47,23 +49,22 @@ def __init__( self._bqml_model_factory = globals.bqml_model_factory() @classmethod - def _from_bq(cls, session: bigframes.Session, model: bigquery.Model) -> PCA: - assert model.model_type == "PCA" + def _from_bq(cls, session: bigframes.Session, bq_model: bigquery.Model) -> PCA: + assert bq_model.model_type == "PCA" - kwargs: dict = {} + kwargs = utils.retrieve_params_from_bq_model( + cls, bq_model, _BQML_PARAMS_MAPPING + ) - # See https://cloud.google.com/bigquery/docs/reference/rest/v2/models#trainingrun - last_fitting = model.training_runs[-1]["trainingOptions"] + last_fitting = bq_model.training_runs[-1]["trainingOptions"] if "numPrincipalComponents" in last_fitting: kwargs["n_components"] = int(last_fitting["numPrincipalComponents"]) - if 
"pcaExplainedVarianceRatio" in last_fitting: + elif "pcaExplainedVarianceRatio" in last_fitting: kwargs["n_components"] = float(last_fitting["pcaExplainedVarianceRatio"]) - if "pcaSolver" in last_fitting: - kwargs["svd_solver"] = str(last_fitting["pcaSolver"]) - new_pca = cls(**kwargs) - new_pca._bqml_model = core.BqmlModel(session, model) - return new_pca + model = cls(**kwargs) + model._bqml_model = core.BqmlModel(session, bq_model) + return model @property def _bqml_options(self) -> dict: diff --git a/bigframes/ml/ensemble.py b/bigframes/ml/ensemble.py index b248c295f4..8fc1e22146 100644 --- a/bigframes/ml/ensemble.py +++ b/bigframes/ml/ensemble.py @@ -30,9 +30,10 @@ _BQML_PARAMS_MAPPING = { "booster": "boosterType", + "dart_normalized_type": "dartNormalizeType", "tree_method": "treeMethod", - "colsample_bytree": "colsampleBylevel", - "colsample_bylevel": "colsampleBytree", + "colsample_bytree": "colsampleBytree", + "colsample_bylevel": "colsampleBylevel", "colsample_bynode": "colsampleBynode", "gamma": "minSplitLoss", "subsample": "subsample", @@ -44,6 +45,8 @@ "min_tree_child_weight": "minTreeChildWeight", "max_depth": "maxTreeDepth", "max_iterations": "maxIterations", + "enable_global_explain": "enableGlobalExplain", + "xgboost_version": "xgboostVersion", } @@ -99,24 +102,17 @@ def __init__( @classmethod def _from_bq( - cls, session: bigframes.Session, model: bigquery.Model + cls, session: bigframes.Session, bq_model: bigquery.Model ) -> XGBRegressor: - assert model.model_type == "BOOSTED_TREE_REGRESSOR" + assert bq_model.model_type == "BOOSTED_TREE_REGRESSOR" - kwargs = {} - - # See https://cloud.google.com/bigquery/docs/reference/rest/v2/models#trainingrun - last_fitting = model.training_runs[-1]["trainingOptions"] - - dummy_regressor = cls() - for bf_param, bf_value in dummy_regressor.__dict__.items(): - bqml_param = _BQML_PARAMS_MAPPING.get(bf_param) - if bqml_param in last_fitting: - kwargs[bf_param] = type(bf_value)(last_fitting[bqml_param]) + kwargs = utils.retrieve_params_from_bq_model( + cls, bq_model, _BQML_PARAMS_MAPPING + ) - new_xgb_regressor = cls(**kwargs) - new_xgb_regressor._bqml_model = core.BqmlModel(session, model) - return new_xgb_regressor + model = cls(**kwargs) + model._bqml_model = core.BqmlModel(session, bq_model) + return model @property def _bqml_options(self) -> Dict[str, str | int | bool | float | List[str]]: @@ -255,24 +251,17 @@ def __init__( @classmethod def _from_bq( - cls, session: bigframes.Session, model: bigquery.Model + cls, session: bigframes.Session, bq_model: bigquery.Model ) -> XGBClassifier: - assert model.model_type == "BOOSTED_TREE_CLASSIFIER" + assert bq_model.model_type == "BOOSTED_TREE_CLASSIFIER" - kwargs = {} - - # See https://cloud.google.com/bigquery/docs/reference/rest/v2/models#trainingrun - last_fitting = model.training_runs[-1]["trainingOptions"] - - dummy_classifier = XGBClassifier() - for bf_param, bf_value in dummy_classifier.__dict__.items(): - bqml_param = _BQML_PARAMS_MAPPING.get(bf_param) - if bqml_param is not None: - kwargs[bf_param] = type(bf_value)(last_fitting[bqml_param]) + kwargs = utils.retrieve_params_from_bq_model( + cls, bq_model, _BQML_PARAMS_MAPPING + ) - new_xgb_classifier = cls(**kwargs) - new_xgb_classifier._bqml_model = core.BqmlModel(session, model) - return new_xgb_classifier + model = cls(**kwargs) + model._bqml_model = core.BqmlModel(session, bq_model) + return model @property def _bqml_options(self) -> Dict[str, str | int | bool | float | 
List[str]]: @@ -370,16 +359,16 @@ def __init__( *, tree_method: Literal["auto", "exact", "approx", "hist"] = "auto", min_tree_child_weight: int = 1, - colsample_bytree=1.0, - colsample_bylevel=1.0, - colsample_bynode=0.8, - gamma=0.00, + colsample_bytree: float = 1.0, + colsample_bylevel: float = 1.0, + colsample_bynode: float = 0.8, + gamma: float = 0.0, max_depth: int = 15, - subsample=0.8, - reg_alpha=0.0, - reg_lambda=1.0, - tol=0.01, - enable_global_explain=False, + subsample: float = 0.8, + reg_alpha: float = 0.0, + reg_lambda: float = 1.0, + tol: float = 0.01, + enable_global_explain: bool = False, xgboost_version: Literal["0.9", "1.1"] = "0.9", ): self.n_estimators = n_estimators @@ -401,24 +390,17 @@ def __init__( @classmethod def _from_bq( - cls, session: bigframes.Session, model: bigquery.Model + cls, session: bigframes.Session, bq_model: bigquery.Model ) -> RandomForestRegressor: - assert model.model_type == "RANDOM_FOREST_REGRESSOR" - - kwargs = {} - - # See https://cloud.google.com/bigquery/docs/reference/rest/v2/models#trainingrun - last_fitting = model.training_runs[-1]["trainingOptions"] + assert bq_model.model_type == "RANDOM_FOREST_REGRESSOR" - dummy_model = cls() - for bf_param, bf_value in dummy_model.__dict__.items(): - bqml_param = _BQML_PARAMS_MAPPING.get(bf_param) - if bqml_param in last_fitting: - kwargs[bf_param] = type(bf_value)(last_fitting[bqml_param]) + kwargs = utils.retrieve_params_from_bq_model( + cls, bq_model, _BQML_PARAMS_MAPPING + ) - new_random_forest_regressor = cls(**kwargs) - new_random_forest_regressor._bqml_model = core.BqmlModel(session, model) - return new_random_forest_regressor + model = cls(**kwargs) + model._bqml_model = core.BqmlModel(session, bq_model) + return model @property def _bqml_options(self) -> Dict[str, str | int | bool | float | List[str]]: @@ -542,7 +524,7 @@ def __init__( reg_alpha: float = 0.0, reg_lambda: float = 1.0, tol: float = 0.01, - enable_global_explain=False, + enable_global_explain: bool = False, xgboost_version: Literal["0.9", "1.1"] = "0.9", ): self.n_estimators = n_estimators @@ -564,24 +546,17 @@ def __init__( @classmethod def _from_bq( - cls, session: bigframes.Session, model: bigquery.Model + cls, session: bigframes.Session, bq_model: bigquery.Model ) -> RandomForestClassifier: - assert model.model_type == "RANDOM_FOREST_CLASSIFIER" - - kwargs = {} + assert bq_model.model_type == "RANDOM_FOREST_CLASSIFIER" - # See https://cloud.google.com/bigquery/docs/reference/rest/v2/models#trainingrun - last_fitting = model.training_runs[-1]["trainingOptions"] - - dummy_model = RandomForestClassifier() - for bf_param, bf_value in dummy_model.__dict__.items(): - bqml_param = _BQML_PARAMS_MAPPING.get(bf_param) - if bqml_param is not None: - kwargs[bf_param] = type(bf_value)(last_fitting[bqml_param]) + kwargs = utils.retrieve_params_from_bq_model( + cls, bq_model, _BQML_PARAMS_MAPPING + ) - new_random_forest_classifier = cls(**kwargs) - new_random_forest_classifier._bqml_model = core.BqmlModel(session, model) - return new_random_forest_classifier + model = cls(**kwargs) + model._bqml_model = core.BqmlModel(session, bq_model) + return model @property def _bqml_options(self) -> Dict[str, str | int | bool | float | List[str]]: diff --git a/bigframes/ml/forecasting.py b/bigframes/ml/forecasting.py index 783e7741b8..a87df61801 100644 --- a/bigframes/ml/forecasting.py +++ b/bigframes/ml/forecasting.py @@ -32,6 +32,7 @@ "auto_arima_min_order": "autoArimaMinOrder", 
"order": "nonSeasonalOrder", "data_frequency": "dataFrequency", + "include_drift": "includeDrift", "holiday_region": "holidayRegion", "clean_spikes_and_dips": "cleanSpikesAndDips", "adjust_step_changes": "adjustStepChanges", @@ -131,35 +132,18 @@ def __init__( self._bqml_model_factory = globals.bqml_model_factory() @classmethod - def _from_bq(cls, session: bigframes.Session, model: bigquery.Model) -> ARIMAPlus: - assert model.model_type == "ARIMA_PLUS" - - kwargs: dict = {} - last_fitting = model.training_runs[-1]["trainingOptions"] - - dummy_arima = cls() - for bf_param, bf_value in dummy_arima.__dict__.items(): - bqml_param = _BQML_PARAMS_MAPPING.get(bf_param) - if bqml_param in last_fitting: - # Convert types - if bf_param in ["time_series_length_fraction"]: - kwargs[bf_param] = float(last_fitting[bqml_param]) - elif bf_param in [ - "auto_arima_max_order", - "auto_arima_min_order", - "min_time_series_length", - "max_time_series_length", - "trend_smoothing_window_size", - ]: - kwargs[bf_param] = int(last_fitting[bqml_param]) - elif bf_param in ["holiday_region"]: - kwargs[bf_param] = str(last_fitting[bqml_param]) - else: - kwargs[bf_param] = type(bf_value)(last_fitting[bqml_param]) - - new_arima_plus = cls(**kwargs) - new_arima_plus._bqml_model = core.BqmlModel(session, model) - return new_arima_plus + def _from_bq( + cls, session: bigframes.Session, bq_model: bigquery.Model + ) -> ARIMAPlus: + assert bq_model.model_type == "ARIMA_PLUS" + + kwargs = utils.retrieve_params_from_bq_model( + cls, bq_model, _BQML_PARAMS_MAPPING + ) + + model = cls(**kwargs) + model._bqml_model = core.BqmlModel(session, bq_model) + return model @property def _bqml_options(self) -> dict: diff --git a/bigframes/ml/imported.py b/bigframes/ml/imported.py index a642fae74d..cb8fe7a96e 100644 --- a/bigframes/ml/imported.py +++ b/bigframes/ml/imported.py @@ -56,13 +56,13 @@ def _create_bqml_model(self): @classmethod def _from_bq( - cls, session: bigframes.Session, model: bigquery.Model + cls, session: bigframes.Session, bq_model: bigquery.Model ) -> TensorFlowModel: - assert model.model_type == "TENSORFLOW" + assert bq_model.model_type == "TENSORFLOW" - tf_model = cls(session=session, model_path="") - tf_model._bqml_model = core.BqmlModel(session, model) - return tf_model + model = cls(session=session, model_path="") + model._bqml_model = core.BqmlModel(session, bq_model) + return model def predict(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: """Predict the result from input DataFrame. @@ -134,12 +134,14 @@ def _create_bqml_model(self): ) @classmethod - def _from_bq(cls, session: bigframes.Session, model: bigquery.Model) -> ONNXModel: - assert model.model_type == "ONNX" + def _from_bq( + cls, session: bigframes.Session, bq_model: bigquery.Model + ) -> ONNXModel: + assert bq_model.model_type == "ONNX" - onnx_model = cls(session=session, model_path="") - onnx_model._bqml_model = core.BqmlModel(session, model) - return onnx_model + model = cls(session=session, model_path="") + model._bqml_model = core.BqmlModel(session, bq_model) + return model def predict(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: """Predict the result from input DataFrame. 
@@ -249,13 +251,13 @@ def _create_bqml_model(self): @classmethod def _from_bq( - cls, session: bigframes.Session, model: bigquery.Model + cls, session: bigframes.Session, bq_model: bigquery.Model ) -> XGBoostModel: - assert model.model_type == "XGBOOST" + assert bq_model.model_type == "XGBOOST" - xgboost_model = cls(session=session, model_path="") - xgboost_model._bqml_model = core.BqmlModel(session, model) - return xgboost_model + model = cls(session=session, model_path="") + model._bqml_model = core.BqmlModel(session, bq_model) + return model def predict(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: """Predict the result from input DataFrame. diff --git a/bigframes/ml/linear_model.py b/bigframes/ml/linear_model.py index 0c76a39a1c..32168e9a34 100644 --- a/bigframes/ml/linear_model.py +++ b/bigframes/ml/linear_model.py @@ -42,7 +42,6 @@ "warm_start": "warmStart", "calculate_p_values": "calculatePValues", "enable_global_explain": "enableGlobalExplain", - "category_encoding_method": "categoryEncodingMethod", } @@ -88,30 +87,17 @@ def __init__( @classmethod def _from_bq( - cls, session: bigframes.Session, model: bigquery.Model + cls, session: bigframes.Session, bq_model: bigquery.Model ) -> LinearRegression: - assert model.model_type == "LINEAR_REGRESSION" + assert bq_model.model_type == "LINEAR_REGRESSION" - # TODO(bmil): construct a standard way to extract these properties - kwargs = {} - - # See https://cloud.google.com/bigquery/docs/reference/rest/v2/models#trainingrun - last_fitting = model.training_runs[-1]["trainingOptions"] - - dummy_linear = cls() - for bf_param, bf_value in dummy_linear.__dict__.items(): - bqml_param = _BQML_PARAMS_MAPPING.get(bf_param) - if bqml_param in last_fitting: - # Convert types - kwargs[bf_param] = ( - float(last_fitting[bqml_param]) - if bf_param in ["l1_reg", "learning_rate", "ls_init_learning_rate"] - else type(bf_value)(last_fitting[bqml_param]) - ) + kwargs = utils.retrieve_params_from_bq_model( + cls, bq_model, _BQML_PARAMS_MAPPING + ) - new_linear_regression = cls(**kwargs) - new_linear_regression._bqml_model = core.BqmlModel(session, model) - return new_linear_regression + model = cls(**kwargs) + model._bqml_model = core.BqmlModel(session, bq_model) + return model @property def _bqml_options(self) -> dict: @@ -243,33 +229,24 @@ def __init__( @classmethod def _from_bq( - cls, session: bigframes.Session, model: bigquery.Model + cls, session: bigframes.Session, bq_model: bigquery.Model ) -> LogisticRegression: - assert model.model_type == "LOGISTIC_REGRESSION" - - kwargs = {} - - # See https://cloud.google.com/bigquery/docs/reference/rest/v2/models#trainingrun - last_fitting = model.training_runs[-1]["trainingOptions"] - dummy_logistic = cls() - for bf_param, bf_value in dummy_logistic.__dict__.items(): - bqml_param = _BQML_PARAMS_MAPPING.get(bf_param) - if bqml_param in last_fitting: - # Convert types - kwargs[bf_param] = ( - float(last_fitting[bqml_param]) - if bf_param in ["l1_reg", "learning_rate", "ls_init_learning_rate"] - else type(bf_value)(last_fitting[bqml_param]) - ) + assert bq_model.model_type == "LOGISTIC_REGRESSION" + + kwargs = utils.retrieve_params_from_bq_model( + cls, bq_model, _BQML_PARAMS_MAPPING + ) + + last_fitting = bq_model.training_runs[-1]["trainingOptions"] if last_fitting["autoClassWeights"]: kwargs["class_weight"] = "balanced" # TODO(ashleyxu) support class_weight in the constructor. 
# if "labelClassWeights" in last_fitting: # kwargs["class_weight"] = last_fitting["labelClassWeights"] - new_logistic_regression = cls(**kwargs) - new_logistic_regression._bqml_model = core.BqmlModel(session, model) - return new_logistic_regression + model = cls(**kwargs) + model._bqml_model = core.BqmlModel(session, bq_model) + return model @property def _bqml_options(self) -> dict: diff --git a/bigframes/ml/llm.py b/bigframes/ml/llm.py index 4b07524194..77dc1d2b0f 100644 --- a/bigframes/ml/llm.py +++ b/bigframes/ml/llm.py @@ -128,38 +128,30 @@ def _create_bqml_model(self): @classmethod def _from_bq( - cls, session: bigframes.Session, model: bigquery.Model + cls, session: bigframes.Session, bq_model: bigquery.Model ) -> PaLM2TextGenerator: - assert model.model_type == "MODEL_TYPE_UNSPECIFIED" - assert "remoteModelInfo" in model._properties - assert "endpoint" in model._properties["remoteModelInfo"] - assert "connection" in model._properties["remoteModelInfo"] + assert bq_model.model_type == "MODEL_TYPE_UNSPECIFIED" + assert "remoteModelInfo" in bq_model._properties + assert "endpoint" in bq_model._properties["remoteModelInfo"] + assert "connection" in bq_model._properties["remoteModelInfo"] # Parse the remote model endpoint - bqml_endpoint = model._properties["remoteModelInfo"]["endpoint"] - model_connection = model._properties["remoteModelInfo"]["connection"] + bqml_endpoint = bq_model._properties["remoteModelInfo"]["endpoint"] + model_connection = bq_model._properties["remoteModelInfo"]["connection"] model_endpoint = bqml_endpoint.split("/")[-1] - # Get the optional params - kwargs: dict = {} - last_fitting = model.training_runs[-1]["trainingOptions"] - - dummy_text_generator = cls(session=session) - for bf_param, _ in dummy_text_generator.__dict__.items(): - bqml_param = _BQML_PARAMS_MAPPING.get(bf_param) - if bqml_param in last_fitting: - # Convert types - if bf_param in ["max_iterations"]: - kwargs[bf_param] = int(last_fitting[bqml_param]) + kwargs = utils.retrieve_params_from_bq_model( + cls, bq_model, _BQML_PARAMS_MAPPING + ) - text_generator_model = cls( + model = cls( **kwargs, session=session, model_name=model_endpoint, connection_name=model_connection, ) - text_generator_model._bqml_model = core.BqmlModel(session, model) - return text_generator_model + model._bqml_model = core.BqmlModel(session, bq_model) + return model @property def _bqml_options(self) -> dict: @@ -464,29 +456,30 @@ def _create_bqml_model(self): @classmethod def _from_bq( - cls, session: bigframes.Session, model: bigquery.Model + cls, session: bigframes.Session, bq_model: bigquery.Model ) -> PaLM2TextEmbeddingGenerator: - assert model.model_type == "MODEL_TYPE_UNSPECIFIED" - assert "remoteModelInfo" in model._properties - assert "endpoint" in model._properties["remoteModelInfo"] - assert "connection" in model._properties["remoteModelInfo"] + assert bq_model.model_type == "MODEL_TYPE_UNSPECIFIED" + assert "remoteModelInfo" in bq_model._properties + assert "endpoint" in bq_model._properties["remoteModelInfo"] + assert "connection" in bq_model._properties["remoteModelInfo"] # Parse the remote model endpoint - bqml_endpoint = model._properties["remoteModelInfo"]["endpoint"] - model_connection = model._properties["remoteModelInfo"]["connection"] + bqml_endpoint = bq_model._properties["remoteModelInfo"]["endpoint"] + model_connection = bq_model._properties["remoteModelInfo"]["connection"] model_endpoint = bqml_endpoint.split("/")[-1] model_name, version = utils.parse_model_endpoint(model_endpoint) - 
embedding_generator_model = cls( + model = cls( session=session, # str to literals model_name=model_name, # type: ignore version=version, connection_name=model_connection, ) - embedding_generator_model._bqml_model = core.BqmlModel(session, model) - return embedding_generator_model + + model._bqml_model = core.BqmlModel(session, bq_model) + return model def predict(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: """Predict the result from input DataFrame. @@ -616,18 +609,18 @@ def _create_bqml_model(self): @classmethod def _from_bq( - cls, session: bigframes.Session, model: bigquery.Model + cls, session: bigframes.Session, bq_model: bigquery.Model ) -> GeminiTextGenerator: - assert model.model_type == "MODEL_TYPE_UNSPECIFIED" - assert "remoteModelInfo" in model._properties - assert "connection" in model._properties["remoteModelInfo"] + assert bq_model.model_type == "MODEL_TYPE_UNSPECIFIED" + assert "remoteModelInfo" in bq_model._properties + assert "connection" in bq_model._properties["remoteModelInfo"] # Parse the remote model endpoint - model_connection = model._properties["remoteModelInfo"]["connection"] + model_connection = bq_model._properties["remoteModelInfo"]["connection"] - text_generator_model = cls(session=session, connection_name=model_connection) - text_generator_model._bqml_model = core.BqmlModel(session, model) - return text_generator_model + model = cls(session=session, connection_name=model_connection) + model._bqml_model = core.BqmlModel(session, bq_model) + return model def predict( self, diff --git a/bigframes/ml/loader.py b/bigframes/ml/loader.py index c6e38e6534..916949077f 100644 --- a/bigframes/ml/loader.py +++ b/bigframes/ml/loader.py @@ -121,7 +121,7 @@ def _transformer_from_bq(session: bigframes.Session, bq_model: bigquery.Model): def _model_from_bq(session: bigframes.Session, bq_model: bigquery.Model): if bq_model.model_type in _BQML_MODEL_TYPE_MAPPING: return _BQML_MODEL_TYPE_MAPPING[bq_model.model_type]._from_bq( # type: ignore - session=session, model=bq_model + session=session, bq_model=bq_model ) if _is_bq_model_remote(bq_model): # Parse the remote model endpoint @@ -130,7 +130,7 @@ def _model_from_bq(session: bigframes.Session, bq_model: bigquery.Model): model_name, _ = utils.parse_model_endpoint(model_endpoint) return _BQML_ENDPOINT_TYPE_MAPPING[model_name]._from_bq( # type: ignore - session=session, model=bq_model + session=session, bq_model=bq_model ) raise NotImplementedError( diff --git a/bigframes/ml/utils.py b/bigframes/ml/utils.py index 364fb5e88d..75dfb916f6 100644 --- a/bigframes/ml/utils.py +++ b/bigframes/ml/utils.py @@ -13,7 +13,9 @@ # limitations under the License. import typing -from typing import Iterable, Optional, Union +from typing import Any, Iterable, Literal, Mapping, Optional, Union + +from google.cloud import bigquery import bigframes.constants as constants from bigframes.core import blocks @@ -69,3 +71,38 @@ def parse_model_endpoint(model_endpoint: str) -> tuple[str, Optional[str]]: model_name = model_endpoint[:at_idx] return model_name, version + + +def _resolve_param_type(t: type) -> type: + def is_optional(t): + return typing.get_origin(t) is Union and type(None) in typing.get_args(t) + + # Optional[type] to type + if is_optional(t): + union_set = set(typing.get_args(t)) + union_set.remove(type(None)) + t = Union[tuple(union_set)] # type: ignore + + # Literal[value0, value1...] 
to type(value0) + if typing.get_origin(t) is Literal: + return type(typing.get_args(t)[0]) + + return t + + +def retrieve_params_from_bq_model( + cls, bq_model: bigquery.Model, params_mapping: Mapping[str, str] +) -> dict[str, Any]: + """Retrieve parameters of class constructor from BQ model. params_mapping specifies the names mapping param_name -> bqml_name. Params couldn't be found will be ignored.""" + kwargs = {} + + # See https://cloud.google.com/bigquery/docs/reference/rest/v2/models#trainingrun + last_fitting = bq_model.training_runs[-1]["trainingOptions"] + + for bf_param, bf_param_type in typing.get_type_hints(cls.__init__).items(): + bqml_param = params_mapping.get(bf_param) + if bqml_param in last_fitting: + bf_param_type = _resolve_param_type(bf_param_type) + kwargs[bf_param] = bf_param_type(last_fitting[bqml_param]) + + return kwargs diff --git a/tests/system/large/ml/test_ensemble.py b/tests/system/large/ml/test_ensemble.py index 2260e7bbce..3d1fcaf41c 100644 --- a/tests/system/large/ml/test_ensemble.py +++ b/tests/system/large/ml/test_ensemble.py @@ -123,7 +123,7 @@ def test_xgbregressor_dart_booster_multiple_params( in reloaded_model._bqml_model.model_name ) assert reloaded_model.booster == "DART" - assert reloaded_model.dart_normalized_type == "tree" + assert reloaded_model.dart_normalized_type == "TREE" assert reloaded_model.tree_method == "AUTO" assert reloaded_model.colsample_bytree == 0.95 assert reloaded_model.colsample_bylevel == 0.95 @@ -236,7 +236,7 @@ def test_xgbclassifier_dart_booster_multiple_params( in reloaded_model._bqml_model.model_name ) assert reloaded_model.booster == "DART" - assert reloaded_model.dart_normalized_type == "tree" + assert reloaded_model.dart_normalized_type == "TREE" assert reloaded_model.tree_method == "AUTO" assert reloaded_model.colsample_bytree == 0.95 assert reloaded_model.colsample_bylevel == 0.95 From c6c487fb3e39a980a05ff2dab5fb2b528d44016a Mon Sep 17 00:00:00 2001 From: Ashley Xu <139821907+ashleyxuu@users.noreply.github.com> Date: Tue, 7 May 2024 13:24:17 -0700 Subject: [PATCH 02/17] feat: add `strategy="quantile"` in KBinsDiscretizer (#654) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! 
That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes internal #310685445 🦕 --- bigframes/ml/compose.py | 1 + bigframes/ml/preprocessing.py | 51 +++++++++++----- bigframes/ml/sql.py | 11 +++- tests/system/small/ml/test_preprocessing.py | 58 +++++++++++++++++++ tests/unit/ml/test_sql.py | 7 +++ .../sklearn/preprocessing/_discretization.py | 2 +- 6 files changed, 112 insertions(+), 18 deletions(-) diff --git a/bigframes/ml/compose.py b/bigframes/ml/compose.py index 89969f23e7..77bfd76bde 100644 --- a/bigframes/ml/compose.py +++ b/bigframes/ml/compose.py @@ -38,6 +38,7 @@ "ML.MAX_ABS_SCALER": preprocessing.MaxAbsScaler, "ML.MIN_MAX_SCALER": preprocessing.MinMaxScaler, "ML.BUCKETIZE": preprocessing.KBinsDiscretizer, + "ML.QUANTILE_BUCKETIZE": preprocessing.KBinsDiscretizer, "ML.LABEL_ENCODER": preprocessing.LabelEncoder, } ) diff --git a/bigframes/ml/preprocessing.py b/bigframes/ml/preprocessing.py index 673ee27db0..954d5adff0 100644 --- a/bigframes/ml/preprocessing.py +++ b/bigframes/ml/preprocessing.py @@ -290,10 +290,6 @@ def __init__( n_bins: int = 5, strategy: Literal["uniform", "quantile"] = "quantile", ): - if strategy != "uniform": - raise NotImplementedError( - f"Only strategy = 'uniform' is supported now, input is {strategy}." - ) if n_bins < 2: raise ValueError( f"n_bins has to be larger than or equal to 2, input is {n_bins}." @@ -337,30 +333,53 @@ def _compile_to_sql( min_value + i * bin_size for i in range(self.n_bins - 1) ] - return [ - ( - self._base_sql_generator.ml_bucketize( - column, array_split_points[column], f"kbinsdiscretizer_{column}" - ), - f"kbinsdiscretizer_{column}", + return [ + ( + self._base_sql_generator.ml_bucketize( + column, array_split_points[column], f"kbinsdiscretizer_{column}" + ), + f"kbinsdiscretizer_{column}", + ) + for column in columns + ] + + elif self.strategy == "quantile": + + return [ + ( + self._base_sql_generator.ml_quantile_bucketize( + column, self.n_bins, f"kbinsdiscretizer_{column}" + ), + f"kbinsdiscretizer_{column}", + ) + for column in columns + ] + + else: + raise ValueError( + f"strategy should be set 'quantile' or 'uniform', but your input is {self.strategy}." ) - for column in columns - ] @classmethod def _parse_from_sql(cls, sql: str) -> tuple[KBinsDiscretizer, str]: """Parse SQL to tuple(KBinsDiscretizer, column_label). 
Args: - sql: SQL string of format "ML.BUCKETIZE({col_label}, array_split_points, FALSE) OVER()" + sql: SQL string of format "ML.BUCKETIZE({col_label}, array_split_points, FALSE)" + or ML.QUANTILE_BUCKETIZE({col_label}, num_bucket) OVER()" Returns: tuple(KBinsDiscretizer, column_label)""" s = sql[sql.find("(") + 1 : sql.find(")")] - array_split_points = s[s.find("[") + 1 : s.find("]")] col_label = s[: s.find(",")] - n_bins = array_split_points.count(",") + 2 - return cls(n_bins, "uniform"), col_label + + if sql.startswith("ML.QUANTILE_BUCKETIZE"): + num_bins = s.split(",")[1] + return cls(int(num_bins), "quantile"), col_label + else: + array_split_points = s[s.find("[") + 1 : s.find("]")] + n_bins = array_split_points.count(",") + 2 + return cls(n_bins, "uniform"), col_label def fit( self, diff --git a/bigframes/ml/sql.py b/bigframes/ml/sql.py index ea693e3437..b701ab301c 100644 --- a/bigframes/ml/sql.py +++ b/bigframes/ml/sql.py @@ -109,9 +109,18 @@ def ml_bucketize( array_split_points: Iterable[Union[int, float]], name: str, ) -> str: - """Encode ML.MIN_MAX_SCALER for BQML""" + """Encode ML.BUCKETIZE for BQML""" return f"""ML.BUCKETIZE({numeric_expr_sql}, {array_split_points}, FALSE) AS {name}""" + def ml_quantile_bucketize( + self, + numeric_expr_sql: str, + num_bucket: int, + name: str, + ) -> str: + """Encode ML.QUANTILE_BUCKETIZE for BQML""" + return f"""ML.QUANTILE_BUCKETIZE({numeric_expr_sql}, {num_bucket}) OVER() AS {name}""" + def ml_one_hot_encoder( self, numeric_expr_sql: str, diff --git a/tests/system/small/ml/test_preprocessing.py b/tests/system/small/ml/test_preprocessing.py index faa0cd7bbd..5b457cc9c0 100644 --- a/tests/system/small/ml/test_preprocessing.py +++ b/tests/system/small/ml/test_preprocessing.py @@ -373,6 +373,27 @@ def test_k_bins_discretizer_normalized_fit_transform_default_params(new_penguins pd.testing.assert_frame_equal(result, expected, rtol=0.1) +def test_k_bins_discretizer_normalized_fit_transform_default_params_quantile( + new_penguins_df, +): + discretizer = preprocessing.KBinsDiscretizer(strategy="quantile") + result = discretizer.fit_transform( + new_penguins_df[["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"]] + ).to_pandas() + + expected = pd.DataFrame( + { + "kbinsdiscretizer_culmen_length_mm": ["bin_2", "bin_2", "bin_1"], + "kbinsdiscretizer_culmen_depth_mm": ["bin_2", "bin_1", "bin_2"], + "kbinsdiscretizer_flipper_length_mm": ["bin_2", "bin_1", "bin_2"], + }, + dtype="string[pyarrow]", + index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), + ) + + pd.testing.assert_frame_equal(result, expected, rtol=0.1) + + def test_k_bins_discretizer_series_normalizes( penguins_df_default_index, new_penguins_df ): @@ -395,6 +416,28 @@ def test_k_bins_discretizer_series_normalizes( pd.testing.assert_frame_equal(result, expected, rtol=0.1) +def test_k_bins_discretizer_series_normalizes_quantile( + penguins_df_default_index, new_penguins_df +): + discretizer = preprocessing.KBinsDiscretizer(strategy="quantile") + discretizer.fit(penguins_df_default_index["culmen_length_mm"]) + + result = discretizer.transform( + penguins_df_default_index["culmen_length_mm"] + ).to_pandas() + result = discretizer.transform(new_penguins_df).to_pandas() + + expected = pd.DataFrame( + { + "kbinsdiscretizer_culmen_length_mm": ["bin_2", "bin_2", "bin_1"], + }, + dtype="string[pyarrow]", + index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), + ) + + pd.testing.assert_frame_equal(result, expected, rtol=0.1) + + def 
test_k_bins_discretizer_normalizes(penguins_df_default_index, new_penguins_df): # TODO(http://b/292431644): add a second test that compares output to sklearn.preprocessing.KBinsDiscretizer, when BQML's change is in prod. discretizer = preprocessing.KBinsDiscretizer(strategy="uniform") @@ -488,6 +531,21 @@ def test_k_bins_discretizer_save_load(new_penguins_df, dataset_id): pd.testing.assert_frame_equal(result, expected, rtol=0.1) +def test_k_bins_discretizer_save_load_quantile(new_penguins_df, dataset_id): + transformer = preprocessing.KBinsDiscretizer(n_bins=6, strategy="quantile") + transformer.fit( + new_penguins_df[["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"]] + ) + + reloaded_transformer = transformer.to_gbq( + f"{dataset_id}.temp_configured_model", replace=True + ) + assert isinstance(reloaded_transformer, preprocessing.KBinsDiscretizer) + assert reloaded_transformer.n_bins == transformer.n_bins + assert reloaded_transformer.strategy == transformer.strategy + assert reloaded_transformer._bqml_model is not None + + def test_one_hot_encoder_default_params(new_penguins_df): encoder = preprocessing.OneHotEncoder() encoder.fit(new_penguins_df[["species", "sex"]]) diff --git a/tests/unit/ml/test_sql.py b/tests/unit/ml/test_sql.py index 4dd90b2c4a..07b247fb41 100644 --- a/tests/unit/ml/test_sql.py +++ b/tests/unit/ml/test_sql.py @@ -113,6 +113,13 @@ def test_k_bins_discretizer_correct( assert sql == "ML.BUCKETIZE(col_a, [1, 2, 3, 4], FALSE) AS scaled_col_a" +def test_k_bins_discretizer_quantile_correct( + base_sql_generator: ml_sql.BaseSqlGenerator, +): + sql = base_sql_generator.ml_quantile_bucketize("col_a", 5, "scaled_col_a") + assert sql == "ML.QUANTILE_BUCKETIZE(col_a, 5) OVER() AS scaled_col_a" + + def test_one_hot_encoder_correct( base_sql_generator: ml_sql.BaseSqlGenerator, ): diff --git a/third_party/bigframes_vendored/sklearn/preprocessing/_discretization.py b/third_party/bigframes_vendored/sklearn/preprocessing/_discretization.py index 98b9d0371f..54c81af71d 100644 --- a/third_party/bigframes_vendored/sklearn/preprocessing/_discretization.py +++ b/third_party/bigframes_vendored/sklearn/preprocessing/_discretization.py @@ -18,7 +18,7 @@ class KBinsDiscretizer(TransformerMixin, BaseEstimator): strategy ({'uniform', 'quantile'}, default='quantile'): Strategy used to define the widths of the bins. 'uniform': All bins in each feature have identical widths. 'quantile': All bins in each - feature have the same number of points. Only `uniform` is supported. + feature have the same number of points. 
""" def fit(self, X, y=None): From 4fc89644e47a6da9367b54826b25c6abbe97327b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Wed, 8 May 2024 12:03:54 -0500 Subject: [PATCH 03/17] docs: address lint errors in code samples (#665) Towards internal issue 332735129 test: move samples tests to their own kokoro jobs --- .kokoro/continuous/e2e.cfg | 2 +- .kokoro/presubmit/e2e-gerrit.cfg | 2 +- .kokoro/presubmit/e2e.cfg | 2 +- noxfile.py | 20 -- owlbot.py | 6 +- samples/snippets/bqml_getting_started_test.py | 2 +- samples/snippets/clustering_model_test.py | 2 +- samples/snippets/conftest.py | 2 +- samples/snippets/create_kmeans_model_test.py | 8 +- ...e_multiple_timeseries_forecasting_model.py | 2 +- ...ingle_timeseries_forecasting_model_test.py | 3 +- samples/snippets/explore_query_result_test.py | 2 +- samples/snippets/gemini_model_test.py | 2 +- samples/snippets/gen_ai_model_test.py | 2 +- .../snippets/load_data_from_bigquery_test.py | 2 +- .../load_data_from_biquery_job_test.py | 2 +- samples/snippets/load_data_from_csv_test.py | 2 +- samples/snippets/noxfile.py | 292 ++++++++++++++++++ samples/snippets/noxfile_config.py | 42 +++ samples/snippets/pandas_methods_test.py | 2 +- samples/snippets/quickstart.py | 2 +- samples/snippets/regression_model_test.py | 2 +- samples/snippets/remote_function.py | 6 +- samples/snippets/remote_function_test.py | 31 ++ samples/snippets/requirements-test.txt | 3 + samples/snippets/requirements.txt | 2 + samples/snippets/set_options_test.py | 2 +- 27 files changed, 401 insertions(+), 46 deletions(-) create mode 100644 samples/snippets/noxfile.py create mode 100644 samples/snippets/noxfile_config.py create mode 100644 samples/snippets/requirements-test.txt create mode 100644 samples/snippets/requirements.txt diff --git a/.kokoro/continuous/e2e.cfg b/.kokoro/continuous/e2e.cfg index 7479346590..774b63313e 100644 --- a/.kokoro/continuous/e2e.cfg +++ b/.kokoro/continuous/e2e.cfg @@ -3,7 +3,7 @@ # Only run this nox session. env_vars: { key: "NOX_SESSION" - value: "unit_prerelease system_prerelease system_noextras e2e notebook samples" + value: "unit_prerelease system_prerelease system_noextras e2e notebook" } env_vars: { diff --git a/.kokoro/presubmit/e2e-gerrit.cfg b/.kokoro/presubmit/e2e-gerrit.cfg index d875f36060..19913344b6 100644 --- a/.kokoro/presubmit/e2e-gerrit.cfg +++ b/.kokoro/presubmit/e2e-gerrit.cfg @@ -3,5 +3,5 @@ # Only run this nox session. env_vars: { key: "NOX_SESSION" - value: "system_noextras e2e notebook samples" + value: "system_noextras e2e notebook" } diff --git a/.kokoro/presubmit/e2e.cfg b/.kokoro/presubmit/e2e.cfg index 7479346590..774b63313e 100644 --- a/.kokoro/presubmit/e2e.cfg +++ b/.kokoro/presubmit/e2e.cfg @@ -3,7 +3,7 @@ # Only run this nox session. env_vars: { key: "NOX_SESSION" - value: "unit_prerelease system_prerelease system_noextras e2e notebook samples" + value: "unit_prerelease system_prerelease system_noextras e2e notebook" } env_vars: { diff --git a/noxfile.py b/noxfile.py index 91ad6bc0e6..af73495a7f 100644 --- a/noxfile.py +++ b/noxfile.py @@ -402,26 +402,6 @@ def load(session: nox.sessions.Session): ) -@nox.session(python=SYSTEM_TEST_PYTHON_VERSIONS) -def samples(session): - """Run the samples test suite.""" - - constraints_path = str( - CURRENT_DIRECTORY / "testing" / f"constraints-{session.python}.txt" - ) - - # TODO(b/332735129): Remove this session and use python_samples templates - # where each samples directory has its own noxfile.py file, instead. 
- install_test_extra = True - install_systemtest_dependencies(session, install_test_extra, "-c", constraints_path) - - session.run( - "py.test", - "samples", - *session.posargs, - ) - - @nox.session(python=DEFAULT_PYTHON_VERSION) def cover(session): """Run the final coverage report. diff --git a/owlbot.py b/owlbot.py index f804859689..ddc578c3a2 100644 --- a/owlbot.py +++ b/owlbot.py @@ -74,7 +74,7 @@ import pandas import pyarrow import sqlglot - + print(f"Python: {sys.version}") print(f"bigframes=={bigframes.__version__}") print(f"google-cloud-bigquery=={google.cloud.bigquery.__version__}") @@ -83,7 +83,7 @@ print(f"pyarrow=={pyarrow.__version__}") print(f"sqlglot=={sqlglot.__version__}") ``` - + #### Steps to reproduce """, ), @@ -148,3 +148,5 @@ # ---------------------------------------------------------------------------- s.shell.run(["nox", "-s", "format"], hide_output=False) +for noxfile in REPO_ROOT.glob("samples/**/noxfile.py"): + s.shell.run(["nox", "-s", "format"], cwd=noxfile.parent, hide_output=False) diff --git a/samples/snippets/bqml_getting_started_test.py b/samples/snippets/bqml_getting_started_test.py index d9f9135faa..d249ca4ff3 100644 --- a/samples/snippets/bqml_getting_started_test.py +++ b/samples/snippets/bqml_getting_started_test.py @@ -13,7 +13,7 @@ # limitations under the License. -def test_bqml_getting_started(random_model_id): +def test_bqml_getting_started(random_model_id: str) -> None: your_model_id = random_model_id # for example: bqml_tutorial.sample_model # [START bigquery_dataframes_bqml_getting_started_tutorial] diff --git a/samples/snippets/clustering_model_test.py b/samples/snippets/clustering_model_test.py index a407fc7805..fec4bbcefd 100644 --- a/samples/snippets/clustering_model_test.py +++ b/samples/snippets/clustering_model_test.py @@ -13,7 +13,7 @@ # limitations under the License. -def test_clustering_model(): +def test_clustering_model() -> None: # [START bigquery_dataframes_clustering_model] from bigframes.ml.cluster import KMeans import bigframes.pandas as bpd diff --git a/samples/snippets/conftest.py b/samples/snippets/conftest.py index d34837b3e2..9171ac78a4 100644 --- a/samples/snippets/conftest.py +++ b/samples/snippets/conftest.py @@ -46,7 +46,7 @@ def project_id(bigquery_client: bigquery.Client) -> str: @pytest.fixture(autouse=True) -def reset_session(): +def reset_session() -> None: """An autouse fixture ensuring each sample runs in a fresh session. This allows us to have samples that query data in different locations. diff --git a/samples/snippets/create_kmeans_model_test.py b/samples/snippets/create_kmeans_model_test.py index 2429060d09..32ebc60a69 100644 --- a/samples/snippets/create_kmeans_model_test.py +++ b/samples/snippets/create_kmeans_model_test.py @@ -13,12 +13,14 @@ # limitations under the License. 
-def test_kmeans_sample(project_id: str, random_model_id_eu: str): +def test_kmeans_sample(project_id: str, random_model_id_eu: str) -> None: your_gcp_project_id = project_id your_model_id = random_model_id_eu # [START bigquery_dataframes_bqml_kmeans] import datetime + import pandas as pd + import bigframes import bigframes.pandas as bpd @@ -92,7 +94,9 @@ def test_kmeans_sample(project_id: str, random_model_id_eu: str): stationstats = merged_df.groupby(["station_name", "isweekday"]).agg( {"duration": ["mean", "count"], "distance_from_city_center": "max"} ) - stationstats.columns = ["duration", "num_trips", "distance_from_city_center"] + stationstats.columns = pd.Index( + ["duration", "num_trips", "distance_from_city_center"] + ) stationstats = stationstats.sort_values( by="distance_from_city_center", ascending=True ).reset_index() diff --git a/samples/snippets/create_multiple_timeseries_forecasting_model.py b/samples/snippets/create_multiple_timeseries_forecasting_model.py index 26fc15595f..b749c37d50 100644 --- a/samples/snippets/create_multiple_timeseries_forecasting_model.py +++ b/samples/snippets/create_multiple_timeseries_forecasting_model.py @@ -13,7 +13,7 @@ # limitations under the License. -def test_multiple_timeseries_forecasting_model(random_model_id): +def test_multiple_timeseries_forecasting_model(random_model_id: str) -> None: your_model_id = random_model_id # [START bigquery_dataframes_bqml_arima_multiple_step_2_visualize] diff --git a/samples/snippets/create_single_timeseries_forecasting_model_test.py b/samples/snippets/create_single_timeseries_forecasting_model_test.py index 5750933713..0c694de2dc 100644 --- a/samples/snippets/create_single_timeseries_forecasting_model_test.py +++ b/samples/snippets/create_single_timeseries_forecasting_model_test.py @@ -13,8 +13,7 @@ # limitations under the License. -def test_create_single_timeseries(): - +def test_create_single_timeseries() -> None: # [START bigquery_dataframes_single_timeseries_forecasting_model_tutorial] import bigframes.pandas as bpd diff --git a/samples/snippets/explore_query_result_test.py b/samples/snippets/explore_query_result_test.py index 5f0ec7d9b6..42f48fd94e 100644 --- a/samples/snippets/explore_query_result_test.py +++ b/samples/snippets/explore_query_result_test.py @@ -13,7 +13,7 @@ # limitations under the License. -def test_bigquery_dataframes_explore_query_result(): +def test_bigquery_dataframes_explore_query_result() -> None: import bigframes.pandas as bpd # [START bigquery_dataframes_explore_query_result] diff --git a/samples/snippets/gemini_model_test.py b/samples/snippets/gemini_model_test.py index 89212875ae..24b4e7d26d 100644 --- a/samples/snippets/gemini_model_test.py +++ b/samples/snippets/gemini_model_test.py @@ -13,7 +13,7 @@ # limitations under the License. -def test_gemini_text_generator_model(): +def test_gemini_text_generator_model() -> None: # Determine project id, in this case prefer the one set in the environment # variable GOOGLE_CLOUD_PROJECT (if any) import os diff --git a/samples/snippets/gen_ai_model_test.py b/samples/snippets/gen_ai_model_test.py index e4bead0e46..5cdcd6d3a7 100644 --- a/samples/snippets/gen_ai_model_test.py +++ b/samples/snippets/gen_ai_model_test.py @@ -13,7 +13,7 @@ # limitations under the License. 
-def test_llm_model(): +def test_llm_model() -> None: # Determine project id, in this case prefer the one set in the environment # variable GOOGLE_CLOUD_PROJECT (if any) import os diff --git a/samples/snippets/load_data_from_bigquery_test.py b/samples/snippets/load_data_from_bigquery_test.py index e4c65688bd..4523eece97 100644 --- a/samples/snippets/load_data_from_bigquery_test.py +++ b/samples/snippets/load_data_from_bigquery_test.py @@ -13,7 +13,7 @@ # limitations under the License. -def test_bigquery_dataframes_load_data_from_bigquery(): +def test_bigquery_dataframes_load_data_from_bigquery() -> None: # [START bigquery_dataframes_load_data_from_bigquery] # Create a DataFrame from a BigQuery table: import bigframes.pandas as bpd diff --git a/samples/snippets/load_data_from_biquery_job_test.py b/samples/snippets/load_data_from_biquery_job_test.py index 9a7793a7e5..4f1ddc062f 100644 --- a/samples/snippets/load_data_from_biquery_job_test.py +++ b/samples/snippets/load_data_from_biquery_job_test.py @@ -13,7 +13,7 @@ # limitations under the License. -def test_bigquery_dataframes_load_data_from_bigquery_job(): +def test_bigquery_dataframes_load_data_from_bigquery_job() -> None: # Determine project id, in this case prefer the one set in the environment # variable GOOGLE_CLOUD_PROJECT (if any) import os diff --git a/samples/snippets/load_data_from_csv_test.py b/samples/snippets/load_data_from_csv_test.py index 31ab9255bf..cc96b92fb8 100644 --- a/samples/snippets/load_data_from_csv_test.py +++ b/samples/snippets/load_data_from_csv_test.py @@ -13,7 +13,7 @@ # limitations under the License. -def test_bigquery_dataframes_load_data_from_csv(): +def test_bigquery_dataframes_load_data_from_csv() -> None: # [START bigquery_dataframes_load_data_from_csv] import bigframes.pandas as bpd diff --git a/samples/snippets/noxfile.py b/samples/snippets/noxfile.py new file mode 100644 index 0000000000..c36d5f2d81 --- /dev/null +++ b/samples/snippets/noxfile.py @@ -0,0 +1,292 @@ +# Copyright 2019 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import glob +import os +from pathlib import Path +import sys +from typing import Callable, Dict, Optional + +import nox + +# WARNING - WARNING - WARNING - WARNING - WARNING +# WARNING - WARNING - WARNING - WARNING - WARNING +# DO NOT EDIT THIS FILE EVER! +# WARNING - WARNING - WARNING - WARNING - WARNING +# WARNING - WARNING - WARNING - WARNING - WARNING + +BLACK_VERSION = "black==22.3.0" +ISORT_VERSION = "isort==5.10.1" + +# Copy `noxfile_config.py` to your directory and modify it instead. + +# `TEST_CONFIG` dict is a configuration hook that allows users to +# modify the test configurations. The values here should be in sync +# with `noxfile_config.py`. Users will copy `noxfile_config.py` into +# their directory and modify it. + +TEST_CONFIG = { + # You can opt out from the test for specific Python versions. 
+ "ignored_versions": [], + # Old samples are opted out of enforcing Python type hints + # All new samples should feature them + "enforce_type_hints": False, + # An envvar key for determining the project id to use. Change it + # to 'BUILD_SPECIFIC_GCLOUD_PROJECT' if you want to opt in using a + # build specific Cloud project. You can also use your own string + # to use your own Cloud project. + "gcloud_project_env": "GOOGLE_CLOUD_PROJECT", + # 'gcloud_project_env': 'BUILD_SPECIFIC_GCLOUD_PROJECT', + # If you need to use a specific version of pip, + # change pip_version_override to the string representation + # of the version number, for example, "20.2.4" + "pip_version_override": None, + # A dictionary you want to inject into your test. Don't put any + # secrets here. These values will override predefined values. + "envs": {}, +} + + +try: + # Ensure we can import noxfile_config in the project's directory. + sys.path.append(".") + from noxfile_config import TEST_CONFIG_OVERRIDE +except ImportError as e: + print("No user noxfile_config found: detail: {}".format(e)) + TEST_CONFIG_OVERRIDE = {} + +# Update the TEST_CONFIG with the user supplied values. +TEST_CONFIG.update(TEST_CONFIG_OVERRIDE) + + +def get_pytest_env_vars() -> Dict[str, str]: + """Returns a dict for pytest invocation.""" + ret = {} + + # Override the GCLOUD_PROJECT and the alias. + env_key = TEST_CONFIG["gcloud_project_env"] + # This should error out if not set. + ret["GOOGLE_CLOUD_PROJECT"] = os.environ[env_key] + + # Apply user supplied envs. + ret.update(TEST_CONFIG["envs"]) + return ret + + +# DO NOT EDIT - automatically generated. +# All versions used to test samples. +ALL_VERSIONS = ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12"] + +# Any default versions that should be ignored. +IGNORED_VERSIONS = TEST_CONFIG["ignored_versions"] + +TESTED_VERSIONS = sorted([v for v in ALL_VERSIONS if v not in IGNORED_VERSIONS]) + +INSTALL_LIBRARY_FROM_SOURCE = os.environ.get("INSTALL_LIBRARY_FROM_SOURCE", False) in ( + "True", + "true", +) + +# Error if a python version is missing +nox.options.error_on_missing_interpreters = True + +# +# Style Checks +# + + +# Linting with flake8. +# +# We ignore the following rules: +# E203: whitespace before ‘:’ +# E266: too many leading ‘#’ for block comment +# E501: line too long +# I202: Additional newline in a section of imports +# +# We also need to specify the rules which are ignored by default: +# ['E226', 'W504', 'E126', 'E123', 'W503', 'E24', 'E704', 'E121'] +FLAKE8_COMMON_ARGS = [ + "--show-source", + "--builtin=gettext", + "--max-complexity=20", + "--exclude=.nox,.cache,env,lib,generated_pb2,*_pb2.py,*_pb2_grpc.py", + "--ignore=E121,E123,E126,E203,E226,E24,E266,E501,E704,W503,W504,I202", + "--max-line-length=88", +] + + +@nox.session +def lint(session: nox.sessions.Session) -> None: + if not TEST_CONFIG["enforce_type_hints"]: + session.install("flake8") + else: + session.install("flake8", "flake8-annotations") + + args = FLAKE8_COMMON_ARGS + [ + ".", + ] + session.run("flake8", *args) + + +# +# Black +# + + +@nox.session +def blacken(session: nox.sessions.Session) -> None: + """Run black. Format code to uniform standard.""" + session.install(BLACK_VERSION) + python_files = [path for path in os.listdir(".") if path.endswith(".py")] + + session.run("black", *python_files) + + +# +# format = isort + black +# + + +@nox.session +def format(session: nox.sessions.Session) -> None: + """ + Run isort to sort imports. Then run black + to format code to uniform standard. 
+ """ + session.install(BLACK_VERSION, ISORT_VERSION) + python_files = [path for path in os.listdir(".") if path.endswith(".py")] + + # Use the --fss option to sort imports using strict alphabetical order. + # See https://pycqa.github.io/isort/docs/configuration/options.html#force-sort-within-sections + session.run("isort", "--fss", *python_files) + session.run("black", *python_files) + + +# +# Sample Tests +# + + +PYTEST_COMMON_ARGS = ["--junitxml=sponge_log.xml"] + + +def _session_tests( + session: nox.sessions.Session, post_install: Callable = None +) -> None: + # check for presence of tests + test_list = glob.glob("**/*_test.py", recursive=True) + glob.glob( + "**/test_*.py", recursive=True + ) + test_list.extend(glob.glob("**/tests", recursive=True)) + + if len(test_list) == 0: + print("No tests found, skipping directory.") + return + + if TEST_CONFIG["pip_version_override"]: + pip_version = TEST_CONFIG["pip_version_override"] + session.install(f"pip=={pip_version}") + """Runs py.test for a particular project.""" + concurrent_args = [] + if os.path.exists("requirements.txt"): + if os.path.exists("constraints.txt"): + session.install("-r", "requirements.txt", "-c", "constraints.txt") + else: + session.install("-r", "requirements.txt") + with open("requirements.txt") as rfile: + packages = rfile.read() + + if os.path.exists("requirements-test.txt"): + if os.path.exists("constraints-test.txt"): + session.install("-r", "requirements-test.txt", "-c", "constraints-test.txt") + else: + session.install("-r", "requirements-test.txt") + with open("requirements-test.txt") as rtfile: + packages += rtfile.read() + + if INSTALL_LIBRARY_FROM_SOURCE: + session.install("-e", _get_repo_root()) + + if post_install: + post_install(session) + + if "pytest-parallel" in packages: + concurrent_args.extend(["--workers", "auto", "--tests-per-worker", "auto"]) + elif "pytest-xdist" in packages: + concurrent_args.extend(["-n", "auto"]) + + session.run( + "pytest", + *(PYTEST_COMMON_ARGS + session.posargs + concurrent_args), + # Pytest will return 5 when no tests are collected. This can happen + # on travis where slow and flaky tests are excluded. + # See http://doc.pytest.org/en/latest/_modules/_pytest/main.html + success_codes=[0, 5], + env=get_pytest_env_vars(), + ) + + +@nox.session(python=ALL_VERSIONS) +def py(session: nox.sessions.Session) -> None: + """Runs py.test for a sample using the specified version of Python.""" + if session.python in TESTED_VERSIONS: + _session_tests(session) + else: + session.skip( + "SKIPPED: {} tests are disabled for this sample.".format(session.python) + ) + + +# +# Readmegen +# + + +def _get_repo_root() -> Optional[str]: + """Returns the root folder of the project.""" + # Get root of this repository. Assume we don't have directories nested deeper than 10 items. 
+ p = Path(os.getcwd()) + for i in range(10): + if p is None: + break + if Path(p / ".git").exists(): + return str(p) + # .git is not available in repos cloned via Cloud Build + # setup.py is always in the library's root, so use that instead + # https://github.com/googleapis/synthtool/issues/792 + if Path(p / "setup.py").exists(): + return str(p) + p = p.parent + raise Exception("Unable to detect repository root.") + + +GENERATED_READMES = sorted([x for x in Path(".").rglob("*.rst.in")]) + + +@nox.session +@nox.parametrize("path", GENERATED_READMES) +def readmegen(session: nox.sessions.Session, path: str) -> None: + """(Re-)generates the readme for a sample.""" + session.install("jinja2", "pyyaml") + dir_ = os.path.dirname(path) + + if os.path.exists(os.path.join(dir_, "requirements.txt")): + session.install("-r", os.path.join(dir_, "requirements.txt")) + + in_file = os.path.join(dir_, "README.rst.in") + session.run( + "python", _get_repo_root() + "/scripts/readme-gen/readme_gen.py", in_file + ) diff --git a/samples/snippets/noxfile_config.py b/samples/snippets/noxfile_config.py new file mode 100644 index 0000000000..211d6974b9 --- /dev/null +++ b/samples/snippets/noxfile_config.py @@ -0,0 +1,42 @@ +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Default TEST_CONFIG_OVERRIDE for python repos. + +# You can copy this file into your directory, then it will be inported from +# the noxfile.py. + +# The source of truth: +# https://github.com/GoogleCloudPlatform/python-docs-samples/blob/master/noxfile_config.py + +TEST_CONFIG_OVERRIDE = { + # You can opt out from the test for specific Python versions. + "ignored_versions": ["2.7", "3.7", "3.8"], + # Old samples are opted out of enforcing Python type hints + # All new samples should feature them + "enforce_type_hints": True, + # An envvar key for determining the project id to use. Change it + # to 'BUILD_SPECIFIC_GCLOUD_PROJECT' if you want to opt in using a + # build specific Cloud project. You can also use your own string + # to use your own Cloud project. + "gcloud_project_env": "GOOGLE_CLOUD_PROJECT", + # "gcloud_project_env": "BUILD_SPECIFIC_GCLOUD_PROJECT", + # If you need to use a specific version of pip, + # change pip_version_override to the string representation + # of the version number, for example, "20.2.4" + "pip_version_override": None, + # A dictionary you want to inject into your test. Don't put any + # secrets here. These values will override predefined values. + "envs": {}, +} diff --git a/samples/snippets/pandas_methods_test.py b/samples/snippets/pandas_methods_test.py index bd8e29c003..0f128f9e6a 100644 --- a/samples/snippets/pandas_methods_test.py +++ b/samples/snippets/pandas_methods_test.py @@ -13,7 +13,7 @@ # limitations under the License. 
-def test_bigquery_dataframes_pandas_methods(): +def test_bigquery_dataframes_pandas_methods() -> None: # [START bigquery_dataframes_pandas_methods] import bigframes.pandas as bpd diff --git a/samples/snippets/quickstart.py b/samples/snippets/quickstart.py index ae3a934004..c26c6f4442 100644 --- a/samples/snippets/quickstart.py +++ b/samples/snippets/quickstart.py @@ -13,7 +13,7 @@ # limitations under the License. -def run_quickstart(project_id: str): +def run_quickstart(project_id: str) -> None: import bigframes session_options = bigframes.BigQueryOptions() diff --git a/samples/snippets/regression_model_test.py b/samples/snippets/regression_model_test.py index 7d1bde689c..43cdabacb4 100644 --- a/samples/snippets/regression_model_test.py +++ b/samples/snippets/regression_model_test.py @@ -13,7 +13,7 @@ # limitations under the License. -def test_regression_model(): +def test_regression_model() -> None: # [START bigquery_dataframes_regression_model] from bigframes.ml.linear_model import LinearRegression import bigframes.pandas as bpd diff --git a/samples/snippets/remote_function.py b/samples/snippets/remote_function.py index 4db4e67619..c35daf35fc 100644 --- a/samples/snippets/remote_function.py +++ b/samples/snippets/remote_function.py @@ -13,7 +13,7 @@ # limitations under the License. -def run_remote_function_and_read_gbq_function(project_id: str): +def run_remote_function_and_read_gbq_function(project_id: str) -> None: your_gcp_project_id = project_id # [START bigquery_dataframes_remote_function] @@ -51,7 +51,7 @@ def run_remote_function_and_read_gbq_function(project_id: str): str, reuse=False, ) - def get_bucket(num): + def get_bucket(num: float) -> str: if not num: return "NA" boundary = 4000 @@ -96,7 +96,7 @@ def get_bucket(num): reuse=False, packages=["cryptography"], ) - def get_hash(input): + def get_hash(input: str) -> str: from cryptography.fernet import Fernet # handle missing value diff --git a/samples/snippets/remote_function_test.py b/samples/snippets/remote_function_test.py index 8f891274de..24bc7e854e 100644 --- a/samples/snippets/remote_function_test.py +++ b/samples/snippets/remote_function_test.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +import google.api_core.exceptions +import google.cloud.bigquery_connection_v1 import pytest import bigframes.pandas @@ -19,6 +21,35 @@ from . import remote_function +# TODO(tswast): Once the connections are cleaned up in the sample test project +# and https://github.com/GoogleCloudPlatform/python-docs-samples/issues/11720 +# is closed, we shouldn't need this because AFAIK we only use one BQ connection +# in this sample. +@pytest.fixture(autouse=True) +def cleanup_connections() -> None: + client = google.cloud.bigquery_connection_v1.ConnectionServiceClient() + + for conn in client.list_connections( + parent="projects/python-docs-samples-tests/locations/us" + ): + try: + int(conn.name.split("/")[-1].split("-")[0], base=16) + except ValueError: + print(f"Couldn't parse {conn.name}") + continue + + try: + print(f"removing {conn.name}") + client.delete_connection( + google.cloud.bigquery_connection_v1.DeleteConnectionRequest( + {"name": conn.name}, + ) + ) + except google.api_core.exceptions.GoogleAPIError: + # We did as much clean up as we can. 
+ break + + def test_remote_function_and_read_gbq_function( capsys: pytest.CaptureFixture[str], ) -> None: diff --git a/samples/snippets/requirements-test.txt b/samples/snippets/requirements-test.txt new file mode 100644 index 0000000000..62b0c02e79 --- /dev/null +++ b/samples/snippets/requirements-test.txt @@ -0,0 +1,3 @@ +# samples/snippets should be runnable with no "extras" +google-cloud-testutils==1.4.0 +pytest==8.1.1 diff --git a/samples/snippets/requirements.txt b/samples/snippets/requirements.txt new file mode 100644 index 0000000000..1da77c1715 --- /dev/null +++ b/samples/snippets/requirements.txt @@ -0,0 +1,2 @@ +# samples/snippets should be runnable with no "extras" +bigframes==1.4.0 diff --git a/samples/snippets/set_options_test.py b/samples/snippets/set_options_test.py index f981009e9a..3dea524a17 100644 --- a/samples/snippets/set_options_test.py +++ b/samples/snippets/set_options_test.py @@ -13,7 +13,7 @@ # limitations under the License. -def test_bigquery_dataframes_set_options(): +def test_bigquery_dataframes_set_options() -> None: # Close the session before resetting the options import bigframes.pandas as bpd From bcc054b90b7f84f79e127b27fd41ab5125f6c496 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Wed, 8 May 2024 15:29:01 -0500 Subject: [PATCH 04/17] chore: remove outdated `NoDefaultIndexError` info from CHANGELOG.md (#668) * chore: remove outdated `NoDefaultIndexError` info from CHANGELOG.md We don't actually raise an error, just a `DefaultIndexWarning ` warning. * remove redundant line --- CHANGELOG.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f73d4b5750..4457c2e443 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,7 +15,7 @@ * Add the `bigframes.bigquery` sub-package with a `bigframes.bigquery.array_length` function ([#630](https://github.com/googleapis/python-bigquery-dataframes/issues/630)) ([9963f85](https://github.com/googleapis/python-bigquery-dataframes/commit/9963f85b84c3b3c681447ab79e22ac93ac48349c)) * Always do a query dry run when `option.repr_mode == "deferred"` ([#652](https://github.com/googleapis/python-bigquery-dataframes/issues/652)) ([651fd7d](https://github.com/googleapis/python-bigquery-dataframes/commit/651fd7daf14273f172c6c55e5d6c374eb590a22d)) * Custom query labels for compute options ([#638](https://github.com/googleapis/python-bigquery-dataframes/issues/638)) ([f561799](https://github.com/googleapis/python-bigquery-dataframes/commit/f5617994bc136de5caa72719b8c3c297c512cb36)) -* Raise `NoDefaultIndexError` from `read_gbq` on clustered/partitioned tables with no `index_col` or `filters` set ([#631](https://github.com/googleapis/python-bigquery-dataframes/issues/631)) ([73064dd](https://github.com/googleapis/python-bigquery-dataframes/commit/73064dd2aa1ece5de8f5849a0fd337d0ba677404)) +* Warn with `DefaultIndexWarning` from `read_gbq` on clustered/partitioned tables with no `index_col` or `filters` set ([#631](https://github.com/googleapis/python-bigquery-dataframes/issues/631), [#658](https://github.com/googleapis/python-bigquery-dataframes/issues/658)) 
([2715d2b](https://github.com/googleapis/python-bigquery-dataframes/commit/2715d2b4a353710175a66a4f6149356f583f2c45), [73064dd](https://github.com/googleapis/python-bigquery-dataframes/commit/73064dd2aa1ece5de8f5849a0fd337d0ba677404)) * Support `index_col=False` in `read_csv` and `engine="bigquery"` ([73064dd](https://github.com/googleapis/python-bigquery-dataframes/commit/73064dd2aa1ece5de8f5849a0fd337d0ba677404)) * Support gcf max instance count in `remote_function` ([#657](https://github.com/googleapis/python-bigquery-dataframes/issues/657)) ([36578ab](https://github.com/googleapis/python-bigquery-dataframes/commit/36578ab431119f71dda746de415d0c6417bb4de2)) @@ -23,7 +23,6 @@ ### Bug Fixes * Don't raise UnknownLocationWarning for US or EU multi-regions ([#653](https://github.com/googleapis/python-bigquery-dataframes/issues/653)) ([8e4616b](https://github.com/googleapis/python-bigquery-dataframes/commit/8e4616b896f4e0d13d8bb0424c89335d3a1fe697)) -* Downgrade NoDefaultIndexError to DefaultIndexWarning ([#658](https://github.com/googleapis/python-bigquery-dataframes/issues/658)) ([2715d2b](https://github.com/googleapis/python-bigquery-dataframes/commit/2715d2b4a353710175a66a4f6149356f583f2c45)) * Fix bug with na in the column labels in stack ([#659](https://github.com/googleapis/python-bigquery-dataframes/issues/659)) ([4a34293](https://github.com/googleapis/python-bigquery-dataframes/commit/4a342933559fba417fe42e2bd386838defdb2778)) * Use explicit session in `PaLM2TextGenerator` ([#651](https://github.com/googleapis/python-bigquery-dataframes/issues/651)) ([e4f13c3](https://github.com/googleapis/python-bigquery-dataframes/commit/e4f13c3633b90e32d3171976d8b27ed10049882f)) From e084e54557addff78522bbd710637ecb4b46d23e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Wed, 8 May 2024 17:57:41 -0500 Subject: [PATCH 05/17] fix: include `index_col` when selecting `columns` and `filters` in `read_gbq_table` (#648) * fix: include `index_col` when selecting `columns` and `filters` in `read_gbq_table` Fixes internal issue 339430305 * feat: warn with a more specific `DefaultLocationWarning` category when no location can be detected (#648) test: refactor `read_gbq` / `read_gbq_table` tests to test with all parameters combined (#648) refactor: move query generation code to BigQuery I/O module (#648) --- bigframes/exceptions.py | 6 + bigframes/pandas/__init__.py | 3 +- bigframes/session/__init__.py | 158 +++++++-------------- bigframes/session/_io/bigquery/__init__.py | 97 ++++++++++++- tests/system/small/test_pandas_options.py | 26 ++-- tests/system/small/test_session.py | 94 +++++++++--- tests/unit/session/test_io_bigquery.py | 106 ++++++++++++++ tests/unit/session/test_session.py | 82 ----------- 8 files changed, 349 insertions(+), 223 deletions(-) diff --git a/bigframes/exceptions.py b/bigframes/exceptions.py index 5caf2aa1df..3ca6d8e1af 100644 --- a/bigframes/exceptions.py +++ b/bigframes/exceptions.py @@ -17,6 +17,12 @@ # NOTE: This module should not depend on any others in the package. +# Uses UserWarning for backwards compatibility with warning without a category +# set. 
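+# Callers can opt in to stricter handling of this category, e.g. (an
+# illustrative sketch mirroring the system tests touched in this commit;
+# the DataFrame name is hypothetical):
+#
+#   warnings.simplefilter("error", bigframes.exceptions.DefaultLocationWarning)
+#   with pytest.warns(bigframes.exceptions.DefaultLocationWarning):
+#       bpd.read_pandas(pandas_df)  # falls back to the default location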
+class DefaultLocationWarning(UserWarning): + """No location was specified, so using a default one.""" + + class UnknownLocationWarning(Warning): """The location is set to an unknown value.""" diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py index 2200fd6aa4..1d6da46fae 100644 --- a/bigframes/pandas/__init__.py +++ b/bigframes/pandas/__init__.py @@ -67,6 +67,7 @@ import bigframes.operations as ops import bigframes.series import bigframes.session +import bigframes.session._io.bigquery import bigframes.session.clients @@ -391,7 +392,7 @@ def _set_default_session_location_if_possible(query): bqclient = clients_provider.bqclient - if bigframes.session._is_query(query): + if bigframes.session._io.bigquery.is_query(query): job = bqclient.query(query, bigquery.QueryJobConfig(dry_run=True)) options.bigquery.location = job.location else: diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 7c7d93541c..89845bb842 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -20,7 +20,6 @@ import datetime import logging import os -import re import secrets import typing from typing import ( @@ -86,10 +85,11 @@ import bigframes.core.tree_properties as tree_properties import bigframes.core.utils as utils import bigframes.dtypes +import bigframes.exceptions import bigframes.formatting_helpers as formatting_helpers from bigframes.functions.remote_function import read_gbq_function as bigframes_rgf from bigframes.functions.remote_function import remote_function as bigframes_rf -import bigframes.session._io.bigquery as bigframes_io +import bigframes.session._io.bigquery as bf_io_bigquery import bigframes.session._io.bigquery.read_gbq_table as bf_read_gbq_table import bigframes.session.clients import bigframes.version @@ -145,14 +145,18 @@ ) -def _is_query(query_or_table: str) -> bool: - """Determine if `query_or_table` is a table ID or a SQL string""" - return re.search(r"\s", query_or_table.strip(), re.MULTILINE) is not None +def _to_index_cols( + index_col: Iterable[str] | str | bigframes.enums.DefaultIndexKind = (), +) -> List[str]: + """Convert index_col into a list of column names.""" + if isinstance(index_col, bigframes.enums.DefaultIndexKind): + index_cols: List[str] = [] + elif isinstance(index_col, str): + index_cols = [index_col] + else: + index_cols = list(index_col) - -def _is_table_with_wildcard_suffix(query_or_table: str) -> bool: - """Determine if `query_or_table` is a table and contains a wildcard suffix.""" - return not _is_query(query_or_table) and query_or_table.endswith("*") + return index_cols class Session( @@ -181,12 +185,26 @@ def __init__( if context is None: context = bigquery_options.BigQueryOptions() - # TODO(swast): Get location from the environment. if context.location is None: self._location = "US" warnings.warn( f"No explicit location is set, so using location {self._location} for the session.", - stacklevel=2, + # User's code + # -> get_global_session() + # -> connect() + # -> Session() + # + # Note: We could also have: + # User's code + # -> read_gbq() + # -> with_default_session() + # -> get_global_session() + # -> connect() + # -> Session() + # but we currently have no way to disambiguate these + # situations. 
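+ # With stacklevel=4 the warning is attributed to the user's frame at the
+ # top of the first chain sketched above
+ # (user's code -> get_global_session() -> connect() -> Session()).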
+ stacklevel=4, + category=bigframes.exceptions.DefaultLocationWarning, ) else: self._location = context.location @@ -322,13 +340,19 @@ def read_gbq( columns = col_order filters = list(filters) - if len(filters) != 0 or _is_table_with_wildcard_suffix(query_or_table): + if len(filters) != 0 or bf_io_bigquery.is_table_with_wildcard_suffix( + query_or_table + ): # TODO(b/338111344): This appears to be missing index_cols, which # are necessary to be selected. - # TODO(b/338039517): Also, need to account for primary keys. - query_or_table = self._to_query(query_or_table, columns, filters) + # TODO(b/338039517): Refactor this to be called inside both + # _read_gbq_query and _read_gbq_table (after detecting primary keys) + # so we can make sure index_col/index_cols reflects primary keys. + query_or_table = bf_io_bigquery.to_query( + query_or_table, _to_index_cols(index_col), columns, filters + ) - if _is_query(query_or_table): + if bf_io_bigquery.is_query(query_or_table): return self._read_gbq_query( query_or_table, index_col=index_col, @@ -355,85 +379,6 @@ def read_gbq( use_cache=use_cache if use_cache is not None else True, ) - def _to_query( - self, - query_or_table: str, - columns: Iterable[str], - filters: third_party_pandas_gbq.FiltersType, - ) -> str: - """Compile query_or_table with conditions(filters, wildcards) to query.""" - filters = list(filters) - sub_query = ( - f"({query_or_table})" - if _is_query(query_or_table) - else f"`{query_or_table}`" - ) - - # TODO(b/338111344): Generate an index based on DefaultIndexKind if we - # don't have index columns specified. - select_clause = "SELECT " + ( - ", ".join(f"`{column}`" for column in columns) if columns else "*" - ) - - where_clause = "" - if filters: - valid_operators: Mapping[third_party_pandas_gbq.FilterOps, str] = { - "in": "IN", - "not in": "NOT IN", - "LIKE": "LIKE", - "==": "=", - ">": ">", - "<": "<", - ">=": ">=", - "<=": "<=", - "!=": "!=", - } - - # If single layer filter, add another pseudo layer. So the single layer represents "and" logic. - if isinstance(filters[0], tuple) and ( - len(filters[0]) == 0 or not isinstance(list(filters[0])[0], tuple) - ): - filters = typing.cast(third_party_pandas_gbq.FiltersType, [filters]) - - or_expressions = [] - for group in filters: - if not isinstance(group, Iterable): - group = [group] - - and_expressions = [] - for filter_item in group: - if not isinstance(filter_item, tuple) or (len(filter_item) != 3): - raise ValueError( - f"Filter condition should be a tuple of length 3, {filter_item} is not valid." - ) - - column, operator, value = filter_item - - if not isinstance(column, str): - raise ValueError( - f"Column name should be a string, but received '{column}' of type {type(column).__name__}." 
- ) - - if operator not in valid_operators: - raise ValueError(f"Operator {operator} is not valid.") - - operator_str = valid_operators[operator] - - if operator_str in ["IN", "NOT IN"]: - value_list = ", ".join([repr(v) for v in value]) - expression = f"`{column}` {operator_str} ({value_list})" - else: - expression = f"`{column}` {operator_str} {repr(value)}" - and_expressions.append(expression) - - or_expressions.append(" AND ".join(and_expressions)) - - if or_expressions: - where_clause = " WHERE " + " OR ".join(or_expressions) - - full_query = f"{select_clause} FROM {sub_query} AS sub{where_clause}" - return full_query - def _query_to_destination( self, query: str, @@ -610,12 +555,7 @@ def _read_gbq_query( True if use_cache is None else use_cache ) - if isinstance(index_col, bigframes.enums.DefaultIndexKind): - index_cols = [] - elif isinstance(index_col, str): - index_cols = [index_col] - else: - index_cols = list(index_col) + index_cols = _to_index_cols(index_col) destination, query_job = self._query_to_destination( query, @@ -682,8 +622,13 @@ def read_gbq_table( columns = col_order filters = list(filters) - if len(filters) != 0 or _is_table_with_wildcard_suffix(query): - query = self._to_query(query, columns, filters) + if len(filters) != 0 or bf_io_bigquery.is_table_with_wildcard_suffix(query): + # TODO(b/338039517): Refactor this to be called inside both + # _read_gbq_query and _read_gbq_table (after detecting primary keys) + # so we can make sure index_col/index_cols reflects primary keys. + query = bf_io_bigquery.to_query( + query, _to_index_cols(index_col), columns, filters + ) return self._read_gbq_query( query, @@ -838,12 +783,7 @@ def _read_bigquery_load_job( index_col: Iterable[str] | str | bigframes.enums.DefaultIndexKind = (), columns: Iterable[str] = (), ) -> dataframe.DataFrame: - if isinstance(index_col, bigframes.enums.DefaultIndexKind): - index_cols = [] - elif isinstance(index_col, str): - index_cols = [index_col] - else: - index_cols = list(index_col) + index_cols = _to_index_cols(index_col) if not job_config.clustering_fields and index_cols: job_config.clustering_fields = index_cols[:_MAX_CLUSTER_COLUMNS] @@ -1430,7 +1370,7 @@ def _create_empty_temp_table( datetime.datetime.now(datetime.timezone.utc) + constants.DEFAULT_EXPIRATION ) - table = bigframes_io.create_temp_table( + table = bf_io_bigquery.create_temp_table( self, expiration, schema=schema, diff --git a/bigframes/session/_io/bigquery/__init__.py b/bigframes/session/_io/bigquery/__init__.py index 79108c71a2..98e0dac1e8 100644 --- a/bigframes/session/_io/bigquery/__init__.py +++ b/bigframes/session/_io/bigquery/__init__.py @@ -19,10 +19,13 @@ import datetime import itertools import os +import re import textwrap import types -from typing import Dict, Iterable, Optional, Sequence, Tuple, Union +import typing +from typing import Dict, Iterable, Mapping, Optional, Sequence, Tuple, Union +import bigframes_vendored.pandas.io.gbq as third_party_pandas_gbq import google.api_core.exceptions import google.cloud.bigquery as bigquery @@ -311,3 +314,95 @@ def create_bq_dataset_reference( query_destination.project, query_destination.dataset_id, ) + + +def is_query(query_or_table: str) -> bool: + """Determine if `query_or_table` is a table ID or a SQL string""" + return re.search(r"\s", query_or_table.strip(), re.MULTILINE) is not None + + +def is_table_with_wildcard_suffix(query_or_table: str) -> bool: + """Determine if `query_or_table` is a table and contains a wildcard suffix.""" + return not is_query(query_or_table) 
and query_or_table.endswith("*") + + +def to_query( + query_or_table: str, + index_cols: Iterable[str], + columns: Iterable[str], + filters: third_party_pandas_gbq.FiltersType, +) -> str: + """Compile query_or_table with conditions(filters, wildcards) to query.""" + filters = list(filters) + sub_query = ( + f"({query_or_table})" if is_query(query_or_table) else f"`{query_or_table}`" + ) + + # TODO(b/338111344): Generate an index based on DefaultIndexKind if we + # don't have index columns specified. + if columns: + # We only reduce the selection if columns is set, but we always + # want to make sure index_cols is also included. + all_columns = itertools.chain(index_cols, columns) + select_clause = "SELECT " + ", ".join(f"`{column}`" for column in all_columns) + else: + select_clause = "SELECT *" + + where_clause = "" + if filters: + valid_operators: Mapping[third_party_pandas_gbq.FilterOps, str] = { + "in": "IN", + "not in": "NOT IN", + "LIKE": "LIKE", + "==": "=", + ">": ">", + "<": "<", + ">=": ">=", + "<=": "<=", + "!=": "!=", + } + + # If single layer filter, add another pseudo layer. So the single layer represents "and" logic. + if isinstance(filters[0], tuple) and ( + len(filters[0]) == 0 or not isinstance(list(filters[0])[0], tuple) + ): + filters = typing.cast(third_party_pandas_gbq.FiltersType, [filters]) + + or_expressions = [] + for group in filters: + if not isinstance(group, Iterable): + group = [group] + + and_expressions = [] + for filter_item in group: + if not isinstance(filter_item, tuple) or (len(filter_item) != 3): + raise ValueError( + f"Filter condition should be a tuple of length 3, {filter_item} is not valid." + ) + + column, operator, value = filter_item + + if not isinstance(column, str): + raise ValueError( + f"Column name should be a string, but received '{column}' of type {type(column).__name__}." + ) + + if operator not in valid_operators: + raise ValueError(f"Operator {operator} is not valid.") + + operator_str = valid_operators[operator] + + if operator_str in ["IN", "NOT IN"]: + value_list = ", ".join([repr(v) for v in value]) + expression = f"`{column}` {operator_str} ({value_list})" + else: + expression = f"`{column}` {operator_str} {repr(value)}" + and_expressions.append(expression) + + or_expressions.append(" AND ".join(and_expressions)) + + if or_expressions: + where_clause = " WHERE " + " OR ".join(or_expressions) + + full_query = f"{select_clause} FROM {sub_query} AS sub{where_clause}" + return full_query diff --git a/tests/system/small/test_pandas_options.py b/tests/system/small/test_pandas_options.py index afb75c65e3..c580f926c9 100644 --- a/tests/system/small/test_pandas_options.py +++ b/tests/system/small/test_pandas_options.py @@ -13,6 +13,7 @@ # limitations under the License. import datetime +import re from unittest import mock import warnings @@ -69,8 +70,12 @@ def test_read_gbq_start_sets_session_location( assert not bpd.options.bigquery.location # Starting user journey with read_gbq* should work for a table in any - # location, in this case tokyo - df = read_method(query_tokyo) + # location, in this case tokyo. + with warnings.catch_warnings(): + # Since the query refers to a specific location, no warning should be + # raised. 
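+ # simplefilter("error", ...) promotes the category to an exception, so
+ # this test fails loudly if a DefaultLocationWarning is emitted here.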
+ warnings.simplefilter("error", bigframes.exceptions.DefaultLocationWarning) + df = read_method(query_tokyo) assert df is not None # Now bigquery options location should be set to tokyo @@ -146,7 +151,11 @@ def test_read_gbq_after_session_start_must_comply_with_default_location( # Starting user journey with anything other than read_gbq*, such as # read_pandas would bind the session to default location US - df = bpd.read_pandas(scalars_pandas_df_index) + with pytest.warns( + bigframes.exceptions.DefaultLocationWarning, + match=re.escape("using location US for the session"), + ): + df = bpd.read_pandas(scalars_pandas_df_index) assert df is not None # Doing read_gbq* from a table in another location should fail @@ -262,17 +271,18 @@ def test_read_gbq_must_comply_with_set_location_non_US( def test_credentials_need_reauthentication(monkeypatch): # Use a simple test query to verify that default session works to interact - # with BQ + # with BQ. test_query = "SELECT 1" - # Confirm that default session has BQ client with valid credentials - session = bpd.get_global_session() - assert session.bqclient._credentials.valid - # Confirm that default session works as usual df = bpd.read_gbq(test_query) assert df is not None + # Call get_global_session() *after* read_gbq so that our location detection + # has a chance to work. + session = bpd.get_global_session() + assert session.bqclient._credentials.valid + with monkeypatch.context() as m: # Simulate expired credentials to trigger the credential refresh flow m.setattr(session.bqclient._credentials, "expiry", datetime.datetime.utcnow()) diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py index 6b2d7df50d..5daa01ad38 100644 --- a/tests/system/small/test_session.py +++ b/tests/system/small/test_session.py @@ -18,7 +18,7 @@ import textwrap import time import typing -from typing import List +from typing import List, Sequence import google import google.cloud.bigquery as bigquery @@ -338,30 +338,80 @@ def test_read_gbq_table_clustered_with_filter(session: bigframes.Session): assert "OLI_TIRS" in sensors.index -def test_read_gbq_wildcard(session: bigframes.Session): - df = session.read_gbq("bigquery-public-data.noaa_gsod.gsod193*") - assert df.shape == (348485, 32) +_GSOD_ALL_TABLES = "bigquery-public-data.noaa_gsod.gsod*" +_GSOD_1930S = "bigquery-public-data.noaa_gsod.gsod193*" -def test_read_gbq_wildcard_with_filter(session: bigframes.Session): - df = session.read_gbq( - "bigquery-public-data.noaa_gsod.gsod19*", - filters=[("_table_suffix", ">=", "30"), ("_table_suffix", "<=", "39")], # type: ignore - ) - assert df.shape == (348485, 32) - - -def test_read_gbq_table_wildcard(session: bigframes.Session): - df = session.read_gbq_table("bigquery-public-data.noaa_gsod.gsod193*") - assert df.shape == (348485, 32) - - -def test_read_gbq_table_wildcard_with_filter(session: bigframes.Session): - df = session.read_gbq_table( - "bigquery-public-data.noaa_gsod.gsod19*", - filters=[("_table_suffix", ">=", "30"), ("_table_suffix", "<=", "39")], # type: ignore +@pytest.mark.parametrize( + "api_method", + # Test that both methods work as there's a risk that read_gbq / + # read_gbq_table makes for an infinite loop. Table reads can convert to + # queries and read_gbq reads from tables. 
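+ # Illustrative call shape exercised by the cases below (parameters vary
+ # per case; see the parametrized filters/index_col/columns):
+ #   session.read_gbq_table(
+ #       "bigquery-public-data.noaa_gsod.gsod193*",
+ #       index_col=["stn", "wban", "year", "mo", "da"],
+ #       columns=["temp", "max", "min"],
+ #   )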
+ ["read_gbq", "read_gbq_table"], +) +@pytest.mark.parametrize( + ("filters", "table_id", "index_col", "columns"), + [ + pytest.param( + [("_table_suffix", ">=", "1930"), ("_table_suffix", "<=", "1939")], + _GSOD_ALL_TABLES, + ["stn", "wban", "year", "mo", "da"], + ["temp", "max", "min"], + id="all", + ), + pytest.param( + (), # filters + _GSOD_1930S, + (), # index_col + ["temp", "max", "min"], + id="columns", + ), + pytest.param( + [("_table_suffix", ">=", "1930"), ("_table_suffix", "<=", "1939")], + _GSOD_ALL_TABLES, + (), # index_col, + (), # columns + id="filters", + ), + pytest.param( + (), # filters + _GSOD_1930S, + ["stn", "wban", "year", "mo", "da"], + (), # columns + id="index_col", + ), + ], +) +def test_read_gbq_wildcard( + session: bigframes.Session, + api_method: str, + filters, + table_id: str, + index_col: Sequence[str], + columns: Sequence[str], +): + table_metadata = session.bqclient.get_table(table_id) + method = getattr(session, api_method) + df = method(table_id, filters=filters, index_col=index_col, columns=columns) + num_rows, num_columns = df.shape + + if index_col: + assert list(df.index.names) == list(index_col) + else: + assert df.index.name is None + + expected_columns = ( + columns + if columns + else [ + field.name + for field in table_metadata.schema + if field.name not in index_col and field.name not in columns + ] ) - assert df.shape == (348485, 32) + assert list(df.columns) == expected_columns + assert num_rows > 0 + assert num_columns == len(expected_columns) @pytest.mark.parametrize( diff --git a/tests/unit/session/test_io_bigquery.py b/tests/unit/session/test_io_bigquery.py index 43865fc2c8..9da085e824 100644 --- a/tests/unit/session/test_io_bigquery.py +++ b/tests/unit/session/test_io_bigquery.py @@ -210,3 +210,109 @@ def test_create_temp_table_default_expiration(): def test_bq_schema_to_sql(schema: Iterable[bigquery.SchemaField], expected: str): sql = io_bq.bq_schema_to_sql(schema) assert sql == expected + + +@pytest.mark.parametrize( + ("query_or_table", "index_cols", "columns", "filters", "expected_output"), + [ + pytest.param( + "test_table", + [], + [], + ["date_col", ">", "2022-10-20"], + None, + marks=pytest.mark.xfail( + raises=ValueError, + ), + id="raise_error", + ), + pytest.param( + "test_table", + ["row_index"], + ["string_col"], + [ + (("rowindex", "not in", [0, 6]),), + (("string_col", "in", ["Hello, World!", "こんにちは"]),), + ], + ( + "SELECT `row_index`, `string_col` FROM `test_table` AS sub WHERE " + "`rowindex` NOT IN (0, 6) OR `string_col` IN ('Hello, World!', " + "'こんにちは')" + ), + id="table-all_params-filter_or_operation", + ), + pytest.param( + """SELECT + rowindex, + string_col, + FROM `test_table` AS t + """, + ["rowindex"], + ["string_col"], + [ + ("rowindex", "<", 4), + ("string_col", "==", "Hello, World!"), + ], + """SELECT `rowindex`, `string_col` FROM (SELECT + rowindex, + string_col, + FROM `test_table` AS t + ) AS sub WHERE `rowindex` < 4 AND `string_col` = 'Hello, World!'""", + id="subquery-all_params-filter_and_operation", + ), + pytest.param( + "test_table", + [], + ["col_a", "col_b"], + [], + "SELECT `col_a`, `col_b` FROM `test_table` AS sub", + id="table-columns", + ), + pytest.param( + "test_table", + [], + [], + [("date_col", ">", "2022-10-20")], + "SELECT * FROM `test_table` AS sub WHERE `date_col` > '2022-10-20'", + id="table-filter", + ), + pytest.param( + "test_table*", + [], + [], + [], + "SELECT * FROM `test_table*` AS sub", + id="wildcard-no_params", + ), + pytest.param( + "test_table*", + [], + [], + 
[("_TABLE_SUFFIX", ">", "2022-10-20")], + "SELECT * FROM `test_table*` AS sub WHERE `_TABLE_SUFFIX` > '2022-10-20'", + id="wildcard-filter", + ), + ], +) +def test_to_query(query_or_table, index_cols, columns, filters, expected_output): + query = io_bq.to_query( + query_or_table, + index_cols, + columns, + filters, + ) + assert query == expected_output + + +@pytest.mark.parametrize( + ("query_or_table", "filters", "expected_output"), + [], +) +def test_to_query_with_wildcard_table(query_or_table, filters, expected_output): + query = io_bq.to_query( + query_or_table, + (), # index_cols + (), # columns + filters, + ) + assert query == expected_output diff --git a/tests/unit/session/test_session.py b/tests/unit/session/test_session.py index a161c2df76..bea858e037 100644 --- a/tests/unit/session/test_session.py +++ b/tests/unit/session/test_session.py @@ -398,85 +398,3 @@ def test_session_init_fails_with_no_project(): credentials=mock.Mock(spec=google.auth.credentials.Credentials) ) ) - - -@pytest.mark.parametrize( - ("query_or_table", "columns", "filters", "expected_output"), - [ - pytest.param( - """SELECT - rowindex, - string_col, - FROM `test_table` AS t - """, - [], - [("rowindex", "<", 4), ("string_col", "==", "Hello, World!")], - """SELECT * FROM (SELECT - rowindex, - string_col, - FROM `test_table` AS t - ) AS sub WHERE `rowindex` < 4 AND `string_col` = 'Hello, World!'""", - id="query_input", - ), - pytest.param( - "test_table", - [], - [("date_col", ">", "2022-10-20")], - "SELECT * FROM `test_table` AS sub WHERE `date_col` > '2022-10-20'", - id="table_input", - ), - pytest.param( - "test_table", - ["row_index", "string_col"], - [ - (("rowindex", "not in", [0, 6]),), - (("string_col", "in", ["Hello, World!", "こんにちは"]),), - ], - ( - "SELECT `row_index`, `string_col` FROM `test_table` AS sub WHERE " - "`rowindex` NOT IN (0, 6) OR `string_col` IN ('Hello, World!', " - "'こんにちは')" - ), - id="or_operation", - ), - pytest.param( - "test_table", - [], - ["date_col", ">", "2022-10-20"], - None, - marks=pytest.mark.xfail( - raises=ValueError, - ), - id="raise_error", - ), - ], -) -def test_read_gbq_with_filters(query_or_table, columns, filters, expected_output): - session = resources.create_bigquery_session() - query = session._to_query(query_or_table, columns, filters) - assert query == expected_output - - -@pytest.mark.parametrize( - ("query_or_table", "columns", "filters", "expected_output"), - [ - pytest.param( - "test_table*", - [], - [], - "SELECT * FROM `test_table*` AS sub", - id="wildcard_table_input", - ), - pytest.param( - "test_table*", - [], - [("_TABLE_SUFFIX", ">", "2022-10-20")], - "SELECT * FROM `test_table*` AS sub WHERE `_TABLE_SUFFIX` > '2022-10-20'", - id="wildcard_table_input_with_filter", - ), - ], -) -def test_read_gbq_wildcard(query_or_table, columns, filters, expected_output): - session = resources.create_bigquery_session() - query = session._to_query(query_or_table, columns, filters) - assert query == expected_output From 306953aaae69e57c7c2f5eefb88d55a35bdcca9d Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Wed, 8 May 2024 22:58:29 +0000 Subject: [PATCH 06/17] docs: document inlining of small data in `read_*` APIs (#670) * docs: document inlining of small data in `read_*` APIs * mention that threshold is in memory size * non-bigquery instead of non-"bigquery" --- bigframes/session/__init__.py | 5 +++++ third_party/bigframes_vendored/pandas/io/parquet.py | 5 +++++ .../bigframes_vendored/pandas/io/parsers/readers.py | 10 ++++++++++ 
third_party/bigframes_vendored/pandas/io/pickle.py | 5 +++++ 4 files changed, 25 insertions(+) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 89845bb842..5f70fd77f9 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -874,6 +874,11 @@ def read_pandas( The pandas DataFrame will be persisted as a temporary BigQuery table, which can be automatically recycled after the Session is closed. + .. note:: + Data is inlined in the query SQL if it is small enough (roughly 5MB + or less in memory). Larger size data is loaded to a BigQuery table + instead. + **Examples:** >>> import bigframes.pandas as bpd diff --git a/third_party/bigframes_vendored/pandas/io/parquet.py b/third_party/bigframes_vendored/pandas/io/parquet.py index 877a384b6d..1f5563c962 100644 --- a/third_party/bigframes_vendored/pandas/io/parquet.py +++ b/third_party/bigframes_vendored/pandas/io/parquet.py @@ -19,6 +19,11 @@ def read_parquet( Instead, set a serialized index column as the index and sort by that in the resulting DataFrame. + .. note:: + For non-"bigquery" engine, data is inlined in the query SQL if it is + small enough (roughly 5MB or less in memory). Larger size data is + loaded to a BigQuery table instead. + **Examples:** >>> import bigframes.pandas as bpd diff --git a/third_party/bigframes_vendored/pandas/io/parsers/readers.py b/third_party/bigframes_vendored/pandas/io/parsers/readers.py index d147abfd22..248cf8e0fe 100644 --- a/third_party/bigframes_vendored/pandas/io/parsers/readers.py +++ b/third_party/bigframes_vendored/pandas/io/parsers/readers.py @@ -62,6 +62,11 @@ def read_csv( file. Instead, set a serialized index column as the index and sort by that in the resulting DataFrame. + .. note:: + For non-bigquery engine, data is inlined in the query SQL if it is + small enough (roughly 5MB or less in memory). Larger size data is + loaded to a BigQuery table instead. + **Examples:** >>> import bigframes.pandas as bpd @@ -167,6 +172,11 @@ def read_json( file. Instead, set a serialized index column as the index and sort by that in the resulting DataFrame. + .. note:: + For non-bigquery engine, data is inlined in the query SQL if it is + small enough (roughly 5MB or less in memory). Larger size data is + loaded to a BigQuery table instead. + **Examples:** >>> import bigframes.pandas as bpd diff --git a/third_party/bigframes_vendored/pandas/io/pickle.py b/third_party/bigframes_vendored/pandas/io/pickle.py index 096d9b13d6..88684309f9 100644 --- a/third_party/bigframes_vendored/pandas/io/pickle.py +++ b/third_party/bigframes_vendored/pandas/io/pickle.py @@ -25,6 +25,11 @@ def read_pickle( If the content of the pickle file is a Series and its name attribute is None, the name will be set to '0' by default. + .. note:: + Data is inlined in the query SQL if it is small enough (roughly 5MB + or less in memory). Larger size data is loaded to a BigQuery table + instead. 
+ **Examples:** >>> import bigframes.pandas as bpd From 57ccabcd1402b7938e2c7068e5b4880ef018f39c Mon Sep 17 00:00:00 2001 From: Stephanie A <129541811+DevStephanie@users.noreply.github.com> Date: Thu, 9 May 2024 09:38:28 -0500 Subject: [PATCH 07/17] feat: suggest correct options in bpd.options.bigquery.location (#666) * feat: suggest correct options in bpd.options.bigquery.location deps: add jellyfish as a dependency for spelling correction --- bigframes/_config/bigquery_options.py | 11 +++++++++-- setup.py | 2 ++ testing/constraints-3.9.txt | 1 + tests/unit/_config/test_bigquery_options.py | 11 ++++++----- 4 files changed, 18 insertions(+), 7 deletions(-) diff --git a/bigframes/_config/bigquery_options.py b/bigframes/_config/bigquery_options.py index 74561e6f24..6f841a36b3 100644 --- a/bigframes/_config/bigquery_options.py +++ b/bigframes/_config/bigquery_options.py @@ -21,6 +21,7 @@ import google.api_core.exceptions import google.auth.credentials +import jellyfish import bigframes.constants import bigframes.exceptions @@ -30,7 +31,8 @@ "Call bigframes.pandas.close_session() first, if you are using the bigframes.pandas API." ) -UNKNOWN_LOCATION_MESSAGE = "The location '{location}' is set to an unknown value." + +UNKNOWN_LOCATION_MESSAGE = "The location '{location}' is set to an unknown value. Did you mean '{possibility}'?" def _validate_location(value: Optional[str]): @@ -39,8 +41,13 @@ def _validate_location(value: Optional[str]): return if value not in bigframes.constants.ALL_BIGQUERY_LOCATIONS: + location = str(value) + possibility = min( + bigframes.constants.ALL_BIGQUERY_LOCATIONS, + key=lambda item: jellyfish.levenshtein_distance(location, item), + ) warnings.warn( - UNKNOWN_LOCATION_MESSAGE.format(location=value), + UNKNOWN_LOCATION_MESSAGE.format(location=location, possibility=possibility), # There are many layers before we get to (possibly) the user's code: # -> bpd.options.bigquery.location = "us-central-1" # -> location.setter diff --git a/setup.py b/setup.py index 2ccf63259c..d5d282d11a 100644 --- a/setup.py +++ b/setup.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. + import io import itertools import os @@ -45,6 +46,7 @@ "google-cloud-resource-manager >=1.10.3", "google-cloud-storage >=2.0.0", "ibis-framework[bigquery] >=8.0.0,<9.0.0dev", + "jellyfish >=0.8.9", # TODO: Relax upper bound once we have fixed `system_prerelease` tests. "pandas >=1.5.0", "pyarrow >=8.0.0", diff --git a/testing/constraints-3.9.txt b/testing/constraints-3.9.txt index f5007ed564..3c51668655 100644 --- a/testing/constraints-3.9.txt +++ b/testing/constraints-3.9.txt @@ -11,6 +11,7 @@ google-cloud-iam==2.12.1 google-cloud-resource-manager==1.10.3 google-cloud-storage==2.0.0 ibis-framework==8.0.0 +jellyfish==0.8.9 pandas==1.5.0 pyarrow==8.0.0 pydata-google-auth==1.8.2 diff --git a/tests/unit/_config/test_bigquery_options.py b/tests/unit/_config/test_bigquery_options.py index 7d9a452f42..b827b0723d 100644 --- a/tests/unit/_config/test_bigquery_options.py +++ b/tests/unit/_config/test_bigquery_options.py @@ -108,24 +108,25 @@ def test_location_set_to_valid_no_warning(valid_location): @pytest.mark.parametrize( [ "invalid_location", + "possibility", ], [ # Test with common mistakes, see article. 
# https://en.wikipedia.org/wiki/Edit_distance#Formal_definition_and_properties # Substitution - ("us-wist-3",), + ("us-wist3", "us-west3"), # Insertion - ("us-central-1",), + ("us-central-1", "us-central1"), # Deletion - ("asia-suth2",), + ("asia-suth2", "asia-south2"), ], ) -def test_location_set_to_invalid_warning(invalid_location): +def test_location_set_to_invalid_warning(invalid_location, possibility): options = bigquery_options.BigQueryOptions() with pytest.warns( bigframes.exceptions.UnknownLocationWarning, match=re.escape( - f"The location '{invalid_location}' is set to an unknown value." + f"The location '{invalid_location}' is set to an unknown value. Did you mean '{possibility}'?" ), ): options.location = invalid_location From 93416ed2f8353c12eb162e21e9bf155312b0ed8c Mon Sep 17 00:00:00 2001 From: Ashley Xu <139821907+ashleyxuu@users.noreply.github.com> Date: Thu, 9 May 2024 13:20:20 -0700 Subject: [PATCH 08/17] docs: add code snippets for llm text generatiion (#669) * docs: add code snippets for llm text generatiion --- samples/snippets/text_generation_test.py | 68 ++++++++++++++++++++++++ 1 file changed, 68 insertions(+) create mode 100644 samples/snippets/text_generation_test.py diff --git a/samples/snippets/text_generation_test.py b/samples/snippets/text_generation_test.py new file mode 100644 index 0000000000..c4df1dde3b --- /dev/null +++ b/samples/snippets/text_generation_test.py @@ -0,0 +1,68 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +def test_llm_text_generation() -> None: + # Determine project id, in this case prefer the one set in the environment + # variable GOOGLE_CLOUD_PROJECT (if any) + import os + + PROJECT_ID = os.getenv("GOOGLE_CLOUD_PROJECT", "bigframes-dev") + LOCATION = "US" + + # [START bigquery_dataframes_generate_text_tutorial_create_remote_model] + import bigframes + from bigframes.ml.llm import PaLM2TextGenerator + + bigframes.options.bigquery.project = PROJECT_ID + bigframes.options.bigquery.location = LOCATION + + model = PaLM2TextGenerator() + # [END bigquery_dataframes_generate_text_tutorial_create_remote_model] + assert model is not None + + # [START bigquery_dataframes_generate_text_tutorial_perform_keyword_extraction] + import bigframes.pandas as bpd + + df = bpd.read_gbq("bigquery-public-data.imdb.reviews", max_results=5) + df_prompt_prefix = "Extract the key words from the text below: " + df_prompt = df_prompt_prefix + df["review"] + + # Predict using the model + df_pred = model.predict(df_prompt, temperature=0.2, max_output_tokens=100) + df_pred.peek(5) + # [END bigquery_dataframes_generate_text_tutorial_perform_keyword_extraction] + # peek() is used to show a preview of the results. If the output + # of this sample changes, also update the screenshot for the associated + # tutorial on cloud.google.com. 
+ assert df_pred["ml_generate_text_llm_result"] is not None + assert df_pred["ml_generate_text_llm_result"].iloc[0] is not None + + # [START bigquery_dataframes_generate_text_tutorial_perform_sentiment_analysis] + import bigframes.pandas as bpd + + df = bpd.read_gbq("bigquery-public-data.imdb.reviews", max_results=5) + df_prompt_prefix = "perform sentiment analysis on the following text, return one the following categories: positive, negative: " + df_prompt = df_prompt_prefix + df["review"] + + # Predict using the model + df_pred = model.predict(df_prompt, temperature=0.2, max_output_tokens=100) + df_pred.peek(5) + # [END bigquery_dataframes_generate_text_tutorial_perform_sentiment_analysis] + # peek() is used to show a preview of the results. If the output + # of this sample changes, also update the screenshot for the associated + # tutorial on cloud.google.com. + + assert df_pred["ml_generate_text_llm_result"] is not None + assert df_pred["ml_generate_text_llm_result"].iloc[0] is not None From 2218c21b5bb0f9e54a365ba1ada0203cbc4c9efc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Thu, 9 May 2024 21:17:57 -0500 Subject: [PATCH 09/17] feat: add `Series.case_when()` (#673) * feat: add `Series.case_when()` * rename to ScalarOp * rename to exprs * add type annotations feat: add `DataFrame.__delitem__` (#673) docs: add logistic regression samples (#673) --- bigframes/core/__init__.py | 17 ++- bigframes/core/blocks.py | 9 ++ bigframes/dataframe.py | 4 + bigframes/operations/__init__.py | 51 ++++--- bigframes/operations/base.py | 23 ++- bigframes/series.py | 19 +++ .../logistic_regression_prediction_test.py | 137 ++++++++++++++++++ tests/system/small/test_series.py | 30 ++++ .../bigframes_vendored/pandas/core/series.py | 55 +++++++ 9 files changed, 311 insertions(+), 34 deletions(-) create mode 100644 samples/snippets/logistic_regression_prediction_test.py diff --git a/bigframes/core/__init__.py b/bigframes/core/__init__.py index eef0efcf83..79c6bb6495 100644 --- a/bigframes/core/__init__.py +++ b/bigframes/core/__init__.py @@ -16,6 +16,7 @@ from dataclasses import dataclass import functools import io +import itertools import typing from typing import Iterable, Sequence @@ -370,14 +371,16 @@ def unpivot( for col_id, input_ids in unpivot_columns: # row explode offset used to choose the input column # we use offset instead of label as labels are not necessarily unique - cases = tuple( - ( - ops.eq_op.as_expr(explode_offsets_id, ex.const(i)), - ex.free_var(id_or_null) - if (id_or_null is not None) - else ex.const(None), + cases = itertools.chain( + *( + ( + ops.eq_op.as_expr(explode_offsets_id, ex.const(i)), + ex.free_var(id_or_null) + if (id_or_null is not None) + else ex.const(None), + ) + for i, id_or_null in enumerate(input_ids) ) - for i, id_or_null in enumerate(input_ids) ) col_expr = ops.case_when_op.as_expr(*cases) unpivot_exprs.append((col_expr, col_id)) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 402581eb6f..277409f3a3 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -803,6 +803,15 @@ def apply_ternary_op( expr = op.as_expr(col_id_1, col_id_2, col_id_3) return self.project_expr(expr, result_label) + def apply_nary_op( + self, + columns: Iterable[str], + op: ops.NaryOp, + result_label: Label = None, + ) -> typing.Tuple[Block, str]: + expr = op.as_expr(*columns) + return self.project_expr(expr, result_label) + def multi_apply_window_op( self, columns: typing.Sequence[str], diff --git a/bigframes/dataframe.py 
b/bigframes/dataframe.py index 1f1fb5467f..47730630e3 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -655,6 +655,10 @@ def _repr_html_(self) -> str: html_string += f"[{row_count} rows x {column_count} columns in total]" return html_string + def __delitem__(self, key: str): + df = self.drop(columns=[key]) + self._set_block(df._get_block()) + def __setitem__(self, key: str, value: SingleItemValue): df = self._assign_single_item(key, value) self._set_block(df._get_block()) diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py index a7c385a2b8..e52f488d38 100644 --- a/bigframes/operations/__init__.py +++ b/bigframes/operations/__init__.py @@ -17,7 +17,7 @@ import dataclasses import functools import typing -from typing import Tuple, Union +from typing import Union import numpy as np import pandas as pd @@ -46,7 +46,7 @@ def order_preserving(self) -> bool: @dataclasses.dataclass(frozen=True) -class NaryOp: +class ScalarOp: @property def name(self) -> str: raise NotImplementedError("RowOp abstract base class has no implementation") @@ -60,10 +60,30 @@ def order_preserving(self) -> bool: return False +@dataclasses.dataclass(frozen=True) +class NaryOp(ScalarOp): + def as_expr( + self, + *exprs: Union[str | bigframes.core.expression.Expression], + ) -> bigframes.core.expression.Expression: + import bigframes.core.expression + + # Keep this in sync with output_type and compilers + inputs: list[bigframes.core.expression.Expression] = [] + + for expr in exprs: + inputs.append(_convert_expr_input(expr)) + + return bigframes.core.expression.OpExpression( + self, + tuple(inputs), + ) + + # These classes can be used to create simple ops that don't take local parameters # All is needed is a unique name, and to register an implementation in ibis_mappings.py @dataclasses.dataclass(frozen=True) -class UnaryOp(NaryOp): +class UnaryOp(ScalarOp): @property def arguments(self) -> int: return 1 @@ -79,7 +99,7 @@ def as_expr( @dataclasses.dataclass(frozen=True) -class BinaryOp(NaryOp): +class BinaryOp(ScalarOp): @property def arguments(self) -> int: return 2 @@ -101,7 +121,7 @@ def as_expr( @dataclasses.dataclass(frozen=True) -class TernaryOp(NaryOp): +class TernaryOp(ScalarOp): @property def arguments(self) -> int: return 3 @@ -655,27 +675,6 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT output_expr_types, ) - def as_expr( - self, - *case_output_pairs: Tuple[ - Union[str | bigframes.core.expression.Expression], - Union[str | bigframes.core.expression.Expression], - ], - ) -> bigframes.core.expression.Expression: - import bigframes.core.expression - - # Keep this in sync with output_type and compilers - inputs: list[bigframes.core.expression.Expression] = [] - - for case, output in case_output_pairs: - inputs.append(_convert_expr_input(case)) - inputs.append(_convert_expr_input(output)) - - return bigframes.core.expression.OpExpression( - self, - tuple(inputs), - ) - case_when_op = CaseWhenOp() diff --git a/bigframes/operations/base.py b/bigframes/operations/base.py index b003ce59cc..75d14f3fbc 100644 --- a/bigframes/operations/base.py +++ b/bigframes/operations/base.py @@ -15,6 +15,7 @@ from __future__ import annotations import typing +from typing import List, Sequence import bigframes_vendored.pandas.pandas._typing as vendored_pandas_typing import numpy @@ -205,6 +206,21 @@ def _apply_binary_op( block, result_id = self._block.project_expr(expr, name) return series.Series(block.select_column(result_id)) + def _apply_nary_op( + 
self, + op: ops.NaryOp, + others: Sequence[typing.Union[series.Series, scalars.Scalar]], + ignore_self=False, + ): + """Applies an n-ary operator to the series and others.""" + values, block = self._align_n(others, ignore_self=ignore_self) + block, result_id = block.apply_nary_op( + values, + op, + self._name, + ) + return series.Series(block.select_column(result_id)) + def _apply_binary_aggregation( self, other: series.Series, stat: agg_ops.BinaryAggregateOp ) -> float: @@ -226,8 +242,13 @@ def _align_n( self, others: typing.Sequence[typing.Union[series.Series, scalars.Scalar]], how="outer", + ignore_self=False, ) -> tuple[typing.Sequence[str], blocks.Block]: - value_ids = [self._value_column] + if ignore_self: + value_ids: List[str] = [] + else: + value_ids = [self._value_column] + block = self._block for other in others: if isinstance(other, series.Series): diff --git a/bigframes/series.py b/bigframes/series.py index aea3d60ff5..ce13d205bd 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -410,6 +410,25 @@ def between(self, left, right, inclusive="both"): self._apply_binary_op(right, right_op) ) + def case_when(self, caselist) -> Series: + return self._apply_nary_op( + ops.case_when_op, + tuple( + itertools.chain( + itertools.chain(*caselist), + # Fallback to current value if no other matches. + ( + # We make a Series with a constant value to avoid casts to + # types other than boolean. + Series(True, index=self.index, dtype=pandas.BooleanDtype()), + self, + ), + ), + ), + # Self is already included in "others". + ignore_self=True, + ) + def cumsum(self) -> Series: return self._apply_window_op( agg_ops.sum_op, bigframes.core.window_spec.WindowSpec(following=0) diff --git a/samples/snippets/logistic_regression_prediction_test.py b/samples/snippets/logistic_regression_prediction_test.py new file mode 100644 index 0000000000..6a40369ba8 --- /dev/null +++ b/samples/snippets/logistic_regression_prediction_test.py @@ -0,0 +1,137 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""BigQuery DataFrames code samples for +https://cloud.google.com/bigquery/docs/logistic-regression-prediction. 
+""" + + +def test_logistic_regression_prediction(random_model_id: str) -> None: + your_model_id = random_model_id + + # [START bigquery_dataframes_logistic_regression_prediction_examine] + import bigframes.pandas as bpd + + df = bpd.read_gbq( + "bigquery-public-data.ml_datasets.census_adult_income", + columns=( + "age", + "workclass", + "marital_status", + "education_num", + "occupation", + "hours_per_week", + "income_bracket", + "functional_weight", + ), + max_results=100, + ) + df.peek() + # Output: + # age workclass marital_status education_num occupation hours_per_week income_bracket functional_weight + # 47 Local-gov Married-civ-spouse 13 Prof-specialty 40 >50K 198660 + # 56 Private Never-married 9 Adm-clerical 40 <=50K 85018 + # 40 Private Married-civ-spouse 12 Tech-support 40 >50K 285787 + # 34 Self-emp-inc Married-civ-spouse 9 Craft-repair 54 >50K 207668 + # 23 Private Married-civ-spouse 10 Handlers-cleaners 40 <=50K 40060 + # [END bigquery_dataframes_logistic_regression_prediction_examine] + + # [START bigquery_dataframes_logistic_regression_prediction_prepare] + import bigframes.pandas as bpd + + input_data = bpd.read_gbq( + "bigquery-public-data.ml_datasets.census_adult_income", + columns=( + "age", + "workclass", + "marital_status", + "education_num", + "occupation", + "hours_per_week", + "income_bracket", + "functional_weight", + ), + ) + input_data["dataframe"] = bpd.Series("training", index=input_data.index,).case_when( + [ + (((input_data["functional_weight"] % 10) == 8), "evaluation"), + (((input_data["functional_weight"] % 10) == 9), "prediction"), + ] + ) + del input_data["functional_weight"] + # [END bigquery_dataframes_logistic_regression_prediction_prepare] + + # [START bigquery_dataframes_logistic_regression_prediction_create_model] + import bigframes.ml.linear_model + + # input_data is defined in an earlier step. + training_data = input_data[input_data["dataframe"] == "training"] + X = training_data.drop(columns=["income_bracket", "dataframe"]) + y = training_data["income_bracket"] + + census_model = bigframes.ml.linear_model.LogisticRegression() + census_model.fit(X, y) + + census_model.to_gbq( + your_model_id, # For example: "your-project.census.census_model" + replace=True, + ) + # [END bigquery_dataframes_logistic_regression_prediction_create_model] + + # [START bigquery_dataframes_logistic_regression_prediction_evaluate_model] + # Select model you'll use for predictions. `read_gbq_model` loads model + # data from BigQuery, but you could also use the `census_model` object + # from previous steps. + census_model = bpd.read_gbq_model( + your_model_id, # For example: "your-project.census.census_model" + ) + + # input_data is defined in an earlier step. + evaluation_data = input_data[input_data["dataframe"] == "evaluation"] + X = evaluation_data.drop(columns=["income_bracket", "dataframe"]) + y = evaluation_data["income_bracket"] + + # The score() method evaluates how the model performs compared to the + # actual data. Output DataFrame matches that of ML.EVALUATE(). + score = census_model.score(X, y) + score.peek() + # Output: + # precision recall accuracy f1_score log_loss roc_auc + # 0 0.685764 0.536685 0.83819 0.602134 0.350417 0.882953 + # [END bigquery_dataframes_logistic_regression_prediction_evaluate_model] + + # [START bigquery_dataframes_logistic_regression_prediction_predict_income_bracket] + # Select model you'll use for predictions. 
`read_gbq_model` loads model + # data from BigQuery, but you could also use the `census_model` object + # from previous steps. + census_model = bpd.read_gbq_model( + your_model_id, # For example: "your-project.census.census_model" + ) + + # input_data is defined in an earlier step. + prediction_data = input_data[input_data["dataframe"] == "prediction"] + + predictions = census_model.predict(prediction_data) + predictions.peek() + # Output: + # predicted_income_bracket predicted_income_bracket_probs age workclass ... occupation hours_per_week income_bracket dataframe + # 18004 <=50K [{'label': ' >50K', 'prob': 0.0763305999358786... 75 ? ... ? 6 <=50K prediction + # 18886 <=50K [{'label': ' >50K', 'prob': 0.0448866871906495... 73 ? ... ? 22 >50K prediction + # 31024 <=50K [{'label': ' >50K', 'prob': 0.0362982319421936... 69 ? ... ? 1 <=50K prediction + # 31022 <=50K [{'label': ' >50K', 'prob': 0.0787836112058324... 75 ? ... ? 5 <=50K prediction + # 23295 <=50K [{'label': ' >50K', 'prob': 0.3385373037905673... 78 ? ... ? 32 <=50K prediction + # [END bigquery_dataframes_logistic_regression_prediction_predict_income_bracket] + + # TODO(tswast): Implement ML.EXPLAIN_PREDICT() and corresponding sample. + # TODO(tswast): Implement ML.GLOBAL_EXPLAIN() and corresponding sample. diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index 38aed19f05..beb99b1ada 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -2565,6 +2565,36 @@ def test_between(scalars_df_index, scalars_pandas_df_index, left, right, inclusi ) +def test_case_when(scalars_df_index, scalars_pandas_df_index): + pytest.importorskip( + "pandas", + minversion="2.2.0", + reason="case_when added in pandas 2.2.0", + ) + + bf_series = scalars_df_index["int64_col"] + pd_series = scalars_pandas_df_index["int64_col"] + + # TODO(tswast): pandas case_when appears to assume True when a value is + # null. I suspect this should be considered a bug in pandas. + bf_result = bf_series.case_when( + [ + ((bf_series > 100).fillna(True), 1000), + ((bf_series < -100).fillna(True), -1000), + ] + ).to_pandas() + pd_result = pd_series.case_when( + [ + (pd_series > 100, 1000), + (pd_series < -100, -1000), + ] + ) + pd.testing.assert_series_equal( + bf_result, + pd_result.astype(pd.Int64Dtype()), + ) + + def test_to_frame(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index 4833c41ff7..e155fb073a 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -6,10 +6,12 @@ from typing import ( Hashable, IO, + List, Literal, Mapping, Optional, Sequence, + Tuple, TYPE_CHECKING, Union, ) @@ -1937,6 +1939,59 @@ def between( """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def case_when( + self, + caselist: List[Tuple[Series, Series]], + ) -> Series: + """Replace values where the conditions are True. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> c = bpd.Series([6, 7, 8, 9], name="c") + >>> a = bpd.Series([0, 0, 1, 2]) + >>> b = bpd.Series([0, 3, 4, 5]) + + >>> c.case_when( + ... caselist=[ + ... (a.gt(0), a), # condition, replacement + ... (b.gt(0), b), + ... ] + ... ) + 0 6 + 1 3 + 2 1 + 3 2 + Name: c, dtype: Int64 + + **See also:** + + - :func:`bigframes.series.Series.mask` : Replace values where the condition is True. 
+ + Args: + caselist: + A list of tuples of conditions and expected replacements + Takes the form: ``(condition0, replacement0)``, + ``(condition1, replacement1)``, ... . + ``condition`` should be a 1-D boolean array-like object + or a callable. If ``condition`` is a callable, + it is computed on the Series + and should return a boolean Series or array. + The callable must not change the input Series + (though pandas doesn`t check it). ``replacement`` should be a + 1-D array-like object, a scalar or a callable. + If ``replacement`` is a callable, it is computed on the Series + and should return a scalar or Series. The callable + must not change the input Series + (though pandas doesn`t check it). + + Returns: + bigframes.series.Series + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def cumprod(self): """ Return cumulative product over a DataFrame or Series axis. From f2ed29cba8866508d1c68e45818c275b99732333 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Fri, 10 May 2024 09:21:52 -0700 Subject: [PATCH 10/17] refactor: Distinguish between range and row windows (#672) --- bigframes/core/block_transforms.py | 49 +++++----- bigframes/core/blocks.py | 7 +- bigframes/core/compile/compiled.py | 42 ++++++--- bigframes/core/groupby/__init__.py | 27 +++--- bigframes/core/reshape/__init__.py | 6 +- bigframes/core/window_spec.py | 129 +++++++++++++++++++++++++-- bigframes/dataframe.py | 27 +++--- bigframes/operations/aggregations.py | 44 +++++---- bigframes/series.py | 24 ++--- 9 files changed, 249 insertions(+), 106 deletions(-) diff --git a/bigframes/core/block_transforms.py b/bigframes/core/block_transforms.py index a221b343a5..e12e6bf054 100644 --- a/bigframes/core/block_transforms.py +++ b/bigframes/core/block_transforms.py @@ -71,21 +71,19 @@ def indicate_duplicates( if keep == "first": # Count how many copies occur up to current copy of value # Discard this value if there are copies BEFORE - window_spec = windows.WindowSpec( + window_spec = windows.cumulative_rows( grouping_keys=tuple(columns), - following=0, ) elif keep == "last": # Count how many copies occur up to current copy of values # Discard this value if there are copies AFTER - window_spec = windows.WindowSpec( + window_spec = windows.inverse_cumulative_rows( grouping_keys=tuple(columns), - preceding=0, ) else: # keep == False # Count how many copies of the value occur in entire series. 
# Discard this value if there are copies ANYWHERE - window_spec = windows.WindowSpec(grouping_keys=tuple(columns)) + window_spec = windows.unbound(grouping_keys=tuple(columns)) block, dummy = block.create_constant(1) block, val_count_col_id = block.apply_window_op( dummy, @@ -114,7 +112,7 @@ def quantile( dropna: bool = False, ) -> blocks.Block: # TODO: handle windowing and more interpolation methods - window = core.WindowSpec( + window = windows.unbound( grouping_keys=tuple(grouping_column_ids), ) quantile_cols = [] @@ -212,8 +210,8 @@ def _interpolate_column( if interpolate_method not in ["linear", "nearest", "ffill"]: raise ValueError("interpolate method not supported") window_ordering = (ordering.OrderingExpression(ex.free_var(x_values)),) - backwards_window = windows.WindowSpec(following=0, ordering=window_ordering) - forwards_window = windows.WindowSpec(preceding=0, ordering=window_ordering) + backwards_window = windows.rows(following=0, ordering=window_ordering) + forwards_window = windows.rows(preceding=0, ordering=window_ordering) # Note, this method may block, notnull = block.apply_unary_op(column, ops.notnull_op) @@ -364,7 +362,7 @@ def value_counts( ) count_id = agg_ids[0] if normalize: - unbound_window = windows.WindowSpec() + unbound_window = windows.unbound() block, total_count_id = block.apply_window_op( count_id, agg_ops.sum_op, unbound_window ) @@ -388,7 +386,7 @@ def value_counts( def pct_change(block: blocks.Block, periods: int = 1) -> blocks.Block: column_labels = block.column_labels - window_spec = windows.WindowSpec( + window_spec = windows.rows( preceding=periods if periods > 0 else None, following=-periods if periods < 0 else None, ) @@ -430,23 +428,22 @@ def rank( ops.isnull_op, ) nullity_col_ids.append(nullity_col_id) - window = windows.WindowSpec( - # BigQuery has syntax to reorder nulls with "NULLS FIRST/LAST", but that is unavailable through ibis presently, so must order on a separate nullity expression first. 
- ordering=( - ordering.OrderingExpression( - ex.free_var(col), - ordering.OrderingDirection.ASC - if ascending - else ordering.OrderingDirection.DESC, - na_last=(na_option in ["bottom", "keep"]), - ), + window_ordering = ( + ordering.OrderingExpression( + ex.free_var(col), + ordering.OrderingDirection.ASC + if ascending + else ordering.OrderingDirection.DESC, + na_last=(na_option in ["bottom", "keep"]), ), ) # Count_op ignores nulls, so if na_option is "top" or "bottom", we instead count the nullity columns, where nulls have been mapped to bools block, rownum_id = block.apply_window_op( col if na_option == "keep" else nullity_col_id, agg_ops.dense_rank_op if method == "dense" else agg_ops.count_op, - window_spec=window, + window_spec=windows.unbound(ordering=window_ordering) + if method == "dense" + else windows.rows(following=0, ordering=window_ordering), skip_reproject_unsafe=(col != columns[-1]), ) rownum_col_ids.append(rownum_id) @@ -464,7 +461,7 @@ def rank( block, result_id = block.apply_window_op( rownum_col_ids[i], agg_op, - window_spec=windows.WindowSpec(grouping_keys=(columns[i],)), + window_spec=windows.unbound(grouping_keys=(columns[i],)), skip_reproject_unsafe=(i < (len(columns) - 1)), ) post_agg_rownum_col_ids.append(result_id) @@ -528,7 +525,7 @@ def nsmallest( block, counter = block.apply_window_op( column_ids[0], agg_ops.rank_op, - window_spec=windows.WindowSpec(ordering=tuple(order_refs)), + window_spec=windows.unbound(ordering=tuple(order_refs)), ) block, condition = block.project_expr(ops.le_op.as_expr(counter, ex.const(n))) block = block.filter_by_id(condition) @@ -558,7 +555,7 @@ def nlargest( block, counter = block.apply_window_op( column_ids[0], agg_ops.rank_op, - window_spec=windows.WindowSpec(ordering=tuple(order_refs)), + window_spec=windows.unbound(ordering=tuple(order_refs)), ) block, condition = block.project_expr(ops.le_op.as_expr(counter, ex.const(n))) block = block.filter_by_id(condition) @@ -653,7 +650,7 @@ def _mean_delta_to_power( grouping_column_ids: typing.Sequence[str], ) -> typing.Tuple[blocks.Block, typing.Sequence[str]]: """Calculate (x-mean(x))^n. 
Useful for calculating moment statistics such as skew and kurtosis.""" - window = windows.WindowSpec(grouping_keys=tuple(grouping_column_ids)) + window = windows.unbound(grouping_keys=tuple(grouping_column_ids)) block, mean_ids = block.multi_apply_window_op(column_ids, agg_ops.mean_op, window) delta_ids = [] for val_id, mean_val_id in zip(column_ids, mean_ids): @@ -845,7 +842,7 @@ def _idx_extrema( for idx_col in original_block.index_columns ], ] - window_spec = windows.WindowSpec(ordering=tuple(order_refs)) + window_spec = windows.unbound(ordering=tuple(order_refs)) idx_col = original_block.index_columns[0] block, result_col = block.apply_window_op( idx_col, agg_ops.first_op, window_spec diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 277409f3a3..2b2803b649 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -47,6 +47,7 @@ import bigframes.core.tree_properties as tree_properties import bigframes.core.utils import bigframes.core.utils as utils +import bigframes.core.window_spec as window_specs import bigframes.dtypes import bigframes.features import bigframes.operations as ops @@ -816,7 +817,7 @@ def multi_apply_window_op( self, columns: typing.Sequence[str], op: agg_ops.WindowOp, - window_spec: core.WindowSpec, + window_spec: window_specs.WindowSpec, *, skip_null_groups: bool = False, never_skip_nulls: bool = False, @@ -875,7 +876,7 @@ def apply_window_op( self, column: str, op: agg_ops.WindowOp, - window_spec: core.WindowSpec, + window_spec: window_specs.WindowSpec, *, result_label: Label = None, skip_null_groups: bool = False, @@ -2029,7 +2030,7 @@ def _is_monotonic( return self._stats_cache[column_name][op_name] period = 1 - window = bigframes.core.WindowSpec( + window = window_specs.rows( preceding=period, following=None, ) diff --git a/bigframes/core/compile/compiled.py b/bigframes/core/compile/compiled.py index d14a5d3241..cc1d6baaa1 100644 --- a/bigframes/core/compile/compiled.py +++ b/bigframes/core/compile/compiled.py @@ -40,7 +40,7 @@ OrderingExpression, ) import bigframes.core.schema as schemata -from bigframes.core.window_spec import WindowSpec +from bigframes.core.window_spec import RangeWindowBounds, RowsWindowBounds, WindowSpec import bigframes.dtypes import bigframes.operations.aggregations as agg_ops @@ -735,7 +735,9 @@ def project_window_op( skip_reproject_unsafe: skips the reprojection step, can be used when performing many non-dependent window operations, user responsible for not nesting window expressions, or using outputs as join, filter or aggregation keys before a reprojection """ column = typing.cast(ibis_types.Column, self._get_ibis_column(column_name)) - window = self._ibis_window_from_spec(window_spec, allow_ties=op.handles_ties) + window = self._ibis_window_from_spec( + window_spec, require_total_order=op.uses_total_row_ordering + ) bindings = {col: self._get_ibis_column(col) for col in self.column_ids} window_op = agg_compiler.compile_analytic( @@ -1162,7 +1164,9 @@ def _create_string_ordering_column(self) -> ibis_types.StringColumn: def _compile_expression(self, expr: ex.Expression): return op_compiler.compile_expression(expr, self._ibis_bindings) - def _ibis_window_from_spec(self, window_spec: WindowSpec, allow_ties: bool = False): + def _ibis_window_from_spec( + self, window_spec: WindowSpec, require_total_order: bool + ): group_by: typing.List[ibis_types.Value] = ( [ typing.cast( @@ -1175,26 +1179,40 @@ def _ibis_window_from_spec(self, window_spec: WindowSpec, allow_ties: bool = Fal ) if self._reduced_predicate 
is not None: group_by.append(self._reduced_predicate) + + # Construct ordering. There are basically 3 main cases + # 1. Order-independent op (aggregation, cut, rank) with unbound window - no ordering clause needed + # 2. Order-independent op (aggregation, cut, rank) with range window - use ordering clause, ties allowed + # 3. Order-depedenpent op (navigation functions, array_agg) or rows bounds - use total row order to break ties. if window_spec.ordering: order_by = _convert_ordering_to_table_values( {**self._column_names, **self._hidden_ordering_column_names}, window_spec.ordering, ) - if not allow_ties: - # Most operator need an unambiguous ordering, so the table's total ordering is appended + if require_total_order or isinstance(window_spec.bounds, RowsWindowBounds): + # Some operators need an unambiguous ordering, so the table's total ordering is appended order_by = tuple([*order_by, *self._ibis_order]) - elif (window_spec.following is not None) or (window_spec.preceding is not None): + elif isinstance(window_spec.bounds, RowsWindowBounds): # If window spec has following or preceding bounds, we need to apply an unambiguous ordering. order_by = tuple(self._ibis_order) else: # Unbound grouping window. Suitable for aggregations but not for analytic function application. order_by = None - return ibis.window( - preceding=window_spec.preceding, - following=window_spec.following, - order_by=order_by, - group_by=group_by, - ) + + bounds = window_spec.bounds + window = ibis.window(order_by=order_by, group_by=group_by) + if bounds is not None: + if isinstance(bounds, RangeWindowBounds): + window = window.preceding_following( + bounds.preceding, bounds.following, how="range" + ) + if isinstance(bounds, RowsWindowBounds): + window = window.preceding_following( + bounds.preceding, bounds.following, how="rows" + ) + else: + raise ValueError(f"unrecognized window bounds {bounds}") + return window class Builder: def __init__( diff --git a/bigframes/core/groupby/__init__.py b/bigframes/core/groupby/__init__.py index 05b1cc7f41..41d0750030 100644 --- a/bigframes/core/groupby/__init__.py +++ b/bigframes/core/groupby/__init__.py @@ -28,6 +28,7 @@ import bigframes.core.ordering as order import bigframes.core.utils as utils import bigframes.core.window as windows +import bigframes.core.window_spec as window_specs import bigframes.dataframe as df import bigframes.dtypes as dtypes import bigframes.operations.aggregations as agg_ops @@ -217,7 +218,7 @@ def cumprod(self, *args, **kwargs) -> df.DataFrame: return self._apply_window_op(agg_ops.product_op, numeric_only=True) def shift(self, periods=1) -> series.Series: - window = core.WindowSpec( + window = window_specs.rows( grouping_keys=tuple(self._by_col_ids), preceding=periods if periods > 0 else None, following=-periods if periods < 0 else None, @@ -225,7 +226,7 @@ def shift(self, periods=1) -> series.Series: return self._apply_window_op(agg_ops.ShiftOp(periods), window=window) def diff(self, periods=1) -> series.Series: - window = core.WindowSpec( + window = window_specs.rows( grouping_keys=tuple(self._by_col_ids), preceding=periods if periods > 0 else None, following=-periods if periods < 0 else None, @@ -234,7 +235,7 @@ def diff(self, periods=1) -> series.Series: def rolling(self, window: int, min_periods=None) -> windows.Window: # To get n size window, need current row and n-1 preceding rows. 
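        # For orientation, the three WindowSpec factory shapes this file now
        # relies on (all defined in bigframes/core/window_spec.py in this
        # patch) are, roughly:
        #
        #     window_specs.rows(preceding=window - 1, following=0)   # fixed-size rolling frame
        #     window_specs.cumulative_rows(min_periods=min_periods)  # all preceding rows plus the current one
        #     window_specs.unbound()                                 # whole group, no frame bounds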
- window_spec = core.WindowSpec( + window_spec = window_specs.rows( grouping_keys=tuple(self._by_col_ids), preceding=window - 1, following=0, @@ -248,9 +249,8 @@ def rolling(self, window: int, min_periods=None) -> windows.Window: ) def expanding(self, min_periods: int = 1) -> windows.Window: - window_spec = core.WindowSpec( + window_spec = window_specs.cumulative_rows( grouping_keys=tuple(self._by_col_ids), - following=0, min_periods=min_periods, ) block = self._block.order_by( @@ -424,8 +424,8 @@ def _apply_window_op( numeric_only: bool = False, ): """Apply window op to groupby. Defaults to grouped cumulative window.""" - window_spec = window or core.WindowSpec( - grouping_keys=tuple(self._by_col_ids), following=0 + window_spec = window or window_specs.cumulative_rows( + grouping_keys=tuple(self._by_col_ids) ) columns = self._aggregated_columns(numeric_only=numeric_only) block, result_ids = self._block.multi_apply_window_op( @@ -594,7 +594,7 @@ def cumcount(self, *args, **kwargs) -> series.Series: def shift(self, periods=1) -> series.Series: """Shift index by desired number of periods.""" - window = core.WindowSpec( + window = window_specs.rows( grouping_keys=tuple(self._by_col_ids), preceding=periods if periods > 0 else None, following=-periods if periods < 0 else None, @@ -602,7 +602,7 @@ def shift(self, periods=1) -> series.Series: return self._apply_window_op(agg_ops.ShiftOp(periods), window=window) def diff(self, periods=1) -> series.Series: - window = core.WindowSpec( + window = window_specs.rows( grouping_keys=tuple(self._by_col_ids), preceding=periods if periods > 0 else None, following=-periods if periods < 0 else None, @@ -611,7 +611,7 @@ def diff(self, periods=1) -> series.Series: def rolling(self, window: int, min_periods=None) -> windows.Window: # To get n size window, need current row and n-1 preceding rows. - window_spec = core.WindowSpec( + window_spec = window_specs.rows( grouping_keys=tuple(self._by_col_ids), preceding=window - 1, following=0, @@ -629,9 +629,8 @@ def rolling(self, window: int, min_periods=None) -> windows.Window: ) def expanding(self, min_periods: int = 1) -> windows.Window: - window_spec = core.WindowSpec( + window_spec = window_specs.cumulative_rows( grouping_keys=tuple(self._by_col_ids), - following=0, min_periods=min_periods, ) block = self._block.order_by( @@ -661,8 +660,8 @@ def _apply_window_op( window: typing.Optional[core.WindowSpec] = None, ): """Apply window op to groupby. 
Defaults to grouped cumulative window.""" - window_spec = window or core.WindowSpec( - grouping_keys=tuple(self._by_col_ids), following=0 + window_spec = window or window_specs.cumulative_rows( + grouping_keys=tuple(self._by_col_ids) ) label = self._value_name if not discard_name else None diff --git a/bigframes/core/reshape/__init__.py b/bigframes/core/reshape/__init__.py index 6bcc25319b..05cb5c7e94 100644 --- a/bigframes/core/reshape/__init__.py +++ b/bigframes/core/reshape/__init__.py @@ -19,10 +19,10 @@ import pandas as pd import bigframes.constants as constants -import bigframes.core as core import bigframes.core.expression as ex import bigframes.core.ordering as order import bigframes.core.utils as utils +import bigframes.core.window_spec as window_specs import bigframes.dataframe import bigframes.operations as ops import bigframes.operations.aggregations as agg_ops @@ -159,7 +159,7 @@ def cut( ) return x._apply_window_op( - agg_ops.CutOp(bins, labels=labels), window_spec=core.WindowSpec() + agg_ops.CutOp(bins, labels=labels), window_spec=window_specs.unbound() ) @@ -189,7 +189,7 @@ def qcut( block, result = block.apply_window_op( x._value_column, agg_ops.QcutOp(q), # type: ignore - window_spec=core.WindowSpec( + window_spec=window_specs.unbound( grouping_keys=(nullity_id,), ordering=(order.ascending_over(x._value_column),), ), diff --git a/bigframes/core/window_spec.py b/bigframes/core/window_spec.py index b02f13d333..71e88a4c3d 100644 --- a/bigframes/core/window_spec.py +++ b/bigframes/core/window_spec.py @@ -11,13 +11,133 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import annotations from dataclasses import dataclass -import typing +from typing import Optional, Tuple, Union import bigframes.core.ordering as orderings +# Unbound Windows +def unbound( + grouping_keys: Tuple[str, ...] = (), + min_periods: int = 0, + ordering: Tuple[orderings.OrderingExpression, ...] = (), +) -> WindowSpec: + """ + Create an unbound window. + + Args: + grouping_keys: + Columns ids of grouping keys + min_periods (int, default 0): + Minimum number of input rows to generate output. + ordering: + Orders the rows within the window. + + Returns: + WindowSpec + """ + return WindowSpec( + grouping_keys=grouping_keys, min_periods=min_periods, ordering=ordering + ) + + +### Rows-based Windows +def rows( + grouping_keys: Tuple[str, ...] = (), + preceding: Optional[int] = None, + following: Optional[int] = None, + min_periods: int = 0, + ordering: Tuple[orderings.OrderingExpression, ...] = (), +) -> WindowSpec: + """ + Create a row-bounded window. + + Args: + grouping_keys: + Columns ids of grouping keys + preceding: + number of preceding rows to include. If None, include all preceding rows + following: + number of following rows to include. If None, include all following rows + min_periods (int, default 0): + Minimum number of input rows to generate output. + ordering: + Ordering to apply on top of based dataframe ordering + Returns: + WindowSpec + """ + assert (preceding is not None) or (following is not None) + bounds = RowsWindowBounds(preceding=preceding, following=following) + return WindowSpec( + grouping_keys=grouping_keys, + bounds=bounds, + min_periods=min_periods, + ordering=ordering, + ) + + +def cumulative_rows( + grouping_keys: Tuple[str, ...] 
= (), min_periods: int = 0 +) -> WindowSpec: + """ + Create a expanding window that includes all preceding rows + + Args: + grouping_keys: + Columns ids of grouping keys + min_periods (int, default 0): + Minimum number of input rows to generate output. + Returns: + WindowSpec + """ + bounds = RowsWindowBounds(following=0) + return WindowSpec( + grouping_keys=grouping_keys, bounds=bounds, min_periods=min_periods + ) + + +def inverse_cumulative_rows( + grouping_keys: Tuple[str, ...] = (), min_periods: int = 0 +) -> WindowSpec: + """ + Create a shrinking window that includes all following rows + + Args: + grouping_keys: + Columns ids of grouping keys + min_periods (int, default 0): + Minimum number of input rows to generate output. + Returns: + WindowSpec + """ + bounds = RowsWindowBounds(preceding=0) + return WindowSpec( + grouping_keys=grouping_keys, bounds=bounds, min_periods=min_periods + ) + + +### Struct Classes + + +@dataclass(frozen=True) +class RowsWindowBounds: + preceding: Optional[int] = None + following: Optional[int] = None + + +# TODO: Expand to datetime offsets +OffsetType = Union[float, int] + + +@dataclass(frozen=True) +class RangeWindowBounds: + preceding: Optional[OffsetType] = None + following: Optional[OffsetType] = None + + @dataclass(frozen=True) class WindowSpec: """ @@ -28,8 +148,7 @@ class WindowSpec: ordering: List of columns ids and ordering direction to override base ordering """ - grouping_keys: typing.Tuple[str, ...] = tuple() - ordering: typing.Tuple[orderings.OrderingExpression, ...] = tuple() - preceding: typing.Optional[int] = None - following: typing.Optional[int] = None + grouping_keys: Tuple[str, ...] = tuple() + ordering: Tuple[orderings.OrderingExpression, ...] = tuple() + bounds: Union[RowsWindowBounds, RangeWindowBounds, None] = None min_periods: int = 0 diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 47730630e3..5be28acf53 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -59,6 +59,7 @@ import bigframes.core.ordering as order import bigframes.core.utils as utils import bigframes.core.window +import bigframes.core.window_spec as window_spec import bigframes.dtypes import bigframes.formatting_helpers as formatter import bigframes.operations as ops @@ -1874,11 +1875,11 @@ def replace( ) def ffill(self, *, limit: typing.Optional[int] = None) -> DataFrame: - window = bigframes.core.WindowSpec(preceding=limit, following=0) + window = window_spec.rows(preceding=limit, following=0) return self._apply_window_op(agg_ops.LastNonNullOp(), window) def bfill(self, *, limit: typing.Optional[int] = None) -> DataFrame: - window = bigframes.core.WindowSpec(preceding=0, following=limit) + window = window_spec.rows(preceding=0, following=limit) return self._apply_window_op(agg_ops.FirstNonNullOp(), window) def isin(self, values) -> DataFrame: @@ -2574,17 +2575,17 @@ def _perform_join_by_index( def rolling(self, window: int, min_periods=None) -> bigframes.core.window.Window: # To get n size window, need current row and n-1 preceding rows. 
- window_spec = bigframes.core.WindowSpec( + window_def = window_spec.rows( preceding=window - 1, following=0, min_periods=min_periods or window ) return bigframes.core.window.Window( - self._block, window_spec, self._block.value_columns + self._block, window_def, self._block.value_columns ) def expanding(self, min_periods: int = 1) -> bigframes.core.window.Window: - window_spec = bigframes.core.WindowSpec(following=0, min_periods=min_periods) + window = window_spec.cumulative_rows(min_periods=min_periods) return bigframes.core.window.Window( - self._block, window_spec, self._block.value_columns + self._block, window, self._block.value_columns ) def groupby( @@ -2691,7 +2692,7 @@ def cumsum(self): raise ValueError("All values must be numeric to apply cumsum.") return self._apply_window_op( agg_ops.sum_op, - bigframes.core.WindowSpec(following=0), + window_spec.cumulative_rows(), ) def cumprod(self) -> DataFrame: @@ -2703,30 +2704,30 @@ def cumprod(self) -> DataFrame: raise ValueError("All values must be numeric to apply cumsum.") return self._apply_window_op( agg_ops.product_op, - bigframes.core.WindowSpec(following=0), + window_spec.cumulative_rows(), ) def cummin(self) -> DataFrame: return self._apply_window_op( agg_ops.min_op, - bigframes.core.WindowSpec(following=0), + window_spec.cumulative_rows(), ) def cummax(self) -> DataFrame: return self._apply_window_op( agg_ops.max_op, - bigframes.core.WindowSpec(following=0), + window_spec.cumulative_rows(), ) def shift(self, periods: int = 1) -> DataFrame: - window = bigframes.core.WindowSpec( + window = window_spec.rows( preceding=periods if periods > 0 else None, following=-periods if periods < 0 else None, ) return self._apply_window_op(agg_ops.ShiftOp(periods), window) def diff(self, periods: int = 1) -> DataFrame: - window = bigframes.core.WindowSpec( + window = window_spec.rows( preceding=periods if periods > 0 else None, following=-periods if periods < 0 else None, ) @@ -2740,7 +2741,7 @@ def pct_change(self, periods: int = 1) -> DataFrame: def _apply_window_op( self, op: agg_ops.WindowOp, - window_spec: bigframes.core.WindowSpec, + window_spec: window_spec.WindowSpec, ): block, result_ids = self._block.multi_apply_window_op( self._block.value_columns, diff --git a/bigframes/operations/aggregations.py b/bigframes/operations/aggregations.py index 0d27d1d75d..c57fac4112 100644 --- a/bigframes/operations/aggregations.py +++ b/bigframes/operations/aggregations.py @@ -34,8 +34,8 @@ def skips_nulls(self): return True @property - def handles_ties(self): - """Whether the operator can handle ties without nondeterministic output. (eg. rank operator can handle ties but not the count operator)""" + def uses_total_row_ordering(self): + """Whether the operator needs total row ordering. (eg. 
lead, lag, array_agg)""" return False @abc.abstractmethod @@ -232,10 +232,6 @@ class CutOp(UnaryWindowOp): def skips_nulls(self): return False - @property - def handles_ties(self): - return True - def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: if isinstance(self.bins, int) and (self.labels is False): return dtypes.INT_DTYPE @@ -267,10 +263,6 @@ def name(self): def skips_nulls(self): return False - @property - def handles_ties(self): - return True - def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: return signatures.FixedOutputType( dtypes.is_orderable, dtypes.INT_DTYPE, "orderable" @@ -308,10 +300,6 @@ class RankOp(UnaryWindowOp): def skips_nulls(self): return False - @property - def handles_ties(self): - return True - def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: return signatures.FixedOutputType( dtypes.is_orderable, dtypes.INT_DTYPE, "orderable" @@ -324,10 +312,6 @@ class DenseRankOp(UnaryWindowOp): def skips_nulls(self): return False - @property - def handles_ties(self): - return True - def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: return signatures.FixedOutputType( dtypes.is_orderable, dtypes.INT_DTYPE, "orderable" @@ -338,9 +322,17 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT class FirstOp(UnaryWindowOp): name: ClassVar[str] = "first" + @property + def uses_total_row_ordering(self): + return True + @dataclasses.dataclass(frozen=True) class FirstNonNullOp(UnaryWindowOp): + @property + def uses_total_row_ordering(self): + return True + @property def skips_nulls(self): return False @@ -350,9 +342,17 @@ def skips_nulls(self): class LastOp(UnaryWindowOp): name: ClassVar[str] = "last" + @property + def uses_total_row_ordering(self): + return True + @dataclasses.dataclass(frozen=True) class LastNonNullOp(UnaryWindowOp): + @property + def uses_total_row_ordering(self): + return True + @property def skips_nulls(self): return False @@ -362,6 +362,10 @@ def skips_nulls(self): class ShiftOp(UnaryWindowOp): periods: int + @property + def uses_total_row_ordering(self): + return True + @property def skips_nulls(self): return False @@ -371,6 +375,10 @@ def skips_nulls(self): class DiffOp(UnaryWindowOp): periods: int + @property + def uses_total_row_ordering(self): + return True + @property def skips_nulls(self): return False diff --git a/bigframes/series.py b/bigframes/series.py index ce13d205bd..313380e4a4 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -431,44 +431,44 @@ def case_when(self, caselist) -> Series: def cumsum(self) -> Series: return self._apply_window_op( - agg_ops.sum_op, bigframes.core.window_spec.WindowSpec(following=0) + agg_ops.sum_op, bigframes.core.window_spec.cumulative_rows() ) def ffill(self, *, limit: typing.Optional[int] = None) -> Series: - window = bigframes.core.window_spec.WindowSpec(preceding=limit, following=0) + window = bigframes.core.window_spec.rows(preceding=limit, following=0) return self._apply_window_op(agg_ops.LastNonNullOp(), window) pad = ffill pad.__doc__ = inspect.getdoc(vendored_pandas_series.Series.ffill) def bfill(self, *, limit: typing.Optional[int] = None) -> Series: - window = bigframes.core.window_spec.WindowSpec(preceding=0, following=limit) + window = bigframes.core.window_spec.rows(preceding=0, following=limit) return self._apply_window_op(agg_ops.FirstNonNullOp(), window) def cummax(self) -> Series: return self._apply_window_op( - 
agg_ops.max_op, bigframes.core.window_spec.WindowSpec(following=0) + agg_ops.max_op, bigframes.core.window_spec.cumulative_rows() ) def cummin(self) -> Series: return self._apply_window_op( - agg_ops.min_op, bigframes.core.window_spec.WindowSpec(following=0) + agg_ops.min_op, bigframes.core.window_spec.cumulative_rows() ) def cumprod(self) -> Series: return self._apply_window_op( - agg_ops.product_op, bigframes.core.window_spec.WindowSpec(following=0) + agg_ops.product_op, bigframes.core.window_spec.cumulative_rows() ) def shift(self, periods: int = 1) -> Series: - window = bigframes.core.window_spec.WindowSpec( + window = bigframes.core.window_spec.rows( preceding=periods if periods > 0 else None, following=-periods if periods < 0 else None, ) return self._apply_window_op(agg_ops.ShiftOp(periods), window) def diff(self, periods: int = 1) -> Series: - window = bigframes.core.window_spec.WindowSpec( + window = bigframes.core.window_spec.rows( preceding=periods if periods > 0 else None, following=-periods if periods < 0 else None, ) @@ -955,7 +955,7 @@ def mode(self) -> Series: block, max_value_count_col_id = block.apply_window_op( value_count_col_id, agg_ops.max_op, - window_spec=bigframes.core.window_spec.WindowSpec(), + window_spec=bigframes.core.window_spec.unbound(), ) block, is_mode_col_id = block.apply_binary_op( value_count_col_id, @@ -1226,7 +1226,7 @@ def sort_index(self, *, axis=0, ascending=True, na_position="last") -> Series: def rolling(self, window: int, min_periods=None) -> bigframes.core.window.Window: # To get n size window, need current row and n-1 preceding rows. - window_spec = bigframes.core.window_spec.WindowSpec( + window_spec = bigframes.core.window_spec.rows( preceding=window - 1, following=0, min_periods=min_periods or window ) return bigframes.core.window.Window( @@ -1234,8 +1234,8 @@ def rolling(self, window: int, min_periods=None) -> bigframes.core.window.Window ) def expanding(self, min_periods: int = 1) -> bigframes.core.window.Window: - window_spec = bigframes.core.window_spec.WindowSpec( - following=0, min_periods=min_periods + window_spec = bigframes.core.window_spec.cumulative_rows( + min_periods=min_periods ) return bigframes.core.window.Window( self._block, window_spec, self._block.value_columns, is_series=True From f6bdc4aeb3f81a1e0b955521c04ac0dd22981c76 Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Fri, 10 May 2024 19:50:35 +0000 Subject: [PATCH 11/17] feat: Support `axis=1` in `df.apply` for scalar outputs (#629) * feat: Support `axis=1` in `df.apply` for scalar outputs * avoid mixing other changes in the input_types param * use guid instead of hard coded column name * check_exact=False to avoid failing system_prerelease * handle index in remote function, add large system tests * make the test case more robust * handle non-string column names, add unsupported dtype tests * fix import * use `_cached` in df.apply to catch any rf execution errors early * add test for row aggregates * add row dtype information, also test * preserve the order of input in the output * absorb to_numpy() disparity in prerelease tests * add tests for column multiindex and non remote function * add preview note for row processing * add warning for input_types="row" and axis=1 * introduce early check on the supported dtypes * asjust test after early dtype handling * address review comments * user NameError for column name parsing issue, address test coverage failure * address nan return handling in the gcf code * handle (nan, inf, -inf) * replace "row" by bpd.Series for 
input types * make the bq parity assert more readable * fix the series name before assert * fix docstring for args * move more low level string logic in sql module * raise explicit error when a column name cannot be supported * keep literal_eval check on the serialization side to match deserialization --- bigframes/core/blocks.py | 101 ++++++- bigframes/core/sql.py | 59 +++++ bigframes/dataframe.py | 56 +++- bigframes/exceptions.py | 4 + bigframes/functions/remote_function.py | 191 ++++++++++--- bigframes/session/__init__.py | 8 +- tests/system/large/test_remote_function.py | 250 ++++++++++++++++++ tests/system/small/test_remote_function.py | 146 ++++++++++ .../bigframes_vendored/pandas/core/frame.py | 29 +- 9 files changed, 792 insertions(+), 52 deletions(-) create mode 100644 bigframes/core/sql.py diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 2b2803b649..58b8515418 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -21,11 +21,13 @@ from __future__ import annotations +import ast import dataclasses import functools import itertools import os import random +import textwrap import typing from typing import Iterable, List, Literal, Mapping, Optional, Sequence, Tuple, Union import warnings @@ -44,8 +46,8 @@ import bigframes.core.join_def as join_defs import bigframes.core.ordering as ordering import bigframes.core.schema as bf_schema +import bigframes.core.sql as sql import bigframes.core.tree_properties as tree_properties -import bigframes.core.utils import bigframes.core.utils as utils import bigframes.core.window_spec as window_specs import bigframes.dtypes @@ -1437,9 +1439,7 @@ def promote_offsets(self, label: Label = None) -> typing.Tuple[Block, str]: ) def add_prefix(self, prefix: str, axis: str | int | None = None) -> Block: - axis_number = bigframes.core.utils.get_axis_number( - "rows" if (axis is None) else axis - ) + axis_number = utils.get_axis_number("rows" if (axis is None) else axis) if axis_number == 0: expr = self._expr for index_col in self._index_columns: @@ -1460,9 +1460,7 @@ def add_prefix(self, prefix: str, axis: str | int | None = None) -> Block: return self.rename(columns=lambda label: f"{prefix}{label}") def add_suffix(self, suffix: str, axis: str | int | None = None) -> Block: - axis_number = bigframes.core.utils.get_axis_number( - "rows" if (axis is None) else axis - ) + axis_number = utils.get_axis_number("rows" if (axis is None) else axis) if axis_number == 0: expr = self._expr for index_col in self._index_columns: @@ -2072,6 +2070,95 @@ def _is_monotonic( self._stats_cache[column_name].update({op_name: result}) return result + def _get_rows_as_json_values(self) -> Block: + # We want to preserve any ordering currently present before turning to + # direct SQL manipulation. We will restore the ordering when we rebuild + # expression. + # TODO(shobs): Replace direct SQL manipulation by structured expression + # manipulation + ordering_column_name = guid.generate_guid() + expr = self.session._cache_with_offsets(self.expr) + expr = expr.promote_offsets(ordering_column_name) + expr_sql = self.session._to_sql(expr) + + # Names of the columns to serialize for the row. + # We will use the repr-eval pattern to serialize a value here and + # deserialize in the cloud function. Let's make sure that would work. 
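        # A minimal sketch of the round trip assumed here: repr() on this side
        # and ast.literal_eval() in the cloud function reconstruct literal-style
        # column names exactly, e.g.
        #
        #     import ast
        #     for name in ["2", 2, (3, 4), (5.0, "six", 7)]:
        #         assert ast.literal_eval(repr(name)) == name
        #
        # A name whose repr() is not a Python literal (for example a
        # datetime.datetime column label) fails ast.literal_eval and is
        # rejected with the NameError raised below.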
+ column_names = [] + for col in list(self.index_columns) + [col for col in self.column_labels]: + serialized_column_name = repr(col) + try: + ast.literal_eval(serialized_column_name) + except Exception: + raise NameError( + f"Column name type '{type(col).__name__}' is not supported for row serialization." + " Please consider using a name for which literal_eval(repr(name)) works." + ) + + column_names.append(serialized_column_name) + column_names_csv = sql.csv(column_names, quoted=True) + + # index columns count + index_columns_count = len(self.index_columns) + + # column references to form the array of values for the row + column_references_csv = sql.csv( + [sql.cast_as_string(col) for col in self.expr.column_ids] + ) + + # types of the columns to serialize for the row + column_types = list(self.index.dtypes) + list(self.dtypes) + column_types_csv = sql.csv([str(typ) for typ in column_types], quoted=True) + + # row dtype to use for deserializing the row as pandas series + pandas_row_dtype = bigframes.dtypes.lcd_type(*column_types) + if pandas_row_dtype is None: + pandas_row_dtype = "object" + pandas_row_dtype = sql.quote(str(pandas_row_dtype)) + + # create a json column representing row through SQL manipulation + row_json_column_name = guid.generate_guid() + select_columns = ( + [ordering_column_name] + list(self.index_columns) + [row_json_column_name] + ) + select_columns_csv = sql.csv( + [sql.column_reference(col) for col in select_columns] + ) + json_sql = f"""\ +With T0 AS ( +{textwrap.indent(expr_sql, " ")} +), +T1 AS ( + SELECT *, + JSON_OBJECT( + "names", [{column_names_csv}], + "types", [{column_types_csv}], + "values", [{column_references_csv}], + "indexlength", {index_columns_count}, + "dtype", {pandas_row_dtype} + ) AS {row_json_column_name} FROM T0 +) +SELECT {select_columns_csv} FROM T1 +""" + ibis_table = self.session.ibis_client.sql(json_sql) + order_for_ibis_table = ordering.ExpressionOrdering.from_offset_col( + ordering_column_name + ) + expr = core.ArrayValue.from_ibis( + self.session, + ibis_table, + [ibis_table[col] for col in select_columns if col != ordering_column_name], + hidden_ordering_columns=[ibis_table[ordering_column_name]], + ordering=order_for_ibis_table, + ) + block = Block( + expr, + index_columns=self.index_columns, + column_labels=[row_json_column_name], + index_labels=self._index_labels, + ) + return block + class BlockIndexProperties: """Accessor for the index-related block properties.""" diff --git a/bigframes/core/sql.py b/bigframes/core/sql.py new file mode 100644 index 0000000000..31ee5f9064 --- /dev/null +++ b/bigframes/core/sql.py @@ -0,0 +1,59 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Utility functions for SQL construction. +""" + +from typing import Iterable + + +def quote(value: str): + """Return quoted input string.""" + + # Let's use repr which also escapes any special characters + # + # >>> for val in [ + # ... "123", + # ... "str with no special chars", + # ... 
"str with special chars.,'\"/\\" + # ... ]: + # ... print(f"{val} -> {repr(val)}") + # ... + # 123 -> '123' + # str with no special chars -> 'str with no special chars' + # str with special chars.,'"/\ -> 'str with special chars.,\'"/\\' + + return repr(value) + + +def column_reference(column_name: str): + """Return a string representing column reference in a SQL.""" + + return f"`{column_name}`" + + +def cast_as_string(column_name: str): + """Return a string representing string casting of a column.""" + + return f"CAST({column_reference(column_name)} AS STRING)" + + +def csv(values: Iterable[str], quoted=False): + """Return a string of comma separated values.""" + + if quoted: + values = [quote(val) for val in values] + + return ", ".join(values) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 5be28acf53..d3fd39afa7 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -34,6 +34,7 @@ Tuple, Union, ) +import warnings import bigframes_vendored.pandas.core.frame as vendored_pandas_frame import bigframes_vendored.pandas.pandas._typing as vendored_pandas_typing @@ -61,6 +62,7 @@ import bigframes.core.window import bigframes.core.window_spec as window_spec import bigframes.dtypes +import bigframes.exceptions import bigframes.formatting_helpers as formatter import bigframes.operations as ops import bigframes.operations.aggregations as agg_ops @@ -3308,7 +3310,59 @@ def map(self, func, na_action: Optional[str] = None) -> DataFrame: ops.RemoteFunctionOp(func=func, apply_on_null=(na_action is None)) ) - def apply(self, func, *, args: typing.Tuple = (), **kwargs): + def apply(self, func, *, axis=0, args: typing.Tuple = (), **kwargs): + if utils.get_axis_number(axis) == 1: + warnings.warn( + "axis=1 scenario is in preview.", + category=bigframes.exceptions.PreviewWarning, + ) + + # Early check whether the dataframe dtypes are currently supported + # in the remote function + # NOTE: Keep in sync with the value converters used in the gcf code + # generated in generate_cloud_function_main_code in remote_function.py + remote_function_supported_dtypes = ( + bigframes.dtypes.INT_DTYPE, + bigframes.dtypes.FLOAT_DTYPE, + bigframes.dtypes.BOOL_DTYPE, + bigframes.dtypes.STRING_DTYPE, + ) + supported_dtypes_types = tuple( + type(dtype) for dtype in remote_function_supported_dtypes + ) + supported_dtypes_hints = tuple( + str(dtype) for dtype in remote_function_supported_dtypes + ) + + for dtype in self.dtypes: + if not isinstance(dtype, supported_dtypes_types): + raise NotImplementedError( + f"DataFrame has a column of dtype '{dtype}' which is not supported with axis=1." + f" Supported dtypes are {supported_dtypes_hints}." 
+ ) + + # Check if the function is a remote function + if not hasattr(func, "bigframes_remote_function"): + raise ValueError("For axis=1 a remote function must be used.") + + # Serialize the rows as json values + block = self._get_block() + rows_as_json_series = bigframes.series.Series( + block._get_rows_as_json_values() + ) + + # Apply the function + result_series = rows_as_json_series._apply_unary_op( + ops.RemoteFunctionOp(func=func, apply_on_null=True) + ) + result_series.name = None + + # Return Series with materialized result so that any error in the remote + # function is caught early + materialized_series = result_series.cache() + return materialized_series + + # Per-column apply results = {name: func(col, *args, **kwargs) for name, col in self.items()} if all( [ diff --git a/bigframes/exceptions.py b/bigframes/exceptions.py index 3ca6d8e1af..eae021b4cd 100644 --- a/bigframes/exceptions.py +++ b/bigframes/exceptions.py @@ -33,3 +33,7 @@ class CleanupFailedWarning(Warning): class DefaultIndexWarning(Warning): """Default index may cause unexpected costs.""" + + +class PreviewWarning(Warning): + """The feature is in preview.""" diff --git a/bigframes/functions/remote_function.py b/bigframes/functions/remote_function.py index 9d826d0fa1..6e42ca9f48 100644 --- a/bigframes/functions/remote_function.py +++ b/bigframes/functions/remote_function.py @@ -25,8 +25,10 @@ import tempfile import textwrap from typing import List, NamedTuple, Optional, Sequence, TYPE_CHECKING, Union +import warnings import ibis +import pandas import requests if TYPE_CHECKING: @@ -262,7 +264,7 @@ def generate_udf_code(self, def_, dir): return udf_code_file_name, udf_bytecode_file_name - def generate_cloud_function_main_code(self, def_, dir): + def generate_cloud_function_main_code(self, def_, dir, is_row_processor=False): """Get main.py code for the cloud function for the given user defined function.""" # Pickle the udf with all its dependencies @@ -285,38 +287,120 @@ def generate_cloud_function_main_code(self, def_, dir): # ... 
# } # https://cloud.google.com/bigquery/docs/reference/standard-sql/remote-functions#input_format - code_template = textwrap.dedent( - """\ - import cloudpickle - import functions_framework - from flask import jsonify - import json - - # original udf code is in {udf_code_file} - # serialized udf code is in {udf_bytecode_file} - with open("{udf_bytecode_file}", "rb") as f: - udf = cloudpickle.load(f) - - def {handler_func_name}(request): - try: - request_json = request.get_json(silent=True) - calls = request_json["calls"] - replies = [] - for call in calls: - reply = udf(*call) - replies.append(reply) - return_json = json.dumps({{"replies" : replies}}) - return return_json - except Exception as e: - return jsonify( {{ "errorMessage": str(e) }} ), 400 - """ - ) - - code = code_template.format( - udf_code_file=udf_code_file, - udf_bytecode_file=udf_bytecode_file, - handler_func_name=handler_func_name, - ) + code = """\ +import cloudpickle +import functions_framework +from flask import jsonify +import json +""" + if is_row_processor: + code += """\ +import ast +import math +import pandas as pd + +def get_pd_series(row): + row_json = json.loads(row) + col_names = row_json["names"] + col_types = row_json["types"] + col_values = row_json["values"] + index_length = row_json["indexlength"] + dtype = row_json["dtype"] + + # At this point we are assuming that col_names, col_types and col_values are + # arrays of the same length, representing column names, types and values for + # one row of data + + # column names are not necessarily strings + # they are serialized as repr(name) at source + evaluated_col_names = [] + for col_name in col_names: + try: + col_name = ast.literal_eval(col_name) + except Exception as ex: + raise NameError(f"Failed to evaluate column name from '{col_name}': {ex}") + evaluated_col_names.append(col_name) + col_names = evaluated_col_names + + # Supported converters for pandas to python types + value_converters = { + "boolean": lambda val: val == "true", + "Int64": int, + "Float64": float, + "string": str, + } + + def convert_value(value, value_type): + value_converter = value_converters.get(value_type) + if value_converter is None: + raise ValueError(f"Don't know how to handle type '{value_type}'") + if value is None: + return None + return value_converter(value) + + index_values = [ + pd.Series([convert_value(col_values[i], col_types[i])], dtype=col_types[i])[0] + for i in range(index_length) + ] + + data_col_names = col_names[index_length:] + data_col_types = col_types[index_length:] + data_col_values = col_values[index_length:] + data_col_values = [ + pd.Series([convert_value(a, data_col_types[i])], dtype=data_col_types[i])[0] + for i, a in enumerate(data_col_values) + ] + + row_index = index_values[0] if len(index_values) == 1 else tuple(index_values) + row_series = pd.Series(data_col_values, index=data_col_names, name=row_index, dtype=dtype) + return row_series +""" + code += f"""\ + +# original udf code is in {udf_code_file} +# serialized udf code is in {udf_bytecode_file} +with open("{udf_bytecode_file}", "rb") as f: + udf = cloudpickle.load(f) + +def {handler_func_name}(request): + try: + request_json = request.get_json(silent=True) + calls = request_json["calls"] + replies = [] + for call in calls: +""" + + if is_row_processor: + code += """\ + reply = udf(get_pd_series(call[0])) + if isinstance(reply, float) and (math.isnan(reply) or math.isinf(reply)): + # json serialization of the special float values (nan, inf, -inf) + # is not in 
strict compliance of the JSON specification + # https://docs.python.org/3/library/json.html#basic-usage. + # Let's convert them to a quoted string representation ("NaN", + # "Infinity", "-Infinity" respectively) which is handled by + # BigQuery + reply = json.dumps(reply) + elif pd.isna(reply): + # Pandas N/A values are not json serializable, so use a python + # equivalent instead + reply = None + elif hasattr(reply, "item"): + # Numpy types are not json serializable, so use its Python + # value instead + reply = reply.item() +""" + else: + code += """\ + reply = udf(*call) +""" + code += """\ + replies.append(reply) + return_json = json.dumps({"replies" : replies}) + return return_json + except Exception as e: + return jsonify( { "errorMessage": str(e) } ), 400 +""" main_py = os.path.join(dir, "main.py") with open(main_py, "w") as f: @@ -325,11 +409,17 @@ def {handler_func_name}(request): return handler_func_name - def generate_cloud_function_code(self, def_, dir, package_requirements=None): + def generate_cloud_function_code( + self, def_, dir, package_requirements=None, is_row_processor=False + ): """Generate the cloud function code for a given user defined function.""" # requirements.txt requirements = ["cloudpickle >= 2.1.0"] + if is_row_processor: + # bigframes remote function will send an entire row of data as json, + # which would be converted to a pandas series and processed + requirements.append(f"pandas=={pandas.__version__}") if package_requirements: requirements.extend(package_requirements) requirements = sorted(requirements) @@ -338,7 +428,9 @@ def generate_cloud_function_code(self, def_, dir, package_requirements=None): f.write("\n".join(requirements)) # main.py - entry_point = self.generate_cloud_function_main_code(def_, dir) + entry_point = self.generate_cloud_function_main_code( + def_, dir, is_row_processor + ) return entry_point def create_cloud_function( @@ -348,13 +440,14 @@ def create_cloud_function( package_requirements=None, timeout_seconds=600, max_instance_count=None, + is_row_processor=False, ): """Create a cloud function from the given user defined function.""" # Build and deploy folder structure containing cloud function with tempfile.TemporaryDirectory() as dir: entry_point = self.generate_cloud_function_code( - def_, dir, package_requirements + def_, dir, package_requirements, is_row_processor ) archive_path = shutil.make_archive(dir, "zip", dir) @@ -474,6 +567,7 @@ def provision_bq_remote_function( max_batching_rows, cloud_function_timeout, cloud_function_max_instance_count, + is_row_processor, ): """Provision a BigQuery remote function.""" # If reuse of any existing function with the same name (indicated by the @@ -500,6 +594,7 @@ def provision_bq_remote_function( package_requirements, cloud_function_timeout, cloud_function_max_instance_count, + is_row_processor, ) else: logger.info(f"Cloud function {cloud_function_name} already exists.") @@ -700,8 +795,9 @@ def remote_function( Args: input_types (type or sequence(type)): - Input data type, or sequence of input data types in the user - defined function. + For scalar user defined function it should be the input type or + sequence of input types. For row processing user defined function, + type `Series` should be specified. output_type (type): Data type of the output in the user defined function. session (bigframes.Session, Optional): @@ -800,9 +896,25 @@ def remote_function( function's default setting applies. 
For more details see https://cloud.google.com/functions/docs/configuring/max-instances """ - if isinstance(input_types, type): + is_row_processor = False + + import bigframes.series + + if input_types == bigframes.series.Series: + warnings.warn( + "input_types=Series scenario is in preview.", + stacklevel=1, + category=bigframes.exceptions.PreviewWarning, + ) + + # we will model the row as a json serialized string containing the data + # and the metadata representing the row + input_types = [str] + is_row_processor = True + elif isinstance(input_types, type): input_types = [input_types] + # Some defaults may be used from the session if not provided otherwise import bigframes.pandas as bpd session = session or bpd.get_global_session() @@ -928,6 +1040,7 @@ def wrapper(f): max_batching_rows, cloud_function_timeout, cloud_function_max_instances, + is_row_processor, ) # TODO: Move ibis logic to compiler step diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 5f70fd77f9..473fc4f098 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -1416,6 +1416,9 @@ def remote_function( """Decorator to turn a user defined function into a BigQuery remote function. Check out the code samples at: https://cloud.google.com/bigquery/docs/remote-functions#bigquery-dataframes. + .. note:: + ``input_types=Series`` scenario is in preview. + .. note:: Please make sure following is setup before using this API: @@ -1455,8 +1458,9 @@ def remote_function( Args: input_types (type or sequence(type)): - Input data type, or sequence of input data types in the user - defined function. + For scalar user defined function it should be the input type or + sequence of input types. For row processing user defined function, + type `Series` should be specified. output_type (type): Data type of the output in the user defined function. dataset (str, Optional): diff --git a/tests/system/large/test_remote_function.py b/tests/system/large/test_remote_function.py index eb2a0884fe..e086903d03 100644 --- a/tests/system/large/test_remote_function.py +++ b/tests/system/large/test_remote_function.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from datetime import datetime import importlib.util import inspect import math # must keep this at top level to test udf referring global import @@ -28,6 +29,7 @@ import bigframes from bigframes.functions.remote_function import get_cloud_function_name +import bigframes.series from tests.system.utils import ( assert_pandas_df_equal, delete_cloud_function, @@ -1454,3 +1456,251 @@ def square(x): cleanup_remote_function_assets( session.bqclient, session.cloudfunctionsclient, square_remote ) + + +@pytest.mark.flaky(retries=2, delay=120) +def test_df_apply_axis_1(session, scalars_dfs): + columns = ["bool_col", "int64_col", "int64_too", "float64_col", "string_col"] + scalars_df, scalars_pandas_df = scalars_dfs + try: + + def serialize_row(row): + custom = { + "name": row.name, + "index": [idx for idx in row.index], + "values": [ + val.item() if hasattr(val, "item") else val for val in row.values + ], + } + + return str( + { + "default": row.to_json(), + "split": row.to_json(orient="split"), + "records": row.to_json(orient="records"), + "index": row.to_json(orient="index"), + "table": row.to_json(orient="table"), + "custom": custom, + } + ) + + serialize_row_remote = session.remote_function( + bigframes.series.Series, str, reuse=False + )(serialize_row) + + bf_result = scalars_df[columns].apply(serialize_row_remote, axis=1).to_pandas() + pd_result = scalars_pandas_df[columns].apply(serialize_row, axis=1) + + # bf_result.dtype is 'string[pyarrow]' while pd_result.dtype is 'object' + # , ignore this mismatch by using check_dtype=False. + pandas.testing.assert_series_equal(pd_result, bf_result, check_dtype=False) + finally: + # clean up the gcp assets created for the remote function + cleanup_remote_function_assets( + session.bqclient, session.cloudfunctionsclient, serialize_row_remote + ) + + +@pytest.mark.flaky(retries=2, delay=120) +def test_df_apply_axis_1_aggregates(session, scalars_dfs): + columns = ["int64_col", "int64_too", "float64_col"] + scalars_df, scalars_pandas_df = scalars_dfs + + try: + + def analyze(row): + return str( + { + "dtype": row.dtype, + "count": row.count(), + "min": row.max(), + "max": row.max(), + "mean": row.mean(), + "std": row.std(), + "var": row.var(), + } + ) + + analyze_remote = session.remote_function(bigframes.series.Series, str)(analyze) + + bf_result = ( + scalars_df[columns].dropna().apply(analyze_remote, axis=1).to_pandas() + ) + pd_result = scalars_pandas_df[columns].dropna().apply(analyze, axis=1) + + # bf_result.dtype is 'string[pyarrow]' while pd_result.dtype is 'object' + # , ignore this mismatch by using check_dtype=False. + pandas.testing.assert_series_equal(pd_result, bf_result, check_dtype=False) + finally: + # clean up the gcp assets created for the remote function + cleanup_remote_function_assets( + session.bqclient, session.cloudfunctionsclient, analyze_remote + ) + + +@pytest.mark.parametrize( + ("pd_df"), + [ + pytest.param( + pandas.DataFrame( + { + "2": [1, 2, 3], + 2: [1.5, 3.75, 5], + "name, [with. 
special'- chars\")/\\": [10, 20, 30], + (3, 4): ["pq", "rs", "tu"], + (5.0, "six", 7): [8, 9, 10], + 'raise Exception("hacked!")': [11, 12, 13], + } + ), + id="all-kinds-of-column-names", + ), + pytest.param( + pandas.DataFrame( + { + "x": [1, 2, 3], + "y": [1.5, 3.75, 5], + "z": ["pq", "rs", "tu"], + }, + index=pandas.MultiIndex.from_tuples( + [ + ("a", 100), + ("a", 200), + ("b", 300), + ] + ), + ), + id="multiindex", + ), + pytest.param( + pandas.DataFrame( + [ + [10, 1.5, "pq"], + [20, 3.75, "rs"], + [30, 8.0, "tu"], + ], + columns=pandas.MultiIndex.from_arrays( + [ + ["first", "last_two", "last_two"], + [1, 2, 3], + ] + ), + ), + id="column-multiindex", + ), + pytest.param( + pandas.DataFrame( + { + datetime.now(): [1, 2, 3], + } + ), + id="column-name-not-supported", + marks=pytest.mark.xfail(raises=NameError), + ), + ], +) +@pytest.mark.flaky(retries=2, delay=120) +def test_df_apply_axis_1_complex(session, pd_df): + bf_df = session.read_pandas(pd_df) + + try: + + def serialize_row(row): + custom = { + "name": row.name, + "index": [idx for idx in row.index], + "values": [ + val.item() if hasattr(val, "item") else val for val in row.values + ], + } + return str( + { + "default": row.to_json(), + "split": row.to_json(orient="split"), + "records": row.to_json(orient="records"), + "index": row.to_json(orient="index"), + "custom": custom, + } + ) + + serialize_row_remote = session.remote_function( + bigframes.series.Series, str, reuse=False + )(serialize_row) + + bf_result = bf_df.apply(serialize_row_remote, axis=1).to_pandas() + pd_result = pd_df.apply(serialize_row, axis=1) + + # bf_result.dtype is 'string[pyarrow]' while pd_result.dtype is 'object' + # , ignore this mismatch by using check_dtype=False. + # + # bf_result.index[0].dtype is 'string[pyarrow]' while + # pd_result.index[0].dtype is 'object', ignore this mismatch by using + # check_index_type=False. + pandas.testing.assert_series_equal( + pd_result, bf_result, check_dtype=False, check_index_type=False + ) + finally: + # clean up the gcp assets created for the remote function + cleanup_remote_function_assets( + session.bqclient, session.cloudfunctionsclient, serialize_row_remote + ) + + +@pytest.mark.flaky(retries=2, delay=120) +def test_df_apply_axis_1_na_nan_inf(session): + """This test is for special cases of float values, to make sure any (nan, + inf, -inf) produced by user code is honored. + """ + bf_df = session.read_gbq( + """\ +SELECT "1" AS text, 1 AS num +UNION ALL +SELECT "2.5" AS text, 2.5 AS num +UNION ALL +SELECT "nan" AS text, IEEE_DIVIDE(0, 0) AS num +UNION ALL +SELECT "inf" AS text, IEEE_DIVIDE(1, 0) AS num +UNION ALL +SELECT "-inf" AS text, IEEE_DIVIDE(-1, 0) AS num +UNION ALL +SELECT "numpy nan" AS text, IEEE_DIVIDE(0, 0) AS num +UNION ALL +SELECT "pandas na" AS text, NULL AS num + """ + ) + + pd_df = bf_df.to_pandas() + + try: + + def float_parser(row): + import numpy as mynp + import pandas as mypd + + if row["text"] == "pandas na": + return mypd.NA + if row["text"] == "numpy nan": + return mynp.nan + return float(row["text"]) + + float_parser_remote = session.remote_function( + bigframes.series.Series, float, reuse=False + )(float_parser) + + pd_result = pd_df.apply(float_parser, axis=1) + bf_result = bf_df.apply(float_parser_remote, axis=1).to_pandas() + + # bf_result.dtype is 'Float64' while pd_result.dtype is 'object' + # , ignore this mismatch by using check_dtype=False. 
+ pandas.testing.assert_series_equal(pd_result, bf_result, check_dtype=False) + + # Let's also assert that the data is consistent in this round trip + # (BQ -> BigFrames -> BQ -> GCF -> BQ -> BigFrames) w.r.t. their + # expected values in BQ + bq_result = bf_df["num"].to_pandas() + bq_result.name = None + pandas.testing.assert_series_equal(bq_result, bf_result) + finally: + # clean up the gcp assets created for the remote function + cleanup_remote_function_assets( + session.bqclient, session.cloudfunctionsclient, float_parser_remote + ) diff --git a/tests/system/small/test_remote_function.py b/tests/system/small/test_remote_function.py index 106638cef3..9c60c821a7 100644 --- a/tests/system/small/test_remote_function.py +++ b/tests/system/small/test_remote_function.py @@ -12,12 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. +import re + import google.api_core.exceptions from google.cloud import bigquery import pandas as pd import pytest import bigframes +import bigframes.exceptions from bigframes.functions import remote_function as rf from tests.system.utils import assert_pandas_df_equal @@ -685,3 +688,146 @@ def test_read_gbq_function_enforces_explicit_types(bigquery_client, dataset_id): rf.read_gbq_function( str(neither_type_specified.reference), bigquery_client=bigquery_client ) + + +@pytest.mark.flaky(retries=2, delay=120) +def test_df_apply_axis_1(session, scalars_dfs): + columns = ["bool_col", "int64_col", "int64_too", "float64_col", "string_col"] + scalars_df, scalars_pandas_df = scalars_dfs + + def add_ints(row): + return row["int64_col"] + row["int64_too"] + + with pytest.warns( + bigframes.exceptions.PreviewWarning, + match="input_types=Series scenario is in preview.", + ): + add_ints_remote = session.remote_function(bigframes.series.Series, int)( + add_ints + ) + + with pytest.warns( + bigframes.exceptions.PreviewWarning, match="axis=1 scenario is in preview." + ): + bf_result = scalars_df[columns].apply(add_ints_remote, axis=1).to_pandas() + + pd_result = scalars_pandas_df[columns].apply(add_ints, axis=1) + + # bf_result.dtype is 'Int64' while pd_result.dtype is 'object', ignore this + # mismatch by using check_dtype=False. + # + # bf_result.to_numpy() produces an array of numpy.float64's + # (in system_prerelease tests), while pd_result.to_numpy() produces an + # array of ints, ignore this mismatch by using check_exact=False. + pd.testing.assert_series_equal( + pd_result, bf_result, check_dtype=False, check_exact=False + ) + + +@pytest.mark.flaky(retries=2, delay=120) +def test_df_apply_axis_1_ordering(session, scalars_dfs): + columns = ["bool_col", "int64_col", "int64_too", "float64_col", "string_col"] + ordering_columns = ["bool_col", "int64_col"] + scalars_df, scalars_pandas_df = scalars_dfs + + def add_ints(row): + return row["int64_col"] + row["int64_too"] + + add_ints_remote = session.remote_function(bigframes.series.Series, int)(add_ints) + + bf_result = ( + scalars_df[columns] + .sort_values(ordering_columns) + .apply(add_ints_remote, axis=1) + .to_pandas() + ) + pd_result = ( + scalars_pandas_df[columns].sort_values(ordering_columns).apply(add_ints, axis=1) + ) + + # bf_result.dtype is 'Int64' while pd_result.dtype is 'object', ignore this + # mismatch by using check_dtype=False. + # + # bf_result.to_numpy() produces an array of numpy.float64's + # (in system_prerelease tests), while pd_result.to_numpy() produces an + # array of ints, ignore this mismatch by using check_exact=False. 
+ pd.testing.assert_series_equal( + pd_result, bf_result, check_dtype=False, check_exact=False + ) + + +@pytest.mark.flaky(retries=2, delay=120) +def test_df_apply_axis_1_multiindex(session): + pd_df = pd.DataFrame( + {"x": [1, 2, 3], "y": [1.5, 3.75, 5], "z": ["pq", "rs", "tu"]}, + index=pd.MultiIndex.from_tuples([("a", 100), ("a", 200), ("b", 300)]), + ) + bf_df = session.read_pandas(pd_df) + + def add_numbers(row): + return row["x"] + row["y"] + + add_numbers_remote = session.remote_function(bigframes.series.Series, float)( + add_numbers + ) + + bf_result = bf_df.apply(add_numbers_remote, axis=1).to_pandas() + pd_result = pd_df.apply(add_numbers, axis=1) + + # bf_result.dtype is 'Float64' while pd_result.dtype is 'float64', ignore this + # mismatch by using check_dtype=False. + # + # bf_result.index[0].dtype is 'string[pyarrow]' while + # pd_result.index[0].dtype is 'object', ignore this mismatch by using + # check_index_type=False. + pd.testing.assert_series_equal( + pd_result, bf_result, check_dtype=False, check_index_type=False + ) + + +def test_df_apply_axis_1_unsupported_callable(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + columns = ["bool_col", "int64_col", "int64_too", "float64_col", "string_col"] + + def add_ints(row): + return row["int64_col"] + row["int64_too"] + + # pandas works + scalars_pandas_df.apply(add_ints, axis=1) + + with pytest.raises(ValueError, match="For axis=1 a remote function must be used."): + scalars_df[columns].apply(add_ints, axis=1) + + +@pytest.mark.parametrize( + ("column"), + [ + pytest.param("bytes_col"), + pytest.param("date_col"), + pytest.param("datetime_col"), + pytest.param("geography_col"), + pytest.param("numeric_col"), + pytest.param("time_col"), + pytest.param("timestamp_col"), + ], +) +def test_df_apply_axis_1_unsupported_dtype(scalars_dfs, column): + scalars_df, scalars_pandas_df = scalars_dfs + + # It doesn't matter if it is a remote function or not, the dtype check + # is done even before the function type check with axis=1 + def echo(row): + return row[column] + + # pandas works + scalars_pandas_df[[column]].apply(echo, axis=1) + + dtype = scalars_df[column].dtype + + with pytest.raises( + NotImplementedError, + match=re.escape( + f"DataFrame has a column of dtype '{dtype}' which is not supported with axis=1. Supported dtypes are ('Int64', 'Float64', 'boolean', 'string')." + ), + ): + scalars_df[[column]].apply(echo, axis=1) diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index 4e17bca54d..31d5e88c7e 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -4200,12 +4200,16 @@ def merge( """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) - def apply(self, func, *, args=(), **kwargs): + def apply(self, func, *, axis=0, args=(), **kwargs): """Apply a function along an axis of the DataFrame. Objects passed to the function are Series objects whose index is - the DataFrame's index (``axis=0``) the final return type - is inferred from the return type of the applied function. + the DataFrame's index (``axis=0``) or the DataFrame's columns (``axis=1``). + The final return type is inferred from the return type of the applied + function. + + .. note:: + ``axis=1`` scenario is in preview. 
**Examples:** @@ -4230,9 +4234,28 @@ def apply(self, func, *, args=(), **kwargs): [2 rows x 2 columns] + You could apply a user defined function to every row of the DataFrame by + creating a remote function out of it, and using it with `axis=1`. + + >>> @bpd.remote_function(bpd.Series, int, reuse=False) + ... def foo(row): + ... result = 1 + ... result += row["col1"] + ... result += row["col2"]*row["col2"] + ... return result + + >>> df.apply(foo, axis=1) + 0 11 + 1 19 + dtype: Int64 + Args: func (function): Function to apply to each column or row. + axis ({index (0), columns (1)}): + Axis along which the function is applied. Specify 0 or 'index' + to apply function to each column. Specify 1 or 'columns' to + apply function to each row. args (tuple): Positional arguments to pass to `func` in addition to the array/series. From 21bd3e4f0be64c20ed8774f1a6e0b79fe40bba5a Mon Sep 17 00:00:00 2001 From: Huan Chen <142538604+Genesis929@users.noreply.github.com> Date: Fri, 10 May 2024 14:40:40 -0700 Subject: [PATCH 12/17] chore: log and labels update (#674) * chore: log and labels update * remove unused logic * Update unit test. * fixes for mypy * lint update --- bigframes/core/log_adapter.py | 16 +++++++++-- bigframes/session/_io/bigquery/__init__.py | 17 +++++++---- .../session/_io/bigquery/read_gbq_table.py | 4 --- tests/unit/session/test_io_bigquery.py | 28 ++++++------------- 4 files changed, 33 insertions(+), 32 deletions(-) diff --git a/bigframes/core/log_adapter.py b/bigframes/core/log_adapter.py index 860d394cd2..b5afafbe7c 100644 --- a/bigframes/core/log_adapter.py +++ b/bigframes/core/log_adapter.py @@ -21,6 +21,9 @@ _api_methods: List = [] _excluded_methods = ["__setattr__", "__getattr__"] +# Stack to track method calls +_call_stack: List = [] + def class_logger(decorated_cls): """Decorator that adds logging functionality to each method of the class.""" @@ -38,10 +41,17 @@ def wrapper(*args, **kwargs): class_name = decorated_cls.__name__ # Access decorated class name api_method_name = str(method.__name__) full_method_name = f"{class_name.lower()}-{api_method_name}" - # Track regular and "dunder" methods - if api_method_name.startswith("__") or not api_method_name.startswith("_"): + + # Track directly called methods + if len(_call_stack) == 0: add_api_method(full_method_name) - return method(*args, **kwargs) + + _call_stack.append(full_method_name) + + try: + return method(*args, **kwargs) + finally: + _call_stack.pop() return wrapper diff --git a/bigframes/session/_io/bigquery/__init__.py b/bigframes/session/_io/bigquery/__init__.py index 98e0dac1e8..ed1bd39ada 100644 --- a/bigframes/session/_io/bigquery/__init__.py +++ b/bigframes/session/_io/bigquery/__init__.py @@ -23,7 +23,7 @@ import textwrap import types import typing -from typing import Dict, Iterable, Mapping, Optional, Sequence, Tuple, Union +from typing import Dict, Iterable, Mapping, Optional, Tuple, Union import bigframes_vendored.pandas.io.gbq as third_party_pandas_gbq import google.api_core.exceptions @@ -43,11 +43,15 @@ def create_job_configs_labels( job_configs_labels: Optional[Dict[str, str]], - api_methods: Sequence[str], + api_methods: typing.List[str], ) -> Dict[str, str]: if job_configs_labels is None: job_configs_labels = {} + if api_methods: + job_configs_labels["bigframes-api"] = api_methods[0] + del api_methods[0] + labels = list( itertools.chain( job_configs_labels.keys(), @@ -198,10 +202,11 @@ def start_query_with_client( """ Starts query job and waits for results. 
""" - api_methods = log_adapter.get_and_reset_api_methods() - job_config.labels = create_job_configs_labels( - job_configs_labels=job_config.labels, api_methods=api_methods - ) + if not job_config.dry_run: + api_methods = log_adapter.get_and_reset_api_methods() + job_config.labels = create_job_configs_labels( + job_configs_labels=job_config.labels, api_methods=api_methods + ) try: query_job = bq_client.query(sql, job_config=job_config, timeout=timeout) diff --git a/bigframes/session/_io/bigquery/read_gbq_table.py b/bigframes/session/_io/bigquery/read_gbq_table.py index f6c1463e6c..0f6a3dadd2 100644 --- a/bigframes/session/_io/bigquery/read_gbq_table.py +++ b/bigframes/session/_io/bigquery/read_gbq_table.py @@ -112,8 +112,6 @@ def get_table_metadata( # atomically. table = bqclient.get_table(table_ref) - # TODO(b/336521938): Refactor to make sure we set the "bigframes-api" - # whereever we execute a query. job_config = bigquery.QueryJobConfig() job_config.labels["bigframes-api"] = api_name snapshot_timestamp = list( @@ -344,8 +342,6 @@ def get_time_travel_datetime_and_table_metadata( # atomically. table = bqclient.get_table(table_ref) - # TODO(b/336521938): Refactor to make sure we set the "bigframes-api" - # whereever we execute a query. job_config = bigquery.QueryJobConfig() job_config.labels["bigframes-api"] = api_name snapshot_timestamp = list( diff --git a/tests/unit/session/test_io_bigquery.py b/tests/unit/session/test_io_bigquery.py index 9da085e824..5f4072e9c2 100644 --- a/tests/unit/session/test_io_bigquery.py +++ b/tests/unit/session/test_io_bigquery.py @@ -30,17 +30,13 @@ def test_create_job_configs_labels_is_none(): labels = io_bq.create_job_configs_labels( job_configs_labels=None, api_methods=api_methods ) - expected_dict = { - "recent-bigframes-api-0": "agg", - "recent-bigframes-api-1": "series-mode", - } + expected_dict = {"bigframes-api": "agg", "recent-bigframes-api-0": "series-mode"} assert labels is not None assert labels == expected_dict def test_create_job_configs_labels_length_limit_not_met(): cur_labels = { - "bigframes-api": "read_pandas", "source": "bigquery-dataframes-temp", } api_methods = ["agg", "series-mode"] @@ -48,20 +44,18 @@ def test_create_job_configs_labels_length_limit_not_met(): job_configs_labels=cur_labels, api_methods=api_methods ) expected_dict = { - "bigframes-api": "read_pandas", "source": "bigquery-dataframes-temp", - "recent-bigframes-api-0": "agg", - "recent-bigframes-api-1": "series-mode", + "bigframes-api": "agg", + "recent-bigframes-api-0": "series-mode", } assert labels is not None - assert len(labels) == 4 + assert len(labels) == 3 assert labels == expected_dict def test_create_job_configs_labels_log_adaptor_call_method_under_length_limit(): log_adapter.get_and_reset_api_methods() cur_labels = { - "bigframes-api": "read_pandas", "source": "bigquery-dataframes-temp", } df = bpd.DataFrame( @@ -76,14 +70,10 @@ def test_create_job_configs_labels_log_adaptor_call_method_under_length_limit(): job_configs_labels=cur_labels, api_methods=api_methods ) expected_dict = { - "bigframes-api": "read_pandas", "source": "bigquery-dataframes-temp", - "recent-bigframes-api-0": "series-__init__", - "recent-bigframes-api-1": "dataframe-max", - "recent-bigframes-api-2": "dataframe-__init__", - "recent-bigframes-api-3": "dataframe-head", - "recent-bigframes-api-4": "dataframe-__init__", - "recent-bigframes-api-5": "dataframe-__init__", + "bigframes-api": "dataframe-max", + "recent-bigframes-api-0": "dataframe-head", + "recent-bigframes-api-1": 
"dataframe-__init__", } assert labels == expected_dict @@ -94,7 +84,7 @@ def test_create_job_configs_labels_length_limit_met_and_labels_is_none(): {"col1": [1, 2], "col2": [3, 4]}, session=resources.create_bigquery_session() ) # Test running methods more than the labels' length limit - for i in range(66): + for i in range(100): df.head() api_methods = log_adapter._api_methods @@ -112,7 +102,7 @@ def test_create_job_configs_labels_length_limit_met(): "bigframes-api": "read_pandas", "source": "bigquery-dataframes-temp", } - for i in range(60): + for i in range(100): key = f"bigframes-api-test-{i}" value = f"test{i}" cur_labels[key] = value From 9ca92d09e9c56db408350b35ec698152c13954ed Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Fri, 10 May 2024 23:54:16 +0000 Subject: [PATCH 13/17] feat: support gcf vpc connector in `remote_function` (#677) --- bigframes/functions/remote_function.py | 14 +++- bigframes/pandas/__init__.py | 2 + bigframes/session/__init__.py | 9 ++- tests/system/large/test_remote_function.py | 75 +++++++++++++++++++++- 4 files changed, 97 insertions(+), 3 deletions(-) diff --git a/bigframes/functions/remote_function.py b/bigframes/functions/remote_function.py index 6e42ca9f48..2a7a900779 100644 --- a/bigframes/functions/remote_function.py +++ b/bigframes/functions/remote_function.py @@ -441,6 +441,7 @@ def create_cloud_function( timeout_seconds=600, max_instance_count=None, is_row_processor=False, + vpc_connector=None, ): """Create a cloud function from the given user defined function.""" @@ -519,6 +520,8 @@ def create_cloud_function( function.service_config.timeout_seconds = timeout_seconds if max_instance_count is not None: function.service_config.max_instance_count = max_instance_count + if vpc_connector is not None: + function.service_config.vpc_connector = vpc_connector function.service_config.service_account_email = ( self._cloud_function_service_account ) @@ -568,6 +571,7 @@ def provision_bq_remote_function( cloud_function_timeout, cloud_function_max_instance_count, is_row_processor, + cloud_function_vpc_connector, ): """Provision a BigQuery remote function.""" # If reuse of any existing function with the same name (indicated by the @@ -595,6 +599,7 @@ def provision_bq_remote_function( cloud_function_timeout, cloud_function_max_instance_count, is_row_processor, + cloud_function_vpc_connector, ) else: logger.info(f"Cloud function {cloud_function_name} already exists.") @@ -750,6 +755,7 @@ def remote_function( max_batching_rows: Optional[int] = 1000, cloud_function_timeout: Optional[int] = 600, cloud_function_max_instances: Optional[int] = None, + cloud_function_vpc_connector: Optional[str] = None, ): """Decorator to turn a user defined function into a BigQuery remote function. @@ -894,7 +900,12 @@ def remote_function( control the spike in the billing. Higher setting can help support processing larger scale data. When not specified, cloud function's default setting applies. For more details see - https://cloud.google.com/functions/docs/configuring/max-instances + https://cloud.google.com/functions/docs/configuring/max-instances. + cloud_function_vpc_connector (str, Optional): + The VPC connector you would like to configure for your cloud + function. This is useful if your code needs access to data or + service(s) that are on a VPC network. See for more details + https://cloud.google.com/functions/docs/networking/connecting-vpc. 
""" is_row_processor = False @@ -1041,6 +1052,7 @@ def wrapper(f): cloud_function_timeout, cloud_function_max_instances, is_row_processor, + cloud_function_vpc_connector, ) # TODO: Move ibis logic to compiler step diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py index 1d6da46fae..8d2c0b148c 100644 --- a/bigframes/pandas/__init__.py +++ b/bigframes/pandas/__init__.py @@ -654,6 +654,7 @@ def remote_function( max_batching_rows: Optional[int] = 1000, cloud_function_timeout: Optional[int] = 600, cloud_function_max_instances: Optional[int] = None, + cloud_function_vpc_connector: Optional[str] = None, ): return global_session.with_default_session( bigframes.session.Session.remote_function, @@ -670,6 +671,7 @@ def remote_function( max_batching_rows=max_batching_rows, cloud_function_timeout=cloud_function_timeout, cloud_function_max_instances=cloud_function_max_instances, + cloud_function_vpc_connector=cloud_function_vpc_connector, ) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 473fc4f098..727269e7ee 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -1412,6 +1412,7 @@ def remote_function( max_batching_rows: Optional[int] = 1000, cloud_function_timeout: Optional[int] = 600, cloud_function_max_instances: Optional[int] = None, + cloud_function_vpc_connector: Optional[str] = None, ): """Decorator to turn a user defined function into a BigQuery remote function. Check out the code samples at: https://cloud.google.com/bigquery/docs/remote-functions#bigquery-dataframes. @@ -1537,7 +1538,12 @@ def remote_function( control the spike in the billing. Higher setting can help support processing larger scale data. When not specified, cloud function's default setting applies. For more details see - https://cloud.google.com/functions/docs/configuring/max-instances + https://cloud.google.com/functions/docs/configuring/max-instances. + cloud_function_vpc_connector (str, Optional): + The VPC connector you would like to configure for your cloud + function. This is useful if your code needs access to data or + service(s) that are on a VPC network. See for more details + https://cloud.google.com/functions/docs/networking/connecting-vpc. Returns: callable: A remote function object pointing to the cloud assets created in the background to support the remote execution. The cloud assets can be @@ -1562,6 +1568,7 @@ def remote_function( max_batching_rows=max_batching_rows, cloud_function_timeout=cloud_function_timeout, cloud_function_max_instances=cloud_function_max_instances, + cloud_function_vpc_connector=cloud_function_vpc_connector, ) def read_gbq_function( diff --git a/tests/system/large/test_remote_function.py b/tests/system/large/test_remote_function.py index e086903d03..b7d99ea36c 100644 --- a/tests/system/large/test_remote_function.py +++ b/tests/system/large/test_remote_function.py @@ -21,7 +21,7 @@ import tempfile import textwrap -from google.api_core.exceptions import BadRequest, NotFound +from google.api_core.exceptions import BadRequest, InvalidArgument, NotFound from google.cloud import bigquery, storage import pandas import pytest @@ -1333,6 +1333,79 @@ def square_num(x): ) +@pytest.mark.flaky(retries=2, delay=120) +def test_remote_function_via_session_vpc(scalars_dfs): + # TODO(shobs): Automate the following set-up during testing in the test project. 
+ # + # For upfront convenience, the following set up has been statically created + # in the project bigfrmames-dev-perf via cloud console: + # + # 1. Create a vpc connector as per + # https://cloud.google.com/vpc/docs/configure-serverless-vpc-access#gcloud + # + # $ gcloud compute networks vpc-access connectors create bigframes-vpc --project=bigframes-dev-perf --region=us-central1 --range 10.8.0.0/28 + # Create request issued for: [bigframes-vpc] + # Waiting for operation [projects/bigframes-dev-perf/locations/us-central1/operations/f9f90df6-7cf4-4420-8c2f-b3952775dcfb] to complete...done. + # Created connector [bigframes-vpc]. + # + # $ gcloud compute networks vpc-access connectors list --project=bigframes-dev-perf --region=us-central1 + # CONNECTOR_ID REGION NETWORK IP_CIDR_RANGE SUBNET SUBNET_PROJECT MACHINE_TYPE MIN_INSTANCES MAX_INSTANCES MIN_THROUGHPUT MAX_THROUGHPUT STATE + # bigframes-vpc us-central1 default 10.8.0.0/28 e2-micro 2 10 200 1000 READY + + project = "bigframes-dev-perf" + gcf_vpc_connector = "bigframes-vpc" + + rf_session = bigframes.Session(context=bigframes.BigQueryOptions(project=project)) + + try: + + def square_num(x): + if x is None: + return x + return x * x + + square_num_remote = rf_session.remote_function( + [int], int, reuse=False, cloud_function_vpc_connector=gcf_vpc_connector + )(square_num) + + scalars_df, scalars_pandas_df = scalars_dfs + + bf_int64_col = scalars_df["int64_col"] + bf_result_col = bf_int64_col.apply(square_num_remote) + bf_result = bf_int64_col.to_frame().assign(result=bf_result_col).to_pandas() + + pd_int64_col = scalars_pandas_df["int64_col"] + pd_result_col = pd_int64_col.apply(square_num) + pd_result = pd_int64_col.to_frame().assign(result=pd_result_col) + + assert_pandas_df_equal(bf_result, pd_result, check_dtype=False) + + # Assert that the GCF is created with the intended vpc connector + gcf = rf_session.cloudfunctionsclient.get_function( + name=square_num_remote.bigframes_cloud_function + ) + assert gcf.service_config.vpc_connector == gcf_vpc_connector + finally: + # clean up the gcp assets created for the remote function + cleanup_remote_function_assets( + rf_session.bqclient, rf_session.cloudfunctionsclient, square_num_remote + ) + + +def test_remote_function_via_session_vpc_invalid(session): + with pytest.raises( + InvalidArgument, match="400.*Serverless VPC Access connector is not found" + ): + + @session.remote_function( + [int], int, reuse=False, cloud_function_vpc_connector="does-not-exist" + ) + def square_num(x): + if x is None: + return x + return x * x + + @pytest.mark.parametrize( ("max_batching_rows"), [ From 2fd1b8117bda0dee5d8fc0924c80ce257fa9e3f1 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Fri, 10 May 2024 18:22:01 -0700 Subject: [PATCH 14/17] feat: Add Series.combine (#680) --- bigframes/core/compile/scalar_op_compiler.py | 22 ++++++-- bigframes/operations/__init__.py | 16 +++++- bigframes/series.py | 36 +++++++++++- tests/system/large/test_remote_function.py | 35 ++++++++++++ tests/system/small/test_numpy.py | 53 ++++++++---------- tests/system/small/test_series.py | 35 ++++++++++++ .../bigframes_vendored/pandas/core/series.py | 56 +++++++++++++++++++ 7 files changed, 214 insertions(+), 39 deletions(-) diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py index a65ff6fe0c..90025b3994 100644 --- a/bigframes/core/compile/scalar_op_compiler.py +++ b/bigframes/core/compile/scalar_op_compiler.py @@ -1298,22 +1298,36 @@ def 
coalesce_impl( return ibis.coalesce(x, y) -@scalar_op_compiler.register_binary_op(ops.cliplower_op) -def clip_lower( +@scalar_op_compiler.register_binary_op(ops.maximum_op) +def maximum_impl( value: ibis_types.Value, lower: ibis_types.Value, ): + # Note: propagates nulls return ibis.case().when(lower.isnull() | (value < lower), lower).else_(value).end() -@scalar_op_compiler.register_binary_op(ops.clipupper_op) -def clip_upper( +@scalar_op_compiler.register_binary_op(ops.minimum_op) +def minimum_impl( value: ibis_types.Value, upper: ibis_types.Value, ): + # Note: propagates nulls return ibis.case().when(upper.isnull() | (value > upper), upper).else_(value).end() +@scalar_op_compiler.register_binary_op(ops.BinaryRemoteFunctionOp, pass_op=True) +def binary_remote_function_op_impl( + x: ibis_types.Value, y: ibis_types.Value, op: ops.BinaryRemoteFunctionOp +): + if not hasattr(op.func, "bigframes_remote_function"): + raise TypeError( + f"only a bigframes remote function is supported as a callable. {constants.FEEDBACK_LINK}" + ) + x_transformed = op.func(x, y) + return x_transformed + + # Ternary Operations @scalar_op_compiler.register_ternary_op(ops.where_op) def where_op( diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py index e52f488d38..6f99f71013 100644 --- a/bigframes/operations/__init__.py +++ b/bigframes/operations/__init__.py @@ -545,8 +545,8 @@ def output_type(self, *input_types): # Binary Ops fillna_op = create_binary_op(name="fillna", type_signature=op_typing.COERCE) -cliplower_op = create_binary_op(name="clip_lower", type_signature=op_typing.COERCE) -clipupper_op = create_binary_op(name="clip_upper", type_signature=op_typing.COERCE) +maximum_op = create_binary_op(name="maximum", type_signature=op_typing.COERCE) +minimum_op = create_binary_op(name="minimum", type_signature=op_typing.COERCE) coalesce_op = create_binary_op(name="coalesce", type_signature=op_typing.COERCE) @@ -587,6 +587,16 @@ def output_type(self, *input_types): raise TypeError(f"Cannot subtract dtypes {left_type} and {right_type}") +@dataclasses.dataclass(frozen=True) +class BinaryRemoteFunctionOp(BinaryOp): + name: typing.ClassVar[str] = "binary_remote_function" + func: typing.Callable + + def output_type(self, *input_types): + # This property should be set to a valid Dtype by the @remote_function decorator or read_gbq_function method + return self.func.output_dtype + + add_op = AddOp() sub_op = SubOp() mul_op = create_binary_op(name="mul", type_signature=op_typing.BINARY_NUMERIC) @@ -713,4 +723,6 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT np.divide: div_op, np.power: pow_op, np.arctan2: arctan2_op, + np.maximum: maximum_op, + np.minimum: minimum_op, } diff --git a/bigframes/series.py b/bigframes/series.py index 313380e4a4..d1fb0d679b 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -1031,9 +1031,9 @@ def clip(self, lower, upper): if lower is None and upper is None: return self if lower is None: - return self._apply_binary_op(upper, ops.clipupper_op, alignment="left") + return self._apply_binary_op(upper, ops.minimum_op, alignment="left") if upper is None: - return self._apply_binary_op(lower, ops.cliplower_op, alignment="left") + return self._apply_binary_op(lower, ops.maximum_op, alignment="left") value_id, lower_id, upper_id, block = self._align3(lower, upper) block, result_id = block.apply_ternary_op( value_id, lower_id, upper_id, ops.clip_op @@ -1374,6 +1374,38 @@ def apply( materialized_series = result_series._cached() return 
materialized_series + def combine( + self, + other, + func, + ) -> Series: + if not callable(func): + raise ValueError( + "Only a ufunc (a function that applies to the entire Series) or a remote function that only works on single values are supported." + ) + + if not hasattr(func, "bigframes_remote_function"): + # Keep this in sync with .apply + try: + return func(self, other) + except Exception as ex: + # This could happen if any of the operators in func is not + # supported on a Series. Let's guide the customer to use a + # remote function instead + if hasattr(ex, "message"): + ex.message += f"\n{_remote_function_recommendation_message}" + raise + + reprojected_series = Series(self._block._force_reproject()) + result_series = reprojected_series._apply_binary_op( + other, ops.BinaryRemoteFunctionOp(func=func) + ) + + # return Series with materialized result so that any error in the remote + # function is caught early + materialized_series = result_series._cached() + return materialized_series + def add_prefix(self, prefix: str, axis: int | str | None = None) -> Series: return Series(self._get_block().add_prefix(prefix)) diff --git a/tests/system/large/test_remote_function.py b/tests/system/large/test_remote_function.py index b7d99ea36c..0fa1d90e8b 100644 --- a/tests/system/large/test_remote_function.py +++ b/tests/system/large/test_remote_function.py @@ -221,6 +221,41 @@ def stringify(x): ) +# @pytest.mark.flaky(retries=2, delay=120) +def test_remote_function_binop(session, scalars_dfs, dataset_id, bq_cf_connection): + try: + + def func(x, y): + return x * abs(y % 4) + + remote_func = session.remote_function( + [str, int], + str, + dataset_id, + bq_cf_connection, + reuse=False, + )(func) + + scalars_df, scalars_pandas_df = scalars_dfs + + scalars_df = scalars_df.dropna() + scalars_pandas_df = scalars_pandas_df.dropna() + bf_result = ( + scalars_df["string_col"] + .combine(scalars_df["int64_col"], remote_func) + .to_pandas() + ) + pd_result = scalars_pandas_df["string_col"].combine( + scalars_pandas_df["int64_col"], func + ) + pandas.testing.assert_series_equal(bf_result, pd_result) + finally: + # clean up the gcp assets created for the remote function + cleanup_remote_function_assets( + session.bqclient, session.cloudfunctionsclient, remote_func + ) + + @pytest.mark.flaky(retries=2, delay=120) def test_remote_function_decorator_with_bigframes_series( session, scalars_dfs, dataset_id, bq_cf_connection diff --git a/tests/system/small/test_numpy.py b/tests/system/small/test_numpy.py index 8e349e472a..8f62d9628c 100644 --- a/tests/system/small/test_numpy.py +++ b/tests/system/small/test_numpy.py @@ -73,27 +73,6 @@ def test_df_ufuncs(scalars_dfs, opname): pd.testing.assert_frame_equal(bf_result, pd_result) -@pytest.mark.parametrize( - ("opname",), - [ - ("add",), - ("subtract",), - ("multiply",), - ("divide",), - ("power",), - ("arctan2",), - ], -) -def test_series_binary_ufuncs(floats_product_pd, floats_product_bf, opname): - bf_result = getattr(np, opname)( - floats_product_bf.float64_col_x, floats_product_bf.float64_col_y - ).to_pandas() - pd_result = getattr(np, opname)( - floats_product_pd.float64_col_x, floats_product_pd.float64_col_y - ) - pd.testing.assert_series_equal(bf_result, pd_result) - - @pytest.mark.parametrize( ("opname",), [ @@ -106,17 +85,16 @@ def test_series_binary_ufuncs(floats_product_pd, floats_product_bf, opname): ) def test_df_binary_ufuncs(scalars_dfs, opname): scalars_df, scalars_pandas_df = scalars_dfs + op = getattr(np, opname) - bf_result = getattr(np, opname)( - 
scalars_df[["float64_col", "int64_col"]], 5.1 - ).to_pandas() - pd_result = getattr(np, opname)( - scalars_pandas_df[["float64_col", "int64_col"]], 5.1 - ) + bf_result = op(scalars_df[["float64_col", "int64_col"]], 5.1).to_pandas() + pd_result = op(scalars_pandas_df[["float64_col", "int64_col"]], 5.1) pd.testing.assert_frame_equal(bf_result, pd_result) +# Operations tested here don't work on full dataframe in numpy+pandas +# Maybe because of nullable dtypes? @pytest.mark.parametrize( ("x", "y"), [ @@ -124,12 +102,25 @@ def test_df_binary_ufuncs(scalars_dfs, opname): ("float64_col", "int64_col"), ], ) -def test_series_atan2(scalars_dfs, x, y): - # Test atan2 separately as pandas errors when passing entire df as input, so pass only series +@pytest.mark.parametrize( + ("opname",), + [ + ("add",), + ("subtract",), + ("multiply",), + ("divide",), + ("arctan2",), + ("minimum",), + ("maximum",), + ], +) +def test_series_binary_ufuncs(scalars_dfs, x, y, opname): scalars_df, scalars_pandas_df = scalars_dfs - bf_result = np.arctan2(scalars_df[x], scalars_df[y]).to_pandas() - pd_result = np.arctan2(scalars_pandas_df[x], scalars_pandas_df[y]) + op = getattr(np, opname) + + bf_result = op(scalars_df[x], scalars_df[y]).to_pandas() + pd_result = op(scalars_pandas_df[x], scalars_pandas_df[y]) pd.testing.assert_series_equal(bf_result, pd_result) diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index beb99b1ada..fa514784c0 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -3509,6 +3509,41 @@ def test_apply_numpy_ufunc(scalars_dfs, ufunc): assert_series_equal(bf_result, pd_result) +@pytest.mark.parametrize( + ("ufunc",), + [ + pytest.param(numpy.add), + pytest.param(numpy.divide), + ], + ids=[ + "add", + "divide", + ], +) +def test_combine_series_ufunc(scalars_dfs, ufunc): + scalars_df, scalars_pandas_df = scalars_dfs + + bf_col = scalars_df["int64_col"].dropna() + bf_result = bf_col.combine(bf_col, ufunc).to_pandas() + + pd_col = scalars_pandas_df["int64_col"].dropna() + pd_result = pd_col.combine(pd_col, ufunc) + + assert_series_equal(bf_result, pd_result, check_dtype=False) + + +def test_combine_scalar_ufunc(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + bf_col = scalars_df["int64_col"].dropna() + bf_result = bf_col.combine(2.5, numpy.add).to_pandas() + + pd_col = scalars_pandas_df["int64_col"].dropna() + pd_result = pd_col.combine(2.5, numpy.add) + + assert_series_equal(bf_result, pd_result, check_dtype=False) + + def test_apply_simple_udf(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index e155fb073a..585e20275c 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -1279,6 +1279,62 @@ def apply( """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def combine( + self, + other: Series | Hashable, + func, + ) -> Series: + """ + Combine the Series with a Series or scalar according to `func`. + + Combine the Series and `other` using `func` to perform elementwise + selection for combined Series. + `fill_value` is assumed when value is missing at some index + from one of the two objects being combined. 
+ + **Examples:** + + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + + Consider 2 Datasets ``s1`` and ``s2`` containing + highest clocked speeds of different birds. + + >>> s1 = bpd.Series({'falcon': 330.0, 'eagle': 160.0}) + >>> s1 + falcon 330.0 + eagle 160.0 + dtype: Float64 + >>> s2 = bpd.Series({'falcon': 345.0, 'eagle': 200.0, 'duck': 30.0}) + >>> s2 + falcon 345.0 + eagle 200.0 + duck 30.0 + dtype: Float64 + + Now, to combine the two datasets and view the highest speeds + of the birds across the two datasets + + >>> s1.combine(s2, np.maximum) + falcon 345.0 + eagle 200.0 + duck + dtype: Float64 + + Args: + other (Series or scalar): + The value(s) to be combined with the `Series`. + func (function): + BigFrames DataFrames ``remote_function`` to apply. + Takes two scalars as inputs and returns an element. + Also accepts some numpy binary functions. + + Returns: + Series: The result of combining the Series with the other object. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def groupby( self, by=None, From 6eb19a7288155b093aa7cc9bcbc710b31e7dc87a Mon Sep 17 00:00:00 2001 From: Chelsea Lin <124939984+chelsea-lin@users.noreply.github.com> Date: Mon, 13 May 2024 10:13:08 -0700 Subject: [PATCH 15/17] feat: Series.str.split (#675) * feat: Series.str.split * add more tests * format fix --- bigframes/core/compile/scalar_op_compiler.py | 5 ++ bigframes/dtypes.py | 6 +++ bigframes/operations/__init__.py | 17 +++++-- bigframes/operations/strings.py | 12 +++++ tests/system/small/operations/test_strings.py | 31 ++++++++++++ .../pandas/core/strings/accessor.py | 48 +++++++++++++++++++ 6 files changed, 116 insertions(+), 3 deletions(-) diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py index 90025b3994..8a44844fba 100644 --- a/bigframes/core/compile/scalar_op_compiler.py +++ b/bigframes/core/compile/scalar_op_compiler.py @@ -588,6 +588,11 @@ def endswith_op_impl(x: ibis_types.Value, op: ops.EndsWithOp): return any_match if any_match is not None else ibis_types.literal(False) +@scalar_op_compiler.register_unary_op(ops.StringSplitOp, pass_op=True) +def stringsplit_op_impl(x: ibis_types.Value, op: ops.StringSplitOp): + return typing.cast(ibis_types.StringValue, x).split(op.pat) + + @scalar_op_compiler.register_unary_op(ops.ZfillOp, pass_op=True) def zfill_op_impl(x: ibis_types.Value, op: ops.ZfillOp): str_value = typing.cast(ibis_types.StringValue, x) diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index d2dc210e0d..2a344aff2d 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -405,6 +405,12 @@ def bigframes_dtype_to_ibis_dtype( return BIGFRAMES_TO_IBIS[bigframes_dtype] +def bigframes_dtype_to_arrow_dtype( + bigframes_dtype: Union[DtypeString, Dtype, np.dtype[Any]] +) -> pa.DataType: + return ibis_dtype_to_arrow_dtype(bigframes_dtype_to_ibis_dtype(bigframes_dtype)) + + def literal_to_ibis_scalar( literal, force_dtype: typing.Optional[Dtype] = None, validate: bool = True ): diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py index 6f99f71013..929ccaecc5 100644 --- a/bigframes/operations/__init__.py +++ b/bigframes/operations/__init__.py @@ -386,6 +386,19 @@ def output_type(self, *input_types): return op_typing.STRING_PREDICATE.output_type(input_types[0]) +@dataclasses.dataclass(frozen=True) +class StringSplitOp(UnaryOp): + name: typing.ClassVar[str] = "str_split" + pat: typing.Sequence[str] + + def 
output_type(self, *input_types): + input_type = input_types[0] + if not isinstance(input_type, pd.StringDtype): + raise TypeError("field accessor input must be a string type") + arrow_type = dtypes.bigframes_dtype_to_arrow_dtype(input_type) + return pd.ArrowDtype(pa.list_(arrow_type)) + + @dataclasses.dataclass(frozen=True) class EndsWithOp(UnaryOp): name: typing.ClassVar[str] = "str_endswith" @@ -463,9 +476,7 @@ def output_type(self, *input_types): raise TypeError("field accessor input must be a struct type") pa_result_type = pa_type[self.name_or_index].type - # TODO: Directly convert from arrow to pandas type - ibis_result_type = dtypes.arrow_dtype_to_ibis_dtype(pa_result_type) - return dtypes.ibis_dtype_to_bigframes_dtype(ibis_result_type) + return dtypes.arrow_dtype_to_bigframes_dtype(pa_result_type) @dataclasses.dataclass(frozen=True) diff --git a/bigframes/operations/strings.py b/bigframes/operations/strings.py index 883d19a1e3..22c325d7e0 100644 --- a/bigframes/operations/strings.py +++ b/bigframes/operations/strings.py @@ -247,6 +247,18 @@ def endswith( pat = (pat,) return self._apply_unary_op(ops.EndsWithOp(pat=pat)) + def split( + self, + pat: str = " ", + regex: Union[bool, None] = None, + ) -> series.Series: + if regex is True or (regex is None and len(pat) > 1): + raise NotImplementedError( + "Regular expressions aren't currently supported. Please set " + + f"`regex=False` and try again. {constants.FEEDBACK_LINK}" + ) + return self._apply_unary_op(ops.StringSplitOp(pat=pat)) + def zfill(self, width: int) -> series.Series: return self._apply_unary_op(ops.ZfillOp(width=width)) diff --git a/tests/system/small/operations/test_strings.py b/tests/system/small/operations/test_strings.py index 9654c77ec4..b8a8ad2d1e 100644 --- a/tests/system/small/operations/test_strings.py +++ b/tests/system/small/operations/test_strings.py @@ -531,3 +531,34 @@ def test_str_rjust(scalars_dfs): pd_result, bf_result, ) + + +@pytest.mark.parametrize( + ("pat", "regex"), + [ + pytest.param(" ", None, id="one_char"), + pytest.param("ll", False, id="two_chars"), + pytest.param( + " ", + True, + id="one_char_reg", + marks=pytest.mark.xfail(raises=NotImplementedError), + ), + pytest.param( + "ll", + None, + id="two_chars_reg", + marks=pytest.mark.xfail(raises=NotImplementedError), + ), + ], +) +def test_str_split_raise_errors(scalars_dfs, pat, regex): + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "string_col" + bf_result = scalars_df[col_name].str.split(pat=pat, regex=regex).to_pandas() + pd_result = scalars_pandas_df[col_name].str.split(pat=pat, regex=regex) + + # TODO(b/336880368): Allow for NULL values for ARRAY columns in BigQuery. + pd_result = pd_result.apply(lambda x: [] if pd.isnull(x) is True else x) + + assert_series_equal(pd_result, bf_result, check_dtype=False) diff --git a/third_party/bigframes_vendored/pandas/core/strings/accessor.py b/third_party/bigframes_vendored/pandas/core/strings/accessor.py index 5bb69dc1f2..b02c23f945 100644 --- a/third_party/bigframes_vendored/pandas/core/strings/accessor.py +++ b/third_party/bigframes_vendored/pandas/core/strings/accessor.py @@ -940,6 +940,54 @@ def endswith( """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def split( + self, + pat: str = " ", + regex: typing.Union[bool, None] = None, + ): + """ + Split strings around given separator/delimiter. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series( + ... [ + ... 
"a regular sentence", + ... "https://docs.python.org/index.html", + ... np.nan + ... ] + ... ) + >>> s.str.split() + 0 ['a' 'regular' 'sentence'] + 1 ['https://docs.python.org/index.html'] + 2 [] + dtype: list[pyarrow] + + The pat parameter can be used to split by other characters. + + >>> s.str.split("//", regex=False) + 0 ['a regular sentence'] + 1 ['https:' 'docs.python.org/index.html'] + 2 [] + dtype: list[pyarrow] + + Args: + pat (str, default " "): + String to split on. If not specified, split on whitespace. + regex (bool, default None): + Determines if the passed-in pattern is a regular expression. Regular + expressions aren't currently supported. Please set `regex=False` when + `pat` length is not 1. + + Returns: + bigframes.series.Series: Type matches caller. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def match(self, pat: str, case: bool = True, flags: int = 0): """ Determine if each string starts with a match of a regular expression. From c7e0eadfaa33ae2fde6e6c666b6c670258b5d643 Mon Sep 17 00:00:00 2001 From: Huan Chen <142538604+Genesis929@users.noreply.github.com> Date: Mon, 13 May 2024 10:53:20 -0700 Subject: [PATCH 16/17] chore: add logger support for properties. (#683) * chore: add logger support for properties. * update function * update label format --- bigframes/core/log_adapter.py | 33 ++++++++++++++++++++++++++ tests/unit/session/test_io_bigquery.py | 8 ++++--- 2 files changed, 38 insertions(+), 3 deletions(-) diff --git a/bigframes/core/log_adapter.py b/bigframes/core/log_adapter.py index b5afafbe7c..877e4a9fa1 100644 --- a/bigframes/core/log_adapter.py +++ b/bigframes/core/log_adapter.py @@ -30,6 +30,10 @@ def class_logger(decorated_cls): for attr_name, attr_value in decorated_cls.__dict__.items(): if callable(attr_value) and (attr_name not in _excluded_methods): setattr(decorated_cls, attr_name, method_logger(attr_value, decorated_cls)) + elif isinstance(attr_value, property): + setattr( + decorated_cls, attr_name, property_logger(attr_value, decorated_cls) + ) return decorated_cls @@ -56,6 +60,35 @@ def wrapper(*args, **kwargs): return wrapper +def property_logger(prop, decorated_cls): + """Decorator that adds logging functionality to a property.""" + + def shared_wrapper(f): + @functools.wraps(f) + def wrapped(*args, **kwargs): + class_name = decorated_cls.__name__ + property_name = f.__name__ + full_property_name = f"{class_name.lower()}-{property_name.lower()}" + + if len(_call_stack) == 0: + add_api_method(full_property_name) + + _call_stack.append(full_property_name) + try: + return f(*args, **kwargs) + finally: + _call_stack.pop() + + return wrapped + + # Apply the wrapper to the getter, setter, and deleter + return property( + shared_wrapper(prop.fget), + shared_wrapper(prop.fset) if prop.fset else None, + shared_wrapper(prop.fdel) if prop.fdel else None, + ) + + def add_api_method(api_method_name): global _lock global _api_methods diff --git a/tests/unit/session/test_io_bigquery.py b/tests/unit/session/test_io_bigquery.py index 5f4072e9c2..5a3470e883 100644 --- a/tests/unit/session/test_io_bigquery.py +++ b/tests/unit/session/test_io_bigquery.py @@ -64,6 +64,7 @@ def test_create_job_configs_labels_log_adaptor_call_method_under_length_limit(): # Test running two methods df.head() df.max() + df.columns api_methods = log_adapter._api_methods labels = io_bq.create_job_configs_labels( @@ -71,9 +72,10 @@ def 
test_create_job_configs_labels_log_adaptor_call_method_under_length_limit(): ) expected_dict = { "source": "bigquery-dataframes-temp", - "bigframes-api": "dataframe-max", - "recent-bigframes-api-0": "dataframe-head", - "recent-bigframes-api-1": "dataframe-__init__", + "bigframes-api": "dataframe-columns", + "recent-bigframes-api-0": "dataframe-max", + "recent-bigframes-api-1": "dataframe-head", + "recent-bigframes-api-2": "dataframe-__init__", } assert labels == expected_dict From 0b8b82761f94b240ea671b20732144a5481899aa Mon Sep 17 00:00:00 2001 From: "release-please[bot]" <55107282+release-please[bot]@users.noreply.github.com> Date: Mon, 13 May 2024 14:27:20 -0700 Subject: [PATCH 17/17] chore(main): release 1.6.0 (#667) Co-authored-by: release-please[bot] <55107282+release-please[bot]@users.noreply.github.com> Co-authored-by: Huan Chen <142538604+Genesis929@users.noreply.github.com> --- CHANGELOG.md | 33 +++++++++++++++++++++++++++++++++ bigframes/version.py | 2 +- 2 files changed, 34 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4457c2e443..568efa68b4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,39 @@ [1]: https://pypi.org/project/bigframes/#history +## [1.6.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v1.5.0...v1.6.0) (2024-05-13) + + +### Features + +* Add `DataFrame.__delitem__` ([#673](https://github.com/googleapis/python-bigquery-dataframes/issues/673)) ([2218c21](https://github.com/googleapis/python-bigquery-dataframes/commit/2218c21b5bb0f9e54a365ba1ada0203cbc4c9efc)) +* Add `Series.case_when()` ([#673](https://github.com/googleapis/python-bigquery-dataframes/issues/673)) ([2218c21](https://github.com/googleapis/python-bigquery-dataframes/commit/2218c21b5bb0f9e54a365ba1ada0203cbc4c9efc)) +* Add `strategy="quantile"` in KBinsDiscretizer ([#654](https://github.com/googleapis/python-bigquery-dataframes/issues/654)) ([c6c487f](https://github.com/googleapis/python-bigquery-dataframes/commit/c6c487fb3e39a980a05ff2dab5fb2b528d44016a)) +* Add Series.combine ([#680](https://github.com/googleapis/python-bigquery-dataframes/issues/680)) ([2fd1b81](https://github.com/googleapis/python-bigquery-dataframes/commit/2fd1b8117bda0dee5d8fc0924c80ce257fa9e3f1)) +* Series.str.split ([#675](https://github.com/googleapis/python-bigquery-dataframes/issues/675)) ([6eb19a7](https://github.com/googleapis/python-bigquery-dataframes/commit/6eb19a7288155b093aa7cc9bcbc710b31e7dc87a)) +* Suggest correct options in bpd.options.bigquery.location ([#666](https://github.com/googleapis/python-bigquery-dataframes/issues/666)) ([57ccabc](https://github.com/googleapis/python-bigquery-dataframes/commit/57ccabcd1402b7938e2c7068e5b4880ef018f39c)) +* Support `axis=1` in `df.apply` for scalar outputs ([#629](https://github.com/googleapis/python-bigquery-dataframes/issues/629)) ([f6bdc4a](https://github.com/googleapis/python-bigquery-dataframes/commit/f6bdc4aeb3f81a1e0b955521c04ac0dd22981c76)) +* Support gcf vpc connector in `remote_function` 
([#677](https://github.com/googleapis/python-bigquery-dataframes/issues/677)) ([9ca92d0](https://github.com/googleapis/python-bigquery-dataframes/commit/9ca92d09e9c56db408350b35ec698152c13954ed)) +* Warn with a more specific `DefaultLocationWarning` category when no location can be detected ([#648](https://github.com/googleapis/python-bigquery-dataframes/issues/648)) ([e084e54](https://github.com/googleapis/python-bigquery-dataframes/commit/e084e54557addff78522bbd710637ecb4b46d23e)) + + +### Bug Fixes + +* Include `index_col` when selecting `columns` and `filters` in `read_gbq_table` ([#648](https://github.com/googleapis/python-bigquery-dataframes/issues/648)) ([e084e54](https://github.com/googleapis/python-bigquery-dataframes/commit/e084e54557addff78522bbd710637ecb4b46d23e)) + + +### Dependencies + +* Add jellyfish as a dependency for spelling correction ([57ccabc](https://github.com/googleapis/python-bigquery-dataframes/commit/57ccabcd1402b7938e2c7068e5b4880ef018f39c)) + + +### Documentation + +* Add code snippets for llm text generatiion ([#669](https://github.com/googleapis/python-bigquery-dataframes/issues/669)) ([93416ed](https://github.com/googleapis/python-bigquery-dataframes/commit/93416ed2f8353c12eb162e21e9bf155312b0ed8c)) +* Add logistic regression samples ([#673](https://github.com/googleapis/python-bigquery-dataframes/issues/673)) ([2218c21](https://github.com/googleapis/python-bigquery-dataframes/commit/2218c21b5bb0f9e54a365ba1ada0203cbc4c9efc)) +* Address lint errors in code samples ([#665](https://github.com/googleapis/python-bigquery-dataframes/issues/665)) ([4fc8964](https://github.com/googleapis/python-bigquery-dataframes/commit/4fc89644e47a6da9367b54826b25c6abbe97327b)) +* Document inlining of small data in `read_*` APIs ([#670](https://github.com/googleapis/python-bigquery-dataframes/issues/670)) ([306953a](https://github.com/googleapis/python-bigquery-dataframes/commit/306953aaae69e57c7c2f5eefb88d55a35bdcca9d)) + ## [1.5.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v1.4.0...v1.5.0) (2024-05-07) diff --git a/bigframes/version.py b/bigframes/version.py index 5f56ef9c61..e139eaa89e 100644 --- a/bigframes/version.py +++ b/bigframes/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "1.5.0" +__version__ = "1.6.0"
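
The user-facing features landed in this series — row-wise `apply` with `axis=1` over remote functions (#629), `Series.combine` (#680), `Series.str.split` (#675), and the `cloud_function_vpc_connector` option on `remote_function` (#677) — compose naturally. The following sketch is illustrative only and mirrors the signatures added in the patches above; it assumes a configured GCP project with BigQuery access, a BigQuery connection, and permission to deploy Cloud Functions for the remote function, and the project and VPC connector names are placeholders rather than values taken from any patch.

    # Illustrative sketch (not part of any patch above): exercises the new
    # user-facing APIs from this series together. "my-gcp-project" and
    # "my-vpc-connector" are placeholder names, not real resources.
    import numpy as np

    import bigframes.pandas as bpd

    bpd.options.bigquery.project = "my-gcp-project"  # placeholder project

    df = bpd.DataFrame({"col1": [1, 2], "col2": [3, 4]})

    # Series.str.split (#675): literal separators only; regex patterns are
    # not yet supported and raise NotImplementedError.
    parts = bpd.Series(["a regular sentence"]).str.split(" ")

    # Series.combine (#680): accepts numpy binary ufuncs such as np.maximum,
    # or a remote function for arbitrary element-wise logic.
    highest = df["col1"].combine(df["col2"], np.maximum)

    # remote_function with a VPC connector (#677) plus axis=1 apply (#629):
    # the wrapped function receives each row as a Series and returns a scalar.
    @bpd.remote_function(
        bpd.Series,
        int,
        reuse=False,
        cloud_function_vpc_connector="my-vpc-connector",  # placeholder connector
    )
    def row_total(row):
        return row["col1"] + row["col2"]

    totals = df.apply(row_total, axis=1)
    print(totals.to_pandas())

The VPC connector argument only matters when the deployed cloud function needs to reach data or services on a private network; when it is omitted, `remote_function` behaves exactly as it did before this series.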