diff --git a/CHANGELOG.md b/CHANGELOG.md index 0091fb4a11..d8e11d47e3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,53 @@ [1]: https://pypi.org/project/bigframes/#history +## [0.22.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v0.21.0...v0.22.0) (2024-02-27) + + +### ⚠ BREAKING CHANGES + +* rename cosine_similarity to paired_cosine_distances ([#393](https://github.com/googleapis/python-bigquery-dataframes/issues/393)) +* move model optional args to kwargs ([#381](https://github.com/googleapis/python-bigquery-dataframes/issues/381)) + +### Features + +* Add `DataFrames.corr()` method ([#379](https://github.com/googleapis/python-bigquery-dataframes/issues/379)) ([67fd434](https://github.com/googleapis/python-bigquery-dataframes/commit/67fd434bbb1c73f9013f65252d1ecc8da79542f6)) +* Add ml.metrics.pairwise.manhattan_distance ([#392](https://github.com/googleapis/python-bigquery-dataframes/issues/392)) ([9d31865](https://github.com/googleapis/python-bigquery-dataframes/commit/9d318653c001287bcc8ae9d8e09d0187413cbed6)) +* Enable regional endpoints for me-central2 ([#386](https://github.com/googleapis/python-bigquery-dataframes/issues/386)) ([469674d](https://github.com/googleapis/python-bigquery-dataframes/commit/469674d64f6ad5dac0f24ad450a7b8b6998fdf68)) + + +### Bug Fixes + +* Avoid ibis warning for "database" table() method argument ([#390](https://github.com/googleapis/python-bigquery-dataframes/issues/390)) ([a0490a4](https://github.com/googleapis/python-bigquery-dataframes/commit/a0490a492a43db24a314b3f42bfac61da7683151)) +* Correct the numeric literal dtype ([#365](https://github.com/googleapis/python-bigquery-dataframes/issues/365)) ([93b02cd](https://github.com/googleapis/python-bigquery-dataframes/commit/93b02cd8bc620823563f8214b43bc5f2f35c155b)) +* Rename cosine_similarity to paired_cosine_distances ([#393](https://github.com/googleapis/python-bigquery-dataframes/issues/393)) ([81ece46](https://github.com/googleapis/python-bigquery-dataframes/commit/81ece463b69765b0f93585d6b866fb642ddc65dc)) + + +### Performance Improvements + +* Inline read_pandas for small data ([#383](https://github.com/googleapis/python-bigquery-dataframes/issues/383)) ([59b446b](https://github.com/googleapis/python-bigquery-dataframes/commit/59b446bad8d2c5fca791c384616cfa7e54d54c09)) + + +### Dependencies + +* Add minimum version constraint for sqlglot to 19.9.0 ([#389](https://github.com/googleapis/python-bigquery-dataframes/issues/389)) ([8b62d77](https://github.com/googleapis/python-bigquery-dataframes/commit/8b62d77d8274cff2842c98b032bf98d69c483482)) + + +### Documentation + +* Add a code sample for creating a kmeans model ([#267](https://github.com/googleapis/python-bigquery-dataframes/issues/267)) 
([4291d65](https://github.com/googleapis/python-bigquery-dataframes/commit/4291d656f30dc50b8ffcdd10ccbfa7f327711100)) +* Fix `bigframes.pandas.concat` documentation ([#382](https://github.com/googleapis/python-bigquery-dataframes/issues/382)) ([234b61c](https://github.com/googleapis/python-bigquery-dataframes/commit/234b61cdfe75b402adf1b56f53b5f06934777f95)) + + +### Miscellaneous Chores + +* Release 0.22.0 ([#396](https://github.com/googleapis/python-bigquery-dataframes/issues/396)) ([8f73d9e](https://github.com/googleapis/python-bigquery-dataframes/commit/8f73d9e37827ecdc90683313000364922ae61dab)) + + +### Code Refactoring + +* Move model optional args to kwargs ([#381](https://github.com/googleapis/python-bigquery-dataframes/issues/381)) ([4037992](https://github.com/googleapis/python-bigquery-dataframes/commit/4037992b61ff352320d5dfb87dcf5f274791ace1)) + ## [0.21.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v0.20.1...v0.21.0) (2024-02-13) diff --git a/bigframes/_config/bigquery_options.py b/bigframes/_config/bigquery_options.py index 2875a11de3..74b83429d0 100644 --- a/bigframes/_config/bigquery_options.py +++ b/bigframes/_config/bigquery_options.py @@ -125,9 +125,8 @@ def use_regional_endpoints(self) -> bool: """Flag to connect to regional API endpoints. .. deprecated:: 0.13.0 - BigQuery regional endpoints is a feature in preview and - available only to selected projects. - Enable it only if your project has regional endpoints access. + Use of regional endpoints is a feature in preview and + available only in selected regions and projects. Requires ``location`` to also be set. For example, set ``location='asia-northeast1'`` and ``use_regional_endpoints=True`` to @@ -144,9 +143,8 @@ def use_regional_endpoints(self, value: bool): if value: warnings.warn( - "BigQuery regional endpoints is a feature in preview and " - "available only to selected projects. " - "Enable it only if your project has regional endpoints access." + "Use of regional endpoints is a feature in preview and " + "available only in selected regions and projects. 
" ) self._use_regional_endpoints = value diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 34df7231cc..993f2caa47 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -102,11 +102,11 @@ def __init__( ): """Construct a block object, will create default index if no index columns specified.""" index_columns = list(index_columns) - if index_labels: + if index_labels is not None: index_labels = list(index_labels) if len(index_labels) != len(index_columns): raise ValueError( - "'index_columns' and 'index_labels' must have equal length" + f"'index_columns' (size {len(index_columns)}) and 'index_labels' (size {len(index_labels)}) must have equal length" ) if len(index_columns) == 0: new_index_col_id = guid.generate_guid() @@ -1089,6 +1089,46 @@ def summarize( labels = self._get_labels_for_columns(column_ids) return Block(expr, column_labels=labels, index_columns=[label_col_id]) + def corr(self): + """Returns a block object to compute the self-correlation on this block.""" + aggregations = [ + ( + ex.BinaryAggregation( + agg_ops.CorrOp(), ex.free_var(left_col), ex.free_var(right_col) + ), + f"{left_col}-{right_col}", + ) + for left_col in self.value_columns + for right_col in self.value_columns + ] + expr = self.expr.aggregate(aggregations) + + index_col_ids = [ + guid.generate_guid() for i in range(self.column_labels.nlevels) + ] + input_count = len(self.value_columns) + unpivot_columns = tuple( + ( + guid.generate_guid(), + tuple(expr.column_ids[input_count * i : input_count * (i + 1)]), + ) + for i in range(input_count) + ) + labels = self._get_labels_for_columns(self.value_columns) + + expr = expr.unpivot( + row_labels=labels, + index_col_ids=index_col_ids, + unpivot_columns=unpivot_columns, + ) + + return Block( + expr, + column_labels=self.column_labels, + index_columns=index_col_ids, + index_labels=self.column_labels.names, + ) + def _standard_stats(self, column_id) -> typing.Sequence[agg_ops.UnaryAggregateOp]: """ Gets a standard set of stats to preemptively fetch for a column if @@ -1889,7 +1929,7 @@ def to_pandas(self) -> pd.Index: df = expr.session._rows_to_dataframe(results, dtypes) df = df.set_index(index_columns) index = df.index - index.names = list(self._block._index_labels) + index.names = list(self._block._index_labels) # type:ignore return index def resolve_level(self, level: LevelsType) -> typing.Sequence[str]: diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 9db567a497..d467239ea6 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -69,10 +69,6 @@ import bigframes.session -# BigQuery has 1 MB query size limit, 5000 items shouldn't take more than 10% of this depending on data type. -# TODO(tbergeron): Convert to bytes-based limit -MAX_INLINE_DF_SIZE = 5000 - LevelType = typing.Hashable LevelsType = typing.Union[LevelType, typing.Sequence[LevelType]] SingleItemValue = Union[bigframes.series.Series, int, float, Callable] @@ -170,17 +166,7 @@ def __init__( columns=columns, # type:ignore dtype=dtype, # type:ignore ) - if ( - pd_dataframe.size < MAX_INLINE_DF_SIZE - # TODO(swast): Workaround data types limitation in inline data. 
- and not any( - dt.pyarrow_dtype - for dt in pd_dataframe.dtypes - if isinstance(dt, pandas.ArrowDtype) - ) - ): - self._block = blocks.Block.from_local(pd_dataframe) - elif session: + if session: self._block = session.read_pandas(pd_dataframe)._get_block() else: self._block = bigframes.pandas.read_pandas(pd_dataframe)._get_block() @@ -1017,6 +1003,27 @@ def combine( def combine_first(self, other: DataFrame): return self._apply_dataframe_binop(other, ops.fillna_op) + def corr(self, method="pearson", min_periods=None, numeric_only=False) -> DataFrame: + if method != "pearson": + raise NotImplementedError( + f"Only Pearson correlation is currently supported. {constants.FEEDBACK_LINK}" + ) + if min_periods: + raise NotImplementedError( + f"min_periods not yet supported. {constants.FEEDBACK_LINK}" + ) + if len(self.columns) > 30: + raise NotImplementedError( + f"Only work with dataframes containing fewer than 30 columns. Current: {len(self.columns)}. {constants.FEEDBACK_LINK}" + ) + + if not numeric_only: + frame = self._raise_on_non_numeric("corr") + else: + frame = self._drop_non_numeric() + + return DataFrame(frame._block.corr()) + def to_pandas( self, max_download_size: Optional[int] = None, diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index cb2210bec6..6e3bc25c47 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -378,11 +378,29 @@ def literal_to_ibis_scalar( scalar_expr = ibis.literal(literal, ibis_dtypes.float64) elif scalar_expr.type().is_integer(): scalar_expr = ibis.literal(literal, ibis_dtypes.int64) + elif scalar_expr.type().is_decimal(): + precision = scalar_expr.type().precision + scale = scalar_expr.type().scale + if (not precision and not scale) or ( + precision and scale and scale <= 9 and precision + (9 - scale) <= 38 + ): + scalar_expr = ibis.literal( + literal, ibis_dtypes.decimal(precision=38, scale=9) + ) + elif precision and scale and scale <= 38 and precision + (38 - scale) <= 76: + scalar_expr = ibis.literal( + literal, ibis_dtypes.decimal(precision=76, scale=38) + ) + else: + raise TypeError( + "BigQuery's decimal data type supports a maximum precision of 76 and a maximum scale of 38." + f"Current precision: {precision}. Current scale: {scale}" + ) # TODO(bmil): support other literals that can be coerced to compatible types if validate and (scalar_expr.type() not in BIGFRAMES_TO_IBIS.values()): raise ValueError( - f"Literal did not coerce to a supported data type: {literal}. {constants.FEEDBACK_LINK}" + f"Literal did not coerce to a supported data type: {scalar_expr.type()}. {constants.FEEDBACK_LINK}" ) return scalar_expr diff --git a/bigframes/ml/base.py b/bigframes/ml/base.py index f2478b1ce2..845b64caf1 100644 --- a/bigframes/ml/base.py +++ b/bigframes/ml/base.py @@ -127,6 +127,10 @@ def register(self: _T, vertex_ai_model_id: Optional[str] = None) -> _T: self._bqml_model.register(vertex_ai_model_id) return self + @abc.abstractmethod + def to_gbq(self, model_name, replace): + pass + class TrainablePredictor(Predictor): """A BigQuery DataFrames ML Model base class that can be used to fit and predict outputs. @@ -141,11 +145,6 @@ def _fit(self, X, y, transforms=None): def score(self, X, y): pass - # TODO(b/291812029): move to Predictor after implement in LLM and imported models - @abc.abstractmethod - def to_gbq(self, model_name, replace): - pass - class SupervisedTrainablePredictor(TrainablePredictor): """A BigQuery DataFrames ML Supervised Model base class that can be used to fit and predict outputs. 
@@ -165,7 +164,7 @@ def fit( class UnsupervisedTrainablePredictor(TrainablePredictor): """A BigQuery DataFrames ML Unsupervised Model base class that can be used to fit and predict outputs. - Only need to provide both X (y is optional and ignored) in unsupervised tasks.""" + Only need to provide X (y is optional and ignored) in unsupervised tasks.""" _T = TypeVar("_T", bound="UnsupervisedTrainablePredictor") diff --git a/bigframes/ml/ensemble.py b/bigframes/ml/ensemble.py index 1cc9fb3739..7fcaa926ed 100644 --- a/bigframes/ml/ensemble.py +++ b/bigframes/ml/ensemble.py @@ -58,6 +58,7 @@ class XGBRegressor( def __init__( self, num_parallel_tree: int = 1, + *, booster: Literal["gbtree", "dart"] = "gbtree", dart_normalized_type: Literal["tree", "forest"] = "tree", tree_method: Literal["auto", "exact", "approx", "hist"] = "auto", @@ -215,6 +216,7 @@ class XGBClassifier( def __init__( self, num_parallel_tree: int = 1, + *, booster: Literal["gbtree", "dart"] = "gbtree", dart_normalized_type: Literal["tree", "forest"] = "tree", tree_method: Literal["auto", "exact", "approx", "hist"] = "auto", @@ -372,6 +374,7 @@ class RandomForestRegressor( def __init__( self, num_parallel_tree: int = 100, + *, tree_method: Literal["auto", "exact", "approx", "hist"] = "auto", min_tree_child_weight: int = 1, colsample_bytree=1.0, @@ -538,6 +541,7 @@ class RandomForestClassifier( def __init__( self, num_parallel_tree: int = 100, + *, tree_method: Literal["auto", "exact", "approx", "hist"] = "auto", min_tree_child_weight: int = 1, colsample_bytree: float = 1.0, diff --git a/bigframes/ml/forecasting.py b/bigframes/ml/forecasting.py index 8d448fbace..0c33660475 100644 --- a/bigframes/ml/forecasting.py +++ b/bigframes/ml/forecasting.py @@ -87,7 +87,7 @@ def _fit( ) def predict( - self, X=None, horizon: int = 3, confidence_level: float = 0.95 + self, X=None, *, horizon: int = 3, confidence_level: float = 0.95 ) -> bpd.DataFrame: """Predict the closest cluster for each sample in X. diff --git a/bigframes/ml/imported.py b/bigframes/ml/imported.py index e2be154703..98b23931f3 100644 --- a/bigframes/ml/imported.py +++ b/bigframes/ml/imported.py @@ -32,15 +32,17 @@ class TensorFlowModel(base.Predictor): """Imported TensorFlow model. Args: + model_path (str): + GCS path that holds the model files. session (BigQuery Session): BQ session to create the model - model_path (str): - GCS path that holds the model files.""" + """ def __init__( self, + model_path: str, + *, session: Optional[bigframes.Session] = None, - model_path: Optional[str] = None, ): self.session = session or bpd.get_global_session() self.model_path = model_path @@ -59,7 +61,7 @@ def _from_bq( ) -> TensorFlowModel: assert model.model_type == "TENSORFLOW" - tf_model = cls(session=session, model_path=None) + tf_model = cls(session=session, model_path="") tf_model._bqml_model = core.BqmlModel(session, model) return tf_model @@ -109,15 +111,17 @@ class ONNXModel(base.Predictor): """Imported Open Neural Network Exchange (ONNX) model. Args: + model_path (str): + Cloud Storage path that holds the model files. 
session (BigQuery Session): BQ session to create the model - model_path (str): - Cloud Storage path that holds the model files.""" + """ def __init__( self, + model_path: str, + *, session: Optional[bigframes.Session] = None, - model_path: Optional[str] = None, ): self.session = session or bpd.get_global_session() self.model_path = model_path @@ -134,7 +138,7 @@ def _create_bqml_model(self): def _from_bq(cls, session: bigframes.Session, model: bigquery.Model) -> ONNXModel: assert model.model_type == "ONNX" - onnx_model = cls(session=session, model_path=None) + onnx_model = cls(session=session, model_path="") onnx_model._bqml_model = core.BqmlModel(session, model) return onnx_model @@ -189,8 +193,8 @@ class XGBoostModel(base.Predictor): https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-create-xgboost#limitations Args: - session (BigQuery Session): - BQ session to create the model + model_path (str): + Cloud Storage path that holds the model files. input (Dict, default None): Specify the model input schema information when you create the XGBoost model. The input should be the format of @@ -203,15 +207,17 @@ class XGBoostModel(base.Predictor): {field_name: field_type}. Output is optional only if feature_names and feature_types are both specified in the model file. Supported types are "bool", "string", "int64", "float64", "array", "array", "array", "array". - model_path (str): - Cloud Storage path that holds the model files.""" + session (BigQuery Session): + BQ session to create the model + """ def __init__( self, - session: Optional[bigframes.Session] = None, + model_path: str, + *, input: Mapping[str, str] = {}, output: Mapping[str, str] = {}, - model_path: Optional[str] = None, + session: Optional[bigframes.Session] = None, ): self.session = session or bpd.get_global_session() self.model_path = model_path @@ -248,7 +254,7 @@ def _from_bq( ) -> XGBoostModel: assert model.model_type == "XGBOOST" - xgboost_model = cls(session=session, model_path=None) + xgboost_model = cls(session=session, model_path="") xgboost_model._bqml_model = core.BqmlModel(session, model) return xgboost_model diff --git a/bigframes/ml/linear_model.py b/bigframes/ml/linear_model.py index 5ee87b8850..b0c4069352 100644 --- a/bigframes/ml/linear_model.py +++ b/bigframes/ml/linear_model.py @@ -58,6 +58,7 @@ class LinearRegression( def __init__( self, + *, optimize_strategy: Literal[ "auto_strategy", "batch_gradient_descent", "normal_equation" ] = "normal_equation", @@ -192,6 +193,7 @@ class LogisticRegression( # TODO(ashleyxu) support class_weights in the constructor. 
def __init__( self, + *, fit_intercept: bool = True, class_weights: Optional[Union[Literal["balanced"], Dict[str, float]]] = None, ): diff --git a/bigframes/ml/llm.py b/bigframes/ml/llm.py index f44310f6a5..dfe0af2f25 100644 --- a/bigframes/ml/llm.py +++ b/bigframes/ml/llm.py @@ -66,6 +66,7 @@ class PaLM2TextGenerator(base.Predictor): def __init__( self, + *, model_name: Literal["text-bison", "text-bison-32k"] = "text-bison", session: Optional[bigframes.Session] = None, connection_name: Optional[str] = None, @@ -140,6 +141,7 @@ def _from_bq( def predict( self, X: Union[bpd.DataFrame, bpd.Series], + *, temperature: float = 0.0, max_output_tokens: int = 128, top_k: int = 40, @@ -273,6 +275,7 @@ class PaLM2TextEmbeddingGenerator(base.Predictor): def __init__( self, + *, model_name: Literal[ "textembedding-gecko", "textembedding-gecko-multilingual" ] = "textembedding-gecko", @@ -415,6 +418,7 @@ class GeminiTextGenerator(base.Predictor): def __init__( self, + *, session: Optional[bigframes.Session] = None, connection_name: Optional[str] = None, ): @@ -475,6 +479,7 @@ def _from_bq( def predict( self, X: Union[bpd.DataFrame, bpd.Series], + *, temperature: float = 0.9, max_output_tokens: int = 8192, top_k: int = 40, diff --git a/bigframes/ml/metrics/_metrics.py b/bigframes/ml/metrics/_metrics.py index 5731b946ca..5c81f16e31 100644 --- a/bigframes/ml/metrics/_metrics.py +++ b/bigframes/ml/metrics/_metrics.py @@ -34,6 +34,7 @@ def r2_score( y_true: Union[bpd.DataFrame, bpd.Series], y_pred: Union[bpd.DataFrame, bpd.Series], + *, force_finite=True, ) -> float: y_true_series, y_pred_series = utils.convert_to_series(y_true, y_pred) @@ -61,6 +62,7 @@ def r2_score( def accuracy_score( y_true: Union[bpd.DataFrame, bpd.Series], y_pred: Union[bpd.DataFrame, bpd.Series], + *, normalize=True, ) -> float: # TODO(ashleyxu): support sample_weight as the parameter @@ -83,6 +85,7 @@ def accuracy_score( def roc_curve( y_true: Union[bpd.DataFrame, bpd.Series], y_score: Union[bpd.DataFrame, bpd.Series], + *, drop_intermediate: bool = True, ) -> Tuple[bpd.Series, bpd.Series, bpd.Series]: # TODO(bmil): Add multi-class support @@ -227,6 +230,7 @@ def confusion_matrix( def recall_score( y_true: Union[bpd.DataFrame, bpd.Series], y_pred: Union[bpd.DataFrame, bpd.Series], + *, average: str = "binary", ) -> pd.Series: # TODO(ashleyxu): support more average type, default to "binary" @@ -263,6 +267,7 @@ def recall_score( def precision_score( y_true: Union[bpd.DataFrame, bpd.Series], y_pred: Union[bpd.DataFrame, bpd.Series], + *, average: str = "binary", ) -> pd.Series: # TODO(ashleyxu): support more average type, default to "binary" @@ -301,6 +306,7 @@ def precision_score( def f1_score( y_true: Union[bpd.DataFrame, bpd.Series], y_pred: Union[bpd.DataFrame, bpd.Series], + *, average: str = "binary", ) -> pd.Series: # TODO(ashleyxu): support more average type, default to "binary" diff --git a/bigframes/ml/metrics/pairwise.py b/bigframes/ml/metrics/pairwise.py index 04577c89d3..9ebea4ef42 100644 --- a/bigframes/ml/metrics/pairwise.py +++ b/bigframes/ml/metrics/pairwise.py @@ -20,7 +20,7 @@ import third_party.bigframes_vendored.sklearn.metrics.pairwise as vendored_metrics_pairwise -def cosine_similarity( +def paired_cosine_distances( X: Union[bpd.DataFrame, bpd.Series], Y: Union[bpd.DataFrame, bpd.Series] ) -> bpd.DataFrame: X, Y = utils.convert_to_dataframe(X, Y) @@ -28,7 +28,25 @@ def cosine_similarity( raise ValueError("Inputs X and Y can only contain 1 column.") base_bqml = core.BaseBqml(session=X._session) - return 
base_bqml.distance(X, Y, type="COSINE", name="cosine_similarity") + return base_bqml.distance(X, Y, type="COSINE", name="cosine_distance") -cosine_similarity.__doc__ = inspect.getdoc(vendored_metrics_pairwise.cosine_similarity) +paired_cosine_distances.__doc__ = inspect.getdoc( + vendored_metrics_pairwise.paired_cosine_distances +) + + +def paired_manhattan_distance( + X: Union[bpd.DataFrame, bpd.Series], Y: Union[bpd.DataFrame, bpd.Series] +) -> bpd.DataFrame: + X, Y = utils.convert_to_dataframe(X, Y) + if len(X.columns) != 1 or len(Y.columns) != 1: + raise ValueError("Inputs X and Y can only contain 1 column.") + + base_bqml = core.BaseBqml(session=X._session) + return base_bqml.distance(X, Y, type="MANHATTAN", name="manhattan_distance") + + +paired_manhattan_distance.__doc__ = inspect.getdoc( + vendored_metrics_pairwise.paired_manhattan_distance +) diff --git a/bigframes/ml/remote.py b/bigframes/ml/remote.py index 44fde4f32f..a4a95b39d1 100644 --- a/bigframes/ml/remote.py +++ b/bigframes/ml/remote.py @@ -54,6 +54,7 @@ def __init__( endpoint: str, input: Mapping[str, str], output: Mapping[str, str], + *, session: Optional[bigframes.Session] = None, connection_name: Optional[str] = None, ): diff --git a/bigframes/operations/base.py b/bigframes/operations/base.py index 04114b43cb..154247c033 100644 --- a/bigframes/operations/base.py +++ b/bigframes/operations/base.py @@ -30,10 +30,6 @@ import bigframes.session import third_party.bigframes_vendored.pandas.pandas._typing as vendored_pandas_typing -# BigQuery has 1 MB query size limit, 5000 items shouldn't take more than 10% of this depending on data type. -# TODO(tbergeron): Convert to bytes-based limit -MAX_INLINE_SERIES_SIZE = 5000 - class SeriesMethods: def __init__( @@ -104,17 +100,7 @@ def __init__( if pd_series.name is None: # to_frame will set default numeric column label if unnamed, but we do not support int column label, so must rename pd_dataframe = pd_dataframe.set_axis(["unnamed_col"], axis=1) - if ( - pd_dataframe.size < MAX_INLINE_SERIES_SIZE - # TODO(swast): Workaround data types limitation in inline data. - and not any( - dt.pyarrow_dtype - for dt in pd_dataframe.dtypes - if isinstance(dt, pd.ArrowDtype) - ) - ): - block = blocks.Block.from_local(pd_dataframe) - elif session: + if session: block = session.read_pandas(pd_dataframe)._get_block() else: # Uses default global session diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index df0cd6e947..4bd205afea 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -108,6 +108,10 @@ "UTF-32LE", } +# BigQuery has 1 MB query size limit, 5000 items shouldn't take more than 10% of this depending on data type. +# TODO(tbergeron): Convert to bytes-based limit +MAX_INLINE_DF_SIZE = 5000 + logger = logging.getLogger(__name__) @@ -148,7 +152,7 @@ def __init__( context = bigquery_options.BigQueryOptions() # TODO(swast): Get location from the environment. - if context is None or context.location is None: + if context.location is None: self._location = "US" warnings.warn( f"No explicit location is set, so using location {self._location} for the session.", @@ -882,6 +886,29 @@ def read_pandas(self, pandas_dataframe: pandas.DataFrame) -> dataframe.DataFrame def _read_pandas( self, pandas_dataframe: pandas.DataFrame, api_name: str + ) -> dataframe.DataFrame: + if ( + pandas_dataframe.size < MAX_INLINE_DF_SIZE + # TODO(swast): Workaround data types limitation in inline data. 
+ and not any( + ( + isinstance(s.dtype, pandas.ArrowDtype) + or (len(s) > 0 and pandas.api.types.is_list_like(s.iloc[0])) + or pandas.api.types.is_datetime64_any_dtype(s) + ) + for _, s in pandas_dataframe.items() + ) + ): + return self._read_pandas_inline(pandas_dataframe) + return self._read_pandas_load_job(pandas_dataframe, api_name) + + def _read_pandas_inline( + self, pandas_dataframe: pandas.DataFrame + ) -> dataframe.DataFrame: + return dataframe.DataFrame(blocks.Block.from_local(pandas_dataframe)) + + def _read_pandas_load_job( + self, pandas_dataframe: pandas.DataFrame, api_name: str ) -> dataframe.DataFrame: col_labels, idx_labels = ( pandas_dataframe.columns.to_list(), @@ -932,8 +959,8 @@ def _read_pandas( ) table_expression = self.ibis_client.table( # type: ignore load_table_destination.table_id, - # TODO: use "dataset_id" as the "schema" - database=f"{load_table_destination.project}.{load_table_destination.dataset_id}", + schema=load_table_destination.dataset_id, + database=load_table_destination.project, ) # b/297590178 Potentially a bug in bqclient.load_table_from_dataframe(), that only when the DF is empty, the index columns disappear in table_expression. @@ -1079,7 +1106,7 @@ def read_csv( encoding=encoding, **kwargs, ) - return self.read_pandas(pandas_df) # type: ignore + return self._read_pandas(pandas_df, "read_csv") # type: ignore def read_pickle( self, @@ -1096,7 +1123,7 @@ def read_pickle( if isinstance(pandas_obj, pandas.Series): if pandas_obj.name is None: pandas_obj.name = "0" - bigframes_df = self.read_pandas(pandas_obj.to_frame()) + bigframes_df = self._read_pandas(pandas_obj.to_frame(), "read_pickle") return bigframes_df[bigframes_df.columns[0]] return self._read_pandas(pandas_obj, "read_pickle") @@ -1196,7 +1223,7 @@ def read_json( engine=engine, **kwargs, ) - return self.read_pandas(pandas_df) + return self._read_pandas(pandas_df, "read_json") def _check_file_size(self, filepath: str): max_size = 1024 * 1024 * 1024 # 1 GB in bytes @@ -1515,7 +1542,9 @@ def _cache_with_cluster_cols( ibis_expr, cluster_cols=cluster_cols, api_name="cached" ) table_expression = self.ibis_client.table( - f"{tmp_table.project}.{tmp_table.dataset_id}.{tmp_table.table_id}" + tmp_table.table_id, + schema=tmp_table.dataset_id, + database=tmp_table.project, ) new_columns = [table_expression[column] for column in compiled_value.column_ids] new_hidden_columns = [ @@ -1544,7 +1573,9 @@ def _cache_with_offsets(self, array_value: core.ArrayValue) -> core.ArrayValue: ibis_expr, cluster_cols=["bigframes_offsets"], api_name="cached" ) table_expression = self.ibis_client.table( - f"{tmp_table.project}.{tmp_table.dataset_id}.{tmp_table.table_id}" + tmp_table.table_id, + schema=tmp_table.dataset_id, + database=tmp_table.project, ) new_columns = [table_expression[column] for column in compiled_value.column_ids] new_hidden_columns = [table_expression["bigframes_offsets"]] diff --git a/bigframes/session/clients.py b/bigframes/session/clients.py index e33413002f..627c9258a6 100644 --- a/bigframes/session/clients.py +++ b/bigframes/session/clients.py @@ -37,13 +37,21 @@ _APPLICATION_NAME = f"bigframes/{bigframes.version.__version__} ibis/{ibis.__version__}" _SCOPES = ["https://www.googleapis.com/auth/cloud-platform"] +# Regions for which Regional Endpoints (REPs) are supported +_REP_SUPPORTED_REGIONS = {"me-central2"} + + # BigQuery is a REST API, which requires the protocol as part of the URL. 
-_BIGQUERY_REGIONAL_ENDPOINT = "https://{location}-bigquery.googleapis.com" +_BIGQUERY_LOCATIONAL_ENDPOINT = "https://{location}-bigquery.googleapis.com" +_BIGQUERY_REGIONAL_ENDPOINT = "https://bigquery.{location}.rep.googleapis.com" # BigQuery Connection and Storage are gRPC APIs, which don't support the # https:// protocol in the API endpoint URL. -_BIGQUERYCONNECTION_REGIONAL_ENDPOINT = "{location}-bigqueryconnection.googleapis.com" -_BIGQUERYSTORAGE_REGIONAL_ENDPOINT = "{location}-bigquerystorage.googleapis.com" +_BIGQUERYCONNECTION_LOCATIONAL_ENDPOINT = "{location}-bigqueryconnection.googleapis.com" +_BIGQUERYSTORAGE_LOCATIONAL_ENDPOINT = "{location}-bigquerystorage.googleapis.com" +_BIGQUERYSTORAGE_REGIONAL_ENDPOINT = ( + "https://bigquerystorage.{location}.rep.googleapis.com" +) def _get_default_credentials_with_project(): @@ -104,9 +112,11 @@ def bqclient(self): bq_options = None if self._use_regional_endpoints: bq_options = google.api_core.client_options.ClientOptions( - api_endpoint=_BIGQUERY_REGIONAL_ENDPOINT.format( - location=self._location - ), + api_endpoint=( + _BIGQUERY_REGIONAL_ENDPOINT + if self._location.lower() in _REP_SUPPORTED_REGIONS + else _BIGQUERY_LOCATIONAL_ENDPOINT + ).format(location=self._location), ) bq_info = google.api_core.client_info.ClientInfo( user_agent=self._application_name @@ -127,7 +137,7 @@ def bqconnectionclient(self): bqconnection_options = None if self._use_regional_endpoints: bqconnection_options = google.api_core.client_options.ClientOptions( - api_endpoint=_BIGQUERYCONNECTION_REGIONAL_ENDPOINT.format( + api_endpoint=_BIGQUERYCONNECTION_LOCATIONAL_ENDPOINT.format( location=self._location ) ) @@ -150,9 +160,11 @@ def bqstoragereadclient(self): bqstorage_options = None if self._use_regional_endpoints: bqstorage_options = google.api_core.client_options.ClientOptions( - api_endpoint=_BIGQUERYSTORAGE_REGIONAL_ENDPOINT.format( - location=self._location - ) + api_endpoint=( + _BIGQUERYSTORAGE_REGIONAL_ENDPOINT + if self._location.lower() in _REP_SUPPORTED_REGIONS + else _BIGQUERYSTORAGE_LOCATIONAL_ENDPOINT + ).format(location=self._location), ) bqstorage_info = google.api_core.gapic_v1.client_info.ClientInfo( user_agent=self._application_name diff --git a/bigframes/version.py b/bigframes/version.py index a713192ada..387b7663f2 100644 --- a/bigframes/version.py +++ b/bigframes/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "0.21.0" +__version__ = "0.22.0" diff --git a/noxfile.py b/noxfile.py index 7cf9faf685..259943aaa4 100644 --- a/noxfile.py +++ b/noxfile.py @@ -551,7 +551,11 @@ def prerelease(session: nox.sessions.Session, tests_path): # https://github.com/googleapis/python-bigquery-dataframes/issues/341 # https://github.com/googleapis/python-bigquery-dataframes/issues/337 # are resolved - "pandas!=2.1.4, !=2.2.0rc0, !=2.2.0", + # + # We exclude each version individually so that we can continue to test + # some prerelease packages. 
See: + # https://github.com/googleapis/python-bigquery-dataframes/pull/268#discussion_r1423205172 + "pandas!=2.1.4, !=2.2.0rc0, !=2.2.0, !=2.2.1", ) already_installed.add("pandas") diff --git a/samples/snippets/conftest.py b/samples/snippets/conftest.py index 1ce54b3c0c..d34837b3e2 100644 --- a/samples/snippets/conftest.py +++ b/samples/snippets/conftest.py @@ -18,6 +18,8 @@ import pytest import test_utils.prefixer +import bigframes.pandas as bpd + prefixer = test_utils.prefixer.Prefixer( "python-bigquery-dataframes", "samples/snippets" ) @@ -43,6 +45,16 @@ def project_id(bigquery_client: bigquery.Client) -> str: return bigquery_client.project +@pytest.fixture(autouse=True) +def reset_session(): + """An autouse fixture ensuring each sample runs in a fresh session. + + This allows us to have samples that query data in different locations. + """ + bpd.reset_session() + bpd.options.bigquery.location = None + + @pytest.fixture(scope="session") def dataset_id(bigquery_client: bigquery.Client, project_id: str) -> Iterator[str]: dataset_id = prefixer.create_prefix() @@ -53,6 +65,17 @@ def dataset_id(bigquery_client: bigquery.Client, project_id: str) -> Iterator[st bigquery_client.delete_dataset(dataset, delete_contents=True, not_found_ok=True) +@pytest.fixture(scope="session") +def dataset_id_eu(bigquery_client: bigquery.Client, project_id: str) -> Iterator[str]: + dataset_id = prefixer.create_prefix() + full_dataset_id = f"{project_id}.{dataset_id}" + dataset = bigquery.Dataset(full_dataset_id) + dataset.location = "EU" + bigquery_client.create_dataset(dataset) + yield dataset_id + bigquery_client.delete_dataset(dataset, delete_contents=True, not_found_ok=True) + + @pytest.fixture def random_model_id( bigquery_client: bigquery.Client, project_id: str, dataset_id: str @@ -64,3 +87,17 @@ def random_model_id( full_model_id = f"{project_id}.{dataset_id}.{random_model_id}" yield full_model_id bigquery_client.delete_model(full_model_id, not_found_ok=True) + + +@pytest.fixture +def random_model_id_eu( + bigquery_client: bigquery.Client, project_id: str, dataset_id_eu: str +) -> Iterator[str]: + """ + Create a new table ID each time, so random_model_id_eu can be used + as a target for load jobs. + """ + random_model_id_eu = prefixer.create_prefix() + full_model_id = f"{project_id}.{dataset_id_eu}.{random_model_id_eu}" + yield full_model_id + bigquery_client.delete_model(full_model_id, not_found_ok=True) diff --git a/samples/snippets/create_kmeans_model_test.py b/samples/snippets/create_kmeans_model_test.py new file mode 100644 index 0000000000..2429060d09 --- /dev/null +++ b/samples/snippets/create_kmeans_model_test.py @@ -0,0 +1,152 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +def test_kmeans_sample(project_id: str, random_model_id_eu: str): + your_gcp_project_id = project_id + your_model_id = random_model_id_eu + # [START bigquery_dataframes_bqml_kmeans] + import datetime + + import bigframes + import bigframes.pandas as bpd + + bigframes.options.bigquery.project = your_gcp_project_id + # Compute in the EU multi-region to query the London bicycles dataset. + bigframes.options.bigquery.location = "EU" + + # Extract the information you'll need to train the k-means model in this + # tutorial. Use the read_gbq function to represent cycle hires + # data as a DataFrame. + h = bpd.read_gbq( + "bigquery-public-data.london_bicycles.cycle_hire", + col_order=["start_station_name", "start_station_id", "start_date", "duration"], + ).rename( + columns={ + "start_station_name": "station_name", + "start_station_id": "station_id", + } + ) + + s = bpd.read_gbq( + # Use ST_GEOPOINT and ST_DISTANCE to analyze geographical + # data. These functions determine spatial relationships between + # geographical features. + """ + SELECT + id, + ST_DISTANCE( + ST_GEOGPOINT(s.longitude, s.latitude), + ST_GEOGPOINT(-0.1, 51.5) + ) / 1000 AS distance_from_city_center + FROM + `bigquery-public-data.london_bicycles.cycle_stations` s + """ + ) + + # Define Python datetime objects in the UTC timezone for range comparison, + # because BigQuery stores timestamp data in the UTC timezone. + sample_time = datetime.datetime(2015, 1, 1, 0, 0, 0, tzinfo=datetime.timezone.utc) + sample_time2 = datetime.datetime(2016, 1, 1, 0, 0, 0, tzinfo=datetime.timezone.utc) + + h = h.loc[(h["start_date"] >= sample_time) & (h["start_date"] <= sample_time2)] + + # Replace each day-of-the-week number with the corresponding "weekday" or + # "weekend" label by using the Series.map method. + h = h.assign( + isweekday=h.start_date.dt.dayofweek.map( + { + 0: "weekday", + 1: "weekday", + 2: "weekday", + 3: "weekday", + 4: "weekday", + 5: "weekend", + 6: "weekend", + } + ) + ) + + # Supplement each trip in "h" with the station distance information from + # "s" by merging the two DataFrames by station ID. + merged_df = h.merge( + right=s, + how="inner", + left_on="station_id", + right_on="id", + ) + + # Engineer features to cluster the stations. For each station, find the + # average trip duration, number of trips, and distance from city center. + stationstats = merged_df.groupby(["station_name", "isweekday"]).agg( + {"duration": ["mean", "count"], "distance_from_city_center": "max"} + ) + stationstats.columns = ["duration", "num_trips", "distance_from_city_center"] + stationstats = stationstats.sort_values( + by="distance_from_city_center", ascending=True + ).reset_index() + + # Expected output results: >>> stationstats.head(3) + # station_name isweekday duration num_trips distance_from_city_center + # Borough Road... weekday 1110 5749 0.12624 + # Borough Road... weekend 2125 1774 0.12624 + # Webber Street... weekday 795 6517 0.164021 + # 3 rows × 5 columns + + # [END bigquery_dataframes_bqml_kmeans] + + # [START bigquery_dataframes_bqml_kmeans_fit] + + from bigframes.ml.cluster import KMeans + + # To determine an optimal number of clusters, construct and fit several + # K-Means objects with different values of num_clusters, find the error + # measure, and pick the point at which the error measure is at its minimum + # value. 
+    cluster_model = KMeans(n_clusters=4)
+    cluster_model.fit(stationstats)
+    cluster_model.to_gbq(
+        your_model_id,  # For example: "bqml_tutorial.london_station_clusters"
+        replace=True,
+    )
+    # [END bigquery_dataframes_bqml_kmeans_fit]
+
+    # [START bigquery_dataframes_bqml_kmeans_predict]
+
+    # Select the model you'll use for predictions. `read_gbq_model` loads model
+    # data from BigQuery, but you could also use the `cluster_model` object
+    # from previous steps.
+    cluster_model = bpd.read_gbq_model(
+        your_model_id,
+        # For example: "bqml_tutorial.london_station_clusters",
+    )
+
+    # Use the 'contains' function to filter to stations whose names contain the
+    # string "Kennington".
+    stationstats = stationstats.loc[
+        stationstats["station_name"].str.contains("Kennington")
+    ]
+
+    result = cluster_model.predict(stationstats)
+
+    # Expected output results: >>> result.peek(3)
+    # CENTROID... NEAREST... station_name isweekday duration num_trips dist...
+    # 1 [{'CENTROID_ID'... Borough... weekday 1110 5749 0.13
+    # 2 [{'CENTROID_ID'... Borough... weekend 2125 1774 0.13
+    # 1 [{'CENTROID_ID'... Webber... weekday 795 6517 0.16
+    # 3 rows × 7 columns
+
+    # [END bigquery_dataframes_bqml_kmeans_predict]
+
+    assert result is not None
diff --git a/setup.py b/setup.py
index 345d1ea752..4aa07904f7 100644
--- a/setup.py
+++ b/setup.py
@@ -32,11 +32,12 @@
 # 'Development Status :: 5 - Production/Stable'
 release_status = "Development Status :: 3 - Alpha"
 dependencies = [
+    # please keep these in sync with the minimum versions in testing/constraints-3.9.txt
     "cloudpickle >= 2.0.0",
     "fsspec >=2023.3.0",
     "gcsfs >=2023.3.0",
     "geopandas >=0.12.2",
-    "google-auth >2.14.1,<3.0dev",
+    "google-auth >=2.15.0,<3.0dev",
     "google-cloud-bigquery[bqstorage,pandas] >=3.10.0",
     "google-cloud-functions >=1.10.1",
     "google-cloud-bigquery-connection >=1.12.0",
@@ -51,6 +52,10 @@
     "requests >=2.27.1",
     "scikit-learn >=1.2.2",
     "sqlalchemy >=1.4,<3.0dev",
+    # Keep sqlglot versions in sync with ibis-framework.
This avoids problems + # where the incorrect version of sqlglot is installed, such as + # https://github.com/googleapis/python-bigquery-dataframes/issues/315 + "sqlglot >=19.9.0,<20", "tabulate >= 0.9", "ipywidgets >=7.7.1", "humanize >= 4.6.0", diff --git a/testing/constraints-3.9.txt b/testing/constraints-3.9.txt index 218255c77e..42cc68eb04 100644 --- a/testing/constraints-3.9.txt +++ b/testing/constraints-3.9.txt @@ -1,121 +1,24 @@ -argcomplete==2.1.2 -asyncmock==0.4.2 -atpublic==3.1.1 -attrs==22.2.0 -bidict==0.22.1 -black==23.3.0 -bleach==6.0.0 -cachetools==5.3.0 -certifi==2022.12.7 -cffi==1.15.1 -cfgv==3.3.1 -charset-normalizer==2.0.0 -click==8.1.3 +# please keep these in sync with the minimum versions in setup.py cloudpickle==2.0.0 -colorlog==6.7.0 -coverage==7.2.2 -cryptography==40.0.1 -distlib==0.3.6 -docstring-inheritance==2.0.0 -docutils==0.19 -exceptiongroup==1.1.1 -execnet==1.9.0 -filelock==3.10.7 fsspec==2023.3.0 -gcp-docuploader==0.6.5 -gcp-releasetool==1.11.0 gcsfs==2023.3.0 geopandas==0.12.2 -google-api-core==2.11.0 -google-auth==2.17.0 -google-auth-oauthlib==1.0.0 +google-auth==2.15.0 google-cloud-bigquery==3.10.0 -google-cloud-bigquery-connection==1.12.0 -google-cloud-bigquery-storage==2.19.1 -google-cloud-core==2.3.2 google-cloud-functions==1.10.1 +google-cloud-bigquery-connection==1.12.0 google-cloud-iam==2.12.1 google-cloud-resource-manager==1.10.3 google-cloud-storage==2.0.0 -google-cloud-testutils==1.3.3 -google-crc32c==1.5.0 -google-resumable-media==2.4.1 -googleapis-common-protos==1.59.0 -greenlet==2.0.2 -grpc-google-iam-v1==0.12.6 -grpcio==1.53.0 -grpcio-status==1.48.2 ibis-framework==7.1.0 -humanize==4.6.0 -identify==2.5.22 -idna==3.4 -importlib-metadata==6.1.0 -iniconfig==2.0.0 -ipywidgets==7.7.1 -jaraco.classes==3.2.3 -jeepney==0.8.0 -Jinja2==3.1.2 -keyring==23.13.1 -markdown-it-py==2.2.0 -MarkupSafe==2.1.2 -mdurl==0.1.2 -mock==5.0.1 -more-itertools==9.1.0 -multipledispatch==0.6.0 -mypy-extensions==1.0.0 -nodeenv==1.7.0 -nox==2022.11.21 -numpy==1.24.2 -oauthlib==3.2.2 -packaging==23.0 pandas==1.5.0 -pandas-gbq==0.19.0 -parsy==2.1 -pathspec==0.11.1 -pkginfo==1.9.6 -platformdirs==3.2.0 -pluggy==1.0.0 -pooch==1.7.0 -pre-commit==3.2.1 -proto-plus==1.22.2 -protobuf==3.20.3 -pyarrow==11.0.0 -pyasn1==0.4.8 -pyasn1-modules==0.2.8 -pycparser==2.21 pydata-google-auth==1.8.2 -Pygments==2.14.0 -PyJWT==2.6.0 -pyperclip==1.8.2 -pytest==7.2.2 -pytest-asyncio==0.21.0 -pytest-cov==4.0.0 -pytest-mock==3.11.1 -pytest-retry==1.1.0 -pytest-xdist==3.2.1 -python-dateutil==2.8.2 -pytz==2023.3 -PyYAML==6.0 -readme-renderer==37.3 requests==2.27.1 -requests-oauthlib==1.3.1 -requests-toolbelt==0.10.1 -rfc3986==2.0.0 -rich==13.3.3 -rsa==4.9 scikit-learn==1.2.2 -SecretStorage==3.3.3 -six==1.16.0 -SQLAlchemy==1.4.0 -sqlglot==18.12.0 -tomli==2.0.1 -toolz==0.12.0 -tqdm==4.65.0 -twine==4.0.2 -typing_extensions==4.5.0 -tzdata==2023.3 -urllib3==1.26.15 -virtualenv==20.21.0 -webencodings==0.5.1 -xxhash==3.2.0 -zipp==3.15.0 +sqlalchemy==1.4 +sqlglot==19.9.0 +tabulate==0.9 +ipywidgets==7.7.1 +humanize==4.6.0 +# extras +pandas-gbq==0.19.0 diff --git a/tests/system/small/ml/test_metrics_pairwise.py b/tests/system/small/ml/test_metrics_pairwise.py index c02a36abbc..e2aee971ee 100644 --- a/tests/system/small/ml/test_metrics_pairwise.py +++ b/tests/system/small/ml/test_metrics_pairwise.py @@ -19,17 +19,31 @@ import bigframes.pandas as bpd -def test_cosine_similarity(): +def test_paired_cosine_distances(): x_col = [np.array([4.1, 0.5, 1.0])] y_col = [np.array([3.0, 0.0, 
2.5])]
     X = bpd.read_pandas(pd.DataFrame({"X": x_col}))
     Y = bpd.read_pandas(pd.DataFrame({"Y": y_col}))

-    result = metrics.pairwise.cosine_similarity(X, Y)
+    result = metrics.pairwise.paired_cosine_distances(X, Y)
     expected_pd_df = pd.DataFrame(
-        {"X": x_col, "Y": y_col, "cosine_similarity": [0.108199]}
+        {"X": x_col, "Y": y_col, "cosine_distance": [0.108199]}
     )

     pd.testing.assert_frame_equal(
         result.to_pandas(), expected_pd_df, check_dtype=False, check_index_type=False
     )
+
+
+def test_paired_manhattan_distance():
+    x_col = [np.array([4.1, 0.5, 1.0])]
+    y_col = [np.array([3.0, 0.0, 2.5])]
+    X = bpd.read_pandas(pd.DataFrame({"X": x_col}))
+    Y = bpd.read_pandas(pd.DataFrame({"Y": y_col}))
+
+    result = metrics.pairwise.paired_manhattan_distance(X, Y)
+    expected_pd_df = pd.DataFrame({"X": x_col, "Y": y_col, "manhattan_distance": [3.1]})
+
+    pd.testing.assert_frame_equal(
+        result.to_pandas(), expected_pd_df, check_dtype=False, check_index_type=False
+    )
diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py
index 3d31253021..8f75534fc6 100644
--- a/tests/system/small/test_dataframe.py
+++ b/tests/system/small/test_dataframe.py
@@ -1783,6 +1783,46 @@ def test_combine_first(
     pd.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False)


+@pytest.mark.parametrize(
+    ("columns", "numeric_only"),
+    [
+        (["bool_col", "int64_col", "float64_col"], True),
+        (["bool_col", "int64_col", "float64_col"], False),
+        (["bool_col", "int64_col", "float64_col", "string_col"], True),
+        pytest.param(
+            ["bool_col", "int64_col", "float64_col", "string_col"],
+            False,
+            marks=pytest.mark.xfail(
+                raises=NotImplementedError,
+            ),
+        ),
+    ],
+)
+def test_corr_w_numeric_only(scalars_dfs, columns, numeric_only):
+    scalars_df, scalars_pandas_df = scalars_dfs
+
+    bf_result = scalars_df[columns].corr(numeric_only=numeric_only).to_pandas()
+    pd_result = scalars_pandas_df[columns].corr(numeric_only=numeric_only)
+
+    # BigFrames and Pandas differ in their data type handling:
+    # - Column types: BigFrames uses Float64, Pandas uses float64.
+    # - Index types: BigFrames uses string, Pandas uses object.
+    pd.testing.assert_frame_equal(
+        bf_result, pd_result, check_dtype=False, check_index_type=False
+    )
+
+
+def test_corr_w_invalid_parameters(scalars_dfs):
+    columns = ["int64_too", "int64_col", "float64_col"]
+    scalars_df, _ = scalars_dfs
+
+    with pytest.raises(NotImplementedError):
+        scalars_df[columns].corr(method="kendall")
+
+    with pytest.raises(NotImplementedError):
+        scalars_df[columns].corr(min_periods=1)
+
+
 @pytest.mark.parametrize(
     ("op"),
     [
diff --git a/tests/system/small/test_multiindex.py b/tests/system/small/test_multiindex.py
index c5e8b45b8e..e0b9164315 100644
--- a/tests/system/small/test_multiindex.py
+++ b/tests/system/small/test_multiindex.py
@@ -880,6 +880,27 @@ def test_column_multi_index_w_na_stack(scalars_df_index, scalars_pandas_df_index
     pandas.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False)


+def test_corr_w_multi_index(scalars_df_index, scalars_pandas_df_index):
+    columns = ["int64_too", "float64_col", "int64_col"]
+    multi_columns = pandas.MultiIndex.from_tuples(zip(["a", "b", "b"], [1, 2, 2]))
+
+    bf = scalars_df_index[columns].copy()
+    bf.columns = multi_columns
+
+    pd_df = scalars_pandas_df_index[columns].copy()
+    pd_df.columns = multi_columns
+
+    bf_result = bf.corr(numeric_only=True).to_pandas()
+    pd_result = pd_df.corr(numeric_only=True)
+
+    # BigFrames and Pandas differ in their data type handling:
+    # - Column types: BigFrames uses Float64, Pandas uses float64.
+    # - Index types: BigFrames uses string, Pandas uses object.
+    pandas.testing.assert_frame_equal(
+        bf_result, pd_result, check_dtype=False, check_index_type=False
+    )
+
+
 @pytest.mark.parametrize(
     ("index_names",),
     [
diff --git a/tests/system/small/test_progress_bar.py b/tests/system/small/test_progress_bar.py
index bd13ac2240..1c04b580fc 100644
--- a/tests/system/small/test_progress_bar.py
+++ b/tests/system/small/test_progress_bar.py
@@ -15,10 +15,12 @@
 import re
 import tempfile

+import numpy as np
 import pandas as pd

 import bigframes as bf
 import bigframes.formatting_helpers as formatting_helpers
+from bigframes.session import MAX_INLINE_DF_SIZE

 job_load_message_regex = r"\w+ job [\w-]+ is \w+\."

@@ -66,10 +68,15 @@ def test_progress_bar_extract_jobs(
 def test_progress_bar_load_jobs(
     session: bf.Session, penguins_pandas_df_default_index: pd.DataFrame, capsys
 ):
+    # Repeat the DataFrame so it is big enough to trigger a load job.
+    df = penguins_pandas_df_default_index
+    while len(df) < MAX_INLINE_DF_SIZE:
+        df = pd.DataFrame(np.repeat(df.values, 2, axis=0))
+
     bf.options.display.progress_bar = "terminal"
     with tempfile.TemporaryDirectory() as dir:
         path = dir + "/test_read_csv_progress_bar*.csv"
-        penguins_pandas_df_default_index.to_csv(path, index=False)
+        df.to_csv(path, index=False)
         capsys.readouterr()  # clear output
         session.read_csv(path)
diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py
index 42651ed96f..37b4f8c1de 100644
--- a/tests/system/small/test_series.py
+++ b/tests/system/small/test_series.py
@@ -1228,6 +1228,16 @@ def test_median(scalars_dfs):
     assert pd_min < bf_result < pd_max


+def test_numeric_literal(scalars_dfs):
+    scalars_df, _ = scalars_dfs
+    col_name = "numeric_col"
+    assert scalars_df[col_name].dtype == pd.ArrowDtype(pa.decimal128(38, 9))
+    bf_result = scalars_df[col_name] - scalars_df[col_name].median()
+    assert bf_result.size == scalars_df[col_name].size
+    # TODO(b/323387826): The precision increased by 1 unexpectedly.
+ # assert bf_result.dtype == pd.ArrowDtype(pa.decimal128(38, 9)) + + def test_repr(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs if scalars_pandas_df.index.name != "rowindex": diff --git a/tests/unit/session/test_io_bigquery.py b/tests/unit/session/test_io_bigquery.py index 96bb7bf67f..406de2b88e 100644 --- a/tests/unit/session/test_io_bigquery.py +++ b/tests/unit/session/test_io_bigquery.py @@ -23,6 +23,7 @@ from bigframes.core import log_adapter import bigframes.pandas as bpd import bigframes.session._io.bigquery as io_bq +from tests.unit import resources def test_create_job_configs_labels_is_none(): @@ -64,7 +65,9 @@ def test_create_job_configs_labels_log_adaptor_call_method_under_length_limit(): "bigframes-api": "read_pandas", "source": "bigquery-dataframes-temp", } - df = bpd.DataFrame({"col1": [1, 2], "col2": [3, 4]}) + df = bpd.DataFrame( + {"col1": [1, 2], "col2": [3, 4]}, session=resources.create_bigquery_session() + ) # Test running two methods df.head() df.max() @@ -81,15 +84,16 @@ def test_create_job_configs_labels_log_adaptor_call_method_under_length_limit(): "recent-bigframes-api-2": "dataframe-__init__", "recent-bigframes-api-3": "dataframe-head", "recent-bigframes-api-4": "dataframe-__init__", + "recent-bigframes-api-5": "dataframe-__init__", } - assert labels is not None - assert len(labels) == 7 assert labels == expected_dict def test_create_job_configs_labels_length_limit_met_and_labels_is_none(): log_adapter.get_and_reset_api_methods() - df = bpd.DataFrame({"col1": [1, 2], "col2": [3, 4]}) + df = bpd.DataFrame( + {"col1": [1, 2], "col2": [3, 4]}, session=resources.create_bigquery_session() + ) # Test running methods more than the labels' length limit for i in range(66): df.head() @@ -114,7 +118,9 @@ def test_create_job_configs_labels_length_limit_met(): value = f"test{i}" cur_labels[key] = value # If cur_labels length is 62, we can only add one label from api_methods - df = bpd.DataFrame({"col1": [1, 2], "col2": [3, 4]}) + df = bpd.DataFrame( + {"col1": [1, 2], "col2": [3, 4]}, session=resources.create_bigquery_session() + ) # Test running two methods df.head() df.max() diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index 05f4167838..84d2aa7fcb 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -2805,6 +2805,40 @@ def combine_first(self, other) -> DataFrame: """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def corr(self, method, min_periods, numeric_only) -> DataFrame: + """ + Compute pairwise correlation of columns, excluding NA/null values. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({'A': [1, 2, 3], + ... 'B': [400, 500, 600], + ... 'C': [0.8, 0.4, 0.9]}) + >>> df.corr(numeric_only=True) + A B C + A 1.0 1.0 0.188982 + B 1.0 1.0 0.188982 + C 0.188982 0.188982 1.0 + + [3 rows x 3 columns] + + Args: + method (string, default "pearson"): + Correlation method to use - currently only "pearson" is supported. + min_periods (int, default None): + The minimum number of observations needed to return a result. Non-default values + are not yet supported, so a result will be returned for at least two observations. + numeric_only(bool, default False): + Include only float, int, boolean, decimal data. + + Returns: + DataFrame: Correlation matrix. 
+ """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def update( self, other, join: str = "left", overwrite: bool = True, filter_func=None ) -> DataFrame: diff --git a/third_party/bigframes_vendored/pandas/core/reshape/concat.py b/third_party/bigframes_vendored/pandas/core/reshape/concat.py index b0472c524a..685a73abc2 100644 --- a/third_party/bigframes_vendored/pandas/core/reshape/concat.py +++ b/third_party/bigframes_vendored/pandas/core/reshape/concat.py @@ -14,8 +14,7 @@ def concat( join: str = "outer", ignore_index: bool = False, ): - """ - Concatenate BigQuery DataFrames objects along a particular axis. + """Concatenate BigQuery DataFrames objects along a particular axis. Allows optional set logic along the other axes. @@ -23,118 +22,116 @@ def concat( which may be useful if the labels are the same (or overlapping) on the passed axis number. - Parameters - ---------- - objs: - Objects to concatenate. Any None objects will be dropped silently unless - they are all None in which case a ValueError will be raised. - axis : {0/'index', 1/'columns'}, default 0 - The axis to concatenate along. - join: {'inner', 'outer'}, default 'outer' - How to handle indexes on other axis (or axes). - ignore_index : bool, default False - If True, do not use the index values along the concatenation axis. The - resulting axis will be labeled 0, ..., n - 1. This is useful if you are - concatenating objects where the concatenation axis does not have - meaningful indexing information. Note the index values on the other - axes are still respected in the join. - - Returns - ------- - object, type of objs - When concatenating all ``Series`` along the index (axis=0), a - ``Series`` is returned. When ``objs`` contains at least one - ``DataFrame``, a ``DataFrame`` is returned. - - Notes - ----- - It is not recommended to build DataFrames by adding single rows in a - for loop. Build a list of rows and make a DataFrame in a single concat. - - Examples - -------- + .. note:: + It is not recommended to build DataFrames by adding single rows in a + for loop. Build a list of rows and make a DataFrame in a single concat. + + **Examples:** + + >>> import bigframes.pandas as pd + >>> pd.options.display.progress_bar = None + Combine two ``Series``. - >>> import bigframes.pandas as pd - >>> pd.options.display.progress_bar = None - >>> s1 = pd.Series(['a', 'b']) - >>> s2 = pd.Series(['c', 'd']) - >>> pd.concat([s1, s2]) - 0 a - 1 b - 0 c - 1 d - dtype: string + >>> s1 = pd.Series(['a', 'b']) + >>> s2 = pd.Series(['c', 'd']) + >>> pd.concat([s1, s2]) + 0 a + 1 b + 0 c + 1 d + dtype: string Clear the existing index and reset it in the result by setting the ``ignore_index`` option to ``True``. - >>> pd.concat([s1, s2], ignore_index=True) - 0 a - 1 b - 2 c - 3 d - dtype: string + >>> pd.concat([s1, s2], ignore_index=True) + 0 a + 1 b + 2 c + 3 d + dtype: string Combine two ``DataFrame`` objects with identical columns. - >>> df1 = pd.DataFrame([['a', 1], ['b', 2]], - ... columns=['letter', 'number']) - >>> df1 - letter number - 0 a 1 - 1 b 2 - - [2 rows x 2 columns] - >>> df2 = pd.DataFrame([['c', 3], ['d', 4]], - ... columns=['letter', 'number']) - >>> df2 - letter number - 0 c 3 - 1 d 4 - - [2 rows x 2 columns] - >>> pd.concat([df1, df2]) - letter number - 0 a 1 - 1 b 2 - 0 c 3 - 1 d 4 - - [4 rows x 2 columns] + >>> df1 = pd.DataFrame([['a', 1], ['b', 2]], + ... 
columns=['letter', 'number']) + >>> df1 + letter number + 0 a 1 + 1 b 2 + + [2 rows x 2 columns] + >>> df2 = pd.DataFrame([['c', 3], ['d', 4]], + ... columns=['letter', 'number']) + >>> df2 + letter number + 0 c 3 + 1 d 4 + + [2 rows x 2 columns] + >>> pd.concat([df1, df2]) + letter number + 0 a 1 + 1 b 2 + 0 c 3 + 1 d 4 + + [4 rows x 2 columns] Combine ``DataFrame`` objects with overlapping columns and return everything. Columns outside the intersection will be filled with ``NaN`` values. - >>> df3 = pd.DataFrame([['c', 3, 'cat'], ['d', 4, 'dog']], - ... columns=['letter', 'number', 'animal']) - >>> df3 - letter number animal - 0 c 3 cat - 1 d 4 dog - - [2 rows x 3 columns] - >>> pd.concat([df1, df3]) - letter number animal - 0 a 1 - 1 b 2 - 0 c 3 cat - 1 d 4 dog - - [4 rows x 3 columns] + >>> df3 = pd.DataFrame([['c', 3, 'cat'], ['d', 4, 'dog']], + ... columns=['letter', 'number', 'animal']) + >>> df3 + letter number animal + 0 c 3 cat + 1 d 4 dog + + [2 rows x 3 columns] + >>> pd.concat([df1, df3]) + letter number animal + 0 a 1 + 1 b 2 + 0 c 3 cat + 1 d 4 dog + + [4 rows x 3 columns] Combine ``DataFrame`` objects with overlapping columns and return only those that are shared by passing ``inner`` to the ``join`` keyword argument. - >>> pd.concat([df1, df3], join="inner") - letter number - 0 a 1 - 1 b 2 - 0 c 3 - 1 d 4 - - [4 rows x 2 columns] + >>> pd.concat([df1, df3], join="inner") + letter number + 0 a 1 + 1 b 2 + 0 c 3 + 1 d 4 + + [4 rows x 2 columns] + + Args: + objs (list of objects): + Objects to concatenate. Any None objects will be dropped silently unless + they are all None in which case a ValueError will be raised. + axis ({0 or 'index', 1 or 'columns'}, default 0): + The axis to concatenate along. + join ({'inner', 'outer'}, default 'outer'): + How to handle indexes on other axis (or axes). + ignore_index (bool, default False): + If True, do not use the index values along the concatenation axis. The + resulting axis will be labeled 0, ..., n - 1. This is useful if you are + concatenating objects where the concatenation axis does not have + meaningful indexing information. Note the index values on the other + axes are still respected in the join. + + Returns: + object, type of objs: + When concatenating all ``Series`` along the index (axis=0), a + ``Series`` is returned. When ``objs`` contains at least one + ``DataFrame``, a ``DataFrame`` is returned. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index b203471606..6c01a6dd0c 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -840,7 +840,7 @@ def corr(self, other, method="pearson", min_periods=None) -> float: float: Will return NaN if there are fewer than two numeric pairs, either series has a variance or covariance of zero, or any input value is infinite. 
""" - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def cov( self, diff --git a/third_party/bigframes_vendored/sklearn/metrics/pairwise.py b/third_party/bigframes_vendored/sklearn/metrics/pairwise.py index 3ef5431178..5791d850ff 100644 --- a/third_party/bigframes_vendored/sklearn/metrics/pairwise.py +++ b/third_party/bigframes_vendored/sklearn/metrics/pairwise.py @@ -11,13 +11,23 @@ import bigframes.pandas as bpd -def cosine_similarity(X, Y) -> bpd.DataFrame: - """Compute cosine similarity between samples in X and Y. +def paired_cosine_distances(X, Y) -> bpd.DataFrame: + """Compute the paired cosine distances between X and Y. + + Args: + X (Series or single column DataFrame of array of numeric type): + Input data. + Y (Series or single column DataFrame of array of numeric type): + Input data. X and Y are mapped by indexes, must have the same index. + + Returns: + bigframes.dataframe.DataFrame: DataFrame with columns of X, Y and cosine_distance + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) - Cosine similarity, or the cosine kernel, computes similarity as the - normalized dot product of X and Y: - K(X, Y) = / (||X||*||Y||) +def paired_manhattan_distance(X, Y) -> bpd.DataFrame: + """Compute the L1 distances between the vectors in X and Y. Args: X (Series or single column DataFrame of array of numeric type): @@ -26,6 +36,6 @@ def cosine_similarity(X, Y) -> bpd.DataFrame: Input data. X and Y are mapped by indexes, must have the same index. Returns: - bigframes.dataframe.DataFrame: DataFrame with columns of X, Y and cosine_similarity + bigframes.dataframe.DataFrame: DataFrame with columns of X, Y and manhattan_distance """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)