diff --git a/CHANGELOG.md b/CHANGELOG.md index 565fe43241..4edd37bed3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,14 @@ [1]: https://pypi.org/project/bigframes/#history +## [0.25.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v0.24.0...v0.25.0) (2024-03-14) + + +### Features + +* (Series|DataFrame).plot.(line|area|scatter) ([#431](https://github.com/googleapis/python-bigquery-dataframes/issues/431)) ([0772510](https://github.com/googleapis/python-bigquery-dataframes/commit/077251084e3121019c56e5d6c16aebab16be8dc7)) +* Support CMEK for `remote_function` cloud functions ([#430](https://github.com/googleapis/python-bigquery-dataframes/issues/430)) ([2fd69f4](https://github.com/googleapis/python-bigquery-dataframes/commit/2fd69f4bed143fc8c040dac1c55288c1cb660f6e)) + ## [0.24.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v0.23.0...v0.24.0) (2024-03-12) diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index f29d653d4f..366820f9f6 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -20,7 +20,6 @@ import typing from typing import Any, Dict, Iterable, Literal, Tuple, Union -import bigframes_vendored.google_cloud_bigquery._pandas_helpers as gcb3p_pandas_helpers import bigframes_vendored.ibis.backends.bigquery.datatypes as third_party_ibis_bqtypes import bigframes_vendored.ibis.expr.operations as vendored_ibis_ops import geopandas as gpd # type: ignore @@ -68,7 +67,7 @@ "date32[day][pyarrow]", "time64[us][pyarrow]", "decimal128(38, 9)[pyarrow]", - "decimal256(38, 9)[pyarrow]", + "decimal256(76, 38)[pyarrow]", "binary[pyarrow]", ] @@ -171,7 +170,7 @@ } # special case - string[pyarrow] doesn't include the storage in its name, and both -# "string" and "string[pyarrow] are accepted" +# "string" and "string[pyarrow]" are accepted BIGFRAMES_STRING_TO_BIGFRAMES["string[pyarrow]"] = pd.StringDtype(storage="pyarrow") # For the purposes of dataframe.memory_usage @@ -492,21 +491,6 @@ def cast_ibis_value( ) -def to_pandas_dtypes_overrides(schema: Iterable[bigquery.SchemaField]) -> Dict: - """For each STRUCT field, make sure we specify the full type to use.""" - # TODO(swast): Also override ARRAY fields. - dtypes = {} - for field in schema: - if field.field_type == "RECORD" and field.mode != "REPEATED": - # TODO(swast): We're using a private API here. Would likely be - # better if we called `to_arrow()` and converted to a pandas - # DataFrame ourselves from that. - dtypes[field.name] = pd.ArrowDtype( - gcb3p_pandas_helpers.bq_to_arrow_data_type(field) - ) - return dtypes - - def is_dtype(scalar: typing.Any, dtype: Dtype) -> bool: """Captures whether a scalar can be losslessly represented by a dtype.""" if scalar is None: diff --git a/bigframes/functions/remote_function.py b/bigframes/functions/remote_function.py index 29c1c68e7c..09a9d97869 100644 --- a/bigframes/functions/remote_function.py +++ b/bigframes/functions/remote_function.py @@ -130,6 +130,8 @@ def __init__( bq_connection_id, cloud_resource_manager_client, cloud_function_service_account, + cloud_function_kms_key_name, + cloud_function_docker_repository, ): self._gcp_project_id = gcp_project_id self._cloud_function_region = cloud_function_region @@ -142,6 +144,8 @@ def __init__( bq_connection_client, cloud_resource_manager_client ) self._cloud_function_service_account = cloud_function_service_account + self._cloud_function_kms_key_name = cloud_function_kms_key_name + self._cloud_function_docker_repository = cloud_function_docker_repository def create_bq_remote_function( self, input_args, input_types, output_type, endpoint, bq_function_name @@ -344,7 +348,9 @@ def create_cloud_function(self, def_, cf_name, package_requirements=None): ) # Determine an upload URL for user code - upload_url_request = functions_v2.GenerateUploadUrlRequest() + upload_url_request = functions_v2.GenerateUploadUrlRequest( + kms_key_name=self._cloud_function_kms_key_name + ) upload_url_request.parent = self.get_cloud_function_fully_qualified_parent() upload_url_response = self._cloud_functions_client.generate_upload_url( request=upload_url_request @@ -383,12 +389,16 @@ def create_cloud_function(self, def_, cf_name, package_requirements=None): function.build_config.source.storage_source.object_ = ( upload_url_response.storage_source.object_ ) + function.build_config.docker_repository = ( + self._cloud_function_docker_repository + ) function.service_config = functions_v2.ServiceConfig() function.service_config.available_memory = "1024M" function.service_config.timeout_seconds = 600 function.service_config.service_account_email = ( self._cloud_function_service_account ) + function.kms_key_name = self._cloud_function_kms_key_name create_function_request.function = function # Create the cloud function and wait for it to be ready to use @@ -597,6 +607,8 @@ def remote_function( name: Optional[str] = None, packages: Optional[Sequence[str]] = None, cloud_function_service_account: Optional[str] = None, + cloud_function_kms_key_name: Optional[str] = None, + cloud_function_docker_repository: Optional[str] = None, ): """Decorator to turn a user defined function into a BigQuery remote function. @@ -699,6 +711,20 @@ def remote_function( for more details. Please make sure the service account has the necessary IAM permissions configured as described in https://cloud.google.com/functions/docs/reference/iam/roles#additional-configuration. + cloud_function_kms_key_name (str, Optional): + Customer managed encryption key to protect cloud functions and + related data at rest. This is of the format + projects/PROJECT_ID/locations/LOCATION/keyRings/KEYRING/cryptoKeys/KEY. + Read https://cloud.google.com/functions/docs/securing/cmek for + more details including granting necessary service accounts + access to the key. + cloud_function_docker_repository (str, Optional): + Docker repository created with the same encryption key as + `cloud_function_kms_key_name` to store encrypted artifacts + created to support the cloud function. This is of the format + projects/PROJECT_ID/locations/LOCATION/repositories/REPOSITORY_NAME. + For more details see + https://cloud.google.com/functions/docs/securing/cmek#before_you_begin. """ import bigframes.pandas as bpd @@ -780,6 +806,16 @@ def remote_function( f"{bq_location}." ) + # If any CMEK is intended then check that a docker repository is also specified + if ( + cloud_function_kms_key_name is not None + and cloud_function_docker_repository is None + ): + raise ValueError( + "cloud_function_docker_repository must be specified with cloud_function_kms_key_name." + " For more details see https://cloud.google.com/functions/docs/securing/cmek#before_you_begin" + ) + def wrapper(f): if not callable(f): raise TypeError("f must be callable, got {}".format(f)) @@ -800,6 +836,8 @@ def wrapper(f): bq_connection_id, resource_manager_client, cloud_function_service_account, + cloud_function_kms_key_name, + cloud_function_docker_repository, ) rf_name, cf_name = remote_function_client.provision_bq_remote_function( diff --git a/bigframes/ml/core.py b/bigframes/ml/core.py index 43a882ecac..03d9b806b9 100644 --- a/bigframes/ml/core.py +++ b/bigframes/ml/core.py @@ -245,7 +245,7 @@ def register(self, vertex_ai_model_id: Optional[str] = None) -> BqmlModel: options={"vertex_ai_model_id": vertex_ai_model_id} ) # Register the model and wait it to finish - self._session._start_query_create_model(sql) + self._session._start_query_ml_ddl(sql) self._model = self._session.bqclient.get_model(self.model_name) return self @@ -264,7 +264,7 @@ def _create_model_ref( def _create_model_with_sql(self, session: bigframes.Session, sql: str) -> BqmlModel: # fit the model, synchronously - _, job = session._start_query_create_model(sql) + _, job = session._start_query_ml_ddl(sql) # real model path in the session specific hidden dataset and table prefix model_name_full = f"{job.destination.project}.{job.destination.dataset_id}.{job.destination.table_id}" diff --git a/bigframes/operations/_matplotlib/__init__.py b/bigframes/operations/_matplotlib/__init__.py index f8770a9ef8..02aca8cf5d 100644 --- a/bigframes/operations/_matplotlib/__init__.py +++ b/bigframes/operations/_matplotlib/__init__.py @@ -17,6 +17,9 @@ PLOT_CLASSES: dict[str, type[core.MPLPlot]] = { "hist": hist.HistPlot, + "line": core.LinePlot, + "area": core.AreaPlot, + "scatter": core.ScatterPlot, } diff --git a/bigframes/operations/_matplotlib/core.py b/bigframes/operations/_matplotlib/core.py index 4b15d6f4dd..5c9d771f61 100644 --- a/bigframes/operations/_matplotlib/core.py +++ b/bigframes/operations/_matplotlib/core.py @@ -13,6 +13,7 @@ # limitations under the License. import abc +import typing import matplotlib.pyplot as plt @@ -28,3 +29,44 @@ def draw(self) -> None: @property def result(self): return self.axes + + +class SamplingPlot(MPLPlot): + @abc.abstractproperty + def _kind(self): + pass + + def __init__(self, data, **kwargs) -> None: + self.kwargs = kwargs + self.data = self._compute_plot_data(data) + + def generate(self) -> None: + self.axes = self.data.plot(kind=self._kind, **self.kwargs) + + def _compute_plot_data(self, data): + # TODO: Cache the sampling data in the PlotAccessor. + sampling_n = self.kwargs.pop("sampling_n", 100) + sampling_random_state = self.kwargs.pop("sampling_random_state", 0) + return ( + data.sample(n=sampling_n, random_state=sampling_random_state) + .to_pandas() + .sort_index() + ) + + +class LinePlot(SamplingPlot): + @property + def _kind(self) -> typing.Literal["line"]: + return "line" + + +class AreaPlot(SamplingPlot): + @property + def _kind(self) -> typing.Literal["area"]: + return "area" + + +class ScatterPlot(SamplingPlot): + @property + def _kind(self) -> typing.Literal["scatter"]: + return "scatter" diff --git a/bigframes/operations/plotting.py b/bigframes/operations/plotting.py index d19485e65e..cc9f71e5d1 100644 --- a/bigframes/operations/plotting.py +++ b/bigframes/operations/plotting.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import Optional, Sequence +import typing import bigframes_vendored.pandas.plotting._core as vendordt @@ -20,16 +20,65 @@ import bigframes.operations._matplotlib as bfplt -class PlotAccessor: +class PlotAccessor(vendordt.PlotAccessor): __doc__ = vendordt.PlotAccessor.__doc__ def __init__(self, data) -> None: self._parent = data - def hist(self, by: Optional[Sequence[str]] = None, bins: int = 10, **kwargs): + def hist( + self, by: typing.Optional[typing.Sequence[str]] = None, bins: int = 10, **kwargs + ): if kwargs.pop("backend", None) is not None: raise NotImplementedError( f"Only support matplotlib backend for now. {constants.FEEDBACK_LINK}" ) - # Calls matplotlib backend to plot the data. return bfplt.plot(self._parent.copy(), kind="hist", by=by, bins=bins, **kwargs) + + def line( + self, + x: typing.Optional[typing.Hashable] = None, + y: typing.Optional[typing.Hashable] = None, + **kwargs, + ): + return bfplt.plot( + self._parent.copy(), + kind="line", + x=x, + y=y, + **kwargs, + ) + + def area( + self, + x: typing.Optional[typing.Hashable] = None, + y: typing.Optional[typing.Hashable] = None, + stacked: bool = True, + **kwargs, + ): + return bfplt.plot( + self._parent.copy(), + kind="area", + x=x, + y=y, + stacked=stacked, + **kwargs, + ) + + def scatter( + self, + x: typing.Optional[typing.Hashable] = None, + y: typing.Optional[typing.Hashable] = None, + s: typing.Union[typing.Hashable, typing.Sequence[typing.Hashable]] = None, + c: typing.Union[typing.Hashable, typing.Sequence[typing.Hashable]] = None, + **kwargs, + ): + return bfplt.plot( + self._parent.copy(), + kind="scatter", + x=x, + y=y, + s=s, + c=c, + **kwargs, + ) diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py index 03c8412907..10caf17b79 100644 --- a/bigframes/pandas/__init__.py +++ b/bigframes/pandas/__init__.py @@ -620,6 +620,8 @@ def remote_function( name: Optional[str] = None, packages: Optional[Sequence[str]] = None, cloud_function_service_account: Optional[str] = None, + cloud_function_kms_key_name: Optional[str] = None, + cloud_function_docker_repository: Optional[str] = None, ): return global_session.with_default_session( bigframes.session.Session.remote_function, @@ -631,6 +633,8 @@ def remote_function( name=name, packages=packages, cloud_function_service_account=cloud_function_service_account, + cloud_function_kms_key_name=cloud_function_kms_key_name, + cloud_function_docker_repository=cloud_function_docker_repository, ) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 5266267a22..656c62ef19 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -1364,6 +1364,8 @@ def remote_function( name: Optional[str] = None, packages: Optional[Sequence[str]] = None, cloud_function_service_account: Optional[str] = None, + cloud_function_kms_key_name: Optional[str] = None, + cloud_function_docker_repository: Optional[str] = None, ): """Decorator to turn a user defined function into a BigQuery remote function. Check out the code samples at: https://cloud.google.com/bigquery/docs/remote-functions#bigquery-dataframes. @@ -1444,6 +1446,20 @@ def remote_function( for more details. Please make sure the service account has the necessary IAM permissions configured as described in https://cloud.google.com/functions/docs/reference/iam/roles#additional-configuration. + cloud_function_kms_key_name (str, Optional): + Customer managed encryption key to protect cloud functions and + related data at rest. This is of the format + projects/PROJECT_ID/locations/LOCATION/keyRings/KEYRING/cryptoKeys/KEY. + Read https://cloud.google.com/functions/docs/securing/cmek for + more details including granting necessary service accounts + access to the key. + cloud_function_docker_repository (str, Optional): + Docker repository created with the same encryption key as + `cloud_function_kms_key_name` to store encrypted artifacts + created to support the cloud function. This is of the format + projects/PROJECT_ID/locations/LOCATION/repositories/REPOSITORY_NAME. + For more details see + https://cloud.google.com/functions/docs/securing/cmek#before_you_begin. Returns: callable: A remote function object pointing to the cloud assets created in the background to support the remote execution. The cloud assets can be @@ -1463,6 +1479,8 @@ def remote_function( name=name, packages=packages, cloud_function_service_account=cloud_function_service_account, + cloud_function_kms_key_name=cloud_function_kms_key_name, + cloud_function_docker_repository=cloud_function_docker_repository, ) def read_gbq_function( @@ -1592,12 +1610,13 @@ def _start_query( self.bqclient, sql, job_config, max_results ) - def _start_query_create_model( + def _start_query_ml_ddl( self, sql: str, ) -> Tuple[bigquery.table.RowIterator, bigquery.QueryJob]: """ - Starts BigQuery ML CREATE MODEL query job and waits for results. + Starts BigQuery ML DDL query job (CREATE MODEL/ALTER MODEL/...) and + waits for results. """ job_config = self._prepare_query_job_config() diff --git a/bigframes/version.py b/bigframes/version.py index ae18e113ef..708390a7cd 100644 --- a/bigframes/version.py +++ b/bigframes/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "0.24.0" +__version__ = "0.25.0" diff --git a/scripts/get_code_sample_coverage.py b/scripts/get_code_sample_coverage.py new file mode 100755 index 0000000000..d81023394f --- /dev/null +++ b/scripts/get_code_sample_coverage.py @@ -0,0 +1,147 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import importlib +import inspect +import sys +from typing import Dict, List + +import bigframes +import bigframes.pandas as bpd + +PRESENT = "present" +NOT_PRESENT = "not_present" + +CLASSES = [ + bpd.DataFrame, + bpd.Series, + bpd.Index, + bigframes.session.Session, + bigframes.operations.strings.StringMethods, + bigframes.operations.datetimes.DatetimeMethods, + bigframes.operations.structs.StructAccessor, +] + +ML_MODULE_NAMES = [ + "cluster", + "compose", + "decomposition", + "ensemble", + "linear_model", + "metrics", + "model_selection", + "pipeline", + "preprocessing", + "llm", + "forecasting", + "imported", + "remote", +] + +for module_name in ML_MODULE_NAMES: + module = importlib.import_module(f"bigframes.ml.{module_name}") + classes_ = [ + class_ for _, class_ in inspect.getmembers(module, predicate=inspect.isclass) + ] + CLASSES.extend(classes_) + + +def get_code_samples_summary() -> Dict[str, Dict[str, List[str]]]: + """Get Summary of the code samples coverage in BigFrames APIs. + + Returns: + Summary: A dictionary of the format + { + class_1: { + "present": [method1, method2, ...], + "not_present": [method3, method4, ...] + }, + class_2: { + ... + } + } + """ + summary: Dict[str, Dict[str, List[str]]] = dict() + + for class_ in CLASSES: + class_key = f"{class_.__module__}.{class_.__name__}" + summary[class_key] = {PRESENT: [], NOT_PRESENT: []} + + members = inspect.getmembers(class_) + + for name, obj in members: + # ignore private methods + if name.startswith("_") and not name.startswith("__"): + continue + + def predicate(impl): + return ( + # This includes class methods like `from_dict`, `from_records` + inspect.ismethod(impl) + # This includes instance methods like `dropna`, join` + or inspect.isfunction(impl) + # This includes properties like `shape`, `values` but not + # generic properties like `__weakref__` + or (inspect.isdatadescriptor(impl) and not name.startswith("__")) + ) + + if not predicate(obj): + continue + + # At this point we have a property or a public method + impl = getattr(class_, name) + + docstr = inspect.getdoc(impl) + code_samples_present = docstr and "**Examples:**" in docstr + key = PRESENT if code_samples_present else NOT_PRESENT + summary[class_key][key].append(name) + + return summary + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Get a summary of code samples coverage in BigFrames APIs." + ) + parser.add_argument( + "-d", + "--details", + type=bool, + action=argparse.BooleanOptionalAction, + default=False, + help="Whether to print APIs with and without code samples.", + ) + + args = parser.parse_args(sys.argv[1:]) + + summary = get_code_samples_summary() + + total_with_code_samples = 0 + total = 0 + for class_, class_summary in summary.items(): + apis_with_code_samples = len(class_summary[PRESENT]) + total_with_code_samples += apis_with_code_samples + + apis_total = len(class_summary[PRESENT]) + len(class_summary[NOT_PRESENT]) + total += apis_total + + coverage = 100 * apis_with_code_samples / apis_total + print(f"{class_}: {coverage:.1f}% ({apis_with_code_samples}/{apis_total})") + if args.details: + print(f"===> APIs WITH code samples: {class_summary[PRESENT]}") + print(f"===> APIs WITHOUT code samples: {class_summary[NOT_PRESENT]}") + + coverage = 100 * total_with_code_samples / total + print(f"Total: {coverage:.1f}% ({total_with_code_samples}/{total})") diff --git a/setup.py b/setup.py index a626fd4b34..5258a7d6f9 100644 --- a/setup.py +++ b/setup.py @@ -39,7 +39,7 @@ "geopandas >=0.12.2", "google-auth >=2.15.0,<3.0dev", "google-cloud-bigquery[bqstorage,pandas] >=3.10.0", - "google-cloud-functions >=1.10.1", + "google-cloud-functions >=1.12.0", "google-cloud-bigquery-connection >=1.12.0", "google-cloud-iam >=2.12.1", "google-cloud-resource-manager >=1.10.3", diff --git a/testing/constraints-3.9.txt b/testing/constraints-3.9.txt index 07c8b763f3..0aeb15eab8 100644 --- a/testing/constraints-3.9.txt +++ b/testing/constraints-3.9.txt @@ -5,7 +5,7 @@ gcsfs==2023.3.0 geopandas==0.12.2 google-auth==2.15.0 google-cloud-bigquery==3.10.0 -google-cloud-functions==1.10.1 +google-cloud-functions==1.12.0 google-cloud-bigquery-connection==1.12.0 google-cloud-iam==2.12.1 google-cloud-resource-manager==1.10.3 diff --git a/tests/system/large/test_remote_function.py b/tests/system/large/test_remote_function.py index 77aa3c7603..f8c5e98f1d 100644 --- a/tests/system/large/test_remote_function.py +++ b/tests/system/large/test_remote_function.py @@ -22,7 +22,7 @@ import textwrap from google.api_core.exceptions import BadRequest, NotFound, ResourceExhausted -from google.cloud import bigquery, functions_v2 +from google.cloud import bigquery, functions_v2, storage import pandas import pytest import test_utils.prefixer @@ -1322,3 +1322,68 @@ def square_num(x): cleanup_remote_function_assets( rf_session.bqclient, rf_session.cloudfunctionsclient, square_num ) + + +@pytest.mark.flaky(retries=2, delay=120) +def test_remote_function_with_gcf_cmek(): + # TODO(shobs): Automate the following set-up during testing in the test project. + # + # For upfront convenience, the following set up has been statically created + # in the project bigfrmames-dev-perf via cloud console: + # + # 1. Created an encryption key and granting the necessary service accounts + # the required IAM permissions as per https://cloud.google.com/kms/docs/create-key + # 2. Created a docker repository with CMEK (created in step 1) enabled as per + # https://cloud.google.com/artifact-registry/docs/repositories/create-repos#overview + # + project = "bigframes-dev-perf" + cmek = "projects/bigframes-dev-perf/locations/us-central1/keyRings/bigframesKeyRing/cryptoKeys/bigframesKey" + docker_repository = ( + "projects/bigframes-dev-perf/locations/us-central1/repositories/rf-artifacts" + ) + + session = bigframes.Session(context=bigframes.BigQueryOptions(project=project)) + try: + + @session.remote_function( + [int], + int, + reuse=False, + cloud_function_kms_key_name=cmek, + cloud_function_docker_repository=docker_repository, + ) + def square_num(x): + if x is None: + return x + return x * x + + df = pandas.DataFrame({"num": [-1, 0, None, 1]}, dtype="Int64") + bf = session.read_pandas(df) + + bf_result_col = bf["num"].apply(square_num) + bf_result = bf.assign(result=bf_result_col).to_pandas() + + pd_result_col = df["num"].apply(lambda x: x if x is None else x * x) + pd_result = df.assign(result=pd_result_col) + + assert_pandas_df_equal( + bf_result, pd_result, check_dtype=False, check_index_type=False + ) + + # Assert that the GCF is created with the intended SA + gcf = session.cloudfunctionsclient.get_function( + name=square_num.bigframes_cloud_function + ) + assert gcf.kms_key_name == cmek + + # Assert that GCS artifact has CMEK applied + storage_client = storage.Client() + bucket = storage_client.bucket(gcf.build_config.source.storage_source.bucket) + blob = bucket.get_blob(gcf.build_config.source.storage_source.object_) + assert blob.kms_key_name.startswith(cmek) + + finally: + # clean up the gcp assets created for the remote function + cleanup_remote_function_assets( + session.bqclient, session.cloudfunctionsclient, square_num + ) diff --git a/tests/system/small/operations/test_plot.py b/tests/system/small/operations/test_plotting.py similarity index 69% rename from tests/system/small/operations/test_plot.py rename to tests/system/small/operations/test_plotting.py index 44f31ec071..ce320b6f57 100644 --- a/tests/system/small/operations/test_plot.py +++ b/tests/system/small/operations/test_plotting.py @@ -12,9 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. +import numpy as np import pandas._testing as tm import pytest +import bigframes.pandas as bpd + def _check_legend_labels(ax, labels): """ @@ -166,3 +169,67 @@ def test_hist_kwargs_ticks_props(scalars_dfs): for i in range(len(pd_xlables)): tm.assert_almost_equal(ylabels[i].get_fontsize(), pd_ylables[i].get_fontsize()) tm.assert_almost_equal(ylabels[i].get_rotation(), pd_ylables[i].get_rotation()) + + +def test_line(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_names = ["int64_col", "float64_col", "int64_too", "bool_col"] + ax = scalars_df[col_names].plot.line() + pd_ax = scalars_pandas_df[col_names].plot.line() + tm.assert_almost_equal(ax.get_xticks(), pd_ax.get_xticks()) + tm.assert_almost_equal(ax.get_yticks(), pd_ax.get_yticks()) + for line, pd_line in zip(ax.lines, pd_ax.lines): + # Compare y coordinates between the lines + tm.assert_almost_equal(line.get_data()[1], pd_line.get_data()[1]) + + +def test_area(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_names = ["int64_col", "float64_col", "int64_too"] + ax = scalars_df[col_names].plot.area(stacked=False) + pd_ax = scalars_pandas_df[col_names].plot.area(stacked=False) + tm.assert_almost_equal(ax.get_xticks(), pd_ax.get_xticks()) + tm.assert_almost_equal(ax.get_yticks(), pd_ax.get_yticks()) + for line, pd_line in zip(ax.lines, pd_ax.lines): + # Compare y coordinates between the lines + tm.assert_almost_equal(line.get_data()[1], pd_line.get_data()[1]) + + +def test_scatter(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_names = ["int64_col", "float64_col", "int64_too", "bool_col"] + ax = scalars_df[col_names].plot.scatter(x="int64_col", y="float64_col") + pd_ax = scalars_pandas_df[col_names].plot.scatter(x="int64_col", y="float64_col") + tm.assert_almost_equal(ax.get_xticks(), pd_ax.get_xticks()) + tm.assert_almost_equal(ax.get_yticks(), pd_ax.get_yticks()) + tm.assert_almost_equal( + ax.collections[0].get_sizes(), pd_ax.collections[0].get_sizes() + ) + + +def test_sampling_plot_args_n(): + df = bpd.DataFrame(np.arange(1000), columns=["one"]) + ax = df.plot.line() + assert len(ax.lines) == 1 + # Default sampling_n is 100 + assert len(ax.lines[0].get_data()[1]) == 100 + + ax = df.plot.line(sampling_n=2) + assert len(ax.lines) == 1 + assert len(ax.lines[0].get_data()[1]) == 2 + + +def test_sampling_plot_args_random_state(): + df = bpd.DataFrame(np.arange(1000), columns=["one"]) + ax_0 = df.plot.line() + ax_1 = df.plot.line() + ax_2 = df.plot.line(sampling_random_state=100) + ax_3 = df.plot.line(sampling_random_state=100) + + # Setting a fixed sampling_random_state guarantees reproducible plotted sampling. + tm.assert_almost_equal(ax_0.lines[0].get_data()[1], ax_1.lines[0].get_data()[1]) + tm.assert_almost_equal(ax_2.lines[0].get_data()[1], ax_3.lines[0].get_data()[1]) + + msg = "numpy array are different" + with pytest.raises(AssertionError, match=msg): + tm.assert_almost_equal(ax_0.lines[0].get_data()[1], ax_2.lines[0].get_data()[1]) diff --git a/tests/system/small/test_encryption.py b/tests/system/small/test_encryption.py index 0ce9d881fd..f13d2b9e1a 100644 --- a/tests/system/small/test_encryption.py +++ b/tests/system/small/test_encryption.py @@ -130,7 +130,7 @@ def test_df_apis(bq_cmek, session_with_bq_cmek, scalars_table_id): # Read a BQ table and assert encryption df = session_with_bq_cmek.read_gbq(scalars_table_id) - # Perform a few dataframe operations and assert assertion + # Perform a few dataframe operations and assert encryption df1 = df.dropna() _assert_bq_table_is_encrypted(df1, bq_cmek, session_with_bq_cmek) @@ -179,15 +179,32 @@ def test_to_gbq(bq_cmek, session_with_bq_cmek, scalars_table_id): df = session_with_bq_cmek.read_gbq(scalars_table_id) _assert_bq_table_is_encrypted(df, bq_cmek, session_with_bq_cmek) - # Modify the dataframe and assert assertion + # Modify the dataframe and assert encryption df = df.dropna().head() _assert_bq_table_is_encrypted(df, bq_cmek, session_with_bq_cmek) - # Write the result to BQ and assert assertion + # Write the result to BQ and assert encryption output_table_id = df.to_gbq() output_table = session_with_bq_cmek.bqclient.get_table(output_table_id) assert output_table.encryption_configuration.kms_key_name == bq_cmek + # Write the result to BQ custom table and assert encryption + session_with_bq_cmek.bqclient.get_table(output_table_id) + output_table_ref = bigframes.session._io.bigquery.random_table( + session_with_bq_cmek._anonymous_dataset + ) + output_table_id = str(output_table_ref) + df.to_gbq(output_table_id) + output_table = session_with_bq_cmek.bqclient.get_table(output_table_id) + assert output_table.encryption_configuration.kms_key_name == bq_cmek + + # Lastly, assert that the encryption is not because of any default set at + # the dataset level + output_table_dataset = session_with_bq_cmek.bqclient.get_dataset( + output_table.dataset_id + ) + assert output_table_dataset.default_encryption_configuration is None + @pytest.mark.skip( reason="Internal issue 327544164, cmek does not propagate to the dataframe." @@ -254,3 +271,21 @@ def test_bqml(bq_cmek, session_with_bq_cmek, penguins_table_id): # Assert that model exists in BQ with intended encryption model_bq = session_with_bq_cmek.bqclient.get_model(new_model._bqml_model.model_name) assert model_bq.encryption_configuration.kms_key_name == bq_cmek + + # Assert that model registration keeps the encryption + # Note that model registration only creates an entry (metadata) to be + # included in the Vertex AI Model Registry. See for more details + # https://cloud.google.com/bigquery/docs/update_vertex#add-existing. + # When use deploys the model to an endpoint from the Model Registry then + # they can specify an encryption key to further protect the artifacts at + # rest on the Vertex AI side. See for more details: + # https://cloud.google.com/vertex-ai/docs/general/deployment#deploy_a_model_to_an_endpoint, + # https://cloud.google.com/vertex-ai/docs/general/cmek#create_resources_with_the_kms_key. + # bigframes.ml does not provide any API for the model deployment. + model_registered = new_model.register() + assert ( + model_registered._bqml_model.model.encryption_configuration.kms_key_name + == bq_cmek + ) + model_bq = session_with_bq_cmek.bqclient.get_model(new_model._bqml_model.model_name) + assert model_bq.encryption_configuration.kms_key_name == bq_cmek diff --git a/tests/unit/ml/test_golden_sql.py b/tests/unit/ml/test_golden_sql.py index 25e12d87bf..d63bc7aaa1 100644 --- a/tests/unit/ml/test_golden_sql.py +++ b/tests/unit/ml/test_golden_sql.py @@ -43,7 +43,7 @@ def mock_session(): mock_session._anonymous_dataset, TEMP_MODEL_ID.model_id ) ) - mock_session._start_query_create_model.return_value = (None, query_job) + mock_session._start_query_ml_ddl.return_value = (None, query_job) return mock_session @@ -104,7 +104,7 @@ def test_linear_regression_default_fit( model._bqml_model_factory = bqml_model_factory model.fit(mock_X, mock_y) - mock_session._start_query_create_model.assert_called_once_with( + mock_session._start_query_ml_ddl.assert_called_once_with( 'CREATE OR REPLACE MODEL `test-project`.`_anon123`.`temp_model_id`\nOPTIONS(\n model_type="LINEAR_REG",\n data_split_method="NO_SPLIT",\n optimize_strategy="normal_equation",\n fit_intercept=True,\n l2_reg=0.0,\n max_iterations=20,\n learn_rate_strategy="line_search",\n early_stop=True,\n min_rel_progress=0.01,\n ls_init_learn_rate=0.1,\n calculate_p_values=False,\n enable_global_explain=False,\n INPUT_LABEL_COLS=["input_column_label"])\nAS input_X_y_sql' ) @@ -114,7 +114,7 @@ def test_linear_regression_params_fit(bqml_model_factory, mock_session, mock_X, model._bqml_model_factory = bqml_model_factory model.fit(mock_X, mock_y) - mock_session._start_query_create_model.assert_called_once_with( + mock_session._start_query_ml_ddl.assert_called_once_with( 'CREATE OR REPLACE MODEL `test-project`.`_anon123`.`temp_model_id`\nOPTIONS(\n model_type="LINEAR_REG",\n data_split_method="NO_SPLIT",\n optimize_strategy="normal_equation",\n fit_intercept=False,\n l2_reg=0.0,\n max_iterations=20,\n learn_rate_strategy="line_search",\n early_stop=True,\n min_rel_progress=0.01,\n ls_init_learn_rate=0.1,\n calculate_p_values=False,\n enable_global_explain=False,\n INPUT_LABEL_COLS=["input_column_label"])\nAS input_X_y_sql' ) @@ -147,7 +147,7 @@ def test_logistic_regression_default_fit( model._bqml_model_factory = bqml_model_factory model.fit(mock_X, mock_y) - mock_session._start_query_create_model.assert_called_once_with( + mock_session._start_query_ml_ddl.assert_called_once_with( 'CREATE OR REPLACE MODEL `test-project`.`_anon123`.`temp_model_id`\nOPTIONS(\n model_type="LOGISTIC_REG",\n data_split_method="NO_SPLIT",\n fit_intercept=True,\n auto_class_weights=False,\n INPUT_LABEL_COLS=["input_column_label"])\nAS input_X_y_sql' ) @@ -161,7 +161,7 @@ def test_logistic_regression_params_fit( model._bqml_model_factory = bqml_model_factory model.fit(mock_X, mock_y) - mock_session._start_query_create_model.assert_called_once_with( + mock_session._start_query_ml_ddl.assert_called_once_with( 'CREATE OR REPLACE MODEL `test-project`.`_anon123`.`temp_model_id`\nOPTIONS(\n model_type="LOGISTIC_REG",\n data_split_method="NO_SPLIT",\n fit_intercept=False,\n auto_class_weights=True,\n INPUT_LABEL_COLS=["input_column_label"])\nAS input_X_y_sql' ) diff --git a/third_party/bigframes_vendored/pandas/plotting/_core.py b/third_party/bigframes_vendored/pandas/plotting/_core.py index d0425737ee..2b0f077695 100644 --- a/third_party/bigframes_vendored/pandas/plotting/_core.py +++ b/third_party/bigframes_vendored/pandas/plotting/_core.py @@ -1,14 +1,14 @@ -from typing import Optional, Sequence +import typing from bigframes import constants class PlotAccessor: - """ - Make plots of Series or DataFrame with the `matplotlib` backend. - """ + """Make plots of Series or DataFrame with the `matplotlib` backend.""" - def hist(self, by: Optional[Sequence[str]] = None, bins: int = 10, **kwargs): + def hist( + self, by: typing.Optional[typing.Sequence[str]] = None, bins: int = 10, **kwargs + ): """ Draw one histogram of the DataFrame’s columns. @@ -17,32 +17,237 @@ def hist(self, by: Optional[Sequence[str]] = None, bins: int = 10, **kwargs): into bins and draws all bins in one :class:`matplotlib.axes.Axes`. This is useful when the DataFrame's Series are in a similar scale. - Parameters - ---------- - by : str or sequence, optional - Column in the DataFrame to group by. It is not supported yet. - bins : int, default 10 - Number of histogram bins to be used. - **kwargs - Additional keyword arguments are documented in - :meth:`DataFrame.plot`. - - Returns - ------- - class:`matplotlib.AxesSubplot` - Return a histogram plot. - - Examples - -------- - For Series: - - .. plot:: - :context: close-figs + **Examples:** >>> import bigframes.pandas as bpd >>> import numpy as np >>> df = bpd.DataFrame(np.random.randint(1, 7, 6000), columns=['one']) >>> df['two'] = np.random.randint(1, 7, 6000) + np.random.randint(1, 7, 6000) >>> ax = df.plot.hist(bins=12, alpha=0.5) + + Args: + by (str or sequence, optional): + Column in the DataFrame to group by. It is not supported yet. + bins (int, default 10): + Number of histogram bins to be used. + **kwargs: + Additional keyword arguments are documented in + :meth:`DataFrame.plot`. + + Returns: + class:`matplotlib.AxesSubplot`: A histogram plot. + + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def line( + self, + x: typing.Optional[typing.Hashable] = None, + y: typing.Optional[typing.Hashable] = None, + **kwargs, + ): + """ + Plot Series or DataFrame as lines. This function is useful to plot lines + using DataFrame's values as coordinates. + + This function calls `pandas.plot` to generate a plot with a random sample + of items. For consistent results, the random sampling is reproducible. + Use the `sampling_random_state` parameter to modify the sampling seed. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> df = bpd.DataFrame( + ... { + ... 'one': [1, 2, 3, 4], + ... 'three': [3, 6, 9, 12], + ... 'reverse_ten': [40, 30, 20, 10], + ... } + ... ) + >>> ax = df.plot.line(x='one') + + Args: + x (label or position, optional): + Allows plotting of one column versus another. If not specified, + the index of the DataFrame is used. + y (label or position, optional): + Allows plotting of one column versus another. If not specified, + all numerical columns are used. + color (str, array-like, or dict, optional): + The color for each of the DataFrame's columns. Possible values are: + + - A single color string referred to by name, RGB or RGBA code, + for instance 'red' or '#a98d19'. + + - A sequence of color strings referred to by name, RGB or RGBA + code, which will be used for each column recursively. For + instance ['green','yellow'] each column's %(kind)s will be filled in + green or yellow, alternatively. If there is only a single column to + be plotted, then only the first color from the color list will be + used. + + - A dict of the form {column name : color}, so that each column will be + colored accordingly. For example, if your columns are called `a` and + `b`, then passing {'a': 'green', 'b': 'red'} will color %(kind)ss for + column `a` in green and %(kind)ss for column `b` in red. + sampling_n (int, default 100): + Number of random items for plotting. + sampling_random_state (int, default 0): + Seed for random number generator. + **kwargs: + Additional keyword arguments are documented in + :meth:`DataFrame.plot`. + + Returns: + matplotlib.axes.Axes or np.ndarray of them: + An ndarray is returned with one :class:`matplotlib.axes.Axes` + per column when ``subplots=True``. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def area( + self, + x: typing.Optional[typing.Hashable] = None, + y: typing.Optional[typing.Hashable] = None, + stacked: bool = True, + **kwargs, + ): + """ + Draw a stacked area plot. An area plot displays quantitative data visually. + + This function calls `pandas.plot` to generate a plot with a random sample + of items. For consistent results, the random sampling is reproducible. + Use the `sampling_random_state` parameter to modify the sampling seed. + + **Examples:** + + Draw an area plot based on basic business metrics: + + >>> import bigframes.pandas as bpd + >>> df = bpd.DataFrame( + ... { + ... 'sales': [3, 2, 3, 9, 10, 6], + ... 'signups': [5, 5, 6, 12, 14, 13], + ... 'visits': [20, 42, 28, 62, 81, 50], + ... }, + ... index=["01-31", "02-28", "03-31", "04-30", "05-31", "06-30"] + ... ) + >>> ax = df.plot.area() + + Area plots are stacked by default. To produce an unstacked plot, + pass ``stacked=False``: + + >>> ax = df.plot.area(stacked=False) + + Draw an area plot for a single column: + + >>> ax = df.plot.area(y='sales') + + Draw with a different `x`: + + >>> df = bpd.DataFrame({ + ... 'sales': [3, 2, 3], + ... 'visits': [20, 42, 28], + ... 'day': [1, 2, 3], + ... }) + >>> ax = df.plot.area(x='day') + + Args: + x (label or position, optional): + Coordinates for the X axis. By default uses the index. + y (label or position, optional): + Column to plot. By default uses all columns. + stacked (bool, default True): + Area plots are stacked by default. Set to False to create a + unstacked plot. + sampling_n (int, default 100): + Number of random items for plotting. + sampling_random_state (int, default 0): + Seed for random number generator. + **kwargs: + Additional keyword arguments are documented in + :meth:`DataFrame.plot`. + + Returns: + matplotlib.axes.Axes or numpy.ndarray: + Area plot, or array of area plots if subplots is True. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def scatter( + self, + x: typing.Optional[typing.Hashable] = None, + y: typing.Optional[typing.Hashable] = None, + s: typing.Union[typing.Hashable, typing.Sequence[typing.Hashable]] = None, + c: typing.Union[typing.Hashable, typing.Sequence[typing.Hashable]] = None, + **kwargs, + ): + """ + Create a scatter plot with varying marker point size and color. + + This function calls `pandas.plot` to generate a plot with a random sample + of items. For consistent results, the random sampling is reproducible. + Use the `sampling_random_state` parameter to modify the sampling seed. + + **Examples:** + + Let's see how to draw a scatter plot using coordinates from the values + in a DataFrame's columns. + + >>> import bigframes.pandas as bpd + >>> df = bpd.DataFrame([[5.1, 3.5, 0], [4.9, 3.0, 0], [7.0, 3.2, 1], + ... [6.4, 3.2, 1], [5.9, 3.0, 2]], + ... columns=['length', 'width', 'species']) + >>> ax1 = df.plot.scatter(x='length', + ... y='width', + ... c='DarkBlue') + + And now with the color determined by a column as well. + + >>> ax2 = df.plot.scatter(x='length', + ... y='width', + ... c='species', + ... colormap='viridis') + + Args: + x (int or str): + The column name or column position to be used as horizontal + coordinates for each point. + y (int or str): + The column name or column position to be used as vertical + coordinates for each point. + s (str, scalar or array-like, optional): + The size of each point. Possible values are: + + - A string with the name of the column to be used for marker's size. + - A single scalar so all points have the same size. + - A sequence of scalars, which will be used for each point's size + recursively. For instance, when passing [2,14] all points size + will be either 2 or 14, alternatively. + + c (str, int or array-like, optional): + The color of each point. Possible values are: + + - A single color string referred to by name, RGB or RGBA code, + for instance 'red' or '#a98d19'. + - A sequence of color strings referred to by name, RGB or RGBA + code, which will be used for each point's color recursively. For + instance ['green','yellow'] all points will be filled in green or + yellow, alternatively. + - A column name or position whose values will be used to color the + marker points according to a colormap. + + sampling_n (int, default 100): + Number of random items for plotting. + sampling_random_state (int, default 0): + Seed for random number generator. + **kwargs: + Additional keyword arguments are documented in + :meth:`DataFrame.plot`. + + Returns: + matplotlib.axes.Axes or np.ndarray of them: + An ndarray is returned with one :class:`matplotlib.axes.Axes` + per column when ``subplots=True``. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)