diff --git a/.kokoro/continuous/e2e.cfg b/.kokoro/continuous/e2e.cfg index d875f36060..2f93a58212 100644 --- a/.kokoro/continuous/e2e.cfg +++ b/.kokoro/continuous/e2e.cfg @@ -3,5 +3,5 @@ # Only run this nox session. env_vars: { key: "NOX_SESSION" - value: "system_noextras e2e notebook samples" + value: "unit_prerelease system_prerelease system_noextras e2e notebook samples" } diff --git a/.kokoro/presubmit/e2e.cfg b/.kokoro/presubmit/e2e.cfg index d875f36060..2f93a58212 100644 --- a/.kokoro/presubmit/e2e.cfg +++ b/.kokoro/presubmit/e2e.cfg @@ -3,5 +3,5 @@ # Only run this nox session. env_vars: { key: "NOX_SESSION" - value: "system_noextras e2e notebook samples" + value: "unit_prerelease system_prerelease system_noextras e2e notebook samples" } diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 6e0fd8b98f..517176da89 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -38,4 +38,4 @@ repos: rev: v1.1.1 hooks: - id: mypy - additional_dependencies: [types-requests] + additional_dependencies: [types-requests, types-tabulate] diff --git a/CHANGELOG.md b/CHANGELOG.md index 091967513a..ef75a017e0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,51 @@ [1]: https://pypi.org/project/bigframes/#history +## [0.15.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v0.14.1...v0.15.0) (2023-11-29) + + +### ⚠ BREAKING CHANGES + +* model.predict returns all the columns ([#204](https://github.com/googleapis/python-bigquery-dataframes/issues/204)) + +### Features + +* Add info and memory_usage methods to dataframe ([#219](https://github.com/googleapis/python-bigquery-dataframes/issues/219)) ([9d6613d](https://github.com/googleapis/python-bigquery-dataframes/commit/9d6613d318b558722b7bab12773efdea4bbe9931)) +* Add remote vertex model support 
([#237](https://github.com/googleapis/python-bigquery-dataframes/issues/237)) ([0bfc4fb](https://github.com/googleapis/python-bigquery-dataframes/commit/0bfc4fb117686c734d4a2503d5a6de0e64e9f9b9)) +* Add the recent api method for ML component ([#225](https://github.com/googleapis/python-bigquery-dataframes/issues/225)) ([ed8876d](https://github.com/googleapis/python-bigquery-dataframes/commit/ed8876d3439a3b45b65e8789737c3c2e3a7f1adb)) +* Model.predict returns all the columns ([#204](https://github.com/googleapis/python-bigquery-dataframes/issues/204)) ([416171a](https://github.com/googleapis/python-bigquery-dataframes/commit/416171a70d91d4a6b71622ba72685147ab7d6186)) +* Send warnings on LLM prediction partial failures ([#216](https://github.com/googleapis/python-bigquery-dataframes/issues/216)) ([81125f9](https://github.com/googleapis/python-bigquery-dataframes/commit/81125f9505ad98e89939769a8e1fcf30518705f0)) + + +### Bug Fixes + +* Add df snapshots lookup for `read_gbq` ([#229](https://github.com/googleapis/python-bigquery-dataframes/issues/229)) ([d0d9b84](https://github.com/googleapis/python-bigquery-dataframes/commit/d0d9b84b101eb03c499d85e74dcfc900dedd4137)) +* Avoid unnecessary row_number() on sort key for io ([#211](https://github.com/googleapis/python-bigquery-dataframes/issues/211)) ([a18d40e](https://github.com/googleapis/python-bigquery-dataframes/commit/a18d40e808ee0822d21715cc3e8f794c418aeebc)) +* Dedup special character ([#209](https://github.com/googleapis/python-bigquery-dataframes/issues/209)) 
([dd78acb](https://github.com/googleapis/python-bigquery-dataframes/commit/dd78acb174545ba292776a642afcec46f8ee4a2a)) +* Invalid JSON type of the notebook ([#215](https://github.com/googleapis/python-bigquery-dataframes/issues/215)) ([a729831](https://github.com/googleapis/python-bigquery-dataframes/commit/a7298317ea2604faa6ae31817f1f729d7e0b9818)) +* Make to_pandas override enable_downsampling when sampling_method is manually set. ([#200](https://github.com/googleapis/python-bigquery-dataframes/issues/200)) ([ae03756](https://github.com/googleapis/python-bigquery-dataframes/commit/ae03756f5ee45e0e74e0c0bdd4777e018eba2273)) +* Polish the llm+kmeans notebook ([#208](https://github.com/googleapis/python-bigquery-dataframes/issues/208)) ([e8532b1](https://github.com/googleapis/python-bigquery-dataframes/commit/e8532b1d999d26ea1ebdd30efb8f2c0a93a6a28d)) +* Update the llm+kmeans notebook with recent change ([#236](https://github.com/googleapis/python-bigquery-dataframes/issues/236)) ([f8917ab](https://github.com/googleapis/python-bigquery-dataframes/commit/f8917abc094e222e0435891d4d184b77bfe67722)) +* Use anonymous dataset to create `remote_function` ([#205](https://github.com/googleapis/python-bigquery-dataframes/issues/205)) ([69b016e](https://github.com/googleapis/python-bigquery-dataframes/commit/69b016eae7ea97d84ceeb22ba09f5472841db072)) + + +### Documentation + +* Add code samples for `index` and `column` properties ([#212](https://github.com/googleapis/python-bigquery-dataframes/issues/212)) 
([c88d38e](https://github.com/googleapis/python-bigquery-dataframes/commit/c88d38e69682f4c620174086b8f16f4780c04811)) +* Add code samples for df reshaping, function, merge, and join methods ([#203](https://github.com/googleapis/python-bigquery-dataframes/issues/203)) ([010486c](https://github.com/googleapis/python-bigquery-dataframes/commit/010486c3494e05d714da6cc7d51514518d9ae1ea)) +* Add examples for dataframe.kurt, dataframe.std, dataframe.count ([#232](https://github.com/googleapis/python-bigquery-dataframes/issues/232)) ([f9c6e72](https://github.com/googleapis/python-bigquery-dataframes/commit/f9c6e727e2b901310bb5301da449d616ea85e135)) +* Add examples for dataframe.mean, dataframe.median, dataframe.va… ([#228](https://github.com/googleapis/python-bigquery-dataframes/issues/228)) ([edd0522](https://github.com/googleapis/python-bigquery-dataframes/commit/edd0522747eadb74780124fb18ed7face251441d)) +* Add examples for dataframe.min, dataframe.max and dataframe.sum ([#227](https://github.com/googleapis/python-bigquery-dataframes/issues/227)) ([3a375e8](https://github.com/googleapis/python-bigquery-dataframes/commit/3a375e87b64b8fb51370bfec8f2cfdbcd8fe960a)) +* Code samples for `Series.dot` and `DataFrame.dot` ([#226](https://github.com/googleapis/python-bigquery-dataframes/issues/226)) ([b62a07a](https://github.com/googleapis/python-bigquery-dataframes/commit/b62a07a95cd60f995a48825c9874822d0eb02483)) +* Code samples for `Series.where` and `Series.mask` ([#217](https://github.com/googleapis/python-bigquery-dataframes/issues/217)) 
([52dfad2](https://github.com/googleapis/python-bigquery-dataframes/commit/52dfad281def82548751a276ce42b087dbb09f9a)) +* Code samples for dataframe.any, dataframe.all and dataframe.prod ([#223](https://github.com/googleapis/python-bigquery-dataframes/issues/223)) ([d7957fa](https://github.com/googleapis/python-bigquery-dataframes/commit/d7957fad071d223ef8f6fb8f3de395c865ff60aa)) +* Make the code samples reflect default bq connection usage ([#206](https://github.com/googleapis/python-bigquery-dataframes/issues/206)) ([71844b0](https://github.com/googleapis/python-bigquery-dataframes/commit/71844b03cdbfe684320c186a0488c8c7fb4fcd6e)) + + +### Miscellaneous Chores + +* Release 0.15.0 ([#241](https://github.com/googleapis/python-bigquery-dataframes/issues/241)) ([6c899be](https://github.com/googleapis/python-bigquery-dataframes/commit/6c899be2989e24f697d72fe1bb92ebbf7dec84cb)) + ## [0.14.1](https://github.com/googleapis/python-bigquery-dataframes/compare/v0.14.0...v0.14.1) (2023-11-16) diff --git a/README.rst b/README.rst index 5ddb4a7639..91dac12751 100644 --- a/README.rst +++ b/README.rst @@ -267,10 +267,9 @@ definition. To view and manage connections, do the following: 3. In the Explorer pane, expand that project and then expand External connections. BigQuery remote functions are created in the dataset you specify, or -in a dataset with the name ``bigframes_temp_location``, where location is -the location used by the BigQuery DataFrames session. For example, -``bigframes_temp_us_central1``. To view and manage remote functions, do -the following: +in a special type of `hidden dataset `__ +referred to as an anonymous dataset. To view and manage remote functions created +in a user provided dataset, do the following: 1. 
Go to `BigQuery in the Google Cloud Console `__. 2. Select the project in which you created the remote function. diff --git a/bigframes/_config/display_options.py b/bigframes/_config/display_options.py index ad3ea3f68c..afa36aa84c 100644 --- a/bigframes/_config/display_options.py +++ b/bigframes/_config/display_options.py @@ -32,6 +32,10 @@ class DisplayOptions: progress_bar: Optional[str] = "auto" repr_mode: Literal["head", "deferred"] = "head" + max_info_columns: int = 100 + max_info_rows: Optional[int] = 200000 + memory_usage: bool = True + @contextlib.contextmanager def pandas_repr(display_options: DisplayOptions): diff --git a/bigframes/core/__init__.py b/bigframes/core/__init__.py index b476961bdc..e19fec8f3f 100644 --- a/bigframes/core/__init__.py +++ b/bigframes/core/__init__.py @@ -125,14 +125,18 @@ def to_sql( col_id_overrides: typing.Mapping[str, str] = {}, sorted: bool = False, ) -> str: - if sorted or offset_column: - return self._compile_ordered().to_sql( - offset_column=offset_column, + array_value = self + if offset_column: + array_value = self.promote_offsets(offset_column) + if sorted: + return array_value._compile_ordered().to_sql( col_id_overrides=col_id_overrides, sorted=sorted, ) else: - return self._compile_unordered().to_sql(col_id_overrides=col_id_overrides) + return array_value._compile_unordered().to_sql( + col_id_overrides=col_id_overrides + ) def start_query( self, diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index f1113d938e..34913872e7 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -389,23 +389,6 @@ def to_pandas( ordered: bool = True, ) -> Tuple[pd.DataFrame, bigquery.QueryJob]: """Run query and download results as a pandas DataFrame.""" - if max_download_size is None: - max_download_size = bigframes.options.sampling.max_download_size - if sampling_method is None: - sampling_method = ( - bigframes.options.sampling.sampling_method - if bigframes.options.sampling.sampling_method is not None 
- else _UNIFORM - ) - if random_state is None: - random_state = bigframes.options.sampling.random_state - - sampling_method = sampling_method.lower() - if sampling_method not in _SAMPLING_METHODS: - raise NotImplementedError( - f"The downsampling method {sampling_method} is not implemented, " - f"please choose from {','.join(_SAMPLING_METHODS)}." - ) df, _, query_job = self._compute_and_count( value_keys=value_keys, @@ -453,6 +436,28 @@ def _compute_and_count( ) -> Tuple[pd.DataFrame, int, bigquery.QueryJob]: """Run query and download results as a pandas DataFrame. Return the total number of results as well.""" # TODO(swast): Allow for dry run and timeout. + enable_downsampling = ( + True + if sampling_method is not None + else bigframes.options.sampling.enable_downsampling + ) + + max_download_size = ( + max_download_size or bigframes.options.sampling.max_download_size + ) + + random_state = random_state or bigframes.options.sampling.random_state + + if sampling_method is None: + sampling_method = bigframes.options.sampling.sampling_method or _UNIFORM + sampling_method = sampling_method.lower() + + if sampling_method not in _SAMPLING_METHODS: + raise NotImplementedError( + f"The downsampling method {sampling_method} is not implemented, " + f"please choose from {','.join(_SAMPLING_METHODS)}." + ) + expr = self._apply_value_keys_to_expr(value_keys=value_keys) results_iterator, query_job = expr.start_query( @@ -469,7 +474,7 @@ def _compute_and_count( ) if fraction < 1: - if not bigframes.options.sampling.enable_downsampling: + if not enable_downsampling: raise RuntimeError( f"The data size ({table_size:.2f} MB) exceeds the maximum download limit of " f"{max_download_size} MB. 
You can:\n\t* Enable downsampling in global options:\n" diff --git a/bigframes/core/compile/compiled.py b/bigframes/core/compile/compiled.py index 78050ed4f0..461c2c005a 100644 --- a/bigframes/core/compile/compiled.py +++ b/bigframes/core/compile/compiled.py @@ -1031,31 +1031,42 @@ def _reproject_to_table(self) -> OrderedIR: def to_sql( self, - offset_column: typing.Optional[str] = None, col_id_overrides: typing.Mapping[str, str] = {}, sorted: bool = False, ) -> str: - offsets_id = offset_column or ORDER_ID_COLUMN - sql = ibis_bigquery.Backend().compile( self._to_ibis_expr( - ordering_mode="offset_col" - if (offset_column or sorted) - else "unordered", - order_col_name=offsets_id, + ordering_mode="unordered", col_id_overrides=col_id_overrides, + expose_hidden_cols=sorted, ) ) if sorted: + output_columns = [ + col_id_overrides.get(col) if (col in col_id_overrides) else col + for col in self.column_ids + ] + selection = ", ".join(map(lambda col_id: f"`{col_id}`", output_columns)) + order_by_clause = self._ordering_clause(self._ordering.all_ordering_columns) + sql = textwrap.dedent( - f"SELECT * EXCEPT (`{offsets_id}`)\n" + f"SELECT {selection}\n" "FROM (\n" f"{sql}\n" ")\n" - f"ORDER BY `{offsets_id}`\n" + f"{order_by_clause}\n" ) return typing.cast(str, sql) + def _ordering_clause(self, ordering: Iterable[OrderingColumnReference]) -> str: + parts = [] + for col_ref in ordering: + asc_desc = "ASC" if col_ref.direction.is_ascending else "DESC" + null_clause = "NULLS LAST" if col_ref.na_last else "NULLS FIRST" + part = f"`{col_ref.column_id}` {asc_desc} {null_clause}" + parts.append(part) + return f"ORDER BY {' ,'.join(parts)}" + def _to_ibis_expr( self, *, diff --git a/bigframes/core/indexes/index.py b/bigframes/core/indexes/index.py index 6c66c36062..fc7cf167d4 100644 --- a/bigframes/core/indexes/index.py +++ b/bigframes/core/indexes/index.py @@ -155,6 +155,14 @@ def _block(self) -> blocks.Block: def T(self) -> Index: return self.transpose() + def _memory_usage(self) 
-> int: + (n_rows,) = self.shape + return sum( + self.dtypes.map( + lambda dtype: bigframes.dtypes.DTYPE_BYTE_SIZES.get(dtype, 8) * n_rows + ) + ) + def transpose(self) -> Index: return self @@ -326,7 +334,10 @@ def _apply_aggregation(self, op: agg_ops.AggregateOp) -> typing.Any: def __getitem__(self, key: int) -> typing.Any: if isinstance(key, int): - result_pd_df, _ = self._block.slice(key, key + 1, 1).to_pandas() + if key != -1: + result_pd_df, _ = self._block.slice(key, key + 1, 1).to_pandas() + else: # special case, want [-1:] instead of [-1:0] + result_pd_df, _ = self._block.slice(key).to_pandas() if result_pd_df.empty: raise IndexError("single positional indexer is out-of-bounds") return result_pd_df.index[0] diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 57b4ca42cf..f7796291b9 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -18,6 +18,7 @@ import datetime import re +import sys import textwrap import typing from typing import ( @@ -36,6 +37,7 @@ import google.cloud.bigquery as bigquery import numpy import pandas +import tabulate import bigframes import bigframes._config.display_options as display_options @@ -350,6 +352,88 @@ def query_job(self) -> Optional[bigquery.QueryJob]: self._set_internal_query_job(self._compute_dry_run()) return self._query_job + def memory_usage(self, index: bool = True): + n_rows, _ = self.shape + # like pandas, treat all variable-size objects as just 8-byte pointers, ignoring actual object + column_sizes = self.dtypes.map( + lambda dtype: bigframes.dtypes.DTYPE_BYTE_SIZES.get(dtype, 8) * n_rows + ) + if index: + index_size = pandas.Series([self.index._memory_usage()], index=["Index"]) + column_sizes = pandas.concat([index_size, column_sizes]) + return column_sizes + + def info( + self, + verbose: Optional[bool] = None, + buf=None, + max_cols: Optional[int] = None, + memory_usage: Optional[bool] = None, + show_counts: Optional[bool] = None, + ): + obuf = buf or sys.stdout + + n_rows, n_columns 
= self.shape + + max_cols = ( + max_cols + if max_cols is not None + else bigframes.options.display.max_info_columns + ) + + show_all_columns = verbose if verbose is not None else (n_columns < max_cols) + + obuf.write(f"{type(self)}\n") + + index_type = "MultiIndex" if self.index.nlevels > 1 else "Index" + + # These accessses are kind of expensive, maybe should try to skip? + first_indice = self.index[0] + last_indice = self.index[-1] + obuf.write(f"{index_type}: {n_rows} entries, {first_indice} to {last_indice}\n") + + dtype_strings = self.dtypes.astype("string") + if show_all_columns: + obuf.write(f"Data columns (total {n_columns} columns):\n") + column_info = self.columns.to_frame(name="Column") + + max_rows = bigframes.options.display.max_info_rows + too_many_rows = n_rows > max_rows if max_rows is not None else False + + if show_counts if show_counts is not None else (not too_many_rows): + non_null_counts = self.count().to_pandas() + column_info["Non-Null Count"] = non_null_counts.map( + lambda x: f"{int(x)} non-null" + ) + + column_info["Dtype"] = dtype_strings + + column_info = column_info.reset_index(drop=True) + column_info.index.name = "#" + + column_info_formatted = tabulate.tabulate(column_info, headers="keys") # type: ignore + obuf.write(column_info_formatted) + obuf.write("\n") + + else: # Just number of columns and first, last + obuf.write( + f"Columns: {n_columns} entries, {self.columns[0]} to {self.columns[-1]}\n" + ) + dtype_counts = dtype_strings.value_counts().sort_index(ascending=True).items() + dtype_counts_formatted = ", ".join( + f"{dtype}({count})" for dtype, count in dtype_counts + ) + obuf.write(f"dtypes: {dtype_counts_formatted}\n") + + show_memory = ( + memory_usage + if memory_usage is not None + else bigframes.options.display.memory_usage + ) + if show_memory: + # TODO: Convert to different units (kb, mb, etc.) 
+ obuf.write(f"memory usage: {self.memory_usage().sum()} bytes\n") + def _set_internal_query_job(self, query_job: bigquery.QueryJob): self._query_job = query_job @@ -2577,14 +2661,10 @@ def _create_io_query(self, index: bool, ordering_id: Optional[str]) -> str: } if ordering_id is not None: - return array_value.to_sql( - offset_column=ordering_id, - col_id_overrides=id_overrides, - ) - else: - return array_value.to_sql( - col_id_overrides=id_overrides, - ) + array_value = array_value.promote_offsets(ordering_id) + return array_value.to_sql( + col_id_overrides=id_overrides, + ) def _run_io_query( self, @@ -2801,7 +2881,8 @@ def get_right_id(id): result = result[other_frame.columns] if isinstance(other, bf_series.Series): - result = result[other.name].rename() + # There should be exactly one column in the result + result = result[result.columns[0]].rename() return result diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index cd35e380c0..774eb74d06 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -143,6 +143,19 @@ # "string" and "string[pyarrow] are accepted" BIGFRAMES_STRING_TO_BIGFRAMES["string[pyarrow]"] = pd.StringDtype(storage="pyarrow") +# For the purposes of dataframe.memory_usage +# https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#data_type_sizes +DTYPE_BYTE_SIZES = { + pd.BooleanDtype(): 1, + pd.Int64Dtype(): 8, + pd.Float32Dtype(): 8, + pd.StringDtype(): 8, + pd.ArrowDtype(pa.time64("us")): 8, + pd.ArrowDtype(pa.timestamp("us")): 8, + pd.ArrowDtype(pa.timestamp("us", tz="UTC")): 8, + pd.ArrowDtype(pa.date32()): 8, +} + def ibis_dtype_to_bigframes_dtype( ibis_dtype: ibis_dtypes.DataType, diff --git a/bigframes/ml/__init__.py b/bigframes/ml/__init__.py index 55c8709d8d..b2c62ff961 100644 --- a/bigframes/ml/__init__.py +++ b/bigframes/ml/__init__.py @@ -26,4 +26,5 @@ "llm", "forecasting", "imported", + "remote", ] diff --git a/bigframes/ml/cluster.py b/bigframes/ml/cluster.py index 
772b90f666..6b79d356a2 100644 --- a/bigframes/ml/cluster.py +++ b/bigframes/ml/cluster.py @@ -17,16 +17,18 @@ from __future__ import annotations -from typing import cast, Dict, List, Optional, Union +from typing import Dict, List, Optional, Union from google.cloud import bigquery import bigframes +from bigframes.core import log_adapter from bigframes.ml import base, core, globals, utils import bigframes.pandas as bpd import third_party.bigframes_vendored.sklearn.cluster._kmeans +@log_adapter.class_logger class KMeans( base.UnsupervisedTrainablePredictor, third_party.bigframes_vendored.sklearn.cluster._kmeans.KMeans, @@ -92,7 +94,7 @@ def predict( (X,) = utils.convert_to_dataframe(X) - return cast(bpd.DataFrame, self._bqml_model.predict(X)[["CENTROID_ID"]]) + return self._bqml_model.predict(X) def to_gbq(self, model_name: str, replace: bool = False) -> KMeans: """Save the model to BigQuery. diff --git a/bigframes/ml/compose.py b/bigframes/ml/compose.py index bf046ff691..ace876dd2d 100644 --- a/bigframes/ml/compose.py +++ b/bigframes/ml/compose.py @@ -22,6 +22,7 @@ from typing import List, Optional, Tuple, Union from bigframes import constants +from bigframes.core import log_adapter from bigframes.ml import base, core, globals, preprocessing, utils import bigframes.pandas as bpd import third_party.bigframes_vendored.sklearn.compose._column_transformer @@ -36,6 +37,7 @@ ] +@log_adapter.class_logger class ColumnTransformer( base.Transformer, third_party.bigframes_vendored.sklearn.compose._column_transformer.ColumnTransformer, diff --git a/bigframes/ml/core.py b/bigframes/ml/core.py index d8135f7085..5aad77a394 100644 --- a/bigframes/ml/core.py +++ b/bigframes/ml/core.py @@ -294,6 +294,8 @@ def create_remote_model( self, session: bigframes.Session, connection_name: str, + input: Mapping[str, str] = {}, + output: Mapping[str, str] = {}, options: Mapping[str, Union[str, int, float, Iterable[str]]] = {}, ) -> BqmlModel: """Create a session-temporary BQML remote model with 
the CREATE OR REPLACE MODEL statement @@ -301,6 +303,10 @@ def create_remote_model( Args: connection_name: a BQ connection to talk with Vertex AI, of the format ... https://cloud.google.com/bigquery/docs/create-cloud-resource-connection + input: + input schema for general remote models + output: + output schema for general remote models options: a dict of options to configure the model. Generates a BQML OPTIONS clause @@ -311,6 +317,8 @@ def create_remote_model( sql = self._model_creation_sql_generator.create_remote_model( connection_name=connection_name, model_ref=model_ref, + input=input, + output=output, options=options, ) diff --git a/bigframes/ml/decomposition.py b/bigframes/ml/decomposition.py index 8e6be6d28c..ef777cb33a 100644 --- a/bigframes/ml/decomposition.py +++ b/bigframes/ml/decomposition.py @@ -17,16 +17,18 @@ from __future__ import annotations -from typing import cast, List, Optional, Union +from typing import List, Optional, Union from google.cloud import bigquery import bigframes +from bigframes.core import log_adapter from bigframes.ml import base, core, globals, utils import bigframes.pandas as bpd import third_party.bigframes_vendored.sklearn.decomposition._pca +@log_adapter.class_logger class PCA( base.UnsupervisedTrainablePredictor, third_party.bigframes_vendored.sklearn.decomposition._pca.PCA, @@ -106,12 +108,7 @@ def predict(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: (X,) = utils.convert_to_dataframe(X) - return cast( - bpd.DataFrame, - self._bqml_model.predict(X)[ - ["principal_component_" + str(i + 1) for i in range(self.n_components)] - ], - ) + return self._bqml_model.predict(X) def to_gbq(self, model_name: str, replace: bool = False) -> PCA: """Save the model to BigQuery. 
diff --git a/bigframes/ml/ensemble.py b/bigframes/ml/ensemble.py index 19ca8608ff..1cc9fb3739 100644 --- a/bigframes/ml/ensemble.py +++ b/bigframes/ml/ensemble.py @@ -17,11 +17,12 @@ from __future__ import annotations -from typing import cast, Dict, List, Literal, Optional, Union +from typing import Dict, List, Literal, Optional, Union from google.cloud import bigquery import bigframes +from bigframes.core import log_adapter from bigframes.ml import base, core, globals, utils import bigframes.pandas as bpd import third_party.bigframes_vendored.sklearn.ensemble._forest @@ -47,6 +48,7 @@ } +@log_adapter.class_logger class XGBRegressor( base.SupervisedTrainablePredictor, third_party.bigframes_vendored.xgboost.sklearn.XGBRegressor, @@ -168,16 +170,7 @@ def predict( raise RuntimeError("A model must be fitted before predict") (X,) = utils.convert_to_dataframe(X) - df = self._bqml_model.predict(X) - return cast( - bpd.DataFrame, - df[ - [ - cast(str, field.name) - for field in self._bqml_model.model.label_columns - ] - ], - ) + return self._bqml_model.predict(X) def score( self, @@ -211,6 +204,7 @@ def to_gbq(self, model_name: str, replace: bool = False) -> XGBRegressor: return new_model.session.read_gbq_model(model_name) +@log_adapter.class_logger class XGBClassifier( base.SupervisedTrainablePredictor, third_party.bigframes_vendored.xgboost.sklearn.XGBClassifier, @@ -328,19 +322,9 @@ def _fit( def predict(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: if not self._bqml_model: raise RuntimeError("A model must be fitted before predict") - (X,) = utils.convert_to_dataframe(X) - df = self._bqml_model.predict(X) - return cast( - bpd.DataFrame, - df[ - [ - cast(str, field.name) - for field in self._bqml_model.model.label_columns - ] - ], - ) + return self._bqml_model.predict(X) def score( self, @@ -375,6 +359,7 @@ def to_gbq(self, model_name: str, replace: bool = False) -> XGBClassifier: return new_model.session.read_gbq_model(model_name) 
+@log_adapter.class_logger class RandomForestRegressor( base.SupervisedTrainablePredictor, third_party.bigframes_vendored.sklearn.ensemble._forest.RandomForestRegressor, @@ -486,19 +471,9 @@ def predict( ) -> bpd.DataFrame: if not self._bqml_model: raise RuntimeError("A model must be fitted before predict") - (X,) = utils.convert_to_dataframe(X) - df = self._bqml_model.predict(X) - return cast( - bpd.DataFrame, - df[ - [ - cast(str, field.name) - for field in self._bqml_model.model.label_columns - ] - ], - ) + return self._bqml_model.predict(X) def score( self, @@ -550,6 +525,7 @@ def to_gbq(self, model_name: str, replace: bool = False) -> RandomForestRegresso return new_model.session.read_gbq_model(model_name) +@log_adapter.class_logger class RandomForestClassifier( base.SupervisedTrainablePredictor, third_party.bigframes_vendored.sklearn.ensemble._forest.RandomForestClassifier, @@ -661,19 +637,9 @@ def predict( ) -> bpd.DataFrame: if not self._bqml_model: raise RuntimeError("A model must be fitted before predict") - (X,) = utils.convert_to_dataframe(X) - df = self._bqml_model.predict(X) - return cast( - bpd.DataFrame, - df[ - [ - cast(str, field.name) - for field in self._bqml_model.model.label_columns - ] - ], - ) + return self._bqml_model.predict(X) def score( self, diff --git a/bigframes/ml/forecasting.py b/bigframes/ml/forecasting.py index 8e309d5e73..995201062b 100644 --- a/bigframes/ml/forecasting.py +++ b/bigframes/ml/forecasting.py @@ -16,17 +16,17 @@ from __future__ import annotations -from typing import cast, Dict, List, Optional, Union +from typing import Dict, List, Optional, Union from google.cloud import bigquery import bigframes +from bigframes.core import log_adapter from bigframes.ml import base, core, globals, utils import bigframes.pandas as bpd -_PREDICT_OUTPUT_COLUMNS = ["forecast_timestamp", "forecast_value"] - +@log_adapter.class_logger class ARIMAPlus(base.SupervisedTrainablePredictor): """Time Series ARIMA Plus model.""" @@ -100,10 +100,7 
@@ def predict(self, X=None) -> bpd.DataFrame: if not self._bqml_model: raise RuntimeError("A model must be fitted before predict") - return cast( - bpd.DataFrame, - self._bqml_model.forecast()[_PREDICT_OUTPUT_COLUMNS], - ) + return self._bqml_model.forecast() def score( self, diff --git a/bigframes/ml/imported.py b/bigframes/ml/imported.py index fb8aa98bef..4ae0a8ea4d 100644 --- a/bigframes/ml/imported.py +++ b/bigframes/ml/imported.py @@ -21,10 +21,12 @@ from google.cloud import bigquery import bigframes +from bigframes.core import log_adapter from bigframes.ml import base, core, globals, utils import bigframes.pandas as bpd +@log_adapter.class_logger class TensorFlowModel(base.Predictor): """Imported TensorFlow model. @@ -78,16 +80,7 @@ def predict(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: (X,) = utils.convert_to_dataframe(X) - df = self._bqml_model.predict(X) - return cast( - bpd.DataFrame, - df[ - [ - cast(str, field.name) - for field in self._bqml_model.model.label_columns - ] - ], - ) + return self._bqml_model.predict(X) def to_gbq(self, model_name: str, replace: bool = False) -> TensorFlowModel: """Save the model to BigQuery. @@ -110,6 +103,7 @@ def to_gbq(self, model_name: str, replace: bool = False) -> TensorFlowModel: return new_model.session.read_gbq_model(model_name) +@log_adapter.class_logger class ONNXModel(base.Predictor): """Imported Open Neural Network Exchange (ONNX) model. @@ -161,16 +155,7 @@ def predict(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: (X,) = utils.convert_to_dataframe(X) - df = self._bqml_model.predict(X) - return cast( - bpd.DataFrame, - df[ - [ - cast(str, field.name) - for field in self._bqml_model.model.label_columns - ] - ], - ) + return self._bqml_model.predict(X) def to_gbq(self, model_name: str, replace: bool = False) -> ONNXModel: """Save the model to BigQuery. 
diff --git a/bigframes/ml/linear_model.py b/bigframes/ml/linear_model.py index f11879500b..5ee87b8850 100644 --- a/bigframes/ml/linear_model.py +++ b/bigframes/ml/linear_model.py @@ -17,12 +17,13 @@ from __future__ import annotations -from typing import cast, Dict, List, Literal, Optional, Union +from typing import Dict, List, Literal, Optional, Union from google.cloud import bigquery import bigframes import bigframes.constants as constants +from bigframes.core import log_adapter from bigframes.ml import base, core, globals, utils import bigframes.pandas as bpd import third_party.bigframes_vendored.sklearn.linear_model._base @@ -46,6 +47,7 @@ } +@log_adapter.class_logger class LinearRegression( base.SupervisedTrainablePredictor, third_party.bigframes_vendored.sklearn.linear_model._base.LinearRegression, @@ -145,16 +147,7 @@ def predict(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: (X,) = utils.convert_to_dataframe(X) - df = self._bqml_model.predict(X) - return cast( - bpd.DataFrame, - df[ - [ - cast(str, field.name) - for field in self._bqml_model.model.label_columns - ] - ], - ) + return self._bqml_model.predict(X) def score( self, @@ -187,6 +180,7 @@ def to_gbq(self, model_name: str, replace: bool = False) -> LinearRegression: return new_model.session.read_gbq_model(model_name) +@log_adapter.class_logger class LogisticRegression( base.SupervisedTrainablePredictor, third_party.bigframes_vendored.sklearn.linear_model._logistic.LogisticRegression, @@ -267,16 +261,7 @@ def predict( (X,) = utils.convert_to_dataframe(X) - df = self._bqml_model.predict(X) - return cast( - bpd.DataFrame, - df[ - [ - cast(str, field.name) - for field in self._bqml_model.model.label_columns - ] - ], - ) + return self._bqml_model.predict(X) def score( self, diff --git a/bigframes/ml/llm.py b/bigframes/ml/llm.py index 3cfc28e61f..5beb54a32d 100644 --- a/bigframes/ml/llm.py +++ b/bigframes/ml/llm.py @@ -17,24 +17,33 @@ from __future__ import annotations from typing import cast, 
Literal, Optional, Union +import warnings import bigframes from bigframes import clients, constants -from bigframes.core import blocks +from bigframes.core import blocks, log_adapter from bigframes.ml import base, core, globals, utils import bigframes.pandas as bpd -_REMOTE_TEXT_GENERATOR_MODEL_ENDPOINT = "text-bison" -_REMOTE_TEXT_GENERATOR_32K_MODEL_ENDPOINT = "text-bison-32k" -_TEXT_GENERATE_RESULT_COLUMN = "ml_generate_text_llm_result" +_TEXT_GENERATOR_BISON_ENDPOINT = "text-bison" +_TEXT_GENERATOR_BISON_32K_ENDPOINT = "text-bison-32k" +_TEXT_GENERATOR_ENDPOINTS = ( + _TEXT_GENERATOR_BISON_ENDPOINT, + _TEXT_GENERATOR_BISON_32K_ENDPOINT, +) -_REMOTE_EMBEDDING_GENERATOR_MODEL_ENDPOINT = "textembedding-gecko" -_REMOTE_EMBEDDING_GENERATOR_MUlTILINGUAL_MODEL_ENDPOINT = ( - "textembedding-gecko-multilingual" +_EMBEDDING_GENERATOR_GECKO_ENDPOINT = "textembedding-gecko" +_EMBEDDING_GENERATOR_GECKO_MULTILINGUAL_ENDPOINT = "textembedding-gecko-multilingual" +_EMBEDDING_GENERATOR_ENDPOINTS = ( + _EMBEDDING_GENERATOR_GECKO_ENDPOINT, + _EMBEDDING_GENERATOR_GECKO_MULTILINGUAL_ENDPOINT, ) -_EMBED_TEXT_RESULT_COLUMN = "text_embedding" + +_ML_GENERATE_TEXT_STATUS = "ml_generate_text_status" +_ML_EMBED_TEXT_STATUS = "ml_embed_text_status" +@log_adapter.class_logger class PaLM2TextGenerator(base.Predictor): """PaLM2 text generator LLM model. @@ -90,18 +99,16 @@ def _create_bqml_model(self): connection_id=connection_name_parts[2], iam_role="aiplatform.user", ) - if self.model_name == _REMOTE_TEXT_GENERATOR_MODEL_ENDPOINT: - options = { - "endpoint": _REMOTE_TEXT_GENERATOR_MODEL_ENDPOINT, - } - elif self.model_name == _REMOTE_TEXT_GENERATOR_32K_MODEL_ENDPOINT: - options = { - "endpoint": _REMOTE_TEXT_GENERATOR_32K_MODEL_ENDPOINT, - } - else: + + if self.model_name not in _TEXT_GENERATOR_ENDPOINTS: raise ValueError( - f"Model name {self.model_name} is not supported. We only support {_REMOTE_TEXT_GENERATOR_MODEL_ENDPOINT} and {_REMOTE_TEXT_GENERATOR_32K_MODEL_ENDPOINT}." 
+ f"Model name {self.model_name} is not supported. We only support {', '.join(_TEXT_GENERATOR_ENDPOINTS)}." ) + + options = { + "endpoint": self.model_name, + } + return self._bqml_model_factory.create_remote_model( session=self.session, connection_name=self.connection_name, options=options ) @@ -149,7 +156,8 @@ def predict( Returns: - bigframes.dataframe.DataFrame: Output DataFrame with only 1 column as the output text results.""" + bigframes.dataframe.DataFrame: DataFrame of shape (n_samples, n_input_columns + n_prediction_columns). Returns predicted values. + """ # Params reference: https://cloud.google.com/vertex-ai/docs/generative-ai/learn/models if temperature < 0.0 or temperature > 1.0: @@ -181,13 +189,19 @@ def predict( "top_p": top_p, "flatten_json_output": True, } + df = self._bqml_model.generate_text(X, options) - return cast( - bpd.DataFrame, - df[[_TEXT_GENERATE_RESULT_COLUMN]], - ) + if (df[_ML_GENERATE_TEXT_STATUS] != "").any(): + warnings.warn( + f"Some predictions failed. Check column {_ML_GENERATE_TEXT_STATUS} for detailed status. You may want to filter the failed rows and retry.", + RuntimeWarning, + ) + return df + + +@log_adapter.class_logger class PaLM2TextEmbeddingGenerator(base.Predictor): """PaLM2 text embedding generator LLM model. @@ -244,19 +258,15 @@ def _create_bqml_model(self): connection_id=connection_name_parts[2], iam_role="aiplatform.user", ) - if self.model_name == "textembedding-gecko": - options = { - "endpoint": _REMOTE_EMBEDDING_GENERATOR_MODEL_ENDPOINT, - } - elif self.model_name == _REMOTE_EMBEDDING_GENERATOR_MUlTILINGUAL_MODEL_ENDPOINT: - options = { - "endpoint": _REMOTE_EMBEDDING_GENERATOR_MUlTILINGUAL_MODEL_ENDPOINT, - } - else: + + if self.model_name not in _EMBEDDING_GENERATOR_ENDPOINTS: raise ValueError( - f"Model name {self.model_name} is not supported. We only support {_REMOTE_EMBEDDING_GENERATOR_MODEL_ENDPOINT} and {_REMOTE_EMBEDDING_GENERATOR_MUlTILINGUAL_MODEL_ENDPOINT}." 
+ f"Model name {self.model_name} is not supported. We only support {', '.join(_EMBEDDING_GENERATOR_ENDPOINTS)}." ) + options = { + "endpoint": self.model_name, + } return self._bqml_model_factory.create_remote_model( session=self.session, connection_name=self.connection_name, options=options ) @@ -269,7 +279,7 @@ def predict(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: Input DataFrame, which needs to contain a column with name "content". Only the column will be used as input. Content can include preamble, questions, suggestions, instructions, or examples. Returns: - bigframes.dataframe.DataFrame: Output DataFrame with only 1 column as the output embedding results + bigframes.dataframe.DataFrame: DataFrame of shape (n_samples, n_input_columns + n_prediction_columns). Returns predicted values. """ # Params reference: https://cloud.google.com/vertex-ai/docs/generative-ai/learn/models @@ -287,8 +297,13 @@ def predict(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: options = { "flatten_json_output": True, } + df = self._bqml_model.generate_text_embedding(X, options) - return cast( - bpd.DataFrame, - df[[_EMBED_TEXT_RESULT_COLUMN]], - ) + + if (df[_ML_EMBED_TEXT_STATUS] != "").any(): + warnings.warn( + f"Some predictions failed. Check column {_ML_EMBED_TEXT_STATUS} for detailed status. 
You may want to filter the failed rows and retry.", + RuntimeWarning, + ) + + return df diff --git a/bigframes/ml/pipeline.py b/bigframes/ml/pipeline.py index ad0b3fae11..4ae2bfe555 100644 --- a/bigframes/ml/pipeline.py +++ b/bigframes/ml/pipeline.py @@ -24,11 +24,13 @@ import bigframes import bigframes.constants as constants +from bigframes.core import log_adapter from bigframes.ml import base, compose, forecasting, loader, preprocessing, utils import bigframes.pandas as bpd import third_party.bigframes_vendored.sklearn.pipeline +@log_adapter.class_logger class Pipeline( base.BaseEstimator, third_party.bigframes_vendored.sklearn.pipeline.Pipeline, diff --git a/bigframes/ml/preprocessing.py b/bigframes/ml/preprocessing.py index 5f44d40218..a403e57e71 100644 --- a/bigframes/ml/preprocessing.py +++ b/bigframes/ml/preprocessing.py @@ -20,6 +20,7 @@ import typing from typing import Any, cast, List, Literal, Optional, Tuple, Union +from bigframes.core import log_adapter from bigframes.ml import base, core, globals, utils import bigframes.pandas as bpd import third_party.bigframes_vendored.sklearn.preprocessing._data @@ -28,6 +29,7 @@ import third_party.bigframes_vendored.sklearn.preprocessing._label +@log_adapter.class_logger class StandardScaler( base.Transformer, third_party.bigframes_vendored.sklearn.preprocessing._data.StandardScaler, @@ -111,6 +113,7 @@ def transform(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: ) +@log_adapter.class_logger class MaxAbsScaler( base.Transformer, third_party.bigframes_vendored.sklearn.preprocessing._data.MaxAbsScaler, @@ -194,6 +197,7 @@ def transform(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: ) +@log_adapter.class_logger class MinMaxScaler( base.Transformer, third_party.bigframes_vendored.sklearn.preprocessing._data.MinMaxScaler, @@ -277,6 +281,7 @@ def transform(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: ) +@log_adapter.class_logger class KBinsDiscretizer( base.Transformer, 
third_party.bigframes_vendored.sklearn.preprocessing._discretization.KBinsDiscretizer, @@ -395,6 +400,7 @@ def transform(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: ) +@log_adapter.class_logger class OneHotEncoder( base.Transformer, third_party.bigframes_vendored.sklearn.preprocessing._encoder.OneHotEncoder, @@ -524,6 +530,7 @@ def transform(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: ) +@log_adapter.class_logger class LabelEncoder( base.LabelTransformer, third_party.bigframes_vendored.sklearn.preprocessing._label.LabelEncoder, diff --git a/bigframes/ml/remote.py b/bigframes/ml/remote.py new file mode 100644 index 0000000000..d4c34bbd0d --- /dev/null +++ b/bigframes/ml/remote.py @@ -0,0 +1,157 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""BigFrames general remote models.""" + +from __future__ import annotations + +from typing import Mapping, Optional, Union +import warnings + +import bigframes +from bigframes import clients +from bigframes.core import log_adapter +from bigframes.ml import base, core, globals, utils +import bigframes.pandas as bpd + +_SUPPORTED_DTYPES = ( + "bool", + "string", + "int64", + "float64", + "array<bool>", + "array<string>", + "array<int64>", + "array<float64>", +) + +_REMOTE_MODEL_STATUS = "remote_model_status" + + +@log_adapter.class_logger +class VertexAIModel(base.BaseEstimator): + """Remote model from a Vertex AI https endpoint.
User must specify https endpoint, input schema and output schema. + How to deploy a model in Vertex AI https://cloud.google.com/bigquery/docs/bigquery-ml-remote-model-tutorial#Deploy-Model-on-Vertex-AI. + + Args: + endpoint (str): + Vertex AI https endpoint. + input ({column_name: column_type}): + Input schema. Supported types are "bool", "string", "int64", "float64", "array<bool>", "array<string>", "array<int64>", "array<float64>". + output ({column_name: column_type}): + Output label schema. Supports the same types as the input. + session (bigframes.Session or None): + BQ session to create the model. If None, use the global default session. + connection_name (str or None): + Connection to connect with remote service. str of the format <PROJECT_NUMBER/PROJECT_ID>.<LOCATION>.<CONNECTION_ID>. + if None, use default connection in session context. BigQuery DataFrame will try to create the connection and attach + permission if the connection isn't fully setup. + """ + + def __init__( + self, + endpoint: str, + input: Mapping[str, str], + output: Mapping[str, str], + session: Optional[bigframes.Session] = None, + connection_name: Optional[str] = None, + ): + self.endpoint = endpoint + self.input = input + self.output = output + self.session = session or bpd.get_global_session() + + self._bq_connection_manager = clients.BqConnectionManager( + self.session.bqconnectionclient, self.session.resourcemanagerclient + ) + connection_name = connection_name or self.session._bq_connection + self.connection_name = self._bq_connection_manager.resolve_full_connection_name( + connection_name, + default_project=self.session._project, + default_location=self.session._location, + ) + + self._bqml_model_factory = globals.bqml_model_factory() + self._bqml_model: core.BqmlModel = self._create_bqml_model() + + def _create_bqml_model(self): + # Parse and create connection if needed. + if not self.connection_name: + raise ValueError( + "Must provide connection_name, either in constructor or through session options."
+ ) + connection_name_parts = self.connection_name.split(".") + if len(connection_name_parts) != 3: + raise ValueError( + f"connection_name must be of the format <PROJECT_NUMBER/PROJECT_ID>.<LOCATION>.<CONNECTION_ID>, got {self.connection_name}." + ) + self._bq_connection_manager.create_bq_connection( + project_id=connection_name_parts[0], + location=connection_name_parts[1], + connection_id=connection_name_parts[2], + iam_role="aiplatform.user", + ) + + options = { + "endpoint": self.endpoint, + } + + def standardize_type(v: str): + v = v.lower() + v = v.replace("boolean", "bool") + + if v not in _SUPPORTED_DTYPES: + raise ValueError( + f"Data type {v} is not supported. We only support {', '.join(_SUPPORTED_DTYPES)}." + ) + + return v + + self.input = {k: standardize_type(v) for k, v in self.input.items()} + self.output = {k: standardize_type(v) for k, v in self.output.items()} + + return self._bqml_model_factory.create_remote_model( + session=self.session, + connection_name=self.connection_name, + input=self.input, + output=self.output, + options=options, + ) + + def predict( + self, + X: Union[bpd.DataFrame, bpd.Series], + ) -> bpd.DataFrame: + """Predict the result from the input DataFrame. + + Args: + X (bigframes.dataframe.DataFrame or bigframes.series.Series): + Input DataFrame or Series, which needs to comply with the input parameter of the model. + + Returns: + bigframes.dataframe.DataFrame: DataFrame of shape (n_samples, n_input_columns + n_prediction_columns). Returns predicted values. + """ + + (X,) = utils.convert_to_dataframe(X) + + df = self._bqml_model.predict(X) + + # unlike LLM models, the general remote model status is null for successful runs. + if (df[_REMOTE_MODEL_STATUS].notna()).any(): + warnings.warn( + f"Some predictions failed. Check column {_REMOTE_MODEL_STATUS} for detailed status.
You may want to filter the failed rows and retry.", + RuntimeWarning, + ) + + return df diff --git a/bigframes/ml/sql.py b/bigframes/ml/sql.py index ab051231fb..1c88eda4ab 100644 --- a/bigframes/ml/sql.py +++ b/bigframes/ml/sql.py @@ -57,6 +57,12 @@ def build_expressions(self, *expr_sqls: str) -> str: indent_str = " " return "\n" + indent_str + f",\n{indent_str}".join(expr_sqls) + def build_schema(self, **kwargs: str) -> str: + """Encode a dict of values into a formatted schema type items for SQL""" + indent_str = " " + param_strs = [f"{k} {v}" for k, v in kwargs.items()] + return "\n" + indent_str + f",\n{indent_str}".join(param_strs) + def options(self, **kwargs: Union[str, int, float, Iterable[str]]) -> str: """Encode the OPTIONS clause for BQML""" return f"OPTIONS({self.build_parameters(**kwargs)})" @@ -65,6 +71,14 @@ def struct_options(self, **kwargs: Union[int, float]) -> str: """Encode a BQ STRUCT as options.""" return f"STRUCT({self.build_structs(**kwargs)})" + def input(self, **kwargs: str) -> str: + """Encode a BQML INPUT clause.""" + return f"INPUT({self.build_schema(**kwargs)})" + + def output(self, **kwargs: str) -> str: + """Encode a BQML OUTPUT clause.""" + return f"OUTPUT({self.build_schema(**kwargs)})" + # Connection def connection(self, conn_name: str) -> str: """Encode the REMOTE WITH CONNECTION clause for BQML. 
conn_name is of the format ...""" @@ -154,15 +168,19 @@ def create_remote_model( self, connection_name: str, model_ref: google.cloud.bigquery.ModelReference, + input: Mapping[str, str] = {}, + output: Mapping[str, str] = {}, options: Mapping[str, Union[str, int, float, Iterable[str]]] = {}, ) -> str: """Encode the CREATE OR REPLACE MODEL statement for BQML remote model.""" - options_sql = self.options(**options) - parts = [f"CREATE OR REPLACE MODEL {self._model_id_sql(model_ref)}"] + if input: + parts.append(self.input(**input)) + if output: + parts.append(self.output(**output)) parts.append(self.connection(connection_name)) - if options_sql: - parts.append(options_sql) + if options: + parts.append(self.options(**options)) return "\n".join(parts) def create_imported_model( diff --git a/bigframes/operations/base.py b/bigframes/operations/base.py index d33befe4da..85ce1dd9e6 100644 --- a/bigframes/operations/base.py +++ b/bigframes/operations/base.py @@ -141,7 +141,7 @@ def _apply_binary_op( if isinstance(other, pd.Series): # TODO: Convert to BigQuery DataFrames series raise NotImplementedError( - f"Pandas series not supported supported as operand. {constants.FEEDBACK_LINK}" + f"Pandas series not supported as operand. 
{constants.FEEDBACK_LINK}" ) if isinstance(other, series.Series): (left, right, block) = self._align(other, how=alignment) diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py index d35f838366..0c2c1f87aa 100644 --- a/bigframes/pandas/__init__.py +++ b/bigframes/pandas/__init__.py @@ -486,6 +486,7 @@ def read_gbq( index_col: Iterable[str] | str = (), col_order: Iterable[str] = (), max_results: Optional[int] = None, + use_cache: bool = True, ) -> bigframes.dataframe.DataFrame: _set_default_session_location_if_possible(query_or_table) return global_session.with_default_session( @@ -494,6 +495,7 @@ def read_gbq( index_col=index_col, col_order=col_order, max_results=max_results, + use_cache=use_cache, ) @@ -516,6 +518,7 @@ def read_gbq_query( index_col: Iterable[str] | str = (), col_order: Iterable[str] = (), max_results: Optional[int] = None, + use_cache: bool = True, ) -> bigframes.dataframe.DataFrame: _set_default_session_location_if_possible(query) return global_session.with_default_session( @@ -524,6 +527,7 @@ def read_gbq_query( index_col=index_col, col_order=col_order, max_results=max_results, + use_cache=use_cache, ) @@ -536,6 +540,7 @@ def read_gbq_table( index_col: Iterable[str] | str = (), col_order: Iterable[str] = (), max_results: Optional[int] = None, + use_cache: bool = True, ) -> bigframes.dataframe.DataFrame: _set_default_session_location_if_possible(query) return global_session.with_default_session( @@ -544,6 +549,7 @@ def read_gbq_table( index_col=index_col, col_order=col_order, max_results=max_results, + use_cache=use_cache, ) diff --git a/bigframes/remote_function.py b/bigframes/remote_function.py index a39cd033f6..7280ac7d42 100644 --- a/bigframes/remote_function.py +++ b/bigframes/remote_function.py @@ -188,6 +188,7 @@ def create_bq_remote_function( # https://cloud.google.com/bigquery/docs/reference/standard-sql/remote-functions#create_a_remote_function_2 bq_function_args = [] 
bq_function_return_type = BigQueryType.from_ibis(output_type) + # We are expecting the input type annotations to be 1:1 with the input args for idx, name in enumerate(input_args): bq_function_args.append( @@ -204,14 +205,22 @@ def create_bq_remote_function( logger.info(f"Creating BQ remote function: {create_function_ddl}") - # Make sure the dataset exists + # Make sure the dataset exists. I.e. if it doesn't exist, go ahead and + # create it dataset = bigquery.Dataset( bigquery.DatasetReference.from_string( self._bq_dataset, default_project=self._gcp_project_id ) ) dataset.location = self._bq_location - self._bq_client.create_dataset(dataset, exists_ok=True) + try: + # This check does not require bigquery.datasets.create IAM + # permission. So, if the data set already exists, then user can work + # without having that permission. + self._bq_client.get_dataset(dataset) + except google.api_core.exceptions.NotFound: + # This requires bigquery.datasets.create IAM permission + self._bq_client.create_dataset(dataset, exists_ok=True) # TODO: Use session._start_query() so we get progress bar query_job = self._bq_client.query(create_function_ddl) # Make an API request. 
@@ -610,7 +619,7 @@ def get_routine_reference( raise DatasetMissingError dataset_ref = bigquery.DatasetReference( - bigquery_client.project, session._session_dataset_id + bigquery_client.project, session._anonymous_dataset.dataset_id ) return dataset_ref.routine(routine_ref_str) @@ -778,9 +787,7 @@ def remote_function( dataset, default_project=bigquery_client.project ) else: - dataset_ref = bigquery.DatasetReference.from_string( - session._session_dataset_id, default_project=bigquery_client.project - ) + dataset_ref = session._anonymous_dataset bq_location, cloud_function_region = get_remote_function_locations( bigquery_client.location diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 928123ce74..84a6eb5638 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -177,6 +177,7 @@ def __init__( # Now that we're starting the session, don't allow the options to be # changed. context._session_started = True + self._df_snapshot: Dict[bigquery.TableReference, datetime.datetime] = {} @property def bqclient(self): @@ -198,13 +199,6 @@ def cloudfunctionsclient(self): def resourcemanagerclient(self): return self._clients_provider.resourcemanagerclient - @property - def _session_dataset_id(self): - """A dataset for storing temporary objects local to the session - This is a workaround for remote functions that do not - yet support session-temporary instances.""" - return self._session_dataset.dataset_id - @property def _project(self): return self.bqclient.project @@ -229,13 +223,6 @@ def _create_bq_datasets(self): query_destination.dataset_id, ) - # Dataset for storing remote functions, which don't yet - # support proper session temporary storage yet - self._session_dataset = bigquery.Dataset( - f"{self.bqclient.project}.bigframes_temp_{self._location.lower().replace('-', '_')}" - ) - self._session_dataset.location = self._location - def close(self): """No-op. 
Temporary resources are deleted after 7 days.""" @@ -246,6 +233,7 @@ def read_gbq( index_col: Iterable[str] | str = (), col_order: Iterable[str] = (), max_results: Optional[int] = None, + use_cache: bool = True, # Add a verify index argument that fails if the index is not unique. ) -> dataframe.DataFrame: # TODO(b/281571214): Generate prompt to show the progress of read_gbq. @@ -256,6 +244,7 @@ def read_gbq( col_order=col_order, max_results=max_results, api_name="read_gbq", + use_cache=use_cache, ) else: # TODO(swast): Query the snapshot table but mark it as a @@ -267,6 +256,7 @@ def read_gbq( col_order=col_order, max_results=max_results, api_name="read_gbq", + use_cache=use_cache, ) def _query_to_destination( @@ -274,6 +264,7 @@ def _query_to_destination( query: str, index_cols: List[str], api_name: str, + use_cache: bool = True, ) -> Tuple[Optional[bigquery.TableReference], Optional[bigquery.QueryJob]]: # If a dry_run indicates this is not a query type job, then don't # bother trying to do a CREATE TEMP TABLE ... AS SELECT ... statement. @@ -298,6 +289,7 @@ def _query_to_destination( job_config = bigquery.QueryJobConfig() job_config.labels["bigframes-api"] = api_name job_config.destination = temp_table + job_config.use_query_cache = use_cache try: # Write to temp table to workaround BigQuery 10 GB query results @@ -319,6 +311,7 @@ def read_gbq_query( index_col: Iterable[str] | str = (), col_order: Iterable[str] = (), max_results: Optional[int] = None, + use_cache: bool = True, ) -> dataframe.DataFrame: """Turn a SQL query into a DataFrame. 
@@ -376,6 +369,7 @@ def read_gbq_query( col_order=col_order, max_results=max_results, api_name="read_gbq_query", + use_cache=use_cache, ) def _read_gbq_query( @@ -386,6 +380,7 @@ def _read_gbq_query( col_order: Iterable[str] = (), max_results: Optional[int] = None, api_name: str = "read_gbq_query", + use_cache: bool = True, ) -> dataframe.DataFrame: if isinstance(index_col, str): index_cols = [index_col] @@ -393,7 +388,10 @@ def _read_gbq_query( index_cols = list(index_col) destination, query_job = self._query_to_destination( - query, index_cols, api_name=api_name + query, + index_cols, + api_name=api_name, + use_cache=use_cache, ) # If there was no destination table, that means the query must have @@ -417,6 +415,7 @@ def _read_gbq_query( index_col=index_cols, col_order=col_order, max_results=max_results, + use_cache=use_cache, ) def read_gbq_table( @@ -426,6 +425,7 @@ def read_gbq_table( index_col: Iterable[str] | str = (), col_order: Iterable[str] = (), max_results: Optional[int] = None, + use_cache: bool = True, ) -> dataframe.DataFrame: """Turn a BigQuery table into a DataFrame. @@ -448,6 +448,7 @@ def read_gbq_table( col_order=col_order, max_results=max_results, api_name="read_gbq_table", + use_cache=use_cache, ) def _get_snapshot_sql_and_primary_key( @@ -455,6 +456,7 @@ def _get_snapshot_sql_and_primary_key( table_ref: bigquery.table.TableReference, *, api_name: str, + use_cache: bool = True, ) -> Tuple[ibis_types.Table, Optional[Sequence[str]]]: """Create a read-only Ibis table expression representing a table. @@ -462,19 +464,6 @@ def _get_snapshot_sql_and_primary_key( column(s), then return those too so that ordering generation can be avoided. """ - if table_ref.dataset_id.upper() == "_SESSION": - # _SESSION tables aren't supported by the tables.get REST API. 
- return ( - self.ibis_client.sql( - f"SELECT * FROM `_SESSION`.`{table_ref.table_id}`" - ), - None, - ) - table_expression = self.ibis_client.table( - table_ref.table_id, - database=f"{table_ref.project}.{table_ref.dataset_id}", - ) - # If there are primary keys defined, the query engine assumes these # columns are unique, even if the constraint is not enforced. We make # the same assumption and use these columns as the total ordering keys. @@ -495,14 +484,18 @@ def _get_snapshot_sql_and_primary_key( job_config = bigquery.QueryJobConfig() job_config.labels["bigframes-api"] = api_name - current_timestamp = list( - self.bqclient.query( - "SELECT CURRENT_TIMESTAMP() AS `current_timestamp`", - job_config=job_config, - ).result() - )[0][0] + if use_cache and table_ref in self._df_snapshot.keys(): + snapshot_timestamp = self._df_snapshot[table_ref] + else: + snapshot_timestamp = list( + self.bqclient.query( + "SELECT CURRENT_TIMESTAMP() AS `current_timestamp`", + job_config=job_config, + ).result() + )[0][0] + self._df_snapshot[table_ref] = snapshot_timestamp table_expression = self.ibis_client.sql( - bigframes_io.create_snapshot_sql(table_ref, current_timestamp) + bigframes_io.create_snapshot_sql(table_ref, snapshot_timestamp) ) return table_expression, primary_keys @@ -514,12 +507,11 @@ def _read_gbq_table( col_order: Iterable[str] = (), max_results: Optional[int] = None, api_name: str, + use_cache: bool = True, ) -> dataframe.DataFrame: if max_results and max_results <= 0: raise ValueError("`max_results` should be a positive number.") - # TODO(swast): Can we re-use the temp table from other reads in the - # session, if the original table wasn't modified? 
table_ref = bigquery.table.TableReference.from_string( query, default_project=self.bqclient.project ) @@ -527,7 +519,9 @@ def _read_gbq_table( ( table_expression, total_ordering_cols, - ) = self._get_snapshot_sql_and_primary_key(table_ref, api_name=api_name) + ) = self._get_snapshot_sql_and_primary_key( + table_ref, api_name=api_name, use_cache=use_cache + ) for key in col_order: if key not in table_expression.columns: diff --git a/bigframes/session/_io/bigquery.py b/bigframes/session/_io/bigquery.py index dae73301e7..4770f12089 100644 --- a/bigframes/session/_io/bigquery.py +++ b/bigframes/session/_io/bigquery.py @@ -117,11 +117,6 @@ def create_snapshot_sql( table_ref: bigquery.TableReference, current_timestamp: datetime.datetime ) -> str: """Query a table via 'time travel' for consistent reads.""" - - # If we have a _SESSION table, assume that it's already a copy. Nothing to do here. - if table_ref.dataset_id.upper() == "_SESSION": - return f"SELECT * FROM `_SESSION`.`{table_ref.table_id}`" - # If we have an anonymous query results table, it can't be modified and # there isn't any BigQuery time travel. if table_ref.dataset_id.startswith("_"): diff --git a/bigframes/version.py b/bigframes/version.py index 46e57e5b88..920cb95c3d 100644 --- a/bigframes/version.py +++ b/bigframes/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__version__ = "0.14.1" +__version__ = "0.15.0" diff --git a/docs/reference/bigframes.ml/index.rst b/docs/reference/bigframes.ml/index.rst index f3cbe1174a..1975d62e6d 100644 --- a/docs/reference/bigframes.ml/index.rst +++ b/docs/reference/bigframes.ml/index.rst @@ -30,3 +30,5 @@ API Reference pipeline preprocessing + + remote diff --git a/docs/reference/bigframes.ml/remote.rst b/docs/reference/bigframes.ml/remote.rst new file mode 100644 index 0000000000..7827acfe92 --- /dev/null +++ b/docs/reference/bigframes.ml/remote.rst @@ -0,0 +1,7 @@ +bigframes.ml.remote +=================== + +.. automodule:: bigframes.ml.remote + :members: + :inherited-members: + :undoc-members: diff --git a/docs/templates/toc.yml b/docs/templates/toc.yml index 9879721d28..58ac1c0efe 100644 --- a/docs/templates/toc.yml +++ b/docs/templates/toc.yml @@ -108,6 +108,12 @@ - name: PaLM2TextEmbeddingGenerator uid: bigframes.ml.llm.PaLM2TextEmbeddingGenerator name: llm + - items: + - name: Overview + uid: bigframes.ml.remote + - name: VertexAIModel + uid: bigframes.ml.remote.VertexAIModel + name: remote - items: - name: metrics uid: bigframes.ml.metrics diff --git a/notebooks/generative_ai/bq_dataframes_llm_code_generation.ipynb b/notebooks/generative_ai/bq_dataframes_llm_code_generation.ipynb index 0f113b84c6..0a41447a53 100644 --- a/notebooks/generative_ai/bq_dataframes_llm_code_generation.ipynb +++ b/notebooks/generative_ai/bq_dataframes_llm_code_generation.ipynb @@ -34,7 +34,7 @@ "\n", "\n", " \n", diff --git a/notebooks/generative_ai/bq_dataframes_llm_kmeans.ipynb b/notebooks/generative_ai/bq_dataframes_llm_kmeans.ipynb index 46c4955288..5f74046fc0 100644 --- a/notebooks/generative_ai/bq_dataframes_llm_kmeans.ipynb +++ b/notebooks/generative_ai/bq_dataframes_llm_kmeans.ipynb @@ -31,7 +31,7 @@ "
\n", - " \n", + " \n", " \"Colab Run in Colab\n", " \n", "
\n", "\n", " \n", @@ -118,14 +118,10 @@ "\n", "2. [Make sure that billing is enabled for your project](https://cloud.google.com/billing/docs/how-to/modify-project).\n", "\n", - "3. [Click here](https://console.cloud.google.com/flows/enableapi?apiid=bigquery.googleapis.com,bigqueryconnection.googleapis.com,run.googleapis.com,artifactregistry.googleapis.com,cloudbuild.googleapis.com,cloudresourcemanager.googleapis.com) to enable the following APIs:\n", + "3. [Click here](https://console.cloud.google.com/flows/enableapi?apiid=bigquery.googleapis.com,bigqueryconnection.googleapis.com,aiplatform.googleapis.com) to enable the following APIs:\n", "\n", " * BigQuery API\n", " * BigQuery Connection API\n", - " * Cloud Run API\n", - " * Artifact Registry API\n", - " * Cloud Build API\n", - " * Cloud Resource Manager API\n", " * Vertex AI API\n", "\n", "4. If you are running this notebook locally, install the [Cloud SDK](https://cloud.google.com/sdk)." @@ -232,87 +228,6 @@ "# auth.authenticate_user()" ] }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "If you want to reset the location of the created DataFrame or Series objects, reset the session by executing `bf.close_session()`. After that, you can reuse `bf.options.bigquery.location` to specify another location." - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Connect to Vertex AI\n", - "\n", - "In order to use PaLM2TextGenerator, we will need to set up a [cloud resource connection](https://cloud.google.com/bigquery/docs/create-cloud-resource-connection)." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from google.cloud import bigquery_connection_v1 as bq_connection\n", - "\n", - "CONN_NAME = \"bqdf-llm\"\n", - "\n", - "client = bq_connection.ConnectionServiceClient()\n", - "new_conn_parent = f\"projects/{PROJECT_ID}/locations/{REGION}\"\n", - "exists_conn_parent = f\"projects/{PROJECT_ID}/locations/{REGION}/connections/{CONN_NAME}\"\n", - "cloud_resource_properties = bq_connection.CloudResourceProperties({})\n", - "\n", - "try:\n", - " request = client.get_connection(\n", - " request=bq_connection.GetConnectionRequest(name=exists_conn_parent)\n", - " )\n", - " CONN_SERVICE_ACCOUNT = f\"serviceAccount:{request.cloud_resource.service_account_id}\"\n", - "except Exception:\n", - " connection = bq_connection.types.Connection(\n", - " {\"friendly_name\": CONN_NAME, \"cloud_resource\": cloud_resource_properties}\n", - " )\n", - " request = bq_connection.CreateConnectionRequest(\n", - " {\n", - " \"parent\": new_conn_parent,\n", - " \"connection_id\": CONN_NAME,\n", - " \"connection\": connection,\n", - " }\n", - " )\n", - " response = client.create_connection(request)\n", - " CONN_SERVICE_ACCOUNT = (\n", - " f\"serviceAccount:{response.cloud_resource.service_account_id}\"\n", - " )\n", - "print(CONN_SERVICE_ACCOUNT)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Set permissions for the service account\n", - "\n", - "The resource connection service account requires certain project-level permissions:\n", - " - `roles/aiplatform.user` and `roles/bigquery.connectionUser`: These roles are required for the connection to create a model definition using the LLM model in Vertex AI ([documentation](https://cloud.google.com/bigquery/docs/generate-text#give_the_service_account_access)).\n", - " - `roles/run.invoker`: This role is required for the connection to have read-only access to 
Cloud Run services that back custom/remote functions ([documentation](https://cloud.google.com/bigquery/docs/remote-functions#grant_permission_on_function)).\n", - "\n", - "Set these permissions by running the following `gcloud` commands:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!gcloud projects add-iam-policy-binding {PROJECT_ID} --condition=None --no-user-output-enabled --member={CONN_SERVICE_ACCOUNT} --role='roles/bigquery.connectionUser'\n", - "!gcloud projects add-iam-policy-binding {PROJECT_ID} --condition=None --no-user-output-enabled --member={CONN_SERVICE_ACCOUNT} --role='roles/aiplatform.user'\n", - "!gcloud projects add-iam-policy-binding {PROJECT_ID} --condition=None --no-user-output-enabled --member={CONN_SERVICE_ACCOUNT} --role='roles/run.invoker'" - ] - }, { "attachments": {}, "cell_type": "markdown", @@ -336,7 +251,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Project Setup" + "BigQuery DataFrames setup" ] }, { @@ -353,6 +268,14 @@ "bf.options.bigquery.location = REGION" ] }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If you want to reset the location of the created DataFrame or Series objects, reset the session by executing `bf.close_session()`. After that, you can reuse `bf.options.bigquery.location` to specify another location." + ] + }, { "attachments": {}, "cell_type": "markdown", @@ -391,7 +314,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Download 10000 complaints to use with PaLM2TextEmbeddingGenerator" + "Downsample DataFrame to 10,000 records for model training." 
] }, { @@ -443,18 +366,6 @@ "predicted_embeddings.head() " ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "4H_etYfsEOFP" - }, - "outputs": [], - "source": [ - "# Join the complaints with their embeddings in the same DataFrame\n", - "combined_df = downsampled_issues_df.join(predicted_embeddings)" - ] - }, { "attachments": {}, "cell_type": "markdown", @@ -470,7 +381,7 @@ "id": "OUZ3NNbzo1Tb" }, "source": [ - "## Step 2: KMeans clustering" + "## Step 2: Create k-means model and predict clusters" ] }, { @@ -503,29 +414,19 @@ "outputs": [], "source": [ "# Use KMeans clustering to calculate our groups. Will take ~3 minutes.\n", - "cluster_model.fit(combined_df[[\"text_embedding\"]])\n", - "clustered_result = cluster_model.predict(combined_df[[\"text_embedding\"]])\n", + "cluster_model.fit(predicted_embeddings[[\"text_embedding\"]])\n", + "clustered_result = cluster_model.predict(predicted_embeddings)\n", "# Notice the CENTROID_ID column, which is the ID number of the group that\n", "# each complaint belongs to.\n", "clustered_result.head(n=5)" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Join the group number to the complaints and their text embeddings\n", - "combined_clustered_result = combined_df.join(clustered_result)" - ] - }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ - "Our dataframe combined_clustered_result now has three columns: the complaints, their text embeddings, and an ID from 1-10 (inclusive) indicating which semantically similar group they belong to." + "Our dataframe combined_clustered_result now has three complaint columns: the content, their text embeddings, and an ID from 1-10 (inclusive) indicating which semantically similar group they belong to." 
] }, { @@ -535,7 +436,7 @@ "id": "21rNsFMHo8hO" }, "source": [ - "## Step 3: Summarize the complaints" + "## Step 3: Use PaLM2 LLM model to summarize complaint clusters" ] }, { @@ -556,14 +457,14 @@ "source": [ "# Using bigframes, with syntax identical to pandas,\n", "# filter out the first and second groups\n", - "cluster_1_result = combined_clustered_result[\n", - " combined_clustered_result[\"CENTROID_ID\"] == 1\n", - "][[\"consumer_complaint_narrative\"]]\n", + "cluster_1_result = clustered_result[\n", + " clustered_result[\"CENTROID_ID\"] == 1\n", + "][[\"content\"]]\n", "cluster_1_result_pandas = cluster_1_result.head(5).to_pandas()\n", "\n", - "cluster_2_result = combined_clustered_result[\n", - " combined_clustered_result[\"CENTROID_ID\"] == 2\n", - "][[\"consumer_complaint_narrative\"]]\n", + "cluster_2_result = clustered_result[\n", + " clustered_result[\"CENTROID_ID\"] == 2\n", + "][[\"content\"]]\n", "cluster_2_result_pandas = cluster_2_result.head(5).to_pandas()" ] }, @@ -579,15 +480,15 @@ "prompt1 = 'comment list 1:\\n'\n", "for i in range(5):\n", " prompt1 += str(i + 1) + '. ' + \\\n", - " cluster_1_result_pandas[\"consumer_complaint_narrative\"].iloc[i] + '\\n'\n", + " cluster_1_result_pandas[\"content\"].iloc[i] + '\\n'\n", "\n", "prompt2 = 'comment list 2:\\n'\n", "for i in range(5):\n", " prompt2 += str(i + 1) + '. 
' + \\\n", - " cluster_2_result_pandas[\"consumer_complaint_narrative\"].iloc[i] + '\\n'\n", + " cluster_2_result_pandas[\"content\"].iloc[i] + '\\n'\n", "\n", "print(prompt1)\n", - "print(prompt2)\n" + "print(prompt2)" ] }, { @@ -624,9 +525,7 @@ "source": [ "from bigframes.ml.llm import PaLM2TextGenerator\n", "\n", - "session = bf.get_global_session()\n", - "connection = f\"{PROJECT_ID}.{REGION}.{CONN_NAME}\"\n", - "q_a_model = PaLM2TextGenerator(session=session, connection_name=connection)" + "q_a_model = PaLM2TextGenerator()" ] }, { @@ -662,6 +561,17 @@ "source": [ "We now see PaLM2TextGenerator's characterization of the different comment groups. Thanks for using BigQuery DataFrames!" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Summary and next steps\n", + "\n", + "You've used the ML and LLM capabilities of BigQuery DataFrames to help analyze and understand a large dataset of unstructured feedback.\n", + "\n", + "Learn more about BigQuery DataFrames in the [documentation](https://cloud.google.com/python/docs/reference/bigframes/latest) and find more sample notebooks in the [GitHub repo](https://github.com/googleapis/python-bigquery-dataframes/tree/main/notebooks)." + ] } ], "metadata": { @@ -682,7 +592,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.16" + "version": "3.10.13" } }, "nbformat": 4, diff --git a/notebooks/getting_started/getting_started_bq_dataframes.ipynb b/notebooks/getting_started/getting_started_bq_dataframes.ipynb index 6cc6acc993..18be5e48fd 100644 --- a/notebooks/getting_started/getting_started_bq_dataframes.ipynb +++ b/notebooks/getting_started/getting_started_bq_dataframes.ipynb @@ -802,7 +802,7 @@ "source": [ "Running the cell below creates a custom function using the `remote_function` method. 
This function categorizes a value into one of two buckets: >= 4000 or <4000.\n", "\n", - "> Note: Creating a function requires a [BigQuery connection](https://cloud.google.com/bigquery/docs/remote-functions#create_a_remote_function). This code assumes a pre-created connection named `bigframes-rf-conn`. If\n", + "> Note: Creating a function requires a [BigQuery connection](https://cloud.google.com/bigquery/docs/remote-functions#create_a_remote_function). This code assumes a pre-created connection named `bigframes-default-connection`. If\n", "the connection is not already created, BigQuery DataFrames attempts to create one assuming the [necessary APIs\n", "and IAM permissions](https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.pandas#bigframes_pandas_remote_function) are set up in the project.\n", "\n", @@ -817,7 +817,7 @@ }, "outputs": [], "source": [ - "@bf.remote_function([float], str, bigquery_connection='bigframes-rf-conn')\n", + "@bf.remote_function([float], str)\n", "def get_bucket(num):\n", " if not num: return \"NA\"\n", " boundary = 4000\n", diff --git a/notebooks/getting_started/ml_fundamentals.ipynb b/notebooks/getting_started/ml_fundamentals.ipynb index 2f566dd704..165bd90f31 100644 --- a/notebooks/getting_started/ml_fundamentals.ipynb +++ b/notebooks/getting_started/ml_fundamentals.ipynb @@ -14,46 +14,16 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 1, "metadata": {}, "outputs": [ { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "0c8a8bc0b4d64448aef68d6a98fae666", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HTML(value='Query job 28e903c6-e874-4b99-8f53-0755e0b0c188 is RUNNING. 
" ] }, "metadata": {}, @@ -61,13 +31,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "9680fd748e0546b4a010fda0155c5027", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job e8aba858-7660-4274-8d90-8d2b0382f8f6 is DONE. 28.9 kB processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job 7950d6a7-3747-4454-bba2-9660e830647f is DONE. 31.7 kB processed. " ] }, "metadata": {}, @@ -117,250 +85,250 @@ " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", @@ -370,86 +338,86 @@ "[334 rows x 7 columns in total]" ], "text/plain": [ - " species island \\\n", - "penguin_id \n", - "0 Adelie Penguin (Pygoscelis adeliae) Dream \n", - "1 Adelie Penguin (Pygoscelis 
adeliae) Dream \n", - "2 Adelie Penguin (Pygoscelis adeliae) Dream \n", - "3 Chinstrap penguin (Pygoscelis antarctica) Dream \n", - "4 Adelie Penguin (Pygoscelis adeliae) Dream \n", - "5 Adelie Penguin (Pygoscelis adeliae) Dream \n", - "6 Chinstrap penguin (Pygoscelis antarctica) Dream \n", - "7 Chinstrap penguin (Pygoscelis antarctica) Dream \n", - "8 Chinstrap penguin (Pygoscelis antarctica) Dream \n", - "9 Adelie Penguin (Pygoscelis adeliae) Dream \n", - "10 Adelie Penguin (Pygoscelis adeliae) Dream \n", - "11 Adelie Penguin (Pygoscelis adeliae) Dream \n", - "12 Adelie Penguin (Pygoscelis adeliae) Dream \n", - "13 Chinstrap penguin (Pygoscelis antarctica) Dream \n", - "14 Adelie Penguin (Pygoscelis adeliae) Dream \n", - "15 Adelie Penguin (Pygoscelis adeliae) Dream \n", - "16 Chinstrap penguin (Pygoscelis antarctica) Dream \n", - "17 Chinstrap penguin (Pygoscelis antarctica) Dream \n", - "18 Adelie Penguin (Pygoscelis adeliae) Dream \n", - "19 Adelie Penguin (Pygoscelis adeliae) Dream \n", - "20 Chinstrap penguin (Pygoscelis antarctica) Dream \n", - "21 Chinstrap penguin (Pygoscelis antarctica) Dream \n", - "22 Adelie Penguin (Pygoscelis adeliae) Dream \n", - "23 Adelie Penguin (Pygoscelis adeliae) Dream \n", - "24 Chinstrap penguin (Pygoscelis antarctica) Dream \n", + " species island \\\n", + "penguin_id \n", + "0 Adelie Penguin (Pygoscelis adeliae) Biscoe \n", + "1 Adelie Penguin (Pygoscelis adeliae) Torgersen \n", + "2 Gentoo penguin (Pygoscelis papua) Biscoe \n", + "3 Chinstrap penguin (Pygoscelis antarctica) Dream \n", + "4 Adelie Penguin (Pygoscelis adeliae) Biscoe \n", + "5 Gentoo penguin (Pygoscelis papua) Biscoe \n", + "6 Adelie Penguin (Pygoscelis adeliae) Biscoe \n", + "7 Gentoo penguin (Pygoscelis papua) Biscoe \n", + "8 Gentoo penguin (Pygoscelis papua) Biscoe \n", + "9 Gentoo penguin (Pygoscelis papua) Biscoe \n", + "10 Gentoo penguin (Pygoscelis papua) Biscoe \n", + "11 Adelie Penguin (Pygoscelis adeliae) Dream \n", + "12 Gentoo penguin 
(Pygoscelis papua) Biscoe \n", + "13 Adelie Penguin (Pygoscelis adeliae) Biscoe \n", + "14 Adelie Penguin (Pygoscelis adeliae) Biscoe \n", + "15 Gentoo penguin (Pygoscelis papua) Biscoe \n", + "16 Adelie Penguin (Pygoscelis adeliae) Torgersen \n", + "17 Gentoo penguin (Pygoscelis papua) Biscoe \n", + "18 Chinstrap penguin (Pygoscelis antarctica) Dream \n", + "19 Adelie Penguin (Pygoscelis adeliae) Dream \n", + "20 Chinstrap penguin (Pygoscelis antarctica) Dream \n", + "21 Adelie Penguin (Pygoscelis adeliae) Biscoe \n", + "22 Gentoo penguin (Pygoscelis papua) Biscoe \n", + "23 Adelie Penguin (Pygoscelis adeliae) Dream \n", + "24 Chinstrap penguin (Pygoscelis antarctica) Dream \n", "\n", " culmen_length_mm culmen_depth_mm flipper_length_mm body_mass_g \\\n", "penguin_id \n", - "0 36.6 18.4 184.0 3475.0 \n", - "1 39.8 19.1 184.0 4650.0 \n", - "2 40.9 18.9 184.0 3900.0 \n", - "3 46.5 17.9 192.0 3500.0 \n", - "4 37.3 16.8 192.0 3000.0 \n", - "5 43.2 18.5 192.0 4100.0 \n", - "6 46.9 16.6 192.0 2700.0 \n", - "7 50.5 18.4 200.0 3400.0 \n", - "8 49.5 19.0 200.0 3800.0 \n", - "9 40.2 20.1 200.0 3975.0 \n", - "10 40.8 18.9 208.0 4300.0 \n", - "11 39.0 18.7 185.0 3650.0 \n", - "12 37.0 16.9 185.0 3000.0 \n", - "13 47.0 17.3 185.0 3700.0 \n", - "14 34.0 17.1 185.0 3400.0 \n", - "15 37.0 16.5 185.0 3400.0 \n", - "16 45.7 17.3 193.0 3600.0 \n", - "17 50.6 19.4 193.0 3800.0 \n", - "18 39.7 17.9 193.0 4250.0 \n", - "19 37.8 18.1 193.0 3750.0 \n", - "20 46.6 17.8 193.0 3800.0 \n", - "21 51.3 19.2 193.0 3650.0 \n", - "22 40.2 17.1 193.0 3400.0 \n", - "23 36.8 18.5 193.0 3500.0 \n", - "24 49.6 18.2 193.0 3775.0 \n", + "0 40.1 18.9 188.0 4300.0 \n", + "1 39.1 18.7 181.0 3750.0 \n", + "2 47.4 14.6 212.0 4725.0 \n", + "3 42.5 16.7 187.0 3350.0 \n", + "4 43.2 19.0 197.0 4775.0 \n", + "5 46.7 15.3 219.0 5200.0 \n", + "6 41.3 21.1 195.0 4400.0 \n", + "7 45.2 13.8 215.0 4750.0 \n", + "8 46.5 13.5 210.0 4550.0 \n", + "9 50.5 15.2 216.0 5000.0 \n", + "10 48.2 15.6 221.0 5100.0 \n", + "11 38.1 
18.6 190.0 3700.0 \n", + "12 50.7 15.0 223.0 5550.0 \n", + "13 37.8 20.0 190.0 4250.0 \n", + "14 35.0 17.9 190.0 3450.0 \n", + "15 48.7 15.7 208.0 5350.0 \n", + "16 34.6 21.1 198.0 4400.0 \n", + "17 46.8 15.4 215.0 5150.0 \n", + "18 50.3 20.0 197.0 3300.0 \n", + "19 37.2 18.1 178.0 3900.0 \n", + "20 51.0 18.8 203.0 4100.0 \n", + "21 40.5 17.9 187.0 3200.0 \n", + "22 45.5 13.9 210.0 4200.0 \n", + "23 42.2 18.5 180.0 3550.0 \n", + "24 51.7 20.3 194.0 3775.0 \n", "\n", " sex \n", "penguin_id \n", - "0 FEMALE \n", + "0 MALE \n", "1 MALE \n", - "2 MALE \n", + "2 FEMALE \n", "3 FEMALE \n", - "4 FEMALE \n", + "4 MALE \n", "5 MALE \n", - "6 FEMALE \n", + "6 MALE \n", "7 FEMALE \n", - "8 MALE \n", - "9 MALE \n", + "8 FEMALE \n", + "9 FEMALE \n", "10 MALE \n", - "11 MALE \n", - "12 FEMALE \n", - "13 FEMALE \n", + "11 FEMALE \n", + "12 MALE \n", + "13 MALE \n", "14 FEMALE \n", - "15 FEMALE \n", - "16 FEMALE \n", + "15 MALE \n", + "16 MALE \n", "17 MALE \n", "18 MALE \n", "19 MALE \n", - "20 FEMALE \n", - "21 MALE \n", + "20 MALE \n", + "21 FEMALE \n", "22 FEMALE \n", "23 FEMALE \n", "24 MALE \n", @@ -458,7 +426,7 @@ "[334 rows x 7 columns]" ] }, - "execution_count": 18, + "execution_count": 1, "metadata": {}, "output_type": "execute_result" } @@ -490,18 +458,16 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 2, "metadata": {}, "outputs": [ { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "171160f246eb43d1832aeefb055c0851", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job deda90a8-6ec7-419c-8067-e85777bd916f is DONE. 28.9 kB processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job 1408053d-cb80-4870-af28-e94b90a20a6d is DONE. 28.9 kB processed. 
" ] }, "metadata": {}, @@ -509,13 +475,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "eaffac40f94745728e6bd618bebd2c53", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job efe8fa0a-d450-475a-99d5-36beeb985247 is DONE. 28.9 kB processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job 262885fe-973c-4338-a853-227f9db4835a is DONE. 31.7 kB processed. " ] }, "metadata": {}, @@ -523,13 +487,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "68e7ecdc639f4d3ab482830bf6a9da04", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job 5022c56d-e605-4cab-be1b-1ecf189588a1 is DONE. 28.9 kB processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job fb1dc831-7f6f-42ce-96da-1292d73919b4 is DONE. 31.7 kB processed. " ] }, "metadata": {}, @@ -537,13 +499,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "ebfe197fd88348129ebe2f7d288bf4b9", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job 175bd293-d448-4510-b926-1d8cfb4eb5e7 is DONE. 28.9 kB processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job e79add79-f1e4-4cf0-bb97-04d153222f19 is DONE. 31.7 kB processed. " ] }, "metadata": {}, @@ -551,13 +511,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "2ae69ea7da5247e8a1f7cd0e049629cb", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job a3a2e68c-f5f3-4237-99ad-44974f29d090 is DONE. 28.9 kB processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job cb5ee343-f86e-4795-b0ce-d58854e72e5c is RUNNING. 
" ] }, "metadata": {}, @@ -596,18 +554,16 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 3, "metadata": {}, "outputs": [ { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "5ed4206cd3ad4cd485315605bf033df2", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job db3365fb-67ca-44cc-a117-88a80dc63cca is DONE. 28.9 kB processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job e65af31c-feda-468d-89c9-dec033574640 is DONE. 31.7 kB processed. " ] }, "metadata": {}, @@ -615,13 +571,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "ac72db21945542558fdd62093d9dc0c3", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job ab78f7ab-a115-448b-92d0-19c091a831ca is DONE. 28.9 kB processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job 0455f252-2b94-457e-bad5-672b91d9b51f is RUNNING. " ] }, "metadata": {}, @@ -667,47 +621,47 @@ " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", @@ -719,24 +673,24 @@ "text/plain": [ " island culmen_length_mm culmen_depth_mm flipper_length_mm \\\n", "penguin_id \n", - "156 Biscoe 46.2 14.5 209.0 \n", - "189 Biscoe 35.3 18.9 187.0 \n", - "279 Biscoe 45.1 14.5 215.0 \n", - "245 Biscoe 49.5 16.2 229.0 \n", - "343 Torgersen 37.3 20.5 199.0 \n", + "249 Torgersen 41.1 18.6 189.0 \n", + "36 Biscoe 43.4 14.4 218.0 \n", + "74 Biscoe 
42.8 14.2 209.0 \n", + "235 Dream 34.0 17.1 185.0 \n", + "117 Dream 37.8 18.1 193.0 \n", "\n", " sex species \n", "penguin_id \n", - "156 FEMALE Gentoo penguin (Pygoscelis papua) \n", - "189 FEMALE Adelie Penguin (Pygoscelis adeliae) \n", - "279 FEMALE Gentoo penguin (Pygoscelis papua) \n", - "245 MALE Gentoo penguin (Pygoscelis papua) \n", - "343 MALE Adelie Penguin (Pygoscelis adeliae) \n", + "249 MALE Adelie Penguin (Pygoscelis adeliae) \n", + "36 FEMALE Gentoo penguin (Pygoscelis papua) \n", + "74 FEMALE Gentoo penguin (Pygoscelis papua) \n", + "235 FEMALE Adelie Penguin (Pygoscelis adeliae) \n", + "117 MALE Adelie Penguin (Pygoscelis adeliae) \n", "\n", "[5 rows x 6 columns]" ] }, - "execution_count": 20, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -749,18 +703,16 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 4, "metadata": {}, "outputs": [ { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "d6dd794f89724099950dcc927d63d0f5", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job 22a72cad-11a6-4f8e-b16d-f92853b8112e is DONE. 28.9 kB processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job d5a173bd-a7dc-42fa-8468-b088d47ccfe0 is RUNNING. " ] }, "metadata": {}, @@ -768,13 +720,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "a8ab7ca12e0d43a6803483480e837c6e", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job bc952727-8806-4fe2-abf2-c3a8a2bd9b6d is DONE. 28.9 kB processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job c6b6518b-2689-4dc1-a5b0-2a9ab75301eb is RUNNING. 
" ] }, "metadata": {}, @@ -810,24 +760,24 @@ " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", "
\n", - " \n", + " \n", " \"Colab Run in Colab\n", " \n", "
0Adelie Penguin (Pygoscelis adeliae)Dream36.618.4184.03475.0FEMALEBiscoe40.118.9188.04300.0MALE
1Adelie Penguin (Pygoscelis adeliae)Dream39.819.1184.04650.0Torgersen39.118.7181.03750.0MALE
2Adelie Penguin (Pygoscelis adeliae)Dream40.918.9184.03900.0MALEGentoo penguin (Pygoscelis papua)Biscoe47.414.6212.04725.0FEMALE
3Chinstrap penguin (Pygoscelis antarctica)Dream46.517.9192.03500.042.516.7187.03350.0FEMALE
4Adelie Penguin (Pygoscelis adeliae)Dream37.316.8192.03000.0FEMALEBiscoe43.219.0197.04775.0MALE
5Adelie Penguin (Pygoscelis adeliae)Dream43.218.5192.04100.0Gentoo penguin (Pygoscelis papua)Biscoe46.715.3219.05200.0MALE
6Chinstrap penguin (Pygoscelis antarctica)Dream46.916.6192.02700.0FEMALEAdelie Penguin (Pygoscelis adeliae)Biscoe41.321.1195.04400.0MALE
7Chinstrap penguin (Pygoscelis antarctica)Dream50.518.4200.03400.0Gentoo penguin (Pygoscelis papua)Biscoe45.213.8215.04750.0FEMALE
8Chinstrap penguin (Pygoscelis antarctica)Dream49.519.0200.03800.0MALEGentoo penguin (Pygoscelis papua)Biscoe46.513.5210.04550.0FEMALE
9Adelie Penguin (Pygoscelis adeliae)Dream40.220.1200.03975.0MALEGentoo penguin (Pygoscelis papua)Biscoe50.515.2216.05000.0FEMALE
10Adelie Penguin (Pygoscelis adeliae)Dream40.818.9208.04300.0Gentoo penguin (Pygoscelis papua)Biscoe48.215.6221.05100.0MALE
11Adelie Penguin (Pygoscelis adeliae)Dream39.018.7185.03650.0MALE38.118.6190.03700.0FEMALE
12Adelie Penguin (Pygoscelis adeliae)Dream37.016.9185.03000.0FEMALEGentoo penguin (Pygoscelis papua)Biscoe50.715.0223.05550.0MALE
13Chinstrap penguin (Pygoscelis antarctica)Dream47.017.3185.03700.0FEMALEAdelie Penguin (Pygoscelis adeliae)Biscoe37.820.0190.04250.0MALE
14Adelie Penguin (Pygoscelis adeliae)Dream34.017.1185.03400.0Biscoe35.017.9190.03450.0FEMALE
15Adelie Penguin (Pygoscelis adeliae)Dream37.016.5185.03400.0FEMALEGentoo penguin (Pygoscelis papua)Biscoe48.715.7208.05350.0MALE
16Chinstrap penguin (Pygoscelis antarctica)Dream45.717.3193.03600.0FEMALEAdelie Penguin (Pygoscelis adeliae)Torgersen34.621.1198.04400.0MALE
17Chinstrap penguin (Pygoscelis antarctica)Dream50.619.4193.03800.0Gentoo penguin (Pygoscelis papua)Biscoe46.815.4215.05150.0MALE
18Adelie Penguin (Pygoscelis adeliae)Chinstrap penguin (Pygoscelis antarctica)Dream39.717.9193.04250.050.320.0197.03300.0MALE
19Adelie Penguin (Pygoscelis adeliae)Dream37.837.218.1193.03750.0178.03900.0MALE
20Chinstrap penguin (Pygoscelis antarctica)Dream46.617.8193.03800.0FEMALE51.018.8203.04100.0MALE
21Chinstrap penguin (Pygoscelis antarctica)Dream51.319.2193.03650.0MALEAdelie Penguin (Pygoscelis adeliae)Biscoe40.517.9187.03200.0FEMALE
22Adelie Penguin (Pygoscelis adeliae)Dream40.217.1193.03400.0Gentoo penguin (Pygoscelis papua)Biscoe45.513.9210.04200.0FEMALE
23Adelie Penguin (Pygoscelis adeliae)Dream36.842.218.5193.03500.0180.03550.0FEMALE
24Chinstrap penguin (Pygoscelis antarctica)Dream49.618.2193.051.720.3194.03775.0MALE
156Biscoe46.214.5209.0FEMALEGentoo penguin (Pygoscelis papua)249Torgersen41.118.6189.0MALEAdelie Penguin (Pygoscelis adeliae)
18936Biscoe35.318.9187.043.414.4218.0FEMALEAdelie Penguin (Pygoscelis adeliae)Gentoo penguin (Pygoscelis papua)
27974Biscoe45.114.5215.042.814.2209.0FEMALEGentoo penguin (Pygoscelis papua)
245Biscoe49.516.2229.0MALEGentoo penguin (Pygoscelis papua)235Dream34.017.1185.0FEMALEAdelie Penguin (Pygoscelis adeliae)
343Torgersen37.320.5199.0117Dream37.818.1193.0MALEAdelie Penguin (Pygoscelis adeliae)
1564800.02493325.0
1893800.0364600.0
2795000.0744700.0
2455800.02353400.0
3433775.01173750.0
\n", @@ -837,16 +787,16 @@ "text/plain": [ " body_mass_g\n", "penguin_id \n", - "156 4800.0\n", - "189 3800.0\n", - "279 5000.0\n", - "245 5800.0\n", - "343 3775.0\n", + "249 3325.0\n", + "36 4600.0\n", + "74 4700.0\n", + "235 3400.0\n", + "117 3750.0\n", "\n", "[5 rows x 1 columns]" ] }, - "execution_count": 21, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -880,18 +830,16 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 5, "metadata": {}, "outputs": [ { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "380c57dc3fe54fbd8ad2fb23f1e66e37", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job f239341e-785f-43e1-bfe0-683132d6f15f is DONE. 28.9 kB processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job 03a0eb1c-747e-4c2a-b7b5-d3e4e5a78134 is RUNNING. " ] }, "metadata": {}, @@ -899,13 +847,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "3db47aadba854beca71960d846838dc4", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job 2d5bbbb9-efc4-4f4e-a8dc-2c7b66b0e5e0 is DONE. 0 Bytes processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job 70608c84-dac8-4e77-8a9e-00d823b24f37 is RUNNING. " ] }, "metadata": {}, @@ -913,13 +859,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "1de81f2944a44cbda3f16fa8a1fae813", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job 66120e1c-2471-4a0c-8b82-aeb189c8866a is DONE. 28.9 kB processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job d18fdc32-2152-45d3-8c62-bf9b1556ec47 is RUNNING. " ] }, "metadata": {}, @@ -927,13 +871,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "b06cae61a4534388a4e9ed26ce442cc2", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job 62825fc4-5b77-43e5-a3e4-525ebfd1285b is DONE. 2.1 kB processed. 
Open Job" + ], "text/plain": [ - "HTML(value='Query job 2a022682-535f-4dc0-80ba-1640306ad9ef is RUNNING. " ] }, "metadata": {}, @@ -941,13 +883,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "977c8eae2c9848e98c5478c41af82633", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job 656d1d69-b4ff-4db6-9f2d-28dcf91e2fd7 is DONE. 0 Bytes processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job c145b39d-7d02-4394-80f0-fc605b2ba256 is DONE. 0 Bytes processed. " ] }, "metadata": {}, @@ -955,13 +895,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "aefc3085fee04c438d0327d400b4b72a", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job 466507c8-1474-4725-93e5-baf8ee292e39 is DONE. 8.5 kB processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job fc156a2b-db95-44a3-9ad1-d95b9d290080 is RUNNING. " ] }, "metadata": {}, @@ -1002,153 +940,153 @@ " \n", " \n", " 0\n", - " -1.344188\n", - " 0.642519\n", - " -1.193942\n", - " \n", - " \n", - " 1\n", - " -0.750047\n", - " 1.005876\n", - " -1.193942\n", + " -0.750505\n", + " 0.84903\n", + " -0.937262\n", " \n", " \n", " 2\n", - " -0.545811\n", - " 0.90206\n", - " -1.193942\n", + " 0.622496\n", + " -1.322402\n", + " 0.804051\n", " \n", " \n", - " 4\n", - " -1.214219\n", - " -0.188011\n", - " -0.619171\n", + " 3\n", + " -0.299107\n", + " -0.261935\n", + " -1.009817\n", " \n", " \n", " 5\n", - " -0.118772\n", - " 0.694427\n", - " -0.619171\n", + " 0.490839\n", + " -0.968913\n", + " 1.311935\n", " \n", " \n", " 6\n", - " 0.568203\n", - " -0.291828\n", - " -0.619171\n", + " -0.524806\n", + " 1.959995\n", + " -0.429379\n", " \n", " \n", " 7\n", - " 1.236611\n", - " 0.642519\n", - " -0.044401\n", + " 0.208715\n", + " -1.726389\n", + " 1.021716\n", " \n", " \n", " 9\n", - " -0.675779\n", - " 1.524957\n", - " -0.044401\n", + " 1.205551\n", + " -1.019412\n", + " 1.09427\n", " \n", " \n", " 10\n", - " 
-0.564378\n", - " 0.90206\n", - " 0.530369\n", - " \n", - " \n", - " 11\n", - " -0.898582\n", - " 0.798243\n", - " -1.122096\n", + " 0.772962\n", + " -0.817418\n", + " 1.457044\n", " \n", " \n", " 12\n", - " -1.26992\n", - " -0.136103\n", - " -1.122096\n", - " \n", - " \n", - " 13\n", - " 0.58677\n", - " 0.071529\n", - " -1.122096\n", + " 1.243168\n", + " -1.120408\n", + " 1.602153\n", " \n", " \n", " 14\n", - " -1.826927\n", - " -0.032287\n", - " -1.122096\n", + " -1.709725\n", + " 0.344046\n", + " -0.792152\n", " \n", " \n", - " 15\n", - " -1.26992\n", - " -0.343736\n", - " -1.122096\n", - " \n", - " \n", - " 16\n", - " 0.3454\n", - " 0.071529\n", - " -0.547325\n", + " 17\n", + " 0.509647\n", + " -0.918415\n", + " 1.021716\n", " \n", " \n", " 18\n", - " -0.768614\n", - " 0.382978\n", - " -0.547325\n", + " 1.167935\n", + " 1.404513\n", + " -0.284269\n", " \n", " \n", " 19\n", - " -1.121385\n", - " 0.486795\n", - " -0.547325\n", + " -1.295944\n", + " 0.445043\n", + " -1.662809\n", " \n", " \n", " 20\n", - " 0.512502\n", - " 0.33107\n", - " -0.547325\n", + " 1.299593\n", + " 0.798532\n", + " 0.151059\n", " \n", " \n", " 21\n", - " 1.385146\n", - " 1.057784\n", - " -0.547325\n", + " -0.675272\n", + " 0.344046\n", + " -1.009817\n", " \n", " \n", " 22\n", - " -0.675779\n", - " -0.032287\n", - " -0.547325\n", + " 0.26514\n", + " -1.675891\n", + " 0.658942\n", " \n", " \n", " 24\n", - " 1.069509\n", - " 0.538703\n", - " -0.547325\n", + " 1.43125\n", + " 1.556008\n", + " -0.501934\n", + " \n", + " \n", + " 25\n", + " 0.302756\n", + " 0.041055\n", + " -0.574488\n", " \n", " \n", " 26\n", - " -0.43441\n", - " 0.694427\n", - " 0.027445\n", + " 0.302756\n", + " -1.675891\n", + " 0.949161\n", + " \n", + " \n", + " 27\n", + " 0.227523\n", + " -1.776888\n", + " 0.658942\n", " \n", " \n", " 28\n", - " 1.923586\n", - " 1.888314\n", - " 0.027445\n", + " 1.318401\n", + " -0.362932\n", + " 1.747263\n", + " \n", + " \n", + " 29\n", + " 2.202388\n", + " 1.303516\n", + " 0.441278\n", " 
\n", " \n", " 30\n", - " 1.292312\n", - " 0.694427\n", - " 0.027445\n", + " -0.919779\n", + " 1.959995\n", + " -0.356824\n", " \n", " \n", " 31\n", - " -1.994029\n", - " -0.551368\n", - " -1.62502\n", + " 1.036277\n", + " -0.615424\n", + " 1.747263\n", + " \n", + " \n", + " 32\n", + " -0.223874\n", + " 0.19255\n", + " -0.356824\n", " \n", " \n", "\n", @@ -1158,65 +1096,65 @@ "text/plain": [ " standard_scaled_culmen_length_mm standard_scaled_culmen_depth_mm \\\n", "penguin_id \n", - "0 -1.344188 0.642519 \n", - "1 -0.750047 1.005876 \n", - "2 -0.545811 0.90206 \n", - "4 -1.214219 -0.188011 \n", - "5 -0.118772 0.694427 \n", - "6 0.568203 -0.291828 \n", - "7 1.236611 0.642519 \n", - "9 -0.675779 1.524957 \n", - "10 -0.564378 0.90206 \n", - "11 -0.898582 0.798243 \n", - "12 -1.26992 -0.136103 \n", - "13 0.58677 0.071529 \n", - "14 -1.826927 -0.032287 \n", - "15 -1.26992 -0.343736 \n", - "16 0.3454 0.071529 \n", - "18 -0.768614 0.382978 \n", - "19 -1.121385 0.486795 \n", - "20 0.512502 0.33107 \n", - "21 1.385146 1.057784 \n", - "22 -0.675779 -0.032287 \n", - "24 1.069509 0.538703 \n", - "26 -0.43441 0.694427 \n", - "28 1.923586 1.888314 \n", - "30 1.292312 0.694427 \n", - "31 -1.994029 -0.551368 \n", + "0 -0.750505 0.84903 \n", + "2 0.622496 -1.322402 \n", + "3 -0.299107 -0.261935 \n", + "5 0.490839 -0.968913 \n", + "6 -0.524806 1.959995 \n", + "7 0.208715 -1.726389 \n", + "9 1.205551 -1.019412 \n", + "10 0.772962 -0.817418 \n", + "12 1.243168 -1.120408 \n", + "14 -1.709725 0.344046 \n", + "17 0.509647 -0.918415 \n", + "18 1.167935 1.404513 \n", + "19 -1.295944 0.445043 \n", + "20 1.299593 0.798532 \n", + "21 -0.675272 0.344046 \n", + "22 0.26514 -1.675891 \n", + "24 1.43125 1.556008 \n", + "25 0.302756 0.041055 \n", + "26 0.302756 -1.675891 \n", + "27 0.227523 -1.776888 \n", + "28 1.318401 -0.362932 \n", + "29 2.202388 1.303516 \n", + "30 -0.919779 1.959995 \n", + "31 1.036277 -0.615424 \n", + "32 -0.223874 0.19255 \n", "\n", " standard_scaled_flipper_length_mm \n", 
"penguin_id \n", - "0 -1.193942 \n", - "1 -1.193942 \n", - "2 -1.193942 \n", - "4 -0.619171 \n", - "5 -0.619171 \n", - "6 -0.619171 \n", - "7 -0.044401 \n", - "9 -0.044401 \n", - "10 0.530369 \n", - "11 -1.122096 \n", - "12 -1.122096 \n", - "13 -1.122096 \n", - "14 -1.122096 \n", - "15 -1.122096 \n", - "16 -0.547325 \n", - "18 -0.547325 \n", - "19 -0.547325 \n", - "20 -0.547325 \n", - "21 -0.547325 \n", - "22 -0.547325 \n", - "24 -0.547325 \n", - "26 0.027445 \n", - "28 0.027445 \n", - "30 0.027445 \n", - "31 -1.62502 \n", + "0 -0.937262 \n", + "2 0.804051 \n", + "3 -1.009817 \n", + "5 1.311935 \n", + "6 -0.429379 \n", + "7 1.021716 \n", + "9 1.09427 \n", + "10 1.457044 \n", + "12 1.602153 \n", + "14 -0.792152 \n", + "17 1.021716 \n", + "18 -0.284269 \n", + "19 -1.662809 \n", + "20 0.151059 \n", + "21 -1.009817 \n", + "22 0.658942 \n", + "24 -0.501934 \n", + "25 -0.574488 \n", + "26 0.949161 \n", + "27 0.658942 \n", + "28 1.747263 \n", + "29 0.441278 \n", + "30 -0.356824 \n", + "31 1.747263 \n", + "32 -0.356824 \n", "...\n", "\n", "[267 rows x 3 columns]" ] }, - "execution_count": 22, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -1237,32 +1175,16 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 6, "metadata": {}, "outputs": [ { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "74f3c24c0a434e12bf6a56dc4809b501", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HTML(value='Query job c6268b07-0d3d-4fe0-971d-cc99fd98cd7e is RUNNING. Open Job" + ], "text/plain": [ - "HTML(value='Query job 31550d88-fc7b-4fcb-9975-9ed24bf2e009 is RUNNING. " ] }, "metadata": {}, @@ -1270,13 +1192,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "5a04e46a7d0248b1ae523f2ca6903ee8", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job 1e17f5f7-2956-4bdd-baa9-c07591481341 is DONE. 536 Bytes processed. 
Open Job" + ], "text/plain": [ - "HTML(value='Query job 5ec7c8b1-037c-466c-a51e-963f8274e76b is RUNNING. " ] }, "metadata": {}, @@ -1284,13 +1204,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "62563820bfb245be85bbc1bf3dfb993c", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job e2fde7a6-67b4-45a4-91d4-1cb9eff66ae5 is DONE. 0 Bytes processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job 4e860716-bc41-4ef6-83ff-310d085ed7cc is DONE. 0 Bytes processed. " ] }, "metadata": {}, @@ -1298,13 +1216,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "98aff3bfded44868bf120451c89df9f5", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job e0683619-23c5-44fd-8930-9d3c9d02729a is DONE. 2.1 kB processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job 6b96a757-42fe-4b65-92fd-a3ae339fe769 is RUNNING. " ] }, "metadata": {}, @@ -1344,154 +1260,154 @@ " \n", " \n", " \n", - " 3\n", - " 0.493935\n", - " 0.382978\n", - " -0.619171\n", - " \n", - " \n", - " 8\n", - " 1.050942\n", - " 0.953968\n", - " -0.044401\n", + " 1\n", + " -0.938587\n", + " 0.748033\n", + " -1.445145\n", " \n", " \n", - " 17\n", - " 1.255178\n", - " 1.1616\n", - " -0.547325\n", + " 4\n", + " -0.16745\n", + " 0.899528\n", + " -0.284269\n", " \n", " \n", - " 23\n", - " -1.307054\n", - " 0.694427\n", - " -0.547325\n", + " 8\n", + " 0.453222\n", + " -1.877885\n", + " 0.658942\n", " \n", " \n", - " 25\n", - " 1.515114\n", - " 0.486795\n", - " 0.027445\n", + " 11\n", + " -1.12667\n", + " 0.697535\n", + " -0.792152\n", " \n", " \n", - " 27\n", - " 1.236611\n", - " 1.265417\n", - " 0.027445\n", + " 13\n", + " -1.183094\n", + " 1.404513\n", + " -0.792152\n", " \n", " \n", - " 29\n", - " 1.403713\n", - " 0.953968\n", - " 0.027445\n", + " 15\n", + " 0.867003\n", + " -0.766919\n", + " 0.513833\n", " \n", " \n", - " 34\n", - " 0.419668\n", - " 0.538703\n", - " -1.62502\n", + " 16\n", + " 
-1.784958\n", + " 1.959995\n", + " -0.211715\n", " \n", " \n", - " 35\n", - " -1.455589\n", - " 0.694427\n", - " -1.050249\n", + " 23\n", + " -0.355532\n", + " 0.647036\n", + " -1.5177\n", " \n", " \n", - " 39\n", - " 0.326833\n", - " 1.1616\n", - " -0.475479\n", + " 34\n", + " -0.600039\n", + " -1.776888\n", + " 0.949161\n", " \n", " \n", - " 51\n", - " -1.065684\n", - " 0.227254\n", - " -0.978403\n", + " 36\n", + " -0.129833\n", + " -1.423399\n", + " 1.23938\n", " \n", " \n", - " 52\n", - " -0.248741\n", - " 0.071529\n", - " -0.978403\n", + " 42\n", + " -1.615684\n", + " -0.514427\n", + " -0.429379\n", " \n", " \n", - " 60\n", - " 0.531069\n", - " 0.382978\n", - " -0.403633\n", + " 48\n", + " 0.415606\n", + " -0.716421\n", + " 1.021716\n", " \n", " \n", " 61\n", - " 0.401101\n", - " 0.90206\n", - " -0.403633\n", + " 0.396797\n", + " -1.170907\n", + " 1.457044\n", " \n", " \n", " 64\n", - " -1.455589\n", - " 0.33107\n", - " -0.403633\n", + " 0.434414\n", + " -1.120408\n", + " 1.09427\n", " \n", " \n", " 65\n", - " -0.564378\n", - " 0.642519\n", - " -0.403633\n", + " -1.220711\n", + " 1.051024\n", + " -1.445145\n", " \n", " \n", - " 67\n", - " 1.273745\n", - " 1.317325\n", - " 0.171138\n", + " 68\n", + " -1.484026\n", + " -0.009443\n", + " -1.009817\n", " \n", " \n", - " 83\n", - " 2.629128\n", - " 0.33107\n", - " -1.409481\n", + " 70\n", + " 1.638141\n", + " 1.404513\n", + " 0.296168\n", " \n", " \n", - " 85\n", - " -1.288487\n", - " 0.746335\n", - " -0.83471\n", + " 72\n", + " 0.829387\n", + " 0.142052\n", + " -0.719598\n", " \n", " \n", - " 93\n", - " -0.508677\n", - " 0.486795\n", - " 0.314831\n", + " 74\n", + " -0.242683\n", + " -1.524396\n", + " 0.586387\n", " \n", " \n", - " 104\n", - " 0.382534\n", - " -0.032287\n", - " -0.762864\n", + " 77\n", + " -1.277136\n", + " -0.211437\n", + " -0.647043\n", " \n", " \n", - " 105\n", - " -1.065684\n", - " 0.746335\n", - " -0.762864\n", + " 81\n", + " 0.208715\n", + " -1.221405\n", + " 0.804051\n", + " \n", + " \n", + 
" 91\n", + " 1.261976\n", + " 0.647036\n", + " 0.005949\n", " \n", " \n", - " 108\n", - " 1.162343\n", - " 0.382978\n", - " -0.762864\n", + " 96\n", + " 0.246331\n", + " -1.322402\n", + " 0.731497\n", " \n", " \n", - " 113\n", - " 1.496547\n", - " 1.213509\n", - " 0.386677\n", + " 105\n", + " -1.803766\n", + " 0.445043\n", + " -1.009817\n", " \n", " \n", - " 130\n", - " -0.341575\n", - " 1.213509\n", - " -0.044401\n", + " 111\n", + " -1.164286\n", + " 0.697535\n", + " -2.098138\n", " \n", " \n", "\n", @@ -1501,65 +1417,65 @@ "text/plain": [ " standard_scaled_culmen_length_mm standard_scaled_culmen_depth_mm \\\n", "penguin_id \n", - "3 0.493935 0.382978 \n", - "8 1.050942 0.953968 \n", - "17 1.255178 1.1616 \n", - "23 -1.307054 0.694427 \n", - "25 1.515114 0.486795 \n", - "27 1.236611 1.265417 \n", - "29 1.403713 0.953968 \n", - "34 0.419668 0.538703 \n", - "35 -1.455589 0.694427 \n", - "39 0.326833 1.1616 \n", - "51 -1.065684 0.227254 \n", - "52 -0.248741 0.071529 \n", - "60 0.531069 0.382978 \n", - "61 0.401101 0.90206 \n", - "64 -1.455589 0.33107 \n", - "65 -0.564378 0.642519 \n", - "67 1.273745 1.317325 \n", - "83 2.629128 0.33107 \n", - "85 -1.288487 0.746335 \n", - "93 -0.508677 0.486795 \n", - "104 0.382534 -0.032287 \n", - "105 -1.065684 0.746335 \n", - "108 1.162343 0.382978 \n", - "113 1.496547 1.213509 \n", - "130 -0.341575 1.213509 \n", + "1 -0.938587 0.748033 \n", + "4 -0.16745 0.899528 \n", + "8 0.453222 -1.877885 \n", + "11 -1.12667 0.697535 \n", + "13 -1.183094 1.404513 \n", + "15 0.867003 -0.766919 \n", + "16 -1.784958 1.959995 \n", + "23 -0.355532 0.647036 \n", + "34 -0.600039 -1.776888 \n", + "36 -0.129833 -1.423399 \n", + "42 -1.615684 -0.514427 \n", + "48 0.415606 -0.716421 \n", + "61 0.396797 -1.170907 \n", + "64 0.434414 -1.120408 \n", + "65 -1.220711 1.051024 \n", + "68 -1.484026 -0.009443 \n", + "70 1.638141 1.404513 \n", + "72 0.829387 0.142052 \n", + "74 -0.242683 -1.524396 \n", + "77 -1.277136 -0.211437 \n", + "81 0.208715 -1.221405 \n", 
+ "91 1.261976 0.647036 \n", + "96 0.246331 -1.322402 \n", + "105 -1.803766 0.445043 \n", + "111 -1.164286 0.697535 \n", "\n", " standard_scaled_flipper_length_mm \n", "penguin_id \n", - "3 -0.619171 \n", - "8 -0.044401 \n", - "17 -0.547325 \n", - "23 -0.547325 \n", - "25 0.027445 \n", - "27 0.027445 \n", - "29 0.027445 \n", - "34 -1.62502 \n", - "35 -1.050249 \n", - "39 -0.475479 \n", - "51 -0.978403 \n", - "52 -0.978403 \n", - "60 -0.403633 \n", - "61 -0.403633 \n", - "64 -0.403633 \n", - "65 -0.403633 \n", - "67 0.171138 \n", - "83 -1.409481 \n", - "85 -0.83471 \n", - "93 0.314831 \n", - "104 -0.762864 \n", - "105 -0.762864 \n", - "108 -0.762864 \n", - "113 0.386677 \n", - "130 -0.044401 \n", + "1 -1.445145 \n", + "4 -0.284269 \n", + "8 0.658942 \n", + "11 -0.792152 \n", + "13 -0.792152 \n", + "15 0.513833 \n", + "16 -0.211715 \n", + "23 -1.5177 \n", + "34 0.949161 \n", + "36 1.23938 \n", + "42 -0.429379 \n", + "48 1.021716 \n", + "61 1.457044 \n", + "64 1.09427 \n", + "65 -1.445145 \n", + "68 -1.009817 \n", + "70 0.296168 \n", + "72 -0.719598 \n", + "74 0.586387 \n", + "77 -0.647043 \n", + "81 0.804051 \n", + "91 0.005949 \n", + "96 0.731497 \n", + "105 -1.009817 \n", + "111 -2.098138 \n", "...\n", "\n", "[67 rows x 3 columns]" ] }, - "execution_count": 23, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -1581,32 +1497,16 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 7, "metadata": {}, "outputs": [ { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "d642a617d27f4e2493c80dbdd1686193", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HTML(value='Query job a8d8afa4-d91e-487e-8709-8727a73ab453 is RUNNING. Open Job" + ], "text/plain": [ - "HTML(value='Query job b9afd624-4345-4160-8809-05786563ce35 is RUNNING. 
" ] }, "metadata": {}, @@ -1614,13 +1514,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "09217776c2294e8b929a56e7a73fbfa8", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job 41962e2e-4d14-4053-9297-3ce61699551a is DONE. 0 Bytes processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job c918fc7c-a956-4259-b5c5-09c2eac615cd is RUNNING. " ] }, "metadata": {}, @@ -1628,13 +1526,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "9c1581fc9fcb49739d1d81b73506b894", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job 5d3c22c9-c972-4213-8557-726c9e0aca37 is DONE. 22.9 kB processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job 1d855341-282f-4d10-9ba9-3ce6683b729a is RUNNING. " ] }, "metadata": {}, @@ -1642,13 +1538,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "d7749eb7cf554697a60c90f3718ad582", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job 9cb7b33f-ea05-4cf4-9f92-bb3aa4ea8d10 is DONE. 2.1 kB processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job c257ff78-3e15-4296-82f5-ba6c2eb6a6ff is RUNNING. " ] }, "metadata": {}, @@ -1656,13 +1550,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "e900465918224249bccc781d992aadbb", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job fe1f35d6-d82c-4aab-a284-637b72554f5b is DONE. 29.2 kB processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job a17eec0c-10d0-4943-95be-60fced57d5cb is RUNNING. " ] }, "metadata": {}, @@ -1670,13 +1562,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "b0272ee35c5745a491b7c5883b3fbb1b", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job 37bc90ff-59cb-4b0c-8f9d-73bcda43524a is DONE. 536 Bytes processed. 
Open Job" + ], "text/plain": [ - "HTML(value='Query job 1db53c8a-cf45-4c69-a443-6b7a49fc3a07 is DONE. 536 Bytes processed. " ] }, "metadata": {}, @@ -1684,13 +1574,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "00f9d4b55bb94997aaebdae298cefab3", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job e23f4724-fdd8-45a9-8c87-defd8d471035 is DONE. 0 Bytes processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job ae870ee3-e633-4556-94e6-6669fa0bfde2 is DONE. 0 Bytes processed. " ] }, "metadata": {}, @@ -1698,13 +1586,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "9cd8e791be5844669cba10dc53f862ae", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job 257378db-0569-42d7-965a-7757154c710b is DONE. 21.4 kB processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job baa7c70c-eabc-49e1-bae9-fdd4891cdb6e is RUNNING. " ] }, "metadata": {}, @@ -1751,226 +1637,226 @@ " \n", " \n", " 0\n", - " [{'index': 2, 'value': 1.0}]\n", - " -1.344188\n", - " 0.642519\n", - " -1.193942\n", - " [{'index': 2, 'value': 1.0}]\n", " [{'index': 1, 'value': 1.0}]\n", - " \n", - " \n", - " 1\n", + " -0.750505\n", + " 0.84903\n", + " -0.937262\n", " [{'index': 2, 'value': 1.0}]\n", - " -0.750047\n", - " 1.005876\n", - " -1.193942\n", - " [{'index': 3, 'value': 1.0}]\n", " [{'index': 1, 'value': 1.0}]\n", " \n", " \n", " 2\n", - " [{'index': 2, 'value': 1.0}]\n", - " -0.545811\n", - " 0.90206\n", - " -1.193942\n", - " [{'index': 3, 'value': 1.0}]\n", " [{'index': 1, 'value': 1.0}]\n", + " 0.622496\n", + " -1.322402\n", + " 0.804051\n", + " [{'index': 1, 'value': 1.0}]\n", + " [{'index': 3, 'value': 1.0}]\n", " \n", " \n", - " 4\n", - " [{'index': 2, 'value': 1.0}]\n", - " -1.214219\n", - " -0.188011\n", - " -0.619171\n", + " 3\n", " [{'index': 2, 'value': 1.0}]\n", + " -0.299107\n", + " -0.261935\n", + " -1.009817\n", " [{'index': 1, 'value': 1.0}]\n", + " 
[{'index': 2, 'value': 1.0}]\n", " \n", " \n", " 5\n", + " [{'index': 1, 'value': 1.0}]\n", + " 0.490839\n", + " -0.968913\n", + " 1.311935\n", " [{'index': 2, 'value': 1.0}]\n", - " -0.118772\n", - " 0.694427\n", - " -0.619171\n", " [{'index': 3, 'value': 1.0}]\n", - " [{'index': 1, 'value': 1.0}]\n", " \n", " \n", " 6\n", + " [{'index': 1, 'value': 1.0}]\n", + " -0.524806\n", + " 1.959995\n", + " -0.429379\n", " [{'index': 2, 'value': 1.0}]\n", - " 0.568203\n", - " -0.291828\n", - " -0.619171\n", - " [{'index': 2, 'value': 1.0}]\n", - " [{'index': 2, 'value': 1.0}]\n", + " [{'index': 1, 'value': 1.0}]\n", " \n", " \n", " 7\n", - " [{'index': 2, 'value': 1.0}]\n", - " 1.236611\n", - " 0.642519\n", - " -0.044401\n", - " [{'index': 2, 'value': 1.0}]\n", - " [{'index': 2, 'value': 1.0}]\n", + " [{'index': 1, 'value': 1.0}]\n", + " 0.208715\n", + " -1.726389\n", + " 1.021716\n", + " [{'index': 1, 'value': 1.0}]\n", + " [{'index': 3, 'value': 1.0}]\n", " \n", " \n", " 9\n", - " [{'index': 2, 'value': 1.0}]\n", - " -0.675779\n", - " 1.524957\n", - " -0.044401\n", - " [{'index': 3, 'value': 1.0}]\n", " [{'index': 1, 'value': 1.0}]\n", + " 1.205551\n", + " -1.019412\n", + " 1.09427\n", + " [{'index': 1, 'value': 1.0}]\n", + " [{'index': 3, 'value': 1.0}]\n", " \n", " \n", " 10\n", + " [{'index': 1, 'value': 1.0}]\n", + " 0.772962\n", + " -0.817418\n", + " 1.457044\n", " [{'index': 2, 'value': 1.0}]\n", - " -0.564378\n", - " 0.90206\n", - " 0.530369\n", " [{'index': 3, 'value': 1.0}]\n", - " [{'index': 1, 'value': 1.0}]\n", " \n", " \n", - " 11\n", + " 12\n", + " [{'index': 1, 'value': 1.0}]\n", + " 1.243168\n", + " -1.120408\n", + " 1.602153\n", " [{'index': 2, 'value': 1.0}]\n", - " -0.898582\n", - " 0.798243\n", - " -1.122096\n", " [{'index': 3, 'value': 1.0}]\n", - " [{'index': 1, 'value': 1.0}]\n", " \n", " \n", - " 12\n", - " [{'index': 2, 'value': 1.0}]\n", - " -1.26992\n", - " -0.136103\n", - " -1.122096\n", - " [{'index': 2, 'value': 1.0}]\n", + " 14\n", + " 
[{'index': 1, 'value': 1.0}]\n", + " -1.709725\n", + " 0.344046\n", + " -0.792152\n", + " [{'index': 1, 'value': 1.0}]\n", " [{'index': 1, 'value': 1.0}]\n", " \n", " \n", - " 13\n", - " [{'index': 2, 'value': 1.0}]\n", - " 0.58677\n", - " 0.071529\n", - " -1.122096\n", - " [{'index': 2, 'value': 1.0}]\n", + " 17\n", + " [{'index': 1, 'value': 1.0}]\n", + " 0.509647\n", + " -0.918415\n", + " 1.021716\n", " [{'index': 2, 'value': 1.0}]\n", + " [{'index': 3, 'value': 1.0}]\n", " \n", " \n", - " 14\n", + " 18\n", + " [{'index': 2, 'value': 1.0}]\n", + " 1.167935\n", + " 1.404513\n", + " -0.284269\n", " [{'index': 2, 'value': 1.0}]\n", - " -1.826927\n", - " -0.032287\n", - " -1.122096\n", " [{'index': 2, 'value': 1.0}]\n", - " [{'index': 1, 'value': 1.0}]\n", " \n", " \n", - " 15\n", + " 19\n", " [{'index': 2, 'value': 1.0}]\n", - " -1.26992\n", - " -0.343736\n", - " -1.122096\n", + " -1.295944\n", + " 0.445043\n", + " -1.662809\n", " [{'index': 2, 'value': 1.0}]\n", " [{'index': 1, 'value': 1.0}]\n", " \n", " \n", - " 16\n", + " 20\n", " [{'index': 2, 'value': 1.0}]\n", - " 0.3454\n", - " 0.071529\n", - " -0.547325\n", + " 1.299593\n", + " 0.798532\n", + " 0.151059\n", " [{'index': 2, 'value': 1.0}]\n", " [{'index': 2, 'value': 1.0}]\n", " \n", " \n", - " 18\n", - " [{'index': 2, 'value': 1.0}]\n", - " -0.768614\n", - " 0.382978\n", - " -0.547325\n", - " [{'index': 3, 'value': 1.0}]\n", + " 21\n", + " [{'index': 1, 'value': 1.0}]\n", + " -0.675272\n", + " 0.344046\n", + " -1.009817\n", + " [{'index': 1, 'value': 1.0}]\n", " [{'index': 1, 'value': 1.0}]\n", " \n", " \n", - " 19\n", - " [{'index': 2, 'value': 1.0}]\n", - " -1.121385\n", - " 0.486795\n", - " -0.547325\n", - " [{'index': 3, 'value': 1.0}]\n", + " 22\n", + " [{'index': 1, 'value': 1.0}]\n", + " 0.26514\n", + " -1.675891\n", + " 0.658942\n", " [{'index': 1, 'value': 1.0}]\n", + " [{'index': 3, 'value': 1.0}]\n", " \n", " \n", - " 20\n", + " 24\n", " [{'index': 2, 'value': 1.0}]\n", - " 0.512502\n", - " 
0.33107\n", - " -0.547325\n", + " 1.43125\n", + " 1.556008\n", + " -0.501934\n", " [{'index': 2, 'value': 1.0}]\n", " [{'index': 2, 'value': 1.0}]\n", " \n", " \n", - " 21\n", + " 25\n", " [{'index': 2, 'value': 1.0}]\n", - " 1.385146\n", - " 1.057784\n", - " -0.547325\n", - " [{'index': 3, 'value': 1.0}]\n", + " 0.302756\n", + " 0.041055\n", + " -0.574488\n", + " [{'index': 1, 'value': 1.0}]\n", " [{'index': 2, 'value': 1.0}]\n", " \n", " \n", - " 22\n", - " [{'index': 2, 'value': 1.0}]\n", - " -0.675779\n", - " -0.032287\n", - " -0.547325\n", - " [{'index': 2, 'value': 1.0}]\n", + " 26\n", + " [{'index': 1, 'value': 1.0}]\n", + " 0.302756\n", + " -1.675891\n", + " 0.949161\n", " [{'index': 1, 'value': 1.0}]\n", + " [{'index': 3, 'value': 1.0}]\n", " \n", " \n", - " 24\n", - " [{'index': 2, 'value': 1.0}]\n", - " 1.069509\n", - " 0.538703\n", - " -0.547325\n", + " 27\n", + " [{'index': 1, 'value': 1.0}]\n", + " 0.227523\n", + " -1.776888\n", + " 0.658942\n", + " [{'index': 1, 'value': 1.0}]\n", " [{'index': 3, 'value': 1.0}]\n", - " [{'index': 2, 'value': 1.0}]\n", " \n", " \n", - " 26\n", + " 28\n", + " [{'index': 1, 'value': 1.0}]\n", + " 1.318401\n", + " -0.362932\n", + " 1.747263\n", " [{'index': 2, 'value': 1.0}]\n", - " -0.43441\n", - " 0.694427\n", - " 0.027445\n", " [{'index': 3, 'value': 1.0}]\n", - " [{'index': 1, 'value': 1.0}]\n", " \n", " \n", - " 28\n", + " 29\n", + " [{'index': 2, 'value': 1.0}]\n", + " 2.202388\n", + " 1.303516\n", + " 0.441278\n", " [{'index': 2, 'value': 1.0}]\n", - " 1.923586\n", - " 1.888314\n", - " 0.027445\n", - " [{'index': 3, 'value': 1.0}]\n", " [{'index': 2, 'value': 1.0}]\n", " \n", " \n", " 30\n", " [{'index': 2, 'value': 1.0}]\n", - " 1.292312\n", - " 0.694427\n", - " 0.027445\n", - " [{'index': 3, 'value': 1.0}]\n", + " -0.919779\n", + " 1.959995\n", + " -0.356824\n", " [{'index': 2, 'value': 1.0}]\n", + " [{'index': 1, 'value': 1.0}]\n", " \n", " \n", " 31\n", + " [{'index': 1, 'value': 1.0}]\n", + " 1.036277\n", + " 
-0.615424\n", + " 1.747263\n", " [{'index': 2, 'value': 1.0}]\n", - " -1.994029\n", - " -0.551368\n", - " -1.62502\n", + " [{'index': 3, 'value': 1.0}]\n", + " \n", + " \n", + " 32\n", + " [{'index': 3, 'value': 1.0}]\n", + " -0.223874\n", + " 0.19255\n", + " -0.356824\n", " [{'index': 2, 'value': 1.0}]\n", " [{'index': 1, 'value': 1.0}]\n", " \n", @@ -1982,121 +1868,121 @@ "text/plain": [ " onehotencoded_island standard_scaled_culmen_length_mm \\\n", "penguin_id \n", - "0 [{'index': 2, 'value': 1.0}] -1.344188 \n", - "1 [{'index': 2, 'value': 1.0}] -0.750047 \n", - "2 [{'index': 2, 'value': 1.0}] -0.545811 \n", - "4 [{'index': 2, 'value': 1.0}] -1.214219 \n", - "5 [{'index': 2, 'value': 1.0}] -0.118772 \n", - "6 [{'index': 2, 'value': 1.0}] 0.568203 \n", - "7 [{'index': 2, 'value': 1.0}] 1.236611 \n", - "9 [{'index': 2, 'value': 1.0}] -0.675779 \n", - "10 [{'index': 2, 'value': 1.0}] -0.564378 \n", - "11 [{'index': 2, 'value': 1.0}] -0.898582 \n", - "12 [{'index': 2, 'value': 1.0}] -1.26992 \n", - "13 [{'index': 2, 'value': 1.0}] 0.58677 \n", - "14 [{'index': 2, 'value': 1.0}] -1.826927 \n", - "15 [{'index': 2, 'value': 1.0}] -1.26992 \n", - "16 [{'index': 2, 'value': 1.0}] 0.3454 \n", - "18 [{'index': 2, 'value': 1.0}] -0.768614 \n", - "19 [{'index': 2, 'value': 1.0}] -1.121385 \n", - "20 [{'index': 2, 'value': 1.0}] 0.512502 \n", - "21 [{'index': 2, 'value': 1.0}] 1.385146 \n", - "22 [{'index': 2, 'value': 1.0}] -0.675779 \n", - "24 [{'index': 2, 'value': 1.0}] 1.069509 \n", - "26 [{'index': 2, 'value': 1.0}] -0.43441 \n", - "28 [{'index': 2, 'value': 1.0}] 1.923586 \n", - "30 [{'index': 2, 'value': 1.0}] 1.292312 \n", - "31 [{'index': 2, 'value': 1.0}] -1.994029 \n", + "0 [{'index': 1, 'value': 1.0}] -0.750505 \n", + "2 [{'index': 1, 'value': 1.0}] 0.622496 \n", + "3 [{'index': 2, 'value': 1.0}] -0.299107 \n", + "5 [{'index': 1, 'value': 1.0}] 0.490839 \n", + "6 [{'index': 1, 'value': 1.0}] -0.524806 \n", + "7 [{'index': 1, 'value': 1.0}] 0.208715 \n", + "9 
[{'index': 1, 'value': 1.0}] 1.205551 \n", + "10 [{'index': 1, 'value': 1.0}] 0.772962 \n", + "12 [{'index': 1, 'value': 1.0}] 1.243168 \n", + "14 [{'index': 1, 'value': 1.0}] -1.709725 \n", + "17 [{'index': 1, 'value': 1.0}] 0.509647 \n", + "18 [{'index': 2, 'value': 1.0}] 1.167935 \n", + "19 [{'index': 2, 'value': 1.0}] -1.295944 \n", + "20 [{'index': 2, 'value': 1.0}] 1.299593 \n", + "21 [{'index': 1, 'value': 1.0}] -0.675272 \n", + "22 [{'index': 1, 'value': 1.0}] 0.26514 \n", + "24 [{'index': 2, 'value': 1.0}] 1.43125 \n", + "25 [{'index': 2, 'value': 1.0}] 0.302756 \n", + "26 [{'index': 1, 'value': 1.0}] 0.302756 \n", + "27 [{'index': 1, 'value': 1.0}] 0.227523 \n", + "28 [{'index': 1, 'value': 1.0}] 1.318401 \n", + "29 [{'index': 2, 'value': 1.0}] 2.202388 \n", + "30 [{'index': 2, 'value': 1.0}] -0.919779 \n", + "31 [{'index': 1, 'value': 1.0}] 1.036277 \n", + "32 [{'index': 3, 'value': 1.0}] -0.223874 \n", "\n", " standard_scaled_culmen_depth_mm \\\n", "penguin_id \n", - "0 0.642519 \n", - "1 1.005876 \n", - "2 0.90206 \n", - "4 -0.188011 \n", - "5 0.694427 \n", - "6 -0.291828 \n", - "7 0.642519 \n", - "9 1.524957 \n", - "10 0.90206 \n", - "11 0.798243 \n", - "12 -0.136103 \n", - "13 0.071529 \n", - "14 -0.032287 \n", - "15 -0.343736 \n", - "16 0.071529 \n", - "18 0.382978 \n", - "19 0.486795 \n", - "20 0.33107 \n", - "21 1.057784 \n", - "22 -0.032287 \n", - "24 0.538703 \n", - "26 0.694427 \n", - "28 1.888314 \n", - "30 0.694427 \n", - "31 -0.551368 \n", + "0 0.84903 \n", + "2 -1.322402 \n", + "3 -0.261935 \n", + "5 -0.968913 \n", + "6 1.959995 \n", + "7 -1.726389 \n", + "9 -1.019412 \n", + "10 -0.817418 \n", + "12 -1.120408 \n", + "14 0.344046 \n", + "17 -0.918415 \n", + "18 1.404513 \n", + "19 0.445043 \n", + "20 0.798532 \n", + "21 0.344046 \n", + "22 -1.675891 \n", + "24 1.556008 \n", + "25 0.041055 \n", + "26 -1.675891 \n", + "27 -1.776888 \n", + "28 -0.362932 \n", + "29 1.303516 \n", + "30 1.959995 \n", + "31 -0.615424 \n", + "32 0.19255 \n", "\n", " 
standard_scaled_flipper_length_mm onehotencoded_sex \\\n", "penguin_id \n", - "0 -1.193942 [{'index': 2, 'value': 1.0}] \n", - "1 -1.193942 [{'index': 3, 'value': 1.0}] \n", - "2 -1.193942 [{'index': 3, 'value': 1.0}] \n", - "4 -0.619171 [{'index': 2, 'value': 1.0}] \n", - "5 -0.619171 [{'index': 3, 'value': 1.0}] \n", - "6 -0.619171 [{'index': 2, 'value': 1.0}] \n", - "7 -0.044401 [{'index': 2, 'value': 1.0}] \n", - "9 -0.044401 [{'index': 3, 'value': 1.0}] \n", - "10 0.530369 [{'index': 3, 'value': 1.0}] \n", - "11 -1.122096 [{'index': 3, 'value': 1.0}] \n", - "12 -1.122096 [{'index': 2, 'value': 1.0}] \n", - "13 -1.122096 [{'index': 2, 'value': 1.0}] \n", - "14 -1.122096 [{'index': 2, 'value': 1.0}] \n", - "15 -1.122096 [{'index': 2, 'value': 1.0}] \n", - "16 -0.547325 [{'index': 2, 'value': 1.0}] \n", - "18 -0.547325 [{'index': 3, 'value': 1.0}] \n", - "19 -0.547325 [{'index': 3, 'value': 1.0}] \n", - "20 -0.547325 [{'index': 2, 'value': 1.0}] \n", - "21 -0.547325 [{'index': 3, 'value': 1.0}] \n", - "22 -0.547325 [{'index': 2, 'value': 1.0}] \n", - "24 -0.547325 [{'index': 3, 'value': 1.0}] \n", - "26 0.027445 [{'index': 3, 'value': 1.0}] \n", - "28 0.027445 [{'index': 3, 'value': 1.0}] \n", - "30 0.027445 [{'index': 3, 'value': 1.0}] \n", - "31 -1.62502 [{'index': 2, 'value': 1.0}] \n", + "0 -0.937262 [{'index': 2, 'value': 1.0}] \n", + "2 0.804051 [{'index': 1, 'value': 1.0}] \n", + "3 -1.009817 [{'index': 1, 'value': 1.0}] \n", + "5 1.311935 [{'index': 2, 'value': 1.0}] \n", + "6 -0.429379 [{'index': 2, 'value': 1.0}] \n", + "7 1.021716 [{'index': 1, 'value': 1.0}] \n", + "9 1.09427 [{'index': 1, 'value': 1.0}] \n", + "10 1.457044 [{'index': 2, 'value': 1.0}] \n", + "12 1.602153 [{'index': 2, 'value': 1.0}] \n", + "14 -0.792152 [{'index': 1, 'value': 1.0}] \n", + "17 1.021716 [{'index': 2, 'value': 1.0}] \n", + "18 -0.284269 [{'index': 2, 'value': 1.0}] \n", + "19 -1.662809 [{'index': 2, 'value': 1.0}] \n", + "20 0.151059 [{'index': 2, 'value': 1.0}] \n", + 
"21 -1.009817 [{'index': 1, 'value': 1.0}] \n", + "22 0.658942 [{'index': 1, 'value': 1.0}] \n", + "24 -0.501934 [{'index': 2, 'value': 1.0}] \n", + "25 -0.574488 [{'index': 1, 'value': 1.0}] \n", + "26 0.949161 [{'index': 1, 'value': 1.0}] \n", + "27 0.658942 [{'index': 1, 'value': 1.0}] \n", + "28 1.747263 [{'index': 2, 'value': 1.0}] \n", + "29 0.441278 [{'index': 2, 'value': 1.0}] \n", + "30 -0.356824 [{'index': 2, 'value': 1.0}] \n", + "31 1.747263 [{'index': 2, 'value': 1.0}] \n", + "32 -0.356824 [{'index': 2, 'value': 1.0}] \n", "\n", " onehotencoded_species \n", "penguin_id \n", "0 [{'index': 1, 'value': 1.0}] \n", - "1 [{'index': 1, 'value': 1.0}] \n", - "2 [{'index': 1, 'value': 1.0}] \n", - "4 [{'index': 1, 'value': 1.0}] \n", - "5 [{'index': 1, 'value': 1.0}] \n", - "6 [{'index': 2, 'value': 1.0}] \n", - "7 [{'index': 2, 'value': 1.0}] \n", - "9 [{'index': 1, 'value': 1.0}] \n", - "10 [{'index': 1, 'value': 1.0}] \n", - "11 [{'index': 1, 'value': 1.0}] \n", - "12 [{'index': 1, 'value': 1.0}] \n", - "13 [{'index': 2, 'value': 1.0}] \n", + "2 [{'index': 3, 'value': 1.0}] \n", + "3 [{'index': 2, 'value': 1.0}] \n", + "5 [{'index': 3, 'value': 1.0}] \n", + "6 [{'index': 1, 'value': 1.0}] \n", + "7 [{'index': 3, 'value': 1.0}] \n", + "9 [{'index': 3, 'value': 1.0}] \n", + "10 [{'index': 3, 'value': 1.0}] \n", + "12 [{'index': 3, 'value': 1.0}] \n", "14 [{'index': 1, 'value': 1.0}] \n", - "15 [{'index': 1, 'value': 1.0}] \n", - "16 [{'index': 2, 'value': 1.0}] \n", - "18 [{'index': 1, 'value': 1.0}] \n", + "17 [{'index': 3, 'value': 1.0}] \n", + "18 [{'index': 2, 'value': 1.0}] \n", "19 [{'index': 1, 'value': 1.0}] \n", "20 [{'index': 2, 'value': 1.0}] \n", - "21 [{'index': 2, 'value': 1.0}] \n", - "22 [{'index': 1, 'value': 1.0}] \n", + "21 [{'index': 1, 'value': 1.0}] \n", + "22 [{'index': 3, 'value': 1.0}] \n", "24 [{'index': 2, 'value': 1.0}] \n", - "26 [{'index': 1, 'value': 1.0}] \n", - "28 [{'index': 2, 'value': 1.0}] \n", - "30 [{'index': 2, 'value': 
1.0}] \n", - "31 [{'index': 1, 'value': 1.0}] \n", + "25 [{'index': 2, 'value': 1.0}] \n", + "26 [{'index': 3, 'value': 1.0}] \n", + "27 [{'index': 3, 'value': 1.0}] \n", + "28 [{'index': 3, 'value': 1.0}] \n", + "29 [{'index': 2, 'value': 1.0}] \n", + "30 [{'index': 1, 'value': 1.0}] \n", + "31 [{'index': 3, 'value': 1.0}] \n", + "32 [{'index': 1, 'value': 1.0}] \n", "...\n", "\n", "[267 rows x 6 columns]" ] }, - "execution_count": 24, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -2138,18 +2024,28 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 8, "metadata": {}, "outputs": [ { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "5db4c5c80ba4417db151aa561dab5ee7", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job 7d9c9f8b-6b4c-451f-ae3d-06fb7090d148 is DONE. 21.4 kB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job be87ccfa-72ab-4858-9d4a-b2f5f8b2a5e6 is DONE. 28.9 kB processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job ceced0cc-13a7-4b14-b42c-4d5f69e7e49a is RUNNING. " ] }, "metadata": {}, @@ -2157,13 +2053,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "e6b05d83de0e496d9e47392762046fc5", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job 2d651fac-11bf-42da-8c18-bd33207379ca is DONE. 0 Bytes processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job a708b8df-6040-49b1-a6da-d2c0d162f247 is RUNNING. " ] }, "metadata": {}, @@ -2171,13 +2065,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "24d616c24a844abfbfd77ebd9f28486a", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job 58836ccc-242b-4574-bc48-4c269e74dbf1 is DONE. 5.7 kB processed. 
Open Job" + ], "text/plain": [ - "HTML(value='Query job e9b9cbb5-f6a4-4d85-ba78-1edae77dce94 is RUNNING. " ] }, "metadata": {}, @@ -2185,13 +2077,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "ce49b66c6fa0460aa3ee28746765b6ac", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job 1bf531f0-0fde-489b-ab36-6040a2a12377 is DONE. 536 Bytes processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job 6c0a41a7-a732-413a-b074-ba82f175eab8 is RUNNING. " ] }, "metadata": {}, @@ -2199,13 +2089,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "4a6010d73ca04ea9a133de99aa90da3c", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job 4245f4e6-4d5b-404f-81d7-50f0553e2456 is DONE. 0 Bytes processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job 2d08b79d-9c36-4db7-824a-332fdd02e9fc is DONE. 0 Bytes processed. " ] }, "metadata": {}, @@ -2213,13 +2101,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "ce9cfdca964a4062a52ebaae9d13ae59", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job ed951699-c005-450e-a8b6-0916ec234e7f is DONE. 5.9 kB processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job 7fa0bf53-1022-45ee-b3ac-78fa5c155585 is RUNNING. 
" ] }, "metadata": {}, @@ -2247,152 +2133,397 @@ " \n", " \n", " predicted_body_mass_g\n", + " onehotencoded_island\n", + " standard_scaled_culmen_length_mm\n", + " standard_scaled_culmen_depth_mm\n", + " standard_scaled_flipper_length_mm\n", + " onehotencoded_sex\n", + " onehotencoded_species\n", " \n", " \n", " penguin_id\n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " 3\n", - " 3394.118128\n", + " 1\n", + " 3781.402407\n", + " [{'index': 3, 'value': 1.0}]\n", + " -0.938587\n", + " 0.748033\n", + " -1.445145\n", + " [{'index': 2, 'value': 1.0}]\n", + " [{'index': 1, 'value': 1.0}]\n", " \n", " \n", - " 8\n", - " 4048.685642\n", + " 4\n", + " 4124.107944\n", + " [{'index': 1, 'value': 1.0}]\n", + " -0.16745\n", + " 0.899528\n", + " -0.284269\n", + " [{'index': 2, 'value': 1.0}]\n", + " [{'index': 1, 'value': 1.0}]\n", " \n", " \n", - " 17\n", - " 3976.454093\n", + " 8\n", + " 4670.344196\n", + " [{'index': 1, 'value': 1.0}]\n", + " 0.453222\n", + " -1.877885\n", + " 0.658942\n", + " [{'index': 1, 'value': 1.0}]\n", + " [{'index': 3, 'value': 1.0}]\n", " \n", " \n", - " 23\n", - " 3541.582194\n", + " 11\n", + " 3529.417214\n", + " [{'index': 2, 'value': 1.0}]\n", + " -1.12667\n", + " 0.697535\n", + " -0.792152\n", + " [{'index': 1, 'value': 1.0}]\n", + " [{'index': 1, 'value': 1.0}]\n", " \n", " \n", - " 25\n", - " 4032.844186\n", + " 13\n", + " 4014.101714\n", + " [{'index': 1, 'value': 1.0}]\n", + " -1.183094\n", + " 1.404513\n", + " -0.792152\n", + " [{'index': 2, 'value': 1.0}]\n", + " [{'index': 1, 'value': 1.0}]\n", " \n", " \n", - " 27\n", - " 4118.351772\n", + " 15\n", + " 5212.41288\n", + " [{'index': 1, 'value': 1.0}]\n", + " 0.867003\n", + " -0.766919\n", + " 0.513833\n", + " [{'index': 2, 'value': 1.0}]\n", + " [{'index': 3, 'value': 1.0}]\n", " \n", " \n", - " 29\n", - " 4087.767826\n", + " 16\n", + " 4163.595615\n", + " [{'index': 3, 'value': 1.0}]\n", + " -1.784958\n", + " 1.959995\n", + " 
-0.211715\n", + " [{'index': 2, 'value': 1.0}]\n", + " [{'index': 1, 'value': 1.0}]\n", " \n", " \n", - " 34\n", - " 3183.755249\n", + " 23\n", + " 3392.453069\n", + " [{'index': 2, 'value': 1.0}]\n", + " -0.355532\n", + " 0.647036\n", + " -1.5177\n", + " [{'index': 1, 'value': 1.0}]\n", + " [{'index': 1, 'value': 1.0}]\n", " \n", " \n", - " 35\n", - " 3418.802274\n", - " \n", - " \n", - " 39\n", - " 3519.186468\n", + " 34\n", + " 4698.305397\n", + " [{'index': 1, 'value': 1.0}]\n", + " -0.600039\n", + " -1.776888\n", + " 0.949161\n", + " [{'index': 1, 'value': 1.0}]\n", + " [{'index': 3, 'value': 1.0}]\n", " \n", " \n", - " 51\n", - " 3398.135365\n", + " 36\n", + " 4828.226949\n", + " [{'index': 1, 'value': 1.0}]\n", + " -0.129833\n", + " -1.423399\n", + " 1.23938\n", + " [{'index': 1, 'value': 1.0}]\n", + " [{'index': 3, 'value': 1.0}]\n", " \n", " \n", - " 52\n", - " 3223.615957\n", + " 42\n", + " 3430.58866\n", + " [{'index': 1, 'value': 1.0}]\n", + " -1.615684\n", + " -0.514427\n", + " -0.429379\n", + " [{'index': 1, 'value': 1.0}]\n", + " [{'index': 1, 'value': 1.0}]\n", " \n", " \n", - " 60\n", - " 3445.014718\n", + " 48\n", + " 5314.260221\n", + " [{'index': 1, 'value': 1.0}]\n", + " 0.415606\n", + " -0.716421\n", + " 1.021716\n", + " [{'index': 2, 'value': 1.0}]\n", + " [{'index': 3, 'value': 1.0}]\n", " \n", " \n", " 61\n", - " 3505.638864\n", + " 5363.205372\n", + " [{'index': 1, 'value': 1.0}]\n", + " 0.396797\n", + " -1.170907\n", + " 1.457044\n", + " [{'index': 2, 'value': 1.0}]\n", + " [{'index': 3, 'value': 1.0}]\n", " \n", " \n", " 64\n", - " 3515.905786\n", + " 4855.908314\n", + " [{'index': 1, 'value': 1.0}]\n", + " 0.434414\n", + " -1.120408\n", + " 1.09427\n", + " [{'index': 1, 'value': 1.0}]\n", + " [{'index': 3, 'value': 1.0}]\n", " \n", " \n", " 65\n", - " 4028.363185\n", + " 3413.100524\n", + " [{'index': 2, 'value': 1.0}]\n", + " -1.220711\n", + " 1.051024\n", + " -1.445145\n", + " [{'index': 1, 'value': 1.0}]\n", + " [{'index': 1, 
'value': 1.0}]\n", " \n", " \n", - " 67\n", - " 4159.993943\n", + " 68\n", + " 3340.219002\n", + " [{'index': 3, 'value': 1.0}]\n", + " -1.484026\n", + " -0.009443\n", + " -1.009817\n", + " [{'index': 1, 'value': 1.0}]\n", + " [{'index': 1, 'value': 1.0}]\n", " \n", " \n", - " 83\n", - " 3348.16883\n", + " 70\n", + " 4228.73157\n", + " [{'index': 2, 'value': 1.0}]\n", + " 1.638141\n", + " 1.404513\n", + " 0.296168\n", + " [{'index': 2, 'value': 1.0}]\n", + " [{'index': 2, 'value': 1.0}]\n", " \n", " \n", - " 85\n", - " 3485.050273\n", + " 72\n", + " 3811.538478\n", + " [{'index': 2, 'value': 1.0}]\n", + " 0.829387\n", + " 0.142052\n", + " -0.719598\n", + " [{'index': 2, 'value': 1.0}]\n", + " [{'index': 2, 'value': 1.0}]\n", " \n", " \n", - " 93\n", - " 4172.874548\n", + " 74\n", + " 4659.770763\n", + " [{'index': 1, 'value': 1.0}]\n", + " -0.242683\n", + " -1.524396\n", + " 0.586387\n", + " [{'index': 1, 'value': 1.0}]\n", + " [{'index': 3, 'value': 1.0}]\n", " \n", " \n", - " 104\n", - " 3299.302424\n", + " 77\n", + " 3453.388804\n", + " [{'index': 2, 'value': 1.0}]\n", + " -1.277136\n", + " -0.211437\n", + " -0.647043\n", + " [{'index': 1, 'value': 1.0}]\n", + " [{'index': 1, 'value': 1.0}]\n", " \n", " \n", - " 105\n", - " 3515.687917\n", + " 81\n", + " 4766.245033\n", + " [{'index': 1, 'value': 1.0}]\n", + " 0.208715\n", + " -1.221405\n", + " 0.804051\n", + " [{'index': 1, 'value': 1.0}]\n", + " [{'index': 3, 'value': 1.0}]\n", + " \n", + " \n", + " 91\n", + " 4057.807281\n", + " [{'index': 2, 'value': 1.0}]\n", + " 1.261976\n", + " 0.647036\n", + " 0.005949\n", + " [{'index': 2, 'value': 1.0}]\n", + " [{'index': 2, 'value': 1.0}]\n", " \n", " \n", - " 108\n", - " 3405.224618\n", + " 96\n", + " 4739.827445\n", + " [{'index': 1, 'value': 1.0}]\n", + " 0.246331\n", + " -1.322402\n", + " 0.731497\n", + " [{'index': 1, 'value': 1.0}]\n", + " [{'index': 3, 'value': 1.0}]\n", " \n", " \n", - " 113\n", - " 4209.140425\n", + " 105\n", + " 3394.891976\n", + " 
[{'index': 1, 'value': 1.0}]\n", + " -1.803766\n", + " 0.445043\n", + " -1.009817\n", + " [{'index': 1, 'value': 1.0}]\n", + " [{'index': 1, 'value': 1.0}]\n", " \n", " \n", - " 130\n", - " 4197.905737\n", + " 111\n", + " 3201.493683\n", + " [{'index': 1, 'value': 1.0}]\n", + " -1.164286\n", + " 0.697535\n", + " -2.098138\n", + " [{'index': 1, 'value': 1.0}]\n", + " [{'index': 1, 'value': 1.0}]\n", " \n", " \n", "\n", - "

25 rows × 1 columns

\n", - "[67 rows x 1 columns in total]" - ], - "text/plain": [ - " predicted_body_mass_g\n", - "penguin_id \n", - "3 3394.118128\n", - "8 4048.685642\n", - "17 3976.454093\n", - "23 3541.582194\n", - "25 4032.844186\n", - "27 4118.351772\n", - "29 4087.767826\n", - "34 3183.755249\n", - "35 3418.802274\n", - "39 3519.186468\n", - "51 3398.135365\n", - "52 3223.615957\n", - "60 3445.014718\n", - "61 3505.638864\n", - "64 3515.905786\n", - "65 4028.363185\n", - "67 4159.993943\n", - "83 3348.16883\n", - "85 3485.050273\n", - "93 4172.874548\n", - "104 3299.302424\n", - "105 3515.687917\n", - "108 3405.224618\n", - "113 4209.140425\n", - "130 4197.905737\n", - "...\n", + "

25 rows × 7 columns

\n", + "[67 rows x 7 columns in total]" + ], + "text/plain": [ + " predicted_body_mass_g onehotencoded_island \\\n", + "penguin_id \n", + "1 3781.402407 [{'index': 3, 'value': 1.0}] \n", + "4 4124.107944 [{'index': 1, 'value': 1.0}] \n", + "8 4670.344196 [{'index': 1, 'value': 1.0}] \n", + "11 3529.417214 [{'index': 2, 'value': 1.0}] \n", + "13 4014.101714 [{'index': 1, 'value': 1.0}] \n", + "15 5212.41288 [{'index': 1, 'value': 1.0}] \n", + "16 4163.595615 [{'index': 3, 'value': 1.0}] \n", + "23 3392.453069 [{'index': 2, 'value': 1.0}] \n", + "34 4698.305397 [{'index': 1, 'value': 1.0}] \n", + "36 4828.226949 [{'index': 1, 'value': 1.0}] \n", + "42 3430.58866 [{'index': 1, 'value': 1.0}] \n", + "48 5314.260221 [{'index': 1, 'value': 1.0}] \n", + "61 5363.205372 [{'index': 1, 'value': 1.0}] \n", + "64 4855.908314 [{'index': 1, 'value': 1.0}] \n", + "65 3413.100524 [{'index': 2, 'value': 1.0}] \n", + "68 3340.219002 [{'index': 3, 'value': 1.0}] \n", + "70 4228.73157 [{'index': 2, 'value': 1.0}] \n", + "72 3811.538478 [{'index': 2, 'value': 1.0}] \n", + "74 4659.770763 [{'index': 1, 'value': 1.0}] \n", + "77 3453.388804 [{'index': 2, 'value': 1.0}] \n", + "81 4766.245033 [{'index': 1, 'value': 1.0}] \n", + "91 4057.807281 [{'index': 2, 'value': 1.0}] \n", + "96 4739.827445 [{'index': 1, 'value': 1.0}] \n", + "105 3394.891976 [{'index': 1, 'value': 1.0}] \n", + "111 3201.493683 [{'index': 1, 'value': 1.0}] \n", "\n", - "[67 rows x 1 columns]" + " standard_scaled_culmen_length_mm standard_scaled_culmen_depth_mm \\\n", + "penguin_id \n", + "1 -0.938587 0.748033 \n", + "4 -0.16745 0.899528 \n", + "8 0.453222 -1.877885 \n", + "11 -1.12667 0.697535 \n", + "13 -1.183094 1.404513 \n", + "15 0.867003 -0.766919 \n", + "16 -1.784958 1.959995 \n", + "23 -0.355532 0.647036 \n", + "34 -0.600039 -1.776888 \n", + "36 -0.129833 -1.423399 \n", + "42 -1.615684 -0.514427 \n", + "48 0.415606 -0.716421 \n", + "61 0.396797 -1.170907 \n", + "64 0.434414 -1.120408 \n", + "65 -1.220711 
1.051024 \n", + "68 -1.484026 -0.009443 \n", + "70 1.638141 1.404513 \n", + "72 0.829387 0.142052 \n", + "74 -0.242683 -1.524396 \n", + "77 -1.277136 -0.211437 \n", + "81 0.208715 -1.221405 \n", + "91 1.261976 0.647036 \n", + "96 0.246331 -1.322402 \n", + "105 -1.803766 0.445043 \n", + "111 -1.164286 0.697535 \n", + "\n", + " standard_scaled_flipper_length_mm onehotencoded_sex \\\n", + "penguin_id \n", + "1 -1.445145 [{'index': 2, 'value': 1.0}] \n", + "4 -0.284269 [{'index': 2, 'value': 1.0}] \n", + "8 0.658942 [{'index': 1, 'value': 1.0}] \n", + "11 -0.792152 [{'index': 1, 'value': 1.0}] \n", + "13 -0.792152 [{'index': 2, 'value': 1.0}] \n", + "15 0.513833 [{'index': 2, 'value': 1.0}] \n", + "16 -0.211715 [{'index': 2, 'value': 1.0}] \n", + "23 -1.5177 [{'index': 1, 'value': 1.0}] \n", + "34 0.949161 [{'index': 1, 'value': 1.0}] \n", + "36 1.23938 [{'index': 1, 'value': 1.0}] \n", + "42 -0.429379 [{'index': 1, 'value': 1.0}] \n", + "48 1.021716 [{'index': 2, 'value': 1.0}] \n", + "61 1.457044 [{'index': 2, 'value': 1.0}] \n", + "64 1.09427 [{'index': 1, 'value': 1.0}] \n", + "65 -1.445145 [{'index': 1, 'value': 1.0}] \n", + "68 -1.009817 [{'index': 1, 'value': 1.0}] \n", + "70 0.296168 [{'index': 2, 'value': 1.0}] \n", + "72 -0.719598 [{'index': 2, 'value': 1.0}] \n", + "74 0.586387 [{'index': 1, 'value': 1.0}] \n", + "77 -0.647043 [{'index': 1, 'value': 1.0}] \n", + "81 0.804051 [{'index': 1, 'value': 1.0}] \n", + "91 0.005949 [{'index': 2, 'value': 1.0}] \n", + "96 0.731497 [{'index': 1, 'value': 1.0}] \n", + "105 -1.009817 [{'index': 1, 'value': 1.0}] \n", + "111 -2.098138 [{'index': 1, 'value': 1.0}] \n", + "\n", + " onehotencoded_species \n", + "penguin_id \n", + "1 [{'index': 1, 'value': 1.0}] \n", + "4 [{'index': 1, 'value': 1.0}] \n", + "8 [{'index': 3, 'value': 1.0}] \n", + "11 [{'index': 1, 'value': 1.0}] \n", + "13 [{'index': 1, 'value': 1.0}] \n", + "15 [{'index': 3, 'value': 1.0}] \n", + "16 [{'index': 1, 'value': 1.0}] \n", + "23 [{'index': 1, 
'value': 1.0}] \n", + "34 [{'index': 3, 'value': 1.0}] \n", + "36 [{'index': 3, 'value': 1.0}] \n", + "42 [{'index': 1, 'value': 1.0}] \n", + "48 [{'index': 3, 'value': 1.0}] \n", + "61 [{'index': 3, 'value': 1.0}] \n", + "64 [{'index': 3, 'value': 1.0}] \n", + "65 [{'index': 1, 'value': 1.0}] \n", + "68 [{'index': 1, 'value': 1.0}] \n", + "70 [{'index': 2, 'value': 1.0}] \n", + "72 [{'index': 2, 'value': 1.0}] \n", + "74 [{'index': 3, 'value': 1.0}] \n", + "77 [{'index': 1, 'value': 1.0}] \n", + "81 [{'index': 3, 'value': 1.0}] \n", + "91 [{'index': 2, 'value': 1.0}] \n", + "96 [{'index': 3, 'value': 1.0}] \n", + "105 [{'index': 1, 'value': 1.0}] \n", + "111 [{'index': 1, 'value': 1.0}] \n", + "\n", + "[67 rows x 7 columns]" ] }, - "execution_count": 25, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -2423,18 +2554,16 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 9, "metadata": {}, "outputs": [ { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "d7a16e04253a42b7a5ce247d8f63b656", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job 027042f1-9a18-43d8-a378-ab9410e395b1 is DONE. 23.5 kB processed.
Open Job" + ], "text/plain": [ - "HTML(value='Query job 6f19614c-82c0-4f8b-b74b-9d91a894efdd is RUNNING. " ] }, "metadata": {}, @@ -2442,13 +2571,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "4a99ac15431e433595de1040872a4558", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job 6c8484a0-a504-4e50-93d6-3d247c9ff558 is DONE. 0 Bytes processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job 51899e2d-f6ef-4e62-98b6-c11550f74f4b is RUNNING. " ] }, "metadata": {}, @@ -2456,13 +2583,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "90909b620e084f59b0f9da266257593f", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job e81ca2de-df2e-41ec-af86-14f8dcec1b44 is DONE. 6.2 kB processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job 44d3fddc-74bc-4de0-a458-2c73b38f74fb is RUNNING. " ] }, "metadata": {}, @@ -2470,13 +2595,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "2a9c2c05041a4fb691809bab5310bb05", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job 3e6d413c-f8c4-4390-95eb-3a1f5bc59aed is DONE. 536 Bytes processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job 33584475-f02b-4c98-9a51-e29996f4f950 is RUNNING. " ] }, "metadata": {}, @@ -2484,13 +2607,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "6b0677c228d54b409c66e5dfa98d7e00", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job e448220d-0c50-45b7-bcbe-d1159b3d18ce is DONE. 0 Bytes processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job df25ba49-280e-424d-a357-dde71a9b35dd is DONE. 0 Bytes processed. 
" ] }, "metadata": {}, @@ -2498,13 +2619,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "379ae6497fb34f969d21b2cd664e8bfa", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job e167a234-828d-4f05-8654-63cf97e50ba3 is DONE. 10.2 kB processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job 6f92a04e-af7e-41d6-9303-6366c1751294 is RUNNING. " ] }, "metadata": {}, @@ -2532,152 +2651,452 @@ " \n", " \n", " CENTROID_ID\n", + " NEAREST_CENTROIDS_DISTANCE\n", + " onehotencoded_island\n", + " standard_scaled_culmen_length_mm\n", + " standard_scaled_culmen_depth_mm\n", + " standard_scaled_flipper_length_mm\n", + " onehotencoded_sex\n", + " onehotencoded_species\n", " \n", " \n", " penguin_id\n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " 3\n", - " 3\n", - " \n", - " \n", - " 8\n", + " 1\n", " 3\n", + " [{'CENTROID_ID': 3, 'DISTANCE': 1.236380597035...\n", + " [{'index': 3, 'value': 1.0}]\n", + " -0.938587\n", + " 0.748033\n", + " -1.445145\n", + " [{'index': 2, 'value': 1.0}]\n", + " [{'index': 1, 'value': 1.0}]\n", " \n", " \n", - " 17\n", + " 4\n", " 3\n", + " [{'CENTROID_ID': 3, 'DISTANCE': 1.039497631856...\n", + " [{'index': 1, 'value': 1.0}]\n", + " -0.16745\n", + " 0.899528\n", + " -0.284269\n", + " [{'index': 2, 'value': 1.0}]\n", + " [{'index': 1, 'value': 1.0}]\n", " \n", " \n", - " 23\n", + " 8\n", " 1\n", + " [{'CENTROID_ID': 1, 'DISTANCE': 1.171040485975...\n", + " [{'index': 1, 'value': 1.0}]\n", + " 0.453222\n", + " -1.877885\n", + " 0.658942\n", + " [{'index': 1, 'value': 1.0}]\n", + " [{'index': 3, 'value': 1.0}]\n", " \n", " \n", - " 25\n", - " 3\n", + " 11\n", + " 2\n", + " [{'CENTROID_ID': 2, 'DISTANCE': 0.969102754012...\n", + " [{'index': 2, 'value': 1.0}]\n", + " -1.12667\n", + " 0.697535\n", + " -0.792152\n", + " [{'index': 1, 'value': 1.0}]\n", + " [{'index': 1, 'value': 1.0}]\n", " \n", " \n", - " 27\n", + " 13\n", " 3\n", + 
" [{'CENTROID_ID': 3, 'DISTANCE': 1.113138945949...\n", + " [{'index': 1, 'value': 1.0}]\n", + " -1.183094\n", + " 1.404513\n", + " -0.792152\n", + " [{'index': 2, 'value': 1.0}]\n", + " [{'index': 1, 'value': 1.0}]\n", " \n", " \n", - " 29\n", - " 3\n", + " 15\n", + " 1\n", + " [{'CENTROID_ID': 1, 'DISTANCE': 1.070996026772...\n", + " [{'index': 1, 'value': 1.0}]\n", + " 0.867003\n", + " -0.766919\n", + " 0.513833\n", + " [{'index': 2, 'value': 1.0}]\n", + " [{'index': 3, 'value': 1.0}]\n", " \n", " \n", - " 34\n", + " 16\n", " 3\n", + " [{'CENTROID_ID': 3, 'DISTANCE': 1.780136190720...\n", + " [{'index': 3, 'value': 1.0}]\n", + " -1.784958\n", + " 1.959995\n", + " -0.211715\n", + " [{'index': 2, 'value': 1.0}]\n", + " [{'index': 1, 'value': 1.0}]\n", " \n", " \n", - " 35\n", - " 1\n", + " 23\n", + " 2\n", + " [{'CENTROID_ID': 2, 'DISTANCE': 1.382540667483...\n", + " [{'index': 2, 'value': 1.0}]\n", + " -0.355532\n", + " 0.647036\n", + " -1.5177\n", + " [{'index': 1, 'value': 1.0}]\n", + " [{'index': 1, 'value': 1.0}]\n", " \n", " \n", - " 39\n", - " 3\n", + " 34\n", + " 1\n", + " [{'CENTROID_ID': 1, 'DISTANCE': 1.598627908302...\n", + " [{'index': 1, 'value': 1.0}]\n", + " -0.600039\n", + " -1.776888\n", + " 0.949161\n", + " [{'index': 1, 'value': 1.0}]\n", + " [{'index': 3, 'value': 1.0}]\n", " \n", " \n", - " 51\n", + " 36\n", " 1\n", + " [{'CENTROID_ID': 1, 'DISTANCE': 1.095162305190...\n", + " [{'index': 1, 'value': 1.0}]\n", + " -0.129833\n", + " -1.423399\n", + " 1.23938\n", + " [{'index': 1, 'value': 1.0}]\n", + " [{'index': 3, 'value': 1.0}]\n", " \n", " \n", - " 52\n", - " 3\n", + " 42\n", + " 2\n", + " [{'CENTROID_ID': 2, 'DISTANCE': 1.275841743930...\n", + " [{'index': 1, 'value': 1.0}]\n", + " -1.615684\n", + " -0.514427\n", + " -0.429379\n", + " [{'index': 1, 'value': 1.0}]\n", + " [{'index': 1, 'value': 1.0}]\n", " \n", " \n", - " 60\n", - " 3\n", + " 48\n", + " 1\n", + " [{'CENTROID_ID': 1, 'DISTANCE': 0.882209023196...\n", + " [{'index': 1, 
'value': 1.0}]\n", + " 0.415606\n", + " -0.716421\n", + " 1.021716\n", + " [{'index': 2, 'value': 1.0}]\n", + " [{'index': 3, 'value': 1.0}]\n", " \n", " \n", " 61\n", - " 3\n", + " 1\n", + " [{'CENTROID_ID': 1, 'DISTANCE': 0.816202832282...\n", + " [{'index': 1, 'value': 1.0}]\n", + " 0.396797\n", + " -1.170907\n", + " 1.457044\n", + " [{'index': 2, 'value': 1.0}]\n", + " [{'index': 3, 'value': 1.0}]\n", " \n", " \n", " 64\n", " 1\n", + " [{'CENTROID_ID': 1, 'DISTANCE': 0.735435721625...\n", + " [{'index': 1, 'value': 1.0}]\n", + " 0.434414\n", + " -1.120408\n", + " 1.09427\n", + " [{'index': 1, 'value': 1.0}]\n", + " [{'index': 3, 'value': 1.0}]\n", " \n", " \n", " 65\n", - " 1\n", + " 2\n", + " [{'CENTROID_ID': 2, 'DISTANCE': 1.292559869148...\n", + " [{'index': 2, 'value': 1.0}]\n", + " -1.220711\n", + " 1.051024\n", + " -1.445145\n", + " [{'index': 1, 'value': 1.0}]\n", + " [{'index': 1, 'value': 1.0}]\n", " \n", " \n", - " 67\n", - " 3\n", + " 68\n", + " 2\n", + " [{'CENTROID_ID': 2, 'DISTANCE': 0.876430138449...\n", + " [{'index': 3, 'value': 1.0}]\n", + " -1.484026\n", + " -0.009443\n", + " -1.009817\n", + " [{'index': 1, 'value': 1.0}]\n", + " [{'index': 1, 'value': 1.0}]\n", " \n", " \n", - " 83\n", - " 3\n", + " 70\n", + " 4\n", + " [{'CENTROID_ID': 4, 'DISTANCE': 1.314229913955...\n", + " [{'index': 2, 'value': 1.0}]\n", + " 1.638141\n", + " 1.404513\n", + " 0.296168\n", + " [{'index': 2, 'value': 1.0}]\n", + " [{'index': 2, 'value': 1.0}]\n", " \n", " \n", - " 85\n", - " 1\n", + " 72\n", + " 4\n", + " [{'CENTROID_ID': 4, 'DISTANCE': 0.938569518009...\n", + " [{'index': 2, 'value': 1.0}]\n", + " 0.829387\n", + " 0.142052\n", + " -0.719598\n", + " [{'index': 2, 'value': 1.0}]\n", + " [{'index': 2, 'value': 1.0}]\n", " \n", " \n", - " 93\n", + " 74\n", " 1\n", + " [{'CENTROID_ID': 1, 'DISTANCE': 1.350320088546...\n", + " [{'index': 1, 'value': 1.0}]\n", + " -0.242683\n", + " -1.524396\n", + " 0.586387\n", + " [{'index': 1, 'value': 1.0}]\n", + " 
[{'index': 3, 'value': 1.0}]\n", " \n", " \n", - " 104\n", - " 3\n", + " 77\n", + " 2\n", + " [{'CENTROID_ID': 2, 'DISTANCE': 0.904806634663...\n", + " [{'index': 2, 'value': 1.0}]\n", + " -1.277136\n", + " -0.211437\n", + " -0.647043\n", + " [{'index': 1, 'value': 1.0}]\n", + " [{'index': 1, 'value': 1.0}]\n", " \n", " \n", - " 105\n", + " 81\n", " 1\n", + " [{'CENTROID_ID': 1, 'DISTANCE': 0.919082578073...\n", + " [{'index': 1, 'value': 1.0}]\n", + " 0.208715\n", + " -1.221405\n", + " 0.804051\n", + " [{'index': 1, 'value': 1.0}]\n", + " [{'index': 3, 'value': 1.0}]\n", " \n", " \n", - " 108\n", - " 3\n", + " 91\n", + " 4\n", + " [{'CENTROID_ID': 4, 'DISTANCE': 0.760360038086...\n", + " [{'index': 2, 'value': 1.0}]\n", + " 1.261976\n", + " 0.647036\n", + " 0.005949\n", + " [{'index': 2, 'value': 1.0}]\n", + " [{'index': 2, 'value': 1.0}]\n", " \n", " \n", - " 113\n", - " 3\n", + " 96\n", + " 1\n", + " [{'CENTROID_ID': 1, 'DISTANCE': 0.950188657227...\n", + " [{'index': 1, 'value': 1.0}]\n", + " 0.246331\n", + " -1.322402\n", + " 0.731497\n", + " [{'index': 1, 'value': 1.0}]\n", + " [{'index': 3, 'value': 1.0}]\n", " \n", " \n", - " 130\n", - " 1\n", + " 105\n", + " 2\n", + " [{'CENTROID_ID': 2, 'DISTANCE': 1.101316467029...\n", + " [{'index': 1, 'value': 1.0}]\n", + " -1.803766\n", + " 0.445043\n", + " -1.009817\n", + " [{'index': 1, 'value': 1.0}]\n", + " [{'index': 1, 'value': 1.0}]\n", + " \n", + " \n", + " 111\n", + " 2\n", + " [{'CENTROID_ID': 2, 'DISTANCE': 1.549061068385...\n", + " [{'index': 1, 'value': 1.0}]\n", + " -1.164286\n", + " 0.697535\n", + " -2.098138\n", + " [{'index': 1, 'value': 1.0}]\n", + " [{'index': 1, 'value': 1.0}]\n", " \n", " \n", "\n", - "

25 rows × 1 columns

\n", - "[67 rows x 1 columns in total]" + "

25 rows × 8 columns

\n", + "[67 rows x 8 columns in total]" ], "text/plain": [ - " CENTROID_ID\n", - "penguin_id \n", - "3 3\n", - "8 3\n", - "17 3\n", - "23 1\n", - "25 3\n", - "27 3\n", - "29 3\n", - "34 3\n", - "35 1\n", - "39 3\n", - "51 1\n", - "52 3\n", - "60 3\n", - "61 3\n", - "64 1\n", - "65 1\n", - "67 3\n", - "83 3\n", - "85 1\n", - "93 1\n", - "104 3\n", - "105 1\n", - "108 3\n", - "113 3\n", - "130 1\n", - "...\n", + " CENTROID_ID NEAREST_CENTROIDS_DISTANCE \\\n", + "penguin_id \n", + "1 3 [{'CENTROID_ID': 3, 'DISTANCE': 1.236380597035... \n", + "4 3 [{'CENTROID_ID': 3, 'DISTANCE': 1.039497631856... \n", + "8 1 [{'CENTROID_ID': 1, 'DISTANCE': 1.171040485975... \n", + "11 2 [{'CENTROID_ID': 2, 'DISTANCE': 0.969102754012... \n", + "13 3 [{'CENTROID_ID': 3, 'DISTANCE': 1.113138945949... \n", + "15 1 [{'CENTROID_ID': 1, 'DISTANCE': 1.070996026772... \n", + "16 3 [{'CENTROID_ID': 3, 'DISTANCE': 1.780136190720... \n", + "23 2 [{'CENTROID_ID': 2, 'DISTANCE': 1.382540667483... \n", + "34 1 [{'CENTROID_ID': 1, 'DISTANCE': 1.598627908302... \n", + "36 1 [{'CENTROID_ID': 1, 'DISTANCE': 1.095162305190... \n", + "42 2 [{'CENTROID_ID': 2, 'DISTANCE': 1.275841743930... \n", + "48 1 [{'CENTROID_ID': 1, 'DISTANCE': 0.882209023196... \n", + "61 1 [{'CENTROID_ID': 1, 'DISTANCE': 0.816202832282... \n", + "64 1 [{'CENTROID_ID': 1, 'DISTANCE': 0.735435721625... \n", + "65 2 [{'CENTROID_ID': 2, 'DISTANCE': 1.292559869148... \n", + "68 2 [{'CENTROID_ID': 2, 'DISTANCE': 0.876430138449... \n", + "70 4 [{'CENTROID_ID': 4, 'DISTANCE': 1.314229913955... \n", + "72 4 [{'CENTROID_ID': 4, 'DISTANCE': 0.938569518009... \n", + "74 1 [{'CENTROID_ID': 1, 'DISTANCE': 1.350320088546... \n", + "77 2 [{'CENTROID_ID': 2, 'DISTANCE': 0.904806634663... \n", + "81 1 [{'CENTROID_ID': 1, 'DISTANCE': 0.919082578073... \n", + "91 4 [{'CENTROID_ID': 4, 'DISTANCE': 0.760360038086... \n", + "96 1 [{'CENTROID_ID': 1, 'DISTANCE': 0.950188657227... \n", + "105 2 [{'CENTROID_ID': 2, 'DISTANCE': 1.101316467029... 
\n", + "111 2 [{'CENTROID_ID': 2, 'DISTANCE': 1.549061068385... \n", "\n", - "[67 rows x 1 columns]" + " onehotencoded_island standard_scaled_culmen_length_mm \\\n", + "penguin_id \n", + "1 [{'index': 3, 'value': 1.0}] -0.938587 \n", + "4 [{'index': 1, 'value': 1.0}] -0.16745 \n", + "8 [{'index': 1, 'value': 1.0}] 0.453222 \n", + "11 [{'index': 2, 'value': 1.0}] -1.12667 \n", + "13 [{'index': 1, 'value': 1.0}] -1.183094 \n", + "15 [{'index': 1, 'value': 1.0}] 0.867003 \n", + "16 [{'index': 3, 'value': 1.0}] -1.784958 \n", + "23 [{'index': 2, 'value': 1.0}] -0.355532 \n", + "34 [{'index': 1, 'value': 1.0}] -0.600039 \n", + "36 [{'index': 1, 'value': 1.0}] -0.129833 \n", + "42 [{'index': 1, 'value': 1.0}] -1.615684 \n", + "48 [{'index': 1, 'value': 1.0}] 0.415606 \n", + "61 [{'index': 1, 'value': 1.0}] 0.396797 \n", + "64 [{'index': 1, 'value': 1.0}] 0.434414 \n", + "65 [{'index': 2, 'value': 1.0}] -1.220711 \n", + "68 [{'index': 3, 'value': 1.0}] -1.484026 \n", + "70 [{'index': 2, 'value': 1.0}] 1.638141 \n", + "72 [{'index': 2, 'value': 1.0}] 0.829387 \n", + "74 [{'index': 1, 'value': 1.0}] -0.242683 \n", + "77 [{'index': 2, 'value': 1.0}] -1.277136 \n", + "81 [{'index': 1, 'value': 1.0}] 0.208715 \n", + "91 [{'index': 2, 'value': 1.0}] 1.261976 \n", + "96 [{'index': 1, 'value': 1.0}] 0.246331 \n", + "105 [{'index': 1, 'value': 1.0}] -1.803766 \n", + "111 [{'index': 1, 'value': 1.0}] -1.164286 \n", + "\n", + " standard_scaled_culmen_depth_mm \\\n", + "penguin_id \n", + "1 0.748033 \n", + "4 0.899528 \n", + "8 -1.877885 \n", + "11 0.697535 \n", + "13 1.404513 \n", + "15 -0.766919 \n", + "16 1.959995 \n", + "23 0.647036 \n", + "34 -1.776888 \n", + "36 -1.423399 \n", + "42 -0.514427 \n", + "48 -0.716421 \n", + "61 -1.170907 \n", + "64 -1.120408 \n", + "65 1.051024 \n", + "68 -0.009443 \n", + "70 1.404513 \n", + "72 0.142052 \n", + "74 -1.524396 \n", + "77 -0.211437 \n", + "81 -1.221405 \n", + "91 0.647036 \n", + "96 -1.322402 \n", + "105 0.445043 \n", + "111 0.697535 
\n", + "\n", + " standard_scaled_flipper_length_mm onehotencoded_sex \\\n", + "penguin_id \n", + "1 -1.445145 [{'index': 2, 'value': 1.0}] \n", + "4 -0.284269 [{'index': 2, 'value': 1.0}] \n", + "8 0.658942 [{'index': 1, 'value': 1.0}] \n", + "11 -0.792152 [{'index': 1, 'value': 1.0}] \n", + "13 -0.792152 [{'index': 2, 'value': 1.0}] \n", + "15 0.513833 [{'index': 2, 'value': 1.0}] \n", + "16 -0.211715 [{'index': 2, 'value': 1.0}] \n", + "23 -1.5177 [{'index': 1, 'value': 1.0}] \n", + "34 0.949161 [{'index': 1, 'value': 1.0}] \n", + "36 1.23938 [{'index': 1, 'value': 1.0}] \n", + "42 -0.429379 [{'index': 1, 'value': 1.0}] \n", + "48 1.021716 [{'index': 2, 'value': 1.0}] \n", + "61 1.457044 [{'index': 2, 'value': 1.0}] \n", + "64 1.09427 [{'index': 1, 'value': 1.0}] \n", + "65 -1.445145 [{'index': 1, 'value': 1.0}] \n", + "68 -1.009817 [{'index': 1, 'value': 1.0}] \n", + "70 0.296168 [{'index': 2, 'value': 1.0}] \n", + "72 -0.719598 [{'index': 2, 'value': 1.0}] \n", + "74 0.586387 [{'index': 1, 'value': 1.0}] \n", + "77 -0.647043 [{'index': 1, 'value': 1.0}] \n", + "81 0.804051 [{'index': 1, 'value': 1.0}] \n", + "91 0.005949 [{'index': 2, 'value': 1.0}] \n", + "96 0.731497 [{'index': 1, 'value': 1.0}] \n", + "105 -1.009817 [{'index': 1, 'value': 1.0}] \n", + "111 -2.098138 [{'index': 1, 'value': 1.0}] \n", + "\n", + " onehotencoded_species \n", + "penguin_id \n", + "1 [{'index': 1, 'value': 1.0}] \n", + "4 [{'index': 1, 'value': 1.0}] \n", + "8 [{'index': 3, 'value': 1.0}] \n", + "11 [{'index': 1, 'value': 1.0}] \n", + "13 [{'index': 1, 'value': 1.0}] \n", + "15 [{'index': 3, 'value': 1.0}] \n", + "16 [{'index': 1, 'value': 1.0}] \n", + "23 [{'index': 1, 'value': 1.0}] \n", + "34 [{'index': 3, 'value': 1.0}] \n", + "36 [{'index': 3, 'value': 1.0}] \n", + "42 [{'index': 1, 'value': 1.0}] \n", + "48 [{'index': 3, 'value': 1.0}] \n", + "61 [{'index': 3, 'value': 1.0}] \n", + "64 [{'index': 3, 'value': 1.0}] \n", + "65 [{'index': 1, 'value': 1.0}] \n", + "68 [{'index': 
1, 'value': 1.0}] \n", + "70 [{'index': 2, 'value': 1.0}] \n", + "72 [{'index': 2, 'value': 1.0}] \n", + "74 [{'index': 3, 'value': 1.0}] \n", + "77 [{'index': 1, 'value': 1.0}] \n", + "81 [{'index': 3, 'value': 1.0}] \n", + "91 [{'index': 2, 'value': 1.0}] \n", + "96 [{'index': 3, 'value': 1.0}] \n", + "105 [{'index': 1, 'value': 1.0}] \n", + "111 [{'index': 1, 'value': 1.0}] \n", + "\n", + "[67 rows x 8 columns]" ] }, - "execution_count": 26, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -2704,7 +3123,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -2721,7 +3140,7 @@ " ('linreg', LinearRegression())])" ] }, - "execution_count": 27, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -2748,18 +3167,16 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 11, "metadata": {}, "outputs": [ { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "887bf58cebf14bdba95db828390fd33d", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job b11be0d8-e6f1-41cb-8cb2-25a38e7ef311 is DONE. 24.7 kB processed.
Open Job" + ], "text/plain": [ - "HTML(value='Query job ed42cbb3-3d25-47ca-96c5-71a84e426a8c is RUNNING. " ] }, "metadata": {}, @@ -2767,13 +3184,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "24357055792a4eaaa60997fea0f76921", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job f32ea25c-be39-4726-a8f5-604ae83849a6 is DONE. 8.5 kB processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job 3fc74930-03b9-4a49-8ed3-c3edc4dd6e51 is RUNNING. " ] }, "metadata": {}, @@ -2781,13 +3196,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "bba878d6d3e345f1a29aea50f7101e8f", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job 86e29b78-76f5-4937-8bde-407b99af04a2 is DONE. 0 Bytes processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job 38a4ce3b-5c2a-4d44-b826-f24529d6500b is RUNNING. " ] }, "metadata": {}, @@ -2795,13 +3208,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "4bc2c53aeb7d4a8280f9fbbe373f4b55", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job ca819734-0d41-4d9e-b743-09edae8c7fee is DONE. 29.6 kB processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job ecad776d-77c8-4d94-8186-d5571b512b62 is RUNNING. " ] }, "metadata": {}, @@ -2809,13 +3220,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "f4f695cb0a224102b6e26adeb1827981", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job 49bb5bed-cc84-47e0-9a90-08ab01e00548 is DONE. 536 Bytes processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job c9bfc58f-ce2c-47a9-bbc7-b10d9de9b5a6 is DONE. 0 Bytes processed. 
" ] }, "metadata": {}, @@ -2823,13 +3232,23 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "cb1df595006d485288a1060299970e5e", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job 1e40a085-2289-47dd-afd8-820413186b9f is DONE. 0 Bytes processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job 8fd8036e-3753-433d-975b-c7b42406f648 is RUNNING. " + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 60319296-a480-4f51-b7ad-190ac6de963a is DONE. 6.2 kB processed. Open Job" + ], + "text/plain": [ + "" ] }, "metadata": {}, @@ -2857,152 +3276,369 @@ " \n", " \n", " predicted_body_mass_g\n", + " island\n", + " culmen_length_mm\n", + " culmen_depth_mm\n", + " flipper_length_mm\n", + " sex\n", + " species\n", " \n", " \n", " penguin_id\n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " 3\n", - " 3394.116212\n", - " \n", - " \n", - " 8\n", - " 4048.683645\n", + " 1\n", + " 3781.396682\n", + " Torgersen\n", + " 39.1\n", + " 18.7\n", + " 181.0\n", + " MALE\n", + " Adelie Penguin (Pygoscelis adeliae)\n", " \n", " \n", - " 17\n", - " 3976.452358\n", + " 4\n", + " 4124.102574\n", + " Biscoe\n", + " 43.2\n", + " 19.0\n", + " 197.0\n", + " MALE\n", + " Adelie Penguin (Pygoscelis adeliae)\n", " \n", " \n", - " 23\n", - " 3541.580346\n", + " 8\n", + " 4670.338389\n", + " Biscoe\n", + " 46.5\n", + " 13.5\n", + " 210.0\n", + " FEMALE\n", + " Gentoo penguin (Pygoscelis papua)\n", " \n", " \n", - " 25\n", - " 4032.842027\n", + " 11\n", + " 3529.411644\n", + " Dream\n", + " 38.1\n", + " 18.6\n", + " 190.0\n", + " FEMALE\n", + " Adelie Penguin (Pygoscelis adeliae)\n", " \n", " \n", - " 27\n", - " 4118.34983\n", + " 13\n", + " 4014.09632\n", + " Biscoe\n", + " 37.8\n", + " 20.0\n", + " 190.0\n", + " MALE\n", + " Adelie Penguin (Pygoscelis adeliae)\n", " \n", " \n", - " 29\n", - " 4087.765797\n", + " 15\n", + " 5212.407319\n", + 
" Biscoe\n", + " 48.7\n", + " 15.7\n", + " 208.0\n", + " MALE\n", + " Gentoo penguin (Pygoscelis papua)\n", " \n", " \n", - " 34\n", - " 3183.75379\n", + " 16\n", + " 4163.590502\n", + " Torgersen\n", + " 34.6\n", + " 21.1\n", + " 198.0\n", + " MALE\n", + " Adelie Penguin (Pygoscelis adeliae)\n", " \n", " \n", - " 35\n", - " 3418.800633\n", + " 23\n", + " 3392.44731\n", + " Dream\n", + " 42.2\n", + " 18.5\n", + " 180.0\n", + " FEMALE\n", + " Adelie Penguin (Pygoscelis adeliae)\n", " \n", " \n", - " 39\n", - " 3519.18471\n", + " 34\n", + " 4698.299674\n", + " Biscoe\n", + " 40.9\n", + " 13.7\n", + " 214.0\n", + " FEMALE\n", + " Gentoo penguin (Pygoscelis papua)\n", " \n", " \n", - " 51\n", - " 3398.133564\n", + " 36\n", + " 4828.221398\n", + " Biscoe\n", + " 43.4\n", + " 14.4\n", + " 218.0\n", + " FEMALE\n", + " Gentoo penguin (Pygoscelis papua)\n", " \n", " \n", - " 52\n", - " 3223.614107\n", + " 42\n", + " 3430.582874\n", + " Biscoe\n", + " 35.5\n", + " 16.2\n", + " 195.0\n", + " FEMALE\n", + " Adelie Penguin (Pygoscelis adeliae)\n", " \n", " \n", - " 60\n", - " 3445.012713\n", + " 48\n", + " 5314.254798\n", + " Biscoe\n", + " 46.3\n", + " 15.8\n", + " 215.0\n", + " MALE\n", + " Gentoo penguin (Pygoscelis papua)\n", " \n", " \n", " 61\n", - " 3505.637004\n", + " 5363.19995\n", + " Biscoe\n", + " 46.2\n", + " 14.9\n", + " 221.0\n", + " MALE\n", + " Gentoo penguin (Pygoscelis papua)\n", " \n", " \n", " 64\n", - " 3515.903779\n", + " 4855.90281\n", + " Biscoe\n", + " 46.4\n", + " 15.0\n", + " 216.0\n", + " FEMALE\n", + " Gentoo penguin (Pygoscelis papua)\n", " \n", " \n", " 65\n", - " 4028.361259\n", + " 3413.094869\n", + " Dream\n", + " 37.6\n", + " 19.3\n", + " 181.0\n", + " FEMALE\n", + " Adelie Penguin (Pygoscelis adeliae)\n", " \n", " \n", - " 67\n", - " 4159.991956\n", + " 68\n", + " 3340.213193\n", + " Torgersen\n", + " 36.2\n", + " 17.2\n", + " 187.0\n", + " FEMALE\n", + " Adelie Penguin (Pygoscelis adeliae)\n", " \n", " \n", - " 83\n", - " 3348.167212\n", + 
" 70\n", + " 4228.726508\n", + " Dream\n", + " 52.8\n", + " 20.0\n", + " 205.0\n", + " MALE\n", + " Chinstrap penguin (Pygoscelis antarctica)\n", " \n", " \n", - " 85\n", - " 3485.048557\n", + " 72\n", + " 3811.532821\n", + " Dream\n", + " 48.5\n", + " 17.5\n", + " 191.0\n", + " MALE\n", + " Chinstrap penguin (Pygoscelis antarctica)\n", " \n", " \n", - " 93\n", - " 4172.872284\n", + " 74\n", + " 4659.765013\n", + " Biscoe\n", + " 42.8\n", + " 14.2\n", + " 209.0\n", + " FEMALE\n", + " Gentoo penguin (Pygoscelis papua)\n", " \n", " \n", - " 104\n", - " 3299.300454\n", + " 77\n", + " 3453.383042\n", + " Dream\n", + " 37.3\n", + " 16.8\n", + " 192.0\n", + " FEMALE\n", + " Adelie Penguin (Pygoscelis adeliae)\n", " \n", " \n", - " 105\n", - " 3515.68617\n", + " 81\n", + " 4766.239424\n", + " Biscoe\n", + " 45.2\n", + " 14.8\n", + " 212.0\n", + " FEMALE\n", + " Gentoo penguin (Pygoscelis papua)\n", " \n", " \n", - " 108\n", - " 3405.222757\n", + " 91\n", + " 4057.801947\n", + " Dream\n", + " 50.8\n", + " 18.5\n", + " 201.0\n", + " MALE\n", + " Chinstrap penguin (Pygoscelis antarctica)\n", + " \n", + " \n", + " 96\n", + " 4739.821792\n", + " Biscoe\n", + " 45.4\n", + " 14.6\n", + " 211.0\n", + " FEMALE\n", + " Gentoo penguin (Pygoscelis papua)\n", " \n", " \n", - " 113\n", - " 4209.13832\n", + " 105\n", + " 3394.886275\n", + " Biscoe\n", + " 34.5\n", + " 18.1\n", + " 187.0\n", + " FEMALE\n", + " Adelie Penguin (Pygoscelis adeliae)\n", " \n", " \n", - " 130\n", - " 4197.90382\n", + " 111\n", + " 3201.48777\n", + " Biscoe\n", + " 37.9\n", + " 18.6\n", + " 172.0\n", + " FEMALE\n", + " Adelie Penguin (Pygoscelis adeliae)\n", " \n", " \n", "\n", - "

25 rows × 1 columns

\n", - "[67 rows x 1 columns in total]" - ], - "text/plain": [ - " predicted_body_mass_g\n", - "penguin_id \n", - "3 3394.116212\n", - "8 4048.683645\n", - "17 3976.452358\n", - "23 3541.580346\n", - "25 4032.842027\n", - "27 4118.34983\n", - "29 4087.765797\n", - "34 3183.75379\n", - "35 3418.800633\n", - "39 3519.18471\n", - "51 3398.133564\n", - "52 3223.614107\n", - "60 3445.012713\n", - "61 3505.637004\n", - "64 3515.903779\n", - "65 4028.361259\n", - "67 4159.991956\n", - "83 3348.167212\n", - "85 3485.048557\n", - "93 4172.872284\n", - "104 3299.300454\n", - "105 3515.68617\n", - "108 3405.222757\n", - "113 4209.13832\n", - "130 4197.90382\n", - "...\n", + "

25 rows × 7 columns

\n", + "[67 rows x 7 columns in total]" + ], + "text/plain": [ + " predicted_body_mass_g island culmen_length_mm \\\n", + "penguin_id \n", + "1 3781.396682 Torgersen 39.1 \n", + "4 4124.102574 Biscoe 43.2 \n", + "8 4670.338389 Biscoe 46.5 \n", + "11 3529.411644 Dream 38.1 \n", + "13 4014.09632 Biscoe 37.8 \n", + "15 5212.407319 Biscoe 48.7 \n", + "16 4163.590502 Torgersen 34.6 \n", + "23 3392.44731 Dream 42.2 \n", + "34 4698.299674 Biscoe 40.9 \n", + "36 4828.221398 Biscoe 43.4 \n", + "42 3430.582874 Biscoe 35.5 \n", + "48 5314.254798 Biscoe 46.3 \n", + "61 5363.19995 Biscoe 46.2 \n", + "64 4855.90281 Biscoe 46.4 \n", + "65 3413.094869 Dream 37.6 \n", + "68 3340.213193 Torgersen 36.2 \n", + "70 4228.726508 Dream 52.8 \n", + "72 3811.532821 Dream 48.5 \n", + "74 4659.765013 Biscoe 42.8 \n", + "77 3453.383042 Dream 37.3 \n", + "81 4766.239424 Biscoe 45.2 \n", + "91 4057.801947 Dream 50.8 \n", + "96 4739.821792 Biscoe 45.4 \n", + "105 3394.886275 Biscoe 34.5 \n", + "111 3201.48777 Biscoe 37.9 \n", "\n", - "[67 rows x 1 columns]" + " culmen_depth_mm flipper_length_mm sex \\\n", + "penguin_id \n", + "1 18.7 181.0 MALE \n", + "4 19.0 197.0 MALE \n", + "8 13.5 210.0 FEMALE \n", + "11 18.6 190.0 FEMALE \n", + "13 20.0 190.0 MALE \n", + "15 15.7 208.0 MALE \n", + "16 21.1 198.0 MALE \n", + "23 18.5 180.0 FEMALE \n", + "34 13.7 214.0 FEMALE \n", + "36 14.4 218.0 FEMALE \n", + "42 16.2 195.0 FEMALE \n", + "48 15.8 215.0 MALE \n", + "61 14.9 221.0 MALE \n", + "64 15.0 216.0 FEMALE \n", + "65 19.3 181.0 FEMALE \n", + "68 17.2 187.0 FEMALE \n", + "70 20.0 205.0 MALE \n", + "72 17.5 191.0 MALE \n", + "74 14.2 209.0 FEMALE \n", + "77 16.8 192.0 FEMALE \n", + "81 14.8 212.0 FEMALE \n", + "91 18.5 201.0 MALE \n", + "96 14.6 211.0 FEMALE \n", + "105 18.1 187.0 FEMALE \n", + "111 18.6 172.0 FEMALE \n", + "\n", + " species \n", + "penguin_id \n", + "1 Adelie Penguin (Pygoscelis adeliae) \n", + "4 Adelie Penguin (Pygoscelis adeliae) \n", + "8 Gentoo penguin (Pygoscelis papua) \n", + "11 
Adelie Penguin (Pygoscelis adeliae) \n", + "13 Adelie Penguin (Pygoscelis adeliae) \n", + "15 Gentoo penguin (Pygoscelis papua) \n", + "16 Adelie Penguin (Pygoscelis adeliae) \n", + "23 Adelie Penguin (Pygoscelis adeliae) \n", + "34 Gentoo penguin (Pygoscelis papua) \n", + "36 Gentoo penguin (Pygoscelis papua) \n", + "42 Adelie Penguin (Pygoscelis adeliae) \n", + "48 Gentoo penguin (Pygoscelis papua) \n", + "61 Gentoo penguin (Pygoscelis papua) \n", + "64 Gentoo penguin (Pygoscelis papua) \n", + "65 Adelie Penguin (Pygoscelis adeliae) \n", + "68 Adelie Penguin (Pygoscelis adeliae) \n", + "70 Chinstrap penguin (Pygoscelis antarctica) \n", + "72 Chinstrap penguin (Pygoscelis antarctica) \n", + "74 Gentoo penguin (Pygoscelis papua) \n", + "77 Adelie Penguin (Pygoscelis adeliae) \n", + "81 Gentoo penguin (Pygoscelis papua) \n", + "91 Chinstrap penguin (Pygoscelis antarctica) \n", + "96 Gentoo penguin (Pygoscelis papua) \n", + "105 Adelie Penguin (Pygoscelis adeliae) \n", + "111 Adelie Penguin (Pygoscelis adeliae) \n", + "\n", + "[67 rows x 7 columns]" ] }, - "execution_count": 28, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -3034,60 +3670,16 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 12, "metadata": {}, "outputs": [ { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "2d32081be31f44abb8de67e2209d76cd", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HTML(value='Query job 2a043039-670f-4eb8-9cf0-765ee6ed7de6 is RUNNING. Open Job" + ], "text/plain": [ - "HTML(value='Query job bc8b2042-1e13-441c-9531-300ed5badb7a is RUNNING. " ] }, "metadata": {}, @@ -3095,13 +3687,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "4588ae10de634460bf4026ddd9076351", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job 7f1f565b-0f73-4a4e-b33f-8484fa260838 is DONE. 0 Bytes processed. 
Open Job" + ], "text/plain": [ - "HTML(value='Query job 5e867182-dd7a-4aff-87a8-f7596e900fd5 is DONE. 0 Bytes processed. " ] }, "metadata": {}, @@ -3109,13 +3699,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "8209cf8286a545ebb7b6ef9d002a43a1", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job d4b9d4a6-d75e-46e1-b092-ab58e8aef890 is DONE. 48 Bytes processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job d4cdb016-8f1e-4960-8ed7-4524ccc5a8a8 is RUNNING. " ] }, "metadata": {}, @@ -3153,12 +3741,12 @@ " \n", " \n", " 0\n", - " 229.48269\n", - " 82962.794947\n", - " 0.004248\n", - " 206.728384\n", - " 0.88633\n", - " 0.892953\n", + " 216.444357\n", + " 72639.698707\n", + " 0.00463\n", + " 170.588356\n", + " 0.896396\n", + " 0.900547\n", " \n", " \n", "\n", @@ -3167,15 +3755,15 @@ ], "text/plain": [ " mean_absolute_error mean_squared_error mean_squared_log_error \\\n", - "0 229.48269 82962.794947 0.004248 \n", + "0 216.444357 72639.698707 0.00463 \n", "\n", " median_absolute_error r2_score explained_variance \n", - "0 206.728384 0.88633 0.892953 \n", + "0 170.588356 0.896396 0.900547 \n", "\n", "[1 rows x 6 columns]" ] }, - "execution_count": 29, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -3195,18 +3783,16 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 14, "metadata": {}, "outputs": [ { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "f32692d89f00406499f4ea5aa55268fb", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job 73448ee8-698b-435f-b11e-6fe2de3bcd8d is DONE. 28.9 kB processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job e57383ef-f043-458b-96c6-893e7c5b0c00 is RUNNING. 
" ] }, "metadata": {}, @@ -3214,13 +3800,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "72e5f23a99de4a818c8493b8b4f3854d", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job e002f59d-a03c-4ec9-a85a-93adbfd7bd17 is DONE. 28.9 kB processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job 1a9db485-477b-43e2-94eb-dea7dc21d45d is RUNNING. " ] }, "metadata": {}, @@ -3228,13 +3812,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "9d5333a91b504dd9be51c997715530ab", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job 4ab1febc-fb55-473a-b295-69e4329cc5f0 is DONE. 30.0 kB processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job 4570a563-b8e0-4308-b8cb-c4731491d4f7 is RUNNING. " ] }, "metadata": {}, @@ -3243,10 +3825,10 @@ { "data": { "text/plain": [ - "0.8863300923278365" + "0.8963962044533755" ] }, - "execution_count": 30, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -3254,7 +3836,7 @@ "source": [ "from bigframes.ml.metrics import r2_score\n", "\n", - "r2_score(y_test, predicted_y_test)" + "r2_score(y_test, predicted_y_test[\"predicted_body_mass_g\"])" ] }, { @@ -3274,57 +3856,9 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 15, "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "fbc4a70f31d4465b974a7f7c9cc97731", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HTML(value='Copy job c2413be4-6972-4e36-8234-5063628b6d71 is RUNNING. Open Job" + ], "text/plain": [ - "HTML(value='Query job 31a5b656-000e-4238-9fd9-c6e644ca298f is DONE. 31.7 kB processed. 
" ] }, "metadata": {}, @@ -67,13 +37,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "0f25faa156584cc59dda9b0e60f72534", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job 12e0f983-695e-4903-8ff1-2f353d7e8cba is DONE. 28.9 kB processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job d8eed0ca-7ce9-4ed8-a592-e16af9f9db8d is DONE. 0 Bytes processed. " ] }, "metadata": {}, @@ -113,250 +81,250 @@ " \n", " 0\n", " Adelie Penguin (Pygoscelis adeliae)\n", - " Dream\n", - " 36.6\n", - " 18.4\n", - " 184.0\n", - " 3475.0\n", - " FEMALE\n", + " Biscoe\n", + " 40.1\n", + " 18.9\n", + " 188.0\n", + " 4300.0\n", + " MALE\n", " \n", " \n", " 1\n", " Adelie Penguin (Pygoscelis adeliae)\n", - " Dream\n", - " 39.8\n", - " 19.1\n", - " 184.0\n", - " 4650.0\n", + " Torgersen\n", + " 39.1\n", + " 18.7\n", + " 181.0\n", + " 3750.0\n", " MALE\n", " \n", " \n", " 2\n", - " Adelie Penguin (Pygoscelis adeliae)\n", - " Dream\n", - " 40.9\n", - " 18.9\n", - " 184.0\n", - " 3900.0\n", - " MALE\n", + " Gentoo penguin (Pygoscelis papua)\n", + " Biscoe\n", + " 47.4\n", + " 14.6\n", + " 212.0\n", + " 4725.0\n", + " FEMALE\n", " \n", " \n", " 3\n", " Chinstrap penguin (Pygoscelis antarctica)\n", " Dream\n", - " 46.5\n", - " 17.9\n", - " 192.0\n", - " 3500.0\n", + " 42.5\n", + " 16.7\n", + " 187.0\n", + " 3350.0\n", " FEMALE\n", " \n", " \n", " 4\n", " Adelie Penguin (Pygoscelis adeliae)\n", - " Dream\n", - " 37.3\n", - " 16.8\n", - " 192.0\n", - " 3000.0\n", - " FEMALE\n", + " Biscoe\n", + " 43.2\n", + " 19.0\n", + " 197.0\n", + " 4775.0\n", + " MALE\n", " \n", " \n", " 5\n", - " Adelie Penguin (Pygoscelis adeliae)\n", - " Dream\n", - " 43.2\n", - " 18.5\n", - " 192.0\n", - " 4100.0\n", + " Gentoo penguin (Pygoscelis papua)\n", + " Biscoe\n", + " 46.7\n", + " 15.3\n", + " 219.0\n", + " 5200.0\n", " MALE\n", " \n", " \n", " 6\n", - " Chinstrap penguin (Pygoscelis antarctica)\n", - " Dream\n", - " 46.9\n", - " 16.6\n", - " 
192.0\n", - " 2700.0\n", - " FEMALE\n", + " Adelie Penguin (Pygoscelis adeliae)\n", + " Biscoe\n", + " 41.3\n", + " 21.1\n", + " 195.0\n", + " 4400.0\n", + " MALE\n", " \n", " \n", " 7\n", - " Chinstrap penguin (Pygoscelis antarctica)\n", - " Dream\n", - " 50.5\n", - " 18.4\n", - " 200.0\n", - " 3400.0\n", + " Gentoo penguin (Pygoscelis papua)\n", + " Biscoe\n", + " 45.2\n", + " 13.8\n", + " 215.0\n", + " 4750.0\n", " FEMALE\n", " \n", " \n", " 8\n", - " Chinstrap penguin (Pygoscelis antarctica)\n", - " Dream\n", - " 49.5\n", - " 19.0\n", - " 200.0\n", - " 3800.0\n", - " MALE\n", + " Gentoo penguin (Pygoscelis papua)\n", + " Biscoe\n", + " 46.5\n", + " 13.5\n", + " 210.0\n", + " 4550.0\n", + " FEMALE\n", " \n", " \n", " 9\n", - " Adelie Penguin (Pygoscelis adeliae)\n", - " Dream\n", - " 40.2\n", - " 20.1\n", - " 200.0\n", - " 3975.0\n", - " MALE\n", + " Gentoo penguin (Pygoscelis papua)\n", + " Biscoe\n", + " 50.5\n", + " 15.2\n", + " 216.0\n", + " 5000.0\n", + " FEMALE\n", " \n", " \n", " 10\n", - " Adelie Penguin (Pygoscelis adeliae)\n", - " Dream\n", - " 40.8\n", - " 18.9\n", - " 208.0\n", - " 4300.0\n", + " Gentoo penguin (Pygoscelis papua)\n", + " Biscoe\n", + " 48.2\n", + " 15.6\n", + " 221.0\n", + " 5100.0\n", " MALE\n", " \n", " \n", " 11\n", " Adelie Penguin (Pygoscelis adeliae)\n", " Dream\n", - " 39.0\n", - " 18.7\n", - " 185.0\n", - " 3650.0\n", - " MALE\n", + " 38.1\n", + " 18.6\n", + " 190.0\n", + " 3700.0\n", + " FEMALE\n", " \n", " \n", " 12\n", - " Adelie Penguin (Pygoscelis adeliae)\n", - " Dream\n", - " 37.0\n", - " 16.9\n", - " 185.0\n", - " 3000.0\n", - " FEMALE\n", + " Gentoo penguin (Pygoscelis papua)\n", + " Biscoe\n", + " 50.7\n", + " 15.0\n", + " 223.0\n", + " 5550.0\n", + " MALE\n", " \n", " \n", " 13\n", - " Chinstrap penguin (Pygoscelis antarctica)\n", - " Dream\n", - " 47.0\n", - " 17.3\n", - " 185.0\n", - " 3700.0\n", - " FEMALE\n", + " Adelie Penguin (Pygoscelis adeliae)\n", + " Biscoe\n", + " 37.8\n", + " 20.0\n", + " 190.0\n", + " 
4250.0\n", + " MALE\n", " \n", " \n", " 14\n", " Adelie Penguin (Pygoscelis adeliae)\n", - " Dream\n", - " 34.0\n", - " 17.1\n", - " 185.0\n", - " 3400.0\n", + " Biscoe\n", + " 35.0\n", + " 17.9\n", + " 190.0\n", + " 3450.0\n", " FEMALE\n", " \n", " \n", " 15\n", - " Adelie Penguin (Pygoscelis adeliae)\n", - " Dream\n", - " 37.0\n", - " 16.5\n", - " 185.0\n", - " 3400.0\n", - " FEMALE\n", + " Gentoo penguin (Pygoscelis papua)\n", + " Biscoe\n", + " 48.7\n", + " 15.7\n", + " 208.0\n", + " 5350.0\n", + " MALE\n", " \n", " \n", " 16\n", - " Chinstrap penguin (Pygoscelis antarctica)\n", - " Dream\n", - " 45.7\n", - " 17.3\n", - " 193.0\n", - " 3600.0\n", - " FEMALE\n", + " Adelie Penguin (Pygoscelis adeliae)\n", + " Torgersen\n", + " 34.6\n", + " 21.1\n", + " 198.0\n", + " 4400.0\n", + " MALE\n", " \n", " \n", " 17\n", - " Chinstrap penguin (Pygoscelis antarctica)\n", - " Dream\n", - " 50.6\n", - " 19.4\n", - " 193.0\n", - " 3800.0\n", + " Gentoo penguin (Pygoscelis papua)\n", + " Biscoe\n", + " 46.8\n", + " 15.4\n", + " 215.0\n", + " 5150.0\n", " MALE\n", " \n", " \n", " 18\n", - " Adelie Penguin (Pygoscelis adeliae)\n", + " Chinstrap penguin (Pygoscelis antarctica)\n", " Dream\n", - " 39.7\n", - " 17.9\n", - " 193.0\n", - " 4250.0\n", + " 50.3\n", + " 20.0\n", + " 197.0\n", + " 3300.0\n", " MALE\n", " \n", " \n", " 19\n", " Adelie Penguin (Pygoscelis adeliae)\n", " Dream\n", - " 37.8\n", + " 37.2\n", " 18.1\n", - " 193.0\n", - " 3750.0\n", + " 178.0\n", + " 3900.0\n", " MALE\n", " \n", " \n", " 20\n", " Chinstrap penguin (Pygoscelis antarctica)\n", " Dream\n", - " 46.6\n", - " 17.8\n", - " 193.0\n", - " 3800.0\n", - " FEMALE\n", + " 51.0\n", + " 18.8\n", + " 203.0\n", + " 4100.0\n", + " MALE\n", " \n", " \n", " 21\n", - " Chinstrap penguin (Pygoscelis antarctica)\n", - " Dream\n", - " 51.3\n", - " 19.2\n", - " 193.0\n", - " 3650.0\n", - " MALE\n", + " Adelie Penguin (Pygoscelis adeliae)\n", + " Biscoe\n", + " 40.5\n", + " 17.9\n", + " 187.0\n", + " 3200.0\n", + " 
FEMALE\n", " \n", " \n", " 22\n", - " Adelie Penguin (Pygoscelis adeliae)\n", - " Dream\n", - " 40.2\n", - " 17.1\n", - " 193.0\n", - " 3400.0\n", + " Gentoo penguin (Pygoscelis papua)\n", + " Biscoe\n", + " 45.5\n", + " 13.9\n", + " 210.0\n", + " 4200.0\n", " FEMALE\n", " \n", " \n", " 23\n", " Adelie Penguin (Pygoscelis adeliae)\n", " Dream\n", - " 36.8\n", + " 42.2\n", " 18.5\n", - " 193.0\n", - " 3500.0\n", + " 180.0\n", + " 3550.0\n", " FEMALE\n", " \n", " \n", " 24\n", " Chinstrap penguin (Pygoscelis antarctica)\n", " Dream\n", - " 49.6\n", - " 18.2\n", - " 193.0\n", + " 51.7\n", + " 20.3\n", + " 194.0\n", " 3775.0\n", " MALE\n", " \n", @@ -366,65 +334,65 @@ "[344 rows x 7 columns in total]" ], "text/plain": [ - " species island culmen_length_mm \\\n", - "0 Adelie Penguin (Pygoscelis adeliae) Dream 36.6 \n", - "1 Adelie Penguin (Pygoscelis adeliae) Dream 39.8 \n", - "2 Adelie Penguin (Pygoscelis adeliae) Dream 40.9 \n", - "3 Chinstrap penguin (Pygoscelis antarctica) Dream 46.5 \n", - "4 Adelie Penguin (Pygoscelis adeliae) Dream 37.3 \n", - "5 Adelie Penguin (Pygoscelis adeliae) Dream 43.2 \n", - "6 Chinstrap penguin (Pygoscelis antarctica) Dream 46.9 \n", - "7 Chinstrap penguin (Pygoscelis antarctica) Dream 50.5 \n", - "8 Chinstrap penguin (Pygoscelis antarctica) Dream 49.5 \n", - "9 Adelie Penguin (Pygoscelis adeliae) Dream 40.2 \n", - "10 Adelie Penguin (Pygoscelis adeliae) Dream 40.8 \n", - "11 Adelie Penguin (Pygoscelis adeliae) Dream 39.0 \n", - "12 Adelie Penguin (Pygoscelis adeliae) Dream 37.0 \n", - "13 Chinstrap penguin (Pygoscelis antarctica) Dream 47.0 \n", - "14 Adelie Penguin (Pygoscelis adeliae) Dream 34.0 \n", - "15 Adelie Penguin (Pygoscelis adeliae) Dream 37.0 \n", - "16 Chinstrap penguin (Pygoscelis antarctica) Dream 45.7 \n", - "17 Chinstrap penguin (Pygoscelis antarctica) Dream 50.6 \n", - "18 Adelie Penguin (Pygoscelis adeliae) Dream 39.7 \n", - "19 Adelie Penguin (Pygoscelis adeliae) Dream 37.8 \n", - "20 Chinstrap penguin (Pygoscelis 
antarctica) Dream 46.6 \n", - "21 Chinstrap penguin (Pygoscelis antarctica) Dream 51.3 \n", - "22 Adelie Penguin (Pygoscelis adeliae) Dream 40.2 \n", - "23 Adelie Penguin (Pygoscelis adeliae) Dream 36.8 \n", - "24 Chinstrap penguin (Pygoscelis antarctica) Dream 49.6 \n", + " species island culmen_length_mm \\\n", + "0 Adelie Penguin (Pygoscelis adeliae) Biscoe 40.1 \n", + "1 Adelie Penguin (Pygoscelis adeliae) Torgersen 39.1 \n", + "2 Gentoo penguin (Pygoscelis papua) Biscoe 47.4 \n", + "3 Chinstrap penguin (Pygoscelis antarctica) Dream 42.5 \n", + "4 Adelie Penguin (Pygoscelis adeliae) Biscoe 43.2 \n", + "5 Gentoo penguin (Pygoscelis papua) Biscoe 46.7 \n", + "6 Adelie Penguin (Pygoscelis adeliae) Biscoe 41.3 \n", + "7 Gentoo penguin (Pygoscelis papua) Biscoe 45.2 \n", + "8 Gentoo penguin (Pygoscelis papua) Biscoe 46.5 \n", + "9 Gentoo penguin (Pygoscelis papua) Biscoe 50.5 \n", + "10 Gentoo penguin (Pygoscelis papua) Biscoe 48.2 \n", + "11 Adelie Penguin (Pygoscelis adeliae) Dream 38.1 \n", + "12 Gentoo penguin (Pygoscelis papua) Biscoe 50.7 \n", + "13 Adelie Penguin (Pygoscelis adeliae) Biscoe 37.8 \n", + "14 Adelie Penguin (Pygoscelis adeliae) Biscoe 35.0 \n", + "15 Gentoo penguin (Pygoscelis papua) Biscoe 48.7 \n", + "16 Adelie Penguin (Pygoscelis adeliae) Torgersen 34.6 \n", + "17 Gentoo penguin (Pygoscelis papua) Biscoe 46.8 \n", + "18 Chinstrap penguin (Pygoscelis antarctica) Dream 50.3 \n", + "19 Adelie Penguin (Pygoscelis adeliae) Dream 37.2 \n", + "20 Chinstrap penguin (Pygoscelis antarctica) Dream 51.0 \n", + "21 Adelie Penguin (Pygoscelis adeliae) Biscoe 40.5 \n", + "22 Gentoo penguin (Pygoscelis papua) Biscoe 45.5 \n", + "23 Adelie Penguin (Pygoscelis adeliae) Dream 42.2 \n", + "24 Chinstrap penguin (Pygoscelis antarctica) Dream 51.7 \n", "\n", " culmen_depth_mm flipper_length_mm body_mass_g sex \n", - "0 18.4 184.0 3475.0 FEMALE \n", - "1 19.1 184.0 4650.0 MALE \n", - "2 18.9 184.0 3900.0 MALE \n", - "3 17.9 192.0 3500.0 FEMALE \n", - "4 16.8 192.0 
3000.0 FEMALE \n", - "5 18.5 192.0 4100.0 MALE \n", - "6 16.6 192.0 2700.0 FEMALE \n", - "7 18.4 200.0 3400.0 FEMALE \n", - "8 19.0 200.0 3800.0 MALE \n", - "9 20.1 200.0 3975.0 MALE \n", - "10 18.9 208.0 4300.0 MALE \n", - "11 18.7 185.0 3650.0 MALE \n", - "12 16.9 185.0 3000.0 FEMALE \n", - "13 17.3 185.0 3700.0 FEMALE \n", - "14 17.1 185.0 3400.0 FEMALE \n", - "15 16.5 185.0 3400.0 FEMALE \n", - "16 17.3 193.0 3600.0 FEMALE \n", - "17 19.4 193.0 3800.0 MALE \n", - "18 17.9 193.0 4250.0 MALE \n", - "19 18.1 193.0 3750.0 MALE \n", - "20 17.8 193.0 3800.0 FEMALE \n", - "21 19.2 193.0 3650.0 MALE \n", - "22 17.1 193.0 3400.0 FEMALE \n", - "23 18.5 193.0 3500.0 FEMALE \n", - "24 18.2 193.0 3775.0 MALE \n", + "0 18.9 188.0 4300.0 MALE \n", + "1 18.7 181.0 3750.0 MALE \n", + "2 14.6 212.0 4725.0 FEMALE \n", + "3 16.7 187.0 3350.0 FEMALE \n", + "4 19.0 197.0 4775.0 MALE \n", + "5 15.3 219.0 5200.0 MALE \n", + "6 21.1 195.0 4400.0 MALE \n", + "7 13.8 215.0 4750.0 FEMALE \n", + "8 13.5 210.0 4550.0 FEMALE \n", + "9 15.2 216.0 5000.0 FEMALE \n", + "10 15.6 221.0 5100.0 MALE \n", + "11 18.6 190.0 3700.0 FEMALE \n", + "12 15.0 223.0 5550.0 MALE \n", + "13 20.0 190.0 4250.0 MALE \n", + "14 17.9 190.0 3450.0 FEMALE \n", + "15 15.7 208.0 5350.0 MALE \n", + "16 21.1 198.0 4400.0 MALE \n", + "17 15.4 215.0 5150.0 MALE \n", + "18 20.0 197.0 3300.0 MALE \n", + "19 18.1 178.0 3900.0 MALE \n", + "20 18.8 203.0 4100.0 MALE \n", + "21 17.9 187.0 3200.0 FEMALE \n", + "22 13.9 210.0 4200.0 FEMALE \n", + "23 18.5 180.0 3550.0 FEMALE \n", + "24 20.3 194.0 3775.0 MALE \n", "...\n", "\n", "[344 rows x 7 columns]" ] }, - "execution_count": 12, + "execution_count": 1, "metadata": {}, "output_type": "execute_result" } @@ -450,32 +418,16 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 2, "metadata": {}, "outputs": [ { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "a9ad907fa6e64a61a9dce420bc7d2beb", - "version_major": 2, - "version_minor": 0 - 
}, - "text/plain": [ - "HTML(value='Query job 3537a10a-641a-4d40-ae47-449c641b1bc5 is DONE. 28.9 kB processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job 34101409-7c65-4045-ad52-c6ba24dc9cbb is DONE. 31.7 kB processed. " ] }, "metadata": {}, @@ -483,13 +435,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "514e68d5b0b4452a9ccdff947848541a", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job 2af0b0d6-c11b-499e-8d25-a2c628b2853b is DONE. 28.9 kB processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job 74190ac2-21a2-47b0-bc21-ef5373565f17 is DONE. 0 Bytes processed. " ] }, "metadata": {}, @@ -527,294 +477,294 @@ " \n", " \n", " 0\n", - " Dream\n", - " 36.6\n", - " 18.4\n", - " 184.0\n", - " 3475.0\n", - " FEMALE\n", - " \n", - " \n", - " 1\n", - " Dream\n", - " 39.8\n", - " 19.1\n", - " 184.0\n", - " 4650.0\n", + " Biscoe\n", + " 40.1\n", + " 18.9\n", + " 188.0\n", + " 4300.0\n", " MALE\n", " \n", " \n", - " 2\n", - " Dream\n", - " 40.9\n", - " 18.9\n", - " 184.0\n", - " 3900.0\n", + " 1\n", + " Torgersen\n", + " 39.1\n", + " 18.7\n", + " 181.0\n", + " 3750.0\n", " MALE\n", " \n", " \n", " 4\n", - " Dream\n", - " 37.3\n", - " 16.8\n", - " 192.0\n", - " 3000.0\n", - " FEMALE\n", - " \n", - " \n", - " 5\n", - " Dream\n", + " Biscoe\n", " 43.2\n", - " 18.5\n", - " 192.0\n", - " 4100.0\n", - " MALE\n", - " \n", - " \n", - " 9\n", - " Dream\n", - " 40.2\n", - " 20.1\n", - " 200.0\n", - " 3975.0\n", + " 19.0\n", + " 197.0\n", + " 4775.0\n", " MALE\n", " \n", " \n", - " 10\n", - " Dream\n", - " 40.8\n", - " 18.9\n", - " 208.0\n", - " 4300.0\n", + " 6\n", + " Biscoe\n", + " 41.3\n", + " 21.1\n", + " 195.0\n", + " 4400.0\n", " MALE\n", " \n", " \n", " 11\n", " Dream\n", - " 39.0\n", - " 18.7\n", - " 185.0\n", - " 3650.0\n", - " MALE\n", - " \n", - " \n", - " 12\n", - " Dream\n", - " 37.0\n", - " 16.9\n", - " 185.0\n", - " 3000.0\n", + " 38.1\n", + " 18.6\n", + " 190.0\n", + " 3700.0\n", " 
FEMALE\n", " \n", " \n", - " 14\n", - " Dream\n", - " 34.0\n", - " 17.1\n", - " 185.0\n", - " 3400.0\n", - " FEMALE\n", + " 13\n", + " Biscoe\n", + " 37.8\n", + " 20.0\n", + " 190.0\n", + " 4250.0\n", + " MALE\n", " \n", " \n", - " 15\n", - " Dream\n", - " 37.0\n", - " 16.5\n", - " 185.0\n", - " 3400.0\n", + " 14\n", + " Biscoe\n", + " 35.0\n", + " 17.9\n", + " 190.0\n", + " 3450.0\n", " FEMALE\n", " \n", " \n", - " 18\n", - " Dream\n", - " 39.7\n", - " 17.9\n", - " 193.0\n", - " 4250.0\n", + " 16\n", + " Torgersen\n", + " 34.6\n", + " 21.1\n", + " 198.0\n", + " 4400.0\n", " MALE\n", " \n", " \n", " 19\n", " Dream\n", - " 37.8\n", + " 37.2\n", " 18.1\n", - " 193.0\n", - " 3750.0\n", + " 178.0\n", + " 3900.0\n", " MALE\n", " \n", " \n", - " 22\n", - " Dream\n", - " 40.2\n", - " 17.1\n", - " 193.0\n", - " 3400.0\n", + " 21\n", + " Biscoe\n", + " 40.5\n", + " 17.9\n", + " 187.0\n", + " 3200.0\n", " FEMALE\n", " \n", " \n", " 23\n", " Dream\n", - " 36.8\n", + " 42.2\n", " 18.5\n", - " 193.0\n", - " 3500.0\n", + " 180.0\n", + " 3550.0\n", " FEMALE\n", " \n", " \n", - " 26\n", + " 30\n", " Dream\n", - " 41.5\n", - " 18.5\n", - " 201.0\n", - " 4000.0\n", + " 39.2\n", + " 21.1\n", + " 196.0\n", + " 4150.0\n", " MALE\n", " \n", " \n", - " 31\n", - " Dream\n", - " 33.1\n", - " 16.1\n", - " 178.0\n", - " 2900.0\n", - " FEMALE\n", + " 32\n", + " Torgersen\n", + " 42.9\n", + " 17.6\n", + " 196.0\n", + " 4700.0\n", + " MALE\n", " \n", " \n", - " 32\n", + " 38\n", " Dream\n", - " 37.2\n", - " 18.1\n", - " 178.0\n", + " 41.1\n", + " 17.5\n", + " 190.0\n", " 3900.0\n", " MALE\n", " \n", " \n", - " 33\n", - " Dream\n", - " 39.5\n", - " 16.7\n", - " 178.0\n", - " 3250.0\n", + " 40\n", + " Torgersen\n", + " 38.6\n", + " 21.2\n", + " 191.0\n", + " 3800.0\n", + " MALE\n", + " \n", + " \n", + " 42\n", + " Biscoe\n", + " 35.5\n", + " 16.2\n", + " 195.0\n", + " 3350.0\n", " FEMALE\n", " \n", " \n", - " 35\n", + " 44\n", " Dream\n", - " 36.0\n", - " 18.5\n", + " 39.2\n", + " 18.6\n", + " 
190.0\n", + " 4250.0\n", + " MALE\n", + " \n", + " \n", + " 45\n", + " Torgersen\n", + " 35.2\n", + " 15.9\n", " 186.0\n", - " 3100.0\n", + " 3050.0\n", " FEMALE\n", " \n", " \n", - " 36\n", + " 46\n", " Dream\n", + " 43.2\n", + " 18.5\n", + " 192.0\n", + " 4100.0\n", + " MALE\n", + " \n", + " \n", + " 49\n", + " Biscoe\n", " 39.6\n", - " 18.1\n", + " 17.7\n", " 186.0\n", - " 4450.0\n", - " MALE\n", + " 3500.0\n", + " FEMALE\n", " \n", " \n", - " 38\n", - " Dream\n", - " 41.3\n", + " 53\n", + " Biscoe\n", + " 45.6\n", " 20.3\n", - " 194.0\n", - " 3550.0\n", + " 191.0\n", + " 4600.0\n", " MALE\n", " \n", " \n", - " 41\n", - " Dream\n", - " 35.7\n", - " 18.0\n", - " 202.0\n", - " 3550.0\n", + " 58\n", + " Torgersen\n", + " 40.9\n", + " 16.8\n", + " 191.0\n", + " 3700.0\n", " FEMALE\n", " \n", " \n", - " 51\n", - " Dream\n", - " 38.1\n", - " 17.6\n", - " 187.0\n", - " 3425.0\n", + " 60\n", + " Torgersen\n", + " 40.3\n", + " 18.0\n", + " 195.0\n", + " 3250.0\n", " FEMALE\n", " \n", " \n", - " 53\n", + " 62\n", " Dream\n", " 36.0\n", - " 17.1\n", - " 187.0\n", - " 3700.0\n", + " 18.5\n", + " 186.0\n", + " 3100.0\n", " FEMALE\n", " \n", + " \n", + " 63\n", + " Torgersen\n", + " 39.3\n", + " 20.6\n", + " 190.0\n", + " 3650.0\n", + " MALE\n", + " \n", " \n", "\n", "

25 rows × 6 columns

\n", "[146 rows x 6 columns in total]" ], "text/plain": [ - " island culmen_length_mm culmen_depth_mm flipper_length_mm body_mass_g \\\n", - "0 Dream 36.6 18.4 184.0 3475.0 \n", - "1 Dream 39.8 19.1 184.0 4650.0 \n", - "2 Dream 40.9 18.9 184.0 3900.0 \n", - "4 Dream 37.3 16.8 192.0 3000.0 \n", - "5 Dream 43.2 18.5 192.0 4100.0 \n", - "9 Dream 40.2 20.1 200.0 3975.0 \n", - "10 Dream 40.8 18.9 208.0 4300.0 \n", - "11 Dream 39.0 18.7 185.0 3650.0 \n", - "12 Dream 37.0 16.9 185.0 3000.0 \n", - "14 Dream 34.0 17.1 185.0 3400.0 \n", - "15 Dream 37.0 16.5 185.0 3400.0 \n", - "18 Dream 39.7 17.9 193.0 4250.0 \n", - "19 Dream 37.8 18.1 193.0 3750.0 \n", - "22 Dream 40.2 17.1 193.0 3400.0 \n", - "23 Dream 36.8 18.5 193.0 3500.0 \n", - "26 Dream 41.5 18.5 201.0 4000.0 \n", - "31 Dream 33.1 16.1 178.0 2900.0 \n", - "32 Dream 37.2 18.1 178.0 3900.0 \n", - "33 Dream 39.5 16.7 178.0 3250.0 \n", - "35 Dream 36.0 18.5 186.0 3100.0 \n", - "36 Dream 39.6 18.1 186.0 4450.0 \n", - "38 Dream 41.3 20.3 194.0 3550.0 \n", - "41 Dream 35.7 18.0 202.0 3550.0 \n", - "51 Dream 38.1 17.6 187.0 3425.0 \n", - "53 Dream 36.0 17.1 187.0 3700.0 \n", + " island culmen_length_mm culmen_depth_mm flipper_length_mm \\\n", + "0 Biscoe 40.1 18.9 188.0 \n", + "1 Torgersen 39.1 18.7 181.0 \n", + "4 Biscoe 43.2 19.0 197.0 \n", + "6 Biscoe 41.3 21.1 195.0 \n", + "11 Dream 38.1 18.6 190.0 \n", + "13 Biscoe 37.8 20.0 190.0 \n", + "14 Biscoe 35.0 17.9 190.0 \n", + "16 Torgersen 34.6 21.1 198.0 \n", + "19 Dream 37.2 18.1 178.0 \n", + "21 Biscoe 40.5 17.9 187.0 \n", + "23 Dream 42.2 18.5 180.0 \n", + "30 Dream 39.2 21.1 196.0 \n", + "32 Torgersen 42.9 17.6 196.0 \n", + "38 Dream 41.1 17.5 190.0 \n", + "40 Torgersen 38.6 21.2 191.0 \n", + "42 Biscoe 35.5 16.2 195.0 \n", + "44 Dream 39.2 18.6 190.0 \n", + "45 Torgersen 35.2 15.9 186.0 \n", + "46 Dream 43.2 18.5 192.0 \n", + "49 Biscoe 39.6 17.7 186.0 \n", + "53 Biscoe 45.6 20.3 191.0 \n", + "58 Torgersen 40.9 16.8 191.0 \n", + "60 Torgersen 40.3 18.0 195.0 \n", + "62 
Dream 36.0 18.5 186.0 \n", + "63 Torgersen 39.3 20.6 190.0 \n", "\n", - " sex \n", - "0 FEMALE \n", - "1 MALE \n", - "2 MALE \n", - "4 FEMALE \n", - "5 MALE \n", - "9 MALE \n", - "10 MALE \n", - "11 MALE \n", - "12 FEMALE \n", - "14 FEMALE \n", - "15 FEMALE \n", - "18 MALE \n", - "19 MALE \n", - "22 FEMALE \n", - "23 FEMALE \n", - "26 MALE \n", - "31 FEMALE \n", - "32 MALE \n", - "33 FEMALE \n", - "35 FEMALE \n", - "36 MALE \n", - "38 MALE \n", - "41 FEMALE \n", - "51 FEMALE \n", - "53 FEMALE \n", + " body_mass_g sex \n", + "0 4300.0 MALE \n", + "1 3750.0 MALE \n", + "4 4775.0 MALE \n", + "6 4400.0 MALE \n", + "11 3700.0 FEMALE \n", + "13 4250.0 MALE \n", + "14 3450.0 FEMALE \n", + "16 4400.0 MALE \n", + "19 3900.0 MALE \n", + "21 3200.0 FEMALE \n", + "23 3550.0 FEMALE \n", + "30 4150.0 MALE \n", + "32 4700.0 MALE \n", + "38 3900.0 MALE \n", + "40 3800.0 MALE \n", + "42 3350.0 FEMALE \n", + "44 4250.0 MALE \n", + "45 3050.0 FEMALE \n", + "46 4100.0 MALE \n", + "49 3500.0 FEMALE \n", + "53 4600.0 MALE \n", + "58 3700.0 FEMALE \n", + "60 3250.0 FEMALE \n", + "62 3100.0 FEMALE \n", + "63 3650.0 MALE \n", "...\n", "\n", "[146 rows x 6 columns]" ] }, - "execution_count": 13, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } @@ -843,18 +793,16 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 3, "metadata": {}, "outputs": [ { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "81f9aa34c7234bd88b6b7a4bc77d4b4e", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job 0808457b-a0df-4a37-b7a5-8885f4a4588c is DONE. 28.9 kB processed.
Open Job" + ], "text/plain": [ - "HTML(value='Query job 288f0daa-a51e-45b4-86bf-d054467c4a99 is DONE. 28.9 kB processed. " ] }, "metadata": {}, @@ -881,7 +829,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -897,7 +845,7 @@ " ('linreg', LinearRegression(fit_intercept=False))])" ] }, - "execution_count": 15, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -936,9 +884,63 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 5, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "Query job e9bfa6a5-a53f-4d8b-ae8c-cc8cd55d0947 is DONE. 28.9 kB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job d8d553cf-3d36-49aa-b18b-9a05576a1fb0 is DONE. 28.9 kB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 75ef0083-9a4f-4ffb-a6c6-d82974a1659f is DONE. 0 Bytes processed. 
Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "Pipeline(steps=[('preproc',\n", + " ColumnTransformer(transformers=[('onehot', OneHotEncoder(),\n", + " ['island', 'species', 'sex']),\n", + " ('scaler', StandardScaler(),\n", + " ['culmen_depth_mm',\n", + " 'culmen_length_mm',\n", + " 'flipper_length_mm'])])),\n", + " ('linreg', LinearRegression(fit_intercept=False))])" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "pipeline.fit(X_train, y_train)" ] @@ -953,18 +955,16 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 6, "metadata": {}, "outputs": [ { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "fcf406d36c0d4915b318cd30c0f3df25", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job 55c5a9ce-8159-4a1a-99a4-af3a906640ba is DONE. 29.3 kB processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job 81196f97-304b-4d77-bb0f-8fc8adb8fe75 is RUNNING. " ] }, "metadata": {}, @@ -972,13 +972,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "41399a6b1d4f45328bacc6c868cefdf6", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job 3e41c470-de70-4f13-89d9-c5564d0b2836 is DONE. 232 Bytes processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job b417f27a-387d-4eb2-8d6d-287327ef0471 is DONE. 232 Bytes processed. " ] }, "metadata": {}, @@ -986,13 +984,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "e3c17676eab448c0942c0c32689ba4b5", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job ed2f9042-a737-4d13-bd21-8c3d29cd61a2 is DONE. 28.9 kB processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job b7f89a61-d76a-47be-8b83-917d69f255a2 is DONE. 31.7 kB processed. 
" ] }, "metadata": {}, @@ -1000,13 +996,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "6c903861564b412aad9d9decad26560c", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job 815d16b5-0a5d-42be-a766-1cff5b8f22f2 is DONE. 28.9 kB processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job 9619c393-90b3-4fea-a197-d09389e9486c is DONE. 31.7 kB processed. " ] }, "metadata": {}, @@ -1014,13 +1008,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "2c2534cd90e64c81be45753b81b1be46", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job 37a38dc6-5073-4544-a1e3-da145a843922 is DONE. 29.4 kB processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job e5854451-ffb4-4a28-a25f-3bdd68e9edae is DONE. 32.2 kB processed. " ] }, "metadata": {}, @@ -1029,10 +1021,10 @@ { "data": { "text/plain": [ - "0.6757452736197735" + "0.2655729213572775" ] }, - "execution_count": 17, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -1040,9 +1032,9 @@ "source": [ "from bigframes.ml.metrics import r2_score\n", "\n", - "pred_y = pipeline.predict(X_test)\n", + "y_pred = pipeline.predict(X_test)[\"predicted_body_mass_g\"]\n", "\n", - "r2_score(y_test, pred_y)" + "r2_score(y_test, y_pred)" ] }, { @@ -1055,18 +1047,16 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 7, "metadata": {}, "outputs": [ { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "9295d6a3ff834f7a91a43d3f4ef4a61c", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Load job 7b46750c-70b4-468d-87ba-9f84f579f2a6 is DONE. Open Job" + ], "text/plain": [ - "HTML(value='Load job d4c2f933-3514-4901-bcd7-888ee66eba82 is RUNNING. 
" ] }, "metadata": {}, @@ -1097,32 +1087,16 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 8, "metadata": {}, "outputs": [ { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "b7eb82b3b5fc4a8e97468070a3e76300", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HTML(value='Query job e4ffd919-6f69-4382-a7e5-db37c7c1fefa is RUNNING. Open Job" + ], "text/plain": [ - "HTML(value='Query job 6b3e3285-79e9-4137-bf3b-7b7185ef76a5 is DONE. 24 Bytes processed. " ] }, "metadata": {}, @@ -1130,13 +1104,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "edc7bc6434bd4be4926626a235aab65a", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job 207cb787-cf8a-43ea-8e73-644d3f58b11a is DONE. 24 Bytes processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job 173c4194-e194-43d2-8359-7bec83d3c861 is DONE. 0 Bytes processed. " ] }, "metadata": {}, @@ -1144,13 +1116,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "55a8cbd9b1ab47eeab6e1c305847630f", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job c5dc5075-cac0-4947-9e9f-06aa9cc5bd2a is DONE. 0 Bytes processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job 53ba2332-590c-488d-9505-23aebaaad9cb is DONE. 48 Bytes processed. " ] }, "metadata": {}, @@ -1158,13 +1128,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "463a5b072148474db629b9346fa3a6d1", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job 2ca4a569-7186-48ed-b3e4-004dca704798 is DONE. 282 Bytes processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job 66e4a8e0-4cae-4e9d-86e0-17dc24f6cfbb is DONE. 0 Bytes processed. 
" ] }, "metadata": {}, @@ -1192,41 +1160,83 @@ " \n", " \n", " predicted_body_mass_g\n", + " species\n", + " island\n", + " culmen_length_mm\n", + " culmen_depth_mm\n", + " flipper_length_mm\n", + " sex\n", " \n", " \n", " tag_number\n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " 1633\n", - " 3965.994361\n", + " 4017.203152\n", + " Adelie Penguin (Pygoscelis adeliae)\n", + " Torgersen\n", + " 39.5\n", + " 18.8\n", + " 196.0\n", + " MALE\n", " \n", " \n", " 1672\n", - " 3246.312058\n", + " 3127.601519\n", + " Adelie Penguin (Pygoscelis adeliae)\n", + " Torgersen\n", + " 38.5\n", + " 17.2\n", + " 181.0\n", + " FEMALE\n", " \n", " \n", " 1690\n", - " 3456.404062\n", + " 3386.101231\n", + " Adelie Penguin (Pygoscelis adeliae)\n", + " Dream\n", + " 37.9\n", + " 18.1\n", + " 188.0\n", + " FEMALE\n", " \n", " \n", "\n", - "

3 rows × 1 columns

\n", - "[3 rows x 1 columns in total]" + "

3 rows × 7 columns

\n", + "[3 rows x 7 columns in total]" ], "text/plain": [ - " predicted_body_mass_g\n", - "tag_number \n", - "1633 3965.994361\n", - "1672 3246.312058\n", - "1690 3456.404062\n", + " predicted_body_mass_g species \\\n", + "tag_number \n", + "1633 4017.203152 Adelie Penguin (Pygoscelis adeliae) \n", + "1672 3127.601519 Adelie Penguin (Pygoscelis adeliae) \n", + "1690 3386.101231 Adelie Penguin (Pygoscelis adeliae) \n", + "\n", + " island culmen_length_mm culmen_depth_mm flipper_length_mm \\\n", + "tag_number \n", + "1633 Torgersen 39.5 18.8 196.0 \n", + "1672 Torgersen 38.5 17.2 181.0 \n", + "1690 Dream 37.9 18.1 188.0 \n", "\n", - "[3 rows x 1 columns]" + " sex \n", + "tag_number \n", + "1633 MALE \n", + "1672 FEMALE \n", + "1690 FEMALE \n", + "\n", + "[3 rows x 7 columns]" ] }, - "execution_count": 19, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -1240,28 +1250,53 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## 4. Save in BigQuery" + "## 6. Save in BigQuery" ] }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 9, "metadata": {}, "outputs": [ { "data": { + "text/html": [ + "Copy job d1def4a4-1da1-43a9-8ae5-4459444d993d is DONE.
Open Job" + ], "text/plain": [ - "Pipeline(steps=[('preproc',\n", - " ColumnTransformer(transformers=[('onehot', OneHotEncoder(),\n", - " ['island', 'species', 'sex']),\n", - " ('scaler', StandardScaler(),\n", - " ['culmen_depth_mm',\n", - " 'culmen_length_mm',\n", - " 'flipper_length_mm'])])),\n", - " ('linreg', LinearRegression(fit_intercept=False))])" + "" ] }, - "execution_count": 20, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "Pipeline(steps=[('transform',\n", + " ColumnTransformer(transformers=[('ont_hot_encoder',\n", + " OneHotEncoder(max_categories=1000001,\n", + " min_frequency=0),\n", + " 'island'),\n", + " ('standard_scaler',\n", + " StandardScaler(),\n", + " 'culmen_length_mm'),\n", + " ('standard_scaler',\n", + " StandardScaler(),\n", + " 'culmen_depth_mm'),\n", + " ('standard_scaler',\n", + " StandardScaler(),\n", + " 'flipper_length_mm'),\n", + " ('ont_hot_encoder',\n", + " OneHotEncoder(max_categories=1000001,\n", + " min_frequency=0),\n", + " 'sex')])),\n", + " ('estimator',\n", + " LinearRegression(fit_intercept=False,\n", + " optimize_strategy='NORMAL_EQUATION'))])" + ] + }, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -1269,6 +1304,13 @@ "source": [ "pipeline.to_gbq(\"bigframes-dev.bigframes_demo_us.penguin_model\", replace=True)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -1287,7 +1329,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.10.9" }, "orig_nbformat": 4, "vscode": { diff --git a/notebooks/remote_functions/remote_function.ipynb b/notebooks/remote_functions/remote_function.ipynb index 06be0e7293..063c1738b4 100644 --- a/notebooks/remote_functions/remote_function.ipynb +++ b/notebooks/remote_functions/remote_function.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 2, + 
"execution_count": 19, "id": "3613b1cd", "metadata": {}, "outputs": [], @@ -16,24 +16,16 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 20, "id": "f1175247", "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/google/home/shobs/code/bigframes1/venv/lib/python3.10/site-packages/google/auth/_default.py:78: UserWarning: Your application has authenticated using end user credentials from Google Cloud SDK without a quota project. You might receive a \"quota exceeded\" or \"API not enabled\" error. See the following page for troubleshooting: https://cloud.google.com/docs/authentication/adc-troubleshooting/user-creds. \n", - " warnings.warn(_CLOUD_SDK_CREDENTIALS_WARNING)\n" - ] - }, { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 25.4 s, sys: 2.5 s, total: 27.9 s\n", - "Wall time: 2min 31s\n" + "CPU times: user 2.34 s, sys: 307 ms, total: 2.65 s\n", + "Wall time: 17.8 s\n" ] }, { @@ -141,7 +133,7 @@ "9 154 Sure, but what about a solution using O(1) mem... 8" ] }, - "execution_count": 3, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -160,7 +152,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 21, "id": "fd8a04a3", "metadata": {}, "outputs": [], @@ -191,7 +183,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 22, "id": "2b5e4568", "metadata": {}, "outputs": [ @@ -199,8 +191,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 4.22 s, sys: 18.2 ms, total: 4.24 s\n", - "Wall time: 4.26 s\n" + "CPU times: user 3.32 s, sys: 0 ns, total: 3.32 s\n", + "Wall time: 3.32 s\n" ] }, { @@ -319,7 +311,7 @@ "9 154 Sure, but what about a solution using O(1) mem... 
8 19" ] }, - "execution_count": 5, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } @@ -333,65 +325,25 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 23, "id": "b81feaef", "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/google/home/shobs/code/bigframes1/venv/lib/python3.10/site-packages/google/auth/_default.py:78: UserWarning: Your application has authenticated using end user credentials from Google Cloud SDK without a quota project. You might receive a \"quota exceeded\" or \"API not enabled\" error. See the following page for troubleshooting: https://cloud.google.com/docs/authentication/adc-troubleshooting/user-creds. \n", - " warnings.warn(_CLOUD_SDK_CREDENTIALS_WARNING)\n", - "/usr/local/google/home/shobs/code/bigframes1/venv/lib/python3.10/site-packages/google/auth/_default.py:78: UserWarning: Your application has authenticated using end user credentials from Google Cloud SDK without a quota project. You might receive a \"quota exceeded\" or \"API not enabled\" error. See the following page for troubleshooting: https://cloud.google.com/docs/authentication/adc-troubleshooting/user-creds. \n", - " warnings.warn(_CLOUD_SDK_CREDENTIALS_WARNING)\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "2b1c9d671db14d2ca3be6a0b0c698430", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HTML(value='Query job 6b0a39de-40a0-4dd4-be88-248bd8ebcd77 is RUNNING. " ] }, "metadata": {}, @@ -399,13 +351,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "09706700e8dd4cf39f65a0d58371c1eb", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job a283cb39-41b1-44cd-a6c3-f2a2c6a55b25 is DONE. 17.2 GB processed. 
Open Job" + ], "text/plain": [ - "HTML(value='Query job 4c1d9d3e-be25-4818-b74d-6214164d99ab is DONE. 0 Bytes processed. " ] }, "metadata": {}, @@ -440,62 +390,62 @@ " \n", " \n", " 0\n", - " 11012908\n", - " you're welcome! according to the docs it shoul...\n", + " 11231597\n", + " In your update, why are some of the system fun...\n", " 0\n", " \n", " \n", " 1\n", - " 11013760\n", - " You *should* be concerned with the disk being ...\n", - " 0\n", + " 49684807\n", + " what you have tried so far . ??\n", + " 1\n", " \n", " \n", " 2\n", - " 11013784\n", - " have you looked at `Integrate` or `NIntegrate`?\n", + " 7623925\n", + " @Michael: It should work. Perhaps you looked i...\n", " 0\n", " \n", " \n", " 3\n", - " 11015512\n", - " sorry, is a typo. The variable name is dist. (...\n", + " 34046685\n", + " Will it work with SQL compact? Please excuse m...\n", " 0\n", " \n", " \n", " 4\n", - " 11016238\n", - " Pfff, I'm having trouble with that formula too...\n", + " 6426146\n", + " do you know the equation to your pdf?\n", " 0\n", " \n", " \n", " 5\n", - " 11016276\n", - " Thanks thinksteep! Does this mean that by usin...\n", + " 60686114\n", + " m sorry but at least you have to think about it.\n", " 0\n", " \n", " \n", " 6\n", - " 11016551\n", - " Jason, thanks for the reply. I've been workin...\n", + " 16631986\n", + " i think also making disable this by only jquer...\n", " 0\n", " \n", " \n", " 7\n", - " 11017973\n", - " I assume an `off` of 0.5 would put be exactly ...\n", + " 16498565\n", + " I am including these files on my header of the...\n", " 0\n", " \n", " \n", " 8\n", - " 11018225\n", - " Thank you very much. I do worry too much abou...\n", + " 26601001\n", + " wrong answer, you didn't understand the logic\n", " 0\n", " \n", " \n", " 9\n", - " 11018370\n", - " @IanClelland, I edited my question a bit. 
The ...\n", + " 73255842\n", + " Call the setOnClickListener before return row.\n", " 0\n", " \n", " \n", @@ -505,21 +455,21 @@ ], "text/plain": [ " id text score\n", - "0 11012908 you're welcome! according to the docs it shoul... 0\n", - "1 11013760 You *should* be concerned with the disk being ... 0\n", - "2 11013784 have you looked at `Integrate` or `NIntegrate`? 0\n", - "3 11015512 sorry, is a typo. The variable name is dist. (... 0\n", - "4 11016238 Pfff, I'm having trouble with that formula too... 0\n", - "5 11016276 Thanks thinksteep! Does this mean that by usin... 0\n", - "6 11016551 Jason, thanks for the reply. I've been workin... 0\n", - "7 11017973 I assume an `off` of 0.5 would put be exactly ... 0\n", - "8 11018225 Thank you very much. I do worry too much abou... 0\n", - "9 11018370 @IanClelland, I edited my question a bit. The ... 0\n", + "0 11231597 In your update, why are some of the system fun... 0\n", + "1 49684807 what you have tried so far . ?? 1\n", + "2 7623925 @Michael: It should work. Perhaps you looked i... 0\n", + "3 34046685 Will it work with SQL compact? Please excuse m... 0\n", + "4 6426146 do you know the equation to your pdf? 0\n", + "5 60686114 m sorry but at least you have to think about it. 0\n", + "6 16631986 i think also making disable this by only jquer... 0\n", + "7 16498565 I am including these files on my header of the... 0\n", + "8 26601001 wrong answer, you didn't understand the logic 0\n", + "9 73255842 Call the setOnClickListener before return row. 
0\n", "\n", "[10 rows x 3 columns]" ] }, - "execution_count": 6, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } @@ -539,7 +489,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 24, "id": "55ed241e", "metadata": {}, "outputs": [ @@ -549,8 +499,9 @@ "text": [ "Help on function remote_function in module bigframes.pandas:\n", "\n", - "remote_function(input_types: 'List[type]', output_type: 'type', dataset: 'Optional[str]' = None, bigquery_connection: 'Optional[str]' = None, reuse: 'bool' = True)\n", - " Decorator to turn a user defined function into a BigQuery remote function.\n", + "remote_function(input_types: 'List[type]', output_type: 'type', dataset: 'Optional[str]' = None, bigquery_connection: 'Optional[str]' = None, reuse: 'bool' = True, name: 'Optional[str]' = None, packages: 'Optional[Sequence[str]]' = None)\n", + " Decorator to turn a user defined function into a BigQuery remote function. Check out\n", + " the code samples at: https://cloud.google.com/bigquery/docs/remote-functions#bigquery-dataframes.\n", " \n", " .. note::\n", " Please make sure following is setup before using this API:\n", @@ -576,7 +527,7 @@ " * BigQuery Data Editor (roles/bigquery.dataEditor)\n", " * BigQuery Connection Admin (roles/bigquery.connectionAdmin)\n", " * Cloud Functions Developer (roles/cloudfunctions.developer)\n", - " * Service Account User (roles/iam.serviceAccountUser)\n", + " * Service Account User (roles/iam.serviceAccountUser) on the service account `PROJECT_NUMBER-compute@developer.gserviceaccount.com`\n", " * Storage Object Viewer (roles/storage.objectViewer)\n", " * Project IAM Admin (roles/resourcemanager.projectIamAdmin) (Only required if the bigquery connection being used is not pre-created and is created dynamically with user credentials.)\n", " \n", @@ -602,15 +553,25 @@ " Name of the BigQuery connection. 
You should either have the\n", " connection already created in the `location` you have chosen, or\n", " you should have the Project IAM Admin role to enable the service\n", - " to create the connection for you if you need it.If this parameter is\n", + " to create the connection for you if you need it. If this parameter is\n", " not provided then the BigQuery connection from the session is used.\n", " reuse (bool, Optional):\n", " Reuse the remote function if already exists.\n", " `True` by default, which will result in reusing an existing remote\n", - " function (if any) that was previously created for the same udf.\n", - " Setting it to false would force creating a unique remote function.\n", + " function and corresponding cloud function (if any) that was\n", + " previously created for the same udf.\n", + " Setting it to `False` would force creating a unique remote function.\n", " If the required remote function does not exist then it would be\n", " created irrespective of this param.\n", + " name (str, Optional):\n", + " Explicit name of the persisted BigQuery remote function. Use it with\n", + " caution, because two users working in the same project and dataset\n", + " could overwrite each other's remote functions if they use the same\n", + " persistent name.\n", + " packages (str[], Optional):\n", + " Explicit name of the external package dependencies. Each dependency\n", + " is added to the `requirements.txt` as is, and can be of the form\n", + " supported in https://pip.pypa.io/en/stable/reference/requirements-file-format/.\n", " Returns:\n", " callable: A remote function object pointing to the cloud assets created\n", " in the background to support the remote execution. 
The cloud assets can be\n", @@ -631,49 +592,16 @@ }, { "cell_type": "code", - "execution_count": 8, - "id": "c9a8d03d", - "metadata": {}, - "outputs": [], - "source": [ - "# BigQuery DataFrames user is a data scientist and may not have privileges to\n", - "# create a BQ connector and set it up for invoking a cloud function. They\n", - "# should get such a connector created from their cloud admin and use it with\n", - "# BigQuery DataFrames remote functions. If the provided connection name does not\n", - "# exist, BigQuery DataFrames will try to create it on the fly assuming the user\n", - "# has sufficient privileges.\n", - "bq_connection_name = 'bigframes-rf-conn'" - ] - }, - { - "cell_type": "code", - "execution_count": 9, + "execution_count": 25, "id": "fbc27f81", "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[INFO][2023-08-18 21:23:29,687][bigframes.remote_function] Creating new cloud function: gcloud functions deploy bigframes-b0feb1fbaf8188b64d7e70118d93c5d4 --gen2 --runtime=python310 --project=bigframes-dev --region=us-central1 --source=/tmp/tmpl2ewfnue --entry-point=udf_http --trigger-http --no-allow-unauthenticated\n", - "[INFO][2023-08-18 21:24:43,689][bigframes.remote_function] Successfully created cloud function bigframes-b0feb1fbaf8188b64d7e70118d93c5d4 with uri (https://bigframes-b0feb1fbaf8188b64d7e70118d93c5d4-7krlje3eoq-uc.a.run.app)\n", - "[INFO][2023-08-18 21:24:57,348][bigframes.remote_function] Connector bigframes-rf-conn already exists\n", - "[INFO][2023-08-18 21:24:57,351][bigframes.remote_function] Creating BQ remote function: \n", - " CREATE OR REPLACE FUNCTION `bigframes-dev.bigframes_temp_us`.bigframes_b0feb1fbaf8188b64d7e70118d93c5d4(n INT64)\n", - " RETURNS INT64\n", - " REMOTE WITH CONNECTION `bigframes-dev.us.bigframes-rf-conn`\n", - " OPTIONS (\n", - " endpoint = 
\"https://bigframes-b0feb1fbaf8188b64d7e70118d93c5d4-7krlje3eoq-uc.a.run.app\"\n", - " )\n", - "[INFO][2023-08-18 21:24:58,300][bigframes.remote_function] Created remote function bigframes-dev.bigframes_temp_us.bigframes_b0feb1fbaf8188b64d7e70118d93c5d4\n" - ] - }, { "name": "stdout", "output_type": "stream", "text": [ "\n", - "Wall time: 89.0601 s\n" + "Wall time: 76.2628 s\n" ] } ], @@ -684,7 +612,7 @@ "\n", "# User defined function\n", "# https://www.codespeedy.com/find-nth-prime-number-in-python/\n", - "@pd.remote_function([int], int, bigquery_connection=bq_connection_name)\n", + "@pd.remote_function([int], int, reuse=False)\n", "def nth_prime(n):\n", " prime_numbers = [2,3]\n", " i=3\n", @@ -712,7 +640,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 26, "id": "c1c9355f", "metadata": {}, "outputs": [ @@ -720,33 +648,17 @@ "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 16.8 ms, sys: 61 µs, total: 16.8 ms\n", - "Wall time: 17 ms\n" + "CPU times: user 55.8 ms, sys: 182 µs, total: 56 ms\n", + "Wall time: 54.5 ms\n" ] }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "2f840ad27c514ed19c759a004b32de33", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HTML(value='Query job 0f421233-9d02-4746-bb39-86a3b0880aba is RUNNING. Open Job" + ], "text/plain": [ - "HTML(value='Query job 4f8d5734-8070-4630-8a59-c05a31d60476 is RUNNING. " ] }, "metadata": {}, @@ -754,13 +666,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "116d6ef3d6b247d3aaafef5fe6b970de", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job c0a2c187-364d-4978-97bc-30352828f624 is DONE. 17.2 GB processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job ec057f9e-726b-44f0-a5c0-24c05c7ecfeb is RUNNING. 
" ] }, "metadata": {}, @@ -796,71 +706,71 @@ " \n", " \n", " 0\n", - " 11012908\n", - " you're welcome! according to the docs it shoul...\n", + " 11231597\n", + " In your update, why are some of the system fun...\n", " 0\n", " -1\n", " \n", " \n", " 1\n", - " 11013760\n", - " You *should* be concerned with the disk being ...\n", - " 0\n", - " -1\n", + " 49684807\n", + " what you have tried so far . ??\n", + " 1\n", + " 2\n", " \n", " \n", " 2\n", - " 11013784\n", - " have you looked at `Integrate` or `NIntegrate`?\n", + " 7623925\n", + " @Michael: It should work. Perhaps you looked i...\n", " 0\n", " -1\n", " \n", " \n", " 3\n", - " 11015512\n", - " sorry, is a typo. The variable name is dist. (...\n", + " 34046685\n", + " Will it work with SQL compact? Please excuse m...\n", " 0\n", " -1\n", " \n", " \n", " 4\n", - " 11016238\n", - " Pfff, I'm having trouble with that formula too...\n", + " 6426146\n", + " do you know the equation to your pdf?\n", " 0\n", " -1\n", " \n", " \n", " 5\n", - " 11016276\n", - " Thanks thinksteep! Does this mean that by usin...\n", + " 60686114\n", + " m sorry but at least you have to think about it.\n", " 0\n", " -1\n", " \n", " \n", " 6\n", - " 11016551\n", - " Jason, thanks for the reply. I've been workin...\n", + " 16631986\n", + " i think also making disable this by only jquer...\n", " 0\n", " -1\n", " \n", " \n", " 7\n", - " 11017973\n", - " I assume an `off` of 0.5 would put be exactly ...\n", + " 16498565\n", + " I am including these files on my header of the...\n", " 0\n", " -1\n", " \n", " \n", " 8\n", - " 11018225\n", - " Thank you very much. I do worry too much abou...\n", + " 26601001\n", + " wrong answer, you didn't understand the logic\n", " 0\n", " -1\n", " \n", " \n", " 9\n", - " 11018370\n", - " @IanClelland, I edited my question a bit. 
The ...\n", + " 73255842\n", + " Call the setOnClickListener before return row.\n", " 0\n", " -1\n", " \n", @@ -871,21 +781,21 @@ ], "text/plain": [ " id text score n_prime\n", - "0 11012908 you're welcome! according to the docs it shoul... 0 -1\n", - "1 11013760 You *should* be concerned with the disk being ... 0 -1\n", - "2 11013784 have you looked at `Integrate` or `NIntegrate`? 0 -1\n", - "3 11015512 sorry, is a typo. The variable name is dist. (... 0 -1\n", - "4 11016238 Pfff, I'm having trouble with that formula too... 0 -1\n", - "5 11016276 Thanks thinksteep! Does this mean that by usin... 0 -1\n", - "6 11016551 Jason, thanks for the reply. I've been workin... 0 -1\n", - "7 11017973 I assume an `off` of 0.5 would put be exactly ... 0 -1\n", - "8 11018225 Thank you very much. I do worry too much abou... 0 -1\n", - "9 11018370 @IanClelland, I edited my question a bit. The ... 0 -1\n", + "0 11231597 In your update, why are some of the system fun... 0 -1\n", + "1 49684807 what you have tried so far . ?? 1 2\n", + "2 7623925 @Michael: It should work. Perhaps you looked i... 0 -1\n", + "3 34046685 Will it work with SQL compact? Please excuse m... 0 -1\n", + "4 6426146 do you know the equation to your pdf? 0 -1\n", + "5 60686114 m sorry but at least you have to think about it. 0 -1\n", + "6 16631986 i think also making disable this by only jquer... 0 -1\n", + "7 16498565 I am including these files on my header of the... 0 -1\n", + "8 26601001 wrong answer, you didn't understand the logic 0 -1\n", + "9 73255842 Call the setOnClickListener before return row. 
0 -1\n", "\n", "[10 rows x 4 columns]" ] }, - "execution_count": 10, + "execution_count": 26, "metadata": {}, "output_type": "execute_result" } @@ -900,7 +810,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 27, "id": "2701cb81", "metadata": {}, "outputs": [ @@ -908,8 +818,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "bigframes-dev.bigframes_temp_us.bigframes_b0feb1fbaf8188b64d7e70118d93c5d4\n", - "projects/bigframes-dev/locations/us-central1/functions/bigframes-b0feb1fbaf8188b64d7e70118d93c5d4\n" + "shobs-test.bigframes_temp_us.bigframes_343b7b4bb93ca8747dae20c22bdaec8b_p27heyce\n", + "projects/shobs-test/locations/us-central1/functions/bigframes-343b7b4bb93ca8747dae20c22bdaec8b-p27heyce\n" ] } ], @@ -922,7 +832,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 28, "id": "920fa18e", "metadata": {}, "outputs": [ @@ -937,6 +847,42 @@ " \n", " Then it can be applied to a DataFrame or Series.\n", " \n", + " .. note::\n", + " The return type of the function must be explicitly specified in the\n", + " function's original definition even if not otherwise required.\n", + " \n", + " BigQuery Utils provides many public functions under the ``bqutil`` project on Google Cloud Platform project\n", + " (See: https://github.com/GoogleCloudPlatform/bigquery-utils/tree/master/udfs#using-the-udfs).\n", + " You can checkout Community UDFs to use community-contributed functions.\n", + " (See: https://github.com/GoogleCloudPlatform/bigquery-utils/tree/master/udfs/community#community-udfs).\n", + " \n", + " **Examples:**\n", + " \n", + " Use the ``cw_lower_case_ascii_only`` function from Community UDFs.\n", + " (https://github.com/GoogleCloudPlatform/bigquery-utils/blob/master/udfs/community/cw_lower_case_ascii_only.sqlx)\n", + " \n", + " >>> import bigframes.pandas as bpd\n", + " >>> bpd.options.display.progress_bar = 
None\n", + " \n", + " >>> df = bpd.DataFrame({'id': [1, 2, 3], 'name': ['AURÉLIE', 'CÉLESTINE', 'DAPHNÉ']})\n", + " >>> df\n", + " id name\n", + " 0 1 AURÉLIE\n", + " 1 2 CÉLESTINE\n", + " 2 3 DAPHNÉ\n", + " \n", + " [3 rows x 2 columns]\n", + " \n", + " >>> func = bpd.read_gbq_function(\"bqutil.fn.cw_lower_case_ascii_only\")\n", + " >>> df1 = df.assign(new_name=df['name'].apply(func))\n", + " >>> df1\n", + " id name new_name\n", + " 0 1 AURÉLIE aurÉlie\n", + " 1 2 CÉLESTINE cÉlestine\n", + " 2 3 DAPHNÉ daphnÉ\n", + " \n", + " [3 rows x 3 columns]\n", + " \n", " Args:\n", " function_name (str):\n", " the function's name in BigQuery in the format\n", @@ -965,7 +911,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 29, "id": "a6c9da0a", "metadata": {}, "outputs": [], @@ -978,7 +924,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 30, "id": "d7e7de7f", "metadata": {}, "outputs": [ @@ -986,33 +932,17 @@ "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 10.9 ms, sys: 0 ns, total: 10.9 ms\n", - "Wall time: 11.4 ms\n" + "CPU times: user 70.8 ms, sys: 3.49 ms, total: 74.3 ms\n", + "Wall time: 75.2 ms\n" ] }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "73d1a73593cb4115821ab128c221a48d", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HTML(value='Query job bec5f7d1-3df1-4292-8c68-c396bce7dc5d is RUNNING. Open Job" + ], "text/plain": [ - "HTML(value='Query job 02e3bf43-a387-41c7-85c7-4a5366251de7 is RUNNING. " ] }, "metadata": {}, @@ -1020,13 +950,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "faf93766ce1e489183c86a9daf5ce7d1", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Query job 4d3da7ed-42e6-4b2b-b656-ac9ef6d2e871 is DONE. 17.2 GB processed. Open Job" + ], "text/plain": [ - "HTML(value='Query job fa4329e8-2918-44c4-96c5-d8591364abc9 is RUNNING. 
" ] }, "metadata": {}, @@ -1063,80 +991,80 @@ " \n", " \n", " 0\n", - " 11012908\n", - " you're welcome! according to the docs it shoul...\n", + " 11231597\n", + " In your update, why are some of the system fun...\n", " 0\n", " -1\n", " -1\n", " \n", " \n", " 1\n", - " 11013760\n", - " You *should* be concerned with the disk being ...\n", - " 0\n", - " -1\n", - " -1\n", + " 49684807\n", + " what you have tried so far . ??\n", + " 1\n", + " 2\n", + " 2\n", " \n", " \n", " 2\n", - " 11013784\n", - " have you looked at `Integrate` or `NIntegrate`?\n", + " 7623925\n", + " @Michael: It should work. Perhaps you looked i...\n", " 0\n", " -1\n", " -1\n", " \n", " \n", " 3\n", - " 11015512\n", - " sorry, is a typo. The variable name is dist. (...\n", + " 34046685\n", + " Will it work with SQL compact? Please excuse m...\n", " 0\n", " -1\n", " -1\n", " \n", " \n", " 4\n", - " 11016238\n", - " Pfff, I'm having trouble with that formula too...\n", + " 6426146\n", + " do you know the equation to your pdf?\n", " 0\n", " -1\n", " -1\n", " \n", " \n", " 5\n", - " 11016276\n", - " Thanks thinksteep! Does this mean that by usin...\n", + " 60686114\n", + " m sorry but at least you have to think about it.\n", " 0\n", " -1\n", " -1\n", " \n", " \n", " 6\n", - " 11016551\n", - " Jason, thanks for the reply. I've been workin...\n", + " 16631986\n", + " i think also making disable this by only jquer...\n", " 0\n", " -1\n", " -1\n", " \n", " \n", " 7\n", - " 11017973\n", - " I assume an `off` of 0.5 would put be exactly ...\n", + " 16498565\n", + " I am including these files on my header of the...\n", " 0\n", " -1\n", " -1\n", " \n", " \n", " 8\n", - " 11018225\n", - " Thank you very much. I do worry too much abou...\n", + " 26601001\n", + " wrong answer, you didn't understand the logic\n", " 0\n", " -1\n", " -1\n", " \n", " \n", " 9\n", - " 11018370\n", - " @IanClelland, I edited my question a bit. 
The ...\n", + " 73255842\n", + " Call the setOnClickListener before return row.\n", " 0\n", " -1\n", " -1\n", @@ -1148,20 +1076,20 @@ ], "text/plain": [ " id text score \\\n", - "0 11012908 you're welcome! according to the docs it shoul... 0 \n", - "1 11013760 You *should* be concerned with the disk being ... 0 \n", - "2 11013784 have you looked at `Integrate` or `NIntegrate`? 0 \n", - "3 11015512 sorry, is a typo. The variable name is dist. (... 0 \n", - "4 11016238 Pfff, I'm having trouble with that formula too... 0 \n", - "5 11016276 Thanks thinksteep! Does this mean that by usin... 0 \n", - "6 11016551 Jason, thanks for the reply. I've been workin... 0 \n", - "7 11017973 I assume an `off` of 0.5 would put be exactly ... 0 \n", - "8 11018225 Thank you very much. I do worry too much abou... 0 \n", - "9 11018370 @IanClelland, I edited my question a bit. The ... 0 \n", + "0 11231597 In your update, why are some of the system fun... 0 \n", + "1 49684807 what you have tried so far . ?? 1 \n", + "2 7623925 @Michael: It should work. Perhaps you looked i... 0 \n", + "3 34046685 Will it work with SQL compact? Please excuse m... 0 \n", + "4 6426146 do you know the equation to your pdf? 0 \n", + "5 60686114 m sorry but at least you have to think about it. 0 \n", + "6 16631986 i think also making disable this by only jquer... 0 \n", + "7 16498565 I am including these files on my header of the... 0 \n", + "8 26601001 wrong answer, you didn't understand the logic 0 \n", + "9 73255842 Call the setOnClickListener before return row. 
0 \n", "\n", " n_prime n_prime_again \n", "0 -1 -1 \n", - "1 -1 -1 \n", + "1 2 2 \n", "2 -1 -1 \n", "3 -1 -1 \n", "4 -1 -1 \n", @@ -1174,7 +1102,7 @@ "[10 rows x 5 columns]" ] }, - "execution_count": 15, + "execution_count": 30, "metadata": {}, "output_type": "execute_result" } @@ -1186,6 +1114,38 @@ "df = df.assign(n_prime_again=df['score'].apply(nth_prime_existing))\n", "df.head(10)" ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "bafab950", + "metadata": {}, + "outputs": [], + "source": [ + "# Clean up GCP assets created as part of bigframes remote_function\n", + "def cleanup_remote_function_assets(remote_udf, ignore_failures=False):\n", + " \"\"\"Clean up the GCP assets behind a bigframes remote function.\"\"\"\n", + "\n", + " session = pd.get_global_session()\n", + "\n", + " # Clean up BQ remote function\n", + " try:\n", + " session.bqclient.delete_routine(remote_udf.bigframes_remote_function)\n", + " except Exception:\n", + " # By default don't raise exception in cleanup\n", + " if not ignore_failures:\n", + " raise\n", + "\n", + " # Clean up cloud function\n", + " try:\n", + " session.cloudfunctionsclient.delete_function(name=remote_udf.bigframes_cloud_function)\n", + " except Exception:\n", + " # By default don't raise exception in cleanup\n", + " if not ignore_failures:\n", + " raise\n", + "\n", + "cleanup_remote_function_assets(nth_prime)" + ] } ], "metadata": { diff --git a/noxfile.py b/noxfile.py index da61232fc7..3b10a37fc7 100644 --- a/noxfile.py +++ b/noxfile.py @@ -228,6 +228,7 @@ def mypy(session): "types-python-dateutil", "types-requests", "types-setuptools", + "types-tabulate", ] ) | set(SYSTEM_TEST_STANDARD_DEPENDENCIES) @@ -494,6 +495,11 @@ def prerelease(session: nox.sessions.Session, tests_path): CURRENT_DIRECTORY / "testing" / f"constraints-{session.python}.txt" ) + # Ignore officially released versions of certain packages specified in + # testing/constraints-*.txt and install a more recent, pre-release versions + # 
directly + already_installed = set() + # PyArrow prerelease packages are published to an alternative PyPI host. # https://arrow.apache.org/docs/python/install.html#installing-nightly-packages session.install( @@ -504,6 +510,8 @@ def prerelease(session: nox.sessions.Session, tests_path): "--upgrade", "pyarrow", ) + already_installed.add("pyarrow") + session.install( "--extra-index-url", "https://pypi.anaconda.org/scipy-wheels-nightly/simple", @@ -512,16 +520,47 @@ def prerelease(session: nox.sessions.Session, tests_path): "--upgrade", "pandas", ) + already_installed.add("pandas") + + # TODO(shobs): + # Commit https://github.com/ibis-project/ibis/commit/c20ba7feab6bdea6c299721310e04dbc10551cc2 + # introduced breaking change that removed the following: + # ibis.expr.rules.column + # ibis.expr.rules.value + # ibis.expr.rules.any + # Let's exclude ibis head from prerelease install list for now. Instead, use + # a working ibis-framework version resolved via setup.by (currently resolves + # to version 6.2.0 due to version requirement "6.2.0,<7.0.0dev"). + # We should enable the head back once bigframes support a version that + # includes the above commit. + # session.install( + # "--upgrade", + # "-e", # Use -e so that py.typed file is included. + # "git+https://github.com/ibis-project/ibis.git#egg=ibis-framework", + # ) + session.install("--no-deps", "ibis-framework==6.2.0") + already_installed.add("ibis-framework") + + # Workaround https://github.com/googleapis/python-db-dtypes-pandas/issues/178 + session.install("--no-deps", "db-dtypes") + already_installed.add("db-dtypes") + + # Ensure we catch breaking changes in the client libraries early. 
+ session.install( + "--upgrade", + "git+https://github.com/googleapis/python-bigquery.git#egg=google-cloud-bigquery", + ) + already_installed.add("google-cloud-bigquery") session.install( "--upgrade", - "-e", # Use -e so that py.typed file is included. - "git+https://github.com/ibis-project/ibis.git#egg=ibis-framework", + "-e", + "git+https://github.com/googleapis/python-bigquery-storage.git#egg=google-cloud-bigquery-storage", ) - # Workaround https://github.com/googleapis/python-db-dtypes-pandas/issues/178 - session.install("--no-deps", "db-dtypes") + already_installed.add("google-cloud-bigquery-storage") # Workaround to install pandas-gbq >=0.15.0, which is required by test only. session.install("--no-deps", "pandas-gbq") + already_installed.add("pandas-gbq") session.install( *set(UNIT_TEST_STANDARD_DEPENDENCIES + SYSTEM_TEST_STANDARD_DEPENDENCIES), @@ -541,9 +580,6 @@ def prerelease(session: nox.sessions.Session, tests_path): constraints_text = constraints_file.read() # Ignore leading whitespace and comment lines. - already_installed = frozenset( - ("db-dtypes", "pandas", "pyarrow", "ibis-framework", "pandas-gbq") - ) deps = [ match.group(1) for match in re.finditer( diff --git a/samples/snippets/remote_function.py b/samples/snippets/remote_function.py index 646d7b0c30..61b7dc092a 100644 --- a/samples/snippets/remote_function.py +++ b/samples/snippets/remote_function.py @@ -38,8 +38,8 @@ def run_remote_function_and_read_gbq_function(project_id: str): # function. It requires a BigQuery connection. If the connection is not # already created, BigQuery DataFrames will attempt to create one assuming # the necessary APIs and IAM permissions are setup in the project. In our - # examples we would be using a pre-created connection named - # `bigframes-rf-conn`. 
We will also set `reuse=False` to make sure we don't + # examples we will be letting the default connection `bigframes-default-connection` + # be used. We will also set `reuse=False` to make sure we don't # step over someone else creating remote function in the same project from # the exact same source code at the same time. Let's try a `pandas`-like use # case in which we want to apply a user defined scalar function to every @@ -49,7 +49,6 @@ def run_remote_function_and_read_gbq_function(project_id: str): @bpd.remote_function( [float], str, - bigquery_connection="bigframes-rf-conn", reuse=False, ) def get_bucket(num): @@ -94,7 +93,6 @@ def get_bucket(num): @bpd.remote_function( [str], str, - bigquery_connection="bigframes-rf-conn", reuse=False, packages=["cryptography"], ) diff --git a/setup.py b/setup.py index 29eacb74a9..abf165b3df 100644 --- a/setup.py +++ b/setup.py @@ -50,6 +50,7 @@ "requests >=2.27.1", "scikit-learn >=1.2.2", "sqlalchemy >=1.4,<3.0dev", + "tabulate >= 0.9", "ipywidgets >=7.7.1", "humanize >= 4.6.0", ] diff --git a/tests/system/large/ml/test_cluster.py b/tests/system/large/ml/test_cluster.py index f01116665f..9244c4b9f1 100644 --- a/tests/system/large/ml/test_cluster.py +++ b/tests/system/large/ml/test_cluster.py @@ -98,7 +98,9 @@ def test_cluster_configure_fit_score_predict( score_result, score_expected, check_exact=False, rtol=0.1 ) - result = model.predict(new_penguins).to_pandas() + predictions = model.predict(new_penguins).to_pandas() + assert predictions.shape == (4, 9) + result = predictions[["CENTROID_ID"]] expected = pd.DataFrame( {"CENTROID_ID": [2, 3, 1, 2]}, dtype="Int64", diff --git a/tests/system/large/ml/test_ensemble.py b/tests/system/large/ml/test_ensemble.py index a8613dfeb9..b98d7a757c 100644 --- a/tests/system/large/ml/test_ensemble.py +++ b/tests/system/large/ml/test_ensemble.py @@ -179,7 +179,7 @@ def test_xgbclassifier_default_params(penguins_df_default_index, dataset_id): ) -@pytest.mark.flaky(retries=2, delay=120) +# 
@pytest.mark.flaky(retries=2, delay=120) def test_xgbclassifier_dart_booster_multiple_params( penguins_df_default_index, dataset_id ): diff --git a/tests/system/large/ml/test_pipeline.py b/tests/system/large/ml/test_pipeline.py index 3e56954058..2929baf3f7 100644 --- a/tests/system/large/ml/test_pipeline.py +++ b/tests/system/large/ml/test_pipeline.py @@ -545,7 +545,9 @@ def test_pipeline_standard_scaler_kmeans_fit_score_predict( score_result, score_expected, check_exact=False, rtol=0.1 ) - result = pl.predict(new_penguins).to_pandas().sort_index() + predictions = pl.predict(new_penguins).to_pandas().sort_index() + assert predictions.shape == (6, 9) + result = predictions[["CENTROID_ID"]] expected = pd.DataFrame( {"CENTROID_ID": [1, 2, 1, 2, 1, 2]}, dtype="Int64", diff --git a/tests/system/large/test_remote_function.py b/tests/system/large/test_remote_function.py index 6ed3e6511a..5cb4df188c 100644 --- a/tests/system/large/test_remote_function.py +++ b/tests/system/large/test_remote_function.py @@ -22,7 +22,7 @@ import textwrap from google.api_core.exceptions import NotFound, ResourceExhausted -from google.cloud import functions_v2 +from google.cloud import bigquery, functions_v2 import pandas import pytest import test_utils.prefixer @@ -1210,3 +1210,48 @@ def square(x): cleanup_remote_function_assets( session.bqclient, session.cloudfunctionsclient, square ) + + +@pytest.mark.flaky(retries=2, delay=120) +def test_remote_function_anonymous_dataset(session, scalars_dfs): + try: + # This usage of remote_function is expected to create the remote + # function in the bigframes session's anonymous dataset. Use reuse=False + # param to make sure parallel instances of the test don't step over each + # other due to the common anonymous dataset. 
+ @session.remote_function([int], int, reuse=False) + def square(x): + return x * x + + assert ( + bigquery.Routine(square.bigframes_remote_function).dataset_id + == session._anonymous_dataset.dataset_id + ) + + scalars_df, scalars_pandas_df = scalars_dfs + + bf_int64_col = scalars_df["int64_col"] + bf_int64_col_filter = bf_int64_col.notnull() + bf_int64_col_filtered = bf_int64_col[bf_int64_col_filter] + bf_result_col = bf_int64_col_filtered.apply(square) + bf_result = ( + bf_int64_col_filtered.to_frame().assign(result=bf_result_col).to_pandas() + ) + + pd_int64_col = scalars_pandas_df["int64_col"] + pd_int64_col_filter = pd_int64_col.notnull() + pd_int64_col_filtered = pd_int64_col[pd_int64_col_filter] + pd_result_col = pd_int64_col_filtered.apply(lambda x: x * x) + # TODO(shobs): Figure why pandas .apply() changes the dtype, i.e. + # pd_int64_col_filtered.dtype is Int64Dtype() + # pd_int64_col_filtered.apply(lambda x: x * x).dtype is int64. + # For this test let's force the pandas dtype to be same as bigframes' dtype. + pd_result_col = pd_result_col.astype(pandas.Int64Dtype()) + pd_result = pd_int64_col_filtered.to_frame().assign(result=pd_result_col) + + assert_pandas_df_equal(bf_result, pd_result) + finally: + # clean up the gcp assets created for the remote function + cleanup_remote_function_assets( + session.bqclient, session.cloudfunctionsclient, square + ) diff --git a/tests/system/small/ml/conftest.py b/tests/system/small/ml/conftest.py index c11445b79a..c4a1272e44 100644 --- a/tests/system/small/ml/conftest.py +++ b/tests/system/small/ml/conftest.py @@ -29,6 +29,7 @@ imported, linear_model, llm, + remote, ) @@ -247,6 +248,46 @@ def palm2_embedding_generator_multilingual_model( ) +@pytest.fixture(scope="session") +def linear_remote_model_params() -> dict: + # Pre-deployed endpoint of linear reg model in Vertex. 
+ # bigframes-test-linreg2 -> bigframes-test-linreg-endpoint2 + return { + "input": {"culmen_length_mm": "float64"}, + "output": {"predicted_body_mass_g": "array"}, + "endpoint": "https://us-central1-aiplatform.googleapis.com/v1/projects/1084210331973/locations/us-central1/endpoints/3193318217619603456", + } + + +@pytest.fixture(scope="session") +def bqml_linear_remote_model( + session, bq_connection, linear_remote_model_params +) -> core.BqmlModel: + options = { + "endpoint": linear_remote_model_params["endpoint"], + } + return globals.bqml_model_factory().create_remote_model( + session=session, + input=linear_remote_model_params["input"], + output=linear_remote_model_params["output"], + connection_name=bq_connection, + options=options, + ) + + +@pytest.fixture(scope="session") +def linear_remote_vertex_model( + session, bq_connection, linear_remote_model_params +) -> remote.VertexAIModel: + return remote.VertexAIModel( + endpoint=linear_remote_model_params["endpoint"], + input=linear_remote_model_params["input"], + output=linear_remote_model_params["output"], + session=session, + connection_name=bq_connection, + ) + + @pytest.fixture(scope="session") def time_series_bqml_arima_plus_model( session, time_series_arima_plus_model_name diff --git a/tests/system/small/ml/test_cluster.py b/tests/system/small/ml/test_cluster.py index 266a38e3ee..a9fec0bbce 100644 --- a/tests/system/small/ml/test_cluster.py +++ b/tests/system/small/ml/test_cluster.py @@ -62,7 +62,9 @@ def test_kmeans_predict(session, penguins_kmeans_model: cluster.KMeans): new_penguins = session.read_pandas(_PD_NEW_PENGUINS) - result = penguins_kmeans_model.predict(new_penguins).to_pandas() + predictions = penguins_kmeans_model.predict(new_penguins).to_pandas() + assert predictions.shape == (4, 9) + result = predictions[["CENTROID_ID"]] expected = pd.DataFrame( {"CENTROID_ID": [2, 3, 1, 2]}, dtype="Int64", diff --git a/tests/system/small/ml/test_core.py 
b/tests/system/small/ml/test_core.py index be34a4871c..22cbbb1932 100644 --- a/tests/system/small/ml/test_core.py +++ b/tests/system/small/ml/test_core.py @@ -289,6 +289,22 @@ def test_model_predict_with_unnamed_index( ) +def test_remote_model_predict( + bqml_linear_remote_model: core.BqmlModel, new_penguins_df +): + predictions = bqml_linear_remote_model.predict(new_penguins_df).to_pandas() + expected = pd.DataFrame( + {"predicted_body_mass_g": [[3739.54], [3675.79], [3619.54]]}, + index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), + ) + pd.testing.assert_frame_equal( + predictions[["predicted_body_mass_g"]].sort_index(), + expected, + check_exact=False, + rtol=0.1, + ) + + @pytest.mark.flaky(retries=2, delay=120) def test_model_generate_text( bqml_palm2_text_generator_model: core.BqmlModel, llm_text_df diff --git a/tests/system/small/ml/test_ensemble.py b/tests/system/small/ml/test_ensemble.py index bba083d98d..55d9fef661 100644 --- a/tests/system/small/ml/test_ensemble.py +++ b/tests/system/small/ml/test_ensemble.py @@ -98,7 +98,9 @@ def test_xgbregressor_model_score_series( def test_xgbregressor_model_predict( penguins_xgbregressor_model: bigframes.ml.ensemble.XGBRegressor, new_penguins_df ): - result = penguins_xgbregressor_model.predict(new_penguins_df).to_pandas() + predictions = penguins_xgbregressor_model.predict(new_penguins_df).to_pandas() + assert predictions.shape == (3, 8) + result = predictions[["predicted_body_mass_g"]] expected = pandas.DataFrame( {"predicted_body_mass_g": ["4293.1538089", "3410.0271", "3357.944"]}, dtype="Float64", @@ -220,7 +222,9 @@ def test_xgbclassifier_model_score_series( def test_xgbclassifier_model_predict( penguins_xgbclassifier_model: bigframes.ml.ensemble.XGBClassifier, new_penguins_df ): - result = penguins_xgbclassifier_model.predict(new_penguins_df).to_pandas() + predictions = penguins_xgbclassifier_model.predict(new_penguins_df).to_pandas() + assert predictions.shape == (3, 9) + result = 
predictions[["predicted_sex"]] expected = pandas.DataFrame( {"predicted_sex": ["MALE", "MALE", "FEMALE"]}, dtype="string[pyarrow]", @@ -363,7 +367,11 @@ def test_randomforestregressor_model_predict( penguins_randomforest_regressor_model: bigframes.ml.ensemble.RandomForestRegressor, new_penguins_df, ): - result = penguins_randomforest_regressor_model.predict(new_penguins_df).to_pandas() + predictions = penguins_randomforest_regressor_model.predict( + new_penguins_df + ).to_pandas() + assert predictions.shape == (3, 8) + result = predictions[["predicted_body_mass_g"]] expected = pandas.DataFrame( {"predicted_body_mass_g": ["3897.341797", "3458.385742", "3458.385742"]}, dtype="Float64", @@ -490,7 +498,11 @@ def test_randomforestclassifier_model_predict( penguins_randomforest_classifier_model: bigframes.ml.ensemble.RandomForestClassifier, new_penguins_df, ): - result = penguins_randomforest_classifier_model.predict(new_penguins_df).to_pandas() + predictions = penguins_randomforest_classifier_model.predict( + new_penguins_df + ).to_pandas() + assert predictions.shape == (3, 9) + result = predictions[["predicted_sex"]] expected = pandas.DataFrame( {"predicted_sex": ["MALE", "MALE", "FEMALE"]}, dtype="string[pyarrow]", diff --git a/tests/system/small/ml/test_forecasting.py b/tests/system/small/ml/test_forecasting.py index 55079c94cf..948db59650 100644 --- a/tests/system/small/ml/test_forecasting.py +++ b/tests/system/small/ml/test_forecasting.py @@ -22,6 +22,8 @@ def test_model_predict(time_series_arima_plus_model): utc = pytz.utc predictions = time_series_arima_plus_model.predict().to_pandas() + assert predictions.shape == (3, 8) + result = predictions[["forecast_timestamp", "forecast_value"]] expected = pd.DataFrame( { "forecast_timestamp": [ @@ -38,7 +40,7 @@ def test_model_predict(time_series_arima_plus_model): ) pd.testing.assert_frame_equal( - predictions, + result, expected, rtol=0.1, check_index_type=False, diff --git a/tests/system/small/ml/test_imported.py 
b/tests/system/small/ml/test_imported.py index d305567066..9008e85a0b 100644 --- a/tests/system/small/ml/test_imported.py +++ b/tests/system/small/ml/test_imported.py @@ -32,7 +32,9 @@ def test_tensorflow_create_model_default_session(imported_tensorflow_model_path) def test_tensorflow_model_predict(imported_tensorflow_model, llm_text_df): df = llm_text_df.rename(columns={"prompt": "input"}) - result = imported_tensorflow_model.predict(df).to_pandas() + predictions = imported_tensorflow_model.predict(df).to_pandas() + assert predictions.shape == (3, 2) + result = predictions[["dense_1"]] # The values are non-human-readable. As they are a dense layer of Neural Network. # And since it is pretrained and imported, the model is a opaque-box. # We may want to switch to better test model and cases. @@ -72,7 +74,9 @@ def test_onnx_create_model_default_session(imported_onnx_model_path): def test_onnx_model_predict(imported_onnx_model, onnx_iris_df): - result = imported_onnx_model.predict(onnx_iris_df).to_pandas() + predictions = imported_onnx_model.predict(onnx_iris_df).to_pandas() + assert predictions.shape == (3, 7) + result = predictions[["label", "probabilities"]] value1 = np.array([0.9999993443489075, 0.0, 0.0]) value2 = np.array([0.0, 0.0, 0.9999993443489075]) expected = pd.DataFrame( diff --git a/tests/system/small/ml/test_linear_model.py b/tests/system/small/ml/test_linear_model.py index 3a8232ed9e..218c1074ab 100644 --- a/tests/system/small/ml/test_linear_model.py +++ b/tests/system/small/ml/test_linear_model.py @@ -91,13 +91,15 @@ def test_linear_reg_model_score_series( def test_linear_reg_model_predict(penguins_linear_model, new_penguins_df): predictions = penguins_linear_model.predict(new_penguins_df).to_pandas() + assert predictions.shape == (3, 8) + result = predictions[["predicted_body_mass_g"]] expected = pandas.DataFrame( {"predicted_body_mass_g": [4030.1, 3280.8, 3177.9]}, dtype="Float64", index=pandas.Index([1633, 1672, 1690], name="tag_number", 
dtype="Int64"), ) pandas.testing.assert_frame_equal( - predictions.sort_index(), + result.sort_index(), expected, check_exact=False, rtol=0.1, @@ -224,13 +226,15 @@ def test_logistic_model_score_series( def test_logsitic_model_predict(penguins_logistic_model, new_penguins_df): predictions = penguins_logistic_model.predict(new_penguins_df).to_pandas() + assert predictions.shape == (3, 9) + result = predictions[["predicted_sex"]] expected = pandas.DataFrame( {"predicted_sex": ["MALE", "MALE", "FEMALE"]}, dtype="string[pyarrow]", index=pandas.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), ) pandas.testing.assert_frame_equal( - predictions.sort_index(), + result.sort_index(), expected, check_exact=False, rtol=0.1, diff --git a/tests/system/small/ml/test_llm.py b/tests/system/small/ml/test_llm.py index 79d3c40317..306098548e 100644 --- a/tests/system/small/ml/test_llm.py +++ b/tests/system/small/ml/test_llm.py @@ -12,8 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from unittest import TestCase - import numpy as np import pytest @@ -48,7 +46,7 @@ def test_create_text_generator_model_default_session(bq_connection, llm_text_pan llm_text_df = bpd.read_pandas(llm_text_pandas_df) df = model.predict(llm_text_df).to_pandas() - TestCase().assertSequenceEqual(df.shape, (3, 1)) + assert df.shape == (3, 4) assert "ml_generate_text_llm_result" in df.columns series = df["ml_generate_text_llm_result"] assert all(series.str.len() > 20) @@ -72,7 +70,7 @@ def test_create_text_generator_32k_model_default_session( llm_text_df = bpd.read_pandas(llm_text_pandas_df) df = model.predict(llm_text_df).to_pandas() - TestCase().assertSequenceEqual(df.shape, (3, 1)) + assert df.shape == (3, 4) assert "ml_generate_text_llm_result" in df.columns series = df["ml_generate_text_llm_result"] assert all(series.str.len() > 20) @@ -97,7 +95,7 @@ def test_create_text_generator_model_default_connection(llm_text_pandas_df): ) df = model.predict(llm_text_df).to_pandas() - TestCase().assertSequenceEqual(df.shape, (3, 1)) + assert df.shape == (3, 4) assert "ml_generate_text_llm_result" in df.columns series = df["ml_generate_text_llm_result"] assert all(series.str.len() > 20) @@ -109,7 +107,7 @@ def test_text_generator_predict_default_params_success( palm2_text_generator_model, llm_text_df ): df = palm2_text_generator_model.predict(llm_text_df).to_pandas() - TestCase().assertSequenceEqual(df.shape, (3, 1)) + assert df.shape == (3, 4) assert "ml_generate_text_llm_result" in df.columns series = df["ml_generate_text_llm_result"] assert all(series.str.len() > 20) @@ -120,7 +118,7 @@ def test_text_generator_predict_series_default_params_success( palm2_text_generator_model, llm_text_df ): df = palm2_text_generator_model.predict(llm_text_df["prompt"]).to_pandas() - TestCase().assertSequenceEqual(df.shape, (3, 1)) + assert df.shape == (3, 4) assert "ml_generate_text_llm_result" in df.columns series = df["ml_generate_text_llm_result"] assert all(series.str.len() > 20) @@ -132,7 
+130,7 @@ def test_text_generator_predict_arbitrary_col_label_success( ): llm_text_df = llm_text_df.rename(columns={"prompt": "arbitrary"}) df = palm2_text_generator_model.predict(llm_text_df).to_pandas() - TestCase().assertSequenceEqual(df.shape, (3, 1)) + assert df.shape == (3, 4) assert "ml_generate_text_llm_result" in df.columns series = df["ml_generate_text_llm_result"] assert all(series.str.len() > 20) @@ -145,7 +143,7 @@ def test_text_generator_predict_with_params_success( df = palm2_text_generator_model.predict( llm_text_df, temperature=0.5, max_output_tokens=100, top_k=20, top_p=0.5 ).to_pandas() - TestCase().assertSequenceEqual(df.shape, (3, 1)) + assert df.shape == (3, 4) assert "ml_generate_text_llm_result" in df.columns series = df["ml_generate_text_llm_result"] assert all(series.str.len() > 20) @@ -196,7 +194,7 @@ def test_embedding_generator_predict_success( palm2_embedding_generator_model, llm_text_df ): df = palm2_embedding_generator_model.predict(llm_text_df).to_pandas() - TestCase().assertSequenceEqual(df.shape, (3, 1)) + assert df.shape == (3, 4) assert "text_embedding" in df.columns series = df["text_embedding"] value = series[0] @@ -209,7 +207,7 @@ def test_embedding_generator_multilingual_predict_success( palm2_embedding_generator_multilingual_model, llm_text_df ): df = palm2_embedding_generator_multilingual_model.predict(llm_text_df).to_pandas() - TestCase().assertSequenceEqual(df.shape, (3, 1)) + assert df.shape == (3, 4) assert "text_embedding" in df.columns series = df["text_embedding"] value = series[0] @@ -222,7 +220,7 @@ def test_embedding_generator_predict_series_success( palm2_embedding_generator_model, llm_text_df ): df = palm2_embedding_generator_model.predict(llm_text_df["prompt"]).to_pandas() - TestCase().assertSequenceEqual(df.shape, (3, 1)) + assert df.shape == (3, 4) assert "text_embedding" in df.columns series = df["text_embedding"] value = series[0] diff --git a/tests/system/small/ml/test_remote.py 
b/tests/system/small/ml/test_remote.py new file mode 100644 index 0000000000..e8eb1c85e8 --- /dev/null +++ b/tests/system/small/ml/test_remote.py @@ -0,0 +1,33 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pandas as pd + +from bigframes.ml import remote + + +def test_remote_linear_vertex_model_predict( + linear_remote_vertex_model: remote.VertexAIModel, new_penguins_df +): + predictions = linear_remote_vertex_model.predict(new_penguins_df).to_pandas() + expected = pd.DataFrame( + {"predicted_body_mass_g": [[3739.54], [3675.79], [3619.54]]}, + index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), + ) + pd.testing.assert_frame_equal( + predictions[["predicted_body_mass_g"]].sort_index(), + expected, + check_exact=False, + rtol=0.1, + ) diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index a0cf25807c..9744d3f6e9 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import io import operator import tempfile import typing @@ -255,6 +256,47 @@ def test_drop_with_custom_column_labels(scalars_dfs): assert_pandas_df_equal(bf_result, pd_result) +def test_df_memory_usage(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + pd_result = scalars_pandas_df.memory_usage() + bf_result = scalars_df.memory_usage() + + pd.testing.assert_series_equal(pd_result, bf_result, rtol=1.5) + + +def test_df_info(scalars_dfs): + expected = ( + "\n" + "Index: 9 entries, 0 to 8\n" + "Data columns (total 13 columns):\n" + " # Column Non-Null Count Dtype\n" + "--- ------------- ---------------- ------------------------------\n" + " 0 bool_col 8 non-null boolean\n" + " 1 bytes_col 6 non-null object\n" + " 2 date_col 7 non-null date32[day][pyarrow]\n" + " 3 datetime_col 6 non-null timestamp[us][pyarrow]\n" + " 4 geography_col 4 non-null geometry\n" + " 5 int64_col 8 non-null Int64\n" + " 6 int64_too 9 non-null Int64\n" + " 7 numeric_col 6 non-null object\n" + " 8 float64_col 7 non-null Float64\n" + " 9 rowindex_2 9 non-null Int64\n" + " 10 string_col 8 non-null string\n" + " 11 time_col 6 non-null time64[us][pyarrow]\n" + " 12 timestamp_col 6 non-null timestamp[us, tz=UTC][pyarrow]\n" + "dtypes: Float64(1), Int64(3), boolean(1), date32[day][pyarrow](1), geometry(1), object(2), string(1), time64[us][pyarrow](1), timestamp[us, tz=UTC][pyarrow](1), timestamp[us][pyarrow](1)\n" + "memory usage: 945 bytes\n" + ) + + scalars_df, _ = scalars_dfs + bf_result = io.StringIO() + + scalars_df.info(buf=bf_result) + + assert expected == bf_result.getvalue() + + def test_drop_index(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs @@ -3493,6 +3535,29 @@ def test_df_dot_operator( ) +def test_df_dot_series_inline(): + left = [[1, 2, 3], [2, 5, 7]] + right = [2, 1, 3] + + bf1 = dataframe.DataFrame(left) + bf2 = series.Series(right) + bf_result = bf1.dot(bf2).to_pandas() + + df1 = pd.DataFrame(left) + df2 = pd.Series(right) + pd_result = df1.dot(df2) + + # 
Patch pandas dtypes for testing parity + # Pandas result is int64 instead of Int64 (nullable) dtype. + pd_result = pd_result.astype(pd.Int64Dtype()) + pd_result.index = pd_result.index.astype(pd.Int64Dtype()) + + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + def test_df_dot_series( matrix_2by3_df, matrix_2by3_pandas_df, matrix_3by4_df, matrix_3by4_pandas_df ): @@ -3523,3 +3588,14 @@ def test_df_dot_operator_series( bf_result, pd_result, ) + + +def test_to_pandas_downsampling_option_override(session): + df = session.read_gbq("bigframes-dev.bigframes_tests_sys.batting") + download_size = 1 + + df = df.to_pandas(max_download_size=download_size, sampling_method="head") + + total_memory_bytes = df.memory_usage(deep=True).sum() + total_memory_mb = total_memory_bytes / (1024 * 1024) + assert total_memory_mb == pytest.approx(download_size, rel=0.3) diff --git a/tests/system/small/test_remote_function.py b/tests/system/small/test_remote_function.py index 3d8532a13b..960a384126 100644 --- a/tests/system/small/test_remote_function.py +++ b/tests/system/small/test_remote_function.py @@ -62,13 +62,12 @@ def bq_cf_connection_location_project_mismatched() -> str: @pytest.fixture(scope="module") -def session_with_bq_connection_and_permanent_dataset( +def session_with_bq_connection( bq_cf_connection, dataset_id_permanent ) -> bigframes.Session: session = bigframes.Session( bigframes.BigQueryOptions(bq_connection=bq_cf_connection) ) - session._session_dataset = bigquery.Dataset(dataset_id_permanent) return session @@ -277,13 +276,11 @@ def square(x): @pytest.mark.flaky(retries=2, delay=120) -def test_remote_function_direct_session_param( - session_with_bq_connection_and_permanent_dataset, scalars_dfs -): +def test_remote_function_direct_session_param(session_with_bq_connection, scalars_dfs): @rf.remote_function( [int], int, - session=session_with_bq_connection_and_permanent_dataset, + session=session_with_bq_connection, ) def square(x): return x * x @@ -313,9 
+310,7 @@ def square(x): @pytest.mark.flaky(retries=2, delay=120) -def test_remote_function_via_session_default( - session_with_bq_connection_and_permanent_dataset, scalars_dfs -): +def test_remote_function_via_session_default(session_with_bq_connection, scalars_dfs): # Session has bigquery connection initialized via context. Without an # explicit dataset the default dataset from the session would be used. # Without an explicit bigquery connection, the one present in Session set @@ -323,7 +318,7 @@ def test_remote_function_via_session_default( # the default behavior of reuse=True will take effect. Please note that the # udf is same as the one used in other tests in this file so the underlying # cloud function would be common and quickly reused. - @session_with_bq_connection_and_permanent_dataset.remote_function([int], int) + @session_with_bq_connection.remote_function([int], int) def square(x): return x * x @@ -391,15 +386,11 @@ def square(x): @pytest.mark.flaky(retries=2, delay=120) -def test_dataframe_applymap( - session_with_bq_connection_and_permanent_dataset, scalars_dfs -): +def test_dataframe_applymap(session_with_bq_connection, scalars_dfs): def add_one(x): return x + 1 - remote_add_one = session_with_bq_connection_and_permanent_dataset.remote_function( - [int], int - )(add_one) + remote_add_one = session_with_bq_connection.remote_function([int], int)(add_one) scalars_df, scalars_pandas_df = scalars_dfs int64_cols = ["int64_col", "int64_too"] @@ -422,15 +413,11 @@ def add_one(x): @pytest.mark.flaky(retries=2, delay=120) -def test_dataframe_applymap_na_ignore( - session_with_bq_connection_and_permanent_dataset, scalars_dfs -): +def test_dataframe_applymap_na_ignore(session_with_bq_connection, scalars_dfs): def add_one(x): return x + 1 - remote_add_one = session_with_bq_connection_and_permanent_dataset.remote_function( - [int], int - )(add_one) + remote_add_one = session_with_bq_connection.remote_function([int], int)(add_one) scalars_df, scalars_pandas_df = 
scalars_dfs int64_cols = ["int64_col", "int64_too"] @@ -451,13 +438,11 @@ def add_one(x): @pytest.mark.flaky(retries=2, delay=120) -def test_series_map(session_with_bq_connection_and_permanent_dataset, scalars_dfs): +def test_series_map(session_with_bq_connection, scalars_dfs): def add_one(x): return x + 1 - remote_add_one = session_with_bq_connection_and_permanent_dataset.remote_function( - [int], int - )(add_one) + remote_add_one = session_with_bq_connection.remote_function([int], int)(add_one) scalars_df, scalars_pandas_df = scalars_dfs diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py index 7cd9f1dd59..26c5093b35 100644 --- a/tests/system/small/test_session.py +++ b/tests/system/small/test_session.py @@ -16,6 +16,7 @@ import random import tempfile import textwrap +import time import typing from typing import List @@ -308,6 +309,23 @@ def test_read_gbq_w_script_no_select(session, dataset_id: str): assert df["statement_type"][0] == "SCRIPT" +def test_read_gbq_twice_with_same_timestamp(session, penguins_table_id): + df1 = session.read_gbq(penguins_table_id) + time.sleep(1) + df2 = session.read_gbq(penguins_table_id) + df1.columns = [ + "species1", + "island1", + "culmen_length_mm1", + "culmen_depth_mm1", + "flipper_length_mm1", + "body_mass_g1", + "sex1", + ] + df3 = df1.join(df2) + assert df3 is not None + + def test_read_gbq_model(session, penguins_linear_model_name): model = session.read_gbq_model(penguins_linear_model_name) assert isinstance(model, bigframes.ml.linear_model.LinearRegression) diff --git a/tests/unit/core/test_bf_utils.py b/tests/unit/core/test_bf_utils.py index fc34f35d9c..10ce1fd09e 100644 --- a/tests/unit/core/test_bf_utils.py +++ b/tests/unit/core/test_bf_utils.py @@ -25,7 +25,7 @@ def test_get_standardized_ids_columns(): "0", utils.UNNAMED_COLUMN_ID, "duplicate", - "duplicate.1", + "duplicate_1", "with_space", ] assert idx_ids == [] @@ -37,13 +37,13 @@ def test_get_standardized_ids_indexes(): col_ids, 
idx_ids = utils.get_standardized_ids(col_labels, idx_labels) - assert col_ids == ["duplicate.2"] + assert col_ids == ["duplicate_2"] assert idx_ids == [ "string", "0", utils.UNNAMED_INDEX_ID, "duplicate", - "duplicate.1", + "duplicate_1", "with_space", ] diff --git a/tests/unit/ml/test_sql.py b/tests/unit/ml/test_sql.py index ea16722393..9223058540 100644 --- a/tests/unit/ml/test_sql.py +++ b/tests/unit/ml/test_sql.py @@ -190,6 +190,32 @@ def test_create_remote_model_produces_correct_sql( ) +def test_create_remote_model_with_params_produces_correct_sql( + model_creation_sql_generator: ml_sql.ModelCreationSqlGenerator, +): + sql = model_creation_sql_generator.create_remote_model( + connection_name="my_project.us.my_connection", + model_ref=bigquery.ModelReference.from_string( + "test-proj._anonXYZ.create_remote_model" + ), + input={"column1": "int64"}, + output={"result": "array"}, + options={"option_key1": "option_value1", "option_key2": 2}, + ) + assert ( + sql + == """CREATE OR REPLACE MODEL `test-proj`.`_anonXYZ`.`create_remote_model` +INPUT( + column1 int64) +OUTPUT( + result array) +REMOTE WITH CONNECTION `my_project.us.my_connection` +OPTIONS( + option_key1="option_value1", + option_key2=2)""" + ) + + def test_create_imported_model_produces_correct_sql( model_creation_sql_generator: ml_sql.ModelCreationSqlGenerator, ): diff --git a/tests/unit/session/test_io_bigquery.py b/tests/unit/session/test_io_bigquery.py index e1481d3f05..3f3bfbe7d3 100644 --- a/tests/unit/session/test_io_bigquery.py +++ b/tests/unit/session/test_io_bigquery.py @@ -59,6 +59,7 @@ def test_create_job_configs_labels_length_limit_not_met(): def test_create_job_configs_labels_log_adaptor_call_method_under_length_limit(): + log_adapter.get_and_reset_api_methods() cur_labels = { "bigframes-api": "read_pandas", "source": "bigquery-dataframes-temp", @@ -87,6 +88,7 @@ def test_create_job_configs_labels_log_adaptor_call_method_under_length_limit(): def 
test_create_job_configs_labels_length_limit_met_and_labels_is_none(): + log_adapter.get_and_reset_api_methods() df = bpd.DataFrame({"col1": [1, 2], "col2": [3, 4]}) # Test running methods more than the labels' length limit for i in range(66): @@ -102,6 +104,7 @@ def test_create_job_configs_labels_length_limit_met_and_labels_is_none(): def test_create_job_configs_labels_length_limit_met(): + log_adapter.get_and_reset_api_methods() cur_labels = { "bigframes-api": "read_pandas", "source": "bigquery-dataframes-temp", @@ -144,20 +147,6 @@ def test_create_snapshot_sql_doesnt_timetravel_anonymous_datasets(): assert "`my-test-project`.`_e8166e0cdb`.`anonbb92cd`" in sql -def test_create_snapshot_sql_doesnt_timetravel_session_tables(): - table_ref = bigquery.TableReference.from_string("my-test-project._session.abcdefg") - - sql = bigframes.session._io.bigquery.create_snapshot_sql( - table_ref, datetime.datetime.now(datetime.timezone.utc) - ) - - # We aren't modifying _SESSION tables, so don't use time travel. - assert "SYSTEM_TIME" not in sql - - # Don't need the project ID for _SESSION tables. - assert "my-test-project" not in sql - - def test_create_temp_table_default_expiration(): """Make sure the created table has an expiration.""" bqclient = mock.create_autospec(bigquery.Client) diff --git a/third_party/bigframes_vendored/pandas/core/config_init.py b/third_party/bigframes_vendored/pandas/core/config_init.py index 198654015e..dfb91dfeb8 100644 --- a/third_party/bigframes_vendored/pandas/core/config_init.py +++ b/third_party/bigframes_vendored/pandas/core/config_init.py @@ -33,6 +33,17 @@ Instead estimated bytes processed will be shown. Dataframe and Series objects can still be computed with methods that explicitly execute and download results. + max_info_columns (int): + max_info_columns is used in DataFrame.info method to decide if + per column information will be printed. + max_info_rows (int or None): + df.info() will usually show null-counts for each column. 
+ For large frames this can be quite slow. max_info_rows and max_info_cols + limit this null check only to frames with smaller dimensions than + specified. + memory_usage (bool): + This specifies if the memory usage of a DataFrame should be displayed when + df.info() is called. Valid values True,False, """ sampling_options_doc = """ diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index b35d0f3b2e..099d8b8e66 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -92,6 +92,72 @@ def values(self) -> np.ndarray: """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def info( + self, + verbose: bool | None = None, + buf=None, + max_cols: int | None = None, + memory_usage: bool | None = None, + show_counts: bool | None = None, + ) -> None: + """ + Print a concise summary of a DataFrame. + + This method prints information about a DataFrame including + the index dtype and columns, non-null values and memory usage. + + Args: + verbose (bool, optional): + Whether to print the full summary. By default, the setting in + ``pandas.options.display.max_info_columns`` is followed. + buf (writable buffer, defaults to sys.stdout): + Where to send the output. By default, the output is printed to + sys.stdout. Pass a writable buffer if you need to further process + the output. + max_cols (int, optional): + When to switch from the verbose to the truncated output. If the + DataFrame has more than `max_cols` columns, the truncated output + is used. By default, the setting in + ``pandas.options.display.max_info_columns`` is used. + memory_usage (bool, optional): + Specifies whether total memory usage of the DataFrame + elements (including the index) should be displayed. By default, + this follows the ``pandas.options.display.memory_usage`` setting. + True always show memory usage. False never shows memory usage.
+ Memory estimation is made based in column dtype and number of rows + assuming values consume the same memory amount for corresponding dtypes. + show_counts (bool, optional): + Whether to show the non-null counts. By default, this is shown + only if the DataFrame is smaller than + ``pandas.options.display.max_info_rows`` and + ``pandas.options.display.max_info_columns``. A value of True always + shows the counts, and False never shows the counts. + + Returns: + None: This method prints a summary of a DataFrame and returns None.""" + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def memory_usage(self, index: bool = True): + """ + Return the memory usage of each column in bytes. + + The memory usage can optionally include the contribution of + the index and elements of `object` dtype. + + This value is displayed in `DataFrame.info` by default. This can be + suppressed by setting ``pandas.options.display.memory_usage`` to False. + + Args: + index (bool, default True): + Specifies whether to include the memory usage of the DataFrame's + index in returned Series. If ``index=True``, the memory usage of + the index is the first item in the output. + + Returns: + Series: A Series whose index is the original column names and whose values is the memory usage of each column in bytes. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + # ---------------------------------------------------------------------- # IO methods (to / from other formats) def to_numpy(self, dtype=None, copy=False, na_value=None, **kwargs) -> np.ndarray: @@ -2121,6 +2187,59 @@ def groupby( used to group large amounts of data and compute operations on these groups. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({'Animal': ['Falcon', 'Falcon', + ... 'Parrot', 'Parrot'], + ... 
'Max Speed': [380., 370., 24., 26.]}) + >>> df + Animal Max Speed + 0 Falcon 380.0 + 1 Falcon 370.0 + 2 Parrot 24.0 + 3 Parrot 26.0 + + [4 rows x 2 columns] + + >>> df.groupby(['Animal'])['Max Speed'].mean() + Animal + Falcon 375.0 + Parrot 25.0 + Name: Max Speed, dtype: Float64 + + We can also choose to include NA in group keys or not by setting `dropna`: + + >>> df = bpd.DataFrame([[1, 2, 3],[1, None, 4], [2, 1, 3], [1, 2, 2]], + ... columns=["a", "b", "c"]) + >>> df.groupby(by=["b"]).sum() + a c + b + 1.0 2 3 + 2.0 2 5 + + [2 rows x 2 columns] + + >>> df.groupby(by=["b"], dropna=False).sum() + a c + b + 1.0 2 3 + 2.0 2 5 + 1 4 + + [3 rows x 2 columns] + + We can also choose to return object with group labels or not by setting `as_index`: + + >>> df.groupby(by=["b"], as_index=False).sum() + b a c + 0 1.0 2 3 + 1 2.0 2 5 + + [2 rows x 3 columns] + Args: by (str, Sequence[str]): A label or list of labels may be passed to group by the columns @@ -2224,7 +2343,7 @@ def map(self, func, na_action: Optional[str] = None) -> DataFrame: Python function wrapped by ``remote_function`` decorator, returns a single value from a single value. na_action (Optional[str], default None): - ``{None, 'ignore'}``, default None. If ‘ignore’, propagate NaN + ``{None, 'ignore'}``, default None. If `ignore`, propagate NaN values, without passing them to func. 
Returns: @@ -2240,6 +2359,74 @@ def join(self, other, *, on: Optional[str] = None, how: str) -> DataFrame: Join columns with `other` DataFrame on index + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + Join two DataFrames by specifying how to handle the operation: + + >>> df1 = bpd.DataFrame({'col1': ['foo', 'bar'], 'col2': [1, 2]}, index=[10, 11]) + >>> df1 + col1 col2 + 10 foo 1 + 11 bar 2 + + [2 rows x 2 columns] + + >>> df2 = bpd.DataFrame({'col3': ['foo', 'baz'], 'col4': [3, 4]}, index=[11, 22]) + >>> df2 + col3 col4 + 11 foo 3 + 22 baz 4 + + [2 rows x 2 columns] + + >>> df1.join(df2) + col1 col2 col3 col4 + 10 foo 1 + 11 bar 2 foo 3 + + [2 rows x 4 columns] + + >>> df1.join(df2, how="left") + col1 col2 col3 col4 + 10 foo 1 + 11 bar 2 foo 3 + + [2 rows x 4 columns] + + >>> df1.join(df2, how="right") + col1 col2 col3 col4 + 11 bar 2 foo 3 + 22 baz 4 + + [2 rows x 4 columns] + + >>> df1.join(df2, how="outer") + col1 col2 col3 col4 + 10 foo 1 + 11 bar 2 foo 3 + 22 baz 4 + + [3 rows x 4 columns] + + >>> df1.join(df2, how="inner") + col1 col2 col3 col4 + 11 bar 2 foo 3 + + [1 rows x 4 columns] + + + Another option to join using the key columns is to use the on parameter: + + >>> df1.join(df2, on="col1", how="right") + col1 col2 col3 col4 + 11 foo 3 + 22 baz 4 + + [2 rows x 4 columns] + Args: other: DataFrame with an Index similar to the Index of this one. @@ -2292,6 +2479,78 @@ def merge( rows will be matched against each other. This is different from usual SQL join behaviour and can lead to unexpected results. 
+ **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + Merge DataFrames df1 and df2 by specifying type of merge: + + >>> df1 = bpd.DataFrame({'a': ['foo', 'bar'], 'b': [1, 2]}) + >>> df1 + a b + 0 foo 1 + 1 bar 2 + + [2 rows x 2 columns] + + >>> df2 = bpd.DataFrame({'a': ['foo', 'baz'], 'c': [3, 4]}) + >>> df2 + a c + 0 foo 3 + 1 baz 4 + + [2 rows x 2 columns] + + >>> df1.merge(df2, how="inner", on="a") + a b c + 0 foo 1 3 + + [1 rows x 3 columns] + + >>> df1.merge(df2, how='left', on='a') + a b c + 0 foo 1 3 + 1 bar 2 + + [2 rows x 3 columns] + + Merge df1 and df2 on the lkey and rkey columns. The value columns have + the default suffixes, _x and _y, appended. + + >>> df1 = bpd.DataFrame({'lkey': ['foo', 'bar', 'baz', 'foo'], + ... 'value': [1, 2, 3, 5]}) + >>> df1 + lkey value + 0 foo 1 + 1 bar 2 + 2 baz 3 + 3 foo 5 + + [4 rows x 2 columns] + + >>> df2 = bpd.DataFrame({'rkey': ['foo', 'bar', 'baz', 'foo'], + ... 'value': [5, 6, 7, 8]}) + >>> df2 + rkey value + 0 foo 5 + 1 bar 6 + 2 baz 7 + 3 foo 8 + + [4 rows x 2 columns] + + >>> df1.merge(df2, left_on='lkey', right_on='rkey') + lkey value_x rkey value_y + 0 foo 1 foo 5 + 1 foo 1 foo 8 + 2 bar 2 bar 6 + 3 baz 3 baz 7 + 4 foo 5 foo 5 + 5 foo 5 foo 8 + + [6 rows x 4 columns] + Args: right: Object to merge with. @@ -2342,6 +2601,29 @@ def apply(self, func, *, args=(), **kwargs): the DataFrame's index (``axis=0``) the final return type is inferred from the return type of the applied function. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) + >>> df + col1 col2 + 0 1 3 + 1 2 4 + + [2 rows x 2 columns] + + >>> def square(x): + ... return x * x + >>> df1 = df.apply(square) + >>> df + col1 col2 + 0 1 3 + 1 2 4 + + [2 rows x 2 columns] + Args: func (function): Function to apply to each column or row.
@@ -2368,6 +2650,33 @@ def any(self, *, axis=0, bool_only: bool = False): along a Dataframe axis that is True or equivalent (e.g. non-zero or non-empty). + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({"A": [True, True], "B": [False, False]}) + >>> df + A B + 0 True False + 1 True False + + [2 rows x 2 columns] + + Checking if each column contains at least one True element(the default behavior without an explicit axis parameter): + + >>> df.any() + A True + B False + dtype: boolean + + Checking if each row contains at least one True element: + + >>> df.any(axis=1) + 0 True + 1 True + dtype: boolean + Args: axis ({index (0), columns (1)}): Axis for the function to be applied on. @@ -2388,6 +2697,33 @@ def all(self, axis=0, *, bool_only: bool = False): along a DataFrame axis that is False or equivalent (e.g. zero or empty). + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({"A": [True, True], "B": [False, False]}) + >>> df + A B + 0 True False + 1 True False + + [2 rows x 2 columns] + + Checking if all values in each column are True(the default behavior without an explicit axis parameter): + + >>> df.all() + A True + B False + dtype: boolean + + Checking across rows to see if all values are True: + + >>> df.all(axis=1) + 0 False + 1 False + dtype: boolean + Args: axis ({index (0), columns (1)}): Axis for the function to be applied on. @@ -2404,8 +2740,37 @@ def prod(self, axis=0, *, numeric_only: bool = False): """ Return the product of the values over the requested axis. 
+ **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({"A": [1, 2, 3], "B": [4.5, 5.5, 6.5]}) + >>> df + A B + 0 1 4.5 + 1 2 5.5 + 2 3 6.5 + + [3 rows x 2 columns] + + Calculating the product of each column(the default behavior without an explicit axis parameter): + + >>> df.prod() + A 6.0 + B 160.875 + dtype: Float64 + + Calculating the product of each row: + + >>> df.prod(axis=1) + 0 4.5 + 1 11.0 + 2 19.5 + dtype: Float64 + Args: - aßxis ({index (0), columns (1)}): + axis ({index (0), columns (1)}): Axis for the function to be applied on. For Series this parameter is unused and defaults to 0. numeric_only (bool. default False): @@ -2422,6 +2787,33 @@ def min(self, axis=0, *, numeric_only: bool = False): If you want the *index* of the minimum, use ``idxmin``. This is the equivalent of the ``numpy.ndarray`` method ``argmin``. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({"A": [1, 3], "B": [2, 4]}) + >>> df + A B + 0 1 2 + 1 3 4 + + [2 rows x 2 columns] + + Finding the minimum value in each column (the default behavior without an explicit axis parameter). + + >>> df.min() + A 1.0 + B 2.0 + dtype: Float64 + + Finding the minimum value in each row. + + >>> df.min(axis=1) + 0 1.0 + 1 3.0 + dtype: Float64 + Args: axis ({index (0), columns (1)}): Axis for the function to be applied on. @@ -2440,6 +2832,33 @@ def max(self, axis=0, *, numeric_only: bool = False): If you want the *index* of the maximum, use ``idxmax``. This is the equivalent of the ``numpy.ndarray`` method ``argmax``. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({"A": [1, 3], "B": [2, 4]}) + >>> df + A B + 0 1 2 + 1 3 4 + + [2 rows x 2 columns] + + Finding the maximum value in each column (the default behavior without an explicit axis parameter). 
+ + >>> df.max() + A 3.0 + B 4.0 + dtype: Float64 + + Finding the maximum value in each row. + + >>> df.max(axis=1) + 0 2.0 + 1 4.0 + dtype: Float64 + Args: axis ({index (0), columns (1)}): Axis for the function to be applied on. @@ -2457,6 +2876,33 @@ def sum(self, axis=0, *, numeric_only: bool = False): This is equivalent to the method ``numpy.sum``. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({"A": [1, 3], "B": [2, 4]}) + >>> df + A B + 0 1 2 + 1 3 4 + + [2 rows x 2 columns] + + Calculating the sum of each column (the default behavior without an explicit axis parameter). + + >>> df.sum() + A 4.0 + B 6.0 + dtype: Float64 + + Calculating the sum of each row. + + >>> df.sum(axis=1) + 0 3.0 + 1 7.0 + dtype: Float64 + Args: axis ({index (0), columns (1)}): Axis for the function to be applied on. @@ -2472,6 +2918,33 @@ def sum(self, axis=0, *, numeric_only: bool = False): def mean(self, axis=0, *, numeric_only: bool = False): """Return the mean of the values over the requested axis. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({"A": [1, 3], "B": [2, 4]}) + >>> df + A B + 0 1 2 + 1 3 4 + + [2 rows x 2 columns] + + Calculating the mean of each column (the default behavior without an explicit axis parameter). + + >>> df.mean() + A 2.0 + B 3.0 + dtype: Float64 + + Calculating the mean of each row. + + >>> df.mean(axis=1) + 0 1.5 + 1 3.5 + dtype: Float64 + Args: axis ({index (0), columns (1)}): Axis for the function to be applied on. @@ -2485,7 +2958,27 @@ def mean(self, axis=0, *, numeric_only: bool = False): raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def median(self, *, numeric_only: bool = False, exact: bool = False): - """Return the median of the values over the requested axis. + """Return the median of the values over columns.
+ + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({"A": [1, 3], "B": [2, 4]}) + >>> df + A B + 0 1 2 + 1 3 4 + + [2 rows x 2 columns] + + Finding the median value of each column. + + >>> df.median() + A 1.0 + B 2.0 + dtype: Float64 Args: numeric_only (bool. default False): @@ -2504,6 +2997,34 @@ def var(self, axis=0, *, numeric_only: bool = False): Normalized by N-1 by default. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({"A": [1, 3], "B": [2, 4]}) + >>> df + A B + 0 1 2 + 1 3 4 + + [2 rows x 2 columns] + + Calculating the variance of each column (the default behavior without an explicit axis parameter). + + >>> df.var() + A 2.0 + B 2.0 + dtype: Float64 + + Calculating the variance of each row. + + >>> df.var(axis=1) + 0 0.5 + 1 0.5 + dtype: Float64 + + Args: axis ({index (0), columns (1)}): Axis for the function to be applied on. @@ -2517,10 +3038,36 @@ def var(self, axis=0, *, numeric_only: bool = False): raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def skew(self, *, numeric_only: bool = False): - """Return unbiased skew over requested axis. + """Return unbiased skew over columns. Normalized by N-1. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({'A': [1, 2, 3, 4, 5], + ... 'B': [5, 4, 3, 2, 1], + ... 'C': [2, 2, 3, 2, 2]}) + >>> df + A B C + 0 1 5 2 + 1 2 4 2 + 2 3 3 3 + 3 4 2 2 + 4 5 1 2 + + [5 rows x 3 columns] + + Calculating the skewness of each column. + + >>> df.skew() + A 0.0 + B 0.0 + C 2.236068 + dtype: Float64 + Args: numeric_only (bool, default False): Include only float, int, boolean columns. 
@@ -2531,11 +3078,37 @@ def skew(self, *, numeric_only: bool = False): raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def kurt(self, *, numeric_only: bool = False): - """Return unbiased kurtosis over requested axis. + """Return unbiased kurtosis over columns. Kurtosis obtained using Fisher's definition of kurtosis (kurtosis of normal == 0.0). Normalized by N-1. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({"A": [1, 2, 3, 4, 5], + ... "B": [3, 4, 3, 2, 1], + ... "C": [2, 2, 3, 2, 2]}) + >>> df + A B C + 0 1 3 2 + 1 2 4 2 + 2 3 3 3 + 3 4 2 2 + 4 5 1 2 + + [5 rows x 3 columns] + + Calculating the kurtosis value of each column: + + >>> df.kurt() + A -1.2 + B -0.177515 + C 5.0 + dtype: Float64 + Args: numeric_only (bool, default False): Include only float, int, boolean columns. @@ -2546,10 +3119,36 @@ def kurt(self, *, numeric_only: bool = False): raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def std(self, *, numeric_only: bool = False): - """Return sample standard deviation over requested axis. + """Return sample standard deviation over columns. Normalized by N-1 by default. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({"A": [1, 2, 3, 4, 5], + ... "B": [3, 4, 3, 2, 1], + ... "C": [2, 2, 3, 2, 2]}) + >>> df + A B C + 0 1 3 2 + 1 2 4 2 + 2 3 3 3 + 3 4 2 2 + 4 5 1 2 + + [5 rows x 3 columns] + + Calculating the standard deviation of each column: + + >>> df.std() + A 1.581139 + B 1.140175 + C 0.447214 + dtype: Float64 + Args: numeric_only (bool. default False): Default False. Include only float, int, boolean columns. @@ -2561,11 +3160,37 @@ def std(self, *, numeric_only: bool = False): def count(self, *, numeric_only: bool = False): """ - Count non-NA cells for each column or row. + Count non-NA cells for each column. 
The values `None`, `NaN`, `NaT`, and optionally `numpy.inf` (depending on `pandas.options.mode.use_inf_as_na`) are considered NA. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({"A": [1, None, 3, 4, 5], + ... "B": [1, 2, 3, 4, 5], + ... "C": [None, 3.5, None, 4.5, 5.0]}) + >>> df + A B C + 0 1.0 1 + 1 2 3.5 + 2 3.0 3 + 3 4.0 4 4.5 + 4 5.0 5 5.0 + + [5 rows x 3 columns] + + Counting non-NA values for each column: + + >>> df.count() + A 4.0 + B 5.0 + C 3.0 + dtype: Float64 + Args: numeric_only (bool, default False): Include only `float`, `int` or `boolean` data. @@ -2890,6 +3515,47 @@ def index(self): index is used for label-based access and alignment, and can be accessed or modified using this attribute. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + You can access the index of a DataFrame via ``index`` property. + + >>> df = bpd.DataFrame({'Name': ['Alice', 'Bob', 'Aritra'], + ... 'Age': [25, 30, 35], + ... 'Location': ['Seattle', 'New York', 'Kona']}, + ... index=([10, 20, 30])) + >>> df + Name Age Location + 10 Alice 25 Seattle + 20 Bob 30 New York + 30 Aritra 35 Kona + + [3 rows x 3 columns] + >>> df.index # doctest: +ELLIPSIS + + >>> df.index.values + array([10, 20, 30], dtype=object) + + Let's try setting a new index for the dataframe and see that reflect via + ``index`` property. + + >>> df1 = df.set_index(["Name", "Location"]) + >>> df1 + Age + Name Location + Alice Seattle 25 + Bob New York 30 + Aritra Kona 35 + + [3 rows x 1 columns] + >>> df1.index # doctest: +ELLIPSIS + + >>> df1.index.values + array([('Alice', 'Seattle'), ('Bob', 'New York'), ('Aritra', 'Kona')], + dtype=object) + Returns: The index labels of the DataFrame. """ @@ -2897,7 +3563,43 @@ def index(self): @property def columns(self): - "The column labels of the DataFrame." + """The column labels of the DataFrame. 
+ + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + You can access the column labels of a DataFrame via ``columns`` property. + + >>> df = bpd.DataFrame({'Name': ['Alice', 'Bob', 'Aritra'], + ... 'Age': [25, 30, 35], + ... 'Location': ['Seattle', 'New York', 'Kona']}, + ... index=([10, 20, 30])) + >>> df + Name Age Location + 10 Alice 25 Seattle + 20 Bob 30 New York + 30 Aritra 35 Kona + + [3 rows x 3 columns] + >>> df.columns + Index(['Name', 'Age', 'Location'], dtype='object') + + You can also set new labels for columns. + + >>> df.columns = ["NewName", "NewAge", "NewLocation"] + >>> df + NewName NewAge NewLocation + 10 Alice 25 Seattle + 20 Bob 30 New York + 30 Aritra 35 Kona + + [3 rows x 3 columns] + >>> df.columns + Index(['NewName', 'NewAge', 'NewLocation'], dtype='object') + + """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def value_counts( @@ -3028,6 +3730,77 @@ def dot(self, other): The dot method for Series computes the inner product, instead of the matrix product here. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> left = bpd.DataFrame([[0, 1, -2, -1], [1, 1, 1, 1]]) + >>> left + 0 1 2 3 + 0 0 1 -2 -1 + 1 1 1 1 1 + + [2 rows x 4 columns] + >>> right = bpd.DataFrame([[0, 1], [1, 2], [-1, -1], [2, 0]]) + >>> right + 0 1 + 0 0 1 + 1 1 2 + 2 -1 -1 + 3 2 0 + + [4 rows x 2 columns] + >>> left.dot(right) + 0 1 + 0 1 4 + 1 2 2 + + [2 rows x 2 columns] + + You can also use the operator ``@`` for the dot product: + + >>> left @ right + 0 1 + 0 1 4 + 1 2 2 + + [2 rows x 2 columns] + + The right input can be a Series, in which case the result will also be a + Series: + + >>> right = bpd.Series([1, 2, -1,0]) + >>> left @ right + 0 4 + 1 2 + dtype: Int64 + + Any user defined index of the left matrix and columns of the right + matrix will reflect in the result. 
+ + >>> left = bpd.DataFrame([[1, 2, 3], [2, 5, 7]], index=["alpha", "beta"]) + >>> left + 0 1 2 + alpha 1 2 3 + beta 2 5 7 + + [2 rows x 3 columns] + >>> right = bpd.DataFrame([[2, 4, 8], [1, 5, 10], [3, 6, 9]], columns=["red", "green", "blue"]) + >>> right + red green blue + 0 2 4 8 + 1 1 5 10 + 2 3 6 9 + + [3 rows x 3 columns] + >>> left.dot(right) + red green blue + alpha 13 32 55 + beta 30 75 129 + + [2 rows x 3 columns] + Args: other (Series or DataFrame): The other object to compute the matrix product with. diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index c6d98075f5..1b751ed83b 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -44,7 +44,54 @@ def struct(self): @property def index(self): - """The index (axis labels) of the Series.""" + """The index (axis labels) of the Series. + + The index of a Series is used to label and identify each element of the + underlying data. The index can be thought of as an immutable ordered set + (technically a multi-set, as it may contain duplicate labels), and is + used to index and align data. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + You can access the index of a Series via ``index`` property. + + >>> df = bpd.DataFrame({'Name': ['Alice', 'Bob', 'Aritra'], + ... 'Age': [25, 30, 35], + ... 'Location': ['Seattle', 'New York', 'Kona']}, + ... index=([10, 20, 30])) + >>> s = df["Age"] + >>> s + 10 25 + 20 30 + 30 35 + Name: Age, dtype: Int64 + >>> s.index # doctest: +ELLIPSIS + + >>> s.index.values + array([10, 20, 30], dtype=object) + + Let's try setting a multi-index case reflect via ``index`` property. 
+ + >>> df1 = df.set_index(["Name", "Location"]) + >>> s1 = df1["Age"] + >>> s1 + Name Location + Alice Seattle 25 + Bob New York 30 + Aritra Kona 35 + Name: Age, dtype: Int64 + >>> s1.index # doctest: +ELLIPSIS + + >>> s1.index.values + array([('Alice', 'Seattle'), ('Bob', 'New York'), ('Aritra', 'Kona')], + dtype=object) + + Returns: + The index labels of the Series. + """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @property @@ -584,6 +631,21 @@ def dot(self, other) -> Series | np.ndarray: BigQuery Dataframes does not validate this property and will produce incorrect results if indices are not equal. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series([0, 1, 2, 3]) + >>> other = bpd.Series([-1, 2, -3, 4]) + >>> s.dot(other) + 8 + + You can also use the operator ``@`` for the dot product: + + >>> s @ other + 8 + Args: other (Series): The other object to compute the dot product with its columns. @@ -1696,6 +1758,49 @@ def kurt(self): def where(self, cond, other): """Replace values where the condition is False. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series([10, 11, 12, 13, 14]) + >>> s + 0 10 + 1 11 + 2 12 + 3 13 + 4 14 + dtype: Int64 + + You can filter the values in the Series based on a condition. The values + matching the condition would be kept, and not matching would be replaced. + The default replacement value is ``NA``. + + >>> s.where(s % 2 == 0) + 0 10 + 1 + 2 12 + 3 + 4 14 + dtype: Int64 + + You can specify a custom replacement value for non-matching values. + + >>> s.where(s % 2 == 0, -1) + 0 10 + 1 -1 + 2 12 + 3 -1 + 4 14 + dtype: Int64 + >>> s.where(s % 2 == 0, 100*s) + 0 10 + 1 1100 + 2 12 + 3 1300 + 4 14 + dtype: Int64 + Args: cond (bool Series/DataFrame, array-like, or callable): Where cond is True, keep the original value. 
Where False, replace @@ -1720,6 +1825,77 @@ def where(self, cond, other): def mask(self, cond, other): """Replace values where the condition is True. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series([10, 11, 12, 13, 14]) + >>> s + 0 10 + 1 11 + 2 12 + 3 13 + 4 14 + dtype: Int64 + + You can mask the values in the Series based on a condition. The values + matching the condition would be masked. + + >>> s.mask(s % 2 == 0) + 0 + 1 11 + 2 + 3 13 + 4 + dtype: Int64 + + You can specify a custom mask value. + + >>> s.mask(s % 2 == 0, -1) + 0 -1 + 1 11 + 2 -1 + 3 13 + 4 -1 + dtype: Int64 + >>> s.mask(s % 2 == 0, 100*s) + 0 1000 + 1 11 + 2 1200 + 3 13 + 4 1400 + dtype: Int64 + + You can also use a remote function to evaluate the mask condition. This + is useful in situation such as the following, where the mask + condition is evaluated based on a complicated business logic which cannot + be expressed in form of a Series. + + >>> @bpd.remote_function([str], bool, reuse=False) + ... def should_mask(name): + ... hash = 0 + ... for char_ in name: + ... hash += ord(char_) + ... return hash % 2 == 0 + + >>> s = bpd.Series(["Alice", "Bob", "Caroline"]) + >>> s + 0 Alice + 1 Bob + 2 Caroline + dtype: string + >>> s.mask(should_mask) + 0 + 1 Bob + 2 Caroline + dtype: string + >>> s.mask(should_mask, "REDACTED") + 0 REDACTED + 1 Bob + 2 Caroline + dtype: string + Args: cond (bool Series/DataFrame, array-like, or callable): Where cond is False, keep the original value. Where True, replace diff --git a/third_party/bigframes_vendored/pandas/io/common.py b/third_party/bigframes_vendored/pandas/io/common.py index 506984e64d..e186f02b5b 100644 --- a/third_party/bigframes_vendored/pandas/io/common.py +++ b/third_party/bigframes_vendored/pandas/io/common.py @@ -13,13 +13,13 @@ def dedup_names( """ Rename column names if duplicates exist. 
- Currently the renaming is done by appending a period and an autonumeric,
- but a custom pattern may be supported in the future.
+ Currently the renaming is done by appending an underscore and an
+ autonumeric, but a custom pattern may be supported in the future.

 Examples
 ```
 dedup_names(["x", "y", "x", "x"], is_potential_multiindex=False)
- ['x', 'y', 'x.1', 'x.2']
+ ['x', 'y', 'x_1', 'x_2']
 ```
 """
 names = list(names) # so we can index
@@ -34,9 +34,9 @@ def dedup_names(
 if is_potential_multiindex:
 # for mypy
 assert isinstance(col, tuple)
- col = col[:-1] + (f"{col[-1]}.{cur_count}",)
+ col = col[:-1] + (f"{col[-1]}_{cur_count}",)
 else:
- col = f"{col}.{cur_count}"
+ col = f"{col}_{cur_count}"
 cur_count = counts[col]

 names[i] = col
diff --git a/third_party/bigframes_vendored/pandas/io/gbq.py b/third_party/bigframes_vendored/pandas/io/gbq.py
index 2161310b07..eabb48e600 100644
--- a/third_party/bigframes_vendored/pandas/io/gbq.py
+++ b/third_party/bigframes_vendored/pandas/io/gbq.py
@@ -16,6 +16,7 @@ def read_gbq(
 index_col: Iterable[str] | str = (),
 col_order: Iterable[str] = (),
 max_results: Optional[int] = None,
+ use_cache: bool = True,
 ):
 """Loads a DataFrame from BigQuery.

@@ -83,6 +84,8 @@ def read_gbq(
 max_results (Optional[int], default None):
 If set, limit the maximum number of rows to fetch from the query
 results.
+ use_cache (bool, default True):
+ Whether to cache the query inputs. Defaults to True.
 Returns:
 bigframes.dataframe.DataFrame: A DataFrame representing results of the query or table. 
diff --git a/third_party/bigframes_vendored/sklearn/cluster/_kmeans.py b/third_party/bigframes_vendored/sklearn/cluster/_kmeans.py index 5369d3662d..be6c5e7c52 100644 --- a/third_party/bigframes_vendored/sklearn/cluster/_kmeans.py +++ b/third_party/bigframes_vendored/sklearn/cluster/_kmeans.py @@ -20,19 +20,7 @@ class _BaseKMeans(BaseEstimator, ABC): """Base class for KMeans and MiniBatchKMeans""" - def predict(self, X): - """Predict the closest cluster each sample in X belongs to. - - Args: - X (bigframes.dataframe.DataFrame or bigframes.series.Series): - Series or DataFrame of shape (n_samples, n_features). The data matrix for - which we want to get the predictions. - - Returns: - bigframes.dataframe.DataFrame: DataFrame of shape (n_samples,), containing the - class labels for each sample. - """ - raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + pass class KMeans(_BaseKMeans): @@ -73,7 +61,7 @@ def predict( DataFrame of shape (n_samples, n_features). New data to predict. Returns: - bigframes.dataframe.DataFrame: DataFrame of the cluster each sample belongs to. + bigframes.dataframe.DataFrame: DataFrame of shape (n_samples, n_input_columns + n_prediction_columns). Returns predicted labels. 
""" raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) diff --git a/third_party/bigframes_vendored/sklearn/linear_model/_base.py b/third_party/bigframes_vendored/sklearn/linear_model/_base.py index 8dc3b6280a..ab946e5861 100644 --- a/third_party/bigframes_vendored/sklearn/linear_model/_base.py +++ b/third_party/bigframes_vendored/sklearn/linear_model/_base.py @@ -16,7 +16,6 @@ # Original location: https://github.com/scikit-learn/scikit-learn/blob/main/sklearn/linear_model/_base.py from abc import ABCMeta -from typing import List, Optional from bigframes import constants from third_party.bigframes_vendored.sklearn.base import ( @@ -35,7 +34,7 @@ def predict(self, X): Series or DataFrame of shape (n_samples, n_features). Samples. Returns: - bigframes.dataframe.DataFrame: DataFrame of shape (n_samples,). Returns predicted values. + bigframes.dataframe.DataFrame: DataFrame of shape (n_samples, n_input_columns + n_prediction_columns). Returns predicted values. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -50,8 +49,7 @@ def predict(self, X): which we want to get the predictions. Returns: - bigframes.dataframe.DataFrame: DataFrame of shape (n_samples,), containing - the class labels for each sample. + bigframes.dataframe.DataFrame: DataFrame of shape (n_samples, n_input_columns + n_prediction_columns). Returns predicted values. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) diff --git a/third_party/bigframes_vendored/xgboost/sklearn.py b/third_party/bigframes_vendored/xgboost/sklearn.py index b7b43b85a3..dfd0ba7356 100644 --- a/third_party/bigframes_vendored/xgboost/sklearn.py +++ b/third_party/bigframes_vendored/xgboost/sklearn.py @@ -18,7 +18,7 @@ def predict(self, X): Series or DataFrame of shape (n_samples, n_features). Samples. Returns: - DataFrame of shape (n_samples,): Returns predicted values. 
+ bigframes.dataframe.DataFrame: DataFrame of shape (n_samples, n_input_columns + n_prediction_columns). Returns predicted values. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)