diff --git a/.kokoro/continuous/doctest.cfg b/.kokoro/continuous/doctest.cfg new file mode 100644 index 0000000000..dfdc78782f --- /dev/null +++ b/.kokoro/continuous/doctest.cfg @@ -0,0 +1,17 @@ +# Format: //devtools/kokoro/config/proto/build.proto + +# Only run this nox session. +env_vars: { + key: "NOX_SESSION" + value: "doctest" +} + +env_vars: { + key: "GOOGLE_CLOUD_PROJECT" + value: "bigframes-load-testing" +} + +env_vars: { + key: "BIGFRAMES_TEST_MODEL_VERTEX_ENDPOINT" + value: "https://us-central1-aiplatform.googleapis.com/v1/projects/272725758477/locations/us-central1/endpoints/590545496255234048" +} diff --git a/.kokoro/continuous/e2e.cfg b/.kokoro/continuous/e2e.cfg index 3dbd0b47f0..e049dd30b3 100644 --- a/.kokoro/continuous/e2e.cfg +++ b/.kokoro/continuous/e2e.cfg @@ -3,7 +3,7 @@ # Only run this nox session. env_vars: { key: "NOX_SESSION" - value: "e2e doctest notebook unit_prerelease system_prerelease system_noextras" + value: "e2e unit_prerelease system_prerelease system_noextras" } env_vars: { diff --git a/.kokoro/continuous/notebook.cfg b/.kokoro/continuous/notebook.cfg new file mode 100644 index 0000000000..94e2a3c686 --- /dev/null +++ b/.kokoro/continuous/notebook.cfg @@ -0,0 +1,17 @@ +# Format: //devtools/kokoro/config/proto/build.proto + +# Only run this nox session. +env_vars: { + key: "NOX_SESSION" + value: "notebook" +} + +env_vars: { + key: "GOOGLE_CLOUD_PROJECT" + value: "bigframes-load-testing" +} + +env_vars: { + key: "BIGFRAMES_TEST_MODEL_VERTEX_ENDPOINT" + value: "https://us-central1-aiplatform.googleapis.com/v1/projects/272725758477/locations/us-central1/endpoints/590545496255234048" +} diff --git a/.kokoro/load/benchmark.cfg b/.kokoro/load/benchmark.cfg new file mode 100644 index 0000000000..a489e05bbc --- /dev/null +++ b/.kokoro/load/benchmark.cfg @@ -0,0 +1,17 @@ +# Format: //devtools/kokoro/config/proto/build.proto + +# Only run this nox session. +env_vars: { + key: "NOX_SESSION" + value: "benchmark" +} + +env_vars: { + key: "GOOGLE_CLOUD_PROJECT" + value: "bigframes-load-testing" +} + +env_vars: { + key: "BIGFRAMES_TEST_MODEL_VERTEX_ENDPOINT" + value: "https://us-central1-aiplatform.googleapis.com/v1/projects/272725758477/locations/us-central1/endpoints/590545496255234048" +} diff --git a/.kokoro/presubmit/doctest.cfg b/.kokoro/presubmit/doctest.cfg new file mode 100644 index 0000000000..dfdc78782f --- /dev/null +++ b/.kokoro/presubmit/doctest.cfg @@ -0,0 +1,17 @@ +# Format: //devtools/kokoro/config/proto/build.proto + +# Only run this nox session. +env_vars: { + key: "NOX_SESSION" + value: "doctest" +} + +env_vars: { + key: "GOOGLE_CLOUD_PROJECT" + value: "bigframes-load-testing" +} + +env_vars: { + key: "BIGFRAMES_TEST_MODEL_VERTEX_ENDPOINT" + value: "https://us-central1-aiplatform.googleapis.com/v1/projects/272725758477/locations/us-central1/endpoints/590545496255234048" +} diff --git a/.kokoro/presubmit/e2e.cfg b/.kokoro/presubmit/e2e.cfg index 3dbd0b47f0..e049dd30b3 100644 --- a/.kokoro/presubmit/e2e.cfg +++ b/.kokoro/presubmit/e2e.cfg @@ -3,7 +3,7 @@ # Only run this nox session. 
env_vars: { key: "NOX_SESSION" - value: "e2e doctest notebook unit_prerelease system_prerelease system_noextras" + value: "e2e unit_prerelease system_prerelease system_noextras" } env_vars: { diff --git a/.kokoro/presubmit/notebook.cfg b/.kokoro/presubmit/notebook.cfg new file mode 100644 index 0000000000..94e2a3c686 --- /dev/null +++ b/.kokoro/presubmit/notebook.cfg @@ -0,0 +1,17 @@ +# Format: //devtools/kokoro/config/proto/build.proto + +# Only run this nox session. +env_vars: { + key: "NOX_SESSION" + value: "notebook" +} + +env_vars: { + key: "GOOGLE_CLOUD_PROJECT" + value: "bigframes-load-testing" +} + +env_vars: { + key: "BIGFRAMES_TEST_MODEL_VERTEX_ENDPOINT" + value: "https://us-central1-aiplatform.googleapis.com/v1/projects/272725758477/locations/us-central1/endpoints/590545496255234048" +} diff --git a/CHANGELOG.md b/CHANGELOG.md index cad061ce05..d585b5b1c2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,30 @@ [1]: https://pypi.org/project/bigframes/#history +## [1.9.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v1.8.0...v1.9.0) (2024-06-10) + + +### Features + +* Allow functions returned from `bpd.read_gbq_function` to execute outside of `apply` ([#706](https://github.com/googleapis/python-bigquery-dataframes/issues/706)) ([ad7d8ac](https://github.com/googleapis/python-bigquery-dataframes/commit/ad7d8ac1247ec3b9532dd5375265c36907f50da2)) +* Support `bigquery.vector_search()` ([#736](https://github.com/googleapis/python-bigquery-dataframes/issues/736)) ([dad66fd](https://github.com/googleapis/python-bigquery-dataframes/commit/dad66fdd22bb2d507e7f366c970d971554598cf3)) +* Support `score()` in GeminiTextGenerator ([#740](https://github.com/googleapis/python-bigquery-dataframes/issues/740)) ([b2c7d8b](https://github.com/googleapis/python-bigquery-dataframes/commit/b2c7d8b28e235c839370818137fba71796c9f02a)) +* Support bytes type in `remote_function` ([#761](https://github.com/googleapis/python-bigquery-dataframes/issues/761)) ([4915424](https://github.com/googleapis/python-bigquery-dataframes/commit/4915424a68f36542e901a0ac27946f1ecb2d05ab)) +* Support fit() in GeminiTextGenerator ([#758](https://github.com/googleapis/python-bigquery-dataframes/issues/758)) ([d751f5c](https://github.com/googleapis/python-bigquery-dataframes/commit/d751f5cd1cf578618eabbb992cfb6b0a3c36608c)) + + +### Bug Fixes + +* ARIMAPlus loads auto_arima_min_order param ([#752](https://github.com/googleapis/python-bigquery-dataframes/issues/752)) ([39d7013](https://github.com/googleapis/python-bigquery-dataframes/commit/39d7013a8a8d2908f20bfe54a7dc8de166323b90)) +* Improve to_pandas_batches for large results ([#746](https://github.com/googleapis/python-bigquery-dataframes/issues/746)) ([61f18cb](https://github.com/googleapis/python-bigquery-dataframes/commit/61f18cb63f2785c03dc612a34c030079fc8f4172)) +* Resolve issue with unset thread-local options ([#741](https://github.com/googleapis/python-bigquery-dataframes/issues/741)) 
([d93dbaf](https://github.com/googleapis/python-bigquery-dataframes/commit/d93dbafe2bb405c60f7141d9ae4135db4ffdb702)) + + +### Documentation + +* Fix ML.EVALUATE spelling ([#749](https://github.com/googleapis/python-bigquery-dataframes/issues/749)) ([7899749](https://github.com/googleapis/python-bigquery-dataframes/commit/7899749505a75ed89c68e9df64124a153644de96)) +* Remove LogisticRegression normal_equation strategy ([#753](https://github.com/googleapis/python-bigquery-dataframes/issues/753)) ([ea5d367](https://github.com/googleapis/python-bigquery-dataframes/commit/ea5d367d5ecc6826d30082e75c957af8362c9e61)) + ## [1.8.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v1.7.0...v1.8.0) (2024-05-31) diff --git a/bigframes/_config/__init__.py b/bigframes/_config/__init__.py index 4729532e98..c9b2a3f95a 100644 --- a/bigframes/_config/__init__.py +++ b/bigframes/_config/__init__.py @@ -17,8 +17,12 @@ DataFrames from this package. """ +from __future__ import annotations + import copy +from dataclasses import dataclass, field import threading +from typing import Optional import bigframes_vendored.pandas._config.config as pandas_config @@ -28,18 +32,27 @@ import bigframes._config.sampling_options as sampling_options +@dataclass +class ThreadLocalConfig(threading.local): + # If unset, global settings will be used + bigquery_options: Optional[bigquery_options.BigQueryOptions] = None + # Note: use default factory instead of default instance so each thread initializes to default values + display_options: display_options.DisplayOptions = field( + default_factory=display_options.DisplayOptions + ) + sampling_options: sampling_options.SamplingOptions = field( + default_factory=sampling_options.SamplingOptions + ) + compute_options: compute_options.ComputeOptions = field( + default_factory=compute_options.ComputeOptions + ) + + class Options: """Global options affecting BigQuery DataFrames behavior.""" def __init__(self): - self._local = threading.local() - - # Initialize these in the property getters to make sure we do have a - # separate instance per thread. - self._local.bigquery_options = None - self._local.display_options = None - self._local.sampling_options = None - self._local.compute_options = None + self._local = ThreadLocalConfig() # BigQuery options are special because they can only be set once per # session, so we need an indicator as to whether we are using the @@ -61,21 +74,16 @@ def _init_bigquery_thread_local(self): @property def bigquery(self) -> bigquery_options.BigQueryOptions: """Options to use with the BigQuery engine.""" - if ( - bigquery_options := getattr(self._local, "bigquery_options", None) - ) is not None: + if self._local.bigquery_options is not None: # The only way we can get here is if someone called # _init_bigquery_thread_local. - return bigquery_options + return self._local.bigquery_options return self._bigquery_options @property def display(self) -> display_options.DisplayOptions: """Options controlling object representation.""" - if self._local.display_options is None: - self._local.display_options = display_options.DisplayOptions() - return self._local.display_options @property @@ -88,17 +96,11 @@ def sampling(self) -> sampling_options.SamplingOptions: matplotlib plotting). This option can be overriden by parameters in specific functions. 
""" - if self._local.sampling_options is None: - self._local.sampling_options = sampling_options.SamplingOptions() - return self._local.sampling_options @property def compute(self) -> compute_options.ComputeOptions: """Thread-local options controlling object computation.""" - if self._local.compute_options is None: - self._local.compute_options = compute_options.ComputeOptions() - return self._local.compute_options @property diff --git a/bigframes/bigquery/__init__.py b/bigframes/bigquery/__init__.py index 5808aa28bf..85a9010a7d 100644 --- a/bigframes/bigquery/__init__.py +++ b/bigframes/bigquery/__init__.py @@ -21,11 +21,15 @@ from __future__ import annotations import typing +from typing import Literal, Optional, Union import bigframes.constants as constants import bigframes.core.groupby as groupby +import bigframes.core.sql +import bigframes.ml.utils as utils import bigframes.operations as ops import bigframes.operations.aggregations as agg_ops +import bigframes.series if typing.TYPE_CHECKING: import bigframes.dataframe as dataframe @@ -148,3 +152,153 @@ def array_to_string(series: series.Series, delimiter: str) -> series.Series: """ return series._apply_unary_op(ops.ArrayToStringOp(delimiter=delimiter)) + + +def vector_search( + base_table: str, + column_to_search: str, + query: Union[dataframe.DataFrame, series.Series], + *, + query_column_to_search: Optional[str] = None, + top_k: Optional[int] = 10, + distance_type: Literal["euclidean", "cosine"] = "euclidean", + fraction_lists_to_search: Optional[float] = None, + use_brute_force: bool = False, +) -> dataframe.DataFrame: + """ + Conduct vector search which searches embeddings to find semantically similar entities. + + **Examples:** + + + >>> import bigframes.pandas as bpd + >>> import bigframes.bigquery as bbq + >>> bpd.options.display.progress_bar = None + + DataFrame embeddings for which to find nearest neighbors. The ``ARRAY`` column + is used as the search query: + + >>> search_query = bpd.DataFrame({"query_id": ["dog", "cat"], + ... "embedding": [[1.0, 2.0], [3.0, 5.2]]}) + >>> bbq.vector_search( + ... base_table="bigframes-dev.bigframes_tests_sys.base_table", + ... column_to_search="my_embedding", + ... query=search_query, + ... top_k=2) + query_id embedding id my_embedding distance + 1 cat [3. 5.2] 5 [5. 5.4] 2.009975 + 0 dog [1. 2.] 1 [1. 2.] 0.0 + 0 dog [1. 2.] 4 [1. 3.2] 1.2 + 1 cat [3. 5.2] 2 [2. 4.] 1.56205 + + [4 rows x 5 columns] + + Series embeddings for which to find nearest neighbors: + + >>> search_query = bpd.Series([[1.0, 2.0], [3.0, 5.2]], + ... index=["dog", "cat"], + ... name="embedding") + >>> bbq.vector_search( + ... base_table="bigframes-dev.bigframes_tests_sys.base_table", + ... column_to_search="my_embedding", + ... query=search_query, + ... top_k=2) + embedding id my_embedding distance + dog [1. 2.] 1 [1. 2.] 0.0 + cat [3. 5.2] 5 [5. 5.4] 2.009975 + dog [1. 2.] 4 [1. 3.2] 1.2 + cat [3. 5.2] 2 [2. 4.] 1.56205 + + [4 rows x 4 columns] + + You can specify the name of the column in the query DataFrame embeddings and distance type. + If you specify query_column_to_search_value, it will use the provided column which contains + the embeddings for which to find nearest neighbors. Otherwiese, it uses the column_to_search value. + + >>> search_query = bpd.DataFrame({"query_id": ["dog", "cat"], + ... "embedding": [[1.0, 2.0], [3.0, 5.2]], + ... "another_embedding": [[0.7, 2.2], [3.3, 5.2]]}) + >>> bbq.vector_search( + ... base_table="bigframes-dev.bigframes_tests_sys.base_table", + ... 
column_to_search="my_embedding", + ... query=search_query, + ... distance_type="cosine", + ... query_column_to_search="another_embedding", + ... top_k=2) + query_id embedding another_embedding id my_embedding distance + 1 cat [3. 5.2] [3.3 5.2] 2 [2. 4.] 0.005181 + 0 dog [1. 2.] [0.7 2.2] 4 [1. 3.2] 0.000013 + 1 cat [3. 5.2] [3.3 5.2] 1 [1. 2.] 0.005181 + 0 dog [1. 2.] [0.7 2.2] 3 [1.5 7. ] 0.004697 + + [4 rows x 6 columns] + + Args: + base_table (str): + The table to search for nearest neighbor embeddings. + column_to_search (str): + The name of the base table column to search for nearest neighbor embeddings. + The column must have a type of ``ARRAY``. All elements in the array must be non-NULL. + query (bigframes.dataframe.DataFrame | bigframes.dataframe.Series): + A Series or DataFrame that provides the embeddings for which to find nearest neighbors. + query_column_to_search (str): + Specifies the name of the column in the query that contains the embeddings for which to + find nearest neighbors. The column must have a type of ``ARRAY``. All elements in + the array must be non-NULL and all values in the column must have the same array dimensions + as the values in the ``column_to_search`` column. Can only be set when query is a DataFrame. + top_k (int, default 10): + Sepecifies the number of nearest neighbors to return. Default to 10. + distance_type (str, defalt "euclidean"): + Specifies the type of metric to use to compute the distance between two vectors. + Possible values are "euclidean" and "cosine". Default to "euclidean". + fraction_lists_to_search (float, range in [0.0, 1.0]): + Specifies the percentage of lists to search. Specifying a higher percentage leads to + higher recall and slower performance, and the converse is true when specifying a lower + percentage. It is only used when a vector index is also used. You can only specify + ``fraction_lists_to_search`` when ``use_brute_force`` is set to False. + use_brute_force (bool, default False): + Determines whether to use brute force search by skipping the vector index if one is available. + Default to False. + + Returns: + bigframes.dataframe.DataFrame: A DataFrame containing vector search result. + """ + if not fraction_lists_to_search and use_brute_force is True: + raise ValueError( + "You can't specify fraction_lists_to_search when use_brute_force is set to True." + ) + if ( + isinstance(query, bigframes.series.Series) + and query_column_to_search is not None + ): + raise ValueError( + "You can't specify query_column_to_search when query is a Series." + ) + # TODO(ashleyxu): Support options in vector search. b/344019989 + if fraction_lists_to_search is not None or use_brute_force is True: + raise NotImplementedError( + f"fraction_lists_to_search and use_brute_force is not supported. 
{constants.FEEDBACK_LINK}" + ) + options = { + "base_table": base_table, + "column_to_search": column_to_search, + "query_column_to_search": query_column_to_search, + "distance_type": distance_type, + "top_k": top_k, + "fraction_lists_to_search": fraction_lists_to_search, + "use_brute_force": use_brute_force, + } + + (query,) = utils.convert_to_dataframe(query) + sql_string, index_col_ids, index_labels = query._to_sql_query(include_index=True) + + sql = bigframes.core.sql.create_vector_search_sql( + sql_string=sql_string, options=options # type: ignore + ) + if index_col_ids is not None: + df = query._session.read_gbq(sql, index_col=index_col_ids) + else: + df = query._session.read_gbq(sql) + df.index.names = index_labels + + return df diff --git a/bigframes/core/__init__.py b/bigframes/core/__init__.py index 133d271fed..e0b63b4a8c 100644 --- a/bigframes/core/__init__.py +++ b/bigframes/core/__init__.py @@ -60,6 +60,7 @@ class ArrayValue: node: nodes.BigFrameNode + # DO NOT use, on deprecation path @classmethod def from_ibis( cls, @@ -69,11 +70,13 @@ def from_ibis( hidden_ordering_columns: Sequence[ibis_types.Value], ordering: orderings.ExpressionOrdering, ): + import bigframes.core.compile.ibis_types + node = nodes.ReadGbqNode( table=table, table_session=session, columns=tuple( - bigframes.dtypes.ibis_value_to_canonical_type(column) + bigframes.core.compile.ibis_types.ibis_value_to_canonical_type(column) for column in columns ), hidden_ordering_columns=tuple(hidden_ordering_columns), @@ -95,6 +98,23 @@ def from_pyarrow(cls, arrow_table: pa.Table, session: Session): ) return cls(node) + @classmethod + def from_cached( + cls, + original: ArrayValue, + table: google.cloud.bigquery.Table, + ordering: orderings.ExpressionOrdering, + ): + node = nodes.CachedTableNode( + original_node=original.node, + project_id=table.reference.project, + dataset_id=table.reference.dataset_id, + table_id=table.reference.table_id, + physical_schema=tuple(table.schema), + ordering=ordering, + ) + return cls(node) + @classmethod def from_table( cls, @@ -105,7 +125,10 @@ def from_table( predicate: Optional[str] = None, at_time: Optional[datetime.datetime] = None, primary_key: Sequence[str] = (), + offsets_col: Optional[str] = None, ): + if offsets_col and primary_key: + raise ValueError("must set at most one of 'offests', 'primary_key'") if any(i.field_type == "JSON" for i in table.schema if i.name in schema.names): warnings.warn( "Interpreting JSON column(s) as StringDtype. This behavior may change in future versions.", @@ -116,7 +139,8 @@ def from_table( dataset_id=table.reference.dataset_id, table_id=table.reference.table_id, physical_schema=tuple(table.schema), - total_order_cols=tuple(primary_key), + total_order_cols=(offsets_col,) if offsets_col else tuple(primary_key), + order_col_is_sequential=(offsets_col is not None), columns=schema, at_time=at_time, table_session=session, @@ -150,6 +174,24 @@ def _compiled_schema(self) -> schemata.ArraySchema: ) return schemata.ArraySchema(items) + def as_cached( + self: ArrayValue, + cache_table: google.cloud.bigquery.Table, + ordering: Optional[orderings.ExpressionOrdering], + ) -> ArrayValue: + """ + Replace the node with an equivalent one that references a tabel where the value has been materialized to. 
+ """ + node = nodes.CachedTableNode( + original_node=self.node, + project_id=cache_table.reference.project, + dataset_id=cache_table.reference.dataset_id, + table_id=cache_table.reference.table_id, + physical_schema=tuple(cache_table.schema), + ordering=ordering, + ) + return ArrayValue(node) + def _try_evaluate_local(self): """Use only for unit testing paths - not fully featured. Will throw exception if fails.""" import ibis @@ -192,6 +234,8 @@ def promote_offsets(self, col_id: str) -> ArrayValue: """ Convenience function to promote copy of column offsets to a value column. Can be used to reset index. """ + if not self.session._strictly_ordered: + raise ValueError("Generating offsets not supported in unordered mode") return ArrayValue(nodes.PromoteOffsetsNode(child=self.node, col_id=col_id)) def concat(self, other: typing.Sequence[ArrayValue]) -> ArrayValue: @@ -340,6 +384,10 @@ def project_window_op( never_skip_nulls: will disable null skipping for operators that would otherwise do so skip_reproject_unsafe: skips the reprojection step, can be used when performing many non-dependent window operations, user responsible for not nesting window expressions, or using outputs as join, filter or aggregation keys before a reprojection """ + if not self.session._strictly_ordered: + # TODO: Support unbounded windows with aggregate ops and some row-order-independent analytic ops + # TODO: Support non-deterministic windowing + raise ValueError("Windowed ops not supported in unordered mode") return ArrayValue( nodes.WindowOpNode( child=self.node, @@ -391,8 +439,9 @@ def unpivot( """ # There will be N labels, used to disambiguate which of N source columns produced each output row explode_offsets_id = bigframes.core.guid.generate_guid("unpivot_offsets_") - labels_array = self._create_unpivot_labels_array(row_labels, index_col_ids) - labels_array = labels_array.promote_offsets(explode_offsets_id) + labels_array = self._create_unpivot_labels_array( + row_labels, index_col_ids, explode_offsets_id + ) # Unpivot creates N output rows for each input row, labels disambiguate these N rows joined_array = self._cross_join_w_labels(labels_array, join_side) @@ -458,6 +507,7 @@ def _create_unpivot_labels_array( self, former_column_labels: typing.Sequence[typing.Hashable], col_ids: typing.Sequence[str], + offsets_id: str, ) -> ArrayValue: """Create an ArrayValue from a list of label tuples.""" rows = [] @@ -468,6 +518,7 @@ def _create_unpivot_labels_array( col_ids[i]: (row_label[i] if pandas.notnull(row_label[i]) else None) for i in range(len(col_ids)) } + row[offsets_id] = row_offset rows.append(row) return ArrayValue.from_pyarrow(pa.Table.from_pylist(rows), session=self.session) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index ea063669d5..301bcc20e9 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -508,11 +508,24 @@ def try_peek( else: return None - def to_pandas_batches(self): - """Download results one message at a time.""" + def to_pandas_batches( + self, page_size: Optional[int] = None, max_results: Optional[int] = None + ): + """Download results one message at a time. 
+ + page_size and max_results determine the size and number of batches, + see https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.job.QueryJob#google_cloud_bigquery_job_QueryJob_result""" dtypes = dict(zip(self.index_columns, self.index.dtypes)) dtypes.update(zip(self.value_columns, self.dtypes)) - results_iterator, _ = self.session._execute(self.expr, sorted=True) + _, query_job = self.session._query_to_destination( + self.session._to_sql(self.expr, sorted=True), + list(self.index_columns), + api_name="cached", + do_clustering=False, + ) + results_iterator = query_job.result( + page_size=page_size, max_results=max_results + ) for arrow_table in results_iterator.to_arrow_iterable( bqstorage_client=self.session.bqstoragereadclient ): @@ -540,7 +553,7 @@ def _materialize_local( """Run query and download results as a pandas DataFrame. Return the total number of results as well.""" # TODO(swast): Allow for dry run and timeout. _, query_job = self.session._query_to_destination( - self.session._to_sql(self.expr, sorted=True), + self.session._to_sql(self.expr, sorted=materialize_options.ordered), list(self.index_columns), api_name="cached", do_clustering=False, @@ -1003,7 +1016,7 @@ def aggregate_all_and_stack( index_columns=[index_id], column_labels=self.column_labels, index_labels=[None], - ).transpose(original_row_index=pd.Index([None])) + ).transpose(original_row_index=pd.Index([None]), single_row_mode=True) else: # axis_n == 1 # using offsets as identity to group on. # TODO: Allow to promote identity/total_order columns instead for better perf @@ -1646,6 +1659,8 @@ def melt( value_vars=typing.Sequence[str], var_names=typing.Sequence[typing.Hashable], value_name: typing.Hashable = "value", + *, + create_offsets_index: bool = True, ): """ Unpivot columns to produce longer, narrower dataframe. @@ -1666,20 +1681,31 @@ def melt( index_col_ids=var_col_ids, join_side="right", ) - index_id = guid.generate_guid() - unpivot_expr = unpivot_expr.promote_offsets(index_id) + + if create_offsets_index: + index_id = guid.generate_guid() + unpivot_expr = unpivot_expr.promote_offsets(index_id) + index_cols = [index_id] + else: + index_cols = [] + # Need to reorder to get id_vars before var_col and unpivot_col unpivot_expr = unpivot_expr.select_columns( - [index_id, *id_vars, *var_col_ids, unpivot_col_id] + [*index_cols, *id_vars, *var_col_ids, unpivot_col_id] ) return Block( unpivot_expr, column_labels=[*id_labels, *var_names, value_name], - index_columns=[index_id], + index_columns=index_cols, ) - def transpose(self, *, original_row_index: Optional[pd.Index] = None) -> Block: + def transpose( + self, + *, + original_row_index: Optional[pd.Index] = None, + single_row_mode: bool = False, + ) -> Block: """Transpose the block. Will fail if dtypes aren't coercible to a common type or too many rows. Can provide the original_row_index directly if it is already known, otherwise a query is needed. 
""" @@ -1705,7 +1731,11 @@ def transpose(self, *, original_row_index: Optional[pd.Index] = None) -> Block: block.column_labels, pd.Index(range(len(block.column_labels))) ) ) - numbered_block, offsets = numbered_block.promote_offsets() + # TODO: Determine if single row from expression tree (after aggregation without groupby) + if single_row_mode: + numbered_block, offsets = numbered_block.create_constant(0) + else: + numbered_block, offsets = numbered_block.promote_offsets() stacked_block = numbered_block.melt( id_vars=(offsets,), @@ -1714,6 +1744,7 @@ def transpose(self, *, original_row_index: Optional[pd.Index] = None) -> Block: "col_offset", ), value_vars=block.value_columns, + create_offsets_index=False, ) col_labels = stacked_block.value_columns[-2 - original_col_index.nlevels : -2] col_offset = stacked_block.value_columns[-2] # disambiguator we created earlier @@ -2339,12 +2370,19 @@ def _get_rows_as_json_values(self) -> Block: index_columns_count = len(self.index_columns) # column references to form the array of values for the row - column_references_csv = sql.csv( - [sql.cast_as_string(col) for col in self.expr.column_ids] - ) + column_types = list(self.index.dtypes) + list(self.dtypes) + column_references = [] + for type_, col in zip(column_types, self.expr.column_ids): + if isinstance(type_, pd.ArrowDtype) and pa.types.is_binary( + type_.pyarrow_dtype + ): + column_references.append(sql.to_json_string(col)) + else: + column_references.append(sql.cast_as_string(col)) + + column_references_csv = sql.csv(column_references) # types of the columns to serialize for the row - column_types = list(self.index.dtypes) + list(self.dtypes) column_types_csv = sql.csv( [sql.simple_literal(str(typ)) for typ in column_types] ) diff --git a/bigframes/core/compile/aggregate_compiler.py b/bigframes/core/compile/aggregate_compiler.py index fada4ebbd8..58973b10eb 100644 --- a/bigframes/core/compile/aggregate_compiler.py +++ b/bigframes/core/compile/aggregate_compiler.py @@ -22,10 +22,10 @@ import pandas as pd import bigframes.constants as constants +import bigframes.core.compile.ibis_types as compile_ibis_types import bigframes.core.compile.scalar_op_compiler as scalar_compilers import bigframes.core.expression as ex import bigframes.core.window_spec as window_spec -import bigframes.dtypes as dtypes import bigframes.operations.aggregations as agg_ops scalar_compiler = scalar_compilers.scalar_op_compiler @@ -323,7 +323,7 @@ def _( for this_bin in range(op.bins - 1): out = out.when( x <= (col_min + (this_bin + 1) * bin_width), - dtypes.literal_to_ibis_scalar( + compile_ibis_types.literal_to_ibis_scalar( this_bin, force_dtype=pd.Int64Dtype() ), ) @@ -352,8 +352,8 @@ def _( out = out.when(x.notnull(), interval_struct) else: # Interpret as intervals for interval in op.bins: - left = dtypes.literal_to_ibis_scalar(interval[0]) - right = dtypes.literal_to_ibis_scalar(interval[1]) + left = compile_ibis_types.literal_to_ibis_scalar(interval[0]) + right = compile_ibis_types.literal_to_ibis_scalar(interval[1]) condition = (x > left) & (x <= right) interval_struct = ibis.struct( {"left_exclusive": left, "right_inclusive": right} @@ -370,7 +370,7 @@ def _( window=None, ) -> ibis_types.IntegerValue: if isinstance(self.quantiles, int): - quantiles_ibis = dtypes.literal_to_ibis_scalar(self.quantiles) + quantiles_ibis = compile_ibis_types.literal_to_ibis_scalar(self.quantiles) percent_ranks = cast( ibis_types.FloatingColumn, _apply_window_if_present(column.percent_rank(), window), @@ -383,13 +383,19 @@ def _( 
_apply_window_if_present(column.percent_rank(), window), ) out = ibis.case() - first_ibis_quantile = dtypes.literal_to_ibis_scalar(self.quantiles[0]) + first_ibis_quantile = compile_ibis_types.literal_to_ibis_scalar( + self.quantiles[0] + ) out = out.when(percent_ranks < first_ibis_quantile, None) for bucket_n in range(len(self.quantiles) - 1): - ibis_quantile = dtypes.literal_to_ibis_scalar(self.quantiles[bucket_n + 1]) + ibis_quantile = compile_ibis_types.literal_to_ibis_scalar( + self.quantiles[bucket_n + 1] + ) out = out.when( percent_ranks <= ibis_quantile, - dtypes.literal_to_ibis_scalar(bucket_n, force_dtype=pd.Int64Dtype()), + compile_ibis_types.literal_to_ibis_scalar( + bucket_n, force_dtype=pd.Int64Dtype() + ), ) out = out.else_(None) return out.end() # type: ignore diff --git a/bigframes/core/compile/compiled.py b/bigframes/core/compile/compiled.py index 552061f612..dac814a08c 100644 --- a/bigframes/core/compile/compiled.py +++ b/bigframes/core/compile/compiled.py @@ -28,6 +28,7 @@ import pandas import bigframes.core.compile.aggregate_compiler as agg_compiler +import bigframes.core.compile.ibis_types import bigframes.core.compile.scalar_op_compiler as op_compilers import bigframes.core.expression as ex import bigframes.core.guid @@ -157,16 +158,19 @@ def _get_ibis_column(self, key: str) -> ibis_types.Value: ) return typing.cast( ibis_types.Value, - bigframes.dtypes.ibis_value_to_canonical_type(self._column_names[key]), + bigframes.core.compile.ibis_types.ibis_value_to_canonical_type( + self._column_names[key] + ), ) def get_column_type(self, key: str) -> bigframes.dtypes.Dtype: ibis_type = typing.cast( - bigframes.dtypes.IbisDtype, self._get_ibis_column(key).type() + bigframes.core.compile.ibis_types.IbisDtype, + self._get_ibis_column(key).type(), ) return typing.cast( bigframes.dtypes.Dtype, - bigframes.dtypes.ibis_dtype_to_bigframes_dtype(ibis_type), + bigframes.core.compile.ibis_types.ibis_dtype_to_bigframes_dtype(ibis_type), ) def _aggregate_base( @@ -332,7 +336,8 @@ def _to_ibis_expr( # Make sure all dtypes are the "canonical" ones for BigFrames. This is # important for operations like UNION where the schema must match. table = self._table.select( - bigframes.dtypes.ibis_value_to_canonical_type(column) for column in columns + bigframes.core.compile.ibis_types.ibis_value_to_canonical_type(column) + for column in columns ) base_table = table if self._reduced_predicate is not None: @@ -579,7 +584,10 @@ def from_pandas( ibis_values = ibis_values.assign(**{ORDER_ID_COLUMN: range(len(pd_df))}) # derive the ibis schema from the original pandas schema ibis_schema = [ - (name, bigframes.dtypes.bigframes_dtype_to_ibis_dtype(dtype)) + ( + name, + bigframes.core.compile.ibis_types.bigframes_dtype_to_ibis_dtype(dtype), + ) for name, dtype in zip(schema.names, schema.dtypes) ] ibis_schema.append((ORDER_ID_COLUMN, ibis_dtypes.int64)) @@ -993,7 +1001,9 @@ def _to_ibis_expr( # Make sure all dtypes are the "canonical" ones for BigFrames. This is # important for operations like UNION where the schema must match. 
table = table.select( - bigframes.dtypes.ibis_value_to_canonical_type(table[column]) + bigframes.core.compile.ibis_types.ibis_value_to_canonical_type( + table[column] + ) for column in table.columns ) base_table = table diff --git a/bigframes/core/compile/compiler.py b/bigframes/core/compile/compiler.py index f948d10a5b..021ec8b176 100644 --- a/bigframes/core/compile/compiler.py +++ b/bigframes/core/compile/compiler.py @@ -26,11 +26,11 @@ import bigframes.core.compile.compiled as compiled import bigframes.core.compile.concat as concat_impl import bigframes.core.compile.default_ordering as default_ordering +import bigframes.core.compile.ibis_types import bigframes.core.compile.schema_translator import bigframes.core.compile.single_column import bigframes.core.nodes as nodes import bigframes.core.ordering as bf_ordering -import bigframes.dtypes as bigframes_dtypes if typing.TYPE_CHECKING: import bigframes.core @@ -96,6 +96,48 @@ def compile_readlocal(node: nodes.ReadLocalNode, ordered: bool = True): return ordered_ir.to_unordered() +@_compile_node.register +def compile_cached_table(node: nodes.CachedTableNode, ordered: bool = True): + full_table_name = f"{node.project_id}.{node.dataset_id}.{node.table_id}" + used_columns = ( + *node.schema.names, + *node.hidden_columns, + ) + # Physical schema might include unused columns, unsupported datatypes like JSON + physical_schema = ibis.backends.bigquery.BigQuerySchema.to_ibis( + list(i for i in node.physical_schema if i.name in used_columns) + ) + ibis_table = ibis.table(physical_schema, full_table_name) + if ordered: + if node.ordering is None: + # If this happens, session malfunctioned while applying cached results. + raise ValueError( + "Cannot use unordered cached value. Result requires ordering information." + ) + return compiled.OrderedIR( + ibis_table, + columns=tuple( + bigframes.core.compile.ibis_types.ibis_value_to_canonical_type( + ibis_table[col] + ) + for col in node.schema.names + ), + ordering=node.ordering, + hidden_ordering_columns=[ibis_table[c] for c in node.hidden_columns], + ) + + else: + return compiled.UnorderedIR( + ibis_table, + columns=tuple( + bigframes.core.compile.ibis_types.ibis_value_to_canonical_type( + ibis_table[col] + ) + for col in node.schema.names + ), + ) + + @_compile_node.register def compile_readtable(node: nodes.ReadTableNode, ordered: bool = True): if ordered: @@ -133,7 +175,9 @@ def compile_read_table_unordered(node: nodes.ReadTableNode): return compiled.UnorderedIR( ibis_table, tuple( - bigframes_dtypes.ibis_value_to_canonical_type(ibis_table[col]) + bigframes.core.compile.ibis_types.ibis_value_to_canonical_type( + ibis_table[col] + ) for col in node.schema.names ), ) @@ -169,7 +213,9 @@ def compile_read_table_ordered(node: nodes.ReadTableNode): return compiled.OrderedIR( ibis_table, columns=tuple( - bigframes_dtypes.ibis_value_to_canonical_type(ibis_table[col]) + bigframes.core.compile.ibis_types.ibis_value_to_canonical_type( + ibis_table[col] + ) for col in node.schema.names ), ordering=ordering, diff --git a/bigframes/core/compile/googlesql/__init__.py b/bigframes/core/compile/googlesql/__init__.py new file mode 100644 index 0000000000..32265c0d51 --- /dev/null +++ b/bigframes/core/compile/googlesql/__init__.py @@ -0,0 +1,52 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Python classes representing GoogleSQL syntax nodes, adhering to the official syntax: +https://cloud.google.com/bigquery/docs/reference/standard-sql/query-syntax""" + +from __future__ import annotations + +from bigframes.core.compile.googlesql.expression import ( + AliasExpression, + ColumnExpression, + CTEExpression, + StarExpression, + TableExpression, +) +from bigframes.core.compile.googlesql.query import ( + AsAlias, + FromClause, + FromItem, + NonRecursiveCTE, + QueryExpr, + Select, + SelectAll, + SelectExpression, +) + +__all__ = [ + "AliasExpression", + "AsAlias", + "ColumnExpression", + "CTEExpression", + "FromClause", + "FromItem", + "NonRecursiveCTE", + "QueryExpr", + "Select", + "SelectAll", + "SelectExpression", + "StarExpression", + "TableExpression", +] diff --git a/bigframes/core/compile/googlesql/abc.py b/bigframes/core/compile/googlesql/abc.py new file mode 100644 index 0000000000..081836467c --- /dev/null +++ b/bigframes/core/compile/googlesql/abc.py @@ -0,0 +1,25 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import abc + + +class SQLSyntax(abc.ABC): + """Abstract base class provides GoogleSQL syntax.""" + + @abc.abstractmethod + def sql(self): + ... diff --git a/bigframes/core/compile/googlesql/expression.py b/bigframes/core/compile/googlesql/expression.py new file mode 100644 index 0000000000..702aa2c5e5 --- /dev/null +++ b/bigframes/core/compile/googlesql/expression.py @@ -0,0 +1,95 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import dataclasses +import typing + +import bigframes.core.compile.googlesql.abc as abc + +"""This module represents GoogleSQL `expression` and its extensions. +Core class: + +* `expression`: Models basic SQL expressions. + +Extended classes (not part of standard GoogleSQL syntax, but added for convenience): + +* `ColumnExpression`: Represents column references. 
+* `TableExpression`: Represents table references. +* `AliasExpression`: Represents aliased expressions. +* ... +""" + + +@dataclasses.dataclass +class Expression(abc.SQLSyntax): + pass + + +@dataclasses.dataclass +class ColumnExpression(Expression): + name: str + parent: typing.Optional[TableExpression | AliasExpression | CTEExpression] = None + + def sql(self) -> str: + if self.parent is not None: + return f"{self.parent.sql()}.`{self.name}`" + return f"`{self.name}`" + + +@dataclasses.dataclass +class StarExpression(Expression): + parent: typing.Optional[TableExpression | AliasExpression | CTEExpression] = None + + def sql(self) -> str: + if self.parent is not None: + return f"{self.parent.sql()}.*" + return "*" + + +@dataclasses.dataclass +class TableExpression(Expression): + table_id: str + dataset_id: typing.Optional[str] = None + project_id: typing.Optional[str] = None + + def __post_init__(self): + if self.project_id is not None and self.dataset_id is None: + raise ValueError("The `dataset_id` is missing.") + + def sql(self) -> str: + text = [] + if self.project_id is not None: + text.append(f"`{self.project_id}`") + if self.dataset_id is not None: + text.append(f"`{self.dataset_id}`") + text.append(f"`{self.table_id}`") + return ".".join(text) + + +@dataclasses.dataclass +class AliasExpression(Expression): + alias: str + + def sql(self) -> str: + return f"`{self.alias}`" + + +@dataclasses.dataclass +class CTEExpression(Expression): + name: str + + def sql(self) -> str: + return f"`{self.name}`" diff --git a/bigframes/core/compile/googlesql/query.py b/bigframes/core/compile/googlesql/query.py new file mode 100644 index 0000000000..6210aa67f4 --- /dev/null +++ b/bigframes/core/compile/googlesql/query.py @@ -0,0 +1,165 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import dataclasses +import typing + +import bigframes.core.compile.googlesql.abc as abc +import bigframes.core.compile.googlesql.expression as expr + +"""This module provides a structured representation of GoogleSQL syntax using nodes. 
+Each node's name and child nodes are designed to strictly follow the official GoogleSQL +syntax rules outlined in the documentation: +https://cloud.google.com/bigquery/docs/reference/standard-sql/query-syntax""" + + +@dataclasses.dataclass +class QueryExpr(abc.SQLSyntax): + """This class represents GoogleSQL `query_expr` syntax.""" + + select: Select + with_cte_list: typing.Sequence[NonRecursiveCTE] = () + + def sql(self) -> str: + text = [] + if len(self.with_cte_list) > 0: + with_cte_text = ",\n".join( + [with_cte.sql() for with_cte in self.with_cte_list] + ) + text.append(f"WITH {with_cte_text}") + + text.append(self.select.sql()) + return "\n".join(text) + + +@dataclasses.dataclass +class Select(abc.SQLSyntax): + """This class represents GoogleSQL `select` syntax.""" + + select_list: typing.Sequence[typing.Union[SelectExpression, SelectAll]] + from_clause_list: typing.Sequence[FromClause] = () + + def sql(self) -> str: + text = ["SELECT"] + + select_list_sql = ",\n".join([select.sql() for select in self.select_list]) + text.append(select_list_sql) + + if self.from_clause_list is not None: + from_clauses_sql = ",\n".join( + [clause.sql() for clause in self.from_clause_list] + ) + text.append(f"FROM\n{from_clauses_sql}") + return "\n".join(text) + + +@dataclasses.dataclass +class SelectExpression(abc.SQLSyntax): + """This class represents `select_expression`.""" + + expression: expr.ColumnExpression + alias: typing.Optional[expr.AliasExpression] = None + + def sql(self) -> str: + if self.alias is None: + return self.expression.sql() + else: + return f"{self.expression.sql()} AS {self.alias.sql()}" + + +@dataclasses.dataclass +class SelectAll(abc.SQLSyntax): + """This class represents `select_all` (aka. `SELECT *`).""" + + expression: expr.StarExpression + + def sql(self) -> str: + return self.expression.sql() + + +@dataclasses.dataclass +class FromClause(abc.SQLSyntax): + """This class represents GoogleSQL `from_clause` syntax.""" + + from_item: FromItem + + def sql(self) -> str: + return self.from_item.sql() + + +@dataclasses.dataclass +class FromItem(abc.SQLSyntax): + """This class represents GoogleSQL `from_item` syntax.""" + + table_name: typing.Optional[expr.TableExpression] = None + # Note: Temporarily introduces the `str` type to interact with pre-existing, + # compiled SQL strings. 
+ query_expr: typing.Optional[QueryExpr | str] = None + cte_name: typing.Optional[expr.CTEExpression] = None + as_alias: typing.Optional[AsAlias] = None + + def __post_init__(self): + non_none = sum( + expr is not None + for expr in [ + self.table_name, + self.query_expr, + self.cte_name, + ] + ) + if non_none != 1: + raise ValueError("Exactly one of expressions must be provided.") + + def sql(self) -> str: + if self.table_name is not None: + text = self.table_name.sql() + elif self.query_expr is not None: + text = ( + self.query_expr + if isinstance(self.query_expr, str) + else self.query_expr.sql() + ) + text = f"({text})" + elif self.cte_name is not None: + text = self.cte_name.sql() + else: + raise ValueError("One of from items must be provided.") + + if self.as_alias is None: + return text + else: + return f"{text} {self.as_alias.sql()}" + + +@dataclasses.dataclass +class NonRecursiveCTE(abc.SQLSyntax): + """This class represents GoogleSQL `non_recursive_cte` syntax.""" + + cte_name: expr.CTEExpression + query_expr: QueryExpr + + def sql(self) -> str: + return f"{self.cte_name.sql()} AS (\n{self.query_expr.sql()}\n)" + + +@dataclasses.dataclass +class AsAlias(abc.SQLSyntax): + """This class represents GoogleSQL `as_alias` syntax.""" + + alias: expr.AliasExpression + + def sql(self) -> str: + return f"AS {self.alias.sql()}" diff --git a/bigframes/core/compile/ibis_types.py b/bigframes/core/compile/ibis_types.py new file mode 100644 index 0000000000..f73fce3e4d --- /dev/null +++ b/bigframes/core/compile/ibis_types.py @@ -0,0 +1,476 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
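The new bigframes/core/compile/googlesql package above builds SQL strings by composing small dataclasses, each of which renders itself through a sql() method. The sketch below is illustrative only and not part of the patch: the table, dataset, and column names are placeholders, and the whitespace of the emitted SQL may differ from the single-line comments shown. It simply exercises the classes exported from that package as defined in this diff.

import bigframes.core.compile.googlesql as googlesql

# Inner query, roughly: SELECT * FROM `proj`.`ds`.`tbl`  (placeholder names)
inner = googlesql.QueryExpr(
    select=googlesql.Select(
        select_list=[googlesql.SelectAll(googlesql.StarExpression())],
        from_clause_list=[
            googlesql.FromClause(
                googlesql.FromItem(
                    table_name=googlesql.TableExpression(
                        table_id="tbl", dataset_id="ds", project_id="proj"
                    )
                )
            )
        ],
    )
)

# Outer query, roughly: WITH `t` AS (<inner>) SELECT `col` AS `c` FROM `t`
query = googlesql.QueryExpr(
    select=googlesql.Select(
        select_list=[
            googlesql.SelectExpression(
                expression=googlesql.ColumnExpression("col"),
                alias=googlesql.AliasExpression("c"),
            )
        ],
        from_clause_list=[
            googlesql.FromClause(
                googlesql.FromItem(cte_name=googlesql.CTEExpression("t"))
            )
        ],
    ),
    with_cte_list=[
        googlesql.NonRecursiveCTE(
            cte_name=googlesql.CTEExpression("t"), query_expr=inner
        )
    ],
)

print(query.sql())

Note that FromItem.__post_init__ requires exactly one of table_name, query_expr, or cte_name, so each FromItem above supplies a single source; FromItem.query_expr also accepts a plain str so that pre-compiled SQL text can be wrapped as a subquery.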
+from __future__ import annotations + +import textwrap +from typing import Any, cast, Dict, Iterable, Optional, Tuple, Union +import warnings + +import bigframes_vendored.ibis.backends.bigquery.datatypes as third_party_ibis_bqtypes +import bigframes_vendored.ibis.expr.operations as vendored_ibis_ops +import geopandas as gpd # type: ignore +import google.cloud.bigquery as bigquery +import ibis +import ibis.expr.datatypes as ibis_dtypes +from ibis.expr.datatypes.core import dtype as python_type_to_bigquery_type +import ibis.expr.types as ibis_types +import numpy as np +import pandas as pd +import pyarrow as pa + +import bigframes.constants as constants +import bigframes.dtypes + +# Type hints for Ibis data types supported by BigQuery DataFrame +IbisDtype = Union[ + ibis_dtypes.Boolean, + ibis_dtypes.Float64, + ibis_dtypes.Int64, + ibis_dtypes.String, + ibis_dtypes.Date, + ibis_dtypes.Time, + ibis_dtypes.Timestamp, +] + + +BIDIRECTIONAL_MAPPINGS: Iterable[Tuple[IbisDtype, bigframes.dtypes.Dtype]] = ( + (ibis_dtypes.boolean, pd.BooleanDtype()), + (ibis_dtypes.date, pd.ArrowDtype(pa.date32())), + (ibis_dtypes.float64, pd.Float64Dtype()), + (ibis_dtypes.int64, pd.Int64Dtype()), + (ibis_dtypes.string, pd.StringDtype(storage="pyarrow")), + (ibis_dtypes.time, pd.ArrowDtype(pa.time64("us"))), + (ibis_dtypes.Timestamp(timezone=None), pd.ArrowDtype(pa.timestamp("us"))), + ( + ibis_dtypes.Timestamp(timezone="UTC"), + pd.ArrowDtype(pa.timestamp("us", tz="UTC")), + ), + (ibis_dtypes.binary, pd.ArrowDtype(pa.binary())), + ( + ibis_dtypes.Decimal(precision=38, scale=9, nullable=True), + pd.ArrowDtype(pa.decimal128(38, 9)), + ), + ( + ibis_dtypes.Decimal(precision=76, scale=38, nullable=True), + pd.ArrowDtype(pa.decimal256(76, 38)), + ), + ( + ibis_dtypes.GeoSpatial(geotype="geography", srid=4326, nullable=True), + gpd.array.GeometryDtype(), + ), +) + +BIGFRAMES_TO_IBIS: Dict[bigframes.dtypes.Dtype, ibis_dtypes.DataType] = { + pandas: ibis for ibis, pandas in BIDIRECTIONAL_MAPPINGS +} + +IBIS_TO_ARROW: Dict[ibis_dtypes.DataType, pa.DataType] = { + ibis_dtypes.boolean: pa.bool_(), + ibis_dtypes.date: pa.date32(), + ibis_dtypes.float64: pa.float64(), + ibis_dtypes.int64: pa.int64(), + ibis_dtypes.string: pa.string(), + ibis_dtypes.time: pa.time64("us"), + ibis_dtypes.Timestamp(timezone=None): pa.timestamp("us"), + ibis_dtypes.Timestamp(timezone="UTC"): pa.timestamp("us", tz="UTC"), + ibis_dtypes.binary: pa.binary(), + ibis_dtypes.Decimal(precision=38, scale=9, nullable=True): pa.decimal128(38, 9), + ibis_dtypes.Decimal(precision=76, scale=38, nullable=True): pa.decimal256(76, 38), +} + +ARROW_TO_IBIS = {arrow: ibis for ibis, arrow in IBIS_TO_ARROW.items()} + +IBIS_TO_BIGFRAMES: Dict[ibis_dtypes.DataType, bigframes.dtypes.Dtype] = { + ibis: pandas for ibis, pandas in BIDIRECTIONAL_MAPPINGS +} +# Allow REQUIRED fields to map correctly. 
+IBIS_TO_BIGFRAMES.update( + {ibis.copy(nullable=False): pandas for ibis, pandas in BIDIRECTIONAL_MAPPINGS} +) +IBIS_TO_BIGFRAMES.update( + { + # TODO: Interval + } +) + + +def cast_ibis_value( + value: ibis_types.Value, to_type: ibis_dtypes.DataType +) -> ibis_types.Value: + """Perform compatible type casts of ibis values + + Args: + value: + Ibis value, which could be a literal, scalar, or column + + to_type: + The Ibis type to cast to + + Returns: + A new Ibis value of type to_type + + Raises: + TypeError: if the type cast cannot be executed""" + if value.type() == to_type: + return value + # casts that just work + # TODO(bmil): add to this as more casts are verified + good_casts = { + ibis_dtypes.bool: (ibis_dtypes.int64,), + ibis_dtypes.int64: ( + ibis_dtypes.bool, + ibis_dtypes.float64, + ibis_dtypes.string, + ibis_dtypes.Decimal(precision=38, scale=9), + ibis_dtypes.Decimal(precision=76, scale=38), + ibis_dtypes.time, + ibis_dtypes.timestamp, + ibis_dtypes.Timestamp(timezone="UTC"), + ), + ibis_dtypes.float64: ( + ibis_dtypes.string, + ibis_dtypes.int64, + ibis_dtypes.Decimal(precision=38, scale=9), + ibis_dtypes.Decimal(precision=76, scale=38), + ), + ibis_dtypes.string: ( + ibis_dtypes.int64, + ibis_dtypes.float64, + ibis_dtypes.Decimal(precision=38, scale=9), + ibis_dtypes.Decimal(precision=76, scale=38), + ibis_dtypes.binary, + ibis_dtypes.date, + ibis_dtypes.timestamp, + ibis_dtypes.Timestamp(timezone="UTC"), + ), + ibis_dtypes.date: ( + ibis_dtypes.string, + ibis_dtypes.timestamp, + ibis_dtypes.Timestamp(timezone="UTC"), + ), + ibis_dtypes.Decimal(precision=38, scale=9): ( + ibis_dtypes.float64, + ibis_dtypes.Decimal(precision=76, scale=38), + ), + ibis_dtypes.Decimal(precision=76, scale=38): ( + ibis_dtypes.float64, + ibis_dtypes.Decimal(precision=38, scale=9), + ), + ibis_dtypes.time: ( + ibis_dtypes.int64, + ibis_dtypes.string, + ), + ibis_dtypes.timestamp: ( + ibis_dtypes.date, + ibis_dtypes.int64, + ibis_dtypes.string, + ibis_dtypes.time, + ibis_dtypes.Timestamp(timezone="UTC"), + ), + ibis_dtypes.Timestamp(timezone="UTC"): ( + ibis_dtypes.date, + ibis_dtypes.int64, + ibis_dtypes.string, + ibis_dtypes.time, + ibis_dtypes.timestamp, + ), + ibis_dtypes.binary: (ibis_dtypes.string,), + } + + value = ibis_value_to_canonical_type(value) + if value.type() in good_casts: + if to_type in good_casts[value.type()]: + return value.cast(to_type) + else: + # this should never happen + raise TypeError( + f"Unexpected value type {value.type()}. {constants.FEEDBACK_LINK}" + ) + + # casts that need some encouragement + + # BigQuery casts bools to lower case strings. Capitalize the result to match Pandas + # TODO(bmil): remove this workaround after fixing Ibis + if value.type() == ibis_dtypes.bool and to_type == ibis_dtypes.string: + return cast(ibis_types.StringValue, value.cast(to_type)).capitalize() + + if value.type() == ibis_dtypes.bool and to_type == ibis_dtypes.float64: + return value.cast(ibis_dtypes.int64).cast(ibis_dtypes.float64) + + if value.type() == ibis_dtypes.float64 and to_type == ibis_dtypes.bool: + return value != ibis_types.literal(0) + + raise TypeError( + f"Unsupported cast {value.type()} to {to_type}. {constants.FEEDBACK_LINK}" + ) + + +def ibis_value_to_canonical_type(value: ibis_types.Value) -> ibis_types.Value: + """Converts an Ibis expression to canonical type. + + This is useful in cases where multiple types correspond to the same BigFrames dtype. 
+ """ + ibis_type = value.type() + name = value.get_name() + if ibis_type.is_json(): + value = vendored_ibis_ops.ToJsonString(value).to_expr() + return value.name(name) + # Allow REQUIRED fields to be joined with NULLABLE fields. + nullable_type = ibis_type.copy(nullable=True) + return value.cast(nullable_type).name(name) + + +def bigframes_dtype_to_ibis_dtype( + bigframes_dtype: Union[ + bigframes.dtypes.DtypeString, bigframes.dtypes.Dtype, np.dtype[Any] + ] +) -> ibis_dtypes.DataType: + """Converts a BigQuery DataFrames supported dtype to an Ibis dtype. + + Args: + bigframes_dtype: + A dtype supported by BigQuery DataFrame + + Returns: + IbisDtype: The corresponding Ibis type + + Raises: + ValueError: If passed a dtype not supported by BigQuery DataFrames. + """ + if isinstance(bigframes_dtype, pd.ArrowDtype): + return _arrow_dtype_to_ibis_dtype(bigframes_dtype.pyarrow_dtype) + + type_string = str(bigframes_dtype) + if type_string in bigframes.dtypes.BIGFRAMES_STRING_TO_BIGFRAMES: + bigframes_dtype = bigframes.dtypes.BIGFRAMES_STRING_TO_BIGFRAMES[ + cast(bigframes.dtypes.DtypeString, type_string) + ] + else: + raise ValueError( + textwrap.dedent( + f""" + Unexpected data type {bigframes_dtype}. The following + str dtypes are supppted: 'boolean','Float64','Int64', + 'int64[pyarrow]','string','string[pyarrow]', + 'timestamp[us, tz=UTC][pyarrow]','timestamp[us][pyarrow]', + 'date32[day][pyarrow]','time64[us][pyarrow]'. + The following pandas.ExtensionDtype are supported: + pandas.BooleanDtype(), pandas.Float64Dtype(), + pandas.Int64Dtype(), pandas.StringDtype(storage="pyarrow"), + pd.ArrowDtype(pa.date32()), pd.ArrowDtype(pa.time64("us")), + pd.ArrowDtype(pa.timestamp("us")), + pd.ArrowDtype(pa.timestamp("us", tz="UTC")). + {constants.FEEDBACK_LINK} + """ + ) + ) + + return BIGFRAMES_TO_IBIS[bigframes_dtype] + + +def ibis_dtype_to_bigframes_dtype( + ibis_dtype: ibis_dtypes.DataType, +) -> bigframes.dtypes.Dtype: + """Converts an Ibis dtype to a BigQuery DataFrames dtype + + Args: + ibis_dtype: The ibis dtype used to represent this type, which + should in turn correspond to an underlying BigQuery type + + Returns: + The supported BigQuery DataFrames dtype, which may be provided by + pandas, numpy, or db_types + + Raises: + ValueError: if passed an unexpected type + """ + # Special cases: Ibis supports variations on these types, but currently + # our IO returns them as objects. Eventually, we should support them as + # ArrowDType (and update the IO accordingly) + if isinstance(ibis_dtype, ibis_dtypes.Array): + return pd.ArrowDtype(_ibis_dtype_to_arrow_dtype(ibis_dtype)) + + if isinstance(ibis_dtype, ibis_dtypes.Struct): + return pd.ArrowDtype(_ibis_dtype_to_arrow_dtype(ibis_dtype)) + + # BigQuery only supports integers of size 64 bits. + if isinstance(ibis_dtype, ibis_dtypes.Integer): + return pd.Int64Dtype() + + # Temporary: Will eventually support an explicit json type instead of casting to string. + if isinstance(ibis_dtype, ibis_dtypes.JSON): + warnings.warn( + "Interpreting JSON as string. 
This behavior may change in future versions.", + bigframes.exceptions.PreviewWarning, + ) + return bigframes.dtypes.STRING_DTYPE + + if ibis_dtype in IBIS_TO_BIGFRAMES: + return IBIS_TO_BIGFRAMES[ibis_dtype] + elif isinstance(ibis_dtype, ibis_dtypes.Decimal): + # Temporary workaround for ibis decimal issue (b/323387826) + if ibis_dtype.precision >= 76: + return pd.ArrowDtype(pa.decimal256(76, 38)) + else: + return pd.ArrowDtype(pa.decimal128(38, 9)) + elif isinstance(ibis_dtype, ibis_dtypes.Null): + # Fallback to STRING for NULL values for most flexibility in SQL. + return IBIS_TO_BIGFRAMES[ibis_dtypes.string] + else: + raise ValueError( + f"Unexpected Ibis data type {ibis_dtype}. {constants.FEEDBACK_LINK}" + ) + + +def _ibis_dtype_to_arrow_dtype(ibis_dtype: ibis_dtypes.DataType) -> pa.DataType: + """Private utility to convert ibis dtype to equivalent arrow type.""" + if isinstance(ibis_dtype, ibis_dtypes.Array): + return pa.list_( + _ibis_dtype_to_arrow_dtype(ibis_dtype.value_type.copy(nullable=True)) + ) + + if isinstance(ibis_dtype, ibis_dtypes.Struct): + return pa.struct( + [ + (name, _ibis_dtype_to_arrow_dtype(dtype)) + for name, dtype in ibis_dtype.fields.items() + ] + ) + + if ibis_dtype in IBIS_TO_ARROW: + return IBIS_TO_ARROW[ibis_dtype] + else: + raise ValueError( + f"Unexpected Ibis data type {ibis_dtype}. {constants.FEEDBACK_LINK}" + ) + + +def _arrow_dtype_to_ibis_dtype(arrow_dtype: pa.DataType) -> ibis_dtypes.DataType: + if pa.types.is_struct(arrow_dtype): + struct_dtype = cast(pa.StructType, arrow_dtype) + return ibis_dtypes.Struct.from_tuples( + [ + (field.name, _arrow_dtype_to_ibis_dtype(field.type)) + for field in struct_dtype + ] + ) + + if arrow_dtype in ARROW_TO_IBIS: + return ARROW_TO_IBIS[arrow_dtype] + if arrow_dtype == pa.null(): + # Used for empty local dataframes where pyarrow has null type + return ibis_dtypes.float64 + else: + raise ValueError( + f"Unexpected Arrow data type {arrow_dtype}. {constants.FEEDBACK_LINK}" + ) + + +def literal_to_ibis_scalar( + literal, force_dtype: Optional[bigframes.dtypes.Dtype] = None, validate: bool = True +): + """Accept any literal and, if possible, return an Ibis Scalar + expression with a BigQuery DataFrames compatible data type + + Args: + literal: + any value accepted by Ibis + force_dtype: + force the value to a specific dtype + validate: + If true, will raise ValueError if type cannot be stored in a + BigQuery DataFrames object. If used as a subexpression, this should + be disabled. + + Returns: + An ibis Scalar supported by BigQuery DataFrame + + Raises: + ValueError: if passed literal cannot be coerced to a + BigQuery DataFrames compatible scalar + """ + # Special case: Can create nulls for non-bidirectional types + if (force_dtype == gpd.array.GeometryDtype()) and pd.isna(literal): + # Ibis has bug for casting nulltype to geospatial, so we perform intermediate cast first + geotype = ibis_dtypes.GeoSpatial(geotype="geography", srid=4326, nullable=True) + return ibis.literal(None, geotype) + ibis_dtype = BIGFRAMES_TO_IBIS[force_dtype] if force_dtype else None + + if pd.api.types.is_list_like(literal): + if validate: + raise ValueError( + f"List types can't be stored in BigQuery DataFrames. 
{constants.FEEDBACK_LINK}" + ) + # "correct" way would be to use ibis.array, but this produces invalid BQ SQL syntax + return tuple(literal) + if not pd.api.types.is_list_like(literal) and pd.isna(literal): + if ibis_dtype: + return ibis.null().cast(ibis_dtype) + else: + return ibis.null() + + scalar_expr = ibis.literal(literal) + if ibis_dtype: + scalar_expr = ibis.literal(literal, ibis_dtype) + elif scalar_expr.type().is_floating(): + scalar_expr = ibis.literal(literal, ibis_dtypes.float64) + elif scalar_expr.type().is_integer(): + scalar_expr = ibis.literal(literal, ibis_dtypes.int64) + elif scalar_expr.type().is_decimal(): + precision = scalar_expr.type().precision + scale = scalar_expr.type().scale + if (not precision and not scale) or ( + precision and scale and scale <= 9 and precision + (9 - scale) <= 38 + ): + scalar_expr = ibis.literal( + literal, ibis_dtypes.decimal(precision=38, scale=9) + ) + elif precision and scale and scale <= 38 and precision + (38 - scale) <= 76: + scalar_expr = ibis.literal( + literal, ibis_dtypes.decimal(precision=76, scale=38) + ) + else: + raise TypeError( + "BigQuery's decimal data type supports a maximum precision of 76 and a maximum scale of 38." + f"Current precision: {precision}. Current scale: {scale}" + ) + + # TODO(bmil): support other literals that can be coerced to compatible types + if validate and (scalar_expr.type() not in BIGFRAMES_TO_IBIS.values()): + raise ValueError( + f"Literal did not coerce to a supported data type: {scalar_expr.type()}. {constants.FEEDBACK_LINK}" + ) + + return scalar_expr + + +class UnsupportedTypeError(ValueError): + def __init__(self, type_, supported_types): + self.type = type_ + self.supported_types = supported_types + + +def ibis_type_from_python_type(t: type) -> ibis_dtypes.DataType: + if t not in bigframes.dtypes.RF_SUPPORTED_IO_PYTHON_TYPES: + raise UnsupportedTypeError(t, bigframes.dtypes.RF_SUPPORTED_IO_PYTHON_TYPES) + return python_type_to_bigquery_type(t) + + +def ibis_type_from_type_kind(tk: bigquery.StandardSqlTypeNames) -> ibis_dtypes.DataType: + """Convert bq type to ibis. 
Only to be used for remote functions, does not handle all types.""" + if tk not in bigframes.dtypes.RF_SUPPORTED_IO_BIGQUERY_TYPEKINDS: + raise UnsupportedTypeError( + tk, bigframes.dtypes.RF_SUPPORTED_IO_BIGQUERY_TYPEKINDS + ) + return third_party_ibis_bqtypes.BigQueryType.to_ibis(tk) diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py index 000c4a4c09..6b8e60434e 100644 --- a/bigframes/core/compile/scalar_op_compiler.py +++ b/bigframes/core/compile/scalar_op_compiler.py @@ -27,6 +27,7 @@ import pandas as pd import bigframes.constants as constants +import bigframes.core.compile.ibis_types import bigframes.core.expression as ex import bigframes.dtypes import bigframes.operations as ops @@ -78,7 +79,7 @@ def _( expression: ex.ScalarConstantExpression, bindings: typing.Dict[str, ibis_types.Value], ) -> ibis_types.Value: - return bigframes.dtypes.literal_to_ibis_scalar( + return bigframes.core.compile.ibis_types.literal_to_ibis_scalar( expression.value, expression.dtype ) @@ -771,14 +772,16 @@ def numeric_to_datetime(x: ibis_types.Value, unit: str) -> ibis_types.TimestampV @scalar_op_compiler.register_unary_op(ops.AsTypeOp, pass_op=True) def astype_op_impl(x: ibis_types.Value, op: ops.AsTypeOp): - to_type = bigframes.dtypes.bigframes_dtype_to_ibis_dtype(op.to_type) + to_type = bigframes.core.compile.ibis_types.bigframes_dtype_to_ibis_dtype( + op.to_type + ) if isinstance(x, ibis_types.NullScalar): return ibis_types.null().cast(to_type) # When casting DATETIME column into INT column, we need to convert the column into TIMESTAMP first. if to_type == ibis_dtypes.int64 and x.type() == ibis_dtypes.timestamp: x_converted = x.cast(ibis_dtypes.Timestamp(timezone="UTC")) - return bigframes.dtypes.cast_ibis_value(x_converted, to_type) + return bigframes.core.compile.ibis_types.cast_ibis_value(x_converted, to_type) if to_type == ibis_dtypes.int64 and x.type() == ibis_dtypes.time: # The conversion unit is set to "us" (microseconds) for consistency @@ -798,7 +801,7 @@ def astype_op_impl(x: ibis_types.Value, op: ops.AsTypeOp): elif to_type == ibis_dtypes.time: return x_converted.time() - return bigframes.dtypes.cast_ibis_value(x, to_type) + return bigframes.core.compile.ibis_types.cast_ibis_value(x, to_type) @scalar_op_compiler.register_unary_op(ops.IsInOp, pass_op=True) @@ -1013,15 +1016,7 @@ def add_op( ): if isinstance(x, ibis_types.NullScalar) or isinstance(x, ibis_types.NullScalar): return ibis.null() - try: - # Could be string concatenation or numeric addition. - return x + y # type: ignore - except ibis.common.annotations.SignatureValidationError as exc: - left_type = bigframes.dtypes.ibis_dtype_to_bigframes_dtype(x.type()) - right_type = bigframes.dtypes.ibis_dtype_to_bigframes_dtype(y.type()) - raise TypeError( - f"Cannot add {repr(left_type)} and {repr(right_type)}. 
{constants.FEEDBACK_LINK}" - ) from exc + return x + y # type: ignore @scalar_op_compiler.register_binary_op(ops.sub_op) diff --git a/bigframes/core/compile/schema_translator.py b/bigframes/core/compile/schema_translator.py index 03e9691af6..d19c1bfb86 100644 --- a/bigframes/core/compile/schema_translator.py +++ b/bigframes/core/compile/schema_translator.py @@ -18,6 +18,7 @@ import ibis import ibis.expr.schema +import bigframes.core.compile.ibis_types import bigframes.core.schema as bf_schema import bigframes.dtypes @@ -28,7 +29,7 @@ def convert_bf_schema(schema: bf_schema.ArraySchema) -> ibis.expr.schema.Schema: """ names = schema.names types = [ - bigframes.dtypes.bigframes_dtype_to_ibis_dtype(bf_type) + bigframes.core.compile.ibis_types.bigframes_dtype_to_ibis_dtype(bf_type) for bf_type in schema.dtypes ] return ibis.schema(names=names, types=types) diff --git a/bigframes/core/indexes/base.py b/bigframes/core/indexes/base.py index e40e20b0cb..0e5082447a 100644 --- a/bigframes/core/indexes/base.py +++ b/bigframes/core/indexes/base.py @@ -90,9 +90,12 @@ def __new__( # TODO: Support more index subtypes from bigframes.core.indexes.multi import MultiIndex - klass = MultiIndex if len(block._index_columns) > 1 else cls - # TODO(b/340893286): fix type error - result = typing.cast(Index, object.__new__(klass)) # type: ignore + if len(block._index_columns) <= 1: + klass = cls + else: + klass = MultiIndex + + result = typing.cast(Index, object.__new__(klass)) result._query_job = None result._block = block block.session._register_object(result) @@ -161,7 +164,8 @@ def dtype(self): @property def dtypes(self) -> pandas.Series: return pandas.Series( - data=self._block.index.dtypes, index=self._block.index.names # type:ignore + data=self._block.index.dtypes, + index=typing.cast(typing.Tuple, self._block.index.names), ) @property @@ -408,10 +412,10 @@ def drop( block = block.drop_columns([condition_id]) return Index(block) - def dropna(self, how: str = "any") -> Index: + def dropna(self, how: typing.Literal["all", "any"] = "any") -> Index: if how not in ("any", "all"): raise ValueError("'how' must be one of 'any', 'all'") - result = block_ops.dropna(self._block, self._block.index_columns, how=how) # type: ignore + result = block_ops.dropna(self._block, self._block.index_columns, how=how) return Index(result) def drop_duplicates(self, *, keep: str = "first") -> Index: diff --git a/bigframes/core/local_data.py b/bigframes/core/local_data.py index 8b256be6d2..ac658d1bb8 100644 --- a/bigframes/core/local_data.py +++ b/bigframes/core/local_data.py @@ -46,8 +46,8 @@ def adapt_pa_table(arrow_table: pa.Table) -> pa.Table: def bigframes_type_for_arrow_type(pa_type: pa.DataType) -> bigframes.dtypes.Dtype: - return bigframes.dtypes.ibis_dtype_to_bigframes_dtype( - bigframes.dtypes.arrow_dtype_to_ibis_dtype(arrow_type_replacements(pa_type)) + return bigframes.dtypes.arrow_dtype_to_bigframes_dtype( + arrow_type_replacements(pa_type) ) diff --git a/bigframes/core/nodes.py b/bigframes/core/nodes.py index 1af7c5bd17..077a362ba0 100644 --- a/bigframes/core/nodes.py +++ b/bigframes/core/nodes.py @@ -324,10 +324,12 @@ def roots(self) -> typing.Set[BigFrameNode]: @functools.cached_property def schema(self) -> schemata.ArraySchema: + from bigframes.core.compile.ibis_types import ibis_dtype_to_bigframes_dtype + items = tuple( schemata.SchemaItem( value.get_name(), - bigframes.dtypes.ibis_dtype_to_bigframes_dtype(value.type()), + ibis_dtype_to_bigframes_dtype(value.type()), ) for value in self.columns ) @@ -376,7 +378,7 @@ 
def __post_init__(self): raise ValueError( f"Requested schema {self.columns} cannot be derived from table schemal {self.physical_schema}" ) - if self.order_col_is_sequential and len(self.total_order_cols) == 1: + if self.order_col_is_sequential and len(self.total_order_cols) != 1: raise ValueError("Sequential primary key must have only one component") @property @@ -409,6 +411,56 @@ def transform_children( return self +# This node shouldn't be used in the "original" expression tree, only used as replacement for original during planning +@dataclass(frozen=True) +class CachedTableNode(BigFrameNode): + # The original BFET subtree that was cached + # note: this isn't a "child" node. + original_node: BigFrameNode = field() + # reference to cached materialization of original_node + project_id: str = field() + dataset_id: str = field() + table_id: str = field() + physical_schema: Tuple[bq.SchemaField, ...] = field() + + ordering: typing.Optional[orderings.ExpressionOrdering] = field() + + @property + def session(self): + return self.original_node.session + + def __hash__(self): + return self._node_hash + + @property + def roots(self) -> typing.Set[BigFrameNode]: + return {self} + + @property + def schema(self) -> schemata.ArraySchema: + return self.original_node.schema + + @functools.cached_property + def variables_introduced(self) -> int: + return len(self.schema.items) + OVERHEAD_VARIABLES + + @property + def hidden_columns(self) -> typing.Tuple[str, ...]: + """Physical columns used to define ordering but not directly exposed as value columns.""" + if self.ordering is None: + return () + return tuple( + col + for col in sorted(self.ordering.referenced_columns) + if col not in self.schema.names + ) + + def transform_children( + self, t: Callable[[BigFrameNode], BigFrameNode] + ) -> BigFrameNode: + return self + + # Unary nodes @dataclass(frozen=True) class PromoteOffsetsNode(UnaryNode): diff --git a/bigframes/core/ordering.py b/bigframes/core/ordering.py index 9009e31be3..1562592720 100644 --- a/bigframes/core/ordering.py +++ b/bigframes/core/ordering.py @@ -215,6 +215,14 @@ def is_sequential(self) -> bool: def all_ordering_columns(self) -> Sequence[OrderingExpression]: return list(self.ordering_value_columns) + @property + def referenced_columns(self) -> Set[str]: + return set( + col + for part in self.ordering_value_columns + for col in part.scalar_expression.unbound_variables + ) + def encode_order_string( order_id: ibis_types.IntegerColumn, length: int = DEFAULT_ORDERING_ID_LENGTH diff --git a/bigframes/core/reshape/__init__.py b/bigframes/core/reshape/__init__.py index 05cb5c7e94..a23461bdb9 100644 --- a/bigframes/core/reshape/__init__.py +++ b/bigframes/core/reshape/__init__.py @@ -116,7 +116,7 @@ def cut( Iterable, ], *, - labels: Optional[bool] = None, + labels: Union[Iterable[str], bool, None] = None, ) -> bigframes.series.Series: if isinstance(bins, int) and bins <= 0: raise ValueError("`bins` should be a positive integer.") diff --git a/bigframes/core/sql.py b/bigframes/core/sql.py index c1e319b860..01dcebad6e 100644 --- a/bigframes/core/sql.py +++ b/bigframes/core/sql.py @@ -20,7 +20,7 @@ import datetime import math import textwrap -from typing import Iterable, TYPE_CHECKING +from typing import Iterable, Mapping, TYPE_CHECKING, Union # Literals and identifiers matching this pattern can be unquoted unquoted = r"^[A-Za-z_][A-Za-z_0-9]*$" @@ -96,6 +96,12 @@ def cast_as_string(column_name: str) -> str: return f"CAST({identifier(column_name)} AS STRING)" +def to_json_string(column_name: 
str) -> str: + """Return a string representing JSON version of a column.""" + + return f"TO_JSON_STRING({identifier(column_name)})" + + def csv(values: Iterable[str]) -> str: """Return a string of comma separated values.""" return ", ".join(values) @@ -169,3 +175,47 @@ def ordering_clause( part = f"`{ordering_expr.id}` {asc_desc} {null_clause}" parts.append(part) return f"ORDER BY {' ,'.join(parts)}" + + +def create_vector_search_sql( + sql_string: str, + options: Mapping[str, Union[str | int | bool | float]] = {}, +) -> str: + """Encode the VECTOR SEARCH statement for BigQuery Vector Search.""" + + base_table = options["base_table"] + column_to_search = options["column_to_search"] + distance_type = options["distance_type"] + top_k = options["top_k"] + query_column_to_search = options.get("query_column_to_search", None) + + if query_column_to_search is not None: + query_str = f""" + SELECT + query.*, + base.*, + distance, + FROM VECTOR_SEARCH( + TABLE `{base_table}`, + {simple_literal(column_to_search)}, + ({sql_string}), + {simple_literal(query_column_to_search)}, + distance_type => {simple_literal(distance_type)}, + top_k => {simple_literal(top_k)} + ) + """ + else: + query_str = f""" + SELECT + query.*, + base.*, + distance, + FROM VECTOR_SEARCH( + TABLE `{base_table}`, + {simple_literal(column_to_search)}, + ({sql_string}), + distance_type => {simple_literal(distance_type)}, + top_k => {simple_literal(top_k)} + ) + """ + return query_str diff --git a/bigframes/core/tools/datetimes.py b/bigframes/core/tools/datetimes.py index 5eac4cceb9..5d8d8c9685 100644 --- a/bigframes/core/tools/datetimes.py +++ b/bigframes/core/tools/datetimes.py @@ -28,7 +28,7 @@ def to_datetime( arg: Union[ - vendored_pandas_datetimes.local_scalars, + Union[int, float, str, datetime], vendored_pandas_datetimes.local_iterables, bigframes.series.Series, bigframes.dataframe.DataFrame, diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index e404e439ab..f12c346776 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -1215,10 +1215,30 @@ def to_pandas( self._set_internal_query_job(query_job) return df.set_axis(self._block.column_labels, axis=1, copy=False) - def to_pandas_batches(self) -> Iterable[pandas.DataFrame]: - """Stream DataFrame results to an iterable of pandas DataFrame""" + def to_pandas_batches( + self, page_size: Optional[int] = None, max_results: Optional[int] = None + ) -> Iterable[pandas.DataFrame]: + """Stream DataFrame results to an iterable of pandas DataFrame. + + page_size and max_results determine the size and number of batches, + see https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.job.QueryJob#google_cloud_bigquery_job_QueryJob_result + + Args: + page_size (int, default None): + The size of each batch. + max_results (int, default None): + If given, only download this many rows at maximum. + + Returns: + Iterable[pandas.DataFrame]: + An iterable of smaller dataframes which combine to + form the original dataframe. 
Results stream from bigquery, + see https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.table.RowIterator#google_cloud_bigquery_table_RowIterator_to_arrow_iterable + """ self._optimize_query_complexity() - return self._block.to_pandas_batches() + return self._block.to_pandas_batches( + page_size=page_size, max_results=max_results + ) def _compute_dry_run(self) -> bigquery.QueryJob: return self._block._compute_dry_run() @@ -3313,22 +3333,43 @@ def apply(self, func, *, axis=0, args: typing.Tuple = (), **kwargs): # Early check whether the dataframe dtypes are currently supported # in the remote function # NOTE: Keep in sync with the value converters used in the gcf code - # generated in generate_cloud_function_main_code in remote_function.py + # generated in remote_function_template.py remote_function_supported_dtypes = ( bigframes.dtypes.INT_DTYPE, bigframes.dtypes.FLOAT_DTYPE, bigframes.dtypes.BOOL_DTYPE, + bigframes.dtypes.BYTES_DTYPE, bigframes.dtypes.STRING_DTYPE, ) supported_dtypes_types = tuple( - type(dtype) for dtype in remote_function_supported_dtypes + type(dtype) + for dtype in remote_function_supported_dtypes + if not isinstance(dtype, pandas.ArrowDtype) + ) + # Check ArrowDtype separately since multiple BigQuery types map to + # ArrowDtype, including BYTES and TIMESTAMP. + supported_arrow_types = tuple( + dtype.pyarrow_dtype + for dtype in remote_function_supported_dtypes + if isinstance(dtype, pandas.ArrowDtype) ) supported_dtypes_hints = tuple( str(dtype) for dtype in remote_function_supported_dtypes ) for dtype in self.dtypes: - if not isinstance(dtype, supported_dtypes_types): + if ( + # Not one of the pandas/numpy types. + not isinstance(dtype, supported_dtypes_types) + # And not one of the arrow types. + and not ( + isinstance(dtype, pandas.ArrowDtype) + and any( + dtype.pyarrow_dtype.equals(arrow_type) + for arrow_type in supported_arrow_types + ) + ) + ): raise NotImplementedError( f"DataFrame has a column of dtype '{dtype}' which is not supported with axis=1." f" Supported dtypes are {supported_dtypes_hints}." 
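The hunk above adds `page_size` and `max_results` to `DataFrame.to_pandas_batches`. A minimal usage sketch, assuming a public BigQuery table chosen purely for illustration (the table name and row limits below are not part of this change):

    import bigframes.pandas as bpd

    # Read a table lazily; nothing is downloaded yet.
    df = bpd.read_gbq("bigquery-public-data.usa_names.usa_1910_2013")

    # Stream the result in pages of up to 10,000 rows, stopping after 50,000 rows.
    total_rows = 0
    for batch in df.to_pandas_batches(page_size=10_000, max_results=50_000):
        # Each batch is an ordinary pandas DataFrame, so normal pandas code applies.
        total_rows += len(batch)

    print(total_rows)
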
diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index 3df67ed9e4..3b9d5bf141 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -16,25 +16,17 @@ import datetime import decimal -import textwrap import typing -from typing import Any, Dict, Iterable, Literal, Tuple, Union -import warnings +from typing import Any, Dict, Literal, Union import bigframes_vendored.ibis.backends.bigquery.datatypes as third_party_ibis_bqtypes -import bigframes_vendored.ibis.expr.operations as vendored_ibis_ops import geopandas as gpd # type: ignore -import google.cloud.bigquery as bigquery import ibis -import ibis.expr.datatypes as ibis_dtypes -from ibis.expr.datatypes.core import dtype as python_type_to_bigquery_type -import ibis.expr.types as ibis_types import numpy as np import pandas as pd import pyarrow as pa import bigframes.constants as constants -import bigframes.exceptions # Type hints for Pandas dtypes supported by BigQuery DataFrame Dtype = Union[ @@ -81,17 +73,6 @@ "binary[pyarrow]", ] -# Type hints for Ibis data types supported by BigQuery DataFrame -IbisDtype = Union[ - ibis_dtypes.Boolean, - ibis_dtypes.Float64, - ibis_dtypes.Int64, - ibis_dtypes.String, - ibis_dtypes.Date, - ibis_dtypes.Time, - ibis_dtypes.Timestamp, -] - BOOL_BIGFRAMES_TYPES = [pd.BooleanDtype()] # Corresponds to the pandas concept of numeric type (such as when 'numeric_only' is specified in an operation) @@ -170,68 +151,23 @@ def is_bool_coercable(type: ExpressionType) -> bool: return (type is None) or is_numeric(type) or is_string_like(type) -BIDIRECTIONAL_MAPPINGS: Iterable[Tuple[IbisDtype, Dtype]] = ( - (ibis_dtypes.boolean, pd.BooleanDtype()), - (ibis_dtypes.date, pd.ArrowDtype(pa.date32())), - (ibis_dtypes.float64, pd.Float64Dtype()), - (ibis_dtypes.int64, pd.Int64Dtype()), - (ibis_dtypes.string, pd.StringDtype(storage="pyarrow")), - (ibis_dtypes.time, pd.ArrowDtype(pa.time64("us"))), - (ibis_dtypes.Timestamp(timezone=None), pd.ArrowDtype(pa.timestamp("us"))), - ( - ibis_dtypes.Timestamp(timezone="UTC"), - pd.ArrowDtype(pa.timestamp("us", tz="UTC")), - ), - (ibis_dtypes.binary, pd.ArrowDtype(pa.binary())), - ( - ibis_dtypes.Decimal(precision=38, scale=9, nullable=True), - pd.ArrowDtype(pa.decimal128(38, 9)), - ), - ( - ibis_dtypes.Decimal(precision=76, scale=38, nullable=True), - pd.ArrowDtype(pa.decimal256(76, 38)), - ), - ( - ibis_dtypes.GeoSpatial(geotype="geography", srid=4326, nullable=True), - gpd.array.GeometryDtype(), - ), -) - -BIGFRAMES_TO_IBIS: Dict[Dtype, ibis_dtypes.DataType] = { - pandas: ibis for ibis, pandas in BIDIRECTIONAL_MAPPINGS -} - -IBIS_TO_ARROW: Dict[ibis_dtypes.DataType, pa.DataType] = { - ibis_dtypes.boolean: pa.bool_(), - ibis_dtypes.date: pa.date32(), - ibis_dtypes.float64: pa.float64(), - ibis_dtypes.int64: pa.int64(), - ibis_dtypes.string: pa.string(), - ibis_dtypes.time: pa.time64("us"), - ibis_dtypes.Timestamp(timezone=None): pa.timestamp("us"), - ibis_dtypes.Timestamp(timezone="UTC"): pa.timestamp("us", tz="UTC"), - ibis_dtypes.binary: pa.binary(), - ibis_dtypes.Decimal(precision=38, scale=9, nullable=True): pa.decimal128(38, 9), - ibis_dtypes.Decimal(precision=76, scale=38, nullable=True): pa.decimal256(76, 38), -} - -ARROW_TO_IBIS = {arrow: ibis for ibis, arrow in IBIS_TO_ARROW.items()} - -IBIS_TO_BIGFRAMES: Dict[ibis_dtypes.DataType, Dtype] = { - ibis: pandas for ibis, pandas in BIDIRECTIONAL_MAPPINGS -} -# Allow REQUIRED fields to map correctly. 
-IBIS_TO_BIGFRAMES.update( - {ibis.copy(nullable=False): pandas for ibis, pandas in BIDIRECTIONAL_MAPPINGS} -) -IBIS_TO_BIGFRAMES.update( - { - # TODO: Interval - } +_ALL_DTYPES = ( + pd.BooleanDtype(), + pd.ArrowDtype(pa.date32()), + pd.Float64Dtype(), + pd.Int64Dtype(), + pd.StringDtype(storage="pyarrow"), + pd.ArrowDtype(pa.time64("us")), + pd.ArrowDtype(pa.timestamp("us")), + pd.ArrowDtype(pa.timestamp("us", tz="UTC")), + pd.ArrowDtype(pa.binary()), + pd.ArrowDtype(pa.decimal128(38, 9)), + pd.ArrowDtype(pa.decimal256(76, 38)), + gpd.array.GeometryDtype(), ) BIGFRAMES_STRING_TO_BIGFRAMES: Dict[DtypeString, Dtype] = { - typing.cast(DtypeString, dtype.name): dtype for dtype in BIGFRAMES_TO_IBIS.keys() + typing.cast(DtypeString, dtype.name): dtype for dtype in _ALL_DTYPES } # special case - string[pyarrow] doesn't include the storage in its name, and both @@ -262,362 +198,27 @@ def dtype_for_etype(etype: ExpressionType) -> Dtype: return etype -def ibis_dtype_to_bigframes_dtype( - ibis_dtype: ibis_dtypes.DataType, -) -> Dtype: - """Converts an Ibis dtype to a BigQuery DataFrames dtype - - Args: - ibis_dtype: The ibis dtype used to represent this type, which - should in turn correspond to an underlying BigQuery type - - Returns: - The supported BigQuery DataFrames dtype, which may be provided by - pandas, numpy, or db_types - - Raises: - ValueError: if passed an unexpected type - """ - # Special cases: Ibis supports variations on these types, but currently - # our IO returns them as objects. Eventually, we should support them as - # ArrowDType (and update the IO accordingly) - if isinstance(ibis_dtype, ibis_dtypes.Array): - return pd.ArrowDtype(ibis_dtype_to_arrow_dtype(ibis_dtype)) - - if isinstance(ibis_dtype, ibis_dtypes.Struct): - return pd.ArrowDtype(ibis_dtype_to_arrow_dtype(ibis_dtype)) - - # BigQuery only supports integers of size 64 bits. - if isinstance(ibis_dtype, ibis_dtypes.Integer): - return pd.Int64Dtype() - - # Temporary: Will eventually support an explicit json type instead of casting to string. - if isinstance(ibis_dtype, ibis_dtypes.JSON): - warnings.warn( - "Interpreting JSON as string. This behavior may change in future versions.", - bigframes.exceptions.PreviewWarning, - ) - return STRING_DTYPE - - if ibis_dtype in IBIS_TO_BIGFRAMES: - return IBIS_TO_BIGFRAMES[ibis_dtype] - elif isinstance(ibis_dtype, ibis_dtypes.Decimal): - # Temporary workaround for ibis decimal issue (b/323387826) - if ibis_dtype.precision >= 76: - return pd.ArrowDtype(pa.decimal256(76, 38)) - else: - return pd.ArrowDtype(pa.decimal128(38, 9)) - elif isinstance(ibis_dtype, ibis_dtypes.Null): - # Fallback to STRING for NULL values for most flexibility in SQL. - return IBIS_TO_BIGFRAMES[ibis_dtypes.string] - else: - raise ValueError( - f"Unexpected Ibis data type {ibis_dtype}. {constants.FEEDBACK_LINK}" - ) - - -def ibis_dtype_to_arrow_dtype(ibis_dtype: ibis_dtypes.DataType) -> pa.DataType: - if isinstance(ibis_dtype, ibis_dtypes.Array): - return pa.list_( - ibis_dtype_to_arrow_dtype(ibis_dtype.value_type.copy(nullable=True)) - ) - - if isinstance(ibis_dtype, ibis_dtypes.Struct): - return pa.struct( - [ - (name, ibis_dtype_to_arrow_dtype(dtype)) - for name, dtype in ibis_dtype.fields.items() - ] - ) - - if ibis_dtype in IBIS_TO_ARROW: - return IBIS_TO_ARROW[ibis_dtype] - else: - raise ValueError( - f"Unexpected Ibis data type {ibis_dtype}. 
{constants.FEEDBACK_LINK}" - ) - - -def ibis_value_to_canonical_type(value: ibis_types.Value) -> ibis_types.Value: - """Converts an Ibis expression to canonical type. - - This is useful in cases where multiple types correspond to the same BigFrames dtype. - """ - ibis_type = value.type() - name = value.get_name() - if ibis_type.is_json(): - value = vendored_ibis_ops.ToJsonString(value).to_expr() - return value.name(name) - # Allow REQUIRED fields to be joined with NULLABLE fields. - nullable_type = ibis_type.copy(nullable=True) - return value.cast(nullable_type).name(name) - - -def arrow_dtype_to_ibis_dtype(arrow_dtype: pa.DataType) -> ibis_dtypes.DataType: - if pa.types.is_struct(arrow_dtype): - struct_dtype = typing.cast(pa.StructType, arrow_dtype) - return ibis_dtypes.Struct.from_tuples( - [ - (field.name, arrow_dtype_to_ibis_dtype(field.type)) - for field in struct_dtype - ] - ) - - if arrow_dtype in ARROW_TO_IBIS: - return ARROW_TO_IBIS[arrow_dtype] - if arrow_dtype == pa.null(): - # Used for empty local dataframes where pyarrow has null type - return ibis_dtypes.float64 - else: - raise ValueError( - f"Unexpected Arrow data type {arrow_dtype}. {constants.FEEDBACK_LINK}" - ) - - def arrow_dtype_to_bigframes_dtype(arrow_dtype: pa.DataType) -> Dtype: - return ibis_dtype_to_bigframes_dtype(arrow_dtype_to_ibis_dtype(arrow_dtype)) - - -def bigframes_dtype_to_ibis_dtype( - bigframes_dtype: Union[DtypeString, Dtype, np.dtype[Any]] -) -> ibis_dtypes.DataType: - """Converts a BigQuery DataFrames supported dtype to an Ibis dtype. - - Args: - bigframes_dtype: - A dtype supported by BigQuery DataFrame - - Returns: - IbisDtype: The corresponding Ibis type - - Raises: - ValueError: If passed a dtype not supported by BigQuery DataFrames. - """ - if isinstance(bigframes_dtype, pd.ArrowDtype): - return arrow_dtype_to_ibis_dtype(bigframes_dtype.pyarrow_dtype) - - type_string = str(bigframes_dtype) - if type_string in BIGFRAMES_STRING_TO_BIGFRAMES: - bigframes_dtype = BIGFRAMES_STRING_TO_BIGFRAMES[ - typing.cast(DtypeString, type_string) - ] - else: - raise ValueError( - textwrap.dedent( - f""" - Unexpected data type {bigframes_dtype}. The following - str dtypes are supppted: 'boolean','Float64','Int64', - 'int64[pyarrow]','string','string[pyarrow]', - 'timestamp[us, tz=UTC][pyarrow]','timestamp[us][pyarrow]', - 'date32[day][pyarrow]','time64[us][pyarrow]'. - The following pandas.ExtensionDtype are supported: - pandas.BooleanDtype(), pandas.Float64Dtype(), - pandas.Int64Dtype(), pandas.StringDtype(storage="pyarrow"), - pd.ArrowDtype(pa.date32()), pd.ArrowDtype(pa.time64("us")), - pd.ArrowDtype(pa.timestamp("us")), - pd.ArrowDtype(pa.timestamp("us", tz="UTC")). 
- {constants.FEEDBACK_LINK} - """ - ) - ) + # TODO: Directly convert instead of using ibis dtype as intermediate step + from bigframes.core.compile.ibis_types import ( + _arrow_dtype_to_ibis_dtype, + ibis_dtype_to_bigframes_dtype, + ) - return BIGFRAMES_TO_IBIS[bigframes_dtype] + return ibis_dtype_to_bigframes_dtype(_arrow_dtype_to_ibis_dtype(arrow_dtype)) def bigframes_dtype_to_arrow_dtype( bigframes_dtype: Union[DtypeString, Dtype, np.dtype[Any]] ) -> pa.DataType: - return ibis_dtype_to_arrow_dtype(bigframes_dtype_to_ibis_dtype(bigframes_dtype)) - - -def literal_to_ibis_scalar( - literal, force_dtype: typing.Optional[Dtype] = None, validate: bool = True -): - """Accept any literal and, if possible, return an Ibis Scalar - expression with a BigQuery DataFrames compatible data type - - Args: - literal: - any value accepted by Ibis - force_dtype: - force the value to a specific dtype - validate: - If true, will raise ValueError if type cannot be stored in a - BigQuery DataFrames object. If used as a subexpression, this should - be disabled. - - Returns: - An ibis Scalar supported by BigQuery DataFrame - - Raises: - ValueError: if passed literal cannot be coerced to a - BigQuery DataFrames compatible scalar - """ - # Special case: Can create nulls for non-bidirectional types - if (force_dtype == gpd.array.GeometryDtype()) and pd.isna(literal): - # Ibis has bug for casting nulltype to geospatial, so we perform intermediate cast first - geotype = ibis_dtypes.GeoSpatial(geotype="geography", srid=4326, nullable=True) - return ibis.literal(None, geotype) - ibis_dtype = BIGFRAMES_TO_IBIS[force_dtype] if force_dtype else None - - if pd.api.types.is_list_like(literal): - if validate: - raise ValueError( - f"List types can't be stored in BigQuery DataFrames. {constants.FEEDBACK_LINK}" - ) - # "correct" way would be to use ibis.array, but this produces invalid BQ SQL syntax - return tuple(literal) - if not pd.api.types.is_list_like(literal) and pd.isna(literal): - if ibis_dtype: - return ibis.null().cast(ibis_dtype) - else: - return ibis.null() - - scalar_expr = ibis.literal(literal) - if ibis_dtype: - scalar_expr = ibis.literal(literal, ibis_dtype) - elif scalar_expr.type().is_floating(): - scalar_expr = ibis.literal(literal, ibis_dtypes.float64) - elif scalar_expr.type().is_integer(): - scalar_expr = ibis.literal(literal, ibis_dtypes.int64) - elif scalar_expr.type().is_decimal(): - precision = scalar_expr.type().precision - scale = scalar_expr.type().scale - if (not precision and not scale) or ( - precision and scale and scale <= 9 and precision + (9 - scale) <= 38 - ): - scalar_expr = ibis.literal( - literal, ibis_dtypes.decimal(precision=38, scale=9) - ) - elif precision and scale and scale <= 38 and precision + (38 - scale) <= 76: - scalar_expr = ibis.literal( - literal, ibis_dtypes.decimal(precision=76, scale=38) - ) - else: - raise TypeError( - "BigQuery's decimal data type supports a maximum precision of 76 and a maximum scale of 38." - f"Current precision: {precision}. Current scale: {scale}" - ) - - # TODO(bmil): support other literals that can be coerced to compatible types - if validate and (scalar_expr.type() not in BIGFRAMES_TO_IBIS.values()): - raise ValueError( - f"Literal did not coerce to a supported data type: {scalar_expr.type()}. 
{constants.FEEDBACK_LINK}" - ) - - return scalar_expr - - -def cast_ibis_value( - value: ibis_types.Value, to_type: ibis_dtypes.DataType -) -> ibis_types.Value: - """Perform compatible type casts of ibis values - - Args: - value: - Ibis value, which could be a literal, scalar, or column - - to_type: - The Ibis type to cast to - - Returns: - A new Ibis value of type to_type - - Raises: - TypeError: if the type cast cannot be executed""" - if value.type() == to_type: - return value - # casts that just work - # TODO(bmil): add to this as more casts are verified - good_casts = { - ibis_dtypes.bool: (ibis_dtypes.int64,), - ibis_dtypes.int64: ( - ibis_dtypes.bool, - ibis_dtypes.float64, - ibis_dtypes.string, - ibis_dtypes.Decimal(precision=38, scale=9), - ibis_dtypes.Decimal(precision=76, scale=38), - ibis_dtypes.time, - ibis_dtypes.timestamp, - ibis_dtypes.Timestamp(timezone="UTC"), - ), - ibis_dtypes.float64: ( - ibis_dtypes.string, - ibis_dtypes.int64, - ibis_dtypes.Decimal(precision=38, scale=9), - ibis_dtypes.Decimal(precision=76, scale=38), - ), - ibis_dtypes.string: ( - ibis_dtypes.int64, - ibis_dtypes.float64, - ibis_dtypes.Decimal(precision=38, scale=9), - ibis_dtypes.Decimal(precision=76, scale=38), - ibis_dtypes.binary, - ibis_dtypes.date, - ibis_dtypes.timestamp, - ibis_dtypes.Timestamp(timezone="UTC"), - ), - ibis_dtypes.date: ( - ibis_dtypes.string, - ibis_dtypes.timestamp, - ibis_dtypes.Timestamp(timezone="UTC"), - ), - ibis_dtypes.Decimal(precision=38, scale=9): ( - ibis_dtypes.float64, - ibis_dtypes.Decimal(precision=76, scale=38), - ), - ibis_dtypes.Decimal(precision=76, scale=38): ( - ibis_dtypes.float64, - ibis_dtypes.Decimal(precision=38, scale=9), - ), - ibis_dtypes.time: ( - ibis_dtypes.int64, - ibis_dtypes.string, - ), - ibis_dtypes.timestamp: ( - ibis_dtypes.date, - ibis_dtypes.int64, - ibis_dtypes.string, - ibis_dtypes.time, - ibis_dtypes.Timestamp(timezone="UTC"), - ), - ibis_dtypes.Timestamp(timezone="UTC"): ( - ibis_dtypes.date, - ibis_dtypes.int64, - ibis_dtypes.string, - ibis_dtypes.time, - ibis_dtypes.timestamp, - ), - ibis_dtypes.binary: (ibis_dtypes.string,), - } - - value = ibis_value_to_canonical_type(value) - if value.type() in good_casts: - if to_type in good_casts[value.type()]: - return value.cast(to_type) - else: - # this should never happen - raise TypeError( - f"Unexpected value type {value.type()}. {constants.FEEDBACK_LINK}" - ) - - # casts that need some encouragement - - # BigQuery casts bools to lower case strings. Capitalize the result to match Pandas - # TODO(bmil): remove this workaround after fixing Ibis - if value.type() == ibis_dtypes.bool and to_type == ibis_dtypes.string: - return typing.cast(ibis_types.StringValue, value.cast(to_type)).capitalize() - - if value.type() == ibis_dtypes.bool and to_type == ibis_dtypes.float64: - return value.cast(ibis_dtypes.int64).cast(ibis_dtypes.float64) - - if value.type() == ibis_dtypes.float64 and to_type == ibis_dtypes.bool: - return value != ibis_types.literal(0) - - raise TypeError( - f"Unsupported cast {value.type()} to {to_type}. 
{constants.FEEDBACK_LINK}" + # TODO: Directly convert instead of using ibis dtype as intermediate step + from bigframes.core.compile.ibis_types import ( + _ibis_dtype_to_arrow_dtype, + bigframes_dtype_to_ibis_dtype, ) + return _ibis_dtype_to_arrow_dtype(bigframes_dtype_to_ibis_dtype(bigframes_dtype)) + def is_dtype(scalar: typing.Any, dtype: Dtype) -> bool: """Captures whether a scalar can be losslessly represented by a dtype.""" @@ -740,6 +341,11 @@ def infer_literal_type(literal) -> typing.Optional[Dtype]: if pd.isna(literal): return None # Null value without a definite type # Temporary logic, use ibis inferred type + from bigframes.core.compile.ibis_types import ( + ibis_dtype_to_bigframes_dtype, + literal_to_ibis_scalar, + ) + ibis_literal = literal_to_ibis_scalar(literal) return ibis_dtype_to_bigframes_dtype(ibis_literal.type()) @@ -748,49 +354,44 @@ def infer_literal_arrow_type(literal) -> typing.Optional[pa.DataType]: if pd.isna(literal): return None # Null value without a definite type # Temporary logic, use ibis inferred type + # TODO: Directly convert instead of using ibis dtype as intermediate step + from bigframes.core.compile.ibis_types import ( + _ibis_dtype_to_arrow_dtype, + literal_to_ibis_scalar, + ) + ibis_literal = literal_to_ibis_scalar(literal) - return ibis_dtype_to_arrow_dtype(ibis_literal.type()) + return _ibis_dtype_to_arrow_dtype(ibis_literal.type()) + + +def bf_type_from_type_kind(bf_schema) -> Dict[str, Dtype]: + """Converts bigquery sql type to the default bigframes dtype.""" + ibis_schema: ibis.Schema = third_party_ibis_bqtypes.BigQuerySchema.to_ibis( + bf_schema + ) + # TODO: Directly convert instead of using ibis dtype as intermediate step + from bigframes.core.compile.ibis_types import ibis_dtype_to_bigframes_dtype + return { + name: ibis_dtype_to_bigframes_dtype(type) for name, type in ibis_schema.items() + } + + +# Remote functions use only +# TODO: Refactor into remote function module # Input and output types supported by BigQuery DataFrames remote functions. # TODO(shobs): Extend the support to all types supported by BQ remote functions # https://cloud.google.com/bigquery/docs/remote-functions#limitations -SUPPORTED_IO_PYTHON_TYPES = {bool, float, int, str} -SUPPORTED_IO_BIGQUERY_TYPEKINDS = { +RF_SUPPORTED_IO_PYTHON_TYPES = {bool, bytes, float, int, str} + +RF_SUPPORTED_IO_BIGQUERY_TYPEKINDS = { "BOOLEAN", "BOOL", + "BYTES", "FLOAT", "FLOAT64", "INT64", "INTEGER", "STRING", } - - -class UnsupportedTypeError(ValueError): - def __init__(self, type_, supported_types): - self.type = type_ - self.supported_types = supported_types - - -def ibis_type_from_python_type(t: type) -> ibis_dtypes.DataType: - if t not in SUPPORTED_IO_PYTHON_TYPES: - raise UnsupportedTypeError(t, SUPPORTED_IO_PYTHON_TYPES) - return python_type_to_bigquery_type(t) - - -def ibis_type_from_type_kind(tk: bigquery.StandardSqlTypeNames) -> ibis_dtypes.DataType: - """Convert bq type to ibis. 
Only to be used for remote functions, does not handle all types.""" - if tk not in SUPPORTED_IO_BIGQUERY_TYPEKINDS: - raise UnsupportedTypeError(tk, SUPPORTED_IO_BIGQUERY_TYPEKINDS) - return third_party_ibis_bqtypes.BigQueryType.to_ibis(tk) - - -def bf_type_from_type_kind(bf_schema) -> Dict[str, Dtype]: - """Converts bigquery sql type to the default bigframes dtype.""" - ibis_schema: ibis.Schema = third_party_ibis_bqtypes.BigQuerySchema.to_ibis( - bf_schema - ) - return { - name: ibis_dtype_to_bigframes_dtype(type) for name, type in ibis_schema.items() - } diff --git a/bigframes/functions/remote_function.py b/bigframes/functions/remote_function.py index 7be252406c..472ac07547 100644 --- a/bigframes/functions/remote_function.py +++ b/bigframes/functions/remote_function.py @@ -24,7 +24,6 @@ import string import sys import tempfile -import textwrap from typing import ( Any, cast, @@ -33,6 +32,7 @@ NamedTuple, Optional, Sequence, + Tuple, TYPE_CHECKING, Union, ) @@ -40,6 +40,7 @@ import ibis import pandas +import pyarrow import requests if TYPE_CHECKING: @@ -60,7 +61,8 @@ from bigframes import clients import bigframes.constants as constants -import bigframes.dtypes +import bigframes.core.compile.ibis_types +import bigframes.functions.remote_function_template logger = logging.getLogger(__name__) @@ -182,15 +184,11 @@ def create_bq_remote_function( # Create BQ function # https://cloud.google.com/bigquery/docs/reference/standard-sql/remote-functions#create_a_remote_function_2 bq_function_args = [] - bq_function_return_type = third_party_ibis_bqtypes.BigQueryType.from_ibis( - output_type - ) + bq_function_return_type = output_type # We are expecting the input type annotations to be 1:1 with the input args - for idx, name in enumerate(input_args): - bq_function_args.append( - f"{name} {third_party_ibis_bqtypes.BigQueryType.from_ibis(input_types[idx])}" - ) + for name, type_ in zip(input_args, input_types): + bq_function_args.append(f"{name} {type_}") remote_function_options = { "endpoint": endpoint, @@ -258,173 +256,24 @@ def get_cloud_function_endpoint(self, name): pass return None - def generate_udf_code(self, def_, dir): - """Generate serialized bytecode using cloudpickle given a udf.""" - udf_code_file_name = "udf.py" - udf_bytecode_file_name = "udf.cloudpickle" - - # original code, only for debugging purpose - udf_code = textwrap.dedent(inspect.getsource(def_)) - udf_code_file_path = os.path.join(dir, udf_code_file_name) - with open(udf_code_file_path, "w") as f: - f.write(udf_code) - - # serialized bytecode - udf_bytecode_file_path = os.path.join(dir, udf_bytecode_file_name) - with open(udf_bytecode_file_path, "wb") as f: - cloudpickle.dump(def_, f, protocol=_pickle_protocol_version) - - return udf_code_file_name, udf_bytecode_file_name - - def generate_cloud_function_main_code(self, def_, dir, is_row_processor=False): - """Get main.py code for the cloud function for the given user defined function.""" - - # Pickle the udf with all its dependencies - udf_code_file, udf_bytecode_file = self.generate_udf_code(def_, dir) - handler_func_name = "udf_http" - - # We want to build a cloud function that works for BQ remote functions, - # where we receive `calls` in json which is a batch of rows from BQ SQL. - # The number and the order of values in each row is expected to exactly - # match to the number and order of arguments in the udf , e.g. if the udf is - # def foo(x: int, y: str): - # ... - # then the http request body could look like - # { - # ... 
- # "calls" : [ - # [123, "hello"], - # [456, "world"] - # ] - # ... - # } - # https://cloud.google.com/bigquery/docs/reference/standard-sql/remote-functions#input_format - code = """\ -import cloudpickle -import functions_framework -from flask import jsonify -import json -""" - if is_row_processor: - code += """\ -import ast -import math -import pandas as pd - -def get_pd_series(row): - row_json = json.loads(row) - col_names = row_json["names"] - col_types = row_json["types"] - col_values = row_json["values"] - index_length = row_json["indexlength"] - dtype = row_json["dtype"] - - # At this point we are assuming that col_names, col_types and col_values are - # arrays of the same length, representing column names, types and values for - # one row of data - - # column names are not necessarily strings - # they are serialized as repr(name) at source - evaluated_col_names = [] - for col_name in col_names: - try: - col_name = ast.literal_eval(col_name) - except Exception as ex: - raise NameError(f"Failed to evaluate column name from '{col_name}': {ex}") - evaluated_col_names.append(col_name) - col_names = evaluated_col_names - - # Supported converters for pandas to python types - value_converters = { - "boolean": lambda val: val == "true", - "Int64": int, - "Float64": float, - "string": str, - } - - def convert_value(value, value_type): - value_converter = value_converters.get(value_type) - if value_converter is None: - raise ValueError(f"Don't know how to handle type '{value_type}'") - if value is None: - return None - return value_converter(value) - - index_values = [ - pd.Series([convert_value(col_values[i], col_types[i])], dtype=col_types[i])[0] - for i in range(index_length) - ] - - data_col_names = col_names[index_length:] - data_col_types = col_types[index_length:] - data_col_values = col_values[index_length:] - data_col_values = [ - pd.Series([convert_value(a, data_col_types[i])], dtype=data_col_types[i])[0] - for i, a in enumerate(data_col_values) - ] - - row_index = index_values[0] if len(index_values) == 1 else tuple(index_values) - row_series = pd.Series(data_col_values, index=data_col_names, name=row_index, dtype=dtype) - return row_series -""" - code += f"""\ - -# original udf code is in {udf_code_file} -# serialized udf code is in {udf_bytecode_file} -with open("{udf_bytecode_file}", "rb") as f: - udf = cloudpickle.load(f) - -def {handler_func_name}(request): - try: - request_json = request.get_json(silent=True) - calls = request_json["calls"] - replies = [] - for call in calls: -""" - - if is_row_processor: - code += """\ - reply = udf(get_pd_series(call[0])) - if isinstance(reply, float) and (math.isnan(reply) or math.isinf(reply)): - # json serialization of the special float values (nan, inf, -inf) - # is not in strict compliance of the JSON specification - # https://docs.python.org/3/library/json.html#basic-usage. 
- # Let's convert them to a quoted string representation ("NaN", - # "Infinity", "-Infinity" respectively) which is handled by - # BigQuery - reply = json.dumps(reply) - elif pd.isna(reply): - # Pandas N/A values are not json serializable, so use a python - # equivalent instead - reply = None - elif hasattr(reply, "item"): - # Numpy types are not json serializable, so use its Python - # value instead - reply = reply.item() -""" - else: - code += """\ - reply = udf(*call) -""" - code += """\ - replies.append(reply) - return_json = json.dumps({"replies" : replies}) - return return_json - except Exception as e: - return jsonify( { "errorMessage": str(e) } ), 400 -""" - - main_py = os.path.join(dir, "main.py") - with open(main_py, "w") as f: - f.write(code) - logger.debug(f"Wrote {os.path.abspath(main_py)}:\n{open(main_py).read()}") - - return handler_func_name - def generate_cloud_function_code( - self, def_, dir, package_requirements=None, is_row_processor=False + self, + def_, + directory, + *, + input_types: Tuple[str], + output_type: str, + package_requirements=None, + is_row_processor=False, ): - """Generate the cloud function code for a given user defined function.""" + """Generate the cloud function code for a given user defined function. + + Args: + input_types (tuple[str]): + Types of the input arguments in BigQuery SQL data type names. + output_type (str): + Types of the output scalar as a BigQuery SQL data type name. + """ # requirements.txt requirements = ["cloudpickle >= 2.1.0"] @@ -432,16 +281,21 @@ def generate_cloud_function_code( # bigframes remote function will send an entire row of data as json, # which would be converted to a pandas series and processed requirements.append(f"pandas=={pandas.__version__}") + requirements.append(f"pyarrow=={pyarrow.__version__}") if package_requirements: requirements.extend(package_requirements) requirements = sorted(requirements) - requirements_txt = os.path.join(dir, "requirements.txt") + requirements_txt = os.path.join(directory, "requirements.txt") with open(requirements_txt, "w") as f: f.write("\n".join(requirements)) # main.py - entry_point = self.generate_cloud_function_main_code( - def_, dir, is_row_processor + entry_point = bigframes.functions.remote_function_template.generate_cloud_function_main_code( + def_, + directory, + input_types=input_types, + output_type=output_type, + is_row_processor=is_row_processor, ) return entry_point @@ -449,20 +303,35 @@ def create_cloud_function( self, def_, cf_name, + *, + input_types: Tuple[str], + output_type: str, package_requirements=None, timeout_seconds=600, max_instance_count=None, is_row_processor=False, vpc_connector=None, ): - """Create a cloud function from the given user defined function.""" + """Create a cloud function from the given user defined function. + + Args: + input_types (tuple[str]): + Types of the input arguments in BigQuery SQL data type names. + output_type (str): + Types of the output scalar as a BigQuery SQL data type name. 
+ """ # Build and deploy folder structure containing cloud function - with tempfile.TemporaryDirectory() as dir: + with tempfile.TemporaryDirectory() as directory: entry_point = self.generate_cloud_function_code( - def_, dir, package_requirements, is_row_processor + def_, + directory, + package_requirements=package_requirements, + input_types=input_types, + output_type=output_type, + is_row_processor=is_row_processor, ) - archive_path = shutil.make_archive(dir, "zip", dir) + archive_path = shutil.make_archive(directory, "zip", directory) # We are creating cloud function source code from the currently running # python version. Use the same version to deploy. This is necessary @@ -607,11 +476,13 @@ def provision_bq_remote_function( cf_endpoint = self.create_cloud_function( def_, cloud_function_name, - package_requirements, - cloud_function_timeout, - cloud_function_max_instance_count, - is_row_processor, - cloud_function_vpc_connector, + input_types=input_types, + output_type=output_type, + package_requirements=package_requirements, + timeout_seconds=cloud_function_timeout, + max_instance_count=cloud_function_max_instance_count, + is_row_processor=is_row_processor, + vpc_connector=cloud_function_vpc_connector, ) else: logger.info(f"Cloud function {cloud_function_name} already exists.") @@ -686,12 +557,16 @@ def ibis_signature_from_python_signature( input_types: Sequence[type], output_type: type, ) -> IbisSignature: + return IbisSignature( parameter_names=list(signature.parameters.keys()), input_types=[ - bigframes.dtypes.ibis_type_from_python_type(t) for t in input_types + bigframes.core.compile.ibis_types.ibis_type_from_python_type(t) + for t in input_types ], - output_type=bigframes.dtypes.ibis_type_from_python_type(output_type), + output_type=bigframes.core.compile.ibis_types.ibis_type_from_python_type( + output_type + ), ) @@ -699,6 +574,7 @@ class ReturnTypeMissingError(ValueError): pass +# TODO: Move this to compile folder def ibis_signature_from_routine(routine: bigquery.Routine) -> IbisSignature: if not routine.return_type: raise ReturnTypeMissingError @@ -706,12 +582,14 @@ def ibis_signature_from_routine(routine: bigquery.Routine) -> IbisSignature: return IbisSignature( parameter_names=[arg.name for arg in routine.arguments], input_types=[ - bigframes.dtypes.ibis_type_from_type_kind(arg.data_type.type_kind) + bigframes.core.compile.ibis_types.ibis_type_from_type_kind( + arg.data_type.type_kind + ) if arg.data_type else None for arg in routine.arguments ], - output_type=bigframes.dtypes.ibis_type_from_type_kind( + output_type=bigframes.core.compile.ibis_types.ibis_type_from_type_kind( routine.return_type.type_kind ), ) @@ -920,8 +798,9 @@ def remote_function( https://cloud.google.com/functions/docs/networking/connecting-vpc. """ # Some defaults may be used from the session if not provided otherwise + import bigframes.exceptions as bf_exceptions import bigframes.pandas as bpd - import bigframes.series + import bigframes.series as bf_series import bigframes.session session = cast(bigframes.session.Session, session or bpd.get_global_session()) @@ -1059,13 +938,13 @@ def wrapper(func): # BigQuery DataFrames and pandas object types for compatibility. 
is_row_processor = False if len(input_types) == 1 and ( - (input_type := input_types[0]) == bigframes.series.Series + (input_type := input_types[0]) == bf_series.Series or input_type == pandas.Series ): warnings.warn( "input_types=Series is in preview.", stacklevel=1, - category=bigframes.exceptions.PreviewWarning, + category=bf_exceptions.PreviewWarning, ) # we will model the row as a json serialized string containing the data @@ -1112,16 +991,21 @@ def try_delattr(attr): rf_name, cf_name = remote_function_client.provision_bq_remote_function( func, - ibis_signature.input_types, - ibis_signature.output_type, - reuse, - name, - packages, - max_batching_rows, - cloud_function_timeout, - cloud_function_max_instances, - is_row_processor, - cloud_function_vpc_connector, + input_types=tuple( + third_party_ibis_bqtypes.BigQueryType.from_ibis(type_) + for type_ in ibis_signature.input_types + ), + output_type=third_party_ibis_bqtypes.BigQueryType.from_ibis( + ibis_signature.output_type + ), + reuse=reuse, + name=name, + package_requirements=packages, + max_batching_rows=max_batching_rows, + cloud_function_timeout=cloud_function_timeout, + cloud_function_max_instance_count=cloud_function_max_instances, + is_row_processor=is_row_processor, + cloud_function_vpc_connector=cloud_function_vpc_connector, ) # TODO: Move ibis logic to compiler step @@ -1135,8 +1019,11 @@ def try_delattr(attr): remote_function_client.get_cloud_function_fully_qualified_name(cf_name) ) func.bigframes_remote_function = str(dataset_ref.routine(rf_name)) # type: ignore - func.output_dtype = bigframes.dtypes.ibis_dtype_to_bigframes_dtype( - ibis_signature.output_type + + func.output_dtype = ( + bigframes.core.compile.ibis_types.ibis_dtype_to_bigframes_dtype( + ibis_signature.output_type + ) ) func.ibis_node = node return func @@ -1146,21 +1033,14 @@ def try_delattr(attr): def read_gbq_function( function_name: str, - session: Optional[Session] = None, - bigquery_client: Optional[bigquery.Client] = None, + *, + session: Session, ): """ Read an existing BigQuery function and prepare it for use in future queries. """ - - # A BigQuery client is required to perform BQ operations - if not bigquery_client and session: - bigquery_client = session.bqclient - if not bigquery_client: - raise ValueError( - "A bigquery client must be provided, either directly or via session. " - f"{constants.FEEDBACK_LINK}" - ) + bigquery_client = session.bqclient + ibis_client = session.ibis_client try: routine_ref = get_routine_reference(function_name, bigquery_client, session) @@ -1182,7 +1062,7 @@ def read_gbq_function( raise ValueError( f"Function return type must be specified. {constants.FEEDBACK_LINK}" ) - except bigframes.dtypes.UnsupportedTypeError as e: + except bigframes.core.compile.ibis_types.UnsupportedTypeError as e: raise ValueError( f"Type {e.type} not supported, supported types are {e.supported_types}. " f"{constants.FEEDBACK_LINK}" @@ -1192,10 +1072,13 @@ def read_gbq_function( # non-standard names for the arguments here. def func(*ignored_args, **ignored_kwargs): f"""Remote function {str(routine_ref)}.""" - # TODO(swast): Construct an ibis client from bigquery_client and - # execute node via a query. 
+ nonlocal node # type: ignore + + expr = node(*ignored_args, **ignored_kwargs) # type: ignore + return ibis_client.execute(expr) # TODO: Move ibis logic to compiler step + func.__name__ = routine_ref.routine_id node = ibis.udf.scalar.builtin( @@ -1205,7 +1088,7 @@ def func(*ignored_args, **ignored_kwargs): signature=(ibis_signature.input_types, ibis_signature.output_type), ) func.bigframes_remote_function = str(routine_ref) # type: ignore - func.output_dtype = bigframes.dtypes.ibis_dtype_to_bigframes_dtype( # type: ignore + func.output_dtype = bigframes.core.compile.ibis_types.ibis_dtype_to_bigframes_dtype( # type: ignore ibis_signature.output_type ) func.ibis_node = node # type: ignore diff --git a/bigframes/functions/remote_function_template.py b/bigframes/functions/remote_function_template.py new file mode 100644 index 0000000000..68fe1b917d --- /dev/null +++ b/bigframes/functions/remote_function_template.py @@ -0,0 +1,289 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import inspect +import logging +import os +import textwrap +from typing import Tuple + +import cloudpickle + +logger = logging.getLogger(__name__) + + +# Protocol version 4 is available in python version 3.4 and above +# https://docs.python.org/3/library/pickle.html#data-stream-format +_pickle_protocol_version = 4 + + +# Placeholder variables for testing. +input_types = ("STRING",) +output_type = "STRING" + + +# Convert inputs to BigQuery JSON. See: +# https://cloud.google.com/bigquery/docs/remote-functions#json_encoding_of_sql_data_type +# and +# https://cloud.google.com/bigquery/docs/reference/standard-sql/json_functions#to_json_string +def convert_call(input_types, call): + for type_, arg in zip(input_types, call): + yield convert_from_bq_json(type_, arg) + + +def convert_from_bq_json(type_, arg): + import base64 + import collections + + converters = collections.defaultdict(lambda: (lambda value: value)) # type: ignore + converters["BYTES"] = base64.b64decode + converter = converters[type_] + return converter(arg) if arg is not None else None + + +def convert_to_bq_json(type_, arg): + import base64 + import collections + + converters = collections.defaultdict(lambda: (lambda value: value)) # type: ignore + converters["BYTES"] = lambda value: base64.b64encode(value).decode("utf-8") + converter = converters[type_] + return converter(arg) if arg is not None else None + + +# get_pd_series is the inverse of Block._get_rows_as_json_values +# NOTE: Keep in sync with the list of supported types in DataFrame.apply. 
+def get_pd_series(row): + import ast + import base64 + import json + from typing import Callable, cast + + import pandas as pd + + row_json = json.loads(row) + col_names = row_json["names"] + col_types = row_json["types"] + col_values = row_json["values"] + index_length = row_json["indexlength"] + dtype = row_json["dtype"] + + # At this point we are assuming that col_names, col_types and col_values are + # arrays of the same length, representing column names, types and values for + # one row of data + + # column names are not necessarily strings + # they are serialized as repr(name) at source + evaluated_col_names = [] + for col_name in col_names: + try: + col_name = ast.literal_eval(col_name) + except Exception as ex: + raise NameError(f"Failed to evaluate column name from '{col_name}': {ex}") + evaluated_col_names.append(col_name) + col_names = evaluated_col_names + + # Supported converters for pandas to python types + value_converters = { + "boolean": lambda val: val == "true", + "Int64": int, + "Float64": float, + "string": str, + "binary[pyarrow]": base64.b64decode, + } + + def convert_value(value, value_type): + value_converter = cast(Callable, value_converters.get(value_type)) + if value_converter is None: + raise ValueError(f"Don't know how to handle type '{value_type}'") + if value is None: + return None + return value_converter(value) + + index_values = [ + pd.Series([convert_value(col_values[i], col_types[i])], dtype=col_types[i])[0] + for i in range(index_length) + ] + + data_col_names = col_names[index_length:] + data_col_types = col_types[index_length:] + data_col_values = col_values[index_length:] + data_col_values = [ + pd.Series([convert_value(a, data_col_types[i])], dtype=data_col_types[i])[0] + for i, a in enumerate(data_col_values) + ] + + row_index = index_values[0] if len(index_values) == 1 else tuple(index_values) + row_series = pd.Series( + data_col_values, index=data_col_names, name=row_index, dtype=dtype + ) + return row_series + + +def udf(*args): + """Dummy function to use as a placeholder for function code in templates.""" + pass + + +# We want to build a cloud function that works for BQ remote functions, +# where we receive `calls` in json which is a batch of rows from BQ SQL. +# The number and the order of values in each row is expected to exactly +# match to the number and order of arguments in the udf , e.g. if the udf is +# def foo(x: int, y: str): +# ... +# then the http request body could look like +# { +# ... +# "calls" : [ +# [123, "hello"], +# [456, "world"] +# ] +# ... 
+# } +# https://cloud.google.com/bigquery/docs/reference/standard-sql/remote-functions#input_format +def udf_http(request): + global input_types, output_type + import json + import traceback + + from flask import jsonify + + try: + request_json = request.get_json(silent=True) + calls = request_json["calls"] + replies = [] + for call in calls: + reply = convert_to_bq_json( + output_type, udf(*convert_call(input_types, call)) + ) + replies.append(reply) + return_json = json.dumps({"replies": replies}) + return return_json + except Exception: + return jsonify({"errorMessage": traceback.format_exc()}), 400 + + +def udf_http_row_processor(request): + global output_type + import json + import math + import traceback + + from flask import jsonify + import pandas as pd + + try: + request_json = request.get_json(silent=True) + calls = request_json["calls"] + replies = [] + for call in calls: + reply = convert_to_bq_json(output_type, udf(get_pd_series(call[0]))) + if isinstance(reply, float) and (math.isnan(reply) or math.isinf(reply)): + # json serialization of the special float values (nan, inf, -inf) + # is not in strict compliance of the JSON specification + # https://docs.python.org/3/library/json.html#basic-usage. + # Let's convert them to a quoted string representation ("NaN", + # "Infinity", "-Infinity" respectively) which is handled by + # BigQuery + reply = json.dumps(reply) + elif pd.isna(reply): + # Pandas N/A values are not json serializable, so use a python + # equivalent instead + reply = None + elif hasattr(reply, "item"): + # Numpy types are not json serializable, so use its Python + # value instead + reply = reply.item() + replies.append(reply) + return_json = json.dumps({"replies": replies}) + return return_json + except Exception: + return jsonify({"errorMessage": traceback.format_exc()}), 400 + + +def generate_udf_code(def_, directory): + """Generate serialized bytecode using cloudpickle given a udf.""" + udf_code_file_name = "udf.py" + udf_bytecode_file_name = "udf.cloudpickle" + + # original code, only for debugging purpose + udf_code = textwrap.dedent(inspect.getsource(def_)) + udf_code_file_path = os.path.join(directory, udf_code_file_name) + with open(udf_code_file_path, "w") as f: + f.write(udf_code) + + # serialized bytecode + udf_bytecode_file_path = os.path.join(directory, udf_bytecode_file_name) + # TODO(b/345433300): try io.BytesIO to avoid writing to the file system + with open(udf_bytecode_file_path, "wb") as f: + cloudpickle.dump(def_, f, protocol=_pickle_protocol_version) + + return udf_code_file_name, udf_bytecode_file_name + + +def generate_cloud_function_main_code( + def_, + directory, + *, + input_types: Tuple[str], + output_type: str, + is_row_processor=False, +): + """Get main.py code for the cloud function for the given user defined function. + + Args: + input_types (tuple[str]): + Types of the input arguments in BigQuery SQL data type names. + output_type (str): + Types of the output scalar as a BigQuery SQL data type name. + """ + + # Pickle the udf with all its dependencies + udf_code_file, udf_bytecode_file = generate_udf_code(def_, directory) + + code_blocks = [ + f"""\ +import cloudpickle + +# original udf code is in {udf_code_file} +# serialized udf code is in {udf_bytecode_file} +with open("{udf_bytecode_file}", "rb") as f: + udf = cloudpickle.load(f) + +input_types = {repr(input_types)} +output_type = {repr(output_type)} +""" + ] + + # For converting scalar outputs to the correct type. 
+ code_blocks.append(inspect.getsource(convert_to_bq_json)) + + if is_row_processor: + code_blocks.append(inspect.getsource(get_pd_series)) + handler_func_name = "udf_http_row_processor" + code_blocks.append(inspect.getsource(udf_http_row_processor)) + else: + code_blocks.append(inspect.getsource(convert_call)) + code_blocks.append(inspect.getsource(convert_from_bq_json)) + handler_func_name = "udf_http" + code_blocks.append(inspect.getsource(udf_http)) + + main_py = os.path.join(directory, "main.py") + with open(main_py, "w") as f: + f.writelines(code_blocks) + logger.debug(f"Wrote {os.path.abspath(main_py)}:\n{open(main_py).read()}") + + return handler_func_name diff --git a/bigframes/ml/decomposition.py b/bigframes/ml/decomposition.py index ad0bce481f..41dea7617f 100644 --- a/bigframes/ml/decomposition.py +++ b/bigframes/ml/decomposition.py @@ -190,5 +190,5 @@ def score( if not self._bqml_model: raise RuntimeError("A model must be fitted before score") - # TODO(b/291973741): X param is ignored. Update BQML supports input in ML.EVALUTE. + # TODO(b/291973741): X param is ignored. Update BQML supports input in ML.EVALUATE. return self._bqml_model.evaluate() diff --git a/bigframes/ml/ensemble.py b/bigframes/ml/ensemble.py index 8fc1e22146..0194d768b8 100644 --- a/bigframes/ml/ensemble.py +++ b/bigframes/ml/ensemble.py @@ -460,7 +460,7 @@ def score( .. note:: - Output matches that of the BigQuery ML.EVALUTE function. + Output matches that of the BigQuery ML.EVALUATE function. See: https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-evaluate#regression_models for the outputs relevant to this model type. @@ -616,7 +616,7 @@ def score( .. note:: - Output matches that of the BigQuery ML.EVALUTE function. + Output matches that of the BigQuery ML.EVALUATE function. See: https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-evaluate#classification_models for the outputs relevant to this model type. diff --git a/bigframes/ml/forecasting.py b/bigframes/ml/forecasting.py index a87df61801..a1ae8435d5 100644 --- a/bigframes/ml/forecasting.py +++ b/bigframes/ml/forecasting.py @@ -313,7 +313,7 @@ def score( .. note:: - Output matches that of the BigQuery ML.EVALUTE function. + Output matches that of the BigQuery ML.EVALUATE function. See: https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-evaluate#time_series_models for the outputs relevant to this model type. diff --git a/bigframes/ml/linear_model.py b/bigframes/ml/linear_model.py index 32168e9a34..0816ef9b24 100644 --- a/bigframes/ml/linear_model.py +++ b/bigframes/ml/linear_model.py @@ -195,7 +195,7 @@ def __init__( self, *, optimize_strategy: Literal[ - "auto_strategy", "batch_gradient_descent", "normal_equation" + "auto_strategy", "batch_gradient_descent" ] = "auto_strategy", fit_intercept: bool = True, l1_reg: Optional[float] = None, diff --git a/bigframes/ml/llm.py b/bigframes/ml/llm.py index 7fa0e236eb..2517178d89 100644 --- a/bigframes/ml/llm.py +++ b/bigframes/ml/llm.py @@ -329,7 +329,7 @@ def score( .. note:: - Output matches that of the BigQuery ML.EVALUTE function. + Output matches that of the BigQuery ML.EVALUATE function. See: https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-evaluate#remote-model-llm for the outputs relevant to this model type. 
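The remote_function_template module above pickles the user-defined function with cloudpickle and reloads it from the generated main.py before serving requests. A minimal, self-contained sketch of that serialize/reload cycle; the add_one UDF and the file name here are illustrative only, not part of the change:

import cloudpickle

def add_one(x: int) -> int:
    # Stand-in for a user-defined function captured by the template.
    return x + 1

# generate_udf_code writes the pickled UDF next to the cloud function source.
with open("udf.cloudpickle", "wb") as f:
    cloudpickle.dump(add_one, f, protocol=4)

# The generated main.py later restores it and invokes it once per row in the
# incoming "calls" payload.
with open("udf.cloudpickle", "rb") as f:
    udf = cloudpickle.load(f)

assert udf(41) == 42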
@@ -571,6 +571,8 @@ class GeminiTextGenerator(base.BaseEstimator): Connection to connect with remote service. str of the format ... If None, use default connection in session context. BigQuery DataFrame will try to create the connection and attach permission if the connection isn't fully set up. + max_iterations (Optional[int], Default to 300): + The number of steps to run when performing supervised tuning. """ def __init__( @@ -581,9 +583,11 @@ def __init__( ] = "gemini-pro", session: Optional[bigframes.Session] = None, connection_name: Optional[str] = None, + max_iterations: int = 300, ): self.model_name = model_name self.session = session or bpd.get_global_session() + self.max_iterations = max_iterations self._bq_connection_manager = self.session.bqconnectionmanager connection_name = connection_name or self.session._bq_connection @@ -647,6 +651,55 @@ def _from_bq( model._bqml_model = core.BqmlModel(session, bq_model) return model + @property + def _bqml_options(self) -> dict: + """The model options as they will be set for BQML""" + options = { + "max_iterations": self.max_iterations, + "data_split_method": "NO_SPLIT", + } + return options + + def fit( + self, + X: Union[bpd.DataFrame, bpd.Series], + y: Union[bpd.DataFrame, bpd.Series], + ) -> GeminiTextGenerator: + """Fine tune GeminiTextGenerator model. Only support "gemini-pro" model for now. + + .. note:: + + This product or feature is subject to the "Pre-GA Offerings Terms" in the General Service Terms section of the + Service Specific Terms(https://cloud.google.com/terms/service-terms#1). Pre-GA products and features are available "as is" + and might have limited support. For more information, see the launch stage descriptions + (https://cloud.google.com/products#product-launch-stages). + + Args: + X (bigframes.dataframe.DataFrame or bigframes.series.Series): + DataFrame of shape (n_samples, n_features). Training data. + y (bigframes.dataframe.DataFrame or bigframes.series.Series: + Training labels. + + Returns: + GeminiTextGenerator: Fitted estimator. + """ + if self._bqml_model.model_name.startswith("gemini-1.5"): + raise NotImplementedError("Fit is not supported for gemini-1.5 model.") + + X, y = utils.convert_to_dataframe(X, y) + + options = self._bqml_options + options["endpoint"] = "gemini-1.0-pro-002" + options["prompt_col"] = X.columns.tolist()[0] + + self._bqml_model = self._bqml_model_factory.create_llm_remote_model( + X, + y, + options=options, + connection_name=self.connection_name, + ) + return self + def predict( self, X: Union[bpd.DataFrame, bpd.Series], @@ -732,6 +785,67 @@ def predict( return df + def score( + self, + X: Union[bpd.DataFrame, bpd.Series], + y: Union[bpd.DataFrame, bpd.Series], + task_type: Literal[ + "text_generation", "classification", "summarization", "question_answering" + ] = "text_generation", + ) -> bpd.DataFrame: + """Calculate evaluation metrics of the model. Only "gemini-pro" model is supported for now. + + .. note:: + + This product or feature is subject to the "Pre-GA Offerings Terms" in the General Service Terms section of the + Service Specific Terms(https://cloud.google.com/terms/service-terms#1). Pre-GA products and features are available "as is" + and might have limited support. For more information, see the launch stage descriptions + (https://cloud.google.com/products#product-launch-stages). + + .. 
note:: + + Output matches that of the BigQuery ML.EVALUATE function. + See: https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-evaluate#remote-model-llm + for the outputs relevant to this model type. + + Args: + X (bigframes.dataframe.DataFrame or bigframes.series.Series): + A BigQuery DataFrame as evaluation data, which contains only one column of input_text + that contains the prompt text to use when evaluating the model. + y (bigframes.dataframe.DataFrame or bigframes.series.Series): + A BigQuery DataFrame as evaluation labels, which contains only one column of output_text + that you would expect to be returned by the model. + task_type (str): + The type of the task for LLM model. Default to "text_generation". + Possible values: "text_generation", "classification", "summarization", and "question_answering". + + Returns: + bigframes.dataframe.DataFrame: The DataFrame as evaluation result. + """ + if not self._bqml_model: + raise RuntimeError("A model must be fitted before score") + + # TODO(ashleyxu): Support gemini-1.5 when the rollout is ready. b/344891364. + if self._bqml_model.model_name.startswith("gemini-1.5"): + raise NotImplementedError("Score is not supported for gemini-1.5 model.") + + X, y = utils.convert_to_dataframe(X, y) + + if len(X.columns) != 1 or len(y.columns) != 1: + raise ValueError( + f"Only support one column as input for X and y. {constants.FEEDBACK_LINK}" + ) + + # BQML identified the column by name + X_col_label = cast(blocks.Label, X.columns[0]) + y_col_label = cast(blocks.Label, y.columns[0]) + X = X.rename(columns={X_col_label: "input_text"}) + y = y.rename(columns={y_col_label: "output_text"}) + + input_data = X.join(y, how="outer") + + return self._bqml_model.llm_evaluate(input_data, task_type) + def to_gbq(self, model_name: str, replace: bool = False) -> GeminiTextGenerator: """Save the model to BigQuery. diff --git a/bigframes/ml/loader.py b/bigframes/ml/loader.py index 66f207929a..515fb50c6f 100644 --- a/bigframes/ml/loader.py +++ b/bigframes/ml/loader.py @@ -98,13 +98,7 @@ def from_bq( Returns: A BigQuery DataFrames ML model object. """ - # TODO(garrettwu): the entire condition only to TRANSFORM_ONLY when b/331679273 is fixed. 
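A hedged usage sketch for the GeminiTextGenerator fine-tuning and evaluation surface added above; the project, table, and column names are placeholders, and running it assumes a configured BigQuery DataFrames session with the necessary BigQuery/Vertex AI connection:

import bigframes.pandas as bpd
from bigframes.ml import llm

# Placeholder training table with string "prompt" and "label" columns.
df = bpd.read_gbq("my-project.my_dataset.sentiment_train")

model = llm.GeminiTextGenerator(model_name="gemini-pro", max_iterations=1)
model.fit(X=df[["prompt"]], y=df[["label"]])

# ML.EVALUATE-backed metrics for the tuned model.
scores = model.score(
    X=df[["prompt"]],
    y=df[["label"]],
    task_type="classification",
).to_pandas()
print(scores)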
- if ( - bq_model.model_type == "TRANSFORM_ONLY" - or bq_model.model_type == "MODEL_TYPE_UNSPECIFIED" - and "transformColumns" in bq_model._properties - and not _is_bq_model_remote(bq_model) - ): + if bq_model.model_type == "TRANSFORM_ONLY": return _transformer_from_bq(session, bq_model) if _is_bq_model_pipeline(bq_model): diff --git a/bigframes/ml/metrics/_metrics.py b/bigframes/ml/metrics/_metrics.py index 2525ecd34f..a40c175000 100644 --- a/bigframes/ml/metrics/_metrics.py +++ b/bigframes/ml/metrics/_metrics.py @@ -227,7 +227,7 @@ def recall_score( y_true: Union[bpd.DataFrame, bpd.Series], y_pred: Union[bpd.DataFrame, bpd.Series], *, - average: str = "binary", + average: typing.Optional[str] = "binary", ) -> pd.Series: # TODO(ashleyxu): support more average type, default to "binary" if average is not None: @@ -264,7 +264,7 @@ def precision_score( y_true: Union[bpd.DataFrame, bpd.Series], y_pred: Union[bpd.DataFrame, bpd.Series], *, - average: str = "binary", + average: typing.Optional[str] = "binary", ) -> pd.Series: # TODO(ashleyxu): support more average type, default to "binary" if average is not None: @@ -303,7 +303,7 @@ def f1_score( y_true: Union[bpd.DataFrame, bpd.Series], y_pred: Union[bpd.DataFrame, bpd.Series], *, - average: str = "binary", + average: typing.Optional[str] = "binary", ) -> pd.Series: # TODO(ashleyxu): support more average type, default to "binary" y_true_series, y_pred_series = utils.convert_to_series(y_true, y_pred) diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py index 42f83913ee..c10b743631 100644 --- a/bigframes/operations/__init__.py +++ b/bigframes/operations/__init__.py @@ -161,7 +161,10 @@ def _convert_expr_input( def create_unary_op(name: str, type_signature: op_typing.UnaryTypeSignature) -> UnaryOp: return dataclasses.make_dataclass( name, - [("name", typing.ClassVar[str], name), ("output_type", typing.ClassVar[typing.Callable], type_signature.as_method)], # type: ignore + [ + ("name", typing.ClassVar[str], name), + ("output_type", typing.ClassVar[typing.Callable], type_signature.as_method), + ], bases=(UnaryOp,), frozen=True, )() @@ -172,7 +175,10 @@ def create_binary_op( ) -> BinaryOp: return dataclasses.make_dataclass( name, - [("name", typing.ClassVar[str], name), ("output_type", typing.ClassVar[typing.Callable], type_signature.as_method)], # type: ignore + [ + ("name", typing.ClassVar[str], name), + ("output_type", typing.ClassVar[typing.Callable], type_signature.as_method), + ], bases=(BinaryOp,), frozen=True, )() @@ -493,8 +499,9 @@ def output_type(self, *input_types): if self.to_type == pa.string(): return dtypes.STRING_DTYPE if isinstance(self.to_type, str): - # TODO(b/340895446): fix type error - return dtypes.BIGFRAMES_STRING_TO_BIGFRAMES[self.to_type] # type: ignore + return dtypes.BIGFRAMES_STRING_TO_BIGFRAMES[ + typing.cast(dtypes.DtypeString, self.to_type) + ] return self.to_type @@ -516,8 +523,10 @@ class RemoteFunctionOp(UnaryOp): def output_type(self, *input_types): # This property should be set to a valid Dtype by the @remote_function decorator or read_gbq_function method - # TODO(b/340895446): fix type error - return self.func.output_dtype # type: ignore + if hasattr(self.func, "output_dtype"): + return self.func.output_dtype + else: + raise AttributeError("output_dtype not defined") @dataclasses.dataclass(frozen=True) @@ -644,8 +653,10 @@ class BinaryRemoteFunctionOp(BinaryOp): def output_type(self, *input_types): # This property should be set to a valid Dtype by the @remote_function decorator or 
read_gbq_function method - # TODO(b/340895446): fix type error - return self.func.output_dtype # type: ignore + if hasattr(self.func, "output_dtype"): + return self.func.output_dtype + else: + raise AttributeError("output_dtype not defined") add_op = AddOp() diff --git a/bigframes/operations/_matplotlib/__init__.py b/bigframes/operations/_matplotlib/__init__.py index f869c1e01d..6ffe71139d 100644 --- a/bigframes/operations/_matplotlib/__init__.py +++ b/bigframes/operations/_matplotlib/__init__.py @@ -12,10 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. +import typing + import bigframes.operations._matplotlib.core as core import bigframes.operations._matplotlib.hist as hist -PLOT_CLASSES: dict[str, type[core.MPLPlot]] = { +PLOT_TYPES = typing.Union[type[core.SamplingPlot], type[hist.HistPlot]] + +PLOT_CLASSES: dict[str, PLOT_TYPES] = { "hist": hist.HistPlot, "line": core.LinePlot, "area": core.AreaPlot, @@ -24,8 +28,7 @@ def plot(data, kind, **kwargs): - # TODO(b/340896123): fix type error - plot_obj = PLOT_CLASSES[kind](data, **kwargs) # type: ignore + plot_obj = PLOT_CLASSES[kind](data, **kwargs) plot_obj.generate() plot_obj.draw() return plot_obj.result diff --git a/bigframes/operations/_matplotlib/core.py b/bigframes/operations/_matplotlib/core.py index 78b3df1f19..ff8dd86cff 100644 --- a/bigframes/operations/_matplotlib/core.py +++ b/bigframes/operations/_matplotlib/core.py @@ -39,8 +39,10 @@ def draw(self) -> None: @property def result(self): - # TODO(b/340896123): fix type error - return self.axes # type: ignore + if hasattr(self, "axes"): + return self.axes + else: + raise AttributeError("Axes not defined") class SamplingPlot(MPLPlot): diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py index 603f6678a5..f6f9aec800 100644 --- a/bigframes/pandas/__init__.py +++ b/bigframes/pandas/__init__.py @@ -142,7 +142,7 @@ def cut( x: bigframes.series.Series, bins: int, *, - labels: Optional[bool] = None, + labels: Union[Iterable[str], bool, None] = None, ) -> bigframes.series.Series: return bigframes.core.reshape.cut( x, @@ -699,9 +699,35 @@ def read_gbq_function(function_name: str): read_gbq_function.__doc__ = inspect.getdoc(bigframes.session.Session.read_gbq_function) +@typing.overload +def to_datetime( + arg: Union[ + vendored_pandas_datetimes.local_iterables, + bigframes.series.Series, + bigframes.dataframe.DataFrame, + ], + *, + utc: bool = False, + format: Optional[str] = None, + unit: Optional[str] = None, +) -> bigframes.series.Series: + ... + + +@typing.overload +def to_datetime( + arg: Union[int, float, str, datetime], + *, + utc: bool = False, + format: Optional[str] = None, + unit: Optional[str] = None, +) -> Union[pandas.Timestamp, datetime]: + ... 
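The two overloads above distinguish scalar inputs, which resolve to a plain pandas.Timestamp or datetime, from iterable, Series, or DataFrame inputs, which resolve to a bigframes Series. A brief sketch of both call forms, assuming a configured session for the Series case:

import bigframes.pandas as bpd

# Scalar input resolves to a local pandas.Timestamp / datetime.
ts = bpd.to_datetime("2024-06-10", format="%Y-%m-%d")

# Iterable or Series input resolves to a bigframes Series.
s = bpd.to_datetime(bpd.Series(["2024-06-10", "2024-06-11"]), format="%Y-%m-%d")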
+ + def to_datetime( arg: Union[ - vendored_pandas_datetimes.local_scalars, + Union[int, float, str, datetime], vendored_pandas_datetimes.local_iterables, bigframes.series.Series, bigframes.dataframe.DataFrame, diff --git a/bigframes/series.py b/bigframes/series.py index 367301f08e..d858060aec 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -783,10 +783,10 @@ def pow(self, other: float | int | Series) -> Series: def rpow(self, other: float | int | Series) -> Series: return self._apply_binary_op(other, ops.pow_op, reverse=True) - def __lt__(self, other: float | int | Series) -> Series: # type: ignore + def __lt__(self, other: float | int | str | Series) -> Series: return self.lt(other) - def __le__(self, other: float | int | Series) -> Series: # type: ignore + def __le__(self, other: float | int | str | Series) -> Series: return self.le(other) def lt(self, other) -> Series: @@ -795,10 +795,10 @@ def lt(self, other) -> Series: def le(self, other) -> Series: return self._apply_binary_op(other, ops.le_op) - def __gt__(self, other: float | int | Series) -> Series: # type: ignore + def __gt__(self, other: float | int | str | Series) -> Series: return self.gt(other) - def __ge__(self, other: float | int | Series) -> Series: # type: ignore + def __ge__(self, other: float | int | str | Series) -> Series: return self.ge(other) def gt(self, other) -> Series: diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 2919b2d77f..4c5ce21153 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -84,7 +84,6 @@ import bigframes.core.compile import bigframes.core.guid import bigframes.core.nodes as nodes -from bigframes.core.ordering import IntegerEncoding import bigframes.core.ordering as order import bigframes.core.schema as schemata import bigframes.core.tree_properties as traversals @@ -295,6 +294,9 @@ def __init__( self._bytes_processed_sum = 0 self._slot_millis_sum = 0 self._execution_count = 0 + # Whether this session treats objects as totally ordered. + # Will expose as feature later, only False for internal testing + self._strictly_ordered = True @property def bqclient(self): @@ -1159,35 +1161,14 @@ def _read_pandas_load_job( ) self._start_generic_job(load_job) - ordering = order.ExpressionOrdering( - ordering_value_columns=tuple([order.ascending_over(ordering_col)]), - total_ordering_columns=frozenset([ordering_col]), - integer_encoding=IntegerEncoding(True, is_sequential=True), - ) - table_expression = self.ibis_client.table( # type: ignore - load_table_destination.table_id, - schema=load_table_destination.dataset_id, - database=load_table_destination.project, - ) - - # b/297590178 Potentially a bug in bqclient.load_table_from_dataframe(), that only when the DF is empty, the index columns disappear in table_expression. - if any( - [new_idx_id not in table_expression.columns for new_idx_id in new_idx_ids] - ): - new_idx_ids, idx_labels = [], [] - - column_values = [ - table_expression[col] - for col in table_expression.columns - if col != ordering_col - ] - array_value = core.ArrayValue.from_ibis( - self, - table_expression, - columns=column_values, - hidden_ordering_columns=[table_expression[ordering_col]], - ordering=ordering, - ) + destination_table = self.bqclient.get_table(load_table_destination) + array_value = core.ArrayValue.from_table( + table=destination_table, + # TODO: Generate this directly from original pandas df. 
+ schema=schemata.ArraySchema.from_bq_table(destination_table), + session=self, + offsets_col=ordering_col, + ).drop_columns([ordering_col]) block = blocks.Block( array_value, @@ -1863,30 +1844,20 @@ def _cache_with_cluster_cols( """Executes the query and uses the resulting table to rewrite future executions.""" # TODO: Use this for all executions? Problem is that caching materializes extra # ordering columns + # TODO: May want to support some partial ordering info even for non-strict ordering mode + keep_order_info = self._strictly_ordered + compiled_value = self._compile_ordered(array_value) ibis_expr = compiled_value._to_ibis_expr( - ordering_mode="unordered", expose_hidden_cols=True + ordering_mode="unordered", expose_hidden_cols=keep_order_info ) tmp_table = self._ibis_to_temp_table( ibis_expr, cluster_cols=cluster_cols, api_name="cached" ) - table_expression = self.ibis_client.table( - tmp_table.table_id, - schema=tmp_table.dataset_id, - database=tmp_table.project, - ) - new_columns = [table_expression[column] for column in compiled_value.column_ids] - new_hidden_columns = [ - table_expression[column] - for column in compiled_value._hidden_ordering_column_names - ] - cached_replacement = core.ArrayValue.from_ibis( - self, - table_expression, - columns=new_columns, - hidden_ordering_columns=new_hidden_columns, - ordering=compiled_value._ordering, + cached_replacement = array_value.as_cached( + cache_table=self.bqclient.get_table(tmp_table), + ordering=compiled_value._ordering if keep_order_info else None, ).node self._cached_executions[array_value.node] = cached_replacement @@ -1894,6 +1865,10 @@ def _cache_with_offsets(self, array_value: core.ArrayValue): """Executes the query and uses the resulting table to rewrite future executions.""" # TODO: Use this for all executions? Problem is that caching materializes extra # ordering columns + if not self._strictly_ordered: + raise ValueError( + "Caching with offsets only supported in strictly ordered mode." 
+ ) compiled_value = self._compile_ordered(array_value) ibis_expr = compiled_value._to_ibis_expr( @@ -1902,18 +1877,8 @@ def _cache_with_offsets(self, array_value: core.ArrayValue): tmp_table = self._ibis_to_temp_table( ibis_expr, cluster_cols=["bigframes_offsets"], api_name="cached" ) - table_expression = self.ibis_client.table( - tmp_table.table_id, - schema=tmp_table.dataset_id, - database=tmp_table.project, - ) - new_columns = [table_expression[column] for column in compiled_value.column_ids] - new_hidden_columns = [table_expression["bigframes_offsets"]] - cached_replacement = core.ArrayValue.from_ibis( - self, - table_expression, - columns=new_columns, - hidden_ordering_columns=new_hidden_columns, + cached_replacement = array_value.as_cached( + cache_table=self.bqclient.get_table(tmp_table), ordering=order.ExpressionOrdering.from_offset_col("bigframes_offsets"), ).node self._cached_executions[array_value.node] = cached_replacement diff --git a/bigframes/session/clients.py b/bigframes/session/clients.py index e7680d1d35..85664d8dc8 100644 --- a/bigframes/session/clients.py +++ b/bigframes/session/clients.py @@ -109,19 +109,27 @@ def __init__( # cloud clients initialized for lazy load self._bqclient = None - self._bqconnectionclient = None - self._bqstoragereadclient = None - self._cloudfunctionsclient = None - self._resourcemanagerclient = None + self._bqconnectionclient: Optional[ + google.cloud.bigquery_connection_v1.ConnectionServiceClient + ] = None + self._bqstoragereadclient: Optional[ + google.cloud.bigquery_storage_v1.BigQueryReadClient + ] = None + self._cloudfunctionsclient: Optional[ + google.cloud.functions_v2.FunctionServiceClient + ] = None + self._resourcemanagerclient: Optional[ + google.cloud.resourcemanager_v3.ProjectsClient + ] = None def _create_bigquery_client(self): bq_options = None if self._use_regional_endpoints: - # TODO(b/340896138): fix type error bq_options = google.api_core.client_options.ClientOptions( api_endpoint=( _BIGQUERY_REGIONAL_ENDPOINT - if self._location.lower() in _REP_SUPPORTED_REGIONS # type: ignore + if self._location is not None + and self._location.lower() in _REP_SUPPORTED_REGIONS else _BIGQUERY_LOCATIONAL_ENDPOINT ).format(location=self._location), ) @@ -159,11 +167,12 @@ def bqconnectionclient(self): bqconnection_info = google.api_core.gapic_v1.client_info.ClientInfo( user_agent=self._application_name ) - # TODO(b/340896138): fix type error - self._bqconnectionclient = google.cloud.bigquery_connection_v1.ConnectionServiceClient( # type: ignore - client_info=bqconnection_info, - client_options=bqconnection_options, - credentials=self._credentials, + self._bqconnectionclient = ( + google.cloud.bigquery_connection_v1.ConnectionServiceClient( + client_info=bqconnection_info, + client_options=bqconnection_options, + credentials=self._credentials, + ) ) return self._bqconnectionclient @@ -173,20 +182,19 @@ def bqstoragereadclient(self): if not self._bqstoragereadclient: bqstorage_options = None if self._use_regional_endpoints: - # TODO(b/340896138): fix type error bqstorage_options = google.api_core.client_options.ClientOptions( api_endpoint=( _BIGQUERYSTORAGE_REGIONAL_ENDPOINT - if self._location.lower() in _REP_SUPPORTED_REGIONS # type: ignore + if self._location is not None + and self._location.lower() in _REP_SUPPORTED_REGIONS else _BIGQUERYSTORAGE_LOCATIONAL_ENDPOINT ).format(location=self._location), ) bqstorage_info = google.api_core.gapic_v1.client_info.ClientInfo( user_agent=self._application_name ) - # TODO(b/340896138): fix type 
error self._bqstoragereadclient = ( - google.cloud.bigquery_storage_v1.BigQueryReadClient( # type: ignore + google.cloud.bigquery_storage_v1.BigQueryReadClient( client_info=bqstorage_info, client_options=bqstorage_options, credentials=self._credentials, @@ -201,9 +209,8 @@ def cloudfunctionsclient(self): functions_info = google.api_core.gapic_v1.client_info.ClientInfo( user_agent=self._application_name ) - # TODO(b/340896138): fix type error self._cloudfunctionsclient = ( - google.cloud.functions_v2.FunctionServiceClient( # type: ignore + google.cloud.functions_v2.FunctionServiceClient( client_info=functions_info, credentials=self._credentials, ) @@ -217,9 +224,8 @@ def resourcemanagerclient(self): resourcemanager_info = google.api_core.gapic_v1.client_info.ClientInfo( user_agent=self._application_name ) - # TODO(b/340896138): fix type error self._resourcemanagerclient = ( - google.cloud.resourcemanager_v3.ProjectsClient( # type: ignore + google.cloud.resourcemanager_v3.ProjectsClient( credentials=self._credentials, client_info=resourcemanager_info ) ) diff --git a/bigframes/version.py b/bigframes/version.py index 74a30e35b7..56a1200857 100644 --- a/bigframes/version.py +++ b/bigframes/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "1.8.0" +__version__ = "1.9.0" diff --git a/mypy.ini b/mypy.ini index 5707f14154..f0a005d2e5 100644 --- a/mypy.ini +++ b/mypy.ini @@ -9,6 +9,9 @@ ignore_missing_imports = True [mypy-cloudpickle.*] ignore_missing_imports = True +[mypy-flask] +ignore_missing_imports = True + [mypy-pydata_google_auth] ignore_missing_imports = True diff --git a/noxfile.py b/noxfile.py index 52583bbf1a..177e0e2ab8 100644 --- a/noxfile.py +++ b/noxfile.py @@ -35,7 +35,15 @@ # https://github.com/str0zzapreti/pytest-retry/issues/32 PYTEST_VERSION = "pytest<8.0.0dev" SPHINX_VERSION = "sphinx==4.5.0" -LINT_PATHS = ["docs", "bigframes", "tests", "third_party", "noxfile.py", "setup.py"] +LINT_PATHS = [ + "docs", + "bigframes", + "tests", + "third_party", + "noxfile.py", + "setup.py", + os.path.join("scripts", "benchmark"), +] DEFAULT_PYTHON_VERSION = "3.10" @@ -76,6 +84,8 @@ SYSTEM_TEST_EXTRAS: List[str] = ["tests"] SYSTEM_TEST_EXTRAS_BY_PYTHON: Dict[str, List[str]] = {} +LOGGING_NAME_ENV_VAR = "BIGFRAMES_PERFORMANCE_LOG_NAME" + CURRENT_DIRECTORY = pathlib.Path(__file__).parent.absolute() # Sessions are executed in the order so putting the smaller sessions @@ -748,8 +758,6 @@ def notebook(session: nox.Session): "--nbmake-timeout=900", # 15 minutes ] - logging_name_env_var = "BIGFRAMES_PERFORMANCE_LOG_NAME" - try: # Populate notebook parameters and make a backup so that the notebooks # are runnable. 
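The following noxfile hunks stop mutating session.env inside the loop and instead hand each spawned process its own environment mapping, so parallel notebook runs no longer race on a shared variable. A standalone sketch of that pattern; run here is a stand-in for nox's session.run and the notebook names are placeholders:

import os
import subprocess
from multiprocessing import Process

LOGGING_NAME_ENV_VAR = "BIGFRAMES_PERFORMANCE_LOG_NAME"

def run(*cmd, env=None):
    # Merge with the parent environment so PATH and friends stay available.
    subprocess.run(list(cmd), env={**os.environ, **(env or {})}, check=True)

notebooks = ["notebooks/a.ipynb", "notebooks/b.ipynb"]  # placeholders
processes = []
for nb in notebooks:
    p = Process(
        target=run,
        args=("echo", nb),
        kwargs={"env": {LOGGING_NAME_ENV_VAR: os.path.basename(nb)}},
    )
    p.start()
    processes.append(p)

for p in processes:
    p.join()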
@@ -763,10 +771,10 @@ def notebook(session: nox.Session): # takes an environment variable for performance logging processes = [] for notebook in notebooks: - session.env[logging_name_env_var] = os.path.basename(notebook) process = Process( target=session.run, args=(*pytest_command, notebook), + kwargs={"env": {LOGGING_NAME_ENV_VAR: os.path.basename(notebook)}}, ) process.start() processes.append(process) @@ -788,11 +796,15 @@ def notebook(session: nox.Session): processes = [] for notebook, regions in notebooks_reg.items(): for region in regions: - session.env[logging_name_env_var] = os.path.basename(notebook) process = Process( target=session.run, args=(*pytest_command, notebook), - kwargs={"env": {"BIGQUERY_LOCATION": region}}, + kwargs={ + "env": { + "BIGQUERY_LOCATION": region, + LOGGING_NAME_ENV_VAR: os.path.basename(notebook), + } + }, ) process.start() processes.append(process) @@ -803,34 +815,69 @@ def notebook(session: nox.Session): # when the environment variable is set as it is above, # notebooks output a .bytesprocessed and .slotmillis report # collect those reports and print a summary - _print_performance_report() + _print_performance_report("notebooks/") + + +@nox.session(python=DEFAULT_PYTHON_VERSION) +def benchmark(session: nox.Session): + session.install("-e", ".[all]") + base_path = os.path.join("scripts", "benchmark") + + benchmark_script_list = list(Path(base_path).rglob("*.py")) + # Run benchmarks in parallel session.run's, since each benchmark + # takes an environment variable for performance logging + processes = [] + for benchmark in benchmark_script_list: + process = Process( + target=session.run, + args=("python", benchmark), + kwargs={"env": {LOGGING_NAME_ENV_VAR: benchmark.as_posix()}}, + ) + process.start() + processes.append(process) + + for process in processes: + process.join() + + # when the environment variable is set as it is above, + # notebooks output a .bytesprocessed and .slotmillis report + # collect those reports and print a summary + _print_performance_report(base_path) -def _print_performance_report(): +def _print_performance_report(path: str): """Add an informational report about http queries, bytes processed, and slot time to the testlog output for purposes of measuring bigquery-related performance changes. + + Looks specifically for output files in subfolders of the + passed path. 
(*/*.bytesprocessed and */*.slotmillis) """ print("---BIGQUERY USAGE REPORT---") results_dict = {} - for bytes_report in Path("notebooks/").glob("*/*.bytesprocessed"): + bytes_reports = sorted(Path(path).rglob("*.bytesprocessed")) + for bytes_report in bytes_reports: with open(bytes_report, "r") as bytes_file: - filename = bytes_report.stem + filename = bytes_report.relative_to(path).with_suffix("") lines = bytes_file.read().splitlines() query_count = len(lines) total_bytes = sum([int(line) for line in lines]) results_dict[filename] = [query_count, total_bytes] - for millis_report in Path("notebooks/").glob("*/*.slotmillis"): + os.remove(bytes_report) + + millis_reports = sorted(Path(path).rglob("*.slotmillis")) + for millis_report in millis_reports: with open(millis_report, "r") as millis_file: - filename = millis_report.stem + filename = millis_report.relative_to(path).with_suffix("") lines = millis_file.read().splitlines() total_slot_millis = sum([int(line) for line in lines]) results_dict[filename] += [total_slot_millis] + os.remove(millis_report) cumulative_queries = 0 cumulative_bytes = 0 cumulative_slot_millis = 0 - for results in results_dict.values(): + for name, results in results_dict.items(): if len(results) != 3: raise IOError( "Mismatch in performance logging output. " @@ -842,7 +889,7 @@ def _print_performance_report(): cumulative_bytes += total_bytes cumulative_slot_millis += total_slot_millis print( - f"{filename} - query count: {query_count}," + f"{name} - query count: {query_count}," f" bytes processed sum: {total_bytes}," f" slot millis sum: {total_slot_millis}" ) diff --git a/scripts/benchmark/db-benchmark/groupby/G1_1e9_1e2_5_0/q1.py b/scripts/benchmark/db-benchmark/groupby/G1_1e9_1e2_5_0/q1.py new file mode 100644 index 0000000000..cc5f77b49b --- /dev/null +++ b/scripts/benchmark/db-benchmark/groupby/G1_1e9_1e2_5_0/q1.py @@ -0,0 +1,14 @@ +# Contains code from https://github.com/duckdblabs/db-benchmark/blob/master/pandas/groupby-pandas.py + +import bigframes.pandas as bpd + +print("Groupby benchmark 1: sum v1 by id1") + +x = bpd.read_gbq("bigframes-dev-perf.dbbenchmark.G1_1e9_1e2_5_0") + +ans = x.groupby("id1", as_index=False, dropna=False).agg({"v1": "sum"}) +print(ans.shape) +chk = [ans["v1"].sum()] +print(chk) + +bpd.reset_session() diff --git a/scripts/benchmark/db-benchmark/groupby/G1_1e9_1e2_5_0/q2.py b/scripts/benchmark/db-benchmark/groupby/G1_1e9_1e2_5_0/q2.py new file mode 100644 index 0000000000..734a17242b --- /dev/null +++ b/scripts/benchmark/db-benchmark/groupby/G1_1e9_1e2_5_0/q2.py @@ -0,0 +1,14 @@ +# Contains code from https://github.com/duckdblabs/db-benchmark/blob/master/pandas/groupby-pandas.py + +import bigframes.pandas as bpd + +print("Groupby benchmark 2: sum v1 by id1:id2") + +x = bpd.read_gbq("bigframes-dev-perf.dbbenchmark.G1_1e9_1e2_5_0") + +ans = x.groupby(["id1", "id2"], as_index=False, dropna=False).agg({"v1": "sum"}) +print(ans.shape) +chk = [ans["v1"].sum()] +print(chk) + +bpd.reset_session() diff --git a/scripts/benchmark/db-benchmark/groupby/G1_1e9_1e2_5_0/q3.py b/scripts/benchmark/db-benchmark/groupby/G1_1e9_1e2_5_0/q3.py new file mode 100644 index 0000000000..242902de64 --- /dev/null +++ b/scripts/benchmark/db-benchmark/groupby/G1_1e9_1e2_5_0/q3.py @@ -0,0 +1,14 @@ +# Contains code from https://github.com/duckdblabs/db-benchmark/blob/master/pandas/groupby-pandas.py + +import bigframes.pandas as bpd + +print("Groupby benchmark 3: sum v1 mean v3 
by id3") + +x = bpd.read_gbq("bigframes-dev-perf.dbbenchmark.G1_1e9_1e2_5_0") + +ans = x.groupby("id3", as_index=False, dropna=False).agg({"v1": "sum", "v3": "mean"}) +print(ans.shape) +chk = [ans["v1"].sum(), ans["v3"].sum()] +print(chk) + +bpd.reset_session() diff --git a/scripts/benchmark/db-benchmark/groupby/G1_1e9_1e2_5_0/q4.py b/scripts/benchmark/db-benchmark/groupby/G1_1e9_1e2_5_0/q4.py new file mode 100644 index 0000000000..e4f769545e --- /dev/null +++ b/scripts/benchmark/db-benchmark/groupby/G1_1e9_1e2_5_0/q4.py @@ -0,0 +1,16 @@ +# Contains code from https://github.com/duckdblabs/db-benchmark/blob/master/pandas/groupby-pandas.py + +import bigframes.pandas as bpd + +print("Groupby benchmark 4: mean v1:v3 by id4") + +x = bpd.read_gbq("bigframes-dev-perf.dbbenchmark.G1_1e9_1e2_5_0") + +ans = x.groupby("id4", as_index=False, dropna=False).agg( + {"v1": "mean", "v2": "mean", "v3": "mean"} +) +print(ans.shape) +chk = [ans["v1"].sum(), ans["v2"].sum(), ans["v3"].sum()] +print(chk) + +bpd.reset_session() diff --git a/scripts/benchmark/db-benchmark/groupby/G1_1e9_1e2_5_0/q5.py b/scripts/benchmark/db-benchmark/groupby/G1_1e9_1e2_5_0/q5.py new file mode 100644 index 0000000000..d34a6c055f --- /dev/null +++ b/scripts/benchmark/db-benchmark/groupby/G1_1e9_1e2_5_0/q5.py @@ -0,0 +1,16 @@ +# Contains code from https://github.com/duckdblabs/db-benchmark/blob/master/pandas/groupby-pandas.py + +import bigframes.pandas as bpd + +print("Groupby benchmark 5: sum v1:v3 by id6") + +x = bpd.read_gbq("bigframes-dev-perf.dbbenchmark.G1_1e9_1e2_5_0") + +ans = x.groupby("id6", as_index=False, dropna=False).agg( + {"v1": "sum", "v2": "sum", "v3": "sum"} +) +print(ans.shape) +chk = [ans["v1"].sum(), ans["v2"].sum(), ans["v3"].sum()] +print(chk) + +bpd.reset_session() diff --git a/scripts/benchmark/db-benchmark/groupby/G1_1e9_1e2_5_0/q6.py b/scripts/benchmark/db-benchmark/groupby/G1_1e9_1e2_5_0/q6.py new file mode 100644 index 0000000000..0f3240a129 --- /dev/null +++ b/scripts/benchmark/db-benchmark/groupby/G1_1e9_1e2_5_0/q6.py @@ -0,0 +1,16 @@ +# Contains code from https://github.com/duckdblabs/db-benchmark/blob/master/pandas/groupby-pandas.py + +import bigframes.pandas as bpd + +print("Groupby benchmark 6: median v3 sd v3 by id4 id5") + +x = bpd.read_gbq("bigframes-dev-perf.dbbenchmark.G1_1e9_1e2_5_0") + +ans = x.groupby(["id4", "id5"], as_index=False, dropna=False).agg( + {"v3": ["median", "std"]} +) +print(ans.shape) +chk = [ans["v3"]["median"].sum(), ans["v3"]["std"].sum()] +print(chk) + +bpd.reset_session() diff --git a/scripts/benchmark/db-benchmark/groupby/G1_1e9_1e2_5_0/q7.py b/scripts/benchmark/db-benchmark/groupby/G1_1e9_1e2_5_0/q7.py new file mode 100644 index 0000000000..78e1e94b85 --- /dev/null +++ b/scripts/benchmark/db-benchmark/groupby/G1_1e9_1e2_5_0/q7.py @@ -0,0 +1,18 @@ +# Contains code from https://github.com/duckdblabs/db-benchmark/blob/master/pandas/groupby-pandas.py + +import bigframes.pandas as bpd + +print("Groupby benchmark 7: max v1 - min v2 by id3") + +x = bpd.read_gbq("bigframes-dev-perf.dbbenchmark.G1_1e9_1e2_5_0") + +ans = ( + x.groupby("id3", as_index=False, dropna=False) + .agg({"v1": "max", "v2": "min"}) + .assign(range_v1_v2=lambda x: x["v1"] - x["v2"])[["id3", "range_v1_v2"]] +) +print(ans.shape) +chk = [ans["range_v1_v2"].sum()] +print(chk) + +bpd.reset_session() diff --git a/scripts/benchmark/db-benchmark/join/J1_1e9_NA_0_0/q1.py 
b/scripts/benchmark/db-benchmark/join/J1_1e9_NA_0_0/q1.py new file mode 100644 index 0000000000..429dc72ad0 --- /dev/null +++ b/scripts/benchmark/db-benchmark/join/J1_1e9_NA_0_0/q1.py @@ -0,0 +1,16 @@ +# Contains code from https://github.com/duckdblabs/db-benchmark/blob/master/pandas/join-pandas.py + +import bigframes.pandas as bpd + +print("Join benchmark 1: small inner on int") + +x = bpd.read_gbq("bigframes-dev-perf.dbbenchmark.J1_1e9_NA_0_0") +small = bpd.read_gbq("bigframes-dev-perf.dbbenchmark.J1_1e9_1e3_0_0") + +ans = x.merge(small, on="id1") +print(ans.shape) + +chk = [ans["v1"].sum(), ans["v2"].sum()] +print(chk) + +bpd.reset_session() diff --git a/scripts/benchmark/db-benchmark/join/J1_1e9_NA_0_0/q2.py b/scripts/benchmark/db-benchmark/join/J1_1e9_NA_0_0/q2.py new file mode 100644 index 0000000000..210c29f844 --- /dev/null +++ b/scripts/benchmark/db-benchmark/join/J1_1e9_NA_0_0/q2.py @@ -0,0 +1,16 @@ +# Contains code from https://github.com/duckdblabs/db-benchmark/blob/master/pandas/join-pandas.py + +import bigframes.pandas as bpd + +print("Join benchmark 2: medium inner on int") + +x = bpd.read_gbq("bigframes-dev-perf.dbbenchmark.J1_1e9_NA_0_0") +medium = bpd.read_gbq("bigframes-dev-perf.dbbenchmark.J1_1e9_1e6_0_0") + +ans = x.merge(medium, on="id2") +print(ans.shape) + +chk = [ans["v1"].sum(), ans["v2"].sum()] +print(chk) + +bpd.reset_session() diff --git a/scripts/benchmark/db-benchmark/join/J1_1e9_NA_0_0/q3.py b/scripts/benchmark/db-benchmark/join/J1_1e9_NA_0_0/q3.py new file mode 100644 index 0000000000..d88d943604 --- /dev/null +++ b/scripts/benchmark/db-benchmark/join/J1_1e9_NA_0_0/q3.py @@ -0,0 +1,16 @@ +# Contains code from https://github.com/duckdblabs/db-benchmark/blob/master/pandas/join-pandas.py + +import bigframes.pandas as bpd + +print("Join benchmark 3: medium outer on int") + +x = bpd.read_gbq("bigframes-dev-perf.dbbenchmark.J1_1e9_NA_0_0") +medium = bpd.read_gbq("bigframes-dev-perf.dbbenchmark.J1_1e9_1e6_0_0") + +ans = x.merge(medium, how="left", on="id2") +print(ans.shape) + +chk = [ans["v1"].sum(), ans["v2"].sum()] +print(chk) + +bpd.reset_session() diff --git a/scripts/benchmark/db-benchmark/join/J1_1e9_NA_0_0/q4.py b/scripts/benchmark/db-benchmark/join/J1_1e9_NA_0_0/q4.py new file mode 100644 index 0000000000..9167043d9a --- /dev/null +++ b/scripts/benchmark/db-benchmark/join/J1_1e9_NA_0_0/q4.py @@ -0,0 +1,16 @@ +# Contains code from https://github.com/duckdblabs/db-benchmark/blob/master/pandas/join-pandas.py + +import bigframes.pandas as bpd + +print("Join benchmark 4: medium inner on factor") + +x = bpd.read_gbq("bigframes-dev-perf.dbbenchmark.J1_1e9_NA_0_0") +medium = bpd.read_gbq("bigframes-dev-perf.dbbenchmark.J1_1e9_1e6_0_0") + +ans = x.merge(medium, on="id5") +print(ans.shape) + +chk = [ans["v1"].sum(), ans["v2"].sum()] +print(chk) + +bpd.reset_session() diff --git a/scripts/benchmark/db-benchmark/join/J1_1e9_NA_0_0/q5.py b/scripts/benchmark/db-benchmark/join/J1_1e9_NA_0_0/q5.py new file mode 100644 index 0000000000..39eb23ac45 --- /dev/null +++ b/scripts/benchmark/db-benchmark/join/J1_1e9_NA_0_0/q5.py @@ -0,0 +1,16 @@ +# Contains code from https://github.com/duckdblabs/db-benchmark/blob/master/pandas/join-pandas.py + +import bigframes.pandas as bpd + +print("Join benchmark 5: big inner on int") + +x = bpd.read_gbq("bigframes-dev-perf.dbbenchmark.J1_1e9_NA_0_0") +big = 
bpd.read_gbq("bigframes-dev-perf.dbbenchmark.J1_1e9_1e9_0_0") + +ans = x.merge(big, on="id3") +print(ans.shape) + +chk = [ans["v1"].sum(), ans["v2"].sum()] +print(chk) + +bpd.reset_session() diff --git a/scripts/benchmark/db-benchmark/sort b/scripts/benchmark/db-benchmark/sort new file mode 100644 index 0000000000..e69de29bb2 diff --git a/scripts/benchmark/simple_benchmark.py b/scripts/benchmark/simple_benchmark.py new file mode 100644 index 0000000000..53b35c52ad --- /dev/null +++ b/scripts/benchmark/simple_benchmark.py @@ -0,0 +1,27 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import bigframes.pandas as bpd + +# This is a placeholder benchmark. +# TODO(340278185): Add more data analysis tasks and benchmark files +# like this one. + +print("Performing simple benchmark.") +df = bpd.DataFrame() +df["column_1"] = bpd.Series([i for i in range(100000)]) +df["column_2"] = bpd.Series([i * 2 for i in range(100000)]) +df["column_3"] = df["column_1"] + df["column_2"] +df.__repr__() +bpd.reset_session() diff --git a/tests/system/conftest.py b/tests/system/conftest.py index 2501693084..06ad73a702 100644 --- a/tests/system/conftest.py +++ b/tests/system/conftest.py @@ -139,6 +139,17 @@ def session() -> Generator[bigframes.Session, None, None]: session.close() # close generated session at cleanup time +@pytest.fixture(scope="session") +def unordered_session() -> Generator[bigframes.Session, None, None]: + context = bigframes.BigQueryOptions( + location="US", + ) + session = bigframes.Session(context=context) + session._strictly_ordered = False + yield session + session.close() # close generated session at cleanup type + + @pytest.fixture(scope="session") def session_tokyo(tokyo_location: str) -> Generator[bigframes.Session, None, None]: context = bigframes.BigQueryOptions( @@ -946,6 +957,18 @@ def penguins_randomforest_classifier_model_name( return model_name +@pytest.fixture(scope="session") +def llm_fine_tune_df_default_index( + session: bigframes.Session, +) -> bigframes.dataframe.DataFrame: + training_table_name = "llm_tuning.emotion_classification_train" + df = session.read_gbq(training_table_name).dropna().head(30) + prefix = "Please do sentiment analysis on the following text and only output a number from 0 to 5 where 0 means sadness, 1 means joy, 2 means love, 3 means anger, 4 means fear, and 5 means surprise. 
Text: " + df["prompt"] = prefix + df["text"] + df["label"] = df["label"].astype("string") + return df + + @pytest.fixture(scope="session") def usa_names_grouped_table( session: bigframes.Session, dataset_id_permanent diff --git a/tests/system/large/ml/test_forecasting.py b/tests/system/large/ml/test_forecasting.py index 79deb615b1..438177b1a0 100644 --- a/tests/system/large/ml/test_forecasting.py +++ b/tests/system/large/ml/test_forecasting.py @@ -127,8 +127,7 @@ def test_arima_plus_model_fit_params(time_series_df_default_index, dataset_id): assert reloaded_model.horizon == 100 assert reloaded_model.auto_arima is True assert reloaded_model.auto_arima_max_order == 4 - # TODO(garrettwu): now BQML doesn't populate auto_arima_min_order - # assert reloaded_model.auto_arima_min_order == 1 + assert reloaded_model.auto_arima_min_order == 1 assert reloaded_model.data_frequency == "DAILY" assert reloaded_model.holiday_region == "US" assert reloaded_model.clean_spikes_and_dips is False diff --git a/tests/system/large/ml/test_linear_model.py b/tests/system/large/ml/test_linear_model.py index eaf666fd50..0cc9fc5353 100644 --- a/tests/system/large/ml/test_linear_model.py +++ b/tests/system/large/ml/test_linear_model.py @@ -192,8 +192,6 @@ def test_logistic_regression_customized_params_fit_score( f"{dataset_id}.temp_configured_logistic_reg_model" in reloaded_model._bqml_model.model_name ) - # TODO(garrettwu) optimize_strategy isn't logged in BQML - # assert reloaded_model.optimize_strategy == "BATCH_GRADIENT_DESCENT" assert reloaded_model.fit_intercept is False assert reloaded_model.class_weight == "balanced" assert reloaded_model.calculate_p_values is False diff --git a/tests/system/large/test_remote_function.py b/tests/system/large/test_remote_function.py index cce49ea187..6bfc9f0da3 100644 --- a/tests/system/large/test_remote_function.py +++ b/tests/system/large/test_remote_function.py @@ -92,17 +92,14 @@ def make_uniq_udf(udf): target_code = source_code.replace(source_key, target_key, 1) f.write(target_code) spec = importlib.util.spec_from_file_location(udf_file_name, udf_file_path) - # TODO(b/340875260): fix type error - udf_uniq = getattr(spec.loader.load_module(), udf_uniq_name) # type: ignore - - # This is a bit of a hack but we need to remove the reference to a foreign - # module, otherwise the serialization would keep the foreign module - # reference and deserialization would fail with error like following: - # ModuleNotFoundError: No module named 'add_one_2nxcmd9j' - # TODO(shobs): Figure out if there is a better way of generating the unique - # function object, but for now let's just set it to same module as the - # original udf. - udf_uniq.__module__ = udf.__module__ + + assert (spec is not None) and (spec.loader is not None) + module = importlib.util.module_from_spec(spec) + + # exec_module fills the module object with all the functions, classes, and + # variables defined in the module file. 
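This test-helper change swaps the deprecated spec.loader.load_module() for building the module from its spec and executing it explicitly. A generic sketch of that importlib pattern, with placeholder module path and attribute names:

import importlib.util

def load_callable(module_name: str, file_path: str, attr: str):
    spec = importlib.util.spec_from_file_location(module_name, file_path)
    assert spec is not None and spec.loader is not None
    module = importlib.util.module_from_spec(spec)
    # Executes the module body, populating its functions, classes and variables.
    spec.loader.exec_module(module)
    return getattr(module, attr)

# e.g. add_one = load_callable("my_udf_mod", "/tmp/my_udf_mod.py", "add_one")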
+ spec.loader.exec_module(module) + udf_uniq = getattr(module, udf_uniq_name) return udf_uniq, tmpdir diff --git a/tests/system/load/test_large_tables.py b/tests/system/load/test_large_tables.py index cf1c787a58..f92207b191 100644 --- a/tests/system/load/test_large_tables.py +++ b/tests/system/load/test_large_tables.py @@ -75,22 +75,17 @@ def test_index_repr_large_table(): def test_to_pandas_batches_large_table(): - df = bpd.read_gbq("load_testing.scalars_10gb") - # df will be downloaded locally - expected_row_count, expected_column_count = df.shape - - row_count = 0 - # TODO(b/340890167): fix type error - for df in df.to_pandas_batches(): # type: ignore - batch_row_count, batch_column_count = df.shape + df = bpd.read_gbq("load_testing.scalars_1tb") + _, expected_column_count = df.shape + + # download only a few batches, since 1tb would be too much + iterable = df.to_pandas_batches(page_size=500, max_results=1500) + # use page size since client library doesn't support + # streaming only part of the dataframe via bqstorage + for pdf in iterable: + batch_row_count, batch_column_count = pdf.shape assert batch_column_count == expected_column_count - row_count += batch_row_count - - # Attempt to save on memory by manually removing the batch df - # from local memory after finishing with processing. - del df - - assert row_count == expected_row_count + assert batch_row_count > 0 @pytest.mark.skip(reason="See if it caused kokoro build aborted.") diff --git a/tests/system/load/test_llm.py b/tests/system/load/test_llm.py index fd13662275..fd047b3ba6 100644 --- a/tests/system/load/test_llm.py +++ b/tests/system/load/test_llm.py @@ -18,18 +18,6 @@ import bigframes.ml.llm -@pytest.fixture(scope="session") -def llm_fine_tune_df_default_index( - session: bigframes.Session, -) -> bigframes.dataframe.DataFrame: - training_table_name = "llm_tuning.emotion_classification_train" - df = session.read_gbq(training_table_name) - prefix = "Please do sentiment analysis on the following text and only output a number from 0 to 5 where 0 means sadness, 1 means joy, 2 means love, 3 means anger, 4 means fear, and 5 means surprise. 
Text: " - df["prompt"] = prefix + df["text"] - df["label"] = df["label"].astype("string") - return df - - @pytest.fixture(scope="session") def llm_remote_text_pandas_df(): """Additional data matching the penguins dataset, with a new index""" @@ -55,9 +43,8 @@ def test_llm_palm_configure_fit(llm_fine_tune_df_default_index, llm_remote_text_ model_name="text-bison", max_iterations=1 ) - df = llm_fine_tune_df_default_index.dropna().sample(n=100) - X_train = df[["prompt"]] - y_train = df[["label"]] + X_train = llm_fine_tune_df_default_index[["prompt"]] + y_train = llm_fine_tune_df_default_index[["label"]] model.fit(X_train, y_train) assert model is not None @@ -112,3 +99,30 @@ def test_llm_palm_score_params(llm_fine_tune_df_default_index): "evaluation_status", ] assert all(col in score_result_col for col in expected_col) + + +@pytest.mark.flaky(retries=2) +def test_llm_gemini_configure_fit(llm_fine_tune_df_default_index, llm_remote_text_df): + model = bigframes.ml.llm.GeminiTextGenerator( + model_name="gemini-pro", max_iterations=1 + ) + + X_train = llm_fine_tune_df_default_index[["prompt"]] + y_train = llm_fine_tune_df_default_index[["label"]] + model.fit(X_train, y_train) + + assert model is not None + + df = model.predict( + llm_remote_text_df["prompt"], + temperature=0.5, + max_output_tokens=100, + top_k=20, + top_p=0.5, + ).to_pandas() + assert df.shape == (3, 4) + assert "ml_generate_text_llm_result" in df.columns + series = df["ml_generate_text_llm_result"] + assert all(series.str.len() == 1) + + # TODO(ashleyxu b/335492787): After bqml rolled out version control: save, load, check parameters to ensure configuration was kept diff --git a/tests/system/small/bigquery/test_vector_search.py b/tests/system/small/bigquery/test_vector_search.py new file mode 100644 index 0000000000..4280c0a888 --- /dev/null +++ b/tests/system/small/bigquery/test_vector_search.py @@ -0,0 +1,136 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import numpy as np +import pandas as pd + +import bigframes.bigquery as bbq +import bigframes.pandas as bpd + + +def test_vector_search_basic_params_with_df(): + search_query = bpd.DataFrame( + { + "query_id": ["dog", "cat"], + "embedding": [[1.0, 2.0], [3.0, 5.2]], + } + ) + vector_search_result = bbq.vector_search( + base_table="bigframes-dev.bigframes_tests_sys.base_table", + column_to_search="my_embedding", + query=search_query, + top_k=2, + ).to_pandas() # type:ignore + expected = pd.DataFrame( + { + "query_id": ["cat", "dog", "dog", "cat"], + "embedding": [ + np.array([3.0, 5.2]), + np.array([1.0, 2.0]), + np.array([1.0, 2.0]), + np.array([3.0, 5.2]), + ], + "id": [5, 1, 4, 2], + "my_embedding": [ + np.array([5.0, 5.4]), + np.array([1.0, 2.0]), + np.array([1.0, 3.2]), + np.array([2.0, 4.0]), + ], + "distance": [2.009975, 0.0, 1.2, 1.56205], + }, + index=pd.Index([1, 0, 0, 1], dtype="Int64"), + ) + pd.testing.assert_frame_equal( + vector_search_result, expected, check_dtype=False, rtol=0.1 + ) + + +def test_vector_search_different_params_with_query(): + search_query = bpd.Series([[1.0, 2.0], [3.0, 5.2]]) + vector_search_result = bbq.vector_search( + base_table="bigframes-dev.bigframes_tests_sys.base_table", + column_to_search="my_embedding", + query=search_query, + distance_type="cosine", + top_k=2, + ).to_pandas() # type:ignore + expected = pd.DataFrame( + { + "0": [ + np.array([1.0, 2.0]), + np.array([1.0, 2.0]), + np.array([3.0, 5.2]), + np.array([3.0, 5.2]), + ], + "id": [2, 1, 1, 2], + "my_embedding": [ + np.array([2.0, 4.0]), + np.array([1.0, 2.0]), + np.array([1.0, 2.0]), + np.array([2.0, 4.0]), + ], + "distance": [0.0, 0.0, 0.001777, 0.001777], + }, + index=pd.Index([0, 0, 1, 1], dtype="Int64"), + ) + pd.testing.assert_frame_equal( + vector_search_result, expected, check_dtype=False, rtol=0.1 + ) + + +def test_vector_search_df_with_query_column_to_search(): + search_query = bpd.DataFrame( + { + "query_id": ["dog", "cat"], + "embedding": [[1.0, 2.0], [3.0, 5.2]], + "another_embedding": [[1.0, 2.5], [3.3, 5.2]], + } + ) + vector_search_result = bbq.vector_search( + base_table="bigframes-dev.bigframes_tests_sys.base_table", + column_to_search="my_embedding", + query=search_query, + query_column_to_search="another_embedding", + top_k=2, + ).to_pandas() # type:ignore + expected = pd.DataFrame( + { + "query_id": ["dog", "dog", "cat", "cat"], + "embedding": [ + np.array([1.0, 2.0]), + np.array([1.0, 2.0]), + np.array([3.0, 5.2]), + np.array([3.0, 5.2]), + ], + "another_embedding": [ + np.array([1.0, 2.5]), + np.array([1.0, 2.5]), + np.array([3.3, 5.2]), + np.array([3.3, 5.2]), + ], + "id": [1, 4, 2, 5], + "my_embedding": [ + np.array([1.0, 2.0]), + np.array([1.0, 3.2]), + np.array([2.0, 4.0]), + np.array([5.0, 5.4]), + ], + "distance": [0.5, 0.7, 1.769181, 1.711724], + }, + index=pd.Index([0, 0, 1, 1], dtype="Int64"), + ) + pd.testing.assert_frame_equal( + vector_search_result, expected, check_dtype=False, rtol=0.1 + ) diff --git a/tests/system/small/ml/test_llm.py b/tests/system/small/ml/test_llm.py index 20e8dd0c19..36d01e126f 100644 --- a/tests/system/small/ml/test_llm.py +++ b/tests/system/small/ml/test_llm.py @@ -15,6 +15,7 @@ import pytest from bigframes.ml import llm +from tests.system import utils def test_create_text_generator_model( @@ -366,3 +367,48 @@ def test_gemini_text_generator_predict_with_params_success( assert "ml_generate_text_llm_result" in df.columns series = df["ml_generate_text_llm_result"] assert all(series.str.len() > 20) + + 
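The vector_search tests above exercise the new bigframes.bigquery.vector_search helper end to end. For reference, a minimal call-shape sketch using only parameters those tests demonstrate; the table and column names are placeholders and a configured session is assumed:

import bigframes.bigquery as bbq
import bigframes.pandas as bpd

query = bpd.DataFrame({"query_id": ["q1"], "embedding": [[1.0, 2.0]]})
result = bbq.vector_search(
    base_table="my-project.my_dataset.embeddings",  # placeholder table
    column_to_search="my_embedding",
    query=query,
    distance_type="cosine",
    top_k=5,
).to_pandas()
print(result)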
+@pytest.mark.flaky(retries=2) +def test_llm_gemini_pro_score(llm_fine_tune_df_default_index): + model = llm.GeminiTextGenerator(model_name="gemini-pro") + + # Check score to ensure the model was fitted + score_result = model.score( + X=llm_fine_tune_df_default_index[["prompt"]], + y=llm_fine_tune_df_default_index[["label"]], + ).to_pandas() + utils.check_pandas_df_schema_and_index( + score_result, + columns=[ + "bleu4_score", + "rouge-l_precision", + "rouge-l_recall", + "rouge-l_f1_score", + "evaluation_status", + ], + index=1, + ) + + +@pytest.mark.flaky(retries=2) +def test_llm_gemini_pro_score_params(llm_fine_tune_df_default_index): + model = llm.GeminiTextGenerator(model_name="gemini-pro") + + # Check score to ensure the model was fitted + score_result = model.score( + X=llm_fine_tune_df_default_index["prompt"], + y=llm_fine_tune_df_default_index["label"], + task_type="classification", + ).to_pandas() + utils.check_pandas_df_schema_and_index( + score_result, + columns=[ + "precision", + "recall", + "f1_score", + "label", + "evaluation_status", + ], + index=6, + ) diff --git a/tests/system/small/ml/test_metrics.py b/tests/system/small/ml/test_metrics.py index 9aff2fe773..81e1b2f77f 100644 --- a/tests/system/small/ml/test_metrics.py +++ b/tests/system/small/ml/test_metrics.py @@ -219,8 +219,8 @@ def test_roc_curve_binary_classification_prediction_matches_sklearn(session): ) # sklearn returns float64 np arrays - np_fpr = fpr.to_pandas().astype("float64").array - np_tpr = tpr.to_pandas().astype("float64").array + np_fpr = fpr.to_pandas().astype("float64").array.to_numpy() + np_tpr = tpr.to_pandas().astype("float64").array.to_numpy() np_thresholds = thresholds.to_pandas().astype("float64").array np.testing.assert_array_equal( @@ -228,14 +228,12 @@ def test_roc_curve_binary_classification_prediction_matches_sklearn(session): np_thresholds[1:], expected_thresholds[1:], ) - # TODO(b/340872435): fix type error np.testing.assert_array_equal( - np_fpr, # type: ignore + np_fpr, expected_fpr, ) - # TODO(b/340872435): fix type error np.testing.assert_array_equal( - np_tpr, # type: ignore + np_tpr, expected_tpr, ) @@ -316,8 +314,8 @@ def test_roc_curve_binary_classification_decision_matches_sklearn(session): ) # sklearn returns float64 np arrays - np_fpr = fpr.to_pandas().astype("float64").array - np_tpr = tpr.to_pandas().astype("float64").array + np_fpr = fpr.to_pandas().astype("float64").array.to_numpy() + np_tpr = tpr.to_pandas().astype("float64").array.to_numpy() np_thresholds = thresholds.to_pandas().astype("float64").array np.testing.assert_array_equal( @@ -325,14 +323,12 @@ def test_roc_curve_binary_classification_decision_matches_sklearn(session): np_thresholds[1:], expected_thresholds[1:], ) - # TODO(b/340872435): fix type error np.testing.assert_array_equal( - np_fpr, # type: ignore + np_fpr, expected_fpr, ) - # TODO(b/340872435): fix type error np.testing.assert_array_equal( - np_tpr, # type: ignore + np_tpr, expected_tpr, ) @@ -519,14 +515,10 @@ def test_confusion_matrix_column_index(session): ).astype("Int64") df = session.read_pandas(pd_df) confusion_matrix = metrics.confusion_matrix(df[["y_true"]], df[["y_pred"]]) - # TODO(b/340872435): fix type error - expected_pd_df = ( - pd.DataFrame( # type: ignore - {1: [1, 0, 1, 0], 2: [0, 0, 2, 0], 3: [0, 0, 0, 0], 4: [0, 1, 0, 1]} - ) - .astype("int64") - .set_index([pd.Index([1, 2, 3, 4])]) - ) + expected_pd_df = pd.DataFrame( + {1: [1, 0, 1, 0], 2: [0, 0, 2, 0], 3: [0, 0, 0, 0], 4: [0, 1, 0, 1]}, + index=[1, 2, 3, 4], + ).astype("int64") 
pd.testing.assert_frame_equal( confusion_matrix, expected_pd_df, check_index_type=False ) @@ -562,9 +554,8 @@ def test_confusion_matrix_str_matches_sklearn(session): expected_confusion_matrix = sklearn_metrics.confusion_matrix( pd_df[["y_true"]], pd_df[["y_pred"]] ) - # TODO(b/340872435): fix type error - expected_pd_df = pd.DataFrame(expected_confusion_matrix).set_index( # type: ignore - [pd.Index(["ant", "bird", "cat"])] + expected_pd_df = pd.DataFrame( + expected_confusion_matrix, index=["ant", "bird", "cat"] ) expected_pd_df.columns = pd.Index(["ant", "bird", "cat"]) pd.testing.assert_frame_equal( @@ -601,9 +592,8 @@ def test_recall_score(session): } ).astype("Int64") df = session.read_pandas(pd_df) - # TODO(b/340872435): fix type error recall = metrics.recall_score( - df[["y_true_arbitrary_name"]], df[["y_pred_arbitrary_name"]], average=None # type: ignore + df[["y_true_arbitrary_name"]], df[["y_pred_arbitrary_name"]], average=None ) expected_values = [1.000000, 0.000000, 0.666667] expected_index = [0, 1, 2] @@ -620,8 +610,7 @@ def test_recall_score_matches_sklearn(session): } ).astype("Int64") df = session.read_pandas(pd_df) - # TODO(b/340872435): fix type error - recall = metrics.recall_score(df[["y_true"]], df[["y_pred"]], average=None) # type: ignore + recall = metrics.recall_score(df[["y_true"]], df[["y_pred"]], average=None) expected_values = sklearn_metrics.recall_score( pd_df[["y_true"]], pd_df[["y_pred"]], average=None ) @@ -638,8 +627,7 @@ def test_recall_score_str_matches_sklearn(session): } ).astype("str") df = session.read_pandas(pd_df) - # TODO(b/340872435): fix type error - recall = metrics.recall_score(df[["y_true"]], df[["y_pred"]], average=None) # type: ignore + recall = metrics.recall_score(df[["y_true"]], df[["y_pred"]], average=None) expected_values = sklearn_metrics.recall_score( pd_df[["y_true"]], pd_df[["y_pred"]], average=None ) @@ -656,8 +644,7 @@ def test_recall_score_series(session): } ).astype("Int64") df = session.read_pandas(pd_df) - # TODO(b/340872435): fix type error - recall = metrics.recall_score(df["y_true"], df["y_pred"], average=None) # type: ignore + recall = metrics.recall_score(df["y_true"], df["y_pred"], average=None) expected_values = [1.000000, 0.000000, 0.666667] expected_index = [0, 1, 2] expected_recall = pd.Series(expected_values, index=expected_index) @@ -673,9 +660,8 @@ def test_precision_score(session): } ).astype("Int64") df = session.read_pandas(pd_df) - # TODO(b/340872435): fix type error precision_score = metrics.precision_score( - df[["y_true_arbitrary_name"]], df[["y_pred_arbitrary_name"]], average=None # type: ignore + df[["y_true_arbitrary_name"]], df[["y_pred_arbitrary_name"]], average=None ) expected_values = [0.666667, 0.000000, 0.666667] expected_index = [0, 1, 2] @@ -696,7 +682,7 @@ def test_precision_score_matches_sklearn(session): df = session.read_pandas(pd_df) # TODO(b/340872435): fix type error precision_score = metrics.precision_score( - df[["y_true"]], df[["y_pred"]], average=None # type: ignore + df[["y_true"]], df[["y_pred"]], average=None ) expected_values = sklearn_metrics.precision_score( pd_df[["y_true"]], pd_df[["y_pred"]], average=None @@ -716,9 +702,8 @@ def test_precision_score_str_matches_sklearn(session): } ).astype("str") df = session.read_pandas(pd_df) - # TODO(b/340872435): fix type error precision_score = metrics.precision_score( - df[["y_true"]], df[["y_pred"]], average=None # type: ignore + df[["y_true"]], df[["y_pred"]], average=None ) expected_values = sklearn_metrics.precision_score( 
pd_df[["y_true"]], pd_df[["y_pred"]], average=None @@ -738,8 +723,7 @@ def test_precision_score_series(session): } ).astype("Int64") df = session.read_pandas(pd_df) - # TODO(b/340872435): fix type error - precision_score = metrics.precision_score(df["y_true"], df["y_pred"], average=None) # type: ignore + precision_score = metrics.precision_score(df["y_true"], df["y_pred"], average=None) expected_values = [0.666667, 0.000000, 0.666667] expected_index = [0, 1, 2] expected_precision = pd.Series(expected_values, index=expected_index) @@ -757,9 +741,8 @@ def test_f1_score(session): } ).astype("Int64") df = session.read_pandas(pd_df) - # TODO(b/340872435): fix type error f1_score = metrics.f1_score( - df[["y_true_arbitrary_name"]], df[["y_pred_arbitrary_name"]], average=None # type: ignore + df[["y_true_arbitrary_name"]], df[["y_pred_arbitrary_name"]], average=None ) expected_values = [0.8, 0.000000, 0.666667] expected_index = [0, 1, 2] @@ -776,8 +759,7 @@ def test_f1_score_matches_sklearn(session): } ).astype("Int64") df = session.read_pandas(pd_df) - # TODO(b/340872435): fix type error - f1_score = metrics.f1_score(df[["y_true"]], df[["y_pred"]], average=None) # type: ignore + f1_score = metrics.f1_score(df[["y_true"]], df[["y_pred"]], average=None) expected_values = sklearn_metrics.f1_score( pd_df[["y_true"]], pd_df[["y_pred"]], average=None ) @@ -794,8 +776,7 @@ def test_f1_score_str_matches_sklearn(session): } ).astype("str") df = session.read_pandas(pd_df) - # TODO(b/340872435): fix type error - f1_score = metrics.f1_score(df[["y_true"]], df[["y_pred"]], average=None) # type: ignore + f1_score = metrics.f1_score(df[["y_true"]], df[["y_pred"]], average=None) expected_values = sklearn_metrics.f1_score( pd_df[["y_true"]], pd_df[["y_pred"]], average=None ) @@ -812,8 +793,7 @@ def test_f1_score_series(session): } ).astype("Int64") df = session.read_pandas(pd_df) - # TODO(b/340872435): fix type error - f1_score = metrics.f1_score(df["y_true"], df["y_pred"], average=None) # type: ignore + f1_score = metrics.f1_score(df["y_true"], df["y_pred"], average=None) expected_values = [0.8, 0.000000, 0.666667] expected_index = [0, 1, 2] expected_f1 = pd.Series(expected_values, index=expected_index) diff --git a/tests/system/small/ml/test_model_selection.py b/tests/system/small/ml/test_model_selection.py index ca14186a4d..63d0840d29 100644 --- a/tests/system/small/ml/test_model_selection.py +++ b/tests/system/small/ml/test_model_selection.py @@ -130,17 +130,12 @@ def test_train_test_split_seeded_correct_rows( X, y, random_state=42 ) - # TODO(b/340876926): fix type error - X_train = X_train.to_pandas().sort_index() # type: ignore - # TODO(b/340876926): fix type error - X_test = X_test.to_pandas().sort_index() # type: ignore - # TODO(b/340876926): fix type error - y_train = y_train.to_pandas().sort_index() # type: ignore - # TODO(b/340876926): fix type error - y_test = y_test.to_pandas().sort_index() # type: ignore - - # TODO(b/340876926): fix type error - train_index = pd.Index( # type: ignore + X_train_sorted = X_train.to_pandas().sort_index() + X_test_sorted = X_test.to_pandas().sort_index() + y_train_sorted = y_train.to_pandas().sort_index() + y_test_sorted = y_test.to_pandas().sort_index() + + train_index: pd.Index = pd.Index( [ 144, 146, @@ -167,15 +162,20 @@ def test_train_test_split_seeded_correct_rows( dtype="Int64", name="rowindex", ) - # TODO(b/340876926): fix type error - test_index = pd.Index( # type: ignore + test_index: pd.Index = pd.Index( [148, 161, 226, 269, 278, 289, 291], dtype="Int64", 
name="rowindex" ) all_data.index.name = "_" - # TODO(b/340876926): fix type error + + assert ( + isinstance(X_train_sorted, pd.DataFrame) + and isinstance(X_test_sorted, pd.DataFrame) + and isinstance(y_train_sorted, pd.DataFrame) + and isinstance(y_test_sorted, pd.DataFrame) + ) pd.testing.assert_frame_equal( - X_train, # type: ignore + X_train_sorted, all_data[ [ "species", @@ -184,9 +184,8 @@ def test_train_test_split_seeded_correct_rows( ] ].loc[train_index], ) - # TODO(b/340876926): fix type error pd.testing.assert_frame_equal( - X_test, # type: ignore + X_test_sorted, all_data[ [ "species", @@ -195,18 +194,16 @@ def test_train_test_split_seeded_correct_rows( ] ].loc[test_index], ) - # TODO(b/340876926): fix type error pd.testing.assert_frame_equal( - y_train, # type: ignore + y_train_sorted, all_data[ [ "body_mass_g", ] ].loc[train_index], ) - # TODO(b/340876926): fix type error pd.testing.assert_frame_equal( - y_test, # type: ignore + y_test_sorted, all_data[ [ "body_mass_g", diff --git a/tests/system/small/operations/test_datetimes.py b/tests/system/small/operations/test_datetimes.py index 838bc11108..c5c649c638 100644 --- a/tests/system/small/operations/test_datetimes.py +++ b/tests/system/small/operations/test_datetimes.py @@ -310,9 +310,8 @@ def test_dt_floor(scalars_dfs, col_name, freq): def test_dt_compare_coerce_str_datetime(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs bf_series: bigframes.series.Series = scalars_df["datetime_col"] - # TODO(b/340878286): fix type error - bf_result = (bf_series >= "2024-01-01").to_pandas() # type: ignore + bf_result = (bf_series >= "2024-01-01").to_pandas() pd_result = scalars_pandas_df["datetime_col"] >= pd.to_datetime("2024-01-01") # pandas produces pyarrow bool dtype diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index eae25bb027..d5854bd8d0 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -17,7 +17,7 @@ import sys import tempfile import typing -from typing import Tuple +from typing import Dict, List, Tuple import geopandas as gpd # type: ignore import numpy as np @@ -146,9 +146,9 @@ def test_df_construct_inline_respects_location(): with bpd.option_context("bigquery.location", "europe-west1"): df = bpd.DataFrame([[1, 2, 3], [4, 5, 6]]) repr(df) + assert df.query_job is not None + table = bpd.get_global_session().bqclient.get_table(df.query_job.destination) - # TODO(b/340876936): fix type error - table = bpd.get_global_session().bqclient.get_table(df.query_job.destination) # type: ignore assert table.location == "europe-west1" @@ -753,10 +753,9 @@ def test_assign_listlike_to_empty_df(session): def test_assign_to_empty_df_multiindex_error(session): empty_df = dataframe.DataFrame(session=session) empty_pandas_df = pd.DataFrame() - # TODO(b/340876936): fix type error - empty_df["empty_col_1"] = [] # type: ignore - # TODO(b/340876936): fix type error - empty_df["empty_col_2"] = [] # type: ignore + + empty_df["empty_col_1"] = typing.cast(series.Series, []) + empty_df["empty_col_2"] = typing.cast(series.Series, []) empty_pandas_df["empty_col_1"] = [] empty_pandas_df["empty_col_2"] = [] empty_df = empty_df.set_index(["empty_col_1", "empty_col_2"]) @@ -1340,40 +1339,25 @@ def test_merge_left_on_right_on(scalars_dfs, merge_how): def test_get_dtypes(scalars_df_default_index): dtypes = scalars_df_default_index.dtypes + dtypes_dict: Dict[str, bigframes.dtypes.Dtype] = { + "bool_col": pd.BooleanDtype(), + "bytes_col": pd.ArrowDtype(pa.binary()), + 
"date_col": pd.ArrowDtype(pa.date32()), + "datetime_col": pd.ArrowDtype(pa.timestamp("us")), + "geography_col": gpd.array.GeometryDtype(), + "int64_col": pd.Int64Dtype(), + "int64_too": pd.Int64Dtype(), + "numeric_col": pd.ArrowDtype(pa.decimal128(38, 9)), + "float64_col": pd.Float64Dtype(), + "rowindex": pd.Int64Dtype(), + "rowindex_2": pd.Int64Dtype(), + "string_col": pd.StringDtype(storage="pyarrow"), + "time_col": pd.ArrowDtype(pa.time64("us")), + "timestamp_col": pd.ArrowDtype(pa.timestamp("us", tz="UTC")), + } pd.testing.assert_series_equal( dtypes, - pd.Series( - { - # TODO(b/340876936): fix type error - "bool_col": pd.BooleanDtype(), # type: ignore - # TODO(b/340876936): fix type error - "bytes_col": pd.ArrowDtype(pa.binary()), # type: ignore - # TODO(b/340876936): fix type error - "date_col": pd.ArrowDtype(pa.date32()), # type: ignore - # TODO(b/340876936): fix type error - "datetime_col": pd.ArrowDtype(pa.timestamp("us")), # type: ignore - # TODO(b/340876936): fix type error - "geography_col": gpd.array.GeometryDtype(), # type: ignore - # TODO(b/340876936): fix type error - "int64_col": pd.Int64Dtype(), # type: ignore - # TODO(b/340876936): fix type error - "int64_too": pd.Int64Dtype(), # type: ignore - # TODO(b/340876936): fix type error - "numeric_col": pd.ArrowDtype(pa.decimal128(38, 9)), # type: ignore - # TODO(b/340876936): fix type error - "float64_col": pd.Float64Dtype(), # type: ignore - # TODO(b/340876936): fix type error - "rowindex": pd.Int64Dtype(), # type: ignore - # TODO(b/340876936): fix type error - "rowindex_2": pd.Int64Dtype(), # type: ignore - # TODO(b/340876936): fix type error - "string_col": pd.StringDtype(storage="pyarrow"), # type: ignore - # TODO(b/340876936): fix type error - "time_col": pd.ArrowDtype(pa.time64("us")), # type: ignore - # TODO(b/340876936): fix type error - "timestamp_col": pd.ArrowDtype(pa.timestamp("us", tz="UTC")), # type: ignore - } - ), + pd.Series(dtypes_dict), ) @@ -1828,10 +1812,9 @@ def test_df_update(overwrite, filter_func): if pd.__version__.startswith("1."): pytest.skip("dtype handled differently in pandas 1.x.") - # TODO(b/340876936): fix type error - index1 = pandas.Index([1, 2, 3, 4], dtype="Int64") # type: ignore - # TODO(b/340876936): fix type error - index2 = pandas.Index([1, 2, 4, 5], dtype="Int64") # type: ignore + index1: pandas.Index = pandas.Index([1, 2, 3, 4], dtype="Int64") + + index2: pandas.Index = pandas.Index([1, 2, 4, 5], dtype="Int64") pd_df1 = pandas.DataFrame( {"a": [1, None, 3, 4], "b": [5, 6, None, 8]}, dtype="Int64", index=index1 ) @@ -1891,10 +1874,10 @@ def test_df_idxmax(): ], ) def test_df_align(join, axis): - # TODO(b/340876936): fix type error - index1 = pandas.Index([1, 2, 3, 4], dtype="Int64") # type: ignore - # TODO(b/340876936): fix type error - index2 = pandas.Index([1, 2, 4, 5], dtype="Int64") # type: ignore + + index1: pandas.Index = pandas.Index([1, 2, 3, 4], dtype="Int64") + + index2: pandas.Index = pandas.Index([1, 2, 4, 5], dtype="Int64") pd_df1 = pandas.DataFrame( {"a": [1, None, 3, 4], "b": [5, 6, None, 8]}, dtype="Int64", index=index1 ) @@ -1911,10 +1894,11 @@ def test_df_align(join, axis): pd_result1, pd_result2 = pd_df1.align(pd_df2, join=join, axis=axis) # Don't check dtype as pandas does unnecessary float conversion - # TODO(b/340876936): fix type error - pd.testing.assert_frame_equal(bf_result1.to_pandas(), pd_result1, check_dtype=False) # type: ignore - # TODO(b/340876936): fix type error - pd.testing.assert_frame_equal(bf_result2.to_pandas(), pd_result2, check_dtype=False) # 
type: ignore + assert isinstance(bf_result1, dataframe.DataFrame) and isinstance( + bf_result2, dataframe.DataFrame + ) + pd.testing.assert_frame_equal(bf_result1.to_pandas(), pd_result1, check_dtype=False) + pd.testing.assert_frame_equal(bf_result2.to_pandas(), pd_result2, check_dtype=False) def test_combine_first( @@ -2568,11 +2552,15 @@ def test_df_transpose(): # Include some floats to ensure type coercion values = [[0, 3.5, True], [1, 4.5, False], [2, 6.5, None]] # Test complex case of both axes being multi-indices with non-unique elements - # TODO(b/340876936): fix type error - columns = pd.Index(["A", "B", "A"], dtype=pd.StringDtype(storage="pyarrow")) # type: ignore + + columns: pandas.Index = pd.Index( + ["A", "B", "A"], dtype=pd.StringDtype(storage="pyarrow") + ) columns_multi = pd.MultiIndex.from_arrays([columns, columns], names=["c1", "c2"]) - # TODO(b/340876936): fix type error - index = pd.Index(["b", "a", "a"], dtype=pd.StringDtype(storage="pyarrow")) # type: ignore + + index: pandas.Index = pd.Index( + ["b", "a", "a"], dtype=pd.StringDtype(storage="pyarrow") + ) rows_multi = pd.MultiIndex.from_arrays([index, index], names=["r1", "r2"]) pd_df = pandas.DataFrame(values, index=rows_multi, columns=columns_multi) @@ -3124,9 +3112,9 @@ def test_dataframe_aggregate_int(scalars_df_index, scalars_pandas_df_index, col, # Check dtype separately assert bf_result.dtype == "Int64" - + # Is otherwise "object" dtype + pd_result.index = pd_result.index.astype("string[pyarrow]") # Pandas may produce narrower numeric types - # Pandas has object index type assert_series_equal(pd_result, bf_result, check_dtype=False, check_index_type=False) @@ -3146,6 +3134,7 @@ def test_dataframe_aggregate_bool(scalars_df_index, scalars_pandas_df_index, col # Pandas may produce narrower numeric types # Pandas has object index type + pd_result.index = pd_result.index.astype("string[pyarrow]") assert_series_equal(pd_result, bf_result, check_dtype=False, check_index_type=False) @@ -3183,6 +3172,7 @@ def test_dataframe_aggregates( # Pandas may produce narrower numeric types, but bigframes always produces Float64 # Pandas has object index type + pd_result.index = pd_result.index.astype("string[pyarrow]") assert_series_equal( pd_result, bf_result, @@ -3740,10 +3730,9 @@ def test_df_setattr_index(): [[1, 1, 1], [1, 1, 1]], columns=["index", "columns", "my_column"] ) bf_df = dataframe.DataFrame(pd_df) - # TODO(b/340876936): fix type error - pd_df.index = [4, 5] # type: ignore - # TODO(b/340876936): fix type error - bf_df.index = [4, 5] # type: ignore + + pd_df.index = pandas.Index([4, 5]) + bf_df.index = [4, 5] assert_pandas_df_equal( pd_df, bf_df.to_pandas(), check_index_type=False, check_dtype=False @@ -3755,10 +3744,10 @@ def test_df_setattr_columns(): [[1, 1, 1], [1, 1, 1]], columns=["index", "columns", "my_column"] ) bf_df = dataframe.DataFrame(pd_df) - # TODO(b/340876936): fix type error - pd_df.columns = [4, 5, 6] # type: ignore - # TODO(b/340876936): fix type error - bf_df.columns = [4, 5, 6] # type: ignore + + pd_df.columns = typing.cast(pandas.Index, pandas.Index([4, 5, 6])) + + bf_df.columns = pandas.Index([4, 5, 6]) assert_pandas_df_equal( pd_df, bf_df.to_pandas(), check_index_type=False, check_dtype=False @@ -3852,8 +3841,8 @@ def test_iloc_list_multiindex(scalars_dfs): def test_iloc_empty_list(scalars_df_index, scalars_pandas_df_index): - # TODO(b/340876936): fix type error - index_list = [] # type: ignore + + index_list: List[int] = [] bf_result = scalars_df_index.iloc[index_list] pd_result = 
scalars_pandas_df_index.iloc[index_list] diff --git a/tests/system/small/test_encryption.py b/tests/system/small/test_encryption.py index 088211d7fc..428a6a28bf 100644 --- a/tests/system/small/test_encryption.py +++ b/tests/system/small/test_encryption.py @@ -242,14 +242,12 @@ def test_bqml(bq_cmek, session_with_bq_cmek, penguins_table_id): model.fit(X_train, y_train) assert model is not None - # TODO(b/340879287): fix type error - assert model._bqml_model.model.encryption_configuration is not None # type: ignore - # TODO(b/340879287): fix type error - assert model._bqml_model.model.encryption_configuration.kms_key_name == bq_cmek # type: ignore + assert model._bqml_model is not None + assert model._bqml_model.model.encryption_configuration is not None + assert model._bqml_model.model.encryption_configuration.kms_key_name == bq_cmek # Assert that model exists in BQ with intended encryption - # TODO(b/340879287): fix type error - model_bq = session_with_bq_cmek.bqclient.get_model(model._bqml_model.model_name) # type: ignore + model_bq = session_with_bq_cmek.bqclient.get_model(model._bqml_model.model_name) assert model_bq.encryption_configuration.kms_key_name == bq_cmek # Explicitly save the model to a destination and assert that encryption holds @@ -260,12 +258,12 @@ def test_bqml(bq_cmek, session_with_bq_cmek, penguins_table_id): f"{model_ref.project}.{model_ref.dataset_id}.{model_ref.model_id}" ) new_model = model.to_gbq(model_ref_full_name) - # TODO(b/340879287): fix type error - assert new_model._bqml_model.model.encryption_configuration.kms_key_name == bq_cmek # type: ignore + assert new_model._bqml_model is not None + assert new_model._bqml_model.model.encryption_configuration is not None + assert new_model._bqml_model.model.encryption_configuration.kms_key_name == bq_cmek # Assert that model exists in BQ with intended encryption - # TODO(b/340879287): fix type error - model_bq = session_with_bq_cmek.bqclient.get_model(new_model._bqml_model.model_name) # type: ignore + model_bq = session_with_bq_cmek.bqclient.get_model(new_model._bqml_model.model_name) assert model_bq.encryption_configuration.kms_key_name == bq_cmek # Assert that model registration keeps the encryption @@ -279,11 +277,11 @@ def test_bqml(bq_cmek, session_with_bq_cmek, penguins_table_id): # https://cloud.google.com/vertex-ai/docs/general/cmek#create_resources_with_the_kms_key. # bigframes.ml does not provide any API for the model deployment. 
model_registered = new_model.register() - # TODO(b/340879287): fix type error + assert model_registered._bqml_model is not None + assert model_registered._bqml_model.model.encryption_configuration is not None assert ( - model_registered._bqml_model.model.encryption_configuration.kms_key_name # type: ignore + model_registered._bqml_model.model.encryption_configuration.kms_key_name == bq_cmek ) - # TODO(b/340879287): fix type error - model_bq = session_with_bq_cmek.bqclient.get_model(new_model._bqml_model.model_name) # type: ignore + model_bq = session_with_bq_cmek.bqclient.get_model(new_model._bqml_model.model_name) assert model_bq.encryption_configuration.kms_key_name == bq_cmek diff --git a/tests/system/small/test_index.py b/tests/system/small/test_index.py index 58fd346bc1..d68cf6c3f3 100644 --- a/tests/system/small/test_index.py +++ b/tests/system/small/test_index.py @@ -24,20 +24,17 @@ def test_index_construct_from_list(): bf_result = bpd.Index( [3, 14, 159], dtype=pd.Int64Dtype(), name="my_index" ).to_pandas() - # TODO(b/340878489): fix type error - pd_result = pd.Index([3, 14, 159], dtype=pd.Int64Dtype(), name="my_index") # type: ignore + pd_result: pd.Index = pd.Index([3, 14, 159], dtype=pd.Int64Dtype(), name="my_index") pd.testing.assert_index_equal(bf_result, pd_result) def test_index_construct_from_series(): - # TODO(b/340878489): fix type error bf_result = bpd.Index( bpd.Series([3, 14, 159], dtype=pd.Float64Dtype(), name="series_name"), name="index_name", dtype=pd.Int64Dtype(), - ).to_pandas() # type: ignore - # TODO(b/340878489): fix type error - pd_result = pd.Index( # type: ignore + ).to_pandas() + pd_result: pd.Index = pd.Index( pd.Series([3, 14, 159], dtype=pd.Float64Dtype(), name="series_name"), name="index_name", dtype=pd.Int64Dtype(), @@ -49,14 +46,15 @@ def test_index_construct_from_index(): bf_index_input = bpd.Index( [3, 14, 159], dtype=pd.Float64Dtype(), name="series_name" ) - # TODO(b/340878489): fix type error bf_result = bpd.Index( - bf_index_input, dtype=pd.Int64Dtype(), name="index_name" # type: ignore + bf_index_input, dtype=pd.Int64Dtype(), name="index_name" ).to_pandas() - # TODO(b/340878489): fix type error - pd_index_input = pd.Index([3, 14, 159], dtype=pd.Float64Dtype(), name="series_name") # type: ignore - # TODO(b/340878489): fix type error - pd_result = pd.Index(pd_index_input, dtype=pd.Int64Dtype(), name="index_name") # type: ignore + pd_index_input: pd.Index = pd.Index( + [3, 14, 159], dtype=pd.Float64Dtype(), name="series_name" + ) + pd_result: pd.Index = pd.Index( + pd_index_input, dtype=pd.Int64Dtype(), name="index_name" + ) pd.testing.assert_index_equal(bf_result, pd_result) @@ -365,17 +363,16 @@ def test_index_drop_duplicates(scalars_df_index, scalars_pandas_df_index, keep): def test_index_isin(scalars_df_index, scalars_pandas_df_index): + col_name = "int64_col" bf_series = ( - scalars_df_index.set_index("int64_col").index.isin([2, 55555, 4]).to_pandas() + scalars_df_index.set_index(col_name).index.isin([2, 55555, 4]).to_pandas() ) - pd_result_array = scalars_pandas_df_index.set_index("int64_col").index.isin( + pd_result_array = scalars_pandas_df_index.set_index(col_name).index.isin( [2, 55555, 4] ) - # TODO(b/340878489): fix type error - pd.testing.assert_index_equal( # type: ignore - pd.Index(pd_result_array), + pd.testing.assert_index_equal( + pd.Index(pd_result_array).set_names(col_name), bf_series, - check_names=False, ) diff --git a/tests/system/small/test_pandas.py b/tests/system/small/test_pandas.py index 256046f8b1..30ffaa8a7d 100644 --- 
a/tests/system/small/test_pandas.py +++ b/tests/system/small/test_pandas.py @@ -394,12 +394,8 @@ def test_cut(scalars_dfs): # make sure the result is a supported dtype assert bf_result.dtype == bpd.Int64Dtype() - - # TODO(b/340884971): fix type error - bf_result = bf_result.to_pandas() # type: ignore pd_result = pd_result.astype("Int64") - # TODO(b/340884971): fix type error - pd.testing.assert_series_equal(bf_result, pd_result) # type: ignore + pd.testing.assert_series_equal(bf_result.to_pandas(), pd_result) def test_cut_default_labels(scalars_dfs): @@ -529,13 +525,9 @@ def test_qcut(scalars_dfs, q): scalars_pandas_df["float64_col"], q, labels=False, duplicates="drop" ) bf_result = bpd.qcut(scalars_df["float64_col"], q, labels=False, duplicates="drop") - - # TODO(b/340884971): fix type error - bf_result = bf_result.to_pandas() # type: ignore pd_result = pd_result.astype("Int64") - # TODO(b/340884971): fix type error - pd.testing.assert_series_equal(bf_result, pd_result) # type: ignore + pd.testing.assert_series_equal(bf_result.to_pandas(), pd_result) @pytest.mark.parametrize( @@ -572,9 +564,8 @@ def test_to_datetime_scalar(arg, utc, unit, format): ], ) def test_to_datetime_iterable(arg, utc, unit, format): - # TODO(b/340884971): fix type error bf_result = ( - bpd.to_datetime(arg, utc=utc, unit=unit, format=format) # type: ignore + bpd.to_datetime(arg, utc=utc, unit=unit, format=format) .to_pandas() .astype("datetime64[ns, UTC]" if utc else "datetime64[ns]") ) @@ -589,9 +580,8 @@ def test_to_datetime_iterable(arg, utc, unit, format): def test_to_datetime_series(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs col = "int64_too" - # TODO(b/340884971): fix type error bf_result = ( - bpd.to_datetime(scalars_df[col], unit="s").to_pandas().astype("datetime64[s]") # type: ignore + bpd.to_datetime(scalars_df[col], unit="s").to_pandas().astype("datetime64[s]") ) pd_result = pd.Series(pd.to_datetime(scalars_pandas_df[col], unit="s")) pd.testing.assert_series_equal( @@ -614,8 +604,7 @@ def test_to_datetime_series(scalars_dfs): ], ) def test_to_datetime_unit_param(arg, unit): - # TODO(b/340884971): fix type error - bf_result = bpd.to_datetime(arg, unit=unit).to_pandas().astype("datetime64[ns]") # type: ignore + bf_result = bpd.to_datetime(arg, unit=unit).to_pandas().astype("datetime64[ns]") pd_result = pd.Series(pd.to_datetime(arg, unit=unit)).dt.floor("us") pd.testing.assert_series_equal( bf_result, pd_result, check_index_type=False, check_names=False @@ -632,9 +621,8 @@ def test_to_datetime_unit_param(arg, unit): ], ) def test_to_datetime_format_param(arg, utc, format): - # TODO(b/340884971): fix type error bf_result = ( - bpd.to_datetime(arg, utc=utc, format=format) # type: ignore + bpd.to_datetime(arg, utc=utc, format=format) .to_pandas() .astype("datetime64[ns, UTC]" if utc else "datetime64[ns]") ) @@ -686,9 +674,8 @@ def test_to_datetime_format_param(arg, utc, format): ], ) def test_to_datetime_string_inputs(arg, utc, output_in_utc, format): - # TODO(b/340884971): fix type error bf_result = ( - bpd.to_datetime(arg, utc=utc, format=format) # type: ignore + bpd.to_datetime(arg, utc=utc, format=format) .to_pandas() .astype("datetime64[ns, UTC]" if output_in_utc else "datetime64[ns]") ) @@ -730,9 +717,8 @@ def test_to_datetime_string_inputs(arg, utc, output_in_utc, format): ], ) def test_to_datetime_timestamp_inputs(arg, utc, output_in_utc): - # TODO(b/340884971): fix type error bf_result = ( - bpd.to_datetime(arg, utc=utc) # type: ignore + bpd.to_datetime(arg, utc=utc) .to_pandas() 
.astype("datetime64[ns, UTC]" if output_in_utc else "datetime64[ns]") ) diff --git a/tests/system/small/test_remote_function.py b/tests/system/small/test_remote_function.py index d2ee4411f4..d84d520988 100644 --- a/tests/system/small/test_remote_function.py +++ b/tests/system/small/test_remote_function.py @@ -17,6 +17,7 @@ import google.api_core.exceptions from google.cloud import bigquery import pandas as pd +import pyarrow import pytest import bigframes @@ -80,7 +81,7 @@ def session_with_bq_connection( bq_cf_connection, dataset_id_permanent ) -> bigframes.Session: session = bigframes.Session( - bigframes.BigQueryOptions(bq_connection=bq_cf_connection) + bigframes.BigQueryOptions(bq_connection=bq_cf_connection, location="US") ) return session @@ -484,17 +485,27 @@ def add_one(x): @pytest.mark.flaky(retries=2, delay=120) -def test_series_map(session_with_bq_connection, scalars_dfs): - def add_one(x): - return x + 1 +def test_series_map_bytes(session_with_bq_connection, scalars_dfs): + """Check that bytes is support as input and output.""" + scalars_df, scalars_pandas_df = scalars_dfs - remote_add_one = session_with_bq_connection.remote_function([int], int)(add_one) + def bytes_to_hex(mybytes: bytes) -> bytes: + import pandas - scalars_df, scalars_pandas_df = scalars_dfs + return mybytes.hex().encode("utf-8") if pandas.notna(mybytes) else None # type: ignore - bf_result = scalars_df.int64_too.map(remote_add_one).to_pandas() - pd_result = scalars_pandas_df.int64_too.map(add_one) - pd_result = pd_result.astype("Int64") # pandas type differences + # TODO(b/345516010): the type: ignore is because "Optional" not yet + # supported as a type annotation in @remote_function(). + assert bytes_to_hex(None) is None # type: ignore + assert bytes_to_hex(b"\x00\xdd\xba\x11") == b"00ddba11" + pd_result = scalars_pandas_df.bytes_col.map(bytes_to_hex).astype( + pd.ArrowDtype(pyarrow.binary()) + ) + + remote_bytes_to_hex = session_with_bq_connection.remote_function( + packages=["pandas"] + )(bytes_to_hex) + bf_result = scalars_df.bytes_col.map(remote_bytes_to_hex).to_pandas() pd.testing.assert_series_equal( bf_result, @@ -537,12 +548,12 @@ def add_one(x): @pytest.mark.flaky(retries=2, delay=120) -def test_read_gbq_function_detects_invalid_function(bigquery_client, dataset_id): +def test_read_gbq_function_detects_invalid_function(session, dataset_id): dataset_ref = bigquery.DatasetReference.from_string(dataset_id) with pytest.raises(ValueError) as e: rf.read_gbq_function( str(dataset_ref.routine("not_a_function")), - bigquery_client=bigquery_client, + session=session, ) assert "Unknown function" in str(e.value) @@ -550,6 +561,7 @@ def test_read_gbq_function_detects_invalid_function(bigquery_client, dataset_id) @pytest.mark.flaky(retries=2, delay=120) def test_read_gbq_function_like_original( + session, bigquery_client, bigqueryconnection_client, cloudfunctions_client, @@ -577,7 +589,7 @@ def square1(x): square2 = rf.read_gbq_function( function_name=square1.bigframes_remote_function, - bigquery_client=bigquery_client, + session=session, ) # The newly-created function (square1) should have a remote function AND a @@ -607,7 +619,14 @@ def square1(x): @pytest.mark.flaky(retries=2, delay=120) -def test_read_gbq_function_reads_udfs(bigquery_client, dataset_id): +def test_read_gbq_function_runs_existing_udf(session, bigquery_client, dataset_id): + func = session.read_gbq_function("bqutil.fn.cw_lower_case_ascii_only") + got = func("AURÉLIE") + assert got == "aurÉlie" + + +@pytest.mark.flaky(retries=2, delay=120) +def 
test_read_gbq_function_reads_udfs(session, bigquery_client, dataset_id): dataset_ref = bigquery.DatasetReference.from_string(dataset_id) arg = bigquery.RoutineArgument( name="x", @@ -633,7 +652,8 @@ def test_read_gbq_function_reads_udfs(bigquery_client, dataset_id): # Create the routine in BigQuery and read it back using read_gbq_function. bigquery_client.create_routine(routine, exists_ok=True) square = rf.read_gbq_function( - str(routine.reference), bigquery_client=bigquery_client + str(routine.reference), + session=session, ) # It should point to the named routine and yield the expected results. @@ -649,16 +669,17 @@ def test_read_gbq_function_reads_udfs(bigquery_client, dataset_id): indirect_df = bigframes.dataframe.DataFrame(src) indirect_df = indirect_df.assign(y=indirect_df.x.apply(square)) - # TODO(b/340875260): fix type error - indirect_df = indirect_df.to_pandas() # type: ignore + converted_indirect_df = indirect_df.to_pandas() assert_pandas_df_equal( - direct_df, indirect_df, ignore_order=True, check_index_type=False + direct_df, converted_indirect_df, ignore_order=True, check_index_type=False ) @pytest.mark.flaky(retries=2, delay=120) -def test_read_gbq_function_enforces_explicit_types(bigquery_client, dataset_id): +def test_read_gbq_function_enforces_explicit_types( + session, bigquery_client, dataset_id +): dataset_ref = bigquery.DatasetReference.from_string(dataset_id) typed_arg = bigquery.RoutineArgument( name="x", @@ -702,24 +723,35 @@ def test_read_gbq_function_enforces_explicit_types(bigquery_client, dataset_id): bigquery_client.create_routine(neither_type_specified, exists_ok=True) rf.read_gbq_function( - str(both_types_specified.reference), bigquery_client=bigquery_client + str(both_types_specified.reference), + session=session, ) rf.read_gbq_function( - str(only_return_type_specified.reference), bigquery_client=bigquery_client + str(only_return_type_specified.reference), + session=session, ) with pytest.raises(ValueError): rf.read_gbq_function( - str(only_arg_type_specified.reference), bigquery_client=bigquery_client + str(only_arg_type_specified.reference), + session=session, ) with pytest.raises(ValueError): rf.read_gbq_function( - str(neither_type_specified.reference), bigquery_client=bigquery_client + str(neither_type_specified.reference), + session=session, ) @pytest.mark.flaky(retries=2, delay=120) def test_df_apply_axis_1(session, scalars_dfs): - columns = ["bool_col", "int64_col", "int64_too", "float64_col", "string_col"] + columns = [ + "bool_col", + "int64_col", + "int64_too", + "float64_col", + "string_col", + "bytes_col", + ] scalars_df, scalars_pandas_df = scalars_dfs def add_ints(row): @@ -729,9 +761,10 @@ def add_ints(row): bigframes.exceptions.PreviewWarning, match="input_types=Series is in preview.", ): - add_ints_remote = session.remote_function(bigframes.series.Series, int)( - add_ints - ) + add_ints_remote = session.remote_function( + bigframes.series.Series, + int, + )(add_ints) with pytest.warns( bigframes.exceptions.PreviewWarning, match="axis=1 scenario is in preview." @@ -829,7 +862,6 @@ def add_ints(row): @pytest.mark.parametrize( ("column"), [ - pytest.param("bytes_col"), pytest.param("date_col"), pytest.param("datetime_col"), pytest.param("geography_col"), @@ -854,7 +886,9 @@ def echo(row): with pytest.raises( NotImplementedError, match=re.escape( - f"DataFrame has a column of dtype '{dtype}' which is not supported with axis=1. Supported dtypes are ('Int64', 'Float64', 'boolean', 'string')." 
+ f"DataFrame has a column of dtype '{dtype}' which is not supported with axis=1. Supported dtypes are (" ), + ), pytest.warns( + bigframes.exceptions.PreviewWarning, match="axis=1 scenario is in preview." ): scalars_df[[column]].apply(echo, axis=1) diff --git a/tests/system/small/test_unordered.py b/tests/system/small/test_unordered.py new file mode 100644 index 0000000000..12c0d6e259 --- /dev/null +++ b/tests/system/small/test_unordered.py @@ -0,0 +1,28 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import pandas as pd + +import bigframes.pandas as bpd +from tests.system.utils import assert_pandas_df_equal + + +def test_unordered_mode_cache_aggregate(unordered_session): + pd_df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, dtype=pd.Int64Dtype()) + df = bpd.DataFrame(pd_df, session=unordered_session) + mean_diff = df - df.mean() + mean_diff.cache() + bf_result = mean_diff.to_pandas(ordered=False) + pd_result = pd_df - pd_df.mean() + + assert_pandas_df_equal(bf_result, pd_result, ignore_order=True) diff --git a/tests/unit/_config/test_threaded_options.py b/tests/unit/_config/test_threaded_options.py new file mode 100644 index 0000000000..7fc97a9f72 --- /dev/null +++ b/tests/unit/_config/test_threaded_options.py @@ -0,0 +1,41 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import threading + +import bigframes._config + + +def test_mutate_options_threaded(): + options = bigframes._config.Options() + options.display.max_rows = 50 + result_dict = {"this_before": options.display.max_rows} + + def mutate_options_threaded(options, result_dict): + result_dict["other_before"] = options.display.max_rows + + options.display.max_rows = 100 + result_dict["other_after"] = options.display.max_rows + + thread = threading.Thread( + target=(lambda: mutate_options_threaded(options, result_dict)) + ) + thread.start() + thread.join(1) + result_dict["this_after"] = options.display.max_rows + + assert result_dict["this_before"] == 50 + assert result_dict["this_after"] == 50 + assert result_dict["other_before"] == 25 + assert result_dict["other_after"] == 100 diff --git a/tests/unit/core/compiler/__init__.py b/tests/unit/core/compiler/__init__.py new file mode 100644 index 0000000000..6d5e14bcf4 --- /dev/null +++ b/tests/unit/core/compiler/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/tests/unit/core/compiler/test_googlesql.py b/tests/unit/core/compiler/test_googlesql.py new file mode 100644 index 0000000000..70ca5cfa12 --- /dev/null +++ b/tests/unit/core/compiler/test_googlesql.py @@ -0,0 +1,155 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest + +import bigframes.core.compile.googlesql as sql + + +@pytest.mark.parametrize( + ("table_id", "dataset_id", "project_id", "expected"), + [ + pytest.param("a", None, None, "`a`"), + pytest.param("a", "b", None, "`b`.`a`"), + pytest.param("a", "b", "c", "`c`.`b`.`a`"), + pytest.param("a", None, "c", None, marks=pytest.mark.xfail(raises=ValueError)), + ], +) +def test_table_expression(table_id, dataset_id, project_id, expected): + expr = sql.TableExpression( + table_id=table_id, dataset_id=dataset_id, project_id=project_id + ) + assert expr.sql() == expected + + +@pytest.mark.parametrize( + ("table_name", "alias", "expected"), + [ + pytest.param(None, None, None, marks=pytest.mark.xfail(raises=ValueError)), + pytest.param("a", None, "`a`"), + pytest.param("a", "aa", "`a` AS `aa`"), + ], +) +def test_from_item_w_table_name(table_name, alias, expected): + expr = sql.FromItem( + table_name=None + if table_name is None + else sql.TableExpression(table_id=table_name), + as_alias=None + if alias is None + else sql.AsAlias(sql.AliasExpression(alias=alias)), + ) + assert expr.sql() == expected + + +def test_from_item_w_query_expr(): + from_clause = sql.FromClause( + sql.FromItem(table_name=sql.TableExpression(table_id="table_a")) + ) + select = sql.Select( + select_list=[sql.SelectAll(sql.StarExpression())], + from_clause_list=[from_clause], + ) + query_expr = sql.QueryExpr(select=select) + expected = "SELECT\n*\nFROM\n`table_a`" + + # A QueryExpr object + expr = sql.FromItem(query_expr=query_expr) + assert expr.sql() == f"({expected})" + + # A str object + expr = sql.FromItem(query_expr=expected) + assert expr.sql() == f"({expected})" + + +def test_from_item_w_cte(): + expr = sql.FromItem(cte_name=sql.CTEExpression("test")) + assert expr.sql() == "`test`" + + +@pytest.mark.parametrize( + ("col_name", "alias", "expected"), + [ + pytest.param("a", None, "`a`"), + pytest.param("a", "aa", "`a` AS `aa`"), + ], +) +def test_select_expression(col_name, alias, expected): + expr = sql.SelectExpression( + expression=sql.ColumnExpression(col_name), + alias=None if alias is None else sql.AliasExpression(alias=alias), + ) + assert expr.sql() == expected + + +def test_select(): + select_1 = sql.SelectExpression(expression=sql.ColumnExpression("a")) + select_2 = sql.SelectExpression( + expression=sql.ColumnExpression("b"), alias=sql.AliasExpression(alias="bb") + ) + from_1 = sql.FromItem(table_name=sql.TableExpression(table_id="table_a")) + from_2 = sql.FromItem( + query_expr="SELECT * FROM project.table_b", + as_alias=sql.AsAlias(sql.AliasExpression(alias="table_b")), + ) + expr = sql.Select( + select_list=[select_1, select_2], + from_clause_list=[sql.FromClause(from_1), sql.FromClause(from_2)], + ) + expected = "SELECT\n`a`,\n`b` AS `bb`\nFROM\n`table_a`,\n(SELECT * FROM project.table_b) AS `table_b`" + + assert expr.sql() == expected + + +def test_query_expr_w_cte(): + # Test a simple SELECT query. + from_clause1 = sql.FromClause( + sql.FromItem(table_name=sql.TableExpression(table_id="table_a")) + ) + select1 = sql.Select( + select_list=[sql.SelectAll(sql.StarExpression())], + from_clause_list=[from_clause1], + ) + query1 = sql.QueryExpr(select=select1) + query1_sql = "SELECT\n*\nFROM\n`table_a`" + assert query1.sql() == query1_sql + + # Test a query with CTE statements. 
+ cte1 = sql.NonRecursiveCTE(cte_name=sql.CTEExpression("a"), query_expr=query1) + cte2 = sql.NonRecursiveCTE(cte_name=sql.CTEExpression("b"), query_expr=query1) + + cte1_sql = f"`a` AS (\n{query1_sql}\n)" + cte2_sql = f"`b` AS (\n{query1_sql}\n)" + assert cte1.sql() == cte1_sql + assert cte2.sql() == cte2_sql + + with_cte_list = [cte1, cte2] + select2 = sql.Select( + select_list=[ + sql.SelectExpression( + sql.ColumnExpression(parent=cte1.cte_name, name="column_x") + ), + sql.SelectAll(sql.StarExpression(parent=cte2.cte_name)), + ], + from_clause_list=[ + sql.FromClause(sql.FromItem(cte_name=cte1.cte_name)), + sql.FromClause(sql.FromItem(cte_name=cte2.cte_name)), + ], + ) + select2_sql = "SELECT\n`a`.`column_x`,\n`b`.*\nFROM\n`a`,\n`b`" + assert select2.sql() == select2_sql + + query2 = sql.QueryExpr(select=select2, with_cte_list=with_cte_list) + query2_sql = f"WITH {cte1_sql},\n{cte2_sql}\n{select2_sql}" + assert query2.sql() == query2_sql diff --git a/tests/unit/test_dtypes.py b/tests/unit/core/test_dtypes.py similarity index 86% rename from tests/unit/test_dtypes.py rename to tests/unit/core/test_dtypes.py index dabbf11c6c..ae194be83f 100644 --- a/tests/unit/test_dtypes.py +++ b/tests/unit/core/test_dtypes.py @@ -20,6 +20,7 @@ import pyarrow as pa # type: ignore import pytest +import bigframes.core.compile.ibis_types import bigframes.dtypes @@ -67,14 +68,14 @@ ) def test_ibis_dtype_converts(ibis_dtype, bigframes_dtype): """Test all the Ibis data types needed to read BigQuery tables""" - result = bigframes.dtypes.ibis_dtype_to_bigframes_dtype(ibis_dtype) + result = bigframes.core.compile.ibis_types.ibis_dtype_to_bigframes_dtype(ibis_dtype) assert result == bigframes_dtype def test_ibis_timestamp_pst_raises_unexpected_datatype(): """BigQuery timestamp only supports UTC time""" with pytest.raises(ValueError, match="Unexpected Ibis data type"): - bigframes.dtypes.ibis_dtype_to_bigframes_dtype( + bigframes.core.compile.ibis_types.ibis_dtype_to_bigframes_dtype( ibis_dtypes.Timestamp(timezone="PST") ) @@ -82,7 +83,9 @@ def test_ibis_timestamp_pst_raises_unexpected_datatype(): def test_ibis_float32_raises_unexpected_datatype(): """Other Ibis types not read from BigQuery are not expected""" with pytest.raises(ValueError, match="Unexpected Ibis data type"): - bigframes.dtypes.ibis_dtype_to_bigframes_dtype(ibis_dtypes.float32) + bigframes.core.compile.ibis_types.ibis_dtype_to_bigframes_dtype( + ibis_dtypes.float32 + ) IBIS_ARROW_DTYPES = ( @@ -139,13 +142,13 @@ def test_ibis_float32_raises_unexpected_datatype(): @pytest.mark.parametrize(("ibis_dtype", "arrow_dtype"), IBIS_ARROW_DTYPES) def test_arrow_dtype_to_ibis_dtype(ibis_dtype, arrow_dtype): - result = bigframes.dtypes.arrow_dtype_to_ibis_dtype(arrow_dtype) + result = bigframes.core.compile.ibis_types._arrow_dtype_to_ibis_dtype(arrow_dtype) assert result == ibis_dtype @pytest.mark.parametrize(("ibis_dtype", "arrow_dtype"), IBIS_ARROW_DTYPES) def test_ibis_dtype_to_arrow_dtype(ibis_dtype, arrow_dtype): - result = bigframes.dtypes.ibis_dtype_to_arrow_dtype(ibis_dtype) + result = bigframes.core.compile.ibis_types._ibis_dtype_to_arrow_dtype(ibis_dtype) assert result == arrow_dtype @@ -178,7 +181,9 @@ def test_ibis_dtype_to_arrow_dtype(ibis_dtype, arrow_dtype): ) def test_bigframes_dtype_converts(ibis_dtype, bigframes_dtype): """Test all the Ibis data types needed to read BigQuery tables""" - result = bigframes.dtypes.bigframes_dtype_to_ibis_dtype(bigframes_dtype) + result = bigframes.core.compile.ibis_types.bigframes_dtype_to_ibis_dtype( + 
bigframes_dtype + ) assert result == ibis_dtype @@ -203,20 +208,22 @@ def test_bigframes_dtype_converts(ibis_dtype, bigframes_dtype): ) def test_bigframes_string_dtype_converts(ibis_dtype, bigframes_dtype_str): """Test all the Ibis data types needed to read BigQuery tables""" - result = bigframes.dtypes.bigframes_dtype_to_ibis_dtype(bigframes_dtype_str) + result = bigframes.core.compile.ibis_types.bigframes_dtype_to_ibis_dtype( + bigframes_dtype_str + ) assert result == ibis_dtype def test_unsupported_dtype_raises_unexpected_datatype(): """Incompatible dtypes should fail when passed into BigQuery DataFrames""" with pytest.raises(ValueError, match="Unexpected data type"): - bigframes.dtypes.bigframes_dtype_to_ibis_dtype(np.float32) + bigframes.core.compile.ibis_types.bigframes_dtype_to_ibis_dtype(np.float32) def test_unsupported_dtype_str_raises_unexpected_datatype(): """Incompatible dtypes should fail when passed into BigQuery DataFrames""" with pytest.raises(ValueError, match="Unexpected data type"): - bigframes.dtypes.bigframes_dtype_to_ibis_dtype("int64") + bigframes.core.compile.ibis_types.bigframes_dtype_to_ibis_dtype("int64") @pytest.mark.parametrize( @@ -228,21 +235,23 @@ def test_unsupported_dtype_str_raises_unexpected_datatype(): ], ) def test_literal_to_ibis_scalar_converts(literal, ibis_scalar): - assert bigframes.dtypes.literal_to_ibis_scalar(literal).equals(ibis_scalar) + assert bigframes.core.compile.ibis_types.literal_to_ibis_scalar(literal).equals( + ibis_scalar + ) def test_literal_to_ibis_scalar_throws_on_incompatible_literal(): with pytest.raises( ValueError, ): - bigframes.dtypes.literal_to_ibis_scalar({"mykey": "myval"}) + bigframes.core.compile.ibis_types.literal_to_ibis_scalar({"mykey": "myval"}) def test_remote_function_io_types_are_supported_bigframes_types(): from ibis.expr.datatypes.core import dtype as python_type_to_bigquery_type - from bigframes.dtypes import SUPPORTED_IO_PYTHON_TYPES as rf_supported_io_types + from bigframes.dtypes import RF_SUPPORTED_IO_PYTHON_TYPES as rf_supported_io_types for python_type in rf_supported_io_types: ibis_type = python_type_to_bigquery_type(python_type) - assert ibis_type in bigframes.dtypes.IBIS_TO_BIGFRAMES + assert ibis_type in bigframes.core.compile.ibis_types.IBIS_TO_BIGFRAMES diff --git a/tests/unit/core/test_sql.py b/tests/unit/core/test_sql.py new file mode 100644 index 0000000000..29f1e48a70 --- /dev/null +++ b/tests/unit/core/test_sql.py @@ -0,0 +1,78 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
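The unit tests in tests/unit/core/test_sql.py that follow cover sql.create_vector_search_sql, the helper that renders the VECTOR_SEARCH statement and appears to be the layer that bigframes.bigquery.vector_search() builds on. A minimal sketch of calling it directly, reusing the option names from the tests (the query SQL string and table name are hypothetical):

    from bigframes.core import sql

    query_sql = "SELECT embedding FROM my_embeddings_table WHERE id = 1"  # hypothetical query
    vector_search_sql = sql.create_vector_search_sql(
        query_sql,
        {
            "base_table": "my_base_table",  # hypothetical table
            "column_to_search": "my_embedding_column",
            "distance_type": "COSINE",
            "top_k": 10,
            "use_brute_force": False,
        },
    )
    # vector_search_sql now holds a SELECT ... FROM VECTOR_SEARCH(...) statement
    # of the shape asserted in the tests below.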
+ + +from bigframes.core import sql + + +def test_create_vector_search_sql_simple(): + sql_string = "SELECT embedding FROM my_embeddings_table WHERE id = 1" + options = { + "base_table": "my_base_table", + "column_to_search": "my_embedding_column", + "distance_type": "COSINE", + "top_k": 10, + "use_brute_force": False, + } + + expected_query = f""" + SELECT + query.*, + base.*, + distance, + FROM VECTOR_SEARCH( + TABLE `my_base_table`, + 'my_embedding_column', + ({sql_string}), + distance_type => 'COSINE', + top_k => 10 + ) + """ + + result_query = sql.create_vector_search_sql( + sql_string, options # type:ignore + ) + assert result_query == expected_query + + +def test_create_vector_search_sql_query_column_to_search(): + sql_string = "SELECT embedding FROM my_embeddings_table WHERE id = 1" + options = { + "base_table": "my_base_table", + "column_to_search": "my_embedding_column", + "distance_type": "COSINE", + "top_k": 10, + "query_column_to_search": "new_embedding_column", + "use_brute_force": False, + } + + expected_query = f""" + SELECT + query.*, + base.*, + distance, + FROM VECTOR_SEARCH( + TABLE `my_base_table`, + 'my_embedding_column', + ({sql_string}), + 'new_embedding_column', + distance_type => 'COSINE', + top_k => 10 + ) + """ + + result_query = sql.create_vector_search_sql( + sql_string, options # type:ignore + ) + assert result_query == expected_query diff --git a/tests/unit/functions/test_remote_function_template.py b/tests/unit/functions/test_remote_function_template.py new file mode 100644 index 0000000000..70b033d938 --- /dev/null +++ b/tests/unit/functions/test_remote_function_template.py @@ -0,0 +1,193 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import json + +import pandas as pd +import pandas.testing +import pyarrow +import pytest + +import bigframes.dtypes +import bigframes.functions.remote_function_template as remote_function_template + +HELLO_WORLD_BASE64_BYTES = b"SGVsbG8sIFdvcmxkIQ==" +HELLO_WORLD_BASE64_STR = "SGVsbG8sIFdvcmxkIQ==" + + +@pytest.mark.parametrize( + ["type_", "json_value", "expected"], + ( + pytest.param( + # Type names should match those in BigQueryType.from_ibis in + # third_party/bigframes_vendored/ibis/backends/bigquery/datatypes.py + "BOOLEAN", + True, + True, + ), + pytest.param( + "BYTES", + HELLO_WORLD_BASE64_STR, + b"Hello, World!", + ), + pytest.param( + "FLOAT64", + 1.25, + 1.25, + ), + pytest.param( + "INT64", + 123, + 123, + ), + pytest.param( + "STRING", + "Hello, World!", + "Hello, World!", + ), + ), +) +def test_convert_from_bq_json(type_, json_value, expected): + got = remote_function_template.convert_from_bq_json(type_, json_value) + assert got == expected + + +@pytest.mark.parametrize( + "type_", + [ + # Type names should match those in BigQueryType.from_ibis in + # third_party/bigframes_vendored/ibis/backends/bigquery/datatypes.py + "BOOLEAN", + "BYTES", + "FLOAT64", + "INT64", + "STRING", + ], +) +def test_convert_from_bq_json_none(type_): + got = remote_function_template.convert_from_bq_json(type_, None) + assert got is None + + +@pytest.mark.parametrize( + ["type_", "value", "expected"], + ( + pytest.param( + # Type names should match those in BigQueryType.from_ibis in + # third_party/bigframes_vendored/ibis/backends/bigquery/datatypes.py + "BOOLEAN", + True, + True, + ), + pytest.param( + "BYTES", + b"Hello, World!", + HELLO_WORLD_BASE64_STR, + ), + pytest.param( + "FLOAT64", + 1.25, + 1.25, + ), + pytest.param( + "INT64", + 123, + 123, + ), + pytest.param( + "STRING", + "Hello, World!", + "Hello, World!", + ), + ), +) +def test_convert_to_bq_json(type_, value, expected): + got = remote_function_template.convert_to_bq_json(type_, value) + assert got == expected + + +@pytest.mark.parametrize( + "type_", + [ + # Type names should match those in BigQueryType.from_ibis in + # third_party/bigframes_vendored/ibis/backends/bigquery/datatypes.py + "BOOLEAN", + "BYTES", + "FLOAT64", + "INT64", + "STRING", + ], +) +def test_convert_to_bq_json_none(type_): + got = remote_function_template.convert_to_bq_json(type_, None) + assert got is None + + +@pytest.mark.parametrize( + ["row_json", "expected"], + ( + pytest.param( + json.dumps( + { + "names": ["'my-index'", "'col1'", "'col2'", "'col3'"], + "types": ["string", "Int64", "Int64", "Int64"], + "values": ["my-index-value", "1", None, "-1"], + "indexlength": 1, + "dtype": "Int64", + } + ), + pd.Series( + [1, pd.NA, -1], + dtype="Int64", + index=["col1", "col2", "col3"], + name="my-index-value", + ), + id="int64-string-index", + ), + pytest.param( + json.dumps( + { + "names": ["'col1'", "'col2'", "'col3'"], + "types": ["binary[pyarrow]", "binary[pyarrow]", "binary[pyarrow]"], + "values": [HELLO_WORLD_BASE64_STR, "dGVzdDI=", "dGVzdDM="], + "indexlength": 0, + "dtype": "binary[pyarrow]", + } + ), + pd.Series( + [b"Hello, World!", b"test2", b"test3"], + dtype=pd.ArrowDtype(pyarrow.binary()), + index=["col1", "col2", "col3"], + name=(), + ), + id="binary-no-index", + ), + ), +) +def test_get_pd_series(row_json, expected): + got = remote_function_template.get_pd_series(row_json) + pandas.testing.assert_series_equal(got, expected) + + +def test_get_pd_series_converter_dtypes(): + """Ensures the string format of the dtype doesn't change from that 
expected by get_pd_series.""" + + # Keep in sync with value_converters in get_pd_series. + # NOTE: Any change here is a red flag that there has been a breaking change + # that will affect deployed axis=1 remote functions. + assert str(bigframes.dtypes.BOOL_DTYPE) == "boolean" + assert str(bigframes.dtypes.BYTES_DTYPE) == "binary[pyarrow]" + assert str(bigframes.dtypes.FLOAT_DTYPE) == "Float64" + assert str(bigframes.dtypes.INT_DTYPE) == "Int64" + assert str(bigframes.dtypes.STRING_DTYPE) == "string" diff --git a/tests/unit/test_pandas.py b/tests/unit/test_pandas.py index 54a7a79d3c..408590d4bb 100644 --- a/tests/unit/test_pandas.py +++ b/tests/unit/test_pandas.py @@ -95,8 +95,7 @@ def test_cut_raises_with_labels(): match="The 'labels' parameter must be either False or None.", ): mock_series = mock.create_autospec(bigframes.pandas.Series, instance=True) - # TODO(b/340893280): fix type error - bigframes.pandas.cut(mock_series, 4, labels=["a", "b", "c", "d"]) # type: ignore + bigframes.pandas.cut(mock_series, 4, labels=["a", "b", "c", "d"]) @pytest.mark.parametrize( diff --git a/tests/unit/test_remote_function.py b/tests/unit/test_remote_function.py index 1bd3f3b14f..6868e85b9c 100644 --- a/tests/unit/test_remote_function.py +++ b/tests/unit/test_remote_function.py @@ -19,6 +19,7 @@ import pandas import pytest +import bigframes.core.compile.ibis_types import bigframes.dtypes import bigframes.functions.remote_function import bigframes.series @@ -62,11 +63,11 @@ def axis_1_function(myparam: series_type) -> str: # type: ignore def test_supported_types_correspond(): # The same types should be representable by the supported Python and BigQuery types. ibis_types_from_python = { - ibis_types.dtype(t) for t in bigframes.dtypes.SUPPORTED_IO_PYTHON_TYPES + ibis_types.dtype(t) for t in bigframes.dtypes.RF_SUPPORTED_IO_PYTHON_TYPES } ibis_types_from_bigquery = { third_party_ibis_bqtypes.BigQueryType.to_ibis(tk) - for tk in bigframes.dtypes.SUPPORTED_IO_BIGQUERY_TYPEKINDS + for tk in bigframes.dtypes.RF_SUPPORTED_IO_BIGQUERY_TYPEKINDS } assert ibis_types_from_python == ibis_types_from_bigquery diff --git a/third_party/bigframes_vendored/pandas/core/indexes/base.py b/third_party/bigframes_vendored/pandas/core/indexes/base.py index eb6b9161fc..b0e1a09392 100644 --- a/third_party/bigframes_vendored/pandas/core/indexes/base.py +++ b/third_party/bigframes_vendored/pandas/core/indexes/base.py @@ -1,6 +1,8 @@ # Contains code from https://github.com/pandas-dev/pandas/blob/main/pandas/core/indexes/base.py from __future__ import annotations +import typing + from bigframes import constants @@ -320,7 +322,7 @@ def drop(self, labels) -> Index: """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) - def dropna(self, how: str = "any"): + def dropna(self, how: typing.Literal["all", "any"] = "any"): """Return Index without NA/NaN values. 
Args: diff --git a/third_party/bigframes_vendored/pandas/core/tools/datetimes.py b/third_party/bigframes_vendored/pandas/core/tools/datetimes.py index 3d460b2b16..52b287b949 100644 --- a/third_party/bigframes_vendored/pandas/core/tools/datetimes.py +++ b/third_party/bigframes_vendored/pandas/core/tools/datetimes.py @@ -1,14 +1,13 @@ # Contains code from https://github.com/pandas-dev/pandas/blob/main/pandas/core/tools/datetimes.py from datetime import datetime -from typing import Iterable, Mapping, Union +from typing import List, Mapping, Tuple, Union import pandas as pd from bigframes import constants, series -local_scalars = Union[int, float, str, datetime] -local_iterables = Union[Iterable, pd.Series, pd.DataFrame, Mapping] +local_iterables = Union[List, Tuple, pd.Series, pd.DataFrame, Mapping] def to_datetime( diff --git a/third_party/bigframes_vendored/sklearn/base.py b/third_party/bigframes_vendored/sklearn/base.py index 1a151a1119..57c9e79f8d 100644 --- a/third_party/bigframes_vendored/sklearn/base.py +++ b/third_party/bigframes_vendored/sklearn/base.py @@ -87,7 +87,7 @@ def score(self, X, y): .. note:: - Output matches that of the BigQuery ML.EVALUTE function. + Output matches that of the BigQuery ML.EVALUATE function. See: https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-evaluate#classification_models for the outputs relevant to this model type. @@ -115,7 +115,7 @@ def score(self, X, y): .. note:: - Output matches that of the BigQuery ML.EVALUTE function. + Output matches that of the BigQuery ML.EVALUATE function. See: https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-evaluate#regression_models for the outputs relevant to this model type. diff --git a/third_party/bigframes_vendored/sklearn/cluster/_kmeans.py b/third_party/bigframes_vendored/sklearn/cluster/_kmeans.py index 386b620f4a..aaf43dbcfe 100644 --- a/third_party/bigframes_vendored/sklearn/cluster/_kmeans.py +++ b/third_party/bigframes_vendored/sklearn/cluster/_kmeans.py @@ -103,7 +103,7 @@ def score( .. note:: - Output matches that of the BigQuery ML.EVALUTE function. + Output matches that of the BigQuery ML.EVALUATE function. See: https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-evaluate#k-means_models for the outputs relevant to this model type. diff --git a/third_party/bigframes_vendored/sklearn/decomposition/_pca.py b/third_party/bigframes_vendored/sklearn/decomposition/_pca.py index 71e53bf4a9..ae6f0b0561 100644 --- a/third_party/bigframes_vendored/sklearn/decomposition/_pca.py +++ b/third_party/bigframes_vendored/sklearn/decomposition/_pca.py @@ -52,7 +52,7 @@ def score(self, X=None, y=None): .. note:: - Output matches that of the BigQuery ML.EVALUTE function. + Output matches that of the BigQuery ML.EVALUATE function. See: https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-evaluate#pca_models for the outputs relevant to this model type. 
diff --git a/third_party/bigframes_vendored/sklearn/linear_model/_logistic.py b/third_party/bigframes_vendored/sklearn/linear_model/_logistic.py index 49198eb9bd..c52a37018c 100644 --- a/third_party/bigframes_vendored/sklearn/linear_model/_logistic.py +++ b/third_party/bigframes_vendored/sklearn/linear_model/_logistic.py @@ -26,8 +26,10 @@ class LogisticRegression(LinearClassifierMixin, BaseEstimator): Args: optimize_strategy (str, default "auto_strategy"): The strategy to train logistic regression models. Possible values are - "auto_strategy", "batch_gradient_descent", "normal_equation". Default - to "auto_strategy". + "auto_strategy" and "batch_gradient_descent". The two are equivalent since + "auto_strategy" falls back to "batch_gradient_descent". The option is kept + for API consistency. + Defaults to "auto_strategy". fit_intercept (default True): Default True. Specifies if a constant (a.k.a. bias or intercept) should be added to the decision function.