From 92a1af35b8de4afb6cdb5b5e89facdceb5c151d2 Mon Sep 17 00:00:00 2001
From: Garrett Wu <6505921+GarrettWu@users.noreply.github.com>
Date: Mon, 25 Mar 2024 12:10:17 -0700
Subject: [PATCH 01/53] docs: add progress_bar code sample (#508)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Thank you for opening a Pull Request! Before submitting your PR, there are a
few things you can do to make sure it goes smoothly:
- [ ] Make sure to open an issue as a
  [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose)
  before writing your code! That way we can discuss the change, evaluate
  designs, and agree on the general idea
- [ ] Ensure the tests and linter pass
- [ ] Code coverage does not decrease (if any source code was changed)
- [ ] Appropriate docs were updated (if necessary)

Fixes # 🦕
---
 .../bigframes_vendored/pandas/core/config_init.py | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/third_party/bigframes_vendored/pandas/core/config_init.py b/third_party/bigframes_vendored/pandas/core/config_init.py
index 33c6b3e093..ecc103d7c8 100644
--- a/third_party/bigframes_vendored/pandas/core/config_init.py
+++ b/third_party/bigframes_vendored/pandas/core/config_init.py
@@ -32,9 +32,21 @@
 User can execute the job by calling .to_pandas()
     >>> # df.to_pandas()
-Reset option
+Reset repr_mode option
     >>> bpd.options.display.repr_mode = "head"
 
+Can also set the progress_bar option to see the progress bar in terminal,
+    >>> bpd.options.display.progress_bar = "terminal"
+
+notebook,
+    >>> bpd.options.display.progress_bar = "notebook"
+
+or just remove it.
+    >>> bpd.options.display.progress_bar = None
+
+Setting to default value "auto" will detect and show progress bar automatically.
+    >>> bpd.options.display.progress_bar = "auto"
+
 Attributes:
     max_columns (int, default 20):
         If `max_columns` is exceeded, switch to truncate view.
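The patch above documents the `bpd.options.display.progress_bar` values ("terminal", "notebook", None, and the default "auto"). Below is a minimal sketch of exercising those settings; it is not part of the patch itself, and it assumes a configured BigQuery DataFrames environment (installed `bigframes`, credentials, and a project) — the small DataFrame is purely illustrative.

```python
# Sketch of the progress_bar display options documented in the patch above.
# Assumes bigframes.pandas is installed and a BigQuery project/credentials
# are already configured; the tiny DataFrame is only for illustration.
import bigframes.pandas as bpd

df = bpd.DataFrame({"x": [1, 2, 3]})

# Plain-text progress output, suited to a terminal session.
bpd.options.display.progress_bar = "terminal"
print(df.to_pandas())

# Progress rendering aimed at notebook environments.
bpd.options.display.progress_bar = "notebook"
print(df.to_pandas())

# Turn progress reporting off entirely.
bpd.options.display.progress_bar = None
print(df.to_pandas())

# The default "auto" detects the environment and shows the bar automatically.
bpd.options.display.progress_bar = "auto"
print(df.to_pandas())
```

Each `.to_pandas()` call runs the underlying BigQuery job, so the chosen setting governs how (or whether) job progress is reported while the result is materialized.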
From 036649e7edbd8528196a3ce4b64837b554d83ecf Mon Sep 17 00:00:00 2001 From: Garrett Wu <6505921+GarrettWu@users.noreply.github.com> Date: Mon, 25 Mar 2024 13:36:33 -0700 Subject: [PATCH 02/53] chore: fix model.register test to use anonymous dataset (#510) --- tests/system/small/ml/conftest.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tests/system/small/ml/conftest.py b/tests/system/small/ml/conftest.py index c9100f36f3..33351afe45 100644 --- a/tests/system/small/ml/conftest.py +++ b/tests/system/small/ml/conftest.py @@ -47,12 +47,11 @@ def penguins_bqml_linear_model(session, penguins_linear_model_name) -> core.Bqml @pytest.fixture(scope="function") def ephemera_penguins_bqml_linear_model( - penguins_bqml_linear_model, + session: bigframes.Session, + penguins_bqml_linear_model: core.BqmlModel, ) -> core.BqmlModel: model = penguins_bqml_linear_model - return model.copy( - f"{model._model.project}.{model._model.dataset_id}.{uuid.uuid4().hex}" - ) + return model.copy(f"{session._anonymous_dataset}.{uuid.uuid4().hex}") @pytest.fixture(scope="session") From e8e66cf25887f64d2a7cb26081c2ef3cea10827d Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Mon, 25 Mar 2024 20:40:27 -0700 Subject: [PATCH 03/53] feat: Add support for numpy expm1, log1p, floor, ceil, arctan2 ops (#505) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Tim Sweña (Swast) --- bigframes/core/compile/scalar_op_compiler.py | 54 ++++++++++++++++++++ bigframes/operations/__init__.py | 10 ++++ tests/system/small/test_numpy.py | 22 ++++++++ 3 files changed, 86 insertions(+) diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py index d2fc453835..5c165fa1df 100644 --- a/bigframes/core/compile/scalar_op_compiler.py +++ b/bigframes/core/compile/scalar_op_compiler.py @@ -257,6 +257,13 @@ def arctan_op_impl(x: ibis_types.Value): return typing.cast(ibis_types.NumericValue, x).atan() +@scalar_op_compiler.register_binary_op(ops.arctan2_op) +def arctan2_op_impl(x: ibis_types.Value, y: ibis_types.Value): + return typing.cast(ibis_types.NumericValue, x).atan2( + typing.cast(ibis_types.NumericValue, y) + ) + + # Hyperbolic trig functions # BQ has these functions, but Ibis doesn't @scalar_op_compiler.register_unary_op(ops.sinh_op) @@ -319,6 +326,30 @@ def arctanh_op_impl(x: ibis_types.Value): # Numeric Ops +@scalar_op_compiler.register_unary_op(ops.floor_op) +def floor_op_impl(x: ibis_types.Value): + x_numeric = typing.cast(ibis_types.NumericValue, x) + if x_numeric.type().is_integer(): + return x_numeric.cast(ibis_dtypes.Float64()) + if x_numeric.type().is_floating(): + # Default ibis impl tries to cast to integer, which doesn't match pandas and can overflow + return float_floor(x_numeric) + else: # numeric + return x_numeric.floor() + + +@scalar_op_compiler.register_unary_op(ops.ceil_op) +def ceil_op_impl(x: ibis_types.Value): + x_numeric = typing.cast(ibis_types.NumericValue, x) + if x_numeric.type().is_integer(): + return x_numeric.cast(ibis_dtypes.Float64()) + if x_numeric.type().is_floating(): + # Default ibis impl tries to cast to integer, which doesn't match pandas and can overflow + return float_ceil(x_numeric) + else: # numeric + return x_numeric.ceil() + + @scalar_op_compiler.register_unary_op(ops.abs_op) def abs_op_impl(x: ibis_types.Value): return typing.cast(ibis_types.NumericValue, x).abs() @@ -347,6 +378,11 @@ def ln_op_impl(x: ibis_types.Value): return (~domain).ifelse(out_of_domain, 
numeric_value.ln()) +@scalar_op_compiler.register_unary_op(ops.log1p_op) +def log1p_op_impl(x: ibis_types.Value): + return ln_op_impl(_ibis_num(1) + x) + + @scalar_op_compiler.register_unary_op(ops.exp_op) def exp_op_impl(x: ibis_types.Value): numeric_value = typing.cast(ibis_types.NumericValue, x) @@ -354,6 +390,11 @@ def exp_op_impl(x: ibis_types.Value): return (~domain).ifelse(_INF, numeric_value.exp()) +@scalar_op_compiler.register_unary_op(ops.expm1_op) +def expm1_op_impl(x: ibis_types.Value): + return exp_op_impl(x) - _ibis_num(1) + + @scalar_op_compiler.register_unary_op(ops.invert_op) def invert_op_impl(x: ibis_types.Value): return typing.cast(ibis_types.NumericValue, x).negate() @@ -1318,3 +1359,16 @@ def _ibis_num(number: float): @ibis.udf.scalar.builtin def timestamp(a: str) -> ibis_dtypes.timestamp: """Convert string to timestamp.""" + + +# Need these because ibis otherwise tries to do casts to int that can fail +@ibis.udf.scalar.builtin(name="floor") +def float_floor(a: float) -> float: + """Convert string to timestamp.""" + return 0 # pragma: NO COVER + + +@ibis.udf.scalar.builtin(name="ceil") +def float_ceil(a: float) -> float: + """Convert string to timestamp.""" + return 0 # pragma: NO COVER diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py index 4ecb8dca5a..2ef71fde7f 100644 --- a/bigframes/operations/__init__.py +++ b/bigframes/operations/__init__.py @@ -246,11 +246,16 @@ def create_ternary_op( arcsinh_op = create_unary_op(name="arcsinh", type_rule=op_typing.REAL_NUMERIC) arccosh_op = create_unary_op(name="arccosh", type_rule=op_typing.REAL_NUMERIC) arctanh_op = create_unary_op(name="arctanh", type_rule=op_typing.REAL_NUMERIC) +arctan2_op = create_binary_op(name="arctan2", type_rule=op_typing.REAL_NUMERIC) ## Numeric Ops +floor_op = create_unary_op(name="floor", type_rule=op_typing.REAL_NUMERIC) +ceil_op = create_unary_op(name="ceil", type_rule=op_typing.REAL_NUMERIC) abs_op = create_unary_op(name="abs", type_rule=op_typing.INPUT_TYPE) exp_op = create_unary_op(name="exp", type_rule=op_typing.REAL_NUMERIC) +expm1_op = create_unary_op(name="expm1", type_rule=op_typing.REAL_NUMERIC) ln_op = create_unary_op(name="log", type_rule=op_typing.REAL_NUMERIC) log10_op = create_unary_op(name="log10", type_rule=op_typing.REAL_NUMERIC) +log1p_op = create_unary_op(name="log1p", type_rule=op_typing.REAL_NUMERIC) sqrt_op = create_unary_op(name="sqrt", type_rule=op_typing.REAL_NUMERIC) @@ -540,6 +545,10 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT np.log10: log10_op, np.sqrt: sqrt_op, np.abs: abs_op, + np.floor: floor_op, + np.ceil: ceil_op, + np.log1p: log1p_op, + np.expm1: expm1_op, } @@ -549,4 +558,5 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT np.multiply: mul_op, np.divide: div_op, np.power: pow_op, + np.arctan2: arctan2_op, } diff --git a/tests/system/small/test_numpy.py b/tests/system/small/test_numpy.py index 5c2a93ec39..8e349e472a 100644 --- a/tests/system/small/test_numpy.py +++ b/tests/system/small/test_numpy.py @@ -56,6 +56,10 @@ def test_series_ufuncs(floats_pd, floats_bf, opname): ("log10",), ("sqrt",), ("abs",), + ("floor",), + ("ceil",), + ("expm1",), + ("log1p",), ], ) def test_df_ufuncs(scalars_dfs, opname): @@ -77,6 +81,7 @@ def test_df_ufuncs(scalars_dfs, opname): ("multiply",), ("divide",), ("power",), + ("arctan2",), ], ) def test_series_binary_ufuncs(floats_product_pd, floats_product_bf, opname): @@ -112,6 +117,23 @@ def test_df_binary_ufuncs(scalars_dfs, 
opname): pd.testing.assert_frame_equal(bf_result, pd_result) +@pytest.mark.parametrize( + ("x", "y"), + [ + ("int64_col", "int64_col"), + ("float64_col", "int64_col"), + ], +) +def test_series_atan2(scalars_dfs, x, y): + # Test atan2 separately as pandas errors when passing entire df as input, so pass only series + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = np.arctan2(scalars_df[x], scalars_df[y]).to_pandas() + pd_result = np.arctan2(scalars_pandas_df[x], scalars_pandas_df[y]) + + pd.testing.assert_series_equal(bf_result, pd_result) + + def test_series_binary_ufuncs_reverse(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs From 5e28ebd1ba3a5559e093c2ea676c0714c1434ba9 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Tue, 26 Mar 2024 12:06:47 -0700 Subject: [PATCH 04/53] feat: add `DataFrame.eval` and `DataFrame.query` (#361) * feat: add DataFrame.eval, DataFrame.query * address pr comments * add docstring, disable new tests for legacy pandas * vendor the pandas eval implementation * amend eval docstring * fix doctest expectation * amend doctest * pr comments * Fix doctest for eval --- bigframes/core/eval.py | 71 ++ bigframes/dataframe.py | 11 + tests/system/small/test_dataframe.py | 38 + .../bigframes_vendored/pandas/core/common.py | 26 + .../pandas/core/computation/align.py | 226 +++++ .../pandas/core/computation/common.py | 48 + .../pandas/core/computation/engines.py | 94 ++ .../pandas/core/computation/eval.py | 368 ++++++++ .../pandas/core/computation/expr.py | 828 ++++++++++++++++++ .../pandas/core/computation/ops.py | 605 +++++++++++++ .../pandas/core/computation/parsing.py | 196 +++++ .../pandas/core/computation/scope.py | 355 ++++++++ .../pandas/core/dtypes/inference.py | 31 + .../bigframes_vendored/pandas/core/frame.py | 153 ++++ .../pandas/util/_exceptions.py | 29 + .../pandas/util/_validators.py | 58 ++ 16 files changed, 3137 insertions(+) create mode 100644 bigframes/core/eval.py create mode 100644 third_party/bigframes_vendored/pandas/core/computation/align.py create mode 100644 third_party/bigframes_vendored/pandas/core/computation/common.py create mode 100644 third_party/bigframes_vendored/pandas/core/computation/engines.py create mode 100644 third_party/bigframes_vendored/pandas/core/computation/eval.py create mode 100644 third_party/bigframes_vendored/pandas/core/computation/expr.py create mode 100644 third_party/bigframes_vendored/pandas/core/computation/ops.py create mode 100644 third_party/bigframes_vendored/pandas/core/computation/parsing.py create mode 100644 third_party/bigframes_vendored/pandas/core/computation/scope.py create mode 100644 third_party/bigframes_vendored/pandas/core/dtypes/inference.py create mode 100644 third_party/bigframes_vendored/pandas/util/_exceptions.py create mode 100644 third_party/bigframes_vendored/pandas/util/_validators.py diff --git a/bigframes/core/eval.py b/bigframes/core/eval.py new file mode 100644 index 0000000000..692ca1c7bb --- /dev/null +++ b/bigframes/core/eval.py @@ -0,0 +1,71 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import dataclasses +from typing import Optional + +import bigframes_vendored.pandas.core.computation.eval as vendored_pandas_eval +import bigframes_vendored.pandas.core.computation.parsing as vendored_pandas_eval_parsing + +import bigframes.dataframe as dataframe +import bigframes.dtypes +import bigframes.series as series + + +def eval(df: dataframe.DataFrame, expr: str, target: Optional[dataframe.DataFrame]): + """ + Evaluate the given python expression + + Args: + df (DataFrame): + Columns of this dataframe will be used to resolve variables in expression. + expr (str): + One or more python expression to evaluate. + target (DataFrame or None): + The evaluation result will be written to the target if provided. + + Returns: + Result of evaluation. + """ + index_resolver = { + vendored_pandas_eval_parsing.clean_column_name(str(name)): EvalSeries( + df.index.get_level_values(level).to_series() + ) + for level, name in enumerate(df.index.names) + } + column_resolver = { + vendored_pandas_eval_parsing.clean_column_name(str(name)): EvalSeries(series) + for name, series in df.items() + } + # 3 Levels: user -> logging wrapper -> dataframe -> eval helper (this) + return vendored_pandas_eval.eval( + expr=expr, level=3, target=target, resolvers=(index_resolver, column_resolver) # type: ignore + ) + + +@dataclasses.dataclass +class FakeNumpyArray: + dtype: bigframes.dtypes.Dtype + + +class EvalSeries(series.Series): + """Slight modified series that works better with pandas.eval""" + + def __init__(self, underlying: series.Series): + super().__init__(data=underlying._block) + + @property + def values(self): + """Returns fake numpy array with only dtype property so that eval can determine schema without actually downloading the data.""" + return FakeNumpyArray(self.dtype) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 07dae2c53b..7e82ba125c 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -1493,6 +1493,17 @@ def sort_values( ) return DataFrame(self._block.order_by(ordering)) + def eval(self, expr: str) -> DataFrame: + import bigframes.core.eval as bf_eval + + return bf_eval.eval(self, expr, target=self) + + def query(self, expr: str) -> DataFrame: + import bigframes.core.eval as bf_eval + + eval_result = bf_eval.eval(self, expr, target=None) + return self[eval_result] + def value_counts( self, subset: typing.Union[blocks.Label, typing.Sequence[blocks.Label]] = None, diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 99ee6680fa..e58a666709 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -3822,6 +3822,44 @@ def test_df_to_orc(scalars_df_index, scalars_pandas_df_index): assert bf_result == pd_result +@skip_legacy_pandas +@pytest.mark.parametrize( + ("expr",), + [ + ("new_col = int64_col + int64_too",), + ("new_col = (rowindex > 3) | bool_col",), + ("int64_too = bool_col\nnew_col2 = rowindex",), + ], +) +def test_df_eval(scalars_dfs, expr): + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = scalars_df.eval(expr).to_pandas() + pd_result = scalars_pandas_df.eval(expr) + + pd.testing.assert_frame_equal(bf_result, pd_result) + + +@skip_legacy_pandas +@pytest.mark.parametrize( + ("expr",), + [ + ("int64_col > int64_too",), + ("bool_col",), + ("((int64_col - int64_too) % @local_var) == 0",), + ], +) +def test_df_query(scalars_dfs, expr): + # local_var is referenced in 
expressions + local_var = 3 # NOQA + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = scalars_df.query(expr).to_pandas() + pd_result = scalars_pandas_df.query(expr) + + pd.testing.assert_frame_equal(bf_result, pd_result) + + @pytest.mark.parametrize( ("subset", "normalize", "ascending", "dropna"), [ diff --git a/third_party/bigframes_vendored/pandas/core/common.py b/third_party/bigframes_vendored/pandas/core/common.py index ded5a22b8f..872a64db6c 100644 --- a/third_party/bigframes_vendored/pandas/core/common.py +++ b/third_party/bigframes_vendored/pandas/core/common.py @@ -3,6 +3,8 @@ from typing import Callable, TYPE_CHECKING +from bigframes_vendored.pandas.core.dtypes.inference import iterable_not_string + if TYPE_CHECKING: from bigframes_vendored.pandas.pandas._typing import T @@ -40,3 +42,27 @@ def pipe( return func(*args, **kwargs) else: return func(obj, *args, **kwargs) + + +def flatten(line): + """ + Flatten an arbitrarily nested sequence. + + Parameters + ---------- + line : sequence + The non string sequence to flatten + + Notes + ----- + This doesn't consider strings sequences. + + Returns + ------- + flattened : generator + """ + for element in line: + if iterable_not_string(element): + yield from flatten(element) + else: + yield element diff --git a/third_party/bigframes_vendored/pandas/core/computation/align.py b/third_party/bigframes_vendored/pandas/core/computation/align.py new file mode 100644 index 0000000000..2608dabe7a --- /dev/null +++ b/third_party/bigframes_vendored/pandas/core/computation/align.py @@ -0,0 +1,226 @@ +# Contains code from https://github.com/pandas-dev/pandas/blob/main/pandas/core/computation/align.py +""" +Core eval alignment algorithms. +""" +from __future__ import annotations + +from functools import partial, wraps +from typing import Callable, TYPE_CHECKING +import warnings + +import bigframes_vendored.pandas.core.common as com +from bigframes_vendored.pandas.core.computation.common import result_type_many +from bigframes_vendored.pandas.util._exceptions import find_stack_level +import numpy as np +from pandas.errors import PerformanceWarning + +if TYPE_CHECKING: + from collections.abc import Sequence + + from bigframes_vendored.pandas.core.generic import NDFrame + from bigframes_vendored.pandas.core.indexes.base import Index + from pandas._typing import F + + +def _align_core_single_unary_op( + term, +) -> tuple[partial | type[NDFrame], dict[str, Index] | None]: + typ: partial | type[NDFrame] + axes: dict[str, Index] | None = None + + if isinstance(term.value, np.ndarray): + typ = partial(np.asanyarray, dtype=term.value.dtype) + else: + typ = type(term.value) + if hasattr(term.value, "axes"): + axes = _zip_axes_from_type(typ, term.value.axes) + + return typ, axes + + +def _zip_axes_from_type( + typ: type[NDFrame], new_axes: Sequence[Index] +) -> dict[str, Index]: + return {name: new_axes[i] for i, name in enumerate(typ._AXIS_ORDERS)} + + +def _any_pandas_objects(terms) -> bool: + """ + Check a sequence of terms for instances of PandasObject. 
+ """ + return any(is_pandas_object(term.value) for term in terms) + + +def _filter_special_cases(f) -> Callable[[F], F]: + @wraps(f) + def wrapper(terms): + # single unary operand + if len(terms) == 1: + return _align_core_single_unary_op(terms[0]) + + term_values = (term.value for term in terms) + + # we don't have any pandas objects + if not _any_pandas_objects(terms): + return result_type_many(*term_values), None + + return f(terms) + + return wrapper + + +@_filter_special_cases +def _align_core(terms): + term_index = [i for i, term in enumerate(terms) if hasattr(term.value, "axes")] + term_dims = [terms[i].value.ndim for i in term_index] + + from pandas import Series + + ndims = Series(dict(zip(term_index, term_dims))) + + # initial axes are the axes of the largest-axis'd term + biggest = terms[ndims.idxmax()].value + typ = biggest._constructor + axes = biggest.axes + naxes = len(axes) + gt_than_one_axis = naxes > 1 + + for value in (terms[i].value for i in term_index): + value_is_series = is_series(value) + is_series_and_gt_one_axis = value_is_series and gt_than_one_axis + + for axis, items in enumerate(value.axes): + if is_series_and_gt_one_axis: + ax, itm = naxes - 1, value.index + else: + ax, itm = axis, items + + if not axes[ax].is_(itm): + axes[ax] = axes[ax].join(itm, how="outer") + + for i, ndim in ndims.items(): + for axis, items in zip(range(ndim), axes): + ti = terms[i].value + + if hasattr(ti, "reindex"): + transpose = value_is_series(ti) and naxes > 1 + reindexer = axes[naxes - 1] if transpose else items + + term_axis_size = len(ti.axes[axis]) + reindexer_size = len(reindexer) + + ordm = np.log10(max(1, abs(reindexer_size - term_axis_size))) + if ordm >= 1 and reindexer_size >= 10000: + w = ( + f"Alignment difference on axis {axis} is larger " + f"than an order of magnitude on term {repr(terms[i].name)}, " + f"by more than {ordm:.4g}; performance may suffer." + ) + warnings.warn( + w, category=PerformanceWarning, stacklevel=find_stack_level() + ) + + obj = ti.reindex(reindexer, axis=axis, copy=False) + terms[i].update(obj) + + terms[i].update(terms[i].value.values) + + return typ, _zip_axes_from_type(typ, axes) + + +def align_terms(terms): + """ + Align a set of terms. + """ + try: + # flatten the parse tree (a nested list, really) + terms = list(com.flatten(terms)) + except TypeError: + # can't iterate so it must just be a constant or single variable + if is_series_or_dataframe(terms.value): + typ = type(terms.value) + return typ, _zip_axes_from_type(typ, terms.value.axes) + return np.result_type(terms.type), None + + # if all resolved variables are numeric scalars + if all(term.is_scalar for term in terms): + return result_type_many(*(term.value for term in terms)).type, None + + # perform the main alignment + typ, axes = _align_core(terms) + return typ, axes + + +def reconstruct_object(typ, obj, axes, dtype): + """ + Reconstruct an object given its type, raw value, and possibly empty + (None) axes. + + Parameters + ---------- + typ : object + A type + obj : object + The value to use in the type constructor + axes : dict + The axes to use to construct the resulting pandas object + + Returns + ------- + ret : typ + An object of type ``typ`` with the value `obj` and possible axes + `axes`. 
+ """ + try: + typ = typ.type + except AttributeError: + pass + + res_t = np.result_type(obj.dtype, dtype) + + if not isinstance(typ, partial) and is_pandas_type(typ): + return typ(obj, dtype=res_t, **axes) + + # special case for pathological things like ~True/~False + if hasattr(res_t, "type") and typ == np.bool_ and res_t != np.bool_: + ret_value = res_t.type(obj) + else: + ret_value = typ(obj).astype(res_t) + # The condition is to distinguish 0-dim array (returned in case of + # scalar) and 1 element array + # e.g. np.array(0) and np.array([0]) + if ( + len(obj.shape) == 1 + and len(obj) == 1 + and not isinstance(ret_value, np.ndarray) + ): + ret_value = np.array([ret_value]).astype(res_t) + + return ret_value + + +# Custom to recognize BigFrames types +def is_series(obj) -> bool: + from bigframes_vendored.pandas.core.series import Series + + return isinstance(obj, Series) + + +def is_series_or_dataframe(obj) -> bool: + from bigframes_vendored.pandas.core.frame import NDFrame + + return isinstance(obj, NDFrame) + + +def is_pandas_object(obj) -> bool: + from bigframes_vendored.pandas.core.frame import NDFrame + from bigframes_vendored.pandas.core.indexes.base import Index + + return isinstance(obj, NDFrame) or isinstance(obj, Index) + + +def is_pandas_type(type) -> bool: + from bigframes_vendored.pandas.core.frame import NDFrame + from bigframes_vendored.pandas.core.indexes.base import Index + + return issubclass(type, NDFrame) or issubclass(type, Index) diff --git a/third_party/bigframes_vendored/pandas/core/computation/common.py b/third_party/bigframes_vendored/pandas/core/computation/common.py new file mode 100644 index 0000000000..7775489d0d --- /dev/null +++ b/third_party/bigframes_vendored/pandas/core/computation/common.py @@ -0,0 +1,48 @@ +# Contains code from https://github.com/pandas-dev/pandas/blob/main/pandas/core/computation/common.py +from __future__ import annotations + +from functools import reduce + +import numpy as np +from pandas._config import get_option + + +def ensure_decoded(s) -> str: + """ + If we have bytes, decode them to unicode. + """ + if isinstance(s, (np.bytes_, bytes)): + s = s.decode(get_option("display.encoding")) + return s + + +def result_type_many(*arrays_and_dtypes): + """ + Wrapper around numpy.result_type which overcomes the NPY_MAXARGS (32) + argument limit. 
+ """ + try: + return np.result_type(*arrays_and_dtypes) + except ValueError: + # we have > NPY_MAXARGS terms in our expression + return reduce(np.result_type, arrays_and_dtypes) + except TypeError: + from pandas.core.dtypes.cast import find_common_type + from pandas.core.dtypes.common import is_extension_array_dtype + + arr_and_dtypes = list(arrays_and_dtypes) + ea_dtypes, non_ea_dtypes = [], [] + for arr_or_dtype in arr_and_dtypes: + if is_extension_array_dtype(arr_or_dtype): + ea_dtypes.append(arr_or_dtype) + else: + non_ea_dtypes.append(arr_or_dtype) + + if non_ea_dtypes: + try: + np_dtype = np.result_type(*non_ea_dtypes) + except ValueError: + np_dtype = reduce(np.result_type, arrays_and_dtypes) + return find_common_type(ea_dtypes + [np_dtype]) + + return find_common_type(ea_dtypes) diff --git a/third_party/bigframes_vendored/pandas/core/computation/engines.py b/third_party/bigframes_vendored/pandas/core/computation/engines.py new file mode 100644 index 0000000000..15fd48b237 --- /dev/null +++ b/third_party/bigframes_vendored/pandas/core/computation/engines.py @@ -0,0 +1,94 @@ +# Contains code from https://github.com/pandas-dev/pandas/blob/main/pandas/core/computation/engines.py +""" +Engine classes for :func:`~pandas.eval` +""" +from __future__ import annotations + +import abc + +from bigframes_vendored.pandas.core.computation.align import ( + align_terms, + reconstruct_object, +) +from pandas.io.formats import printing + + +class AbstractEngine(metaclass=abc.ABCMeta): + """Object serving as a base class for all engines.""" + + has_neg_frac = False + + def __init__(self, expr) -> None: + self.expr = expr + self.aligned_axes = None + self.result_type = None + + def convert(self) -> str: + """ + Convert an expression for evaluation. + + Defaults to return the expression as a string. + """ + return printing.pprint_thing(self.expr) + + def evaluate(self) -> object: + """ + Run the engine on the expression. + + This method performs alignment which is necessary no matter what engine + is being used, thus its implementation is in the base class. + + Returns + ------- + object + The result of the passed expression. + """ + if not self._is_aligned: + self.result_type, self.aligned_axes = align_terms(self.expr.terms) + + # make sure no names in resolvers and locals/globals clash + res = self._evaluate() + return reconstruct_object( + self.result_type, res, self.aligned_axes, self.expr.terms.return_type + ) + + @property + def _is_aligned(self) -> bool: + return self.aligned_axes is not None and self.result_type is not None + + @abc.abstractmethod + def _evaluate(self): + """ + Return an evaluated expression. + + Parameters + ---------- + env : Scope + The local and global environment in which to evaluate an + expression. + + Notes + ----- + Must be implemented by subclasses. + """ + + +class PythonEngine(AbstractEngine): + """ + Evaluate an expression in Python space. + + Mostly for testing purposes. 
+ """ + + has_neg_frac = False + + def evaluate(self): + return self.expr() + + def _evaluate(self) -> None: + pass + + +ENGINES: dict[str, type[AbstractEngine]] = { + "python": PythonEngine, +} diff --git a/third_party/bigframes_vendored/pandas/core/computation/eval.py b/third_party/bigframes_vendored/pandas/core/computation/eval.py new file mode 100644 index 0000000000..56d60174a6 --- /dev/null +++ b/third_party/bigframes_vendored/pandas/core/computation/eval.py @@ -0,0 +1,368 @@ +# Contains code from https://github.com/pandas-dev/pandas/blob/main/pandas/core/computation/eval.py +""" +Top level ``eval`` module. +""" +from __future__ import annotations + +import tokenize +from typing import TYPE_CHECKING +import warnings + +from bigframes_vendored.pandas.core.computation.engines import ENGINES +from bigframes_vendored.pandas.core.computation.expr import Expr, PARSERS +from bigframes_vendored.pandas.core.computation.parsing import tokenize_string +from bigframes_vendored.pandas.core.computation.scope import ensure_scope +from bigframes_vendored.pandas.core.generic import NDFrame +from bigframes_vendored.pandas.util._validators import validate_bool_kwarg +from pandas.io.formats.printing import pprint_thing + +if TYPE_CHECKING: + from pandas.core.computation.ops import BinOp + + +def _check_engine(engine: str | None) -> str: + """ + Make sure a valid engine is passed. + + Parameters + ---------- + engine : str + String to validate. + + Raises + ------ + KeyError + * If an invalid engine is passed. + + Returns + ------- + str + Engine name. + """ + + if engine is None: + engine = "python" + + if engine not in ENGINES: + valid_engines = list(ENGINES.keys()) + raise KeyError( + f"Invalid engine '{engine}' passed, valid engines are {valid_engines}" + ) + + return engine + + +def _check_parser(parser: str): + """ + Make sure a valid parser is passed. + + Parameters + ---------- + parser : str + + Raises + ------ + KeyError + * If an invalid parser is passed + """ + if parser not in PARSERS: + raise KeyError( + f"Invalid parser '{parser}' passed, valid parsers are {PARSERS.keys()}" + ) + + +def _check_resolvers(resolvers): + if resolvers is not None: + for resolver in resolvers: + if not hasattr(resolver, "__getitem__"): + name = type(resolver).__name__ + raise TypeError( + f"Resolver of type '{name}' does not " + "implement the __getitem__ method" + ) + + +def _check_expression(expr): + """ + Make sure an expression is not an empty string + + Parameters + ---------- + expr : object + An object that can be converted to a string + + Raises + ------ + ValueError + * If expr is an empty string + """ + if not expr: + raise ValueError("expr cannot be an empty string") + + +def _convert_expression(expr) -> str: + """ + Convert an object to an expression. + + This function converts an object to an expression (a unicode string) and + checks to make sure it isn't empty after conversion. This is used to + convert operators to their string representation for recursive calls to + :func:`~pandas.eval`. + + Parameters + ---------- + expr : object + The object to be converted to a string. + + Returns + ------- + str + The string representation of an object. + + Raises + ------ + ValueError + * If the expression is empty. 
+ """ + s = pprint_thing(expr) + _check_expression(s) + return s + + +def _check_for_locals(expr: str, stack_level: int, parser: str): + at_top_of_stack = stack_level == 0 + not_pandas_parser = parser != "pandas" + + if not_pandas_parser: + msg = "The '@' prefix is only supported by the pandas parser" + elif at_top_of_stack: + msg = ( + "The '@' prefix is not allowed in top-level eval calls.\n" + "please refer to your variables by name without the '@' prefix." + ) + + if at_top_of_stack or not_pandas_parser: + for toknum, tokval in tokenize_string(expr): + if toknum == tokenize.OP and tokval == "@": + raise SyntaxError(msg) + + +def eval( + expr: str | BinOp, # we leave BinOp out of the docstr bc it isn't for users + parser: str = "pandas", + engine: str | None = None, + local_dict=None, + global_dict=None, + resolvers=(), + level: int = 0, + target=None, + inplace: bool = False, +): + """ + Evaluate a Python expression as a string using various backends. + + The following arithmetic operations are supported: ``+``, ``-``, ``*``, + ``/``, ``**``, ``%``, ``//`` (python engine only) along with the following + boolean operations: ``|`` (or), ``&`` (and), and ``~`` (not). + Additionally, the ``'pandas'`` parser allows the use of :keyword:`and`, + :keyword:`or`, and :keyword:`not` with the same semantics as the + corresponding bitwise operators. :class:`~pandas.Series` and + :class:`~pandas.DataFrame` objects are supported and behave as they would + with plain ol' Python evaluation. + + **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({"animal": ["dog", "pig"], "age": [10, 20]}) + >>> df + animal age + 0 dog 10 + 1 pig 20 + + [2 rows x 2 columns] + + We can add a new column using ``pd.eval``: + + >>> df.eval("double_age = age * 2") + animal age double_age + 0 dog 10 20 + 1 pig 20 40 + + [2 rows x 3 columns] + + Args: + expr (str): + The expression to evaluate. This string cannot contain any Python + `statements + `__, + only Python `expressions + `__. + parser ({'pandas', 'python'}, default 'pandas'): + The parser to use to construct the syntax tree from the expression. The + default of ``'pandas'`` parses code slightly different than standard + Python. Alternatively, you can parse an expression using the + ``'python'`` parser to retain strict Python semantics. See the + :ref:`enhancing performance ` documentation for + more details. + engine ({'python'}, default None): + + The engine used to evaluate the expression. Supported engines are + + - None : defaults to ``python`` + - ``'python'`` : Performs operations as if you had ``eval``'d in top + level python. This engine is generally not that useful. + + More backends may be available in the future. + local_dict (dict or None, optional): + A dictionary of local variables, taken from locals() by default. + global_dict (dict or None, optional): + A dictionary of global variables, taken from globals() by default. + resolvers (list of dict-like or None, optional): + A list of objects implementing the ``__getitem__`` special method that + you can use to inject an additional collection of namespaces to use for + variable lookup. For example, this is used in the + :meth:`~DataFrame.query` method to inject the + ``DataFrame.index`` and ``DataFrame.columns`` + variables that refer to their respective :class:`~pandas.DataFrame` + instance attributes. + level (int, optional): + The number of prior stack frames to traverse and add to the current + scope. 
Most users will **not** need to change this parameter. + target (object, optional, default None): + This is the target object for assignment. It is used when there is + variable assignment in the expression. If so, then `target` must + support item assignment with string keys, and if a copy is being + returned, it must also support `.copy()`. + inplace (bool, default False): + If `target` is provided, and the expression mutates `target`, whether + to modify `target` inplace. Otherwise, return a copy of `target` with + the mutation. + + Returns: + ndarray, numeric scalar, DataFrame, Series, or None: + The completion value of evaluating the given code or None if ``inplace=True``. + + Raises: + ValueError: + There are many instances where such an error can be raised: + + - `target=None`, but the expression is multiline. + - The expression is multiline, but not all them have item assignment. + An example of such an arrangement is this: + + a = b + 1 + a + 2 + + Here, there are expressions on different lines, making it multiline, + but the last line has no variable assigned to the output of `a + 2`. + - `inplace=True`, but the expression is missing item assignment. + - Item assignment is provided, but the `target` does not support + string item assignment. + - Item assignment is provided and `inplace=False`, but the `target` + does not support the `.copy()` method + + """ + inplace = validate_bool_kwarg(inplace, "inplace") + + exprs: list[str | BinOp] + if isinstance(expr, str): + _check_expression(expr) + exprs = [e.strip() for e in expr.splitlines() if e.strip() != ""] + else: + # ops.BinOp; for internal compat, not intended to be passed by users + exprs = [expr] + multi_line = len(exprs) > 1 + + if multi_line and target is None: + raise ValueError( + "multi-line expressions are only valid in the " + "context of data, use DataFrame.eval" + ) + engine = _check_engine(engine) + _check_parser(parser) + _check_resolvers(resolvers) + + ret = None + first_expr = True + target_modified = False + + for expr in exprs: + expr = _convert_expression(expr) + _check_for_locals(expr, level, parser) + + # get our (possibly passed-in) scope + env = ensure_scope( + level + 1, + global_dict=global_dict, + local_dict=local_dict, + resolvers=resolvers, + target=target, + ) + + parsed_expr = Expr(expr, engine=engine, parser=parser, env=env) + + # construct the engine and evaluate the parsed expression + eng = ENGINES[engine] + eng_inst = eng(parsed_expr) + ret = eng_inst.evaluate() + + if parsed_expr.assigner is None: + if multi_line: + raise ValueError( + "Multi-line expressions are only valid " + "if all expressions contain an assignment" + ) + if inplace: + raise ValueError("Cannot operate inplace if there is no assignment") + + # assign if needed + assigner = parsed_expr.assigner + if env.target is not None and assigner is not None: + target_modified = True + + # if returning a copy, copy only on the first assignment + if not inplace and first_expr: + try: + target = env.target + if isinstance(target, NDFrame): + target = target.copy() + except AttributeError as err: + raise ValueError("Cannot return a copy of the target") from err + else: + target = env.target + + # TypeError is most commonly raised (e.g. int, list), but you + # get IndexError if you try to do this assignment on np.ndarray. + # we will ignore numpy warnings here; e.g. if trying + # to use a non-numeric indexer + try: + with warnings.catch_warnings(record=True): + # TODO: Filter the warnings we actually care about here. 
+ if inplace and isinstance(target, NDFrame): + target.loc[:, assigner] = ret + else: + target[ # pyright: ignore[reportGeneralTypeIssues] + assigner + ] = ret + except (TypeError, IndexError) as err: + raise ValueError("Cannot assign expression output to target") from err + + if not resolvers: + resolvers = ({assigner: ret},) + else: + # existing resolver needs updated to handle + # case of mutating existing column in copy + for resolver in resolvers: + if assigner in resolver: + resolver[assigner] = ret + break + else: + resolvers += ({assigner: ret},) + + ret = None + first_expr = False + + # We want to exclude `inplace=None` as being False. + if inplace is False: + return target if target_modified else ret diff --git a/third_party/bigframes_vendored/pandas/core/computation/expr.py b/third_party/bigframes_vendored/pandas/core/computation/expr.py new file mode 100644 index 0000000000..44f649e59d --- /dev/null +++ b/third_party/bigframes_vendored/pandas/core/computation/expr.py @@ -0,0 +1,828 @@ +# Contains code from https://github.com/pandas-dev/pandas/blob/main/pandas/core/computation/expr.py +""" +:func:`~pandas.eval` parsers. +""" +from __future__ import annotations + +import ast +from functools import partial, reduce +from keyword import iskeyword +import tokenize +from typing import Callable, TypeVar + +import bigframes_vendored.pandas.core.common as com +from bigframes_vendored.pandas.core.computation.ops import ( + ARITH_OPS_SYMS, + BinOp, + BOOL_OPS_SYMS, + CMP_OPS_SYMS, + Constant, + Div, + FuncNode, + is_term, + LOCAL_TAG, + Op, + Term, + UNARY_OPS_SYMS, + UnaryOp, +) +from bigframes_vendored.pandas.core.computation.parsing import ( + clean_backtick_quoted_toks, + tokenize_string, +) +from bigframes_vendored.pandas.core.computation.scope import Scope +import numpy as np +from pandas.errors import UndefinedVariableError +from pandas.io.formats import printing + + +def _rewrite_assign(tok: tuple[int, str]) -> tuple[int, str]: + """ + Rewrite the assignment operator for PyTables expressions that use ``=`` + as a substitute for ``==``. + + Parameters + ---------- + tok : tuple of int, str + ints correspond to the all caps constants in the tokenize module + + Returns + ------- + tuple of int, str + Either the input or token or the replacement values + """ + toknum, tokval = tok + return toknum, "==" if tokval == "=" else tokval + + +def _replace_booleans(tok: tuple[int, str]) -> tuple[int, str]: + """ + Replace ``&`` with ``and`` and ``|`` with ``or`` so that bitwise + precedence is changed to boolean precedence. + + Parameters + ---------- + tok : tuple of int, str + ints correspond to the all caps constants in the tokenize module + + Returns + ------- + tuple of int, str + Either the input or token or the replacement values + """ + toknum, tokval = tok + if toknum == tokenize.OP: + if tokval == "&": + return tokenize.NAME, "and" + elif tokval == "|": + return tokenize.NAME, "or" + return toknum, tokval + return toknum, tokval + + +def _replace_locals(tok: tuple[int, str]) -> tuple[int, str]: + """ + Replace local variables with a syntactically valid name. 
+ + Parameters + ---------- + tok : tuple of int, str + ints correspond to the all caps constants in the tokenize module + + Returns + ------- + tuple of int, str + Either the input or token or the replacement values + + Notes + ----- + This is somewhat of a hack in that we rewrite a string such as ``'@a'`` as + ``'__pd_eval_local_a'`` by telling the tokenizer that ``__pd_eval_local_`` + is a ``tokenize.OP`` and to replace the ``'@'`` symbol with it. + """ + toknum, tokval = tok + if toknum == tokenize.OP and tokval == "@": + return tokenize.OP, LOCAL_TAG + return toknum, tokval + + +def _compose2(f, g): + """ + Compose 2 callables. + """ + return lambda *args, **kwargs: f(g(*args, **kwargs)) + + +def _compose(*funcs): + """ + Compose 2 or more callables. + """ + assert len(funcs) > 1, "At least 2 callables must be passed to compose" + return reduce(_compose2, funcs) + + +def _preparse( + source: str, + f=_compose( + _replace_locals, _replace_booleans, _rewrite_assign, clean_backtick_quoted_toks + ), +) -> str: + """ + Compose a collection of tokenization functions. + + Parameters + ---------- + source : str + A Python source code string + f : callable + This takes a tuple of (toknum, tokval) as its argument and returns a + tuple with the same structure but possibly different elements. Defaults + to the composition of ``_rewrite_assign``, ``_replace_booleans``, and + ``_replace_locals``. + + Returns + ------- + str + Valid Python source code + + Notes + ----- + The `f` parameter can be any callable that takes *and* returns input of the + form ``(toknum, tokval)``, where ``toknum`` is one of the constants from + the ``tokenize`` module and ``tokval`` is a string. + """ + assert callable(f), "f must be callable" + return tokenize.untokenize(f(x) for x in tokenize_string(source)) + + +def _is_type(t): + """ + Factory for a type checking function of type ``t`` or tuple of types. + """ + return lambda x: isinstance(x.value, t) + + +_is_list = _is_type(list) +_is_str = _is_type(str) + + +# partition all AST nodes +_all_nodes = frozenset( + node + for node in (getattr(ast, name) for name in dir(ast)) + if isinstance(node, type) and issubclass(node, ast.AST) +) + + +def _filter_nodes(superclass, all_nodes=_all_nodes): + """ + Filter out AST nodes that are subclasses of ``superclass``. 
+ """ + node_names = (node.__name__ for node in all_nodes if issubclass(node, superclass)) + return frozenset(node_names) + + +_all_node_names = frozenset(x.__name__ for x in _all_nodes) +_mod_nodes = _filter_nodes(ast.mod) +_stmt_nodes = _filter_nodes(ast.stmt) +_expr_nodes = _filter_nodes(ast.expr) +_expr_context_nodes = _filter_nodes(ast.expr_context) +_boolop_nodes = _filter_nodes(ast.boolop) +_operator_nodes = _filter_nodes(ast.operator) +_unary_op_nodes = _filter_nodes(ast.unaryop) +_cmp_op_nodes = _filter_nodes(ast.cmpop) +_comprehension_nodes = _filter_nodes(ast.comprehension) +_handler_nodes = _filter_nodes(ast.excepthandler) +_arguments_nodes = _filter_nodes(ast.arguments) +_keyword_nodes = _filter_nodes(ast.keyword) +_alias_nodes = _filter_nodes(ast.alias) + + +# nodes that we don't support directly but are needed for parsing +_hacked_nodes = frozenset(["Assign", "Module", "Expr"]) + + +_unsupported_expr_nodes = frozenset( + [ + "Yield", + "GeneratorExp", + "IfExp", + "DictComp", + "SetComp", + "Repr", + "Lambda", + "Set", + "AST", + "Is", + "IsNot", + ] +) + +# these nodes are low priority or won't ever be supported (e.g., AST) +_unsupported_nodes = ( + _stmt_nodes + | _mod_nodes + | _handler_nodes + | _arguments_nodes + | _keyword_nodes + | _alias_nodes + | _expr_context_nodes + | _unsupported_expr_nodes +) - _hacked_nodes + +# we're adding a different assignment in some cases to be equality comparison +# and we don't want `stmt` and friends in their so get only the class whose +# names are capitalized +_base_supported_nodes = (_all_node_names - _unsupported_nodes) | _hacked_nodes +intersection = _unsupported_nodes & _base_supported_nodes +_msg = f"cannot both support and not support {intersection}" +assert not intersection, _msg + + +def _node_not_implemented(node_name: str) -> Callable[..., None]: + """ + Return a function that raises a NotImplementedError with a passed node name. + """ + + def f(self, *args, **kwargs): + raise NotImplementedError(f"'{node_name}' nodes are not implemented") + + return f + + +# should be bound by BaseExprVisitor but that creates a circular dependency: +# _T is used in disallow, but disallow is used to define BaseExprVisitor +# https://github.com/microsoft/pyright/issues/2315 +_T = TypeVar("_T") + + +def disallow(nodes: set[str]) -> Callable[[type[_T]], type[_T]]: + """ + Decorator to disallow certain nodes from parsing. Raises a + NotImplementedError instead. + + Returns + ------- + callable + """ + + def disallowed(cls: type[_T]) -> type[_T]: + # error: "Type[_T]" has no attribute "unsupported_nodes" + cls.unsupported_nodes = () # type: ignore[attr-defined] + for node in nodes: + new_method = _node_not_implemented(node) + name = f"visit_{node}" + # error: "Type[_T]" has no attribute "unsupported_nodes" + cls.unsupported_nodes += (name,) # type: ignore[attr-defined] + setattr(cls, name, new_method) + return cls + + return disallowed + + +def _op_maker(op_class, op_symbol): + """ + Return a function to create an op class with its symbol already passed. + + Returns + ------- + callable + """ + + def f(self, node, *args, **kwargs): + """ + Return a partial function with an Op subclass with an operator already passed. + + Returns + ------- + callable + """ + return partial(op_class, op_symbol, *args, **kwargs) + + return f + + +_op_classes = {"binary": BinOp, "unary": UnaryOp} + + +def add_ops(op_classes): + """ + Decorator to add default implementation of ops. 
+ """ + + def f(cls): + for op_attr_name, op_class in op_classes.items(): + ops = getattr(cls, f"{op_attr_name}_ops") + ops_map = getattr(cls, f"{op_attr_name}_op_nodes_map") + for op in ops: + op_node = ops_map[op] + if op_node is not None: + made_op = _op_maker(op_class, op) + setattr(cls, f"visit_{op_node}", made_op) + return cls + + return f + + +@disallow(_unsupported_nodes) +@add_ops(_op_classes) +class BaseExprVisitor(ast.NodeVisitor): + """ + Custom ast walker. Parsers of other engines should subclass this class + if necessary. + + Parameters + ---------- + env : Scope + engine : str + parser : str + preparser : callable + """ + + const_type: type[Term] = Constant + term_type = Term + + binary_ops = CMP_OPS_SYMS + BOOL_OPS_SYMS + ARITH_OPS_SYMS + binary_op_nodes = ( + "Gt", + "Lt", + "GtE", + "LtE", + "Eq", + "NotEq", + "In", + "NotIn", + "BitAnd", + "BitOr", + "And", + "Or", + "Add", + "Sub", + "Mult", + None, + "Pow", + "FloorDiv", + "Mod", + ) + binary_op_nodes_map = dict(zip(binary_ops, binary_op_nodes)) + + unary_ops = UNARY_OPS_SYMS + unary_op_nodes = "UAdd", "USub", "Invert", "Not" + unary_op_nodes_map = dict(zip(unary_ops, unary_op_nodes)) + + rewrite_map = { + ast.Eq: ast.In, + ast.NotEq: ast.NotIn, + ast.In: ast.In, + ast.NotIn: ast.NotIn, + } + + unsupported_nodes: tuple[str, ...] + + def __init__(self, env, engine, parser, preparser=_preparse) -> None: + self.env = env + self.engine = engine + self.parser = parser + self.preparser = preparser + self.assigner = None + + def visit(self, node, **kwargs): + if isinstance(node, str): + clean = self.preparser(node) + try: + node = ast.fix_missing_locations(ast.parse(clean)) + except SyntaxError as e: + if any(iskeyword(x) for x in clean.split()): + e.msg = "Python keyword not valid identifier in numexpr query" + raise e + + method = f"visit_{type(node).__name__}" + visitor = getattr(self, method) + return visitor(node, **kwargs) + + def visit_Module(self, node, **kwargs): + if len(node.body) != 1: + raise SyntaxError("only a single expression is allowed") + expr = node.body[0] + return self.visit(expr, **kwargs) + + def visit_Expr(self, node, **kwargs): + return self.visit(node.value, **kwargs) + + def _rewrite_membership_op(self, node, left, right): + # the kind of the operator (is actually an instance) + op_instance = node.op + op_type = type(op_instance) + + # must be two terms and the comparison operator must be ==/!=/in/not in + if is_term(left) and is_term(right) and op_type in self.rewrite_map: + left_list, right_list = map(_is_list, (left, right)) + left_str, right_str = map(_is_str, (left, right)) + + # if there are any strings or lists in the expression + if left_list or right_list or left_str or right_str: + op_instance = self.rewrite_map[op_type]() + + # pop the string variable out of locals and replace it with a list + # of one string, kind of a hack + if right_str: + name = self.env.add_tmp([right.value]) + right = self.term_type(name, self.env) + + if left_str: + name = self.env.add_tmp([left.value]) + left = self.term_type(name, self.env) + + op = self.visit(op_instance) + return op, op_instance, left, right + + def _maybe_transform_eq_ne(self, node, left=None, right=None): + if left is None: + left = self.visit(node.left, side="left") + if right is None: + right = self.visit(node.right, side="right") + op, op_class, left, right = self._rewrite_membership_op(node, left, right) + return op, op_class, left, right + + def _maybe_downcast_constants(self, left, right): + f32 = np.dtype(np.float32) + if ( + 
left.is_scalar + and hasattr(left, "value") + and not right.is_scalar + and right.return_type == f32 + ): + # right is a float32 array, left is a scalar + name = self.env.add_tmp(np.float32(left.value)) + left = self.term_type(name, self.env) + if ( + right.is_scalar + and hasattr(right, "value") + and not left.is_scalar + and left.return_type == f32 + ): + # left is a float32 array, right is a scalar + name = self.env.add_tmp(np.float32(right.value)) + right = self.term_type(name, self.env) + + return left, right + + def _maybe_eval(self, binop, eval_in_python): + # eval `in` and `not in` (for now) in "partial" python space + # things that can be evaluated in "eval" space will be turned into + # temporary variables. for example, + # [1,2] in a + 2 * b + # in that case a + 2 * b will be evaluated using numexpr, and the "in" + # call will be evaluated using isin (in python space) + return binop.evaluate( + self.env, self.engine, self.parser, self.term_type, eval_in_python + ) + + def _maybe_evaluate_binop( + self, + op, + op_class, + lhs, + rhs, + eval_in_python=("in", "not in"), + maybe_eval_in_python=("==", "!=", "<", ">", "<=", ">="), + ): + res = op(lhs, rhs) + + if res.has_invalid_return_type: + raise TypeError( + f"unsupported operand type(s) for {res.op}: " + f"'{lhs.type}' and '{rhs.type}'" + ) + + if self.engine != "pytables" and ( + res.op in CMP_OPS_SYMS + and getattr(lhs, "is_datetime", False) + or getattr(rhs, "is_datetime", False) + ): + # all date ops must be done in python bc numexpr doesn't work + # well with NaT + return self._maybe_eval(res, self.binary_ops) + + if res.op in eval_in_python: + # "in"/"not in" ops are always evaluated in python + return self._maybe_eval(res, eval_in_python) + elif self.engine != "pytables": + if ( + getattr(lhs, "return_type", None) == object + or getattr(rhs, "return_type", None) == object + ): + # evaluate "==" and "!=" in python if either of our operands + # has an object return type + return self._maybe_eval(res, eval_in_python + maybe_eval_in_python) + return res + + def visit_BinOp(self, node, **kwargs): + op, op_class, left, right = self._maybe_transform_eq_ne(node) + left, right = self._maybe_downcast_constants(left, right) + return self._maybe_evaluate_binop(op, op_class, left, right) + + def visit_Div(self, node, **kwargs): + return lambda lhs, rhs: Div(lhs, rhs) + + def visit_UnaryOp(self, node, **kwargs): + op = self.visit(node.op) + operand = self.visit(node.operand) + return op(operand) + + def visit_Name(self, node, **kwargs): + return self.term_type(node.id, self.env, **kwargs) + + # TODO(py314): deprecated since Python 3.8. Remove after Python 3.14 is min + def visit_NameConstant(self, node, **kwargs) -> Term: + return self.const_type(node.value, self.env) + + # TODO(py314): deprecated since Python 3.8. Remove after Python 3.14 is min + def visit_Num(self, node, **kwargs) -> Term: + return self.const_type(node.value, self.env) + + def visit_Constant(self, node, **kwargs) -> Term: + return self.const_type(node.value, self.env) + + # TODO(py314): deprecated since Python 3.8. 
Remove after Python 3.14 is min + def visit_Str(self, node, **kwargs): + name = self.env.add_tmp(node.s) + return self.term_type(name, self.env) + + def visit_List(self, node, **kwargs): + name = self.env.add_tmp([self.visit(e)(self.env) for e in node.elts]) + return self.term_type(name, self.env) + + visit_Tuple = visit_List + + def visit_Index(self, node, **kwargs): + """df.index[4]""" + return self.visit(node.value) + + def visit_Subscript(self, node, **kwargs): + from pandas import eval as pd_eval + + value = self.visit(node.value) + slobj = self.visit(node.slice) + result = pd_eval( + slobj, local_dict=self.env, engine=self.engine, parser=self.parser + ) + try: + # a Term instance + v = value.value[result] + except AttributeError: + # an Op instance + lhs = pd_eval( + value, local_dict=self.env, engine=self.engine, parser=self.parser + ) + v = lhs[result] + name = self.env.add_tmp(v) + return self.term_type(name, env=self.env) + + def visit_Slice(self, node, **kwargs): + """df.index[slice(4,6)]""" + lower = node.lower + if lower is not None: + lower = self.visit(lower).value + upper = node.upper + if upper is not None: + upper = self.visit(upper).value + step = node.step + if step is not None: + step = self.visit(step).value + + return slice(lower, upper, step) + + def visit_Assign(self, node, **kwargs): + """ + support a single assignment node, like + + c = a + b + + set the assigner at the top level, must be a Name node which + might or might not exist in the resolvers + + """ + if len(node.targets) != 1: + raise SyntaxError("can only assign a single expression") + if not isinstance(node.targets[0], ast.Name): + raise SyntaxError("left hand side of an assignment must be a single name") + if self.env.target is None: + raise ValueError("cannot assign without a target object") + + try: + assigner = self.visit(node.targets[0], **kwargs) + except UndefinedVariableError: + assigner = node.targets[0].id + + self.assigner = getattr(assigner, "name", assigner) + if self.assigner is None: + raise SyntaxError( + "left hand side of an assignment must be a single resolvable name" + ) + + return self.visit(node.value, **kwargs) + + def visit_Attribute(self, node, **kwargs): + attr = node.attr + value = node.value + + ctx = node.ctx + if isinstance(ctx, ast.Load): + # resolve the value + resolved = self.visit(value).value + try: + v = getattr(resolved, attr) + name = self.env.add_tmp(v) + return self.term_type(name, self.env) + except AttributeError: + # something like datetime.datetime where scope is overridden + if isinstance(value, ast.Name) and value.id == attr: + return resolved + raise + + raise ValueError(f"Invalid Attribute context {type(ctx).__name__}") + + def visit_Call(self, node, side=None, **kwargs): + if isinstance(node.func, ast.Attribute) and node.func.attr != "__call__": + res = self.visit_Attribute(node.func) + elif not isinstance(node.func, ast.Name): + raise TypeError("Only named functions are supported") + else: + try: + res = self.visit(node.func) + except UndefinedVariableError: + # Check if this is a supported function name + try: + res = FuncNode(node.func.id) + except ValueError: + # Raise original error + raise + + if res is None: + # error: "expr" has no attribute "id" + raise ValueError( + f"Invalid function call {node.func.id}" # type: ignore[attr-defined] + ) + if hasattr(res, "value"): + res = res.value + + if isinstance(res, FuncNode): + new_args = [self.visit(arg) for arg in node.args] + + if node.keywords: + raise TypeError( + f'Function "{res.name}" does not 
support keyword arguments' + ) + + return res(*new_args) + + else: + new_args = [self.visit(arg)(self.env) for arg in node.args] + + for key in node.keywords: + if not isinstance(key, ast.keyword): + # error: "expr" has no attribute "id" + raise ValueError( + "keyword error in function call " # type: ignore[attr-defined] + f"'{node.func.id}'" + ) + + if key.arg: + kwargs[key.arg] = self.visit(key.value)(self.env) + + name = self.env.add_tmp(res(*new_args, **kwargs)) + return self.term_type(name=name, env=self.env) + + def translate_In(self, op): + return op + + def visit_Compare(self, node, **kwargs): + ops = node.ops + comps = node.comparators + + # base case: we have something like a CMP b + if len(comps) == 1: + op = self.translate_In(ops[0]) + binop = ast.BinOp(op=op, left=node.left, right=comps[0]) + return self.visit(binop) + + # recursive case: we have a chained comparison, a CMP b CMP c, etc. + left = node.left + values = [] + for op, comp in zip(ops, comps): + new_node = self.visit( + ast.Compare(comparators=[comp], left=left, ops=[self.translate_In(op)]) + ) + left = comp + values.append(new_node) + return self.visit(ast.BoolOp(op=ast.And(), values=values)) + + def _try_visit_binop(self, bop): + if isinstance(bop, (Op, Term)): + return bop + return self.visit(bop) + + def visit_BoolOp(self, node, **kwargs): + def visitor(x, y): + lhs = self._try_visit_binop(x) + rhs = self._try_visit_binop(y) + + op, op_class, lhs, rhs = self._maybe_transform_eq_ne(node, lhs, rhs) + return self._maybe_evaluate_binop(op, node.op, lhs, rhs) + + operands = node.values + return reduce(visitor, operands) + + +_python_not_supported = frozenset(["Dict", "BoolOp", "In", "NotIn"]) + + +@disallow( + (_unsupported_nodes | _python_not_supported) + - (_boolop_nodes | frozenset(["BoolOp", "Attribute", "In", "NotIn", "Tuple"])) +) +class PandasExprVisitor(BaseExprVisitor): + def __init__( + self, + env, + engine, + parser, + preparser=partial( + _preparse, + f=_compose(_replace_locals, _replace_booleans, clean_backtick_quoted_toks), + ), + ) -> None: + super().__init__(env, engine, parser, preparser) + + +@disallow(_unsupported_nodes | _python_not_supported | frozenset(["Not"])) +class PythonExprVisitor(BaseExprVisitor): + def __init__( + self, env, engine, parser, preparser=lambda source, f=None: source + ) -> None: + super().__init__(env, engine, parser, preparser=preparser) + + +class Expr: + """ + Object encapsulating an expression. + + Parameters + ---------- + expr : str + engine : str, optional, default 'numexpr' + parser : str, optional, default 'pandas' + env : Scope, optional, default None + level : int, optional, default 2 + """ + + env: Scope + engine: str + parser: str + + def __init__( + self, + expr, + engine: str = "numexpr", + parser: str = "pandas", + env: Scope | None = None, + level: int = 0, + ) -> None: + self.expr = expr + self.env = env or Scope(level=level + 1) + self.engine = engine + self.parser = parser + self._visitor = PARSERS[parser](self.env, self.engine, self.parser) + self.terms = self.parse() + + @property + def assigner(self): + return getattr(self._visitor, "assigner", None) + + def __call__(self): + return self.terms(self.env) + + def __repr__(self) -> str: + return printing.pprint_thing(self.terms) + + def __len__(self) -> int: + return len(self.expr) + + def parse(self): + """ + Parse an expression. + """ + return self._visitor.visit(self.expr) + + @property + def names(self): + """ + Get the names in an expression. 
+ """ + if is_term(self.terms): + return frozenset([self.terms.name]) + return frozenset(term.name for term in com.flatten(self.terms)) + + +PARSERS = {"python": PythonExprVisitor, "pandas": PandasExprVisitor} diff --git a/third_party/bigframes_vendored/pandas/core/computation/ops.py b/third_party/bigframes_vendored/pandas/core/computation/ops.py new file mode 100644 index 0000000000..75b914c876 --- /dev/null +++ b/third_party/bigframes_vendored/pandas/core/computation/ops.py @@ -0,0 +1,605 @@ +# Contains code from https://github.com/pandas-dev/pandas/blob/main/pandas/core/computation/ops.py +""" +Operator classes for eval. +""" + +from __future__ import annotations + +from datetime import datetime +from functools import partial +import operator +from typing import Callable, Literal, TYPE_CHECKING + +import bigframes_vendored.pandas.core.common as com +from bigframes_vendored.pandas.core.computation.common import ( + ensure_decoded, + result_type_many, +) +from bigframes_vendored.pandas.core.computation.scope import DEFAULT_GLOBALS +import numpy as np +from pandas._libs.tslibs import Timestamp +from pandas.core.dtypes.common import is_list_like, is_scalar +from pandas.io.formats.printing import pprint_thing, pprint_thing_encoded + +if TYPE_CHECKING: + from collections.abc import Iterable, Iterator + +REDUCTIONS = ("sum", "prod", "min", "max") + +_unary_math_ops = ( + "sin", + "cos", + "exp", + "log", + "expm1", + "log1p", + "sqrt", + "sinh", + "cosh", + "tanh", + "arcsin", + "arccos", + "arctan", + "arccosh", + "arcsinh", + "arctanh", + "abs", + "log10", + "floor", + "ceil", +) +_binary_math_ops = ("arctan2",) + +MATHOPS = _unary_math_ops + _binary_math_ops + + +LOCAL_TAG = "__pd_eval_local_" + + +class Term: + def __new__(cls, name, env, side=None, encoding=None): + klass = Constant if not isinstance(name, str) else cls + # error: Argument 2 for "super" not an instance of argument 1 + supr_new = super(Term, klass).__new__ # type: ignore[misc] + return supr_new(klass) + + is_local: bool + + def __init__(self, name, env, side=None, encoding=None) -> None: + # name is a str for Term, but may be something else for subclasses + self._name = name + self.env = env + self.side = side + tname = str(name) + self.is_local = tname.startswith(LOCAL_TAG) or tname in DEFAULT_GLOBALS + self._value = self._resolve_name() + self.encoding = encoding + + @property + def local_name(self) -> str: + return self.name.replace(LOCAL_TAG, "") + + def __repr__(self) -> str: + return pprint_thing(self.name) + + def __call__(self, *args, **kwargs): + return self.value + + def evaluate(self, *args, **kwargs) -> Term: + return self + + def _resolve_name(self): + local_name = str(self.local_name) + is_local = self.is_local + if local_name in self.env.scope and isinstance( + self.env.scope[local_name], type + ): + is_local = False + + res = self.env.resolve(local_name, is_local=is_local) + self.update(res) + + if hasattr(res, "ndim") and res.ndim > 2: + raise NotImplementedError( + "N-dimensional objects, where N > 2, are not supported with eval" + ) + return res + + def update(self, value) -> None: + """ + search order for local (i.e., @variable) variables: + + scope, key_variable + [('locals', 'local_name'), + ('globals', 'local_name'), + ('locals', 'key'), + ('globals', 'key')] + """ + key = self.name + + # if it's a variable name (otherwise a constant) + if isinstance(key, str): + self.env.swapkey(self.local_name, key, new_value=value) + + self.value = value + + @property + def 
is_scalar(self) -> bool: + return is_scalar(self._value) + + @property + def type(self): + try: + # potentially very slow for large, mixed dtype frames + return self._value.values.dtype + except AttributeError: + try: + # ndarray + return self._value.dtype + except AttributeError: + # scalar + return type(self._value) + + return_type = type + + @property + def raw(self) -> str: + return f"{type(self).__name__}(name={repr(self.name)}, type={self.type})" + + @property + def is_datetime(self) -> bool: + try: + t = self.type.type + except AttributeError: + t = self.type + + return issubclass(t, (datetime, np.datetime64)) + + @property + def value(self): + return self._value + + @value.setter + def value(self, new_value) -> None: + self._value = new_value + + @property + def name(self): + return self._name + + @property + def ndim(self) -> int: + return self._value.ndim + + +class Constant(Term): + def _resolve_name(self): + return self._name + + @property + def name(self): + return self.value + + def __repr__(self) -> str: + # in python 2 str() of float + # can truncate shorter than repr() + return repr(self.name) + + +_bool_op_map = {"not": "~", "and": "&", "or": "|"} + + +class Op: + """ + Hold an operator of arbitrary arity. + """ + + op: str + + def __init__(self, op: str, operands: Iterable[Term | Op], encoding=None) -> None: + self.op = _bool_op_map.get(op, op) + self.operands = operands + self.encoding = encoding + + def __iter__(self) -> Iterator: + return iter(self.operands) + + def __repr__(self) -> str: + """ + Print a generic n-ary operator and its operands using infix notation. + """ + # recurse over the operands + parened = (f"({pprint_thing(opr)})" for opr in self.operands) + return pprint_thing(f" {self.op} ".join(parened)) + + @property + def return_type(self): + # clobber types to bool if the op is a boolean operator + if self.op in (CMP_OPS_SYMS + BOOL_OPS_SYMS): + return np.bool_ + return result_type_many(*(term.type for term in com.flatten(self))) + + @property + def has_invalid_return_type(self) -> bool: + types = self.operand_types + obj_dtype_set = frozenset([np.dtype("object")]) + return self.return_type == object and types - obj_dtype_set + + @property + def operand_types(self): + return frozenset(term.type for term in com.flatten(self)) + + @property + def is_scalar(self) -> bool: + return all(operand.is_scalar for operand in self.operands) + + @property + def is_datetime(self) -> bool: + try: + t = self.return_type.type + except AttributeError: + t = self.return_type + + return issubclass(t, (datetime, np.datetime64)) + + +def _in(x, y): + """ + Compute the vectorized membership of ``x in y`` if possible, otherwise + use Python. + """ + try: + return x.isin(y) + except AttributeError: + if is_list_like(x): + try: + return y.isin(x) + except AttributeError: + pass + return x in y + + +def _not_in(x, y): + """ + Compute the vectorized membership of ``x not in y`` if possible, + otherwise use Python. 
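+    For example, ``_not_in(ser, [1, 2])`` resolves to ``~ser.isin([1, 2])`` when ``ser`` supports ``isin``; for plain Python operands it falls back to the scalar ``x not in y`` check.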
+ """ + try: + return ~x.isin(y) + except AttributeError: + if is_list_like(x): + try: + return ~y.isin(x) + except AttributeError: + pass + return x not in y + + +CMP_OPS_SYMS = (">", "<", ">=", "<=", "==", "!=", "in", "not in") +_cmp_ops_funcs = ( + operator.gt, + operator.lt, + operator.ge, + operator.le, + operator.eq, + operator.ne, + _in, + _not_in, +) +_cmp_ops_dict = dict(zip(CMP_OPS_SYMS, _cmp_ops_funcs)) + +BOOL_OPS_SYMS = ("&", "|", "and", "or") +_bool_ops_funcs = (operator.and_, operator.or_, operator.and_, operator.or_) +_bool_ops_dict = dict(zip(BOOL_OPS_SYMS, _bool_ops_funcs)) + +ARITH_OPS_SYMS = ("+", "-", "*", "/", "**", "//", "%") +_arith_ops_funcs = ( + operator.add, + operator.sub, + operator.mul, + operator.truediv, + operator.pow, + operator.floordiv, + operator.mod, +) +_arith_ops_dict = dict(zip(ARITH_OPS_SYMS, _arith_ops_funcs)) + +SPECIAL_CASE_ARITH_OPS_SYMS = ("**", "//", "%") +_special_case_arith_ops_funcs = (operator.pow, operator.floordiv, operator.mod) +_special_case_arith_ops_dict = dict( + zip(SPECIAL_CASE_ARITH_OPS_SYMS, _special_case_arith_ops_funcs) +) + +_binary_ops_dict = {} + +for d in (_cmp_ops_dict, _bool_ops_dict, _arith_ops_dict): + _binary_ops_dict.update(d) + + +def _cast_inplace(terms, acceptable_dtypes, dtype) -> None: + """ + Cast an expression inplace. + + Parameters + ---------- + terms : Op + The expression that should cast. + acceptable_dtypes : list of acceptable numpy.dtype + Will not cast if term's dtype in this list. + dtype : str or numpy.dtype + The dtype to cast to. + """ + dt = np.dtype(dtype) + for term in terms: + if term.type in acceptable_dtypes: + continue + + try: + new_value = term.value.astype(dt) + except AttributeError: + new_value = dt.type(term.value) + term.update(new_value) + + +def is_term(obj) -> bool: + return isinstance(obj, Term) + + +class BinOp(Op): + """ + Hold a binary operator and its operands. + + Parameters + ---------- + op : str + lhs : Term or Op + rhs : Term or Op + """ + + def __init__(self, op: str, lhs, rhs) -> None: + super().__init__(op, (lhs, rhs)) + self.lhs = lhs + self.rhs = rhs + + self._disallow_scalar_only_bool_ops() + + self.convert_values() + + try: + self.func = _binary_ops_dict[op] + except KeyError as err: + # has to be made a list for python3 + keys = list(_binary_ops_dict.keys()) + raise ValueError( + f"Invalid binary operator {repr(op)}, valid operators are {keys}" + ) from err + + def __call__(self, env): + """ + Recursively evaluate an expression in Python space. + + Parameters + ---------- + env : Scope + + Returns + ------- + object + The result of an evaluated expression. + """ + # recurse over the left/right nodes + left = self.lhs(env) + right = self.rhs(env) + + return self.func(left, right) + + def evaluate(self, env, engine: str, parser, term_type, eval_in_python): + """ + Evaluate a binary operation *before* being passed to the engine. 
+ + Parameters + ---------- + env : Scope + engine : str + parser : str + term_type : type + eval_in_python : list + + Returns + ------- + term_type + The "pre-evaluated" expression as an instance of ``term_type`` + """ + if engine == "python": + res = self(env) + else: + # recurse over the left/right nodes + + left = self.lhs.evaluate( + env, + engine=engine, + parser=parser, + term_type=term_type, + eval_in_python=eval_in_python, + ) + + right = self.rhs.evaluate( + env, + engine=engine, + parser=parser, + term_type=term_type, + eval_in_python=eval_in_python, + ) + + # base cases + if self.op in eval_in_python: + res = self.func(left.value, right.value) + else: + from pandas.core.computation.eval import eval + + res = eval(self, local_dict=env, engine=engine, parser=parser) + + name = env.add_tmp(res) + return term_type(name, env=env) + + def convert_values(self) -> None: + """ + Convert datetimes to a comparable value in an expression. + """ + + def stringify(value): + encoder: Callable + if self.encoding is not None: + encoder = partial(pprint_thing_encoded, encoding=self.encoding) + else: + encoder = pprint_thing + return encoder(value) + + lhs, rhs = self.lhs, self.rhs + + if is_term(lhs) and lhs.is_datetime and is_term(rhs) and rhs.is_scalar: + v = rhs.value + if isinstance(v, (int, float)): + v = stringify(v) + v = Timestamp(ensure_decoded(v)) + if v.tz is not None: + v = v.tz_convert("UTC") + self.rhs.update(v) + + if is_term(rhs) and rhs.is_datetime and is_term(lhs) and lhs.is_scalar: + v = lhs.value + if isinstance(v, (int, float)): + v = stringify(v) + v = Timestamp(ensure_decoded(v)) + if v.tz is not None: + v = v.tz_convert("UTC") + self.lhs.update(v) + + def _disallow_scalar_only_bool_ops(self): + rhs = self.rhs + lhs = self.lhs + + # GH#24883 unwrap dtype if necessary to ensure we have a type object + rhs_rt = rhs.return_type + rhs_rt = getattr(rhs_rt, "type", rhs_rt) + lhs_rt = lhs.return_type + lhs_rt = getattr(lhs_rt, "type", lhs_rt) + if ( + (lhs.is_scalar or rhs.is_scalar) + and self.op in _bool_ops_dict + and ( + not ( + issubclass(rhs_rt, (bool, np.bool_)) + and issubclass(lhs_rt, (bool, np.bool_)) + ) + ) + ): + raise NotImplementedError("cannot evaluate scalar only bool ops") + + +def isnumeric(dtype) -> bool: + return issubclass(np.dtype(dtype).type, np.number) + + +class Div(BinOp): + """ + Div operator to special case casting. + + Parameters + ---------- + lhs, rhs : Term or Op + The Terms or Ops in the ``/`` expression. + """ + + def __init__(self, lhs, rhs) -> None: + super().__init__("/", lhs, rhs) + + if not isnumeric(lhs.return_type) or not isnumeric(rhs.return_type): + raise TypeError( + f"unsupported operand type(s) for {self.op}: " + f"'{lhs.return_type}' and '{rhs.return_type}'" + ) + + # do not upcast float32s to float64 un-necessarily + acceptable_dtypes = [np.float32, np.float64] + _cast_inplace(com.flatten(self), acceptable_dtypes, np.float64) + + +UNARY_OPS_SYMS = ("+", "-", "~", "not") +_unary_ops_funcs = (operator.pos, operator.neg, operator.invert, operator.invert) +_unary_ops_dict = dict(zip(UNARY_OPS_SYMS, _unary_ops_funcs)) + + +class UnaryOp(Op): + """ + Hold a unary operator and its operands. + + Parameters + ---------- + op : str + The token used to represent the operator. + operand : Term or Op + The Term or Op operand to the operator. + + Raises + ------ + ValueError + * If no function associated with the passed operator token is found. 
+ """ + + def __init__(self, op: Literal["+", "-", "~", "not"], operand) -> None: + super().__init__(op, (operand,)) + self.operand = operand + + try: + self.func = _unary_ops_dict[op] + except KeyError as err: + raise ValueError( + f"Invalid unary operator {repr(op)}, " + f"valid operators are {UNARY_OPS_SYMS}" + ) from err + + def __call__(self, env) -> MathCall: + operand = self.operand(env) + # error: Cannot call function of unknown type + return self.func(operand) # type: ignore[operator] + + def __repr__(self) -> str: + return pprint_thing(f"{self.op}({self.operand})") + + @property + def return_type(self) -> np.dtype: + operand = self.operand + if operand.return_type == np.dtype("bool"): + return np.dtype("bool") + if isinstance(operand, Op) and ( + operand.op in _cmp_ops_dict or operand.op in _bool_ops_dict + ): + return np.dtype("bool") + return np.dtype("int") + + +class MathCall(Op): + def __init__(self, func, args) -> None: + super().__init__(func.name, args) + self.func = func + + def __call__(self, env): + # error: "Op" not callable + operands = [op(env) for op in self.operands] # type: ignore[operator] + return self.func.func(*operands) + + def __repr__(self) -> str: + operands = map(str, self.operands) + return pprint_thing(f"{self.op}({','.join(operands)})") + + +class FuncNode: + def __init__(self, name: str) -> None: + if name not in MATHOPS: + raise ValueError(f'"{name}" is not a supported function') + self.name = name + self.func = getattr(np, name) + + def __call__(self, *args): + return MathCall(self, args) diff --git a/third_party/bigframes_vendored/pandas/core/computation/parsing.py b/third_party/bigframes_vendored/pandas/core/computation/parsing.py new file mode 100644 index 0000000000..e54f459735 --- /dev/null +++ b/third_party/bigframes_vendored/pandas/core/computation/parsing.py @@ -0,0 +1,196 @@ +# Contains code from https://github.com/pandas-dev/pandas/blob/main/pandas/core/computation/parsing.py +""" +:func:`~pandas.eval` source string parsing functions +""" +from __future__ import annotations + +from io import StringIO +from keyword import iskeyword +import token +import tokenize +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from collections.abc import Hashable, Iterator + +# A token value Python's tokenizer probably will never use. +BACKTICK_QUOTED_STRING = 100 + + +def create_valid_python_identifier(name: str) -> str: + """ + Create valid Python identifiers from any string. + + Check if name contains any special characters. If it contains any + special characters, the special characters will be replaced by + a special string and a prefix is added. + + Raises + ------ + SyntaxError + If the returned name is not a Python valid identifier, raise an exception. + This can happen if there is a hashtag in the name, as the tokenizer will + than terminate and not find the backtick. + But also for characters that fall out of the range of (U+0001..U+007F). + """ + if name.isidentifier() and not iskeyword(name): + return name + + # Create a dict with the special characters and their replacement string. + # EXACT_TOKEN_TYPES contains these special characters + # token.tok_name contains a readable description of the replacement string. 
+ special_characters_replacements = { + char: f"_{token.tok_name[tokval]}_" + for char, tokval in (tokenize.EXACT_TOKEN_TYPES.items()) + } + special_characters_replacements.update( + { + " ": "_", + "?": "_QUESTIONMARK_", + "!": "_EXCLAMATIONMARK_", + "$": "_DOLLARSIGN_", + "€": "_EUROSIGN_", + "°": "_DEGREESIGN_", + # Including quotes works, but there are exceptions. + "'": "_SINGLEQUOTE_", + '"': "_DOUBLEQUOTE_", + # Currently not possible. Terminates parser and won't find backtick. + # "#": "_HASH_", + } + ) + + name = "".join([special_characters_replacements.get(char, char) for char in name]) + name = f"BACKTICK_QUOTED_STRING_{name}" + + if not name.isidentifier(): + raise SyntaxError(f"Could not convert '{name}' to a valid Python identifier.") + + return name + + +def clean_backtick_quoted_toks(tok: tuple[int, str]) -> tuple[int, str]: + """ + Clean up a column name if surrounded by backticks. + + Backtick quoted string are indicated by a certain tokval value. If a string + is a backtick quoted token it will processed by + :func:`_create_valid_python_identifier` so that the parser can find this + string when the query is executed. + In this case the tok will get the NAME tokval. + + Parameters + ---------- + tok : tuple of int, str + ints correspond to the all caps constants in the tokenize module + + Returns + ------- + tok : Tuple[int, str] + Either the input or token or the replacement values + """ + toknum, tokval = tok + if toknum == BACKTICK_QUOTED_STRING: + return tokenize.NAME, create_valid_python_identifier(tokval) + return toknum, tokval + + +def clean_column_name(name: Hashable) -> Hashable: + """ + Function to emulate the cleaning of a backtick quoted name. + + The purpose for this function is to see what happens to the name of + identifier if it goes to the process of being parsed a Python code + inside a backtick quoted string and than being cleaned + (removed of any special characters). + + Parameters + ---------- + name : hashable + Name to be cleaned. + + Returns + ------- + name : hashable + Returns the name after tokenizing and cleaning. + + Notes + ----- + For some cases, a name cannot be converted to a valid Python identifier. + In that case :func:`tokenize_string` raises a SyntaxError. + In that case, we just return the name unmodified. + + If this name was used in the query string (this makes the query call impossible) + an error will be raised by :func:`tokenize_backtick_quoted_string` instead, + which is not caught and propagates to the user level. + """ + try: + tokenized = tokenize_string(f"`{name}`") + tokval = next(tokenized)[1] + return create_valid_python_identifier(tokval) + except SyntaxError: + return name + + +def tokenize_backtick_quoted_string( + token_generator: Iterator[tokenize.TokenInfo], source: str, string_start: int +) -> tuple[int, str]: + """ + Creates a token from a backtick quoted string. + + Moves the token_generator forwards till right after the next backtick. + + Parameters + ---------- + token_generator : Iterator[tokenize.TokenInfo] + The generator that yields the tokens of the source string (Tuple[int, str]). + The generator is at the first token after the backtick (`) + + source : str + The Python source code string. + + string_start : int + This is the start of backtick quoted string inside the source string. + + Returns + ------- + tok: Tuple[int, str] + The token that represents the backtick quoted string. + The integer is equal to BACKTICK_QUOTED_STRING (100). 
+ """ + for _, tokval, start, _, _ in token_generator: + if tokval == "`": + string_end = start[1] + break + + return BACKTICK_QUOTED_STRING, source[string_start:string_end] + + +def tokenize_string(source: str) -> Iterator[tuple[int, str]]: + """ + Tokenize a Python source code string. + + Parameters + ---------- + source : str + The Python source code string. + + Returns + ------- + tok_generator : Iterator[Tuple[int, str]] + An iterator yielding all tokens with only toknum and tokval (Tuple[ing, str]). + """ + line_reader = StringIO(source).readline + token_generator = tokenize.generate_tokens(line_reader) + + # Loop over all tokens till a backtick (`) is found. + # Then, take all tokens till the next backtick to form a backtick quoted string + for toknum, tokval, start, _, _ in token_generator: + if tokval == "`": + try: + yield tokenize_backtick_quoted_string( + token_generator, source, string_start=start[1] + 1 + ) + except Exception as err: + raise SyntaxError(f"Failed to parse backticks in '{source}'.") from err + else: + yield toknum, tokval diff --git a/third_party/bigframes_vendored/pandas/core/computation/scope.py b/third_party/bigframes_vendored/pandas/core/computation/scope.py new file mode 100644 index 0000000000..bfd7eb1d12 --- /dev/null +++ b/third_party/bigframes_vendored/pandas/core/computation/scope.py @@ -0,0 +1,355 @@ +# Contains code from https://github.com/pandas-dev/pandas/blob/main/pandas/core/computation/scope.py +""" +Module for scope operations +""" +from __future__ import annotations + +from collections import ChainMap +import datetime +import inspect +from io import StringIO +import itertools +import pprint +import struct +import sys +from typing import TypeVar + +import numpy as np +from pandas._libs.tslibs import Timestamp +from pandas.errors import UndefinedVariableError + +_KT = TypeVar("_KT") +_VT = TypeVar("_VT") + + +# https://docs.python.org/3/library/collections.html#chainmap-examples-and-recipes +class DeepChainMap(ChainMap[_KT, _VT]): + """ + Variant of ChainMap that allows direct updates to inner scopes. + + Only works when all passed mapping are mutable. + """ + + def __setitem__(self, key: _KT, value: _VT) -> None: + for mapping in self.maps: + if key in mapping: + mapping[key] = value + return + self.maps[0][key] = value + + def __delitem__(self, key: _KT) -> None: + """ + Raises + ------ + KeyError + If `key` doesn't exist. + """ + for mapping in self.maps: + if key in mapping: + del mapping[key] + return + raise KeyError(key) + + +def ensure_scope( + level: int, global_dict=None, local_dict=None, resolvers=(), target=None +) -> Scope: + """Ensure that we are grabbing the correct scope.""" + return Scope( + level + 1, + global_dict=global_dict, + local_dict=local_dict, + resolvers=resolvers, + target=target, + ) + + +def _replacer(x) -> str: + """ + Replace a number with its hexadecimal representation. Used to tag + temporary variables with their calling scope's id. 
+ """ + # get the hex repr of the binary char and remove 0x and pad by pad_size + # zeros + try: + hexin = ord(x) + except TypeError: + # bytes literals masquerade as ints when iterating in py3 + hexin = x + + return hex(hexin) + + +def _raw_hex_id(obj) -> str: + """Return the padded hexadecimal id of ``obj``.""" + # interpret as a pointer since that's what really what id returns + packed = struct.pack("@P", id(obj)) + return "".join([_replacer(x) for x in packed]) + + +DEFAULT_GLOBALS = { + "Timestamp": Timestamp, + "datetime": datetime.datetime, + "True": True, + "False": False, + "list": list, + "tuple": tuple, + "inf": np.inf, + "Inf": np.inf, +} + + +def _get_pretty_string(obj) -> str: + """ + Return a prettier version of obj. + + Parameters + ---------- + obj : object + Object to pretty print + + Returns + ------- + str + Pretty print object repr + """ + sio = StringIO() + pprint.pprint(obj, stream=sio) + return sio.getvalue() + + +class Scope: + """ + Object to hold scope, with a few bells to deal with some custom syntax + and contexts added by pandas. + + Parameters + ---------- + level : int + global_dict : dict or None, optional, default None + local_dict : dict or Scope or None, optional, default None + resolvers : list-like or None, optional, default None + target : object + + Attributes + ---------- + level : int + scope : DeepChainMap + target : object + temps : dict + """ + + __slots__ = ["level", "scope", "target", "resolvers", "temps"] + level: int + scope: DeepChainMap + resolvers: DeepChainMap + temps: dict + + def __init__( + self, level: int, global_dict=None, local_dict=None, resolvers=(), target=None + ) -> None: + self.level = level + 1 + + # shallow copy because we don't want to keep filling this up with what + # was there before if there are multiple calls to Scope/_ensure_scope + self.scope = DeepChainMap(DEFAULT_GLOBALS.copy()) + self.target = target + + if isinstance(local_dict, Scope): + self.scope.update(local_dict.scope) + if local_dict.target is not None: + self.target = local_dict.target + self._update(local_dict.level) + + frame = sys._getframe(self.level) + + try: + # shallow copy here because we don't want to replace what's in + # scope when we align terms (alignment accesses the underlying + # numpy array of pandas objects) + scope_global = self.scope.new_child( + (global_dict if global_dict is not None else frame.f_globals).copy() + ) + self.scope = DeepChainMap(scope_global) + if not isinstance(local_dict, Scope): + scope_local = self.scope.new_child( + (local_dict if local_dict is not None else frame.f_locals).copy() + ) + self.scope = DeepChainMap(scope_local) + finally: + del frame + + # assumes that resolvers are going from outermost scope to inner + if isinstance(local_dict, Scope): + resolvers += tuple(local_dict.resolvers.maps) + self.resolvers = DeepChainMap(*resolvers) + self.temps = {} + + def __repr__(self) -> str: + scope_keys = _get_pretty_string(list(self.scope.keys())) + res_keys = _get_pretty_string(list(self.resolvers.keys())) + return f"{type(self).__name__}(scope={scope_keys}, resolvers={res_keys})" + + @property + def has_resolvers(self) -> bool: + """ + Return whether we have any extra scope. + + For example, DataFrames pass Their columns as resolvers during calls to + ``DataFrame.eval()`` and ``DataFrame.query()``. + + Returns + ------- + hr : bool + """ + return bool(len(self.resolvers)) + + def resolve(self, key: str, is_local: bool): + """ + Resolve a variable name in a possibly local context. 
+ + Parameters + ---------- + key : str + A variable name + is_local : bool + Flag indicating whether the variable is local or not (prefixed with + the '@' symbol) + + Returns + ------- + value : object + The value of a particular variable + """ + try: + # only look for locals in outer scope + if is_local: + return self.scope[key] + + # not a local variable so check in resolvers if we have them + if self.has_resolvers: + return self.resolvers[key] + + # if we're here that means that we have no locals and we also have + # no resolvers + assert not is_local and not self.has_resolvers + return self.scope[key] + except KeyError: + try: + # last ditch effort we look in temporaries + # these are created when parsing indexing expressions + # e.g., df[df > 0] + return self.temps[key] + except KeyError as err: + raise UndefinedVariableError(key, is_local) from err + + def swapkey(self, old_key: str, new_key: str, new_value=None) -> None: + """ + Replace a variable name, with a potentially new value. + + Parameters + ---------- + old_key : str + Current variable name to replace + new_key : str + New variable name to replace `old_key` with + new_value : object + Value to be replaced along with the possible renaming + """ + if self.has_resolvers: + maps = self.resolvers.maps + self.scope.maps + else: + maps = self.scope.maps + + maps.append(self.temps) + + for mapping in maps: + if old_key in mapping: + mapping[new_key] = new_value + return + + def _get_vars(self, stack, scopes: list[str]) -> None: + """ + Get specifically scoped variables from a list of stack frames. + + Parameters + ---------- + stack : list + A list of stack frames as returned by ``inspect.stack()`` + scopes : sequence of strings + A sequence containing valid stack frame attribute names that + evaluate to a dictionary. For example, ('locals', 'globals') + """ + variables = itertools.product(scopes, stack) + for scope, (frame, _, _, _, _, _) in variables: + try: + d = getattr(frame, f"f_{scope}") + self.scope = DeepChainMap(self.scope.new_child(d)) + finally: + # won't remove it, but DECREF it + # in Py3 this probably isn't necessary since frame won't be + # scope after the loop + del frame + + def _update(self, level: int) -> None: + """ + Update the current scope by going back `level` levels. + + Parameters + ---------- + level : int + """ + sl = level + 1 + + # add sl frames to the scope starting with the + # most distant and overwriting with more current + # makes sure that we can capture variable scope + stack = inspect.stack() + + try: + self._get_vars(stack[:sl], scopes=["locals"]) + finally: + del stack[:], stack + + def add_tmp(self, value) -> str: + """ + Add a temporary variable to the scope. + + Parameters + ---------- + value : object + An arbitrary object to be assigned to a temporary variable. + + Returns + ------- + str + The name of the temporary variable created. + """ + name = f"{type(value).__name__}_{self.ntemps}_{_raw_hex_id(self)}" + + # add to inner most scope + assert name not in self.temps + self.temps[name] = value + assert name in self.temps + + # only increment if the variable gets put in the scope + return name + + @property + def ntemps(self) -> int: + """The number of temporary variables in this scope""" + return len(self.temps) + + @property + def full_scope(self) -> DeepChainMap: + """ + Return the full scope for use with passing to engines transparently + as a mapping. + + Returns + ------- + vars : DeepChainMap + All variables in this scope. 
+ """ + maps = [self.temps] + self.resolvers.maps + self.scope.maps + return DeepChainMap(*maps) diff --git a/third_party/bigframes_vendored/pandas/core/dtypes/inference.py b/third_party/bigframes_vendored/pandas/core/dtypes/inference.py new file mode 100644 index 0000000000..fcbb4c242f --- /dev/null +++ b/third_party/bigframes_vendored/pandas/core/dtypes/inference.py @@ -0,0 +1,31 @@ +# Contains code from https://github.com/pandas-dev/pandas/blob/main/pandas/core/dtypes/inference.py +""" basic inference routines """ + +from __future__ import annotations + +from collections import abc + + +def iterable_not_string(obj) -> bool: + """ + Check if the object is an iterable but not a string. + + Parameters + ---------- + obj : The object to check. + + Returns + ------- + is_iter_not_string : bool + Whether `obj` is a non-string iterable. + + Examples + -------- + >>> iterable_not_string([1, 2, 3]) + True + >>> iterable_not_string("foo") + False + >>> iterable_not_string(1) + False + """ + return isinstance(obj, abc.Iterable) and not isinstance(obj, str) diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index d70d3827e7..2640cce6da 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -2865,6 +2865,7 @@ def cov(self, *, numeric_only) -> DataFrame: Returns: DataFrame: The covariance matrix of the series of the DataFrame. """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def update( self, other, join: str = "left", overwrite: bool = True, filter_func=None @@ -4931,6 +4932,158 @@ def value_counts( """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def eval(self, expr: str) -> DataFrame: + """ + Evaluate a string describing operations on DataFrame columns. + + Operates on columns only, not specific rows or elements. This allows + `eval` to run arbitrary code, which can make you vulnerable to code + injection if you pass user input to this function. + + **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({'A': range(1, 6), 'B': range(10, 0, -2)}) + >>> df + A B + 0 1 10 + 1 2 8 + 2 3 6 + 3 4 4 + 4 5 2 + + [5 rows x 2 columns] + >>> df.eval('A + B') + 0 11 + 1 10 + 2 9 + 3 8 + 4 7 + dtype: Int64 + + Assignment is allowed though by default the original DataFrame is not + modified. + + >>> df.eval('C = A + B') + A B C + 0 1 10 11 + 1 2 8 10 + 2 3 6 9 + 3 4 4 8 + 4 5 2 7 + + [5 rows x 3 columns] + >>> df + A B + 0 1 10 + 1 2 8 + 2 3 6 + 3 4 4 + 4 5 2 + + [5 rows x 2 columns] + + Multiple columns can be assigned to using multi-line expressions: + + >>> df.eval( + ... ''' + ... C = A + B + ... D = A - B + ... ''' + ... ) + A B C D + 0 1 10 11 -9 + 1 2 8 10 -6 + 2 3 6 9 -3 + 3 4 4 8 0 + 4 5 2 7 3 + + [5 rows x 4 columns] + + + Args: + expr (str): + The expression string to evaluate. + + Returns: + DataFrame + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def query(self, expr: str) -> DataFrame | None: + """ + Query the columns of a DataFrame with a boolean expression. + + **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({'A': range(1, 6), + ... 'B': range(10, 0, -2), + ... 
'C C': range(10, 5, -1)}) + >>> df + A B C C + 0 1 10 10 + 1 2 8 9 + 2 3 6 8 + 3 4 4 7 + 4 5 2 6 + + [5 rows x 3 columns] + >>> df.query('A > B') + A B C C + 4 5 2 6 + + [1 rows x 3 columns] + + The previous expression is equivalent to + + >>> df[df.A > df.B] + A B C C + 4 5 2 6 + + [1 rows x 3 columns] + + For columns with spaces in their name, you can use backtick quoting. + + >>> df.query('B == `C C`') + A B C C + 0 1 10 10 + + [1 rows x 3 columns] + + The previous expression is equivalent to + + >>> df[df.B == df['C C']] + A B C C + 0 1 10 10 + + [1 rows x 3 columns] + + Args: + expr (str): + The query string to evaluate. + + You can refer to variables + in the environment by prefixing them with an '@' character like + ``@a + b``. + + You can refer to column names that are not valid Python variable names + by surrounding them in backticks. Thus, column names containing spaces + or punctuations (besides underscores) or starting with digits must be + surrounded by backticks. (For example, a column named "Area (cm^2)" would + be referenced as ```Area (cm^2)```). Column names which are Python keywords + (like "list", "for", "import", etc) cannot be used. + + For example, if one of your columns is called ``a a`` and you want + to sum it with ``b``, your query should be ```a a` + b``. + + Returns: + DataFrame + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def interpolate(self, method: str = "linear"): """ Fill NaN values using an interpolation method. diff --git a/third_party/bigframes_vendored/pandas/util/_exceptions.py b/third_party/bigframes_vendored/pandas/util/_exceptions.py new file mode 100644 index 0000000000..4ca649153a --- /dev/null +++ b/third_party/bigframes_vendored/pandas/util/_exceptions.py @@ -0,0 +1,29 @@ +# Contains code from https://github.com/pandas-dev/pandas/blob/main/pandas/util/_exceptions.py +from __future__ import annotations + +import inspect +import os + + +def find_stack_level() -> int: + """ + Find the first place in the stack that is not inside pandas + (tests notwithstanding). + """ + + import pandas as pd + + pkg_dir = os.path.dirname(pd.__file__) + test_dir = os.path.join(pkg_dir, "tests") + + # https://stackoverflow.com/questions/17407119/python-inspect-stack-is-slow + frame = inspect.currentframe() + n = 0 + while frame: + fname = inspect.getfile(frame) + if fname.startswith(pkg_dir) and not fname.startswith(test_dir): + frame = frame.f_back + n += 1 + else: + break + return n diff --git a/third_party/bigframes_vendored/pandas/util/_validators.py b/third_party/bigframes_vendored/pandas/util/_validators.py new file mode 100644 index 0000000000..1f36e0d528 --- /dev/null +++ b/third_party/bigframes_vendored/pandas/util/_validators.py @@ -0,0 +1,58 @@ +# Contains code from https://github.com/pandas-dev/pandas/blob/main/pandas/util/_validators.py +""" +Module that contains many useful utilities +for validating data or function arguments +""" +from __future__ import annotations + +from typing import TypeVar + +from pandas.core.dtypes.common import is_bool + +BoolishT = TypeVar("BoolishT", bool, int) +BoolishNoneT = TypeVar("BoolishNoneT", bool, int, None) + + +def validate_bool_kwarg( + value: BoolishNoneT, + arg_name: str, + none_allowed: bool = True, + int_allowed: bool = False, +) -> BoolishNoneT: + """ + Ensure that argument passed in arg_name can be interpreted as boolean. 
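+    For example, ``validate_bool_kwarg(None, "copy")`` returns ``None`` because ``none_allowed`` defaults to True, while ``validate_bool_kwarg("yes", "copy")`` raises ``ValueError``.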
+ + Parameters + ---------- + value : bool + Value to be validated. + arg_name : str + Name of the argument. To be reflected in the error message. + none_allowed : bool, default True + Whether to consider None to be a valid boolean. + int_allowed : bool, default False + Whether to consider integer value to be a valid boolean. + + Returns + ------- + value + The same value as input. + + Raises + ------ + ValueError + If the value is not a valid boolean. + """ + good_value = is_bool(value) + if none_allowed: + good_value = good_value or (value is None) + + if int_allowed: + good_value = good_value or isinstance(value, int) + + if not good_value: + raise ValueError( + f'For argument "{arg_name}" expected type bool, received ' + f"type {type(value).__name__}." + ) + return value # pyright: ignore[reportGeneralTypeIssues] From e7a8e461ea4ba8df74c0da978b23413e590368bc Mon Sep 17 00:00:00 2001 From: Huan Chen <142538604+Genesis929@users.noreply.github.com> Date: Tue, 26 Mar 2024 12:54:48 -0700 Subject: [PATCH 05/53] chore: remove initialization workaround in to_datetime. (#514) * chore: Remove initialization workaround in to_datetime. * remove unused import * Update document. --- bigframes/core/tools/datetimes.py | 17 +---------------- .../pandas/core/arrays/datetimelike.py | 2 +- .../pandas/core/tools/datetimes.py | 2 +- 3 files changed, 3 insertions(+), 18 deletions(-) diff --git a/bigframes/core/tools/datetimes.py b/bigframes/core/tools/datetimes.py index 96bf556101..a2851bc256 100644 --- a/bigframes/core/tools/datetimes.py +++ b/bigframes/core/tools/datetimes.py @@ -20,7 +20,6 @@ import pandas as pd import bigframes.constants as constants -import bigframes.core.global_session as global_session import bigframes.dataframe import bigframes.operations as ops import bigframes.series @@ -52,21 +51,7 @@ def to_datetime( f"to datetime is not implemented. {constants.FEEDBACK_LINK}" ) - if not isinstance(arg, bigframes.series.Series): - # This block ensures compatibility with local data formats, including - # iterables and pandas.Series - # TODO: Currently, data upload is performed using pandas DataFrames - # combined with the `read_pandas` method due to the BigFrames DataFrame - # constructor's limitations in handling various data types. Plan to update - # the upload process to utilize the BigFrames DataFrame constructor directly - # once it is enhanced for more related datatypes. 
- arg = global_session.with_default_session( - bigframes.session.Session.read_pandas, pd.DataFrame(arg) - ) - if len(arg.columns) != 1: - raise ValueError("Input must be 1-dimensional.") - - arg = arg[arg.columns[0]] + arg = bigframes.series.Series(arg) if not utc and arg.dtype not in ("Int64", "Float64"): # type: ignore raise NotImplementedError( diff --git a/third_party/bigframes_vendored/pandas/core/arrays/datetimelike.py b/third_party/bigframes_vendored/pandas/core/arrays/datetimelike.py index bd5f055ece..ce5f8d55f3 100644 --- a/third_party/bigframes_vendored/pandas/core/arrays/datetimelike.py +++ b/third_party/bigframes_vendored/pandas/core/arrays/datetimelike.py @@ -26,7 +26,7 @@ def strftime(self, date_format: str): 0 August 15, 2014, 08:15:12 AM 1 February 29, 2012, 02:15:12 AM 2 August 15, 2015, 03:15:12 AM - Name: 0, dtype: string + dtype: string Args: date_format (str): diff --git a/third_party/bigframes_vendored/pandas/core/tools/datetimes.py b/third_party/bigframes_vendored/pandas/core/tools/datetimes.py index 8a8a562bae..442220f237 100644 --- a/third_party/bigframes_vendored/pandas/core/tools/datetimes.py +++ b/third_party/bigframes_vendored/pandas/core/tools/datetimes.py @@ -48,7 +48,7 @@ def to_datetime( >>> bpd.to_datetime(list_str, format="%m-%d-%Y %H:%M", utc=True) 0 2021-01-31 14:30:00+00:00 1 2021-02-28 15:45:00+00:00 - Name: 0, dtype: timestamp[us, tz=UTC][pyarrow] + dtype: timestamp[us, tz=UTC][pyarrow] Converting a Series of Strings with Timezone Information: From 6873b30b691a11a368308825a72013d8ec1408ed Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Tue, 26 Mar 2024 13:42:16 -0700 Subject: [PATCH 06/53] fix: Product operation produces float result for all input types (#501) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! 
That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes # 🦕 --- bigframes/core/compile/aggregate_compiler.py | 2 +- bigframes/operations/aggregations.py | 5 +---- tests/system/small/test_groupby.py | 3 +-- tests/system/small/test_series.py | 2 +- third_party/bigframes_vendored/pandas/core/frame.py | 8 ++++---- 5 files changed, 8 insertions(+), 12 deletions(-) diff --git a/bigframes/core/compile/aggregate_compiler.py b/bigframes/core/compile/aggregate_compiler.py index 9c1db0f162..ae21243506 100644 --- a/bigframes/core/compile/aggregate_compiler.py +++ b/bigframes/core/compile/aggregate_compiler.py @@ -190,7 +190,7 @@ def _( .else_(magnitude * pow(-1, negative_count_parity)) .end() ) - return float_result.cast(column.type()) # type: ignore + return float_result @compile_unary_agg.register diff --git a/bigframes/operations/aggregations.py b/bigframes/operations/aggregations.py index 9a270f1ce7..76aa2a6112 100644 --- a/bigframes/operations/aggregations.py +++ b/bigframes/operations/aggregations.py @@ -139,10 +139,7 @@ class ProductOp(UnaryAggregateOp): name: ClassVar[str] = "product" def output_type(self, *input_types: dtypes.ExpressionType): - if pd.api.types.is_bool_dtype(input_types[0]): - return dtypes.INT_DTYPE - else: - return input_types[0] + return dtypes.FLOAT_DTYPE @dataclasses.dataclass(frozen=True) diff --git a/tests/system/small/test_groupby.py b/tests/system/small/test_groupby.py index e7ecbedfc2..ba79ba1ab1 100644 --- a/tests/system/small/test_groupby.py +++ b/tests/system/small/test_groupby.py @@ -228,8 +228,7 @@ def test_dataframe_groupby_multi_sum( (lambda x: x.cumsum(numeric_only=True)), (lambda x: x.cummax(numeric_only=True)), (lambda x: x.cummin(numeric_only=True)), - # pandas 2.2 uses floating point for cumulative product even for - # integer inputs. + # Pre-pandas 2.2 doesn't always proeduce float. (lambda x: x.cumprod().astype("Float64")), (lambda x: x.shift(periods=2)), ], diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index 8847753e88..258fb1cfd8 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -1481,7 +1481,7 @@ def test_groupby_prod(scalars_dfs): bf_series = scalars_df[col_name].groupby(scalars_df["int64_col"]).prod() pd_series = ( scalars_pandas_df[col_name].groupby(scalars_pandas_df["int64_col"]).prod() - ) + ).astype(pd.Float64Dtype()) # TODO(swast): Update groupby to use index based on group by key(s). 
bf_result = bf_series.to_pandas() assert_series_equal( diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index 2640cce6da..50cce1eeab 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -4416,10 +4416,10 @@ def cumprod(self) -> DataFrame: [3 rows x 2 columns] >>> df.cumprod() - A B - 0 3 1 - 1 3 2 - 2 6 6 + A B + 0 3.0 1.0 + 1 3.0 2.0 + 2 6.0 6.0 [3 rows x 2 columns] From 3b80f956755c9d7043138aab6e5687cba50be8cb Mon Sep 17 00:00:00 2001 From: Garrett Wu <6505921+GarrettWu@users.noreply.github.com> Date: Tue, 26 Mar 2024 14:09:44 -0700 Subject: [PATCH 07/53] docs: fix docs of ARIMAPlus.predict (#512) --- bigframes/ml/forecasting.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/bigframes/ml/forecasting.py b/bigframes/ml/forecasting.py index 292389dcbb..7993327200 100644 --- a/bigframes/ml/forecasting.py +++ b/bigframes/ml/forecasting.py @@ -234,7 +234,12 @@ def _fit( def predict( self, X=None, *, horizon: int = 3, confidence_level: float = 0.95 ) -> bpd.DataFrame: - """Predict the closest cluster for each sample in X. + """Forecast time series at future horizon. + + .. note:: + + Output matches that of the BigQuery ML.FORECAST function. + See: https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-forecast Args: X (default None): From 5f37b0902fae2c099207acf3ce2e251c09ac889d Mon Sep 17 00:00:00 2001 From: Ashley Xu <139821907+ashleyxuu@users.noreply.github.com> Date: Tue, 26 Mar 2024 15:56:20 -0700 Subject: [PATCH 08/53] docs: add the code samples for metrics{auc, roc_auc_score, roc_curve} (#520) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes # 🦕 --- .../sklearn/metrics/_ranking.py | 78 +++++++++++++++++++ 1 file changed, 78 insertions(+) diff --git a/third_party/bigframes_vendored/sklearn/metrics/_ranking.py b/third_party/bigframes_vendored/sklearn/metrics/_ranking.py index ac919edbe3..dee8b350c0 100644 --- a/third_party/bigframes_vendored/sklearn/metrics/_ranking.py +++ b/third_party/bigframes_vendored/sklearn/metrics/_ranking.py @@ -27,6 +27,29 @@ def auc(x, y) -> float: way to summarize a precision-recall curve, see :func:`average_precision_score`. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> import bigframes.ml.metrics + >>> bpd.options.display.progress_bar = None + + >>> x = bpd.DataFrame([1, 1, 2, 2]) + >>> y = bpd.DataFrame([2, 3, 4, 5]) + >>> auc = bigframes.ml.metrics.auc(x, y) + >>> auc + 3.5 + + The input can be Series: + + >>> df = bpd.DataFrame( + ... {"x": [1, 1, 2, 2], + ... "y": [2, 3, 4, 5],} + ... ) + >>> auc = bigframes.ml.metrics.auc(df["x"], df["y"]) + >>> auc + 3.5 + + Args: x (Series or DataFrame of shape (n_samples,)): X coordinates. 
These must be either monotonic increasing or monotonic @@ -44,6 +67,28 @@ def roc_auc_score(y_true, y_score) -> float: """Compute Area Under the Receiver Operating Characteristic Curve (ROC AUC) \ from prediction scores. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> import bigframes.ml.metrics + >>> bpd.options.display.progress_bar = None + + >>> y_true = bpd.DataFrame([0, 0, 1, 1, 0, 1, 0, 1, 1, 1]) + >>> y_score = bpd.DataFrame([0.1, 0.4, 0.35, 0.8, 0.65, 0.9, 0.5, 0.3, 0.6, 0.45]) + >>> roc_auc_score = bigframes.ml.metrics.roc_auc_score(y_true, y_score) + >>> roc_auc_score + 0.625 + + The input can be Series: + + >>> df = bpd.DataFrame( + ... {"y_true": [0, 0, 1, 1, 0, 1, 0, 1, 1, 1], + ... "y_score": [0.1, 0.4, 0.35, 0.8, 0.65, 0.9, 0.5, 0.3, 0.6, 0.45],} + ... ) + >>> roc_auc_score = bigframes.ml.metrics.roc_auc_score(df["y_true"], df["y_score"]) + >>> roc_auc_score + 0.625 + Args: y_true (Series or DataFrame of shape (n_samples,)): True labels or binary label indicators. The binary and multiclass cases @@ -72,6 +117,39 @@ def roc_curve( ): """Compute Receiver operating characteristic (ROC). + **Examples:** + + >>> import bigframes.pandas as bpd + >>> import bigframes.ml.metrics + >>> bpd.options.display.progress_bar = None + + >>> y_true = bpd.DataFrame([1, 1, 2, 2]) + >>> y_score = bpd.DataFrame([0.1, 0.4, 0.35, 0.8]) + >>> fpr, tpr, thresholds = bigframes.ml.metrics.roc_curve(y_true, y_score, drop_intermediate=False) + >>> fpr + 0 0.0 + 1 0.0 + 2 0.0 + 3 0.0 + 4 0.0 + Name: fpr, dtype: Float64 + + >>> tpr + 0 0.0 + 1 0.333333 + 2 0.5 + 3 0.833333 + 4 1.0 + Name: tpr, dtype: Float64 + + >>> thresholds + 0 inf + 1 0.8 + 2 0.4 + 3 0.35 + 4 0.1 + Name: thresholds, dtype: Float64 + Args: y_true: Series or DataFrame of shape (n_samples,) True binary labels. If labels are not either {-1, 1} or {0, 1}, then From 36920951b1d3bd216b3abc18f7d770ada590a3d0 Mon Sep 17 00:00:00 2001 From: Garrett Wu <6505921+GarrettWu@users.noreply.github.com> Date: Tue, 26 Mar 2024 18:18:21 -0700 Subject: [PATCH 09/53] chore: remove unused ColumnTransformer check (#527) The check is no-op and misleading. It is not a chained transformer. But transformer to the same column, which is supported both in BQML and sklearn. --- bigframes/ml/compose.py | 6 ------ tests/system/large/ml/test_compose.py | 26 ++++++++++---------------- 2 files changed, 10 insertions(+), 22 deletions(-) diff --git a/bigframes/ml/compose.py b/bigframes/ml/compose.py index d35941b338..6d4fa5b76d 100644 --- a/bigframes/ml/compose.py +++ b/bigframes/ml/compose.py @@ -23,7 +23,6 @@ import bigframes_vendored.sklearn.compose._column_transformer -from bigframes import constants from bigframes.core import log_adapter from bigframes.ml import base, core, globals, preprocessing, utils import bigframes.pandas as bpd @@ -77,7 +76,6 @@ def transformers_( ] ] = [] - column_set: set[str] = set() for entry in self.transformers: name, transformer, column_or_columns = entry columns = ( @@ -87,10 +85,6 @@ def transformers_( ) for column in columns: - if column in column_set: - raise NotImplementedError( - f"Chained transformers on the same column isn't supported. 
{constants.FEEDBACK_LINK}" - ) result.append((name, transformer, column)) return result diff --git a/tests/system/large/ml/test_compose.py b/tests/system/large/ml/test_compose.py index 6ea4f72489..bb9a4d8f64 100644 --- a/tests/system/large/ml/test_compose.py +++ b/tests/system/large/ml/test_compose.py @@ -32,10 +32,15 @@ def test_columntransformer_standalone_fit_and_transform( "species", ), ( - "scale", + "starndard_scale", bigframes.ml.preprocessing.StandardScaler(), ["culmen_length_mm", "flipper_length_mm"], ), + ( + "min_max_scale", + bigframes.ml.preprocessing.MinMaxScaler(), + ["culmen_length_mm"], + ), ] ) @@ -51,6 +56,7 @@ def test_columntransformer_standalone_fit_and_transform( expected = pandas.DataFrame( { + "min_max_scaled_culmen_length_mm": [0.269, 0.232, 0.210], "onehotencoded_species": [ [{"index": 1, "value": 1.0}], [{"index": 1, "value": 1.0}], @@ -65,14 +71,8 @@ def test_columntransformer_standalone_fit_and_transform( }, index=pandas.Index([1633, 1672, 1690], dtype="Int64", name="tag_number"), ) - expected.standard_scaled_culmen_length_mm = ( - expected.standard_scaled_culmen_length_mm.astype("Float64") - ) - expected.standard_scaled_flipper_length_mm = ( - expected.standard_scaled_flipper_length_mm.astype("Float64") - ) - pandas.testing.assert_frame_equal(result, expected, rtol=1e-3, check_dtype=False) + pandas.testing.assert_frame_equal(result, expected, rtol=0.1, check_dtype=False) def test_columntransformer_standalone_fit_transform(new_penguins_df): @@ -84,7 +84,7 @@ def test_columntransformer_standalone_fit_transform(new_penguins_df): "species", ), ( - "scale", + "standard_scale", bigframes.ml.preprocessing.StandardScaler(), ["culmen_length_mm", "flipper_length_mm"], ), @@ -116,11 +116,5 @@ def test_columntransformer_standalone_fit_transform(new_penguins_df): }, index=pandas.Index([1633, 1672, 1690], dtype="Int64", name="tag_number"), ) - expected.standard_scaled_culmen_length_mm = ( - expected.standard_scaled_culmen_length_mm.astype("Float64") - ) - expected.standard_scaled_flipper_length_mm = ( - expected.standard_scaled_flipper_length_mm.astype("Float64") - ) - pandas.testing.assert_frame_equal(result, expected, rtol=1e-3, check_dtype=False) + pandas.testing.assert_frame_equal(result, expected, rtol=0.1, check_dtype=False) From 0be891191ed89be77494e4dcda30fb37836842ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Wed, 27 Mar 2024 13:35:00 -0500 Subject: [PATCH 10/53] feat: expose `DataFrame.bqclient` to assist in integrations (#519) * feat: expose `DataFrame.session` to assist in integrations * add code sample for sharing when the service account is only known on the backend * expose bqclient instead * start a DF construction sample * finish read sample --- bigframes/dataframe.py | 5 + notebooks/dataframes/dataframe.ipynb | 22 + notebooks/dataframes/integrations.ipynb | 635 ++++++++++++++++++++++++ 3 files changed, 662 insertions(+) create mode 100644 notebooks/dataframes/integrations.ipynb diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 7e82ba125c..599546284b 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -306,6 +306,11 @@ def empty(self) -> bool: def values(self) -> numpy.ndarray: return self.to_numpy() + @property + def bqclient(self) -> bigframes.Session: + """BigQuery REST API Client the DataFrame uses for operations.""" + return self._session.bqclient + @property def _session(self) -> bigframes.Session: return self._get_block().expr.session diff --git 
a/notebooks/dataframes/dataframe.ipynb b/notebooks/dataframes/dataframe.ipynb index 15da075552..de9bb1d04f 100644 --- a/notebooks/dataframes/dataframe.ipynb +++ b/notebooks/dataframes/dataframe.ipynb @@ -1,5 +1,27 @@ { "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "eeec3428", + "metadata": {}, + "outputs": [], + "source": [ + "# Copyright 2023 Google LLC\n", + "#\n", + "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." + ] + }, { "attachments": {}, "cell_type": "markdown", diff --git a/notebooks/dataframes/integrations.ipynb b/notebooks/dataframes/integrations.ipynb new file mode 100644 index 0000000000..735e18d94e --- /dev/null +++ b/notebooks/dataframes/integrations.ipynb @@ -0,0 +1,635 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [], + "source": [ + "# Copyright 2024 Google LLC\n", + "#\n", + "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Integrating with BigQuery DataFrames\n", + "\n", + "This notebook demonstrates operations for building applications that integrate with BigQuery DataFrames. Follow these samples to build an integration that accepts a BigQuery DataFrames object or returns one." + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [], + "source": [ + "import bigframes.pandas as bpd\n", + "\n", + "# Sample data\n", + "df = bpd.DataFrame({\n", + " \"index\": [0, 1, 2, 3, 4],\n", + " \"int_col\": [1, 2, 3, 4, 5],\n", + " \"float_col\": [1.0, -0.5, 0.25, -0.125, 0.0625],\n", + " \"string_col\": [\"a\", \"b\", \"c\", \"d\", \"e\"],\n", + "}).set_index(\"index\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Accepting a BigQuery DataFrames (bigframes) DataFrame\n", + "\n", + "The recommended serialization format for a BigQuery DataFrames (bigframes) DataFrame is a BigQuery table. To write a DataFrame to a BigQuery table, use the `DataFrame.to_gbq()` method. With no `destination_table`, BigQuery DataFrames creates a table in the anonymous dataset corresponding to the BigQuery user & location and returns the corresponding table ID." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query job 00b5c727-f2bf-4265-be22-d7d505619db7 is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "'swast-scratch._63cfa399614a54153cc386c27d6c0c6fdb249f9e.bqdf20240327_43bbc4c64fb947f7b69db570a5641506'" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "table_id = df.to_gbq()\n", + "table_id" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Sharing the table with your application's backend\n", + "\n", + "Tables created in the user's anonymous dataset are only queryable by the user who created them. Many applications authenticate with a [service account](https://cloud.google.com/iam/docs/service-account-overview), which may be different from the end-user running BigQuery DataFrames (bigframes).\n", + "\n", + "Grant your application access to this table by granting your application's service account associated with the customer the `roles/bigquery.dataViewer` role on the [BigQuery table with an IAM policy](https://cloud.google.com/bigquery/docs/control-access-to-resources-iam#grant_access_to_a_table_or_view)." + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query job f9c39ac2-a428-45c9-bb3a-643fc62a1c5b is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " index int_col float_col string_col\n", + "0 2 3 0.2500 c\n", + "1 4 5 0.0625 e\n", + "2 0 1 1.0000 a\n", + "3 1 2 -0.5000 b\n", + "4 3 4 -0.1250 d\n" + ] + } + ], + "source": [ + "# This sample assumes the client code knows which service account to share with.\n", + "your_service_account_email = \"your-service-account@bigframes-samples.iam.gserviceaccount.com\"\n", + "\n", + "\n", + "def df_to_gbq_plus_workoad(df):\n", + " table_id = df.to_gbq()\n", + "\n", + " bqclient = df.bqclient\n", + " policy = bqclient.get_iam_policy(table_id)\n", + " binding = {\n", + " \"role\": \"roles/bigquery.dataViewer\",\n", + " \"members\": {f\"serviceAccount:{your_service_account_email}\"},\n", + " }\n", + " policy.bindings.append(binding)\n", + " bqclient.set_iam_policy(table_id, policy)\n", + "\n", + " # TODO(developer): Pass table_id to your application and start your workload.\n", + " example_workload(table_id)\n", + "\n", + "\n", + "def example_workload(table_id):\n", + " # For example, for one node workloads, use the client library to read the table\n", + " # as a pandas DataFrame.\n", + " from google.cloud import bigquery\n", + "\n", + " # This sample assumes this client is authenticated as the user\n", + " # your_service_account_email.\n", + " client = bigquery.Client()\n", + " pandas_df = client.list_rows(table_id).to_dataframe()\n", + " print(pandas_df)\n", + "\n", + "\n", + "df_to_gbq_plus_workoad(df)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query job ad53c7f2-e3bd-4667-b60b-b700c24b7a81 is DONE. 0 Bytes processed. 
Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " index int_col float_col string_col\n", + "0 4 5 0.0625 e\n", + "1 0 1 1.0000 a\n", + "2 2 3 0.2500 c\n", + "3 3 4 -0.1250 d\n", + "4 1 2 -0.5000 b\n" + ] + } + ], + "source": [ + "# This sample assumes the client code doesn't know which service account to share with.\n", + "\n", + "\n", + "def df_to_gbq_plus_workoad(df):\n", + " table_id = df.to_gbq()\n", + "\n", + " bqclient = df.bqclient\n", + " token = bqclient._credentials.token\n", + " project_id = bqclient.project\n", + "\n", + " share_table_and_start_workload(table_id, token, project_id)\n", + "\n", + "\n", + "def share_table_and_start_workload(table_id, token, project_id):\n", + " # This code runs in the backend for your application.\n", + " from google.cloud import bigquery\n", + " import google.oauth2.credentials\n", + "\n", + " # Note: these credentials don't have any way to be refreshed,\n", + " # so only use them long enough to share the table with the\n", + " # service account.\n", + " credentials = google.oauth2.credentials.Credentials(token)\n", + " bqclient = bigquery.Client(\n", + " project=project_id,\n", + " credentials=credentials,\n", + " )\n", + "\n", + " # This is assumed to only be available on the backend.\n", + " your_service_account_email = \"your-service-account@bigframes-samples.iam.gserviceaccount.com\"\n", + " policy = bqclient.get_iam_policy(table_id)\n", + " binding = {\n", + " \"role\": \"roles/bigquery.dataViewer\",\n", + " \"members\": {f\"serviceAccount:{your_service_account_email}\"},\n", + " }\n", + " policy.bindings.append(binding)\n", + " bqclient.set_iam_policy(table_id, policy)\n", + "\n", + " # Now that the table has been shared, bqclient with the temporary token\n", + " # is no longer needed.\n", + " example_workload(table_id)\n", + "\n", + "\n", + "def example_workload(table_id):\n", + " # For example, for one node workloads, use the client library to read the table\n", + " # as a pandas DataFrame.\n", + " from google.cloud import bigquery\n", + "\n", + " # This sample assumes this client is authenticated as the user\n", + " # your_service_account_email.\n", + " client = bigquery.Client()\n", + " pandas_df = client.list_rows(table_id).to_dataframe()\n", + " print(pandas_df)\n", + "\n", + "\n", + "df_to_gbq_plus_workoad(df)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Preserving order\n", + "\n", + "Depending on your use case, you may want to include the ordering so that it can be restored withing your application." + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query job 2aa7033c-c547-4ae2-a9aa-33272be82b9c is DONE. 0 Bytes processed. 
Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "'swast-scratch._63cfa399614a54153cc386c27d6c0c6fdb249f9e.bqdf20240327_b484a3967fba4a41850f4eb21b4b3bd8'" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ordering_column = \"ordering_id_maybe_with_some_random_text_to_avoid_collisions\"\n", + "table_id = df.to_gbq(ordering_id=ordering_column)\n", + "table_id" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Creating clustered tables\n", + "\n", + "Large tables can be optimized by passing in `clustering_columns` to create a [clustered table](https://cloud.google.com/bigquery/docs/clustered-tables)." + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query job 1d489f94-2840-405e-9114-d439dcfcf7aa is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "'swast-scratch._63cfa399614a54153cc386c27d6c0c6fdb249f9e.bqdf20240327_d00699eeeed743b487c870dca5bcf23b'" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "table_id = df.to_gbq(clustering_columns=(\"index\", \"int_col\"))\n", + "table_id" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Returning a BigQuery DataFrames (bigframes) DataFrame\n", + "\n", + "The recommended way to construct a DataFrame is from a BigQuery table which has a unique primary key. By default a primary key is used as the index, which allows for more efficient queries than the default index generation.\n", + "\n", + "This sample assumes there is a shared dataset that\n", + "\n", + "1. The application can write to and\n", + "2. the bigframes user can read from.\n", + "\n", + "There are many ways an application can [write to a BigQuery table](https://cloud.google.com/bigquery/docs/loading-data), including BigQuery load jobs, DML, streaming REST API, and the BigQuery Write API. Each has different costs, performance, and limitations. Choose the one that best suits your application's needs." + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Dataset(DatasetReference('swast-scratch', 'my_dataset'))" + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# The assumption is that there is a shared dataset to work with.\n", + "from google.cloud import bigquery\n", + "\n", + "bqclient = bigquery.Client()\n", + "bqclient.create_dataset(\"my_dataset\", exists_ok=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query job 40977e60-97c3-4c93-89e2-d7334e5af71d is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 81e35bb8-2e27-4a18-b596-15a7805331f0 is DONE. 270 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
statepostal_codepop
unique_index
2MI48105669
3GA303092581
5TX787015373
7CO803012087
11MA021422592
13IL606072630
17MI482012
19NC27701801
23CA926121115
29WA980334952
\n", + "

10 rows × 3 columns

\n", + "
[10 rows x 3 columns in total]" + ], + "text/plain": [ + " state postal_code pop\n", + "unique_index \n", + "2 MI 48105 669\n", + "3 GA 30309 2581\n", + "5 TX 78701 5373\n", + "7 CO 80301 2087\n", + "11 MA 02142 2592\n", + "13 IL 60607 2630\n", + "17 MI 48201 2\n", + "19 NC 27701 801\n", + "23 CA 92612 1115\n", + "29 WA 98033 4952\n", + "\n", + "[10 rows x 3 columns]" + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# For simplicity, this sample assumes your application uses\n", + "# a load job with the CSV file format.\n", + "# See: https://cloud.google.com/bigquery/docs/loading-data-cloud-storage-csv#python\n", + "import datetime\n", + "import io\n", + "import random\n", + "\n", + "\n", + "def create_table_for_bigframes():\n", + " # This code is assumed to run on the application's backend.\n", + " from google.cloud import bigquery\n", + "\n", + " client = bigquery.Client()\n", + "\n", + " # The end-user is expected to have read access to this table.\n", + " table_suffix = f\"{datetime.datetime.now().strftime('%Y%m%d_%H%M%S_%f')}_{random.randrange(1_000_000)}\"\n", + " table_id = f\"{client.project}.my_dataset.integrations_ipynb_{table_suffix}\"\n", + "\n", + " # Best practice: set the primary key to a unique column to use as the\n", + " # index and default ordering in a BigQuery DataFrames (bigframes) DataFrame.\n", + " # Having a unique identity column allows the DataFrame to be constructed\n", + " # more efficiently.\n", + " #\n", + " # Note 1: Even a random UUID would be helpful for efficiency.\n", + " #\n", + " # Note 2: Don't do this if you can't guarantee uniqueness, as the BigQuery\n", + " # query engine uses this property to optimize queries. Non-unique primary\n", + " # keys result in undefined behavior.\n", + " #\n", + " # Note 3: client.create_table doesn't support primary key, so instead\n", + " # use DDL to create the table.\n", + " create_table_ddl = f\"\"\"\n", + " CREATE OR REPLACE TABLE `{table_id}`\n", + " (\n", + " unique_index INT64,\n", + " state STRING,\n", + " postal_code STRING,\n", + " pop INT64,\n", + " PRIMARY KEY (unique_index) NOT ENFORCED\n", + " )\n", + " -- Clustering by the index column can make joins and loc operations more efficient.\n", + " -- Also cluster by columns which are expected to be used as common filters.\n", + " CLUSTER BY unique_index, state\n", + " \"\"\"\n", + " client.query_and_wait(create_table_ddl)\n", + "\n", + " csv_file = io.BytesIO(\n", + "b\"\"\"unique_index,state,postal_code,pop\n", + "2,MI,48105,669\n", + "3,GA,30309,2581\n", + "5,TX,78701,5373\n", + "7,CO,80301,2087\n", + "11,MA,02142,2592\n", + "13,IL,60607,2630\n", + "17,MI,48201,2\n", + "19,NC,27701,801\n", + "23,CA,92612,1115\n", + "29,WA,98033,4952\n", + "\"\"\"\n", + " )\n", + " job_config = bigquery.LoadJobConfig(\n", + " skip_leading_rows=1,\n", + " source_format=bigquery.SourceFormat.CSV,\n", + " )\n", + " load_job = client.load_table_from_file(\n", + " csv_file, table_id, job_config=job_config\n", + " )\n", + " load_job.result() # Waits for the job to complete.\n", + "\n", + " return table_id\n", + "\n", + "\n", + "table_id = create_table_for_bigframes()\n", + "\n", + "\n", + "# This is assumed to run on the client.\n", + "import bigframes.pandas as bpd\n", + "df = bpd.read_gbq_table(table_id, index_col=[\"unique_index\"])\n", + "df" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "bigframes", + "language": "python", + "name": "python3" + }, + 
"language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.1" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 893fe154ed0bf4156ff8424b788782eda1ac9d7a Mon Sep 17 00:00:00 2001 From: Ashley Xu <139821907+ashleyxuu@users.noreply.github.com> Date: Wed, 27 Mar 2024 13:24:16 -0700 Subject: [PATCH 11/53] chore: address comments from technical writers (#528) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes internal #325274012 🦕 --- bigframes/_config/bigquery_options.py | 46 ++++++++++++++++----------- 1 file changed, 27 insertions(+), 19 deletions(-) diff --git a/bigframes/_config/bigquery_options.py b/bigframes/_config/bigquery_options.py index d035fe5df1..9da953a582 100644 --- a/bigframes/_config/bigquery_options.py +++ b/bigframes/_config/bigquery_options.py @@ -56,7 +56,8 @@ def __init__( def application_name(self) -> Optional[str]: """The application name to amend to the user-agent sent to Google APIs. - Recommended format is ``"appplication-name/major.minor.patch_version"`` + The application name to amend to the user agent sent to Google APIs. + The recommended format is ``"appplication-name/major.minor.patch_version"`` or ``"(gpn:PartnerName;)"`` for official Google partners. """ return self._application_name @@ -71,7 +72,7 @@ def application_name(self, value: Optional[str]): @property def credentials(self) -> Optional[google.auth.credentials.Credentials]: - """The OAuth2 Credentials to use for this client.""" + """The OAuth2 credentials to use for this client.""" return self._credentials @credentials.setter @@ -84,7 +85,7 @@ def credentials(self, value: Optional[google.auth.credentials.Credentials]): def location(self) -> Optional[str]: """Default location for job, datasets, and tables. - See: https://cloud.google.com/bigquery/docs/locations + For more information, see https://cloud.google.com/bigquery/docs/locations BigQuery locations. """ return self._location @@ -107,13 +108,15 @@ def project(self, value: Optional[str]): @property def bq_connection(self) -> Optional[str]: - """Name of the BigQuery connection to use. Should be of the form + """Name of the BigQuery connection to use in the form ... - You should either have the connection already created in the - location you have chosen, or you should have the Project IAM - Admin role to enable the service to create the connection for you if you - need it. + You either need to create the connection in a location of your choice, or + you need the Project Admin IAM role to enable the service to create the + connection for you. + + If this option isn't available, or the project or location isn't provided, + then the default connection project/location/connection_id is used in the session. 
If this option isn't provided, or project or location aren't provided, session will use its default project/location/connection_id as default connection. @@ -151,12 +154,12 @@ def use_regional_endpoints(self) -> bool: """Flag to connect to regional API endpoints. .. deprecated:: 0.13.0 - Use of regional endpoints is a feature in preview and + Use of regional endpoints is a feature in Preview and available only in selected regions and projects. - Requires ``location`` to also be set. For example, set - ``location='asia-northeast1'`` and ``use_regional_endpoints=True`` to - connect to asia-northeast1-bigquery.googleapis.com. + Requires that ``location`` is set. For example, to connect to + asia-northeast1-bigquery.googleapis.com, specify + ``location='asia-northeast1'`` and ``use_regional_endpoints=True``. """ return self._use_regional_endpoints @@ -177,17 +180,22 @@ def use_regional_endpoints(self, value: bool): @property def kms_key_name(self) -> Optional[str]: - """Customer managed encryption key used to control encryption of the + """ + Customer-managed encryption key + used to control encryption of the data at rest in BigQuery. This key + takes the format projects/PROJECT_ID/locations/LOCATION/keyRings/KEYRING/cryptoKeys/KEY + + Customer managed encryption key used to control encryption of the data-at-rest in BigQuery. This is of the format projects/PROJECT_ID/locations/LOCATION/keyRings/KEYRING/cryptoKeys/KEY - See https://cloud.google.com/bigquery/docs/customer-managed-encryption - for more details. + For more information, see https://cloud.google.com/bigquery/docs/customer-managed-encryption + Customer-managed Cloud KMS keys - Please make sure the project used for Bigquery DataFrames has "Cloud KMS - CryptoKey Encrypter/Decrypter" role in the key's project, See - https://cloud.google.com/bigquery/docs/customer-managed-encryption#assign_role - for steps on how to ensure that. + Make sure the project used for Bigquery DataFrames has the + Cloud KMS CryptoKey Encrypter/Decrypter IAM role in the key's project. + For more information, see https://cloud.google.com/bigquery/docs/customer-managed-encryption#assign_role + Assign the Encrypter/Decrypter. """ return self._kms_key_name From 9b1525a0c359455160bfbc0dc1366e37982ad01f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Wed, 27 Mar 2024 16:30:36 -0500 Subject: [PATCH 12/53] deps: include `pyarrow` as a dependency (#529) * deps: include `pyarrow` as a dependency Test with a minimum version of pyarrow to confirm compatibility. * fix unit test * fix unit tests * try pyarrow 11 * try pyarrow 8 * skip failing astype test --- noxfile.py | 2 +- setup.py | 1 + testing/constraints-3.10.txt | 14 ++++++++------ testing/constraints-3.9.txt | 1 + tests/system/small/test_series.py | 6 ++++++ 5 files changed, 17 insertions(+), 7 deletions(-) diff --git a/noxfile.py b/noxfile.py index 6b36995480..a002c9def7 100644 --- a/noxfile.py +++ b/noxfile.py @@ -341,8 +341,8 @@ def run_system( pytest_cmd.extend(extra_pytest_options) session.run( *pytest_cmd, - test_folder, *session.posargs, + test_folder, ) diff --git a/setup.py b/setup.py index 768fac530c..86fb9d496c 100644 --- a/setup.py +++ b/setup.py @@ -47,6 +47,7 @@ "ibis-framework[bigquery] >=8.0.0,<9.0.0dev", # TODO: Relax upper bound once we have fixed `system_prerelease` tests. 
"pandas >=1.5.0", + "pyarrow >=8.0.0", "pydata-google-auth >=1.8.2", "requests >=2.27.1", "scikit-learn >=1.2.2", diff --git a/testing/constraints-3.10.txt b/testing/constraints-3.10.txt index 9f0786f47e..5782b03a2f 100644 --- a/testing/constraints-3.10.txt +++ b/testing/constraints-3.10.txt @@ -1,15 +1,17 @@ # Keep in sync with colab/containers/requirements.core.in image -google-auth==2.17.3 +google-auth==2.27.0 ipykernel==5.5.6 ipython==7.34.0 -notebook==6.4.8 -pandas==1.5.3 -portpicker==1.3.9 -requests==2.27.1 -tornado==6.3.1 +notebook==6.5.5 +pandas==2.0.3 +pandas-stubs==2.0.3.230814 +portpicker==1.5.2 +requests==2.31.0 +tornado==6.3.3 absl-py==1.4.0 debugpy==1.6.6 ipywidgets==7.7.1 matplotlib==3.7.1 psutil==5.9.5 +seaborn==0.13.1 traitlets==5.7.1 diff --git a/testing/constraints-3.9.txt b/testing/constraints-3.9.txt index 0aeb15eab8..1e1f3a3e66 100644 --- a/testing/constraints-3.9.txt +++ b/testing/constraints-3.9.txt @@ -12,6 +12,7 @@ google-cloud-resource-manager==1.10.3 google-cloud-storage==2.0.0 ibis-framework==8.0.0 pandas==1.5.0 +pyarrow==8.0.0 pydata-google-auth==1.8.2 requests==2.27.1 scikit-learn==1.2.2 diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index 258fb1cfd8..794ab6b7a2 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -2783,6 +2783,12 @@ def test_string_astype_float(): def test_string_astype_date(): + if int(pa.__version__.split(".")[0]) < 15: + pytest.skip( + "Avoid pyarrow.lib.ArrowNotImplementedError: " + "Unsupported cast from string to date32 using function cast_date32." + ) + pd_series = pd.Series(["2014-08-15", "2215-08-15", "2016-02-29"]).astype( pd.ArrowDtype(pa.string()) ) From 56cefff894a1819aab85cc7bdd38469fbc8072bb Mon Sep 17 00:00:00 2001 From: Henry Solberg Date: Wed, 27 Mar 2024 14:44:16 -0700 Subject: [PATCH 13/53] chore: experimentally double load test timeout (#535) While I work in parallel from other angles, I'd like to try out doubling the timeouts for the kokoro runs, just as another data point. 
--- .kokoro/load/common.cfg | 2 +- noxfile.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.kokoro/load/common.cfg b/.kokoro/load/common.cfg index 7f6fa7e0d9..d86932662d 100644 --- a/.kokoro/load/common.cfg +++ b/.kokoro/load/common.cfg @@ -8,4 +8,4 @@ action { } build_file: "python-bigquery-dataframes/.kokoro/build.sh" -timeout_mins: 360 +timeout_mins: 720 diff --git a/noxfile.py b/noxfile.py index a002c9def7..a5e77964f1 100644 --- a/noxfile.py +++ b/noxfile.py @@ -399,7 +399,7 @@ def load(session: nox.sessions.Session): prefix_name="load", test_folder=os.path.join("tests", "system", "load"), print_duration=True, - timeout_seconds=60 * 60, + timeout_seconds=60 * 60 * 12, ) From 082c58bbe76821b90337dc5af0ab5fa7515682c2 Mon Sep 17 00:00:00 2001 From: Henry Solberg Date: Thu, 28 Mar 2024 07:08:18 -0700 Subject: [PATCH 14/53] fix: don't download 100gb onto local python machine in load test (#537) * fix: don't download 100gb onto local python machine in load test * Update test_large_tables.py --- tests/system/load/test_large_tables.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/tests/system/load/test_large_tables.py b/tests/system/load/test_large_tables.py index 1d4a6b0a5b..22baa2268f 100644 --- a/tests/system/load/test_large_tables.py +++ b/tests/system/load/test_large_tables.py @@ -74,13 +74,9 @@ def test_index_repr_large_table(): assert actual is not None -# FAILED -# tests/system/load/test_large_tables.py::test_to_pandas_batches_large_table -# google.api_core.exceptions.Forbidden: 403 Response too large to return. -# Consider specifying a destination table in your job... -@pytest.mark.xfail def test_to_pandas_batches_large_table(): - df = bpd.read_gbq("load_testing.scalars_100gb") + df = bpd.read_gbq("load_testing.scalars_10gb") + # df will be downloaded locally expected_row_count, expected_column_count = df.shape row_count = 0 From ae528d76fa1585f7581e3c774f9554944d25431e Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Thu, 28 Mar 2024 18:58:55 +0000 Subject: [PATCH 15/53] test: add e2e tests for all BQ locations (#517) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat: support BQ regional endpoints for europe-west9, europe-west3, us-east4, and us-west1 * add location tests with lep and rep behavior * add default location test * fix mypy failures * Update tests/system/large/test_location.py * make pytest paramxn order deterministic * only unit tests for LEP * remove unit tests on ClientsProvider as it is failing for credentials --------- Co-authored-by: Tim Sweña (Swast) --- bigframes/session/clients.py | 12 +-- tests/config.py | 72 ++++++++++++++++ tests/system/large/test_location.py | 129 ++++++++++++++++++++++++++++ 3 files changed, 207 insertions(+), 6 deletions(-) create mode 100644 tests/config.py create mode 100644 tests/system/large/test_location.py diff --git a/bigframes/session/clients.py b/bigframes/session/clients.py index d97e53901d..32f13fa00d 100644 --- a/bigframes/session/clients.py +++ b/bigframes/session/clients.py @@ -69,12 +69,12 @@ class ClientsProvider: def __init__( self, - project: Optional[str], - location: Optional[str], - use_regional_endpoints: Optional[bool], - credentials: Optional[google.auth.credentials.Credentials], - application_name: Optional[str], - bq_kms_key_name: Optional[str], + project: Optional[str] = None, + location: Optional[str] = None, + use_regional_endpoints: Optional[bool] = None, + credentials: 
Optional[google.auth.credentials.Credentials] = None, + application_name: Optional[str] = None, + bq_kms_key_name: Optional[str] = None, ): credentials_project = None if credentials is None: diff --git a/tests/config.py b/tests/config.py new file mode 100644 index 0000000000..a885d7e71d --- /dev/null +++ b/tests/config.py @@ -0,0 +1,72 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# https://cloud.google.com/bigquery/docs/locations +ALL_BIGQUERY_LOCATIONS = [ + "us-east5", + "us-south1", + "us-central1", + "us-west4", + "us-west2", + "northamerica-northeast1", + "us-east4", + "us-west1", + "us-west3", + "southamerica-east1", + "southamerica-west1", + "us-east1", + "northamerica-northeast2", + "asia-south2", + "asia-east2", + "asia-southeast2", + "australia-southeast2", + "asia-south1", + "asia-northeast2", + "asia-northeast3", + "asia-southeast1", + "australia-southeast1", + "asia-east1", + "asia-northeast1", + "europe-west1", + "europe-west10", + "europe-north1", + "europe-west3", + "europe-west2", + "europe-southwest1", + "europe-west8", + "europe-west4", + "europe-west9", + "europe-west12", + "europe-central2", + "europe-west6", + "me-central2", + "me-central1", + "me-west1", + "me-central2", + "me-central1", + "me-west1", + "africa-south1", +] + +REP_ENABLED_BIGQUERY_LOCATIONS = [ + "me-central2", + "europe-west9", + "europe-west3", + "us-east4", + "us-west1", +] + +LEP_ENABLED_BIGQUERY_LOCATIONS = sorted( + set(ALL_BIGQUERY_LOCATIONS) - set(REP_ENABLED_BIGQUERY_LOCATIONS) +) diff --git a/tests/system/large/test_location.py b/tests/system/large/test_location.py new file mode 100644 index 0000000000..a4cf8919a0 --- /dev/null +++ b/tests/system/large/test_location.py @@ -0,0 +1,129 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import typing + +from google.cloud import bigquery +import pytest + +import bigframes +import bigframes.session.clients +from tests import config + + +def _assert_bq_execution_location(session: bigframes.Session): + df = session.read_gbq( + """ + SELECT "aaa" as name, 111 as number + UNION ALL + SELECT "bbb" as name, 222 as number + UNION ALL + SELECT "aaa" as name, 333 as number + """ + ) + + assert ( + typing.cast(bigquery.QueryJob, df.query_job).location + == session.bqclient.location + ) + + result = ( + df[["name", "number"]] + .groupby("name") + .sum(numeric_only=True) + .sort_values("number", ascending=False) + .head() + ) + + assert ( + typing.cast(bigquery.QueryJob, result.query_job).location + == session.bqclient.location + ) + + +def test_bq_location_default(): + session = bigframes.Session() + + assert session.bqclient.location == "US" + + # by default global endpoint is used + assert ( + session.bqclient._connection.API_BASE_URL == "https://bigquery.googleapis.com" + ) + + # assert that bigframes session honors the location + _assert_bq_execution_location(session) + + +@pytest.mark.parametrize("bigquery_location", config.ALL_BIGQUERY_LOCATIONS) +def test_bq_location(bigquery_location): + session = bigframes.Session( + context=bigframes.BigQueryOptions(location=bigquery_location) + ) + + assert session.bqclient.location == bigquery_location + + # by default global endpoint is used + assert ( + session.bqclient._connection.API_BASE_URL == "https://bigquery.googleapis.com" + ) + + # assert that bigframes session honors the location + _assert_bq_execution_location(session) + + +@pytest.mark.parametrize( + "bigquery_location", + config.REP_ENABLED_BIGQUERY_LOCATIONS, +) +def test_bq_rep_endpoints(bigquery_location): + session = bigframes.Session( + context=bigframes.BigQueryOptions( + location=bigquery_location, use_regional_endpoints=True + ) + ) + + assert session.bqclient.location == bigquery_location + assert ( + session.bqclient._connection.API_BASE_URL + == "https://bigquery.{location}.rep.googleapis.com".format( + location=bigquery_location + ) + ) + + # assert that bigframes session honors the location + _assert_bq_execution_location(session) + + +@pytest.mark.parametrize( + "bigquery_location", + config.LEP_ENABLED_BIGQUERY_LOCATIONS, +) +def test_bq_lep_endpoints(bigquery_location): + # We are not testing BigFrames Session for LEP endpoints because it involves + # query execution using the endpoint, which requires the project to be + # allowlisted for LEP access. We could hardcode one project which is + # allowlisted but then not every open source developer will have access to + # that. Let's rely on just creating the clients for LEP. + clients_provider = bigframes.session.clients.ClientsProvider( + location=bigquery_location, use_regional_endpoints=True + ) + + assert clients_provider.bqclient.location == bigquery_location + assert ( + clients_provider.bqclient._connection.API_BASE_URL + == "https://{location}-bigquery.googleapis.com".format( + location=bigquery_location + ) + ) From ae4ff8d1ef40e0634b842a257ea1043c6d00dccb Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Thu, 28 Mar 2024 12:50:57 -0700 Subject: [PATCH 16/53] refactor: Add type constraints to internal op definitions. 
(#532) --- bigframes/core/nodes.py | 6 + bigframes/dtypes.py | 47 +++- bigframes/operations/__init__.py | 367 +++++++++++++++++++-------- bigframes/operations/type.py | 197 +++++++++++--- tests/system/small/test_dataframe.py | 2 +- 5 files changed, 472 insertions(+), 147 deletions(-) diff --git a/bigframes/core/nodes.py b/bigframes/core/nodes.py index 5ebd2a5997..c1ceeebffe 100644 --- a/bigframes/core/nodes.py +++ b/bigframes/core/nodes.py @@ -318,6 +318,12 @@ def __hash__(self): class ProjectionNode(UnaryNode): assignments: typing.Tuple[typing.Tuple[ex.Expression, str], ...] + def __post_init__(self): + input_types = self.child.schema._mapping + for expression, id in self.assignments: + # throws TypeError if invalid + _ = expression.output_type(input_types) + def __hash__(self): return self._node_hash diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index 63adc059f3..79e1456f31 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -47,13 +47,19 @@ # None represents the type of a None scalar. ExpressionType = typing.Optional[Dtype] -# Used when storing Null expressions -DEFAULT_DTYPE = pd.Float64Dtype() INT_DTYPE = pd.Int64Dtype() FLOAT_DTYPE = pd.Float64Dtype() BOOL_DTYPE = pd.BooleanDtype() STRING_DTYPE = pd.StringDtype(storage="pyarrow") +BYTES_DTYPE = pd.ArrowDtype(pa.binary()) +DATE_DTYPE = pd.ArrowDtype(pa.date32()) +TIME_DTYPE = pd.ArrowDtype(pa.time64("us")) +DATETIME_DTYPE = pd.ArrowDtype(pa.timestamp("us")) +TIMESTAMP_DTYPE = pd.ArrowDtype(pa.timestamp("us", tz="UTC")) + +# Used when storing Null expressions +DEFAULT_DTYPE = FLOAT_DTYPE # On BQ side, ARRAY, STRUCT, GEOGRAPHY, JSON are not orderable UNORDERED_DTYPES = [gpd.array.GeometryDtype()] @@ -100,6 +106,43 @@ pd.ArrowDtype(pa.decimal256(76, 38)), ] + +## dtype predicates - use these to maintain consistency +def is_datetime_like(type: ExpressionType) -> bool: + return type in (DATETIME_DTYPE, TIMESTAMP_DTYPE) + + +def is_date_like(type: ExpressionType) -> bool: + return type in (DATETIME_DTYPE, TIMESTAMP_DTYPE, DATE_DTYPE) + + +def is_time_like(type: ExpressionType) -> bool: + return type in (DATETIME_DTYPE, TIMESTAMP_DTYPE, TIME_DTYPE) + + +def is_binary_like(type: ExpressionType) -> bool: + return type in (BOOL_DTYPE, BYTES_DTYPE, INT_DTYPE) + + +def is_string_like(type: ExpressionType) -> bool: + return type in (STRING_DTYPE, BYTES_DTYPE) + + +def is_array_like(type: ExpressionType) -> bool: + if isinstance(type, pd.ArrowDtype) and isinstance(type.pyarrow_dtype, pa.ListType): + return True + else: + return type in (STRING_DTYPE, BYTES_DTYPE) + + +def is_numeric(type: ExpressionType) -> bool: + return type in NUMERIC_BIGFRAMES_TYPES_PERMISSIVE + + +def is_comparable(type: ExpressionType) -> bool: + return (type is not None) and (type not in UNORDERED_DTYPES) + + # Type hints for Ibis data types that can be read to Python objects by BigQuery DataFrame ReadOnlyIbisDtype = Union[ ibis_dtypes.Binary, diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py index 2ef71fde7f..dcd5494626 100644 --- a/bigframes/operations/__init__.py +++ b/bigframes/operations/__init__.py @@ -161,102 +161,165 @@ def _convert_expr_input( # Operation Factories -def create_unary_op( - name: str, type_rule: op_typing.OpTypeRule = op_typing.INPUT_TYPE -) -> UnaryOp: +def create_unary_op(name: str, type_signature: op_typing.UnaryTypeSignature) -> UnaryOp: return dataclasses.make_dataclass( name, - [("name", typing.ClassVar[str], name), ("output_type", typing.ClassVar[typing.Callable], type_rule.as_method)], # type: 
ignore + [("name", typing.ClassVar[str], name), ("output_type", typing.ClassVar[typing.Callable], type_signature.as_method)], # type: ignore bases=(UnaryOp,), frozen=True, )() def create_binary_op( - name: str, type_rule: op_typing.OpTypeRule = op_typing.Supertype() + name: str, type_signature: op_typing.BinaryTypeSignature ) -> BinaryOp: return dataclasses.make_dataclass( name, - [("name", typing.ClassVar[str], name), ("output_type", typing.ClassVar[typing.Callable], type_rule.as_method)], # type: ignore + [("name", typing.ClassVar[str], name), ("output_type", typing.ClassVar[typing.Callable], type_signature.as_method)], # type: ignore bases=(BinaryOp,), frozen=True, )() -def create_ternary_op( - name: str, type_rule: op_typing.OpTypeRule = op_typing.Supertype() -) -> TernaryOp: - return dataclasses.make_dataclass( - name, - [("name", typing.ClassVar[str], name), ("output_type", typing.ClassVar[typing.Callable], type_rule.as_method)], # type: ignore - bases=(TernaryOp,), - frozen=True, - )() - - # Unary Ops ## Generic Ops -invert_op = create_unary_op(name="invert", type_rule=op_typing.INPUT_TYPE) -isnull_op = create_unary_op(name="isnull", type_rule=op_typing.PREDICATE) -notnull_op = create_unary_op(name="notnull", type_rule=op_typing.PREDICATE) -hash_op = create_unary_op(name="hash", type_rule=op_typing.INTEGER) +invert_op = create_unary_op( + name="invert", + type_signature=op_typing.TypePreserving( + dtypes.is_binary_like, + description="binary-like", + ), +) # numeric +isnull_op = create_unary_op( + name="isnull", + type_signature=op_typing.FixedOutputType( + lambda x: True, dtypes.BOOL_DTYPE, description="nullable" + ), +) +notnull_op = create_unary_op( + name="notnull", + type_signature=op_typing.FixedOutputType( + lambda x: True, dtypes.BOOL_DTYPE, description="nullable" + ), +) +hash_op = create_unary_op( + name="hash", + type_signature=op_typing.FixedOutputType( + dtypes.is_string_like, dtypes.INT_DTYPE, description="string-like" + ), +) ## String Ops -len_op = create_unary_op(name="len", type_rule=op_typing.INTEGER) -reverse_op = create_unary_op(name="reverse", type_rule=op_typing.STRING) -lower_op = create_unary_op(name="lower", type_rule=op_typing.STRING) -upper_op = create_unary_op(name="upper", type_rule=op_typing.STRING) -strip_op = create_unary_op(name="strip", type_rule=op_typing.STRING) -isalnum_op = create_unary_op(name="isalnum", type_rule=op_typing.PREDICATE) -isalpha_op = create_unary_op(name="isalpha", type_rule=op_typing.PREDICATE) -isdecimal_op = create_unary_op(name="isdecimal", type_rule=op_typing.PREDICATE) -isdigit_op = create_unary_op(name="isdigit", type_rule=op_typing.PREDICATE) -isnumeric_op = create_unary_op(name="isnumeric", type_rule=op_typing.PREDICATE) -isspace_op = create_unary_op(name="isspace", type_rule=op_typing.PREDICATE) -islower_op = create_unary_op(name="islower", type_rule=op_typing.PREDICATE) -isupper_op = create_unary_op(name="isupper", type_rule=op_typing.PREDICATE) -rstrip_op = create_unary_op(name="rstrip", type_rule=op_typing.STRING) -lstrip_op = create_unary_op(name="lstrip", type_rule=op_typing.STRING) -capitalize_op = create_unary_op(name="capitalize", type_rule=op_typing.STRING) +len_op = create_unary_op( + name="len", + type_signature=op_typing.FixedOutputType( + dtypes.is_array_like, dtypes.INT_DTYPE, description="array-like" + ), +) +reverse_op = create_unary_op(name="reverse", type_signature=op_typing.STRING_TRANSFORM) +lower_op = create_unary_op(name="lower", type_signature=op_typing.STRING_TRANSFORM) +upper_op = 
create_unary_op(name="upper", type_signature=op_typing.STRING_TRANSFORM) +strip_op = create_unary_op(name="strip", type_signature=op_typing.STRING_TRANSFORM) +isalnum_op = create_unary_op(name="isalnum", type_signature=op_typing.STRING_PREDICATE) +isalpha_op = create_unary_op(name="isalpha", type_signature=op_typing.STRING_PREDICATE) +isdecimal_op = create_unary_op( + name="isdecimal", type_signature=op_typing.STRING_PREDICATE +) +isdigit_op = create_unary_op(name="isdigit", type_signature=op_typing.STRING_PREDICATE) +isnumeric_op = create_unary_op( + name="isnumeric", type_signature=op_typing.STRING_PREDICATE +) +isspace_op = create_unary_op(name="isspace", type_signature=op_typing.STRING_PREDICATE) +islower_op = create_unary_op(name="islower", type_signature=op_typing.STRING_PREDICATE) +isupper_op = create_unary_op(name="isupper", type_signature=op_typing.STRING_PREDICATE) +rstrip_op = create_unary_op(name="rstrip", type_signature=op_typing.STRING_TRANSFORM) +lstrip_op = create_unary_op(name="lstrip", type_signature=op_typing.STRING_TRANSFORM) +capitalize_op = create_unary_op( + name="capitalize", type_signature=op_typing.STRING_TRANSFORM +) ## DateTime Ops -day_op = create_unary_op(name="day", type_rule=op_typing.INTEGER) -dayofweek_op = create_unary_op(name="dayofweek", type_rule=op_typing.INTEGER) +### datelike accessors +day_op = create_unary_op( + name="day", + type_signature=op_typing.DATELIKE_ACCESSOR, +) +month_op = create_unary_op( + name="month", + type_signature=op_typing.DATELIKE_ACCESSOR, +) +year_op = create_unary_op( + name="year", + type_signature=op_typing.DATELIKE_ACCESSOR, +) +dayofweek_op = create_unary_op( + name="dayofweek", + type_signature=op_typing.DATELIKE_ACCESSOR, +) +quarter_op = create_unary_op( + name="quarter", + type_signature=op_typing.DATELIKE_ACCESSOR, +) +### timelike accessors +hour_op = create_unary_op( + name="hour", + type_signature=op_typing.TIMELIKE_ACCESSOR, +) +minute_op = create_unary_op( + name="minute", + type_signature=op_typing.TIMELIKE_ACCESSOR, +) +second_op = create_unary_op( + name="second", + type_signature=op_typing.TIMELIKE_ACCESSOR, +) +normalize_op = create_unary_op( + name="normalize", + type_signature=op_typing.TypePreserving( + dtypes.is_time_like, + description="time-like", + ), +) +### datetimelike accessors date_op = create_unary_op( - name="date", type_rule=op_typing.Fixed(pd.ArrowDtype(pa.date32())) + name="date", + type_signature=op_typing.FixedOutputType( + dtypes.is_date_like, dtypes.DATE_DTYPE, description="date-like" + ), ) -hour_op = create_unary_op(name="hour", type_rule=op_typing.INTEGER) -minute_op = create_unary_op(name="minute", type_rule=op_typing.INTEGER) -month_op = create_unary_op(name="month", type_rule=op_typing.INTEGER) -quarter_op = create_unary_op(name="quarter", type_rule=op_typing.INTEGER) -second_op = create_unary_op(name="second", type_rule=op_typing.INTEGER) time_op = create_unary_op( - name="time", type_rule=op_typing.Fixed(pd.ArrowDtype(pa.time64("us"))) + name="time", + type_signature=op_typing.FixedOutputType( + dtypes.is_time_like, dtypes.TIME_DTYPE, description="time-like" + ), ) -year_op = create_unary_op(name="year", type_rule=op_typing.INTEGER) -normalize_op = create_unary_op(name="normalize") ## Trigonometry Ops -sin_op = create_unary_op(name="sin", type_rule=op_typing.REAL_NUMERIC) -cos_op = create_unary_op(name="cos", type_rule=op_typing.REAL_NUMERIC) -tan_op = create_unary_op(name="tan", type_rule=op_typing.REAL_NUMERIC) -arcsin_op = create_unary_op(name="arcsin", 
type_rule=op_typing.REAL_NUMERIC) -arccos_op = create_unary_op(name="arccos", type_rule=op_typing.REAL_NUMERIC) -arctan_op = create_unary_op(name="arctan", type_rule=op_typing.REAL_NUMERIC) -sinh_op = create_unary_op(name="sinh", type_rule=op_typing.REAL_NUMERIC) -cosh_op = create_unary_op(name="cosh", type_rule=op_typing.REAL_NUMERIC) -tanh_op = create_unary_op(name="tanh", type_rule=op_typing.REAL_NUMERIC) -arcsinh_op = create_unary_op(name="arcsinh", type_rule=op_typing.REAL_NUMERIC) -arccosh_op = create_unary_op(name="arccosh", type_rule=op_typing.REAL_NUMERIC) -arctanh_op = create_unary_op(name="arctanh", type_rule=op_typing.REAL_NUMERIC) -arctan2_op = create_binary_op(name="arctan2", type_rule=op_typing.REAL_NUMERIC) +sin_op = create_unary_op(name="sin", type_signature=op_typing.UNARY_REAL_NUMERIC) +cos_op = create_unary_op(name="cos", type_signature=op_typing.UNARY_REAL_NUMERIC) +tan_op = create_unary_op(name="tan", type_signature=op_typing.UNARY_REAL_NUMERIC) +arcsin_op = create_unary_op(name="arcsin", type_signature=op_typing.UNARY_REAL_NUMERIC) +arccos_op = create_unary_op(name="arccos", type_signature=op_typing.UNARY_REAL_NUMERIC) +arctan_op = create_unary_op(name="arctan", type_signature=op_typing.UNARY_REAL_NUMERIC) +sinh_op = create_unary_op(name="sinh", type_signature=op_typing.UNARY_REAL_NUMERIC) +cosh_op = create_unary_op(name="cosh", type_signature=op_typing.UNARY_REAL_NUMERIC) +tanh_op = create_unary_op(name="tanh", type_signature=op_typing.UNARY_REAL_NUMERIC) +arcsinh_op = create_unary_op( + name="arcsinh", type_signature=op_typing.UNARY_REAL_NUMERIC +) +arccosh_op = create_unary_op( + name="arccosh", type_signature=op_typing.UNARY_REAL_NUMERIC +) +arctanh_op = create_unary_op( + name="arctanh", type_signature=op_typing.UNARY_REAL_NUMERIC +) ## Numeric Ops -floor_op = create_unary_op(name="floor", type_rule=op_typing.REAL_NUMERIC) -ceil_op = create_unary_op(name="ceil", type_rule=op_typing.REAL_NUMERIC) -abs_op = create_unary_op(name="abs", type_rule=op_typing.INPUT_TYPE) -exp_op = create_unary_op(name="exp", type_rule=op_typing.REAL_NUMERIC) -expm1_op = create_unary_op(name="expm1", type_rule=op_typing.REAL_NUMERIC) -ln_op = create_unary_op(name="log", type_rule=op_typing.REAL_NUMERIC) -log10_op = create_unary_op(name="log10", type_rule=op_typing.REAL_NUMERIC) -log1p_op = create_unary_op(name="log1p", type_rule=op_typing.REAL_NUMERIC) -sqrt_op = create_unary_op(name="sqrt", type_rule=op_typing.REAL_NUMERIC) +floor_op = create_unary_op(name="floor", type_signature=op_typing.UNARY_REAL_NUMERIC) +ceil_op = create_unary_op(name="ceil", type_signature=op_typing.UNARY_REAL_NUMERIC) +abs_op = create_unary_op(name="abs", type_signature=op_typing.UNARY_NUMERIC) +exp_op = create_unary_op(name="exp", type_signature=op_typing.UNARY_REAL_NUMERIC) +expm1_op = create_unary_op(name="expm1", type_signature=op_typing.UNARY_REAL_NUMERIC) +ln_op = create_unary_op(name="log", type_signature=op_typing.UNARY_REAL_NUMERIC) +log10_op = create_unary_op(name="log10", type_signature=op_typing.UNARY_REAL_NUMERIC) +log1p_op = create_unary_op(name="log1p", type_signature=op_typing.UNARY_REAL_NUMERIC) +sqrt_op = create_unary_op(name="sqrt", type_signature=op_typing.UNARY_REAL_NUMERIC) # Parameterized unary ops @@ -266,7 +329,7 @@ class StrContainsOp(UnaryOp): pat: str def output_type(self, *input_types): - return dtypes.BOOL_DTYPE + return op_typing.STRING_PREDICATE.output_type(input_types[0]) @dataclasses.dataclass(frozen=True) @@ -275,7 +338,7 @@ class StrContainsRegexOp(UnaryOp): pat: str def 
output_type(self, *input_types): - return dtypes.BOOL_DTYPE + return op_typing.STRING_PREDICATE.output_type(input_types[0]) @dataclasses.dataclass(frozen=True) @@ -284,7 +347,7 @@ class StrGetOp(UnaryOp): i: int def output_type(self, *input_types): - return dtypes.STRING_DTYPE + return op_typing.STRING_TRANSFORM.output_type(input_types[0]) @dataclasses.dataclass(frozen=True) @@ -295,7 +358,7 @@ class StrPadOp(UnaryOp): side: typing.Literal["both", "left", "right"] def output_type(self, *input_types): - return dtypes.STRING_DTYPE + return op_typing.STRING_TRANSFORM.output_type(input_types[0]) @dataclasses.dataclass(frozen=True) @@ -305,7 +368,7 @@ class ReplaceStrOp(UnaryOp): repl: str def output_type(self, *input_types): - return dtypes.STRING_DTYPE + return op_typing.STRING_TRANSFORM.output_type(input_types[0]) @dataclasses.dataclass(frozen=True) @@ -315,7 +378,7 @@ class RegexReplaceStrOp(UnaryOp): repl: str def output_type(self, *input_types): - return dtypes.STRING_DTYPE + return op_typing.STRING_TRANSFORM.output_type(input_types[0]) @dataclasses.dataclass(frozen=True) @@ -324,7 +387,7 @@ class StartsWithOp(UnaryOp): pat: typing.Sequence[str] def output_type(self, *input_types): - return dtypes.BOOL_DTYPE + return op_typing.STRING_PREDICATE.output_type(input_types[0]) @dataclasses.dataclass(frozen=True) @@ -333,7 +396,7 @@ class EndsWithOp(UnaryOp): pat: typing.Sequence[str] def output_type(self, *input_types): - return dtypes.BOOL_DTYPE + return op_typing.STRING_PREDICATE.output_type(input_types[0]) @dataclasses.dataclass(frozen=True) @@ -342,7 +405,7 @@ class ZfillOp(UnaryOp): width: int def output_type(self, *input_types): - return dtypes.STRING_DTYPE + return op_typing.STRING_TRANSFORM.output_type(input_types[0]) @dataclasses.dataclass(frozen=True) @@ -353,7 +416,10 @@ class StrFindOp(UnaryOp): end: typing.Optional[int] def output_type(self, *input_types): - return dtypes.INT_DTYPE + signature = op_typing.FixedOutputType( + dtypes.is_string_like, dtypes.INT_DTYPE, "string-like" + ) + return signature.output_type(input_types[0]) @dataclasses.dataclass(frozen=True) @@ -363,7 +429,7 @@ class StrExtractOp(UnaryOp): n: int = 1 def output_type(self, *input_types): - return dtypes.STRING_DTYPE + return op_typing.STRING_TRANSFORM.output_type(input_types[0]) @dataclasses.dataclass(frozen=True) @@ -373,7 +439,7 @@ class StrSliceOp(UnaryOp): end: typing.Optional[int] def output_type(self, *input_types): - return dtypes.STRING_DTYPE + return op_typing.STRING_TRANSFORM.output_type(input_types[0]) @dataclasses.dataclass(frozen=True) @@ -382,7 +448,7 @@ class StrRepeatOp(UnaryOp): repeats: int def output_type(self, *input_types): - return dtypes.STRING_DTYPE + return op_typing.STRING_TRANSFORM.output_type(input_types[0]) # Other parameterized unary operations @@ -392,9 +458,15 @@ class StructFieldOp(UnaryOp): name_or_index: str | int def output_type(self, *input_types): - pd_type = typing.cast(pd.ArrowDtype, input_types[0]) - pa_struct_t = typing.cast(pa.StructType, pd_type.pyarrow_dtype) - pa_result_type = pa_struct_t[self.name_or_index].type + input_type = input_types[0] + if not isinstance(input_type, pd.ArrowDtype): + raise TypeError("field accessor input must be a struct type") + + pa_type = input_type.pyarrow_dtype + if not isinstance(pa_type, pa.StructType): + raise TypeError("field accessor input must be a struct type") + + pa_result_type = pa_type[self.name_or_index].type # TODO: Directly convert from arrow to pandas type ibis_result_type = 
dtypes.arrow_dtype_to_ibis_dtype(pa_result_type) return dtypes.ibis_dtype_to_bigframes_dtype(ibis_result_type) @@ -476,37 +548,100 @@ def output_type(self, *input_types): # Binary Ops -fillna_op = create_binary_op(name="fillna") -cliplower_op = create_binary_op(name="clip_lower") -clipupper_op = create_binary_op(name="clip_upper") -coalesce_op = create_binary_op(name="coalesce") +fillna_op = create_binary_op(name="fillna", type_signature=op_typing.COMMON_SUPERTYPE) +cliplower_op = create_binary_op( + name="clip_lower", type_signature=op_typing.COMMON_SUPERTYPE +) +clipupper_op = create_binary_op( + name="clip_upper", type_signature=op_typing.COMMON_SUPERTYPE +) +coalesce_op = create_binary_op( + name="coalesce", type_signature=op_typing.COMMON_SUPERTYPE +) + + ## Math Ops -add_op = create_binary_op(name="add", type_rule=op_typing.NUMERIC) -sub_op = create_binary_op(name="sub", type_rule=op_typing.NUMERIC) -mul_op = create_binary_op(name="mul", type_rule=op_typing.NUMERIC) -div_op = create_binary_op(name="div", type_rule=op_typing.REAL_NUMERIC) -floordiv_op = create_binary_op(name="floordiv", type_rule=op_typing.NUMERIC) -pow_op = create_binary_op(name="pow", type_rule=op_typing.NUMERIC) -mod_op = create_binary_op(name="mod", type_rule=op_typing.NUMERIC) -round_op = create_binary_op(name="round", type_rule=op_typing.REAL_NUMERIC) -unsafe_pow_op = create_binary_op(name="unsafe_pow_op", type_rule=op_typing.REAL_NUMERIC) +@dataclasses.dataclass(frozen=True) +class AddOp(BinaryOp): + name: typing.ClassVar[str] = "add" + + def output_type(self, *input_types): + left_type = input_types[0] + right_type = input_types[1] + if all(map(dtypes.is_string_like, input_types)) and len(set(input_types)) == 1: + # String addition + return input_types[0] + if (left_type is None or dtypes.is_numeric(left_type)) and ( + right_type is None or dtypes.is_numeric(right_type) + ): + # Numeric addition + return dtypes.lcd_etype(left_type, right_type) + # TODO: Add temporal addition once delta types supported + raise TypeError(f"Cannot add dtypes {left_type} and {right_type}") + + +@dataclasses.dataclass(frozen=True) +class SubOp(BinaryOp): + name: typing.ClassVar[str] = "sub" + + # Note: this is actualyl a vararg op, but we don't model that yet + def output_type(self, *input_types): + left_type = input_types[0] + right_type = input_types[1] + if (left_type is None or dtypes.is_numeric(left_type)) and ( + right_type is None or dtypes.is_numeric(right_type) + ): + # Numeric subtraction + return dtypes.lcd_etype(left_type, right_type) + # TODO: Add temporal addition once delta types supported + raise TypeError(f"Cannot subtract dtypes {left_type} and {right_type}") + + +add_op = AddOp() +sub_op = SubOp() +mul_op = create_binary_op(name="mul", type_signature=op_typing.BINARY_NUMERIC) +div_op = create_binary_op(name="div", type_signature=op_typing.BINARY_REAL_NUMERIC) +floordiv_op = create_binary_op(name="floordiv", type_signature=op_typing.BINARY_NUMERIC) +pow_op = create_binary_op(name="pow", type_signature=op_typing.BINARY_NUMERIC) +mod_op = create_binary_op(name="mod", type_signature=op_typing.BINARY_NUMERIC) +arctan2_op = create_binary_op( + name="arctan2", type_signature=op_typing.BINARY_REAL_NUMERIC +) +round_op = create_binary_op(name="round", type_signature=op_typing.BINARY_REAL_NUMERIC) +unsafe_pow_op = create_binary_op( + name="unsafe_pow_op", type_signature=op_typing.BINARY_REAL_NUMERIC +) # Logical Ops -and_op = create_binary_op(name="and") -or_op = create_binary_op(name="or") +and_op = 
create_binary_op(name="and", type_signature=op_typing.LOGICAL) +or_op = create_binary_op(name="or", type_signature=op_typing.LOGICAL) ## Comparison Ops -eq_op = create_binary_op(name="eq", type_rule=op_typing.PREDICATE) +eq_op = create_binary_op(name="eq", type_signature=op_typing.COMPARISON) eq_null_match_op = create_binary_op( - name="eq_nulls_match", type_rule=op_typing.PREDICATE + name="eq_nulls_match", type_signature=op_typing.COMPARISON ) -ne_op = create_binary_op(name="ne", type_rule=op_typing.PREDICATE) -lt_op = create_binary_op(name="lt", type_rule=op_typing.PREDICATE) -gt_op = create_binary_op(name="gt", type_rule=op_typing.PREDICATE) -le_op = create_binary_op(name="le", type_rule=op_typing.PREDICATE) -ge_op = create_binary_op(name="ge", type_rule=op_typing.PREDICATE) +ne_op = create_binary_op(name="ne", type_signature=op_typing.COMPARISON) +lt_op = create_binary_op(name="lt", type_signature=op_typing.COMPARISON) +gt_op = create_binary_op(name="gt", type_signature=op_typing.COMPARISON) +le_op = create_binary_op(name="le", type_signature=op_typing.COMPARISON) +ge_op = create_binary_op(name="ge", type_signature=op_typing.COMPARISON) + ## String Ops -strconcat_op = create_binary_op(name="strconcat", type_rule=op_typing.STRING) +@dataclasses.dataclass(frozen=True) +class StrConcatOp(BinaryOp): + name: typing.ClassVar[str] = "str_concat" + + # Note: this is actualyl a vararg op, but we don't model that yet + def output_type(self, *input_types): + if not all(map(dtypes.is_string_like, input_types)): + raise TypeError("string concat requires string-like arguments") + if len(set(input_types)) != 1: + raise TypeError("string concat requires like-typed arguments") + return input_types[0] + + +strconcat_op = StrConcatOp() # Ternary Ops @@ -515,15 +650,25 @@ class WhereOp(TernaryOp): name: typing.ClassVar[str] = "where" def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: - # Second input is boolean and doesn't affect output type + if input_types[1] != dtypes.BOOL_DTYPE: + raise TypeError("where condition must be a boolean") return dtypes.lcd_etype(input_types[0], input_types[2]) where_op = WhereOp() -clip_op = create_ternary_op(name="clip", type_rule=op_typing.Supertype()) +@dataclasses.dataclass(frozen=True) +class ClipOp(TernaryOp): + name: typing.ClassVar[str] = "clip" + + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: + return dtypes.lcd_etype( + input_types[0], dtypes.lcd_etype(input_types[1], input_types[2]) + ) + +clip_op = ClipOp() # Just parameterless unary ops for now # TODO: Parameter mappings diff --git a/bigframes/operations/type.py b/bigframes/operations/type.py index 30e0c1e745..a1dc8edffc 100644 --- a/bigframes/operations/type.py +++ b/bigframes/operations/type.py @@ -12,64 +12,195 @@ # See the License for the specific language governing permissions and # limitations under the License. +import abc import dataclasses -import functools +from typing import Callable import bigframes.dtypes from bigframes.dtypes import ExpressionType -# TODO: Apply input type constraints to help pre-empt invalid expression construction - @dataclasses.dataclass -class OpTypeRule: - def output_type(self, *input_types: ExpressionType) -> ExpressionType: - raise NotImplementedError("Abstract typing rule has no output type") +class TypeSignature(abc.ABC): + """ + Type Signature represent a mapping from input types to output type. + + Type signatures should throw a TypeError if the input types cannot be handled by the operation. 
+ """ + + @property + @abc.abstractmethod + def as_method(self): + """Convert the signature into an object method. Convenience function for constructing ops that use the signature.""" + ... + + +class UnaryTypeSignature(TypeSignature): + @abc.abstractmethod + def output_type(self, input_type: ExpressionType) -> ExpressionType: + ... + + @property + def as_method(self): + def meth(_, *input_types: ExpressionType) -> ExpressionType: + assert len(input_types) == 1 + return self.output_type(input_types[0]) + + return meth + + +class BinaryTypeSignature(TypeSignature): + @abc.abstractmethod + def output_type( + self, left_type: ExpressionType, right_type: ExpressionType + ) -> ExpressionType: + ... @property def as_method(self): def meth(_, *input_types: ExpressionType) -> ExpressionType: - return self.output_type(*input_types) + assert len(input_types) == 2 + return self.output_type(input_types[0], input_types[1]) return meth @dataclasses.dataclass -class InputType(OpTypeRule): - def output_type(self, *input_types: ExpressionType) -> ExpressionType: - assert len(input_types) == 1 - return input_types[0] +class TypePreserving(UnaryTypeSignature): + type_predicate: Callable[[ExpressionType], bool] + description: str + + def output_type(self, input_type: ExpressionType) -> ExpressionType: + if not self.type_predicate(input_type): + raise TypeError( + f"Type {input_type} is not supported. Type must be {self.description}" + ) + return input_type + + +@dataclasses.dataclass +class FixedOutputType(UnaryTypeSignature): + type_predicate: Callable[[ExpressionType], bool] + fixed_type: ExpressionType + description: str + + def output_type(self, input_type: ExpressionType) -> ExpressionType: + if (input_type is not None) and not self.type_predicate(input_type): + raise TypeError( + f"Type {input_type} is not supported. 
Type must be {self.description}" + ) + return self.fixed_type @dataclasses.dataclass -class RealNumeric(OpTypeRule): - def output_type(self, *input_types: ExpressionType) -> ExpressionType: - return functools.reduce( - lambda t1, t2: bigframes.dtypes.lcd_etype(t1, t2), - [*input_types, bigframes.dtypes.FLOAT_DTYPE], - ) +class UnaryRealNumeric(UnaryTypeSignature): + """Type signature for real-valued functions like exp, log, sin, tan.""" + + def output_type(self, type: ExpressionType) -> ExpressionType: + if type is None: + return bigframes.dtypes.FLOAT_DTYPE + if not bigframes.dtypes.is_numeric(type): + raise TypeError(f"Type {type} is not numeric") + if type in (bigframes.dtypes.INT_DTYPE, bigframes.dtypes.BOOL_DTYPE): + # Real numeric ops produce floats on int input + return bigframes.dtypes.FLOAT_DTYPE + return type @dataclasses.dataclass -class Supertype(OpTypeRule): - def output_type(self, *input_types: ExpressionType) -> ExpressionType: - return functools.reduce( - lambda t1, t2: bigframes.dtypes.lcd_etype(t1, t2), input_types - ) +class BinaryNumeric(BinaryTypeSignature): + """Type signature for numeric functions like multiply, modulo that can map ints to ints.""" + + def output_type( + self, left_type: ExpressionType, right_type: ExpressionType + ) -> ExpressionType: + if (left_type is not None) and not bigframes.dtypes.is_numeric(left_type): + raise TypeError(f"Type {left_type} is not numeric") + if (right_type is not None) and not bigframes.dtypes.is_numeric(right_type): + raise TypeError(f"Type {right_type} is not numeric") + return bigframes.dtypes.lcd_etype(left_type, right_type) @dataclasses.dataclass -class Fixed(OpTypeRule): - out_type: ExpressionType +class BinaryRealNumeric(BinaryTypeSignature): + """Type signature for real-valued functions like divide, arctan2, pow.""" + + def output_type( + self, left_type: ExpressionType, right_type: ExpressionType + ) -> ExpressionType: + if (left_type is not None) and not bigframes.dtypes.is_numeric(left_type): + raise TypeError(f"Type {left_type} is not numeric") + if (right_type is not None) and not bigframes.dtypes.is_numeric(right_type): + raise TypeError(f"Type {right_type} is not numeric") + lcd_type = bigframes.dtypes.lcd_etype(left_type, right_type) + if lcd_type == bigframes.dtypes.INT_DTYPE: + # Real numeric ops produce floats on int input + return bigframes.dtypes.FLOAT_DTYPE + return lcd_type - def output_type(self, *input_types: ExpressionType) -> ExpressionType: - return self.out_type +@dataclasses.dataclass +class Supertype(BinaryTypeSignature): + """Type signature for functions that return a the supertype of its inputs. 
Currently BigFrames just supports upcasting numerics.""" + def output_type( + self, left_type: ExpressionType, right_type: ExpressionType + ) -> ExpressionType: + return bigframes.dtypes.lcd_etype(left_type, right_type) + + +@dataclasses.dataclass +class Comparison(BinaryTypeSignature): + """Type signature for comparison operators.""" + + def output_type( + self, left_type: ExpressionType, right_type: ExpressionType + ) -> ExpressionType: + common_type = bigframes.dtypes.lcd_etype(left_type, right_type) + if not bigframes.dtypes.is_comparable(common_type): + raise TypeError(f"Types {left_type} and {right_type} are not comparable") + return bigframes.dtypes.BOOL_DTYPE + + +@dataclasses.dataclass +class Logical(BinaryTypeSignature): + """Type signature for logical operators like AND, OR and NOT.""" + + def output_type( + self, left_type: ExpressionType, right_type: ExpressionType + ) -> ExpressionType: + if left_type is None or right_type is None: + return bigframes.dtypes.BOOL_DTYPE + if not bigframes.dtypes.is_binary_like(left_type): + raise TypeError(f"Type {left_type} is not binary") + if not bigframes.dtypes.is_binary_like(right_type): + raise TypeError(f"Type {right_type} is not binary") + if left_type != right_type: + raise TypeError( + f"Bitwise operands {left_type} and {right_type} do not match" + ) + return left_type + + +# Common type signatures +UNARY_NUMERIC = TypePreserving(bigframes.dtypes.is_numeric, description="numeric") +UNARY_REAL_NUMERIC = UnaryRealNumeric() +BINARY_NUMERIC = BinaryNumeric() +BINARY_REAL_NUMERIC = BinaryRealNumeric() +COMPARISON = Comparison() +COMMON_SUPERTYPE = Supertype() +LOGICAL = Logical() +STRING_TRANSFORM = TypePreserving( + bigframes.dtypes.is_string_like, description="string-like" +) +STRING_PREDICATE = FixedOutputType( + bigframes.dtypes.is_string_like, + bigframes.dtypes.BOOL_DTYPE, + description="string-like", +) +DATELIKE_ACCESSOR = FixedOutputType( + bigframes.dtypes.is_date_like, bigframes.dtypes.INT_DTYPE, description="date-like" ) +TIMELIKE_ACCESSOR = FixedOutputType( + bigframes.dtypes.is_time_like, bigframes.dtypes.INT_DTYPE, description="time-like" +) diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index e58a666709..355849538e 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -2003,7 +2003,7 @@ def test_mod(scalars_dfs, other_scalar): def test_scalar_binop_str_exception(scalars_dfs): scalars_df, _ = scalars_dfs columns = ["string_col"] - with pytest.raises(Exception): + with pytest.raises(TypeError, match="Cannot add dtypes"): (scalars_df[columns] + 1).to_pandas() From a0fb8bbfddd07f1e0ef03eeb4be653d1e9f06772 Mon Sep 17 00:00:00 2001 From: Ashley Xu <139821907+ashleyxuu@users.noreply.github.com> Date: Thu, 28 Mar 2024 15:49:22 -0700 Subject: [PATCH 17/53] docs: Migrate the overview page to Bigframes official landing page (#536) --- README.rst | 386 +---------------------------------------------------- 1 file changed, 5 insertions(+), 381 deletions(-) diff --git a/README.rst b/README.rst index 73709641de..26bbbffa88 100644 --- a/README.rst +++ b/README.rst @@ -10,395 +10,19 @@ powered by the BigQuery engine. BigQuery DataFrames is an open-source package.
You can run ``pip install --upgrade bigframes`` to install the latest version. + Documentation ------------- * `BigQuery DataFrames source code (GitHub) `_ * `BigQuery DataFrames sample notebooks `_ * `BigQuery DataFrames API reference `_ -* `BigQuery documentation `_ - - -Quickstart ----------- - -Prerequisites -^^^^^^^^^^^^^ - -* Install the ``bigframes`` package. -* Create a Google Cloud project and billing account. -* In an interactive environment (like Notebook, Python REPL or command line), - ``bigframes`` will do the authentication on-the-fly if needed. Otherwise, see - `how to set up application default credentials `_ - for various environments. For example, to pre-authenticate on your laptop you can - `install and initialize the gcloud CLI `_, - and then generate the application default credentials by doing - `gcloud auth application-default login `_. -* The user must have - `BigQuery Job User `_ and - `BigQuery Read Session User `_ - roles for the minimum usage. Additional IAM requirements apply for using remote - functions and ML. - -Code sample -^^^^^^^^^^^ - -Import ``bigframes.pandas`` for a pandas-like interface. The ``read_gbq`` -method accepts either a fully-qualified table ID or a SQL query. - -.. code-block:: python - - import bigframes.pandas as bpd - - bpd.options.bigquery.project = your_gcp_project_id - df1 = bpd.read_gbq("project.dataset.table") - df2 = bpd.read_gbq("SELECT a, b, c, FROM `project.dataset.table`") - -* `More code samples `_ - - -Locations ---------- -BigQuery DataFrames uses a -`BigQuery session `_ -internally to manage metadata on the service side. This session is tied to a -`location `_ . -BigQuery DataFrames uses the US multi-region as the default location, but you -can use ``session_options.location`` to set a different location. Every query -in a session is executed in the location where the session was created. -BigQuery DataFrames -auto-populates ``bf.options.bigquery.location`` if the user starts with -``read_gbq/read_gbq_table/read_gbq_query()`` and specifies a table, either -directly or in a SQL statement. - -If you want to reset the location of the created DataFrame or Series objects, -you can close the session by executing ``bigframes.pandas.close_session()``. -After that, you can reuse ``bigframes.pandas.options.bigquery.location`` to -specify another location. - - -``read_gbq()`` requires you to specify a location if the dataset you are -querying is not in the US multi-region. If you try to read a table from another -location, you get a NotFound exception. - -Project -------- -If ``bf.options.bigquery.project`` is not set, the ``$GOOGLE_CLOUD_PROJECT`` -environment variable is used, which is set in the notebook runtime serving the -BigQuery Studio/Vertex Notebooks. - -ML Capabilities ---------------- - -The ML capabilities in BigQuery DataFrames let you preprocess data, and -then train models on that data. You can also chain these actions together to -create data pipelines. - -Preprocess data -^^^^^^^^^^^^^^^^^^^^^^^^ - -Create transformers to prepare data for use in estimators (models) by -using the -`bigframes.ml.preprocessing module `_ -and the `bigframes.ml.compose module `_. -BigQuery DataFrames offers the following transformations: - -* Use the `KBinsDiscretizer class `_ - in the ``bigframes.ml.preprocessing`` module to bin continuous data into intervals. -* Use the `LabelEncoder class `_ - in the ``bigframes.ml.preprocessing`` module to normalize the target labels as integer values. 
-* Use the `MaxAbsScaler class `_ - in the ``bigframes.ml.preprocessing`` module to scale each feature to the range ``[-1, 1]`` by its maximum absolute value. -* Use the `MinMaxScaler class `_ - in the ``bigframes.ml.preprocessing`` module to standardize features by scaling each feature to the range ``[0, 1]``. -* Use the `StandardScaler class `_ - in the ``bigframes.ml.preprocessing`` module to standardize features by removing the mean and scaling to unit variance. -* Use the `OneHotEncoder class `_ - in the ``bigframes.ml.preprocessing`` module to transform categorical values into numeric format. -* Use the `ColumnTransformer class `_ - in the ``bigframes.ml.compose`` module to apply transformers to DataFrames columns. - - -Train models -^^^^^^^^^^^^ - -Create estimators to train models in BigQuery DataFrames. - -**Clustering models** - -Create estimators for clustering models by using the -`bigframes.ml.cluster module `_. - -* Use the `KMeans class `_ - to create K-means clustering models. Use these models for - data segmentation. For example, identifying customer segments. K-means is an - unsupervised learning technique, so model training doesn't require labels or split - data for training or evaluation. - -**Decomposition models** - -Create estimators for decomposition models by using the `bigframes.ml.decomposition module `_. - -* Use the `PCA class `_ - to create principal component analysis (PCA) models. Use these - models for computing principal components and using them to perform a change of - basis on the data. This provides dimensionality reduction by projecting each data - point onto only the first few principal components to obtain lower-dimensional - data while preserving as much of the data's variation as possible. - - -**Ensemble models** - -Create estimators for ensemble models by using the `bigframes.ml.ensemble module `_. - -* Use the `RandomForestClassifier class `_ - to create random forest classifier models. Use these models for constructing multiple - learning method decision trees for classification. -* Use the `RandomForestRegressor class `_ - to create random forest regression models. Use - these models for constructing multiple learning method decision trees for regression. -* Use the `XGBClassifier class `_ - to create gradient boosted tree classifier models. Use these models for additively - constructing multiple learning method decision trees for classification. -* Use the `XGBRegressor class `_ - to create gradient boosted tree regression models. Use these models for additively - constructing multiple learning method decision trees for regression. - - -**Forecasting models** - -Create estimators for forecasting models by using the `bigframes.ml.forecasting module `_. - -* Use the `ARIMAPlus class `_ - to create time series forecasting models. - -**Imported models** - -Create estimators for imported models by using the `bigframes.ml.imported module `_. - -* Use the `ONNXModel class `_ - to import Open Neural Network Exchange (ONNX) models. -* Use the `TensorFlowModel class `_ - to import TensorFlow models. -* Use the `XGBoostModel class `_ - to import XGBoostModel models. - -**Linear models** - -Create estimators for linear models by using the `bigframes.ml.linear_model module `_. - -* Use the `LinearRegression class `_ - to create linear regression models. Use these models for forecasting. For example, - forecasting the sales of an item on a given day. -* Use the `LogisticRegression class `_ - to create logistic regression models. 
Use these models for the classification of two - or more possible values such as whether an input is ``low-value``, ``medium-value``, - or ``high-value``. - -**Large language models** - -Create estimators for LLMs by using the `bigframes.ml.llm module `_. - -* Use the `GeminiTextGenerator class `_ to create Gemini text generator models. Use these models - for text generation tasks. -* Use the `PaLM2TextGenerator class `_ to create PaLM2 text generator models. Use these models - for text generation tasks. -* Use the `PaLM2TextEmbeddingGenerator class `_ to create PaLM2 text embedding generator models. - Use these models for text embedding generation tasks. - - -Create pipelines -^^^^^^^^^^^^^^^^ - -Create ML pipelines by using -`bigframes.ml.pipeline module `_. -Pipelines let you assemble several ML steps to be cross-validated together while setting -different parameters. This simplifies your code, and allows you to deploy data preprocessing -steps and an estimator together. - -* Use the `Pipeline class `_ - to create a pipeline of transforms with a final estimator. - - -ML remote models ----------------- - -**Requirements** - -To use BigQuery DataFrames ML remote models (`bigframes.ml.remote` or `bigframes.ml.llm`), -you must enable the following APIs: - -* The BigQuery API (bigquery.googleapis.com) -* The BigQuery Connection API (bigqueryconnection.googleapis.com) -* The Vertex AI API (aiplatform.googleapis.com) - -and you must be granted the following IAM roles in the project: - -* BigQuery Data Editor (roles/bigquery.dataEditor) -* BigQuery Connection Admin (roles/bigquery.connectionAdmin) -* Service Account User (roles/iam.serviceAccountUser) -* Vertex AI User (roles/aiplatform.user) -* Project IAM Admin (roles/resourcemanager.projectIamAdmin) if using default - BigQuery connection, or Browser (roles/browser) if using a pre-configured connection. - This requirement can be avoided by setting - ``bigframes.pandas.options.bigquery.skip_bq_connection_check`` option to ``True``, - in which case the connection (default or pre-configured) would be - used as-is without any existence or permission check. - - -ML locations ------------- - -``bigframes.ml`` supports the same locations as BigQuery ML. BigQuery ML model -prediction and other ML functions are supported in all BigQuery regions. Support -for model training varies by region. For more information, see -`BigQuery ML locations `_. - - -Data types ----------- - -BigQuery DataFrames supports the following numpy and pandas dtypes: - -* ``numpy.dtype("O")`` -* ``pandas.BooleanDtype()`` -* ``pandas.Float64Dtype()`` -* ``pandas.Int64Dtype()`` -* ``pandas.StringDtype(storage="pyarrow")`` -* ``pandas.ArrowDtype(pa.date32())`` -* ``pandas.ArrowDtype(pa.time64("us"))`` -* ``pandas.ArrowDtype(pa.timestamp("us"))`` -* ``pandas.ArrowDtype(pa.timestamp("us", tz="UTC"))`` - -BigQuery DataFrames doesn’t support the following BigQuery data types: - -* ``ARRAY`` -* ``NUMERIC`` -* ``BIGNUMERIC`` -* ``INTERVAL`` -* ``STRUCT`` -* ``JSON`` - -All other BigQuery data types display as the object type. - - -Remote functions ----------------- - -BigQuery DataFrames gives you the ability to turn your custom scalar functions -into `BigQuery remote functions -`_ . Creating a remote -function in BigQuery DataFrames (See `code samples -`_) -creates: - -1. A `Cloud Functions (2nd gen) function `_. -2. A `BigQuery connection `_. 
- If the BigQuery connection is created, the BigQuery service will - create a - `Google Cloud-managed IAM service account `_ - and attach it to the connection. You can use a pre-configured BigQuery - connection if you prefer, in which case the connection creation is skipped. -3. A BigQuery remote function that talks to the cloud function (1) using the BigQuery - connection (2). - -BigQuery connections are created in the same location as the BigQuery -DataFrames session, using the name you provide in the custom function -definition. To view and manage connections, do the following: - -1. Go to `BigQuery in the Google Cloud Console `__. -2. Select the project in which you created the remote function. -3. In the Explorer pane, expand that project and then expand External connections. - -BigQuery remote functions are created in the dataset you specify, or -in a special type of `hidden dataset `__ -referred to as an anonymous dataset. To view and manage remote functions created -in a user provided dataset, do the following: - -1. Go to `BigQuery in the Google Cloud Console `__. -2. Select the project in which you created the remote function. -3. In the Explorer pane, expand that project, expand the dataset in which you - created the remote function, and then expand Routines. - -To view and manage Cloud Functions functions, use the -`Functions `_ -page and use the project picker to select the project in which you -created the function. For easy identification, the names of the functions -created by BigQuery DataFrames are prefixed by ``bigframes``. - -**Requirements** - -To use BigQuery DataFrames remote functions, you must enable the following APIs: - -* The BigQuery API (bigquery.googleapis.com) -* The BigQuery Connection API (bigqueryconnection.googleapis.com) -* The Cloud Functions API (cloudfunctions.googleapis.com) -* The Cloud Run API (run.googleapis.com) -* The Artifact Registry API (artifactregistry.googleapis.com) -* The Cloud Build API (cloudbuild.googleapis.com ) -* The Cloud Resource Manager API (cloudresourcemanager.googleapis.com) - -To use BigQuery DataFrames remote functions, you must be granted the -following IAM roles in the project: - -* BigQuery Data Editor (roles/bigquery.dataEditor) -* BigQuery Connection Admin (roles/bigquery.connectionAdmin) -* Cloud Functions Developer (roles/cloudfunctions.developer) -* Service Account User (roles/iam.serviceAccountUser) -* Storage Object Viewer (roles/storage.objectViewer) -* Project IAM Admin (roles/resourcemanager.projectIamAdmin) if using default - BigQuery connection, or Browser (roles/browser) if using a pre-configured connection. - This requirement can be avoided by setting - ``bigframes.pandas.options.bigquery.skip_bq_connection_check`` option to ``True``, - in which case the connection (default or pre-configured) would be - used as-is without any existence or permission check. - -**Limitations** - -* Remote functions take about 90 seconds to become available when you first create them. -* Trivial changes in the notebook, such as inserting a new cell or renaming a variable, - might cause the remote function to be re-created, even if these changes are unrelated - to the remote function code. -* BigQuery DataFrames does not differentiate any personal data you include in the remote - function code. The remote function code is serialized as an opaque box to deploy it as a - Cloud Functions function. 
-* The Cloud Functions (2nd gen) functions, BigQuery connections, and BigQuery remote - functions created by BigQuery DataFrames persist in Google Cloud. If you don’t want to - keep these resources, you must delete them separately using an appropriate Cloud Functions - or BigQuery interface. -* A project can have up to 1000 Cloud Functions (2nd gen) functions at a time. See Cloud - Functions quotas for all the limits. - - -Quotas and limits ------------------- - -`BigQuery quotas `_ -including hardware, software, and network components. - - -Session termination -------------------- - -Each BigQuery DataFrames DataFrame or Series object is tied to a BigQuery -DataFrames session, which is in turn based on a BigQuery session. BigQuery -sessions -`auto-terminate `_ -; when this happens, you can’t use previously -created DataFrame or Series objects and must re-create them using a new -BigQuery DataFrames session. You can do this by running -``bigframes.pandas.close_session()`` and then re-running the BigQuery -DataFrames expressions. - -Data processing location ------------------------- -BigQuery DataFrames is designed for scale, which it achieves by keeping data -and processing on the BigQuery service. However, you can bring data into the -memory of your client machine by calling ``.to_pandas()`` on a DataFrame or Series -object. If you choose to do this, the memory limitation of your client machine -applies. +Getting started with BigQuery DataFrames +---------------------------------------- +Try the `BigQuery DataFrames quickstart `_ +to get up and running in just a few minutes. License From 769868b9fc7dfff2e7b1ed5cec52a5dd3dfd6ff2 Mon Sep 17 00:00:00 2001 From: Garrett Wu <6505921+GarrettWu@users.noreply.github.com> Date: Thu, 28 Mar 2024 21:12:16 -0700 Subject: [PATCH 18/53] docs: Mark Gemini model as Pre-GA (#543) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes # 🦕 --- bigframes/ml/llm.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/bigframes/ml/llm.py b/bigframes/ml/llm.py index 6c4ae2ea43..e35f4d813d 100644 --- a/bigframes/ml/llm.py +++ b/bigframes/ml/llm.py @@ -421,6 +421,12 @@ def to_gbq( class GeminiTextGenerator(base.BaseEstimator): """Gemini text generator LLM model. + .. note:: + This product or feature is subject to the "Pre-GA Offerings Terms" in the General Service Terms section of the + Service Specific Terms(https://cloud.google.com/terms/service-terms#1). Pre-GA products and features are available "as is" + and might have limited support. For more information, see the launch stage descriptions + (https://cloud.google.com/products#product-launch-stages). + Args: session (bigframes.Session or None): BQ session to create the model. If None, use the global default session. 
From 54e49cff89bd329852a823cd5cf5c5b41b7f9e32 Mon Sep 17 00:00:00 2001 From: Henry Solberg Date: Fri, 29 Mar 2024 10:03:15 -0700 Subject: [PATCH 19/53] feat: Support max_columns in repr and make repr more efficient (#515) --- bigframes/core/blocks.py | 42 ++++++++++++++------- bigframes/core/indexes/index.py | 10 ++--- bigframes/dataframe.py | 66 +++++++++++++-------------------- bigframes/series.py | 9 ++--- bigframes/session/__init__.py | 8 +++- 5 files changed, 70 insertions(+), 65 deletions(-) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index afa13375b1..6827e1afe8 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -464,6 +464,23 @@ def to_pandas_batches(self): self._copy_index_to_pandas(df) yield df + def download_pandas_preview( + self, max_rows: int + ) -> Tuple[pd.DataFrame, bigquery.QueryJob]: + """Download one page of results and return the query job.""" + dtypes = dict(zip(self.index_columns, self.index.dtypes)) + dtypes.update(zip(self.value_columns, self.dtypes)) + results_iterator, query_job = self.session._execute( + self.expr, sorted=True, max_results=max_rows + ) + arrow_results_iterator = results_iterator.to_arrow_iterable() + arrow_table = next(arrow_results_iterator) + downloaded_df = bigframes.session._io.pandas.arrow_to_pandas( + arrow_table, dtypes + ) + self._copy_index_to_pandas(downloaded_df) + return downloaded_df, query_job + def _copy_index_to_pandas(self, df: pd.DataFrame): """Set the index on pandas DataFrame to match this block. @@ -1294,26 +1311,25 @@ def _forward_slice(self, start: int = 0, stop=None, step: int = 1): # queries. @functools.cache def retrieve_repr_request_results( - self, max_results: int - ) -> Tuple[pd.DataFrame, int, bigquery.QueryJob]: + self, max_results: int, max_columns: int + ) -> Tuple[pd.DataFrame, Tuple[int, int], bigquery.QueryJob]: """ Retrieves a pandas dataframe containing only max_results many rows for use with printing methods. - Returns a tuple of the dataframe and the overall number of rows of the query. + Returns a tuple of the dataframe preview for printing and the overall number + of rows and columns of the table, as well as the query job used. """ - # TODO(swast): Select a subset of columns if max_columns is less than the - # number of columns in the schema. - count = self.shape[0] - if count > max_results: - head_block = self.slice(0, max_results) - else: - head_block = self - computed_df, query_job = head_block.to_pandas() - formatted_df = computed_df.set_axis(self.column_labels, axis=1) + pandas_df, query_job = self.download_pandas_preview(max_results) + row_count = self.session._get_table_row_count(query_job.destination) + column_count = len(self.value_columns) + + formatted_df = pandas_df.set_axis(self.column_labels, axis=1) # we reset the axis and substitute the bf index name for the default formatted_df.index.name = self.index.name - return formatted_df, count, query_job + # limit column count + formatted_df = formatted_df.iloc[:, 0:max_columns] + return formatted_df, (row_count, column_count), query_job def promote_offsets(self, label: Label = None) -> typing.Tuple[Block, str]: result_id = guid.generate_guid() diff --git a/bigframes/core/indexes/index.py b/bigframes/core/indexes/index.py index c818b68711..48988aaffe 100644 --- a/bigframes/core/indexes/index.py +++ b/bigframes/core/indexes/index.py @@ -205,17 +205,17 @@ def query_job(self) -> Optional[bigquery.QueryJob]: return self._query_job def __repr__(self) -> str: - # TODO(swast): Add a timeout here? 
If the query is taking a long time, - # maybe we just print the job metadata that we have so far? - # TODO(swast): Avoid downloading the whole series by using job - # metadata, like we do with DataFrame. opts = bigframes.options.display max_results = opts.max_rows + max_columns = opts.max_columns if opts.repr_mode == "deferred": return formatter.repr_query_job(self.query_job) - pandas_df, _, query_job = self._block.retrieve_repr_request_results(max_results) + pandas_df, _, query_job = self._block.retrieve_repr_request_results( + max_results, max_columns + ) self._query_job = query_job + return repr(pandas_df.index) def copy(self, name: Optional[Hashable] = None): diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 599546284b..b75cc5faac 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -579,28 +579,16 @@ def __setattr__(self, key: str, value): object.__setattr__(self, key, value) def __repr__(self) -> str: - """Converts a DataFrame to a string. Calls to_pandas. + """Converts a DataFrame to a string using pandas dataframe __repr__. - Only represents the first `bigframes.options.display.max_rows`. + Only represents the first `bigframes.options.display.max_rows` + and `bigframes.options.display.max_columns`. """ - opts = bigframes.options.display - max_results = opts.max_rows - if opts.repr_mode == "deferred": + if bigframes.options.display.repr_mode == "deferred": return formatter.repr_query_job(self.query_job) - self._cached() - # TODO(swast): pass max_columns and get the true column count back. Maybe - # get 1 more column than we have requested so that pandas can add the - # ... for us? - pandas_df, row_count, query_job = self._block.retrieve_repr_request_results( - max_results - ) - - self._set_internal_query_job(query_job) - - column_count = len(pandas_df.columns) - - with display_options.pandas_repr(opts): + pandas_df, shape = self._perform_repr_request() + with display_options.pandas_repr(bigframes.options.display): repr_string = repr(pandas_df) # Modify the end of the string to reflect count. @@ -608,42 +596,40 @@ def __repr__(self) -> str: pattern = re.compile("\\[[0-9]+ rows x [0-9]+ columns\\]") if pattern.match(lines[-1]): lines = lines[:-2] - - if row_count > len(lines) - 1: + if shape[0] > len(lines) - 1: lines.append("...") - lines.append("") - lines.append(f"[{row_count} rows x {column_count} columns]") + lines.append(f"[{shape[0]} rows x {shape[1]} columns]") return "\n".join(lines) + def _perform_repr_request(self) -> Tuple[pandas.DataFrame, Tuple[int, int]]: + max_results = bigframes.options.display.max_rows + max_columns = bigframes.options.display.max_columns + self._cached() + pandas_df, shape, query_job = self._block.retrieve_repr_request_results( + max_results, max_columns + ) + self._set_internal_query_job(query_job) + return pandas_df, shape + def _repr_html_(self) -> str: """ Returns an html string primarily for use by notebooks for displaying - a representation of the DataFrame. Displays 20 rows by default since - many notebooks are not configured for large tables. + a representation of the DataFrame. Displays at most the number of rows + and columns given by `bigframes.options.display.max_rows` and + `bigframes.options.display.max_columns`. """ - opts = bigframes.options.display - max_results = bigframes.options.display.max_rows - if opts.repr_mode == "deferred": - return formatter.repr_query_job_html(self.query_job) - self._cached() - # TODO(swast): pass max_columns and get the true column count back. 
Maybe - # get 1 more column than we have requested so that pandas can add the - # ... for us? - pandas_df, row_count, query_job = self._block.retrieve_repr_request_results( - max_results - ) - - self._set_internal_query_job(query_job) + if bigframes.options.display.repr_mode == "deferred": + return formatter.repr_query_job_html(self.query_job) - column_count = len(pandas_df.columns) + pandas_df, shape = self._perform_repr_request() - with display_options.pandas_repr(opts): + with display_options.pandas_repr(bigframes.options.display): # _repr_html_ stub is missing so mypy thinks it's a Series. Ignore mypy. html_string = pandas_df._repr_html_() # type:ignore - html_string += f"[{row_count} rows x {column_count} columns in total]" + html_string += f"[{shape[0]} rows x {shape[1]} columns in total]" return html_string def __setitem__(self, key: str, value: SingleItemValue): diff --git a/bigframes/series.py b/bigframes/series.py index e7b358c2fe..f1ac89f514 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -272,17 +272,16 @@ def reset_index( return bigframes.dataframe.DataFrame(block) def __repr__(self) -> str: - # TODO(swast): Add a timeout here? If the query is taking a long time, - # maybe we just print the job metadata that we have so far? - # TODO(swast): Avoid downloading the whole series by using job - # metadata, like we do with DataFrame. opts = bigframes.options.display max_results = opts.max_rows + max_columns = opts.max_columns if opts.repr_mode == "deferred": return formatter.repr_query_job(self.query_job) self._cached() - pandas_df, _, query_job = self._block.retrieve_repr_request_results(max_results) + pandas_df, _, query_job = self._block.retrieve_repr_request_results( + max_results, max_columns + ) self._set_internal_query_job(query_job) return repr(pandas_df.iloc[:, 0]) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 479b3a7bac..6a2c87bb05 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -1831,6 +1831,7 @@ def _execute( sorted: bool = True, dry_run=False, col_id_overrides: Mapping[str, str] = {}, + max_results: Optional[int] = None, ) -> tuple[bigquery.table.RowIterator, bigquery.QueryJob]: sql = self._to_sql( array_value, sorted=sorted, col_id_overrides=col_id_overrides @@ -1840,8 +1841,7 @@ def _execute( else: job_config.dry_run = dry_run return self._start_query( - sql=sql, - job_config=job_config, + sql=sql, job_config=job_config, max_results=max_results ) def _peek( @@ -1886,6 +1886,10 @@ def _get_table_size(self, destination_table): table = self.bqclient.get_table(destination_table) return table.num_bytes + def _get_table_row_count(self, destination_table) -> int: + table = self.bqclient.get_table(destination_table) + return table.num_rows + def _rows_to_dataframe( self, row_iterator: bigquery.table.RowIterator, dtypes: Dict ) -> pandas.DataFrame: From 1156c1e3ce8c1e62898dbe68ccd6c5ab3cd4068f Mon Sep 17 00:00:00 2001 From: Ashley Xu <139821907+ashleyxuu@users.noreply.github.com> Date: Fri, 29 Mar 2024 10:23:23 -0700 Subject: [PATCH 20/53] feat: support `ML.GENERATE_EMBEDDING` in `PaLM2TextEmbeddingGenerator` (#539) * feat: support ML.GENERATE_EMBEDDING in PaLM2TextEmbeddingGenerator --- bigframes/ml/core.py | 4 ++-- bigframes/ml/llm.py | 4 ++-- bigframes/ml/sql.py | 6 +++--- tests/system/small/ml/test_llm.py | 12 ++++++------ tests/unit/ml/test_sql.py | 6 +++--- 5 files changed, 16 insertions(+), 16 deletions(-) diff --git a/bigframes/ml/core.py b/bigframes/ml/core.py index 
03d9b806b9..04aaeec1bc 100644 --- a/bigframes/ml/core.py +++ b/bigframes/ml/core.py @@ -152,14 +152,14 @@ def generate_text( ), ) - def generate_text_embedding( + def generate_embedding( self, input_data: bpd.DataFrame, options: Mapping[str, int | float], ) -> bpd.DataFrame: return self._apply_sql( input_data, - lambda source_df: self._model_manipulation_sql_generator.ml_generate_text_embedding( + lambda source_df: self._model_manipulation_sql_generator.ml_generate_embedding( source_df=source_df, struct_options=options, ), diff --git a/bigframes/ml/llm.py b/bigframes/ml/llm.py index e35f4d813d..031656f1d8 100644 --- a/bigframes/ml/llm.py +++ b/bigframes/ml/llm.py @@ -44,7 +44,7 @@ _GEMINI_PRO_ENDPOINT = "gemini-pro" _ML_GENERATE_TEXT_STATUS = "ml_generate_text_status" -_ML_EMBED_TEXT_STATUS = "ml_embed_text_status" +_ML_EMBED_TEXT_STATUS = "ml_generate_embedding_status" @log_adapter.class_logger @@ -389,7 +389,7 @@ def predict(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: "flatten_json_output": True, } - df = self._bqml_model.generate_text_embedding(X, options) + df = self._bqml_model.generate_embedding(X, options) if (df[_ML_EMBED_TEXT_STATUS] != "").any(): warnings.warn( diff --git a/bigframes/ml/sql.py b/bigframes/ml/sql.py index 807fadc06a..fab358cce3 100644 --- a/bigframes/ml/sql.py +++ b/bigframes/ml/sql.py @@ -270,12 +270,12 @@ def ml_generate_text( return f"""SELECT * FROM ML.GENERATE_TEXT(MODEL `{self._model_name}`, ({self._source_sql(source_df)}), {struct_options_sql})""" - def ml_generate_text_embedding( + def ml_generate_embedding( self, source_df: bpd.DataFrame, struct_options: Mapping[str, Union[int, float]] ) -> str: - """Encode ML.GENERATE_TEXT_EMBEDDING for BQML""" + """Encode ML.GENERATE_EMBEDDING for BQML""" struct_options_sql = self.struct_options(**struct_options) - return f"""SELECT * FROM ML.GENERATE_TEXT_EMBEDDING(MODEL `{self._model_name}`, + return f"""SELECT * FROM ML.GENERATE_EMBEDDING(MODEL `{self._model_name}`, ({self._source_sql(source_df)}), {struct_options_sql})""" def ml_detect_anomalies( diff --git a/tests/system/small/ml/test_llm.py b/tests/system/small/ml/test_llm.py index 4d2ddfe513..2e135bef7b 100644 --- a/tests/system/small/ml/test_llm.py +++ b/tests/system/small/ml/test_llm.py @@ -261,8 +261,8 @@ def test_embedding_generator_predict_success( ): df = palm2_embedding_generator_model.predict(llm_text_df).to_pandas() assert df.shape == (3, 4) - assert "text_embedding" in df.columns - series = df["text_embedding"] + assert "ml_generate_embedding_result" in df.columns + series = df["ml_generate_embedding_result"] value = series[0] assert len(value) == 768 @@ -273,8 +273,8 @@ def test_embedding_generator_multilingual_predict_success( ): df = palm2_embedding_generator_multilingual_model.predict(llm_text_df).to_pandas() assert df.shape == (3, 4) - assert "text_embedding" in df.columns - series = df["text_embedding"] + assert "ml_generate_embedding_result" in df.columns + series = df["ml_generate_embedding_result"] value = series[0] assert len(value) == 768 @@ -285,8 +285,8 @@ def test_embedding_generator_predict_series_success( ): df = palm2_embedding_generator_model.predict(llm_text_df["prompt"]).to_pandas() assert df.shape == (3, 4) - assert "text_embedding" in df.columns - series = df["text_embedding"] + assert "ml_generate_embedding_result" in df.columns + series = df["ml_generate_embedding_result"] value = series[0] assert len(value) == 768 diff --git a/tests/unit/ml/test_sql.py b/tests/unit/ml/test_sql.py index 913bab0379..5b1ff37775 
100644 --- a/tests/unit/ml/test_sql.py +++ b/tests/unit/ml/test_sql.py @@ -373,17 +373,17 @@ def test_ml_generate_text_correct( ) -def test_ml_generate_text_embedding_correct( +def test_ml_generate_embedding_correct( model_manipulation_sql_generator: ml_sql.ModelManipulationSqlGenerator, mock_df: bpd.DataFrame, ): - sql = model_manipulation_sql_generator.ml_generate_text_embedding( + sql = model_manipulation_sql_generator.ml_generate_embedding( source_df=mock_df, struct_options={"option_key1": 1, "option_key2": 2.2}, ) assert ( sql - == """SELECT * FROM ML.GENERATE_TEXT_EMBEDDING(MODEL `my_project_id.my_dataset_id.my_model_id`, + == """SELECT * FROM ML.GENERATE_EMBEDDING(MODEL `my_project_id.my_dataset_id.my_model_id`, (input_X_sql), STRUCT( 1 AS option_key1, 2.2 AS option_key2))""" From f6c40cdc91968364c0a072092ebfecf6e0f5cf34 Mon Sep 17 00:00:00 2001 From: Ashley Xu <139821907+ashleyxuu@users.noreply.github.com> Date: Fri, 29 Mar 2024 11:36:27 -0700 Subject: [PATCH 21/53] test: add the code snippets for gemini model (#546) --- samples/snippets/gemini_model_test.py | 44 +++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 samples/snippets/gemini_model_test.py diff --git a/samples/snippets/gemini_model_test.py b/samples/snippets/gemini_model_test.py new file mode 100644 index 0000000000..89212875ae --- /dev/null +++ b/samples/snippets/gemini_model_test.py @@ -0,0 +1,44 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +def test_gemini_text_generator_model(): + # Determine project id, in this case prefer the one set in the environment + # variable GOOGLE_CLOUD_PROJECT (if any) + import os + + PROJECT_ID = os.getenv("GOOGLE_CLOUD_PROJECT", "bigframes-dev") + REGION = "us" + CONN_NAME = "bigframes-default-connection" + + # [START bigquery_dataframes_gemini_model] + from bigframes.ml.llm import GeminiTextGenerator + import bigframes.pandas as bpd + + # Create the Gemini LLM model + session = bpd.get_global_session() + connection = f"{PROJECT_ID}.{REGION}.{CONN_NAME}" + model = GeminiTextGenerator(session=session, connection_name=connection) + + df_api = bpd.read_csv("gs://cloud-samples-data/vertex-ai/bigframe/df.csv") + + # Prepare the prompts and send them to the LLM model for prediction + df_prompt_prefix = "Generate Pandas sample code for DataFrame." + df_prompt = df_prompt_prefix + df_api["API"] + + # Predict using the model + df_pred = model.predict(df_prompt.to_frame(), max_output_tokens=1024) + # [END bigquery_dataframes_gemini_model] + assert df_pred["ml_generate_text_llm_result"] is not None + assert df_pred["ml_generate_text_llm_result"].iloc[0] is not None From 0a4153cc71a44c09b8d691897f1e5afa58c69f25 Mon Sep 17 00:00:00 2001 From: Huan Chen <142538604+Genesis929@users.noreply.github.com> Date: Fri, 29 Mar 2024 14:07:06 -0700 Subject: [PATCH 22/53] fix: assign NaN scalar to column error. (#513) * fix: assign NaN scalar to column error. * Update test. * remove import. 
--- bigframes/core/__init__.py | 4 ++++ tests/system/small/test_dataframe.py | 17 ++++++++++++----- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/bigframes/core/__init__.py b/bigframes/core/__init__.py index 6fd6fc23c2..ce9c22132b 100644 --- a/bigframes/core/__init__.py +++ b/bigframes/core/__init__.py @@ -229,6 +229,10 @@ def assign_constant( value: typing.Any, dtype: typing.Optional[bigframes.dtypes.Dtype], ) -> ArrayValue: + if pandas.isna(value): + # Need to assign a data type when value is NaN. + dtype = dtype or bigframes.dtypes.DEFAULT_DTYPE + if destination_id in self.column_ids: # Mutate case exprs = [ ( diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 355849538e..645914285e 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -605,17 +605,24 @@ def test_assign_new_column_w_loc(scalars_dfs): pd.testing.assert_frame_equal(bf_result, pd_result) -def test_assign_new_column_w_setitem(scalars_dfs): +@pytest.mark.parametrize( + ("scalar",), + [ + (2.1,), + (None,), + ], +) +def test_assign_new_column_w_setitem(scalars_dfs, scalar): scalars_df, scalars_pandas_df = scalars_dfs bf_df = scalars_df.copy() pd_df = scalars_pandas_df.copy() - bf_df["new_col"] = 2 - pd_df["new_col"] = 2 + bf_df["new_col"] = scalar + pd_df["new_col"] = scalar bf_result = bf_df.to_pandas() pd_result = pd_df - # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes. - pd_result["new_col"] = pd_result["new_col"].astype("Int64") + # Convert default pandas dtypes `float64` to match BigQuery DataFrames dtypes. + pd_result["new_col"] = pd_result["new_col"].astype("Float64") pd.testing.assert_frame_equal(bf_result, pd_result) From f79827757bbf022c718a4d239c622a7a7ec2feee Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Fri, 29 Mar 2024 21:08:20 +0000 Subject: [PATCH 23/53] chore: enhance documentation coverage script (#545) to include just the docstring coverage by default, provide `-c`/`--code-samples` option for the code sample coverage. --- ...erage.py => get_documentation_coverage.py} | 41 ++++++++++++++----- 1 file changed, 31 insertions(+), 10 deletions(-) rename scripts/{get_code_sample_coverage.py => get_documentation_coverage.py} (74%) diff --git a/scripts/get_code_sample_coverage.py b/scripts/get_documentation_coverage.py similarity index 74% rename from scripts/get_code_sample_coverage.py rename to scripts/get_documentation_coverage.py index d81023394f..0b9417b2d3 100755 --- a/scripts/get_code_sample_coverage.py +++ b/scripts/get_documentation_coverage.py @@ -16,7 +16,7 @@ import importlib import inspect import sys -from typing import Dict, List +import typing import bigframes import bigframes.pandas as bpd @@ -50,6 +50,11 @@ "remote", ] +COVERAGE_GENERATORS = { + "documentation": lambda docstr: docstr, + "code samples": lambda docstr: docstr and "**Examples:**" in docstr, +} + for module_name in ML_MODULE_NAMES: module = importlib.import_module(f"bigframes.ml.{module_name}") classes_ = [ @@ -58,9 +63,15 @@ CLASSES.extend(classes_) -def get_code_samples_summary() -> Dict[str, Dict[str, List[str]]]: +def get_coverage_summary( + func: typing.Callable, +) -> typing.Dict[str, typing.Dict[str, typing.List[str]]]: """Get Summary of the code samples coverage in BigFrames APIs. + Args: + func (callable): + Function to accept documentation and return whether it satisfies + coverage. 
Returns: Summary: A dictionary of the format { @@ -73,7 +84,7 @@ def get_code_samples_summary() -> Dict[str, Dict[str, List[str]]]: } } """ - summary: Dict[str, Dict[str, List[str]]] = dict() + summary: typing.Dict[str, typing.Dict[str, typing.List[str]]] = dict() for class_ in CLASSES: class_key = f"{class_.__module__}.{class_.__name__}" @@ -104,8 +115,8 @@ def predicate(impl): impl = getattr(class_, name) docstr = inspect.getdoc(impl) - code_samples_present = docstr and "**Examples:**" in docstr - key = PRESENT if code_samples_present else NOT_PRESENT + coverage_present = func(docstr) + key = PRESENT if coverage_present else NOT_PRESENT summary[class_key][key].append(name) return summary @@ -113,7 +124,16 @@ def predicate(impl): if __name__ == "__main__": parser = argparse.ArgumentParser( - description="Get a summary of code samples coverage in BigFrames APIs." + description="Get a summary of documentation coverage in BigFrames APIs." + ) + parser.add_argument( + "-c", + "--code-samples", + type=bool, + action=argparse.BooleanOptionalAction, + default=False, + help="Whether to calculate code samples coverage. By default the tool" + " calculates the documentation (docstring) coverage.", ) parser.add_argument( "-d", @@ -121,12 +141,13 @@ def predicate(impl): type=bool, action=argparse.BooleanOptionalAction, default=False, - help="Whether to print APIs with and without code samples.", + help="Whether to print APIs with and without the coverage.", ) args = parser.parse_args(sys.argv[1:]) - summary = get_code_samples_summary() + scenario = "code samples" if args.code_samples else "documentation" + summary = get_coverage_summary(COVERAGE_GENERATORS[scenario]) total_with_code_samples = 0 total = 0 @@ -140,8 +161,8 @@ def predicate(impl): coverage = 100 * apis_with_code_samples / apis_total print(f"{class_}: {coverage:.1f}% ({apis_with_code_samples}/{apis_total})") if args.details: - print(f"===> APIs WITH code samples: {class_summary[PRESENT]}") - print(f"===> APIs WITHOUT code samples: {class_summary[NOT_PRESENT]}") + print(f"===> APIs WITH {scenario}: {class_summary[PRESENT]}") + print(f"===> APIs WITHOUT {scenario}: {class_summary[NOT_PRESENT]}") coverage = 100 * total_with_code_samples / total print(f"Total: {coverage:.1f}% ({total_with_code_samples}/{total})") From 8eca99a03bc4bdaccf15a979b5382f3659f2aac5 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Fri, 29 Mar 2024 14:09:34 -0700 Subject: [PATCH 24/53] fix: Fix case where df.peek would fail to execute even with force=True (#511) * fix: Fix case where df.peek would fail to execute even with force=True * remove cache from peekable property * if force=True always peek after caching even if peeking inefficient --- bigframes/core/blocks.py | 7 +++- bigframes/core/nodes.py | 37 +------------------ .../core/{traversal.py => tree_properties.py} | 11 ++++++ bigframes/dataframe.py | 2 +- bigframes/session/__init__.py | 7 ++-- tests/system/small/test_dataframe.py | 11 ++++++ 6 files changed, 33 insertions(+), 42 deletions(-) rename bigframes/core/{traversal.py => tree_properties.py} (72%) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 6827e1afe8..aab8b1ad4d 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -41,6 +41,7 @@ import bigframes.core.guid as guid import bigframes.core.join_def as join_defs import bigframes.core.ordering as ordering +import bigframes.core.tree_properties as tree_properties import bigframes.core.utils import bigframes.core.utils as utils import bigframes.dtypes @@ -443,8 
+444,10 @@ def to_pandas( df.set_axis(self.column_labels, axis=1, copy=False) return df, query_job - def try_peek(self, n: int = 20) -> typing.Optional[pd.DataFrame]: - if self.expr.node.peekable: + def try_peek( + self, n: int = 20, force: bool = False + ) -> typing.Optional[pd.DataFrame]: + if force or tree_properties.peekable(self.expr.node): iterator, _ = self.session._peek(self.expr, n) df = self._to_dataframe(iterator) self._copy_index_to_pandas(df) diff --git a/bigframes/core/nodes.py b/bigframes/core/nodes.py index c1ceeebffe..8f646ac4bb 100644 --- a/bigframes/core/nodes.py +++ b/bigframes/core/nodes.py @@ -90,11 +90,6 @@ def session(self): def _node_hash(self): return hash(tuple(hash(getattr(self, field.name)) for field in fields(self))) - @property - def peekable(self) -> bool: - """Indicates whether the node can be sampled efficiently""" - return all(child.peekable for child in self.child_nodes) - @property def roots(self) -> typing.Set[BigFrameNode]: roots = itertools.chain.from_iterable( @@ -143,12 +138,6 @@ def child_nodes(self) -> typing.Sequence[BigFrameNode]: def __hash__(self): return self._node_hash - @property - def peekable(self) -> bool: - children_peekable = all(child.peekable for child in self.child_nodes) - single_root = len(self.roots) == 1 - return children_peekable and single_root - @functools.cached_property def schema(self) -> schemata.ArraySchema: def join_mapping_to_schema_item(mapping: JoinColumnMapping): @@ -204,10 +193,6 @@ class ReadLocalNode(BigFrameNode): def __hash__(self): return self._node_hash - @property - def peekable(self) -> bool: - return True - @property def roots(self) -> typing.Set[BigFrameNode]: return {self} @@ -233,10 +218,6 @@ def session(self): def __hash__(self): return self._node_hash - @property - def peekable(self) -> bool: - return True - @property def roots(self) -> typing.Set[BigFrameNode]: return {self} @@ -261,13 +242,9 @@ class PromoteOffsetsNode(UnaryNode): def __hash__(self): return self._node_hash - @property - def peekable(self) -> bool: - return False - @property def non_local(self) -> bool: - return False + return True @property def schema(self) -> schemata.ArraySchema: @@ -371,10 +348,6 @@ def row_preserving(self) -> bool: def __hash__(self): return self._node_hash - @property - def peekable(self) -> bool: - return False - @property def non_local(self) -> bool: return True @@ -407,10 +380,6 @@ class WindowOpNode(UnaryNode): def __hash__(self): return self._node_hash - @property - def peekable(self) -> bool: - return False - @property def non_local(self) -> bool: return True @@ -459,10 +428,6 @@ def row_preserving(self) -> bool: def non_local(self) -> bool: return True - @property - def peekable(self) -> bool: - return False - @functools.cached_property def schema(self) -> schemata.ArraySchema: def infer_dtype( diff --git a/bigframes/core/traversal.py b/bigframes/core/tree_properties.py similarity index 72% rename from bigframes/core/traversal.py rename to bigframes/core/tree_properties.py index b038ee6599..bc29f115f6 100644 --- a/bigframes/core/traversal.py +++ b/bigframes/core/tree_properties.py @@ -12,8 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. + import bigframes.core.nodes as nodes +# TODO: Convert these functions to iterative or enforce hard limit on tree depth. The below algorithms can cause stack to exceed limit. 
+ def is_trivially_executable(node: nodes.BigFrameNode) -> bool: if local_only(node): @@ -25,3 +28,11 @@ def is_trivially_executable(node: nodes.BigFrameNode) -> bool: def local_only(node: nodes.BigFrameNode) -> bool: return all(isinstance(node, nodes.ReadLocalNode) for node in node.roots) + + +def peekable(node: nodes.BigFrameNode) -> bool: + if local_only(node): + return True + children_peekable = all(peekable(child) for child in node.child_nodes) + self_peekable = not node.non_local + return children_peekable and self_peekable diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index b75cc5faac..1df78dd4cd 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -1111,7 +1111,7 @@ def peek(self, n: int = 5, *, force: bool = True) -> pandas.DataFrame: if maybe_result is None: if force: self._cached() - maybe_result = self._block.try_peek(n) + maybe_result = self._block.try_peek(n, force=True) assert maybe_result is not None else: raise ValueError( diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 6a2c87bb05..8294bc3aa1 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -82,7 +82,8 @@ import bigframes.core.guid as guid from bigframes.core.ordering import IntegerEncoding import bigframes.core.ordering as order -import bigframes.core.traversal as traversals +import bigframes.core.tree_properties as traversals +import bigframes.core.tree_properties as tree_properties import bigframes.core.utils as utils import bigframes.dtypes import bigframes.formatting_helpers as formatting_helpers @@ -1848,8 +1849,8 @@ def _peek( self, array_value: core.ArrayValue, n_rows: int ) -> tuple[bigquery.table.RowIterator, bigquery.QueryJob]: """A 'peek' efficiently accesses a small number of rows in the dataframe.""" - if not array_value.node.peekable: - raise NotImplementedError("cannot efficient peek this dataframe") + if not tree_properties.peekable(array_value.node): + warnings.warn("Peeking this value cannot be done efficiently.") sql = self._compile_unordered(array_value).peek_sql(n_rows) return self._start_query( sql=sql, diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 645914285e..cf907b02d6 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -494,6 +494,17 @@ def test_df_peek_force_default(scalars_dfs): assert len(peek_result) == 3 +def test_df_peek_reset_index(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + peek_result = ( + scalars_df[["int64_col", "int64_too"]].reset_index(drop=True).peek(n=3) + ) + pd.testing.assert_index_equal( + scalars_pandas_df[["int64_col", "int64_too"]].columns, peek_result.columns + ) + assert len(peek_result) == 3 + + def test_repr_w_all_rows(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs From 9d8cf6792a8dbe03e03b102c454d15fcde7986af Mon Sep 17 00:00:00 2001 From: Garrett Wu <6505921+GarrettWu@users.noreply.github.com> Date: Fri, 29 Mar 2024 14:12:03 -0700 Subject: [PATCH 25/53] feat: add ColumnTransformer save/load (#541) --- bigframes/ml/compose.py | 133 ++++++++++++++++++++++++- bigframes/ml/loader.py | 30 +++++- bigframes/ml/pipeline.py | 113 +-------------------- bigframes/session/__init__.py | 2 +- tests/system/large/ml/test_compose.py | 57 ++++++++--- tests/system/large/ml/test_pipeline.py | 4 +- 6 files changed, 204 insertions(+), 135 deletions(-) diff --git a/bigframes/ml/compose.py b/bigframes/ml/compose.py index 6d4fa5b76d..cd233589d6 100644 --- a/bigframes/ml/compose.py 
+++ b/bigframes/ml/compose.py @@ -18,16 +18,21 @@ from __future__ import annotations +import re +import types import typing -from typing import List, Optional, Tuple, Union +from typing import cast, List, Optional, Tuple, Union import bigframes_vendored.sklearn.compose._column_transformer +from google.cloud import bigquery +import bigframes +from bigframes import constants from bigframes.core import log_adapter from bigframes.ml import base, core, globals, preprocessing, utils import bigframes.pandas as bpd -CompilablePreprocessorType = Union[ +_PREPROCESSING_TYPES = Union[ preprocessing.OneHotEncoder, preprocessing.StandardScaler, preprocessing.MaxAbsScaler, @@ -36,6 +41,17 @@ preprocessing.LabelEncoder, ] +_BQML_TRANSFROM_TYPE_MAPPING = types.MappingProxyType( + { + "ML.STANDARD_SCALER": preprocessing.StandardScaler, + "ML.ONE_HOT_ENCODER": preprocessing.OneHotEncoder, + "ML.MAX_ABS_SCALER": preprocessing.MaxAbsScaler, + "ML.MIN_MAX_SCALER": preprocessing.MinMaxScaler, + "ML.BUCKETIZE": preprocessing.KBinsDiscretizer, + "ML.LABEL_ENCODER": preprocessing.LabelEncoder, + } +) + @log_adapter.class_logger class ColumnTransformer( @@ -51,7 +67,7 @@ def __init__( transformers: List[ Tuple[ str, - CompilablePreprocessorType, + _PREPROCESSING_TYPES, Union[str, List[str]], ] ], @@ -66,12 +82,12 @@ def __init__( @property def transformers_( self, - ) -> List[Tuple[str, CompilablePreprocessorType, str,]]: + ) -> List[Tuple[str, _PREPROCESSING_TYPES, str,]]: """The collection of transformers as tuples of (name, transformer, column).""" result: List[ Tuple[ str, - CompilablePreprocessorType, + _PREPROCESSING_TYPES, str, ] ] = [] @@ -89,6 +105,96 @@ def transformers_( return result + @classmethod + def _from_bq( + cls, session: bigframes.Session, model: bigquery.Model + ) -> ColumnTransformer: + col_transformer = cls._extract_from_bq_model(model) + col_transformer._bqml_model = core.BqmlModel(session, model) + + return col_transformer + + @classmethod + def _extract_from_bq_model( + cls, + bq_model: bigquery.Model, + ) -> ColumnTransformer: + """Extract transformers as ColumnTransformer obj from a BQ Model. Keep the _bqml_model field as None.""" + assert "transformColumns" in bq_model._properties + + transformers: List[ + Tuple[ + str, + _PREPROCESSING_TYPES, + Union[str, List[str]], + ] + ] = [] + + def camel_to_snake(name): + name = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", name) + return re.sub("([a-z0-9])([A-Z])", r"\1_\2", name).lower() + + for transform_col in bq_model._properties["transformColumns"]: + # pass the columns that are not transformed + if "transformSql" not in transform_col: + continue + transform_sql: str = cast(dict, transform_col)["transformSql"] + if not transform_sql.startswith("ML."): + continue + + found_transformer = False + for prefix in _BQML_TRANSFROM_TYPE_MAPPING: + if transform_sql.startswith(prefix): + transformer_cls = _BQML_TRANSFROM_TYPE_MAPPING[prefix] + transformers.append( + ( + camel_to_snake(transformer_cls.__name__), + *transformer_cls._parse_from_sql(transform_sql), # type: ignore + ) + ) + + found_transformer = True + break + if not found_transformer: + raise NotImplementedError( + f"Unsupported transformer type. 
{constants.FEEDBACK_LINK}" + ) + + return cls(transformers=transformers) + + def _merge( + self, bq_model: bigquery.Model + ) -> Union[ + ColumnTransformer, + preprocessing.StandardScaler, + preprocessing.OneHotEncoder, + preprocessing.MaxAbsScaler, + preprocessing.MinMaxScaler, + preprocessing.KBinsDiscretizer, + preprocessing.LabelEncoder, + ]: + """Try to merge the column transformer to a simple transformer. Depends on all the columns in bq_model are transformed with the same transformer.""" + transformers = self.transformers_ + + assert len(transformers) > 0 + _, transformer_0, column_0 = transformers[0] + columns = [column_0] + for _, transformer, column in transformers[1:]: + # all transformers are the same + if transformer != transformer_0: + return self + columns.append(column) + # all feature columns are transformed + if sorted( + [ + cast(str, feature_column.name) + for feature_column in bq_model.feature_columns + ] + ) == sorted(columns): + return transformer_0 + + return self + def _compile_to_sql( self, columns: List[str], @@ -143,3 +249,20 @@ def transform(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: bpd.DataFrame, df[self._output_names], ) + + def to_gbq(self, model_name: str, replace: bool = False) -> ColumnTransformer: + """Save the transformer as a BigQuery model. + + Args: + model_name (str): + the name of the model. + replace (bool, default False): + whether to replace if the model already exists. Default to False. + + Returns: + ColumnTransformer: saved model.""" + if not self._bqml_model: + raise RuntimeError("A transformer must be fitted before it can be saved") + + new_model = self._bqml_model.copy(model_name, replace) + return new_model.session.read_gbq_model(model_name) diff --git a/bigframes/ml/loader.py b/bigframes/ml/loader.py index 31912a0129..508003a98d 100644 --- a/bigframes/ml/loader.py +++ b/bigframes/ml/loader.py @@ -23,6 +23,7 @@ import bigframes.constants as constants from bigframes.ml import ( cluster, + compose, decomposition, ensemble, forecasting, @@ -79,6 +80,7 @@ def from_bq( llm.PaLM2TextGenerator, llm.PaLM2TextEmbeddingGenerator, pipeline.Pipeline, + compose.ColumnTransformer, ]: """Load a BQML model to BigQuery DataFrames ML. @@ -89,22 +91,32 @@ def from_bq( Returns: A BigQuery DataFrames ML model object. """ + # TODO(garrettwu): the entire condition only to TRANSFORM_ONLY when b/331679273 is fixed. 
+ if ( + bq_model.model_type == "TRANSFORM_ONLY" + or bq_model.model_type == "MODEL_TYPE_UNSPECIFIED" + and "transformColumns" in bq_model._properties + and not _is_bq_model_remote(bq_model) + ): + return _transformer_from_bq(session, bq_model) + if _is_bq_model_pipeline(bq_model): return pipeline.Pipeline._from_bq(session, bq_model) return _model_from_bq(session, bq_model) +def _transformer_from_bq(session: bigframes.Session, bq_model: bigquery.Model): + # TODO(garrettwu): add other transformers + return compose.ColumnTransformer._from_bq(session, bq_model) + + def _model_from_bq(session: bigframes.Session, bq_model: bigquery.Model): if bq_model.model_type in _BQML_MODEL_TYPE_MAPPING: return _BQML_MODEL_TYPE_MAPPING[bq_model.model_type]._from_bq( # type: ignore session=session, model=bq_model ) - if ( - bq_model.model_type == "MODEL_TYPE_UNSPECIFIED" - and "remoteModelInfo" in bq_model._properties - and "endpoint" in bq_model._properties["remoteModelInfo"] - ): + if _is_bq_model_remote(bq_model): # Parse the remote model endpoint bqml_endpoint = bq_model._properties["remoteModelInfo"]["endpoint"] model_endpoint = bqml_endpoint.split("/")[-1] @@ -121,3 +133,11 @@ def _model_from_bq(session: bigframes.Session, bq_model: bigquery.Model): def _is_bq_model_pipeline(bq_model: bigquery.Model) -> bool: return "transformColumns" in bq_model._properties + + +def _is_bq_model_remote(bq_model: bigquery.Model) -> bool: + return ( + bq_model.model_type == "MODEL_TYPE_UNSPECIFIED" + and "remoteModelInfo" in bq_model._properties + and "endpoint" in bq_model._properties["remoteModelInfo"] + ) diff --git a/bigframes/ml/pipeline.py b/bigframes/ml/pipeline.py index 9289b613b8..92a3bae77d 100644 --- a/bigframes/ml/pipeline.py +++ b/bigframes/ml/pipeline.py @@ -18,7 +18,7 @@ from __future__ import annotations -from typing import cast, List, Optional, Tuple, Union +from typing import List, Optional, Tuple, Union import bigframes_vendored.sklearn.pipeline from google.cloud import bigquery @@ -83,8 +83,8 @@ def __init__(self, steps: List[Tuple[str, base.BaseEstimator]]): @classmethod def _from_bq(cls, session: bigframes.Session, bq_model: bigquery.Model) -> Pipeline: - col_transformer = _extract_as_column_transformer(bq_model) - transform = _merge_column_transformer(bq_model, col_transformer) + col_transformer = compose.ColumnTransformer._extract_from_bq_model(bq_model) + transform = col_transformer._merge(bq_model) estimator = loader._model_from_bq(session, bq_model) return cls([("transform", transform), ("estimator", estimator)]) @@ -138,110 +138,3 @@ def to_gbq(self, model_name: str, replace: bool = False) -> Pipeline: new_model = self._estimator._bqml_model.copy(model_name, replace) return new_model.session.read_gbq_model(model_name) - - -def _extract_as_column_transformer( - bq_model: bigquery.Model, -) -> compose.ColumnTransformer: - """Extract transformers as ColumnTransformer obj from a BQ Model.""" - assert "transformColumns" in bq_model._properties - - transformers: List[ - Tuple[ - str, - Union[ - preprocessing.OneHotEncoder, - preprocessing.StandardScaler, - preprocessing.MaxAbsScaler, - preprocessing.MinMaxScaler, - preprocessing.KBinsDiscretizer, - preprocessing.LabelEncoder, - ], - Union[str, List[str]], - ] - ] = [] - for transform_col in bq_model._properties["transformColumns"]: - # pass the columns that are not transformed - if "transformSql" not in transform_col: - continue - - transform_sql: str = cast(dict, transform_col)["transformSql"] - if transform_sql.startswith("ML.STANDARD_SCALER"): - 
transformers.append( - ( - "standard_scaler", - *preprocessing.StandardScaler._parse_from_sql(transform_sql), - ) - ) - elif transform_sql.startswith("ML.ONE_HOT_ENCODER"): - transformers.append( - ( - "ont_hot_encoder", - *preprocessing.OneHotEncoder._parse_from_sql(transform_sql), - ) - ) - elif transform_sql.startswith("ML.MAX_ABS_SCALER"): - transformers.append( - ( - "max_abs_scaler", - *preprocessing.MaxAbsScaler._parse_from_sql(transform_sql), - ) - ) - elif transform_sql.startswith("ML.MIN_MAX_SCALER"): - transformers.append( - ( - "min_max_scaler", - *preprocessing.MinMaxScaler._parse_from_sql(transform_sql), - ) - ) - elif transform_sql.startswith("ML.BUCKETIZE"): - transformers.append( - ( - "k_bins_discretizer", - *preprocessing.KBinsDiscretizer._parse_from_sql(transform_sql), - ) - ) - elif transform_sql.startswith("ML.LABEL_ENCODER"): - transformers.append( - ( - "label_encoder", - *preprocessing.LabelEncoder._parse_from_sql(transform_sql), - ) - ) - else: - raise NotImplementedError( - f"Unsupported transformer type. {constants.FEEDBACK_LINK}" - ) - - return compose.ColumnTransformer(transformers=transformers) - - -def _merge_column_transformer( - bq_model: bigquery.Model, column_transformer: compose.ColumnTransformer -) -> Union[ - compose.ColumnTransformer, - preprocessing.StandardScaler, - preprocessing.OneHotEncoder, - preprocessing.MaxAbsScaler, - preprocessing.MinMaxScaler, - preprocessing.KBinsDiscretizer, - preprocessing.LabelEncoder, -]: - """Try to merge the column transformer to a simple transformer.""" - transformers = column_transformer.transformers_ - - assert len(transformers) > 0 - _, transformer_0, column_0 = transformers[0] - columns = [column_0] - for _, transformer, column in transformers[1:]: - # all transformers are the same - if transformer != transformer_0: - return column_transformer - columns.append(column) - # all feature columns are transformed - if sorted( - [cast(str, feature_column.name) for feature_column in bq_model.feature_columns] - ) == sorted(columns): - return transformer_0 - - return column_transformer diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 8294bc3aa1..6573934f94 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -953,7 +953,7 @@ def read_gbq_model(self, model_name: str): to load from the default project. Returns: - A bigframes.ml Model wrapping the model. + A bigframes.ml Model, Transformer or Pipeline wrapping the model. 
""" import bigframes.ml.loader diff --git a/tests/system/large/ml/test_compose.py b/tests/system/large/ml/test_compose.py index bb9a4d8f64..d7c49ca95a 100644 --- a/tests/system/large/ml/test_compose.py +++ b/tests/system/large/ml/test_compose.py @@ -14,31 +14,27 @@ import pandas -import bigframes.ml.cluster -import bigframes.ml.compose -import bigframes.ml.linear_model -import bigframes.ml.pipeline -import bigframes.ml.preprocessing +from bigframes.ml import compose, preprocessing def test_columntransformer_standalone_fit_and_transform( penguins_df_default_index, new_penguins_df ): - transformer = bigframes.ml.compose.ColumnTransformer( + transformer = compose.ColumnTransformer( [ ( "onehot", - bigframes.ml.preprocessing.OneHotEncoder(), + preprocessing.OneHotEncoder(), "species", ), ( "starndard_scale", - bigframes.ml.preprocessing.StandardScaler(), + preprocessing.StandardScaler(), ["culmen_length_mm", "flipper_length_mm"], ), ( "min_max_scale", - bigframes.ml.preprocessing.MinMaxScaler(), + preprocessing.MinMaxScaler(), ["culmen_length_mm"], ), ] @@ -76,16 +72,16 @@ def test_columntransformer_standalone_fit_and_transform( def test_columntransformer_standalone_fit_transform(new_penguins_df): - transformer = bigframes.ml.compose.ColumnTransformer( + transformer = compose.ColumnTransformer( [ ( "onehot", - bigframes.ml.preprocessing.OneHotEncoder(), + preprocessing.OneHotEncoder(), "species", ), ( "standard_scale", - bigframes.ml.preprocessing.StandardScaler(), + preprocessing.StandardScaler(), ["culmen_length_mm", "flipper_length_mm"], ), ] @@ -118,3 +114,40 @@ def test_columntransformer_standalone_fit_transform(new_penguins_df): ) pandas.testing.assert_frame_equal(result, expected, rtol=0.1, check_dtype=False) + + +def test_columntransformer_save_load(new_penguins_df, dataset_id): + transformer = compose.ColumnTransformer( + [ + ( + "onehot", + preprocessing.OneHotEncoder(), + "species", + ), + ( + "standard_scale", + preprocessing.StandardScaler(), + ["culmen_length_mm", "flipper_length_mm"], + ), + ] + ) + transformer.fit( + new_penguins_df[["species", "culmen_length_mm", "flipper_length_mm"]] + ) + + reloaded_transformer = transformer.to_gbq( + f"{dataset_id}.temp_configured_model", replace=True + ) + + assert isinstance(reloaded_transformer, compose.ColumnTransformer) + + expected = [ + ( + "one_hot_encoder", + preprocessing.OneHotEncoder(max_categories=1000001, min_frequency=0), + "species", + ), + ("standard_scaler", preprocessing.StandardScaler(), "culmen_length_mm"), + ("standard_scaler", preprocessing.StandardScaler(), "flipper_length_mm"), + ] + assert reloaded_transformer.transformers_ == expected diff --git a/tests/system/large/ml/test_pipeline.py b/tests/system/large/ml/test_pipeline.py index c128469bd2..c460efa75f 100644 --- a/tests/system/large/ml/test_pipeline.py +++ b/tests/system/large/ml/test_pipeline.py @@ -646,7 +646,7 @@ def test_pipeline_columntransformer_to_gbq(penguins_df_default_index, dataset_id compose.ColumnTransformer( [ ( - "ont_hot_encoder", + "one_hot_encoder", preprocessing.OneHotEncoder( drop="most_frequent", min_frequency=5, @@ -699,7 +699,7 @@ def test_pipeline_columntransformer_to_gbq(penguins_df_default_index, dataset_id transformers = pl_loaded._transform.transformers_ expected = [ ( - "ont_hot_encoder", + "one_hot_encoder", preprocessing.OneHotEncoder( drop="most_frequent", max_categories=100, min_frequency=5 ), From 74c391586280b55c35d66c697167122d72c13386 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Fri, 29 Mar 
2024 18:28:16 -0500 Subject: [PATCH 26/53] docs: add "Supported pandas APIs" reference to the documentation (#542) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) 🦕 --- .kokoro/release-nightly.sh | 1 + docs/index.rst | 1 + docs/supported_pandas_apis.rst | 62 +++++++ docs/supported_pandas_apis/.gitignore | 1 + docs/templates/toc.yml | 2 + noxfile.py | 12 ++ scripts/publish_api_coverage.py | 222 ++++++++++++++++++++++++-- scripts/test_publish_api_coverage.py | 2 + 8 files changed, 291 insertions(+), 12 deletions(-) create mode 100644 docs/supported_pandas_apis.rst create mode 100644 docs/supported_pandas_apis/.gitignore diff --git a/.kokoro/release-nightly.sh b/.kokoro/release-nightly.sh index 5624df3b8d..7da0881bbe 100755 --- a/.kokoro/release-nightly.sh +++ b/.kokoro/release-nightly.sh @@ -106,6 +106,7 @@ for gcs_path in gs://vertex_sdk_private_releases/bigframe/ \ # write access to COVERAGE_TABLE=bigframes-metrics.coverage_report.bigframes_coverage_nightly python3.10 scripts/publish_api_coverage.py \ + bigquery \ --bigframes_version=$BIGFRAMES_VERSION \ --release_version=$RELEASE_VERSION \ --bigquery_table=$COVERAGE_TABLE diff --git a/docs/index.rst b/docs/index.rst index d239ea3a78..b17ac7cbd9 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -7,6 +7,7 @@ API reference :maxdepth: 3 reference/index + supported_pandas_apis Changelog --------- diff --git a/docs/supported_pandas_apis.rst b/docs/supported_pandas_apis.rst new file mode 100644 index 0000000000..f4b57f05d1 --- /dev/null +++ b/docs/supported_pandas_apis.rst @@ -0,0 +1,62 @@ +Supported pandas APIs +===================== + +The following tables show the pandas APIs that have been implemented (or not) +in BigQuery DataFrames. + +* 'Y' means it implements all parameters. +* 'P' means it implements only some parameters. + +DataFrame +--------- + +.. raw:: html + :file: supported_pandas_apis/bf_dataframe.html + +DataFrameGroupBy +---------------- + +.. raw:: html + :file: supported_pandas_apis/bf_dataframegroupby.html + +Index +----- + +.. raw:: html + :file: supported_pandas_apis/bf_index.html + +pandas module +------------- + +.. raw:: html + :file: supported_pandas_apis/bf_pandas.html + +Series +------ + +.. raw:: html + :file: supported_pandas_apis/bf_series.html + +Series.dt methods +----------------- + +.. raw:: html + :file: supported_pandas_apis/bf_datetimemethods.html + +Series.str methods +------------------ + +.. raw:: html + :file: supported_pandas_apis/bf_stringmethods.html + +SeriesGroupBy +------------- + +.. raw:: html + :file: supported_pandas_apis/bf_seriesgroupby.html + +Window +------ + +.. 
raw:: html + :file: supported_pandas_apis/bf_window.html diff --git a/docs/supported_pandas_apis/.gitignore b/docs/supported_pandas_apis/.gitignore new file mode 100644 index 0000000000..2d19fc766d --- /dev/null +++ b/docs/supported_pandas_apis/.gitignore @@ -0,0 +1 @@ +*.html diff --git a/docs/templates/toc.yml b/docs/templates/toc.yml index c07e6141f1..57b0522d04 100644 --- a/docs/templates/toc.yml +++ b/docs/templates/toc.yml @@ -72,6 +72,8 @@ name: Series - name: Window uid: bigframes.core.window.Window + - href: supported_pandas_apis.html + name: Supported pandas APIs name: bigframes.pandas - items: - items: diff --git a/noxfile.py b/noxfile.py index a5e77964f1..4ac3a81723 100644 --- a/noxfile.py +++ b/noxfile.py @@ -467,6 +467,12 @@ def docs(session): ) shutil.rmtree(os.path.join("docs", "_build"), ignore_errors=True) + + session.run( + "python", + "scripts/publish_api_coverage.py", + "docs", + ) session.run( "sphinx-build", "-W", # warnings as errors @@ -503,6 +509,12 @@ def docfx(session): ) shutil.rmtree(os.path.join("docs", "_build"), ignore_errors=True) + + session.run( + "python", + "scripts/publish_api_coverage.py", + "docs", + ) session.run( "sphinx-build", "-T", # show full traceback on exception diff --git a/scripts/publish_api_coverage.py b/scripts/publish_api_coverage.py index 856307e440..4a35ade9ef 100644 --- a/scripts/publish_api_coverage.py +++ b/scripts/publish_api_coverage.py @@ -17,27 +17,110 @@ import argparse import inspect +import pathlib +import sys import pandas as pd +import pandas.core.groupby +import pandas.core.indexes.accessors +import pandas.core.strings.accessor +import pandas.core.window.rolling +import bigframes +import bigframes.core.groupby +import bigframes.core.window +import bigframes.operations.datetimes import bigframes.pandas as bpd +REPO_ROOT = pathlib.Path(__file__).parent.parent + +URL_PREFIX = { + "pandas": ( + "https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.pandas#bigframes_pandas_" + ), + "dataframe": ( + "https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.dataframe.DataFrame#bigframes_dataframe_DataFrame_" + ), + "dataframegroupby": ( + "https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.core.groupby.DataFrameGroupBy#bigframes_core_groupby_DataFrameGroupBy_" + ), + "series": ( + "https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.series.Series#bigframes_series_Series_" + ), + "seriesgroupby": ( + "https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.core.groupby.SeriesGroupBy#bigframes_core_groupby_SeriesGroupBy_" + ), + "datetimemethods": ( + "https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.operations.datetimes.DatetimeMethods#bigframes_operations_datetimes_DatetimeMethods_" + ), + "stringmethods": ( + "https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.operations.strings.StringMethods#bigframes_operations_strings_StringMethods_" + ), + "window": ( + "https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.core.window.Window#bigframes_core_window_Window_" + ), + # TODO: Index not documented. 
+} + + +PANDAS_TARGETS = [ + ("pandas", pd, bpd), + ("dataframe", pd.DataFrame, bpd.DataFrame), + ( + "dataframegroupby", + pandas.core.groupby.DataFrameGroupBy, + bigframes.core.groupby.DataFrameGroupBy, + ), + ("series", pd.Series, bpd.Series), + ( + "seriesgroupby", + pandas.core.groupby.DataFrameGroupBy, + bigframes.core.groupby.DataFrameGroupBy, + ), + ( + "datetimemethods", + pandas.core.indexes.accessors.CombinedDatetimelikeProperties, + bigframes.operations.datetimes.DatetimeMethods, + ), + ( + "stringmethods", + pandas.core.strings.accessor.StringMethods, + bigframes.operations.strings.StringMethods, + ), + ( + "window", + pandas.core.window.rolling.Rolling, + bigframes.core.window.Window, + ), + ("index", pd.Index, bpd.Index), +] + + +def names_from_signature(signature): + """Extract the names of parameters from signature + + See: https://docs.python.org/3/library/inspect.html#inspect.signature + """ + return frozenset({parameter for parameter in signature.parameters}) + + +def calculate_missing_parameters(bigframes_function, target_function): + bigframes_params = names_from_signature(inspect.signature(bigframes_function)) + target_params = names_from_signature(inspect.signature(target_function)) + return target_params - bigframes_params + def generate_pandas_api_coverage(): """Inspect all our pandas objects, and compare with the real pandas objects, to see which methods we implement. For each, generate a regex that can be used to check if its present in a notebook""" - header = ["api", "pattern", "kind", "is_in_bigframes"] + header = ["api", "pattern", "kind", "is_in_bigframes", "missing_parameters"] api_patterns = [] - targets = [ - ("pandas", pd, bpd), - ("dataframe", pd.DataFrame, bpd.DataFrame), - ("series", pd.Series, bpd.Series), - ("index", pd.Index, bpd.Index), - ] indexers = ["loc", "iloc", "iat", "ix", "at"] - for name, pandas_obj, bigframes_obj in targets: + for name, pandas_obj, bigframes_obj in PANDAS_TARGETS: for member in dir(pandas_obj): + missing_parameters = "" + # skip private functions and properties if member[0] == "_" and member[1] != "_": continue @@ -50,6 +133,17 @@ def generate_pandas_api_coverage(): # Function, match .member( token = f"\\.{member}\\(" token_type = "function" + + if hasattr(bigframes_obj, member): + bigframes_function = getattr(bigframes_obj, member) + pandas_function = getattr(pandas_obj, member) + missing_parameters = ", ".join( + sorted( + calculate_missing_parameters( + bigframes_function, pandas_function + ) + ) + ) elif member in indexers: # Indexer, match .indexer[ token = f"\\.{member}\\[" @@ -62,7 +156,13 @@ def generate_pandas_api_coverage(): is_in_bigframes = hasattr(bigframes_obj, member) api_patterns.append( - [f"{name}.{member}", token, token_type, is_in_bigframes] + [ + f"{name}.{member}", + token, + token_type, + is_in_bigframes, + missing_parameters, + ] ) return pd.DataFrame(api_patterns, columns=header) @@ -165,14 +265,112 @@ def build_api_coverage_table(bigframes_version: str, release_version: str): return combined_df.infer_objects().convert_dtypes() +def format_api(api_names, is_in_bigframes, api_prefix): + api_names = api_names.str.slice(start=len(f"{api_prefix}.")) + formatted = "<code>" + api_names + "</code>" + url_prefix = URL_PREFIX.get(api_prefix) + if url_prefix is None: + return formatted + + linked = '<a href="' + url_prefix + api_names + '">' + formatted + "</a>" + return formatted.mask(is_in_bigframes, linked) + + +def generate_api_coverage(df, api_prefix): + dataframe_apis = df.loc[df["api"].str.startswith(f"{api_prefix}.")]
fully_implemented = ( + dataframe_apis["missing_parameters"].str.len() == 0 + ) & dataframe_apis["is_in_bigframes"] + partial_implemented = ( + dataframe_apis["missing_parameters"].str.len() != 0 + ) & dataframe_apis["is_in_bigframes"] + not_implemented = ~dataframe_apis["is_in_bigframes"] + dataframe_table = pd.DataFrame( + { + "API": format_api( + dataframe_apis["api"], + dataframe_apis["is_in_bigframes"], + api_prefix, + ), + "Implemented": "", + "Missing parameters": dataframe_apis["missing_parameters"], + } + ) + dataframe_table.loc[fully_implemented, "Implemented"] = "Y" + dataframe_table.loc[partial_implemented, "Implemented"] = "P" + dataframe_table.loc[not_implemented, "Implemented"] = "N" + return dataframe_table + + +def generate_api_coverage_doc(df, api_prefix): + dataframe_table = generate_api_coverage(df, api_prefix) + dataframe_table = dataframe_table.loc[~(dataframe_table["Implemented"] == "N")] + dataframe_table["Implemented"] = dataframe_table["Implemented"].map( + { + "Y": "Y", + "P": "P", + } + ) + + with open( + REPO_ROOT / "docs" / "supported_pandas_apis" / f"bf_{api_prefix}.html", + "w", + ) as html_file: + dataframe_table.to_html( + html_file, index=False, header=True, escape=False, border=0, col_space="8em" + ) + + +def generate_api_coverage_docs(df): + for target in PANDAS_TARGETS: + api_prefix = target[0] + generate_api_coverage_doc(df, api_prefix) + + +def print_api_coverage_summary(df, api_prefix): + dataframe_table = generate_api_coverage(df, api_prefix) + + print(api_prefix) + print(dataframe_table[["Implemented", "API"]].groupby(["Implemented"]).count()) + print(f"{api_prefix} APIs: {dataframe_table.shape[0]}\n") + + +def print_api_coverage_summaries(df): + for target in PANDAS_TARGETS: + api_prefix = target[0] + print_api_coverage_summary(df, api_prefix) + + print(f"\nAll APIs: {len(df.index)}") + fully_implemented = (df["missing_parameters"].str.len() == 0) & df[ + "is_in_bigframes" + ] + print(f"Y: {fully_implemented.sum()}") + partial_implemented = (df["missing_parameters"].str.len() != 0) & df[ + "is_in_bigframes" + ] + print(f"P: {partial_implemented.sum()}") + not_implemented = ~df["is_in_bigframes"] + print(f"N: {not_implemented.sum()}") + + def main(): parser = argparse.ArgumentParser() - parser.add_argument("--bigframes_version") - parser.add_argument("--release_version") + parser.add_argument("output_type") + parser.add_argument("--bigframes_version", default=bigframes.__version__) + parser.add_argument("--release_version", default="") parser.add_argument("--bigquery_table_name") args = parser.parse_args() df = build_api_coverage_table(args.bigframes_version, args.release_version) - df.to_gbq(args.bigquery_table_name, if_exists="append") + + if args.output_type == "bigquery": + df.to_gbq(args.bigquery_table_name, if_exists="append") + elif args.output_type == "docs": + generate_api_coverage_docs(df) + elif args.output_type == "summary": + print_api_coverage_summaries(df) + else: + print(f"Unexpected output_type {repr(args.output_type)}") + sys.exit(1) if __name__ == "__main__": diff --git a/scripts/test_publish_api_coverage.py b/scripts/test_publish_api_coverage.py index 96b2d1bb48..061cc1c25c 100644 --- a/scripts/test_publish_api_coverage.py +++ b/scripts/test_publish_api_coverage.py @@ -27,6 +27,7 @@ def test_api_coverage_produces_expected_schema(): "string", "boolean", "string", + "string", "datetime64[ns]", "string", "string", @@ -36,6 +37,7 @@ def test_api_coverage_produces_expected_schema(): "pattern", "kind", "is_in_bigframes", + 
"missing_parameters", "module", "timestamp", "bigframes_version", From 347f2dda2298e17cd44a298f04a723f2d20c080a Mon Sep 17 00:00:00 2001 From: Ashley Xu <139821907+ashleyxuu@users.noreply.github.com> Date: Fri, 29 Mar 2024 17:08:16 -0700 Subject: [PATCH 27/53] fix: sync the notebook with embedding changes (#550) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes # 🦕 --- .../bq_dataframes_llm_kmeans.ipynb | 1074 +++++++++-------- 1 file changed, 547 insertions(+), 527 deletions(-) diff --git a/notebooks/generative_ai/bq_dataframes_llm_kmeans.ipynb b/notebooks/generative_ai/bq_dataframes_llm_kmeans.ipynb index 61445d85c5..2c6d109ba8 100644 --- a/notebooks/generative_ai/bq_dataframes_llm_kmeans.ipynb +++ b/notebooks/generative_ai/bq_dataframes_llm_kmeans.ipynb @@ -289,7 +289,7 @@ { "data": { "text/html": [ - "Query job d5778724-6966-42ba-b8a6-2a1865a1184c is DONE. 2.3 GB processed. Open Job" + "Query job 030e5d08-f690-47e4-b7cc-342731245575 is DONE. 2.3 GB processed. Open Job" ], "text/plain": [ "" @@ -301,7 +301,7 @@ { "data": { "text/html": [ - "Query job 4d48bf69-571c-4773-8486-0232840597d5 is DONE. 55.1 MB processed. Open Job" + "Query job a9c5f416-c5d2-4209-b639-bccb81a25d7e is DONE. 58.8 MB processed. Open Job" ], "text/plain": [ "" @@ -336,36 +336,36 @@ " \n", " \n", " \n", - " 24\n", - " I sent disputed to Transunion, XXXX and XXXX f...\n", + " 1053364\n", + " My Macy 's American Express account was taken ...\n", " \n", " \n", - " 942\n", - " on XX/XX/2017 I sent XXXX, transunion, XXXX pr...\n", + " 1053757\n", + " I am a victim of identity theft. The informati...\n", " \n", " \n", - " 1193\n", - " On Wednesday, XXXX XXXX , I initiated a wir...\n", + " 1053784\n", + " In XXXX 2016, Amex took out $ XXXX.+ unauthori...\n", " \n", " \n", - " 1292\n", - " Dear Sir or Madam, I am a victim of identity t...\n", + " 1054237\n", + " I am not for sure the exact date of my loan it...\n", " \n", " \n", - " 1377\n", - " For the purpose of this complaint, I will refe...\n", + " 1054244\n", + " I entered a consumer credit transaction with t...\n", " \n", " \n", "\n", "" ], "text/plain": [ - " consumer_complaint_narrative\n", - "24 I sent disputed to Transunion, XXXX and XXXX f...\n", - "942 on XX/XX/2017 I sent XXXX, transunion, XXXX pr...\n", - "1193 On Wednesday, XXXX XXXX , I initiated a wir...\n", - "1292 Dear Sir or Madam, I am a victim of identity t...\n", - "1377 For the purpose of this complaint, I will refe..." + " consumer_complaint_narrative\n", + "1053364 My Macy 's American Express account was taken ...\n", + "1053757 I am a victim of identity theft. The informati...\n", + "1053784 In XXXX 2016, Amex took out $ XXXX.+ unauthori...\n", + "1054237 I am not for sure the exact date of my loan it...\n", + "1054244 I entered a consumer credit transaction with t..." ] }, "execution_count": 7, @@ -418,7 +418,7 @@ { "data": { "text/html": [ - "Query job 15b352c2-783c-42b1-bc03-e5772f00381a is DONE. 
0 Bytes processed. Open Job" + "Query job 77eee871-31eb-4939-a015-f5505c94786e is DONE. 0 Bytes processed. Open Job" ], "text/plain": [ "" @@ -444,7 +444,7 @@ { "data": { "text/html": [ - "Query job e2152e81-b736-4a68-a25a-c5eb2b03d734 is DONE. 1.3 GB processed. Open Job" + "Query job 63cdd004-21b6-41bf-8876-aa646f1f268e is DONE. 1.3 GB processed. Open Job" ], "text/plain": [ "" @@ -456,7 +456,7 @@ { "data": { "text/html": [ - "Query job b1a3d20b-aee3-424c-a0c5-5b36f1177709 is DONE. 80.0 kB processed. Open Job" + "Query job cda12546-9931-48f6-8b22-74a9ab85fa28 is DONE. 80.0 kB processed. Open Job" ], "text/plain": [ "" @@ -468,7 +468,7 @@ { "data": { "text/html": [ - "Query job 6b2fad50-cbc8-42ea-83c1-b5d3eaac10b9 is DONE. 20.0 kB processed. Open Job" + "Query job 759a13c5-c02f-4ae8-9b22-d7ef423ffe8d is DONE. 20.0 kB processed. Open Job" ], "text/plain": [ "" @@ -480,19 +480,7 @@ { "data": { "text/html": [ - "Query job 31896ae6-fbb5-42fb-98c4-13bd19d1adfa is DONE. 0 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 43f04543-f59b-4f1b-8598-c529324904be is DONE. 72.1 MB processed. Open Job" + "Query job 1bad8ef3-8103-4a98-bec4-699d97673b9a is DONE. 72.0 MB processed. Open Job" ], "text/plain": [ "" @@ -522,187 +510,188 @@ " \n", " \n", " \n", - " text_embedding\n", - " statistics\n", - " ml_embed_text_status\n", + " ml_generate_embedding_result\n", + " ml_generate_embedding_statistics\n", + " ml_generate_embedding_status\n", " content\n", " \n", " \n", " \n", " \n", - " 545\n", - " [ 1.82510037e-02 -1.27867460e-02 -1.57095697e-...\n", - " {\"token_count\":178,\"truncated\":false}\n", + " 357\n", + " [ 1.33585772e-02 -3.76597494e-02 -6.14452176e-...\n", + " {\"token_count\":306,\"truncated\":false}\n", " \n", - " My payments have been approximately {$89.00} w...\n", + " I decided to try XXXX services for my wife and...\n", " \n", " \n", - " 614\n", - " [ 5.40032536e-02 -5.28502129e-02 -5.33268750e-...\n", - " {\"token_count\":399,\"truncated\":false}\n", + " 428\n", + " [ 3.10036819e-04 -3.82593311e-02 -3.41922641e-...\n", + " {\"token_count\":134,\"truncated\":false}\n", " \n", - " Hi, I have contacted Trans Union XXXX XXXX abo...\n", + " XXXX I went to the bank in question ( XXXX XXX...\n", " \n", " \n", - " 1236\n", - " [-5.32836001e-03 -5.84292673e-02 -5.86670786e-...\n", - " {\"token_count\":129,\"truncated\":false}\n", + " 1319\n", + " [ 1.97481886e-02 -1.97448786e-02 -5.13443351e-...\n", + " {\"token_count\":215,\"truncated\":false}\n", " \n", - " I have a XXXX XXXX XXXX credit card on my Exp...\n", + " I currently have a home loan with my ex husban...\n", " \n", " \n", - " 1477\n", - " [ 3.02605387e-02 -4.37121317e-02 -2.70802993e-...\n", - " {\"token_count\":16,\"truncated\":false}\n", + " 1993\n", + " [ 9.83821880e-03 -6.55664057e-02 -5.46210706e-...\n", + " {\"token_count\":536,\"truncated\":false}\n", " \n", - " Wrongs information, selling my information to ...\n", + " NOT MY ACCOUNT, NOT AN AUTHORIZED USER {$1800....\n", " \n", " \n", - " 2261\n", - " [ 2.35723313e-02 -3.73509154e-02 -6.44604117e-...\n", - " {\"token_count\":33,\"truncated\":false}\n", + " 1997\n", + " [ 0.03145148 -0.01011822 -0.02316323 -0.025078...\n", + " {\"token_count\":123,\"truncated\":false}\n", " \n", - " Please investigate and delete disputed item th...\n", + " After a while the payments became harder and h...\n", " \n", " \n", - " 2361\n", - " [ 1.04440488e-02 -9.37070698e-03 
-7.36323372e-...\n", - " {\"token_count\":45,\"truncated\":false}\n", + " 2469\n", + " [ 4.74590808e-03 -4.56819348e-02 -2.49751769e-...\n", + " {\"token_count\":60,\"truncated\":false}\n", " \n", - " By the provisions of the Fair Credit Reporting...\n", + " In the course of my student loan, I have been ...\n", " \n", " \n", - " 2378\n", - " [ 3.04989032e-02 -4.08191867e-02 -6.18648790e-...\n", - " {\"token_count\":892,\"truncated\":false}\n", + " 2624\n", + " [ 3.91883589e-03 -3.26644145e-02 -7.10378587e-...\n", + " {\"token_count\":254,\"truncated\":false}\n", " \n", - " Since XX/XX/XXXX I have been trying to dispute...\n", + " In accordance with the Fair Credit Reporting A...\n", " \n", " \n", - " 3133\n", - " [ 0.00152804 -0.04189068 -0.04220504 -0.053740...\n", - " {\"token_count\":90,\"truncated\":false}\n", + " 2832\n", + " [ 8.35181400e-03 -2.91643552e-02 -4.30776961e-...\n", + " {\"token_count\":79,\"truncated\":false}\n", " \n", - " Out of the blue I received a debt collection n...\n", + " LVNV FUNDING LLC is continually placing a coll...\n", " \n", " \n", - " 3140\n", - " [ 3.11435573e-02 -4.44000624e-02 -2.10917685e-...\n", - " {\"token_count\":372,\"truncated\":false}\n", + " 3328\n", + " [ 2.71253809e-02 -1.77491009e-02 -5.32273464e-...\n", + " {\"token_count\":156,\"truncated\":false}\n", " \n", - " My wife and I have been sending money to XXXX ...\n", + " On XX/XX/2020 I sent a letter regarding inaccu...\n", " \n", " \n", - " 3322\n", - " [ 2.75927987e-02 -6.23729872e-03 -3.83295454e-...\n", - " {\"token_count\":36,\"truncated\":false}\n", + " 3650\n", + " [-6.10093866e-03 -5.93599863e-02 -8.04531425e-...\n", + " {\"token_count\":175,\"truncated\":false}\n", " \n", - " Phone calls from Convergent Outsourcing XXXX. ...\n", + " Over a year and a half ago we started the proc...\n", " \n", " \n", - " 3583\n", - " [ 9.20385588e-03 -3.83387171e-02 -6.46291822e-...\n", - " {\"token_count\":52,\"truncated\":false}\n", + " 3860\n", + " [ 5.84836192e-02 -2.43354496e-03 -5.57337068e-...\n", + " {\"token_count\":1267,\"truncated\":false}\n", " \n", - " I recently received a copy of my credit report...\n", + " The issue is 26 late payments on me and my wif...\n", " \n", " \n", - " 4134\n", - " [-7.04960374e-04 -3.52595337e-02 -1.65264793e-...\n", - " {\"token_count\":412,\"truncated\":false}\n", + " 4464\n", + " [ 6.05084226e-02 -3.21578234e-02 -7.51668587e-...\n", + " {\"token_count\":906,\"truncated\":false}\n", " \n", - " I have been sending the creditor what they hav...\n", + " I purchased as replacement for a lost XXXX XXX...\n", " \n", " \n", - " 4496\n", - " [ 3.67735326e-02 1.21120387e-03 -5.20942472e-...\n", - " {\"token_count\":182,\"truncated\":false}\n", + " 4470\n", + " [ 1.28689921e-02 -3.25881056e-02 -6.53645024e-...\n", + " {\"token_count\":200,\"truncated\":false}\n", " \n", - " This is my second complaint. 
Their response to...\n", + " in accordance with the Fair Credit Reporting a...\n", " \n", " \n", - " 5260\n", - " [ 2.07133405e-02 -1.69602726e-02 -5.07124476e-...\n", - " {\"token_count\":103,\"truncated\":false}\n", + " 4567\n", + " [-5.49167022e-03 -3.84587422e-02 -8.56091827e-...\n", + " {\"token_count\":110,\"truncated\":false}\n", " \n", - " XX/XX/XXXX and XX/XX/XXXX, {$3200.00} contacte...\n", + " I have submitted multiple disputes through the...\n", " \n", " \n", - " 5400\n", - " [ 1.44114876e-02 -2.34710164e-02 -6.58538565e-...\n", - " {\"token_count\":60,\"truncated\":false}\n", + " 4713\n", + " [ 2.68485844e-02 -3.46762352e-02 -4.59849052e-...\n", + " {\"token_count\":549,\"truncated\":false}\n", " \n", - " Upon checking my XXXX credit report I noticed ...\n", + " While shopping for furniture for my home I ope...\n", " \n", " \n", - " 5425\n", - " [ 3.10326386e-02 -2.19427086e-02 -6.56386837e-...\n", - " {\"token_count\":87,\"truncated\":false}\n", + " 5181\n", + " [ 2.05754172e-02 -3.83999050e-02 -9.29225236e-...\n", + " {\"token_count\":77,\"truncated\":false}\n", " \n", - " Follow up to previous complaint XXXX XXXX XXXX...\n", + " I had opened a Wells Fargo checking account wi...\n", " \n", " \n", - " 6014\n", - " [ 1.90773793e-02 -2.27493346e-02 -3.27166244e-...\n", - " {\"token_count\":175,\"truncated\":false}\n", + " 5511\n", + " [-0.00217485 -0.04031368 -0.06604777 -0.052006...\n", + " {\"token_count\":262,\"truncated\":false}\n", " \n", - " My new XXXX lease was over always paid on time...\n", + " I recently disputed ( see attached letter ) wi...\n", " \n", " \n", - " 8192\n", - " [ 0.01937891 -0.05466933 -0.06070872 -0.059028...\n", - " {\"token_count\":131,\"truncated\":false}\n", + " 5888\n", + " [-8.15972779e-03 -3.46563384e-02 -5.91776446e-...\n", + " {\"token_count\":176,\"truncated\":false}\n", " \n", - " I have no idea where this account cane from. 
B...\n", + " XXXX XXXX XXXX XXXX \n", + "I have disputed this acco...\n", " \n", " \n", - " 8240\n", - " [ 4.34123818e-03 -3.40953320e-02 -4.06381376e-...\n", - " {\"token_count\":87,\"truncated\":false}\n", + " 6299\n", + " [ 4.80043218e-02 -4.13420722e-02 -6.12363108e-...\n", + " {\"token_count\":151,\"truncated\":false}\n", " \n", - " I TIED TO BUY CAR AT XXXX, THEY GOT APPROVAL F...\n", + " XXXX ; XXXX and Transunion are reporting ( 30 ...\n", " \n", " \n", - " 8720\n", - " [ 0.03133732 -0.03972461 -0.00178199 -0.035876...\n", - " {\"token_count\":645,\"truncated\":false}\n", + " 7143\n", + " [ 4.39200476e-02 -3.04005221e-02 -3.47866341e-...\n", + " {\"token_count\":234,\"truncated\":false}\n", " \n", - " XXXX XXXX XXXX XXXX, NY XXXX XX/XX/XXXX Consum...\n", + " My Macys account is due on the first of every ...\n", " \n", " \n", - " 8914\n", - " [ 1.75969116e-02 -2.25022305e-02 -5.70390299e-...\n", - " {\"token_count\":180,\"truncated\":false}\n", + " 7219\n", + " [ 1.00224940e-02 -3.79302073e-03 -3.41785327e-...\n", + " {\"token_count\":26,\"truncated\":false}\n", " \n", - " On XX/XX/21 I sent a letter regarding inaccura...\n", + " Keep getting letters and calls from collection...\n", " \n", " \n", - " 10021\n", - " [ 5.02460636e-02 -5.25112189e-02 -4.12914790e-...\n", - " {\"token_count\":30,\"truncated\":false}\n", + " 7574\n", + " [-0.00149564 -0.06619431 -0.05084481 -0.048579...\n", + " {\"token_count\":129,\"truncated\":false}\n", " \n", - " XX/XX/XXXX and XX/XX/XXXX inaccurate informati...\n", + " On XXXX I was on the XXXX app and there was a ...\n", " \n", " \n", - " 10327\n", - " [-0.00979626 -0.04912931 -0.08654705 -0.021063...\n", - " {\"token_count\":194,\"truncated\":false}\n", + " 8759\n", + " [ 0.01501553 -0.03575936 -0.050562 -0.034884...\n", + " {\"token_count\":501,\"truncated\":false}\n", " \n", - " When I reviewed my credit report, I discovered...\n", + " Obviously I've been a victim of fraud, therefo...\n", " \n", " \n", - " 10345\n", - " [-0.04292191 -0.02636929 -0.06177032 -0.076520...\n", - " {\"token_count\":262,\"truncated\":false}\n", + " 9700\n", + " [ 1.01501048e-02 -2.80565154e-02 -4.05892394e-...\n", + " {\"token_count\":48,\"truncated\":false}\n", " \n", - " U.S. Bank sent two letters containing Visa Deb...\n", + " The following item have not been properly inve...\n", " \n", " \n", - " 10369\n", - " [ 2.16020197e-02 -5.62509745e-02 -5.93873672e-...\n", - " {\"token_count\":77,\"truncated\":false}\n", + " 9822\n", + " [ 2.95880195e-02 1.65440738e-02 -3.33247967e-...\n", + " {\"token_count\":2373,\"truncated\":true}\n", " \n", - " I requested from XXXX that they reverse the la...\n", + " During the housing market crash I went through...\n", " \n", " \n", "\n", @@ -710,86 +699,87 @@ "[10000 rows x 4 columns in total]" ], "text/plain": [ - " text_embedding \\\n", - "545 [ 1.82510037e-02 -1.27867460e-02 -1.57095697e-... \n", - "614 [ 5.40032536e-02 -5.28502129e-02 -5.33268750e-... \n", - "1236 [-5.32836001e-03 -5.84292673e-02 -5.86670786e-... \n", - "1477 [ 3.02605387e-02 -4.37121317e-02 -2.70802993e-... \n", - "2261 [ 2.35723313e-02 -3.73509154e-02 -6.44604117e-... \n", - "2361 [ 1.04440488e-02 -9.37070698e-03 -7.36323372e-... \n", - "2378 [ 3.04989032e-02 -4.08191867e-02 -6.18648790e-... \n", - "3133 [ 0.00152804 -0.04189068 -0.04220504 -0.053740... \n", - "3140 [ 3.11435573e-02 -4.44000624e-02 -2.10917685e-... \n", - "3322 [ 2.75927987e-02 -6.23729872e-03 -3.83295454e-... \n", - "3583 [ 9.20385588e-03 -3.83387171e-02 -6.46291822e-... 
\n", - "4134 [-7.04960374e-04 -3.52595337e-02 -1.65264793e-... \n", - "4496 [ 3.67735326e-02 1.21120387e-03 -5.20942472e-... \n", - "5260 [ 2.07133405e-02 -1.69602726e-02 -5.07124476e-... \n", - "5400 [ 1.44114876e-02 -2.34710164e-02 -6.58538565e-... \n", - "5425 [ 3.10326386e-02 -2.19427086e-02 -6.56386837e-... \n", - "6014 [ 1.90773793e-02 -2.27493346e-02 -3.27166244e-... \n", - "8192 [ 0.01937891 -0.05466933 -0.06070872 -0.059028... \n", - "8240 [ 4.34123818e-03 -3.40953320e-02 -4.06381376e-... \n", - "8720 [ 0.03133732 -0.03972461 -0.00178199 -0.035876... \n", - "8914 [ 1.75969116e-02 -2.25022305e-02 -5.70390299e-... \n", - "10021 [ 5.02460636e-02 -5.25112189e-02 -4.12914790e-... \n", - "10327 [-0.00979626 -0.04912931 -0.08654705 -0.021063... \n", - "10345 [-0.04292191 -0.02636929 -0.06177032 -0.076520... \n", - "10369 [ 2.16020197e-02 -5.62509745e-02 -5.93873672e-... \n", + " ml_generate_embedding_result \\\n", + "357 [ 1.33585772e-02 -3.76597494e-02 -6.14452176e-... \n", + "428 [ 3.10036819e-04 -3.82593311e-02 -3.41922641e-... \n", + "1319 [ 1.97481886e-02 -1.97448786e-02 -5.13443351e-... \n", + "1993 [ 9.83821880e-03 -6.55664057e-02 -5.46210706e-... \n", + "1997 [ 0.03145148 -0.01011822 -0.02316323 -0.025078... \n", + "2469 [ 4.74590808e-03 -4.56819348e-02 -2.49751769e-... \n", + "2624 [ 3.91883589e-03 -3.26644145e-02 -7.10378587e-... \n", + "2832 [ 8.35181400e-03 -2.91643552e-02 -4.30776961e-... \n", + "3328 [ 2.71253809e-02 -1.77491009e-02 -5.32273464e-... \n", + "3650 [-6.10093866e-03 -5.93599863e-02 -8.04531425e-... \n", + "3860 [ 5.84836192e-02 -2.43354496e-03 -5.57337068e-... \n", + "4464 [ 6.05084226e-02 -3.21578234e-02 -7.51668587e-... \n", + "4470 [ 1.28689921e-02 -3.25881056e-02 -6.53645024e-... \n", + "4567 [-5.49167022e-03 -3.84587422e-02 -8.56091827e-... \n", + "4713 [ 2.68485844e-02 -3.46762352e-02 -4.59849052e-... \n", + "5181 [ 2.05754172e-02 -3.83999050e-02 -9.29225236e-... \n", + "5511 [-0.00217485 -0.04031368 -0.06604777 -0.052006... \n", + "5888 [-8.15972779e-03 -3.46563384e-02 -5.91776446e-... \n", + "6299 [ 4.80043218e-02 -4.13420722e-02 -6.12363108e-... \n", + "7143 [ 4.39200476e-02 -3.04005221e-02 -3.47866341e-... \n", + "7219 [ 1.00224940e-02 -3.79302073e-03 -3.41785327e-... \n", + "7574 [-0.00149564 -0.06619431 -0.05084481 -0.048579... \n", + "8759 [ 0.01501553 -0.03575936 -0.050562 -0.034884... \n", + "9700 [ 1.01501048e-02 -2.80565154e-02 -4.05892394e-... \n", + "9822 [ 2.95880195e-02 1.65440738e-02 -3.33247967e-... 
\n", "\n", - " statistics ml_embed_text_status \\\n", - "545 {\"token_count\":178,\"truncated\":false} \n", - "614 {\"token_count\":399,\"truncated\":false} \n", - "1236 {\"token_count\":129,\"truncated\":false} \n", - "1477 {\"token_count\":16,\"truncated\":false} \n", - "2261 {\"token_count\":33,\"truncated\":false} \n", - "2361 {\"token_count\":45,\"truncated\":false} \n", - "2378 {\"token_count\":892,\"truncated\":false} \n", - "3133 {\"token_count\":90,\"truncated\":false} \n", - "3140 {\"token_count\":372,\"truncated\":false} \n", - "3322 {\"token_count\":36,\"truncated\":false} \n", - "3583 {\"token_count\":52,\"truncated\":false} \n", - "4134 {\"token_count\":412,\"truncated\":false} \n", - "4496 {\"token_count\":182,\"truncated\":false} \n", - "5260 {\"token_count\":103,\"truncated\":false} \n", - "5400 {\"token_count\":60,\"truncated\":false} \n", - "5425 {\"token_count\":87,\"truncated\":false} \n", - "6014 {\"token_count\":175,\"truncated\":false} \n", - "8192 {\"token_count\":131,\"truncated\":false} \n", - "8240 {\"token_count\":87,\"truncated\":false} \n", - "8720 {\"token_count\":645,\"truncated\":false} \n", - "8914 {\"token_count\":180,\"truncated\":false} \n", - "10021 {\"token_count\":30,\"truncated\":false} \n", - "10327 {\"token_count\":194,\"truncated\":false} \n", - "10345 {\"token_count\":262,\"truncated\":false} \n", - "10369 {\"token_count\":77,\"truncated\":false} \n", + " ml_generate_embedding_statistics ml_generate_embedding_status \\\n", + "357 {\"token_count\":306,\"truncated\":false} \n", + "428 {\"token_count\":134,\"truncated\":false} \n", + "1319 {\"token_count\":215,\"truncated\":false} \n", + "1993 {\"token_count\":536,\"truncated\":false} \n", + "1997 {\"token_count\":123,\"truncated\":false} \n", + "2469 {\"token_count\":60,\"truncated\":false} \n", + "2624 {\"token_count\":254,\"truncated\":false} \n", + "2832 {\"token_count\":79,\"truncated\":false} \n", + "3328 {\"token_count\":156,\"truncated\":false} \n", + "3650 {\"token_count\":175,\"truncated\":false} \n", + "3860 {\"token_count\":1267,\"truncated\":false} \n", + "4464 {\"token_count\":906,\"truncated\":false} \n", + "4470 {\"token_count\":200,\"truncated\":false} \n", + "4567 {\"token_count\":110,\"truncated\":false} \n", + "4713 {\"token_count\":549,\"truncated\":false} \n", + "5181 {\"token_count\":77,\"truncated\":false} \n", + "5511 {\"token_count\":262,\"truncated\":false} \n", + "5888 {\"token_count\":176,\"truncated\":false} \n", + "6299 {\"token_count\":151,\"truncated\":false} \n", + "7143 {\"token_count\":234,\"truncated\":false} \n", + "7219 {\"token_count\":26,\"truncated\":false} \n", + "7574 {\"token_count\":129,\"truncated\":false} \n", + "8759 {\"token_count\":501,\"truncated\":false} \n", + "9700 {\"token_count\":48,\"truncated\":false} \n", + "9822 {\"token_count\":2373,\"truncated\":true} \n", "\n", - " content \n", - "545 My payments have been approximately {$89.00} w... \n", - "614 Hi, I have contacted Trans Union XXXX XXXX abo... \n", - "1236 I have a XXXX XXXX XXXX credit card on my Exp... \n", - "1477 Wrongs information, selling my information to ... \n", - "2261 Please investigate and delete disputed item th... \n", - "2361 By the provisions of the Fair Credit Reporting... \n", - "2378 Since XX/XX/XXXX I have been trying to dispute... \n", - "3133 Out of the blue I received a debt collection n... \n", - "3140 My wife and I have been sending money to XXXX ... \n", - "3322 Phone calls from Convergent Outsourcing XXXX. ... 
\n", - "3583 I recently received a copy of my credit report... \n", - "4134 I have been sending the creditor what they hav... \n", - "4496 This is my second complaint. Their response to... \n", - "5260 XX/XX/XXXX and XX/XX/XXXX, {$3200.00} contacte... \n", - "5400 Upon checking my XXXX credit report I noticed ... \n", - "5425 Follow up to previous complaint XXXX XXXX XXXX... \n", - "6014 My new XXXX lease was over always paid on time... \n", - "8192 I have no idea where this account cane from. B... \n", - "8240 I TIED TO BUY CAR AT XXXX, THEY GOT APPROVAL F... \n", - "8720 XXXX XXXX XXXX XXXX, NY XXXX XX/XX/XXXX Consum... \n", - "8914 On XX/XX/21 I sent a letter regarding inaccura... \n", - "10021 XX/XX/XXXX and XX/XX/XXXX inaccurate informati... \n", - "10327 When I reviewed my credit report, I discovered... \n", - "10345 U.S. Bank sent two letters containing Visa Deb... \n", - "10369 I requested from XXXX that they reverse the la... \n", + " content \n", + "357 I decided to try XXXX services for my wife and... \n", + "428 XXXX I went to the bank in question ( XXXX XXX... \n", + "1319 I currently have a home loan with my ex husban... \n", + "1993 NOT MY ACCOUNT, NOT AN AUTHORIZED USER {$1800.... \n", + "1997 After a while the payments became harder and h... \n", + "2469 In the course of my student loan, I have been ... \n", + "2624 In accordance with the Fair Credit Reporting A... \n", + "2832 LVNV FUNDING LLC is continually placing a coll... \n", + "3328 On XX/XX/2020 I sent a letter regarding inaccu... \n", + "3650 Over a year and a half ago we started the proc... \n", + "3860 The issue is 26 late payments on me and my wif... \n", + "4464 I purchased as replacement for a lost XXXX XXX... \n", + "4470 in accordance with the Fair Credit Reporting a... \n", + "4567 I have submitted multiple disputes through the... \n", + "4713 While shopping for furniture for my home I ope... \n", + "5181 I had opened a Wells Fargo checking account wi... \n", + "5511 I recently disputed ( see attached letter ) wi... \n", + "5888 XXXX XXXX XXXX XXXX \n", + "I have disputed this acco... \n", + "6299 XXXX ; XXXX and Transunion are reporting ( 30 ... \n", + "7143 My Macys account is due on the first of every ... \n", + "7219 Keep getting letters and calls from collection... \n", + "7574 On XXXX I was on the XXXX app and there was a ... \n", + "8759 Obviously I've been a victim of fraud, therefo... \n", + "9700 The following item have not been properly inve... \n", + "9822 During the housing market crash I went through... \n", "...\n", "\n", "[10000 rows x 4 columns]" @@ -822,19 +812,7 @@ { "data": { "text/html": [ - "Query job c78e1040-2a57-42f6-8fdb-5b9524846259 is DONE. 72.1 MB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 0986541b-3941-4387-b813-8888f53d149e is DONE. 0 Bytes processed. Open Job" + "Query job b4594edf-80e5-4476-ac06-b799001f4cb0 is DONE. 72.0 MB processed. Open Job" ], "text/plain": [ "" @@ -846,7 +824,7 @@ { "data": { "text/html": [ - "Query job 754aadd2-fee6-495c-acef-506f4e13c062 is DONE. 72.6 MB processed. Open Job" + "Query job 417e806a-2574-4b1b-8276-a95fa2df56e1 is DONE. 72.5 MB processed. 
Open Job" ], "text/plain": [ "" @@ -876,187 +854,188 @@ " \n", " \n", " \n", - " text_embedding\n", - " statistics\n", - " ml_embed_text_status\n", + " ml_generate_embedding_result\n", + " ml_generate_embedding_statistics\n", + " ml_generate_embedding_status\n", " content\n", " \n", " \n", " \n", " \n", - " 545\n", - " [ 1.82510037e-02 -1.27867460e-02 -1.57095697e-...\n", - " {\"token_count\":178,\"truncated\":false}\n", + " 357\n", + " [ 1.33585772e-02 -3.76597494e-02 -6.14452176e-...\n", + " {\"token_count\":306,\"truncated\":false}\n", " \n", - " My payments have been approximately {$89.00} w...\n", + " I decided to try XXXX services for my wife and...\n", " \n", " \n", - " 614\n", - " [ 5.40032536e-02 -5.28502129e-02 -5.33268750e-...\n", - " {\"token_count\":399,\"truncated\":false}\n", + " 428\n", + " [ 3.10036819e-04 -3.82593311e-02 -3.41922641e-...\n", + " {\"token_count\":134,\"truncated\":false}\n", " \n", - " Hi, I have contacted Trans Union XXXX XXXX abo...\n", + " XXXX I went to the bank in question ( XXXX XXX...\n", " \n", " \n", - " 1236\n", - " [-5.32836001e-03 -5.84292673e-02 -5.86670786e-...\n", - " {\"token_count\":129,\"truncated\":false}\n", + " 1319\n", + " [ 1.97481886e-02 -1.97448786e-02 -5.13443351e-...\n", + " {\"token_count\":215,\"truncated\":false}\n", " \n", - " I have a XXXX XXXX XXXX credit card on my Exp...\n", + " I currently have a home loan with my ex husban...\n", " \n", " \n", - " 1477\n", - " [ 3.02605387e-02 -4.37121317e-02 -2.70802993e-...\n", - " {\"token_count\":16,\"truncated\":false}\n", + " 1993\n", + " [ 9.83821880e-03 -6.55664057e-02 -5.46210706e-...\n", + " {\"token_count\":536,\"truncated\":false}\n", " \n", - " Wrongs information, selling my information to ...\n", + " NOT MY ACCOUNT, NOT AN AUTHORIZED USER {$1800....\n", " \n", " \n", - " 2261\n", - " [ 2.35723313e-02 -3.73509154e-02 -6.44604117e-...\n", - " {\"token_count\":33,\"truncated\":false}\n", + " 1997\n", + " [ 0.03145148 -0.01011822 -0.02316323 -0.025078...\n", + " {\"token_count\":123,\"truncated\":false}\n", " \n", - " Please investigate and delete disputed item th...\n", + " After a while the payments became harder and h...\n", " \n", " \n", - " 2361\n", - " [ 1.04440488e-02 -9.37070698e-03 -7.36323372e-...\n", - " {\"token_count\":45,\"truncated\":false}\n", + " 2469\n", + " [ 4.74590808e-03 -4.56819348e-02 -2.49751769e-...\n", + " {\"token_count\":60,\"truncated\":false}\n", " \n", - " By the provisions of the Fair Credit Reporting...\n", + " In the course of my student loan, I have been ...\n", " \n", " \n", - " 2378\n", - " [ 3.04989032e-02 -4.08191867e-02 -6.18648790e-...\n", - " {\"token_count\":892,\"truncated\":false}\n", + " 2624\n", + " [ 3.91883589e-03 -3.26644145e-02 -7.10378587e-...\n", + " {\"token_count\":254,\"truncated\":false}\n", " \n", - " Since XX/XX/XXXX I have been trying to dispute...\n", + " In accordance with the Fair Credit Reporting A...\n", " \n", " \n", - " 3133\n", - " [ 0.00152804 -0.04189068 -0.04220504 -0.053740...\n", - " {\"token_count\":90,\"truncated\":false}\n", + " 2832\n", + " [ 8.35181400e-03 -2.91643552e-02 -4.30776961e-...\n", + " {\"token_count\":79,\"truncated\":false}\n", " \n", - " Out of the blue I received a debt collection n...\n", + " LVNV FUNDING LLC is continually placing a coll...\n", " \n", " \n", - " 3140\n", - " [ 3.11435573e-02 -4.44000624e-02 -2.10917685e-...\n", - " {\"token_count\":372,\"truncated\":false}\n", + " 3328\n", + " [ 2.71253809e-02 -1.77491009e-02 -5.32273464e-...\n", + " 
{\"token_count\":156,\"truncated\":false}\n", " \n", - " My wife and I have been sending money to XXXX ...\n", + " On XX/XX/2020 I sent a letter regarding inaccu...\n", " \n", " \n", - " 3322\n", - " [ 2.75927987e-02 -6.23729872e-03 -3.83295454e-...\n", - " {\"token_count\":36,\"truncated\":false}\n", + " 3650\n", + " [-6.10093866e-03 -5.93599863e-02 -8.04531425e-...\n", + " {\"token_count\":175,\"truncated\":false}\n", " \n", - " Phone calls from Convergent Outsourcing XXXX. ...\n", + " Over a year and a half ago we started the proc...\n", " \n", " \n", - " 3583\n", - " [ 9.20385588e-03 -3.83387171e-02 -6.46291822e-...\n", - " {\"token_count\":52,\"truncated\":false}\n", + " 3860\n", + " [ 5.84836192e-02 -2.43354496e-03 -5.57337068e-...\n", + " {\"token_count\":1267,\"truncated\":false}\n", " \n", - " I recently received a copy of my credit report...\n", + " The issue is 26 late payments on me and my wif...\n", " \n", " \n", - " 4134\n", - " [-7.04960374e-04 -3.52595337e-02 -1.65264793e-...\n", - " {\"token_count\":412,\"truncated\":false}\n", + " 4464\n", + " [ 6.05084226e-02 -3.21578234e-02 -7.51668587e-...\n", + " {\"token_count\":906,\"truncated\":false}\n", " \n", - " I have been sending the creditor what they hav...\n", + " I purchased as replacement for a lost XXXX XXX...\n", " \n", " \n", - " 4496\n", - " [ 3.67735326e-02 1.21120387e-03 -5.20942472e-...\n", - " {\"token_count\":182,\"truncated\":false}\n", + " 4470\n", + " [ 1.28689921e-02 -3.25881056e-02 -6.53645024e-...\n", + " {\"token_count\":200,\"truncated\":false}\n", " \n", - " This is my second complaint. Their response to...\n", + " in accordance with the Fair Credit Reporting a...\n", " \n", " \n", - " 5260\n", - " [ 2.07133405e-02 -1.69602726e-02 -5.07124476e-...\n", - " {\"token_count\":103,\"truncated\":false}\n", + " 4567\n", + " [-5.49167022e-03 -3.84587422e-02 -8.56091827e-...\n", + " {\"token_count\":110,\"truncated\":false}\n", " \n", - " XX/XX/XXXX and XX/XX/XXXX, {$3200.00} contacte...\n", + " I have submitted multiple disputes through the...\n", " \n", " \n", - " 5400\n", - " [ 1.44114876e-02 -2.34710164e-02 -6.58538565e-...\n", - " {\"token_count\":60,\"truncated\":false}\n", + " 4713\n", + " [ 2.68485844e-02 -3.46762352e-02 -4.59849052e-...\n", + " {\"token_count\":549,\"truncated\":false}\n", " \n", - " Upon checking my XXXX credit report I noticed ...\n", + " While shopping for furniture for my home I ope...\n", " \n", " \n", - " 5425\n", - " [ 3.10326386e-02 -2.19427086e-02 -6.56386837e-...\n", - " {\"token_count\":87,\"truncated\":false}\n", + " 5181\n", + " [ 2.05754172e-02 -3.83999050e-02 -9.29225236e-...\n", + " {\"token_count\":77,\"truncated\":false}\n", " \n", - " Follow up to previous complaint XXXX XXXX XXXX...\n", + " I had opened a Wells Fargo checking account wi...\n", " \n", " \n", - " 6014\n", - " [ 1.90773793e-02 -2.27493346e-02 -3.27166244e-...\n", - " {\"token_count\":175,\"truncated\":false}\n", + " 5511\n", + " [-0.00217485 -0.04031368 -0.06604777 -0.052006...\n", + " {\"token_count\":262,\"truncated\":false}\n", " \n", - " My new XXXX lease was over always paid on time...\n", + " I recently disputed ( see attached letter ) wi...\n", " \n", " \n", - " 8192\n", - " [ 0.01937891 -0.05466933 -0.06070872 -0.059028...\n", - " {\"token_count\":131,\"truncated\":false}\n", + " 5888\n", + " [-8.15972779e-03 -3.46563384e-02 -5.91776446e-...\n", + " {\"token_count\":176,\"truncated\":false}\n", " \n", - " I have no idea where this account cane from. 
B...\n", + " XXXX XXXX XXXX XXXX \n", + "I have disputed this acco...\n", " \n", " \n", - " 8240\n", - " [ 4.34123818e-03 -3.40953320e-02 -4.06381376e-...\n", - " {\"token_count\":87,\"truncated\":false}\n", + " 6299\n", + " [ 4.80043218e-02 -4.13420722e-02 -6.12363108e-...\n", + " {\"token_count\":151,\"truncated\":false}\n", " \n", - " I TIED TO BUY CAR AT XXXX, THEY GOT APPROVAL F...\n", + " XXXX ; XXXX and Transunion are reporting ( 30 ...\n", " \n", " \n", - " 8720\n", - " [ 0.03133732 -0.03972461 -0.00178199 -0.035876...\n", - " {\"token_count\":645,\"truncated\":false}\n", + " 7143\n", + " [ 4.39200476e-02 -3.04005221e-02 -3.47866341e-...\n", + " {\"token_count\":234,\"truncated\":false}\n", " \n", - " XXXX XXXX XXXX XXXX, NY XXXX XX/XX/XXXX Consum...\n", + " My Macys account is due on the first of every ...\n", " \n", " \n", - " 8914\n", - " [ 1.75969116e-02 -2.25022305e-02 -5.70390299e-...\n", - " {\"token_count\":180,\"truncated\":false}\n", + " 7219\n", + " [ 1.00224940e-02 -3.79302073e-03 -3.41785327e-...\n", + " {\"token_count\":26,\"truncated\":false}\n", " \n", - " On XX/XX/21 I sent a letter regarding inaccura...\n", + " Keep getting letters and calls from collection...\n", " \n", " \n", - " 10021\n", - " [ 5.02460636e-02 -5.25112189e-02 -4.12914790e-...\n", - " {\"token_count\":30,\"truncated\":false}\n", + " 7574\n", + " [-0.00149564 -0.06619431 -0.05084481 -0.048579...\n", + " {\"token_count\":129,\"truncated\":false}\n", " \n", - " XX/XX/XXXX and XX/XX/XXXX inaccurate informati...\n", + " On XXXX I was on the XXXX app and there was a ...\n", " \n", " \n", - " 10327\n", - " [-0.00979626 -0.04912931 -0.08654705 -0.021063...\n", - " {\"token_count\":194,\"truncated\":false}\n", + " 8759\n", + " [ 0.01501553 -0.03575936 -0.050562 -0.034884...\n", + " {\"token_count\":501,\"truncated\":false}\n", " \n", - " When I reviewed my credit report, I discovered...\n", + " Obviously I've been a victim of fraud, therefo...\n", " \n", " \n", - " 10345\n", - " [-0.04292191 -0.02636929 -0.06177032 -0.076520...\n", - " {\"token_count\":262,\"truncated\":false}\n", + " 9700\n", + " [ 1.01501048e-02 -2.80565154e-02 -4.05892394e-...\n", + " {\"token_count\":48,\"truncated\":false}\n", " \n", - " U.S. Bank sent two letters containing Visa Deb...\n", + " The following item have not been properly inve...\n", " \n", " \n", - " 10369\n", - " [ 2.16020197e-02 -5.62509745e-02 -5.93873672e-...\n", - " {\"token_count\":77,\"truncated\":false}\n", + " 9822\n", + " [ 2.95880195e-02 1.65440738e-02 -3.33247967e-...\n", + " {\"token_count\":2373,\"truncated\":true}\n", " \n", - " I requested from XXXX that they reverse the la...\n", + " During the housing market crash I went through...\n", " \n", " \n", "\n", @@ -1064,86 +1043,87 @@ "[10000 rows x 4 columns in total]" ], "text/plain": [ - " text_embedding \\\n", - "545 [ 1.82510037e-02 -1.27867460e-02 -1.57095697e-... \n", - "614 [ 5.40032536e-02 -5.28502129e-02 -5.33268750e-... \n", - "1236 [-5.32836001e-03 -5.84292673e-02 -5.86670786e-... \n", - "1477 [ 3.02605387e-02 -4.37121317e-02 -2.70802993e-... \n", - "2261 [ 2.35723313e-02 -3.73509154e-02 -6.44604117e-... \n", - "2361 [ 1.04440488e-02 -9.37070698e-03 -7.36323372e-... \n", - "2378 [ 3.04989032e-02 -4.08191867e-02 -6.18648790e-... \n", - "3133 [ 0.00152804 -0.04189068 -0.04220504 -0.053740... \n", - "3140 [ 3.11435573e-02 -4.44000624e-02 -2.10917685e-... \n", - "3322 [ 2.75927987e-02 -6.23729872e-03 -3.83295454e-... \n", - "3583 [ 9.20385588e-03 -3.83387171e-02 -6.46291822e-... 
\n", - "4134 [-7.04960374e-04 -3.52595337e-02 -1.65264793e-... \n", - "4496 [ 3.67735326e-02 1.21120387e-03 -5.20942472e-... \n", - "5260 [ 2.07133405e-02 -1.69602726e-02 -5.07124476e-... \n", - "5400 [ 1.44114876e-02 -2.34710164e-02 -6.58538565e-... \n", - "5425 [ 3.10326386e-02 -2.19427086e-02 -6.56386837e-... \n", - "6014 [ 1.90773793e-02 -2.27493346e-02 -3.27166244e-... \n", - "8192 [ 0.01937891 -0.05466933 -0.06070872 -0.059028... \n", - "8240 [ 4.34123818e-03 -3.40953320e-02 -4.06381376e-... \n", - "8720 [ 0.03133732 -0.03972461 -0.00178199 -0.035876... \n", - "8914 [ 1.75969116e-02 -2.25022305e-02 -5.70390299e-... \n", - "10021 [ 5.02460636e-02 -5.25112189e-02 -4.12914790e-... \n", - "10327 [-0.00979626 -0.04912931 -0.08654705 -0.021063... \n", - "10345 [-0.04292191 -0.02636929 -0.06177032 -0.076520... \n", - "10369 [ 2.16020197e-02 -5.62509745e-02 -5.93873672e-... \n", + " ml_generate_embedding_result \\\n", + "357 [ 1.33585772e-02 -3.76597494e-02 -6.14452176e-... \n", + "428 [ 3.10036819e-04 -3.82593311e-02 -3.41922641e-... \n", + "1319 [ 1.97481886e-02 -1.97448786e-02 -5.13443351e-... \n", + "1993 [ 9.83821880e-03 -6.55664057e-02 -5.46210706e-... \n", + "1997 [ 0.03145148 -0.01011822 -0.02316323 -0.025078... \n", + "2469 [ 4.74590808e-03 -4.56819348e-02 -2.49751769e-... \n", + "2624 [ 3.91883589e-03 -3.26644145e-02 -7.10378587e-... \n", + "2832 [ 8.35181400e-03 -2.91643552e-02 -4.30776961e-... \n", + "3328 [ 2.71253809e-02 -1.77491009e-02 -5.32273464e-... \n", + "3650 [-6.10093866e-03 -5.93599863e-02 -8.04531425e-... \n", + "3860 [ 5.84836192e-02 -2.43354496e-03 -5.57337068e-... \n", + "4464 [ 6.05084226e-02 -3.21578234e-02 -7.51668587e-... \n", + "4470 [ 1.28689921e-02 -3.25881056e-02 -6.53645024e-... \n", + "4567 [-5.49167022e-03 -3.84587422e-02 -8.56091827e-... \n", + "4713 [ 2.68485844e-02 -3.46762352e-02 -4.59849052e-... \n", + "5181 [ 2.05754172e-02 -3.83999050e-02 -9.29225236e-... \n", + "5511 [-0.00217485 -0.04031368 -0.06604777 -0.052006... \n", + "5888 [-8.15972779e-03 -3.46563384e-02 -5.91776446e-... \n", + "6299 [ 4.80043218e-02 -4.13420722e-02 -6.12363108e-... \n", + "7143 [ 4.39200476e-02 -3.04005221e-02 -3.47866341e-... \n", + "7219 [ 1.00224940e-02 -3.79302073e-03 -3.41785327e-... \n", + "7574 [-0.00149564 -0.06619431 -0.05084481 -0.048579... \n", + "8759 [ 0.01501553 -0.03575936 -0.050562 -0.034884... \n", + "9700 [ 1.01501048e-02 -2.80565154e-02 -4.05892394e-... \n", + "9822 [ 2.95880195e-02 1.65440738e-02 -3.33247967e-... 
\n", "\n", - " statistics ml_embed_text_status \\\n", - "545 {\"token_count\":178,\"truncated\":false} \n", - "614 {\"token_count\":399,\"truncated\":false} \n", - "1236 {\"token_count\":129,\"truncated\":false} \n", - "1477 {\"token_count\":16,\"truncated\":false} \n", - "2261 {\"token_count\":33,\"truncated\":false} \n", - "2361 {\"token_count\":45,\"truncated\":false} \n", - "2378 {\"token_count\":892,\"truncated\":false} \n", - "3133 {\"token_count\":90,\"truncated\":false} \n", - "3140 {\"token_count\":372,\"truncated\":false} \n", - "3322 {\"token_count\":36,\"truncated\":false} \n", - "3583 {\"token_count\":52,\"truncated\":false} \n", - "4134 {\"token_count\":412,\"truncated\":false} \n", - "4496 {\"token_count\":182,\"truncated\":false} \n", - "5260 {\"token_count\":103,\"truncated\":false} \n", - "5400 {\"token_count\":60,\"truncated\":false} \n", - "5425 {\"token_count\":87,\"truncated\":false} \n", - "6014 {\"token_count\":175,\"truncated\":false} \n", - "8192 {\"token_count\":131,\"truncated\":false} \n", - "8240 {\"token_count\":87,\"truncated\":false} \n", - "8720 {\"token_count\":645,\"truncated\":false} \n", - "8914 {\"token_count\":180,\"truncated\":false} \n", - "10021 {\"token_count\":30,\"truncated\":false} \n", - "10327 {\"token_count\":194,\"truncated\":false} \n", - "10345 {\"token_count\":262,\"truncated\":false} \n", - "10369 {\"token_count\":77,\"truncated\":false} \n", + " ml_generate_embedding_statistics ml_generate_embedding_status \\\n", + "357 {\"token_count\":306,\"truncated\":false} \n", + "428 {\"token_count\":134,\"truncated\":false} \n", + "1319 {\"token_count\":215,\"truncated\":false} \n", + "1993 {\"token_count\":536,\"truncated\":false} \n", + "1997 {\"token_count\":123,\"truncated\":false} \n", + "2469 {\"token_count\":60,\"truncated\":false} \n", + "2624 {\"token_count\":254,\"truncated\":false} \n", + "2832 {\"token_count\":79,\"truncated\":false} \n", + "3328 {\"token_count\":156,\"truncated\":false} \n", + "3650 {\"token_count\":175,\"truncated\":false} \n", + "3860 {\"token_count\":1267,\"truncated\":false} \n", + "4464 {\"token_count\":906,\"truncated\":false} \n", + "4470 {\"token_count\":200,\"truncated\":false} \n", + "4567 {\"token_count\":110,\"truncated\":false} \n", + "4713 {\"token_count\":549,\"truncated\":false} \n", + "5181 {\"token_count\":77,\"truncated\":false} \n", + "5511 {\"token_count\":262,\"truncated\":false} \n", + "5888 {\"token_count\":176,\"truncated\":false} \n", + "6299 {\"token_count\":151,\"truncated\":false} \n", + "7143 {\"token_count\":234,\"truncated\":false} \n", + "7219 {\"token_count\":26,\"truncated\":false} \n", + "7574 {\"token_count\":129,\"truncated\":false} \n", + "8759 {\"token_count\":501,\"truncated\":false} \n", + "9700 {\"token_count\":48,\"truncated\":false} \n", + "9822 {\"token_count\":2373,\"truncated\":true} \n", "\n", - " content \n", - "545 My payments have been approximately {$89.00} w... \n", - "614 Hi, I have contacted Trans Union XXXX XXXX abo... \n", - "1236 I have a XXXX XXXX XXXX credit card on my Exp... \n", - "1477 Wrongs information, selling my information to ... \n", - "2261 Please investigate and delete disputed item th... \n", - "2361 By the provisions of the Fair Credit Reporting... \n", - "2378 Since XX/XX/XXXX I have been trying to dispute... \n", - "3133 Out of the blue I received a debt collection n... \n", - "3140 My wife and I have been sending money to XXXX ... \n", - "3322 Phone calls from Convergent Outsourcing XXXX. ... 
\n", - "3583 I recently received a copy of my credit report... \n", - "4134 I have been sending the creditor what they hav... \n", - "4496 This is my second complaint. Their response to... \n", - "5260 XX/XX/XXXX and XX/XX/XXXX, {$3200.00} contacte... \n", - "5400 Upon checking my XXXX credit report I noticed ... \n", - "5425 Follow up to previous complaint XXXX XXXX XXXX... \n", - "6014 My new XXXX lease was over always paid on time... \n", - "8192 I have no idea where this account cane from. B... \n", - "8240 I TIED TO BUY CAR AT XXXX, THEY GOT APPROVAL F... \n", - "8720 XXXX XXXX XXXX XXXX, NY XXXX XX/XX/XXXX Consum... \n", - "8914 On XX/XX/21 I sent a letter regarding inaccura... \n", - "10021 XX/XX/XXXX and XX/XX/XXXX inaccurate informati... \n", - "10327 When I reviewed my credit report, I discovered... \n", - "10345 U.S. Bank sent two letters containing Visa Deb... \n", - "10369 I requested from XXXX that they reverse the la... \n", + " content \n", + "357 I decided to try XXXX services for my wife and... \n", + "428 XXXX I went to the bank in question ( XXXX XXX... \n", + "1319 I currently have a home loan with my ex husban... \n", + "1993 NOT MY ACCOUNT, NOT AN AUTHORIZED USER {$1800.... \n", + "1997 After a while the payments became harder and h... \n", + "2469 In the course of my student loan, I have been ... \n", + "2624 In accordance with the Fair Credit Reporting A... \n", + "2832 LVNV FUNDING LLC is continually placing a coll... \n", + "3328 On XX/XX/2020 I sent a letter regarding inaccu... \n", + "3650 Over a year and a half ago we started the proc... \n", + "3860 The issue is 26 late payments on me and my wif... \n", + "4464 I purchased as replacement for a lost XXXX XXX... \n", + "4470 in accordance with the Fair Credit Reporting a... \n", + "4567 I have submitted multiple disputes through the... \n", + "4713 While shopping for furniture for my home I ope... \n", + "5181 I had opened a Wells Fargo checking account wi... \n", + "5511 I recently disputed ( see attached letter ) wi... \n", + "5888 XXXX XXXX XXXX XXXX \n", + "I have disputed this acco... \n", + "6299 XXXX ; XXXX and Transunion are reporting ( 30 ... \n", + "7143 My Macys account is due on the first of every ... \n", + "7219 Keep getting letters and calls from collection... \n", + "7574 On XXXX I was on the XXXX app and there was a ... \n", + "8759 Obviously I've been a victim of fraud, therefo... \n", + "9700 The following item have not been properly inve... \n", + "9822 During the housing market crash I went through... \n", "...\n", "\n", "[10000 rows x 4 columns]" @@ -1156,10 +1136,10 @@ ], "source": [ "successful_rows = (\n", - " (predicted_embeddings[\"ml_embed_text_status\"] == \"\")\n", + " (predicted_embeddings[\"ml_generate_embedding_status\"] == \"\")\n", " # Series.str.len() gives the length of an array.\n", " # See: https://stackoverflow.com/a/41340543/101923\n", - " & (predicted_embeddings[\"text_embedding\"].str.len() != 0)\n", + " & (predicted_embeddings[\"ml_generate_embedding_result\"].str.len() != 0)\n", ")\n", "predicted_embeddings = predicted_embeddings[successful_rows]\n", "predicted_embeddings\n" @@ -1214,7 +1194,7 @@ { "data": { "text/html": [ - "Query job fa4bbc13-3831-4c80-9b59-9939e605ed58 is DONE. 61.7 MB processed. Open Job" + "Query job 18aa46ee-0b10-4912-ae14-87b7e81ee447 is DONE. 61.7 MB processed. Open Job" ], "text/plain": [ "" @@ -1226,7 +1206,7 @@ { "data": { "text/html": [ - "Query job d2d681aa-e49a-4fda-89fd-60cf906d3aec is DONE. 
0 Bytes processed. Open Job" + "Query job fd573f97-2424-472a-969d-463f184967d9 is DONE. 0 Bytes processed. Open Job" ], "text/plain": [ "" @@ -1238,7 +1218,7 @@ { "data": { "text/html": [ - "Query job 234bb6be-625c-4c96-baea-c37c33410114 is DONE. 72.7 MB processed. Open Job" + "Query job 9f2e0a3f-d7d6-4fb8-b558-95f39235410d is DONE. 72.7 MB processed. Open Job" ], "text/plain": [ "" @@ -1250,7 +1230,7 @@ { "data": { "text/html": [ - "Query job 285817cb-99d3-426f-82c3-89d36119e8db is DONE. 80.0 kB processed. Open Job" + "Query job 786ababe-7c40-426f-bb39-154329e4c51a is DONE. 80.0 kB processed. Open Job" ], "text/plain": [ "" @@ -1262,7 +1242,7 @@ { "data": { "text/html": [ - "Query job 3a39d2b0-55a1-4922-972a-8806b387f877 is DONE. 73.3 MB processed. Open Job" + "Query job a191fc97-baa6-4c7c-b78f-4365678caa60 is DONE. 73.2 MB processed. Open Job" ], "text/plain": [ "" @@ -1294,57 +1274,57 @@ " \n", " CENTROID_ID\n", " NEAREST_CENTROIDS_DISTANCE\n", - " text_embedding\n", - " statistics\n", - " ml_embed_text_status\n", + " ml_generate_embedding_result\n", + " ml_generate_embedding_statistics\n", + " ml_generate_embedding_status\n", " content\n", " \n", " \n", " \n", " \n", - " 182250\n", + " 1244571\n", " 1\n", - " [{'CENTROID_ID': 1, 'DISTANCE': 0.570560301900...\n", - " [ 4.70298417e-02 -4.08669300e-02 -2.99868709e-...\n", - " {\"token_count\":10,\"truncated\":false}\n", + " [{'CENTROID_ID': 1, 'DISTANCE': 0.414497263076...\n", + " [ 1.10590272e-02 -2.11433582e-02 -5.66212423e-...\n", + " {\"token_count\":100,\"truncated\":false}\n", " \n", - " These are not my accounts. Please remove them.\n", + " Ive disputed two Bankruptcies that still exist...\n", " \n", " \n", - " 3023485\n", + " 744390\n", " 1\n", - " [{'CENTROID_ID': 1, 'DISTANCE': 0.523572693768...\n", - " [ 1.55437263e-02 -1.93240177e-02 -2.48466972e-...\n", - " {\"token_count\":10,\"truncated\":false}\n", + " [{'CENTROID_ID': 1, 'DISTANCE': 0.416584344032...\n", + " [ 4.15011719e-02 -4.50705849e-02 -7.35541508e-...\n", + " {\"token_count\":100,\"truncated\":false}\n", " \n", - " This debt is not mine due to identity theft.\n", + " The XXXX account was settled as a class action...\n", " \n", " \n", - " 407254\n", + " 127514\n", " 1\n", - " [{'CENTROID_ID': 1, 'DISTANCE': 0.515173566816...\n", - " [-0.01293471 -0.01959546 -0.02238463 -0.066214...\n", - " {\"token_count\":10,\"truncated\":false}\n", + " [{'CENTROID_ID': 1, 'DISTANCE': 0.541137734253...\n", + " [ 3.54415141e-02 1.23769706e-02 -2.61783414e-...\n", + " {\"token_count\":100,\"truncated\":false}\n", " \n", - " I do not owe this company money!!!!!\n", + " I have late payments reported on my student lo...\n", " \n", " \n", - " 1509454\n", + " 630563\n", " 1\n", - " [{'CENTROID_ID': 1, 'DISTANCE': 0.645342721754...\n", - " [ 3.21860723e-02 -2.67103072e-02 -4.78175096e-...\n", - " {\"token_count\":10,\"truncated\":false}\n", + " [{'CENTROID_ID': 1, 'DISTANCE': 0.477175150810...\n", + " [ 2.34235693e-02 -4.21241224e-02 -3.90484147e-...\n", + " {\"token_count\":100,\"truncated\":false}\n", " \n", - " VIOLATES HIPPA AND CRA\n", + " A Military Star Credit card, aka Take it Home ...\n", " \n", " \n", - " 2357848\n", + " 2651231\n", " 1\n", - " [{'CENTROID_ID': 1, 'DISTANCE': 0.519872186251...\n", - " [-1.88122243e-02 -2.68064123e-02 -4.69480827e-...\n", - " {\"token_count\":10,\"truncated\":false}\n", + " [{'CENTROID_ID': 1, 'DISTANCE': 0.489760037964...\n", + " [ 2.64898203e-02 -5.62610961e-02 -5.82714193e-...\n", + " {\"token_count\":101,\"truncated\":false}\n", " \n", - " 
Receive numerous phone calls. I have no debt.\n", + " My mortgage is with Bank of America. I filed C...\n", " \n", " \n", "\n", @@ -1352,32 +1332,32 @@ ], "text/plain": [ " CENTROID_ID NEAREST_CENTROIDS_DISTANCE \\\n", - "182250 1 [{'CENTROID_ID': 1, 'DISTANCE': 0.570560301900... \n", - "3023485 1 [{'CENTROID_ID': 1, 'DISTANCE': 0.523572693768... \n", - "407254 1 [{'CENTROID_ID': 1, 'DISTANCE': 0.515173566816... \n", - "1509454 1 [{'CENTROID_ID': 1, 'DISTANCE': 0.645342721754... \n", - "2357848 1 [{'CENTROID_ID': 1, 'DISTANCE': 0.519872186251... \n", + "1244571 1 [{'CENTROID_ID': 1, 'DISTANCE': 0.414497263076... \n", + "744390 1 [{'CENTROID_ID': 1, 'DISTANCE': 0.416584344032... \n", + "127514 1 [{'CENTROID_ID': 1, 'DISTANCE': 0.541137734253... \n", + "630563 1 [{'CENTROID_ID': 1, 'DISTANCE': 0.477175150810... \n", + "2651231 1 [{'CENTROID_ID': 1, 'DISTANCE': 0.489760037964... \n", "\n", - " text_embedding \\\n", - "182250 [ 4.70298417e-02 -4.08669300e-02 -2.99868709e-... \n", - "3023485 [ 1.55437263e-02 -1.93240177e-02 -2.48466972e-... \n", - "407254 [-0.01293471 -0.01959546 -0.02238463 -0.066214... \n", - "1509454 [ 3.21860723e-02 -2.67103072e-02 -4.78175096e-... \n", - "2357848 [-1.88122243e-02 -2.68064123e-02 -4.69480827e-... \n", + " ml_generate_embedding_result \\\n", + "1244571 [ 1.10590272e-02 -2.11433582e-02 -5.66212423e-... \n", + "744390 [ 4.15011719e-02 -4.50705849e-02 -7.35541508e-... \n", + "127514 [ 3.54415141e-02 1.23769706e-02 -2.61783414e-... \n", + "630563 [ 2.34235693e-02 -4.21241224e-02 -3.90484147e-... \n", + "2651231 [ 2.64898203e-02 -5.62610961e-02 -5.82714193e-... \n", "\n", - " statistics ml_embed_text_status \\\n", - "182250 {\"token_count\":10,\"truncated\":false} \n", - "3023485 {\"token_count\":10,\"truncated\":false} \n", - "407254 {\"token_count\":10,\"truncated\":false} \n", - "1509454 {\"token_count\":10,\"truncated\":false} \n", - "2357848 {\"token_count\":10,\"truncated\":false} \n", + " ml_generate_embedding_statistics ml_generate_embedding_status \\\n", + "1244571 {\"token_count\":100,\"truncated\":false} \n", + "744390 {\"token_count\":100,\"truncated\":false} \n", + "127514 {\"token_count\":100,\"truncated\":false} \n", + "630563 {\"token_count\":100,\"truncated\":false} \n", + "2651231 {\"token_count\":101,\"truncated\":false} \n", "\n", - " content \n", - "182250 These are not my accounts. Please remove them. \n", - "3023485 This debt is not mine due to identity theft. \n", - "407254 I do not owe this company money!!!!! \n", - "1509454 VIOLATES HIPPA AND CRA \n", - "2357848 Receive numerous phone calls. I have no debt. " + " content \n", + "1244571 Ive disputed two Bankruptcies that still exist... \n", + "744390 The XXXX account was settled as a class action... \n", + "127514 I have late payments reported on my student lo... \n", + "630563 A Military Star Credit card, aka Take it Home ... \n", + "2651231 My mortgage is with Bank of America. I filed C... " ] }, "execution_count": 13, @@ -1387,7 +1367,7 @@ ], "source": [ "# Use KMeans clustering to calculate our groups. Will take ~3 minutes.\n", - "cluster_model.fit(predicted_embeddings[[\"text_embedding\"]])\n", + "cluster_model.fit(predicted_embeddings[[\"ml_generate_embedding_result\"]])\n", "clustered_result = cluster_model.predict(predicted_embeddings)\n", "# Notice the CENTROID_ID column, which is the ID number of the group that\n", "# each complaint belongs to.\n", @@ -1430,7 +1410,7 @@ { "data": { "text/html": [ - "Query job 85ead687-4ba9-44bf-88da-23a066f45960 is DONE. 10.7 MB processed. 
Open Job" + "Query job 8bfc647f-b9e5-40a2-816c-d12e8f81bea3 is DONE. 10.6 MB processed. Open Job" ], "text/plain": [ "" @@ -1442,7 +1422,7 @@ { "data": { "text/html": [ - "Query job 68ef20cd-220d-40a9-bb42-63ed3d6f5d3f is DONE. 10.7 MB processed. Open Job" + "Query job 6f834214-9cc3-4577-bb2d-980ba05df817 is DONE. 10.6 MB processed. Open Job" ], "text/plain": [ "" @@ -1478,42 +1458,62 @@ "output_type": "stream", "text": [ "comment list 1:\n", - "1. Wrongs information, selling my information to third party. Incorrect reporting\n", - "2. I TIED TO BUY CAR AT XXXX, THEY GOT APPROVAL FROM XXXX XXXX XXXX XXXX WHICH ENDED A A LIE. THEY ALSO, PULLED MANY OTHERS I DID NT AGREED TO. SOLF PULLS ONLY\n", - "3. XX/XX/XXXX and XX/XX/XXXX inaccurate information reported 30 days late.\n", - "4. Im working on myCredit and I see a couple of inquiries that I have no idea where they came from.\n", - "5. I request a copy of all dispute results and documentary evidence from XXXX, and XXXX mailed to me\n", + "1. I currently have a home loan with my ex husband with PHH Mortgages. We filed for divorce and in the divorce decree he became liable for the home and paying the payments. He ended up missing XXXX payments which effected my credit fairly substaintailly. when I became aware of the late payments, I ensured that the account was up to date and have since. I presented to them that I have the legal documents that he is obligated to make the payments each month and that I am not responisble for the payment. I asked them to remove the XXXX dings on my credit and they would not. I offered to present the paperwork and they still would not. The home is now being sold. I even filed with XXXX as a discrepency and they would not remove it. I would have never let these become a late payment. I was not even notified as they had all of his information in the file.\n", + "2. In the course of my student loan, I have been making payments and I feel that the payments haven't been added to the debt, the company stated that I am delinquent over 180 and my payments are auto pay. This has had a negative impact on my credit score.\n", + "3. The issue is 26 late payments on me and my wife 's credit reports due to a system error on a joint mortgage account that was always paid on time using autopay. ( will attach docs to support this ). \n", "\n", - "comment list 2:\n", - "1. My wife and I have been sending money to XXXX via my brother-in-law to finish a building project we have been working on since XXXX with target date of completion by XX/XX/XXXX. In XXXX XXXX my brother-in-law in was contacted by his bank to confirm he was not defrauding my wife. My brother-in-law confirmed he was helping to handle the building project by organizing and paying the workers. In XXXX XXXX Bank of America reach out to my wife to update her profile to avoid account restrictions. My wife 's account was eventually restricted until she called and confirmed her employment and other personal information. My wife 's full account activities were then restored and we continued sending wire transfers to XXXX via her checking account. \n", - "Then I received a letter dated XXXX XXXX XXXX from Bank of America stating the money market account I share with my wife which has been opened since XXXX will be will be restricted from use in 21 days and closed in 30 days with no reason. I strongly believe this is a result of the legal international wires because there was no reason to close the Savings account which had with hardly any activity. 
\n", - "I agree that Bank of America has a right to close accounts but I do not agree with Bank of America closing accounts because of international transactions unless they can prove fraud, criminal activity or support for terrorism, this is discriminatory towards foreign nationals. How are foreign nationals suppose to make investments or support their family/community if they are excluded from the banking system?\n", - "2. XXXX XXXX XXXX XXXX, NY XXXX XX/XX/XXXX Consumer Financial Protection Bureau XXXX XXXX XXXX XXXX, IA XXXX Dear Sir or Madam : In XX/XX/XXXX Out of the blue JP Morgan Chase arbitrarily closed my account. This was after my mother is a XXXX survivor who is over XXXX years old and for whom I have a general power of attorney and take care of her bill paying was questioned about a transaction. She is also XXXX XXXX. \n", + "This is an ongoing nightmare me and my wife are going through over the past 3 years. \n", + "Sent many dispute letters to the creditor and to the 3 bureaus, was promised multiple times that all late payments will be removed, we also has a letter from the bank stating we were never late on this account, also have a recording of a phone call with bank 's permission were the representative admits there was a system error and promised again that all late payments will be deleted from both of our credit reports. \n", + "As of today, for an unknown reason XXXX reports 6x30 days late payments, XXXX reports 24 lates, and Transunion 23 lates. \n", + "\n", + "We have always paid our mortgage on time for many years, enrolled in autopay and making 2 payments per month. Our mortgage is currently with XXXX XXXX, XXXX XXXX is the mortgage servicer who's collecting from us and disbursing payments to XXXX XXXX. \n", + "\n", + "I will attach here our mortgage transaction history confirming payments have been made on time, letter from the mortgage servicer XXXX XXXX XXXX XXXX confirming we were never late, copy of a page from my credit report showing all the late payments, and a few bank statements showing payment made on time while showing as late on our credit reports. ( XXXX & XXXX XXXX ) PLEASE HELP us to resolve this issue and have all late payments on this account removed from XXXX & XXXX XXXX. \n", + "\n", + "Her is a small portion of our previous communication with XXXX and their response : XX/XX/XXXX : we spoke with XXXX, XXXX stated that the funds were misappropriated and went to the wrong account. Said he'll contact their Tax Dep ' and get back to me and never did. \n", "\n", - "I have reason to belief that a mentally disturbed family member for whom I have an order of protection initiated this situation. This individual has ben harassing me and other members of my family for a considerable amount of time. \n", + "XX/XX/XXXX : I spoke in length with XXXX from XXXX XXXX & XXXX from XXXX at XXXX who opened an investigation. Their supervisor said he made all the necessary changes. The next month, the money was withdrawn from my account on time and i received again a late fee and 30 days late on my credit report. \n", "\n", - "The bank initially was satisfied with her response. However within 2 days they closed the account of a XXXX year old XXXX XXXX person. \n", + "XX/XX/XXXX Spoke to XXXX who sent me to XXXX who sent me to XXXX XXXX from Escalation department, she promised the issue was fixed and late payments will be removed in up to 30 days and she will email me a deletion letter. Nothing was sent! and i called and wrote 5 emails to her and never got a response. 
\n", "\n", - "Soon after for no reason my account was closed as well. I tried to reach out to the corporate offices of Chase and make great effort to find out what happened and to restore my account as well as my mothers but I was unsuccessful. In addition the people I spoke to were not only unhelpful bu exceedingly rude. \n", + "XX/XX/XXXX spoke with XXXX to follow up with XXXX XXXX, no response. \n", "\n", - "I should add that I have had an account with Chase since XX/XX/XXXX and took care of my ailing father before he passed away for over 25 years as well. I am now taking care of my mother for over 28 years. \n", + "XX/XX/XXXX spoke with XXXX at XXXX, she said that the transaction history is our proof that issue was corrected and she'll submit a request to delete the late payments as the system does not show that previous request was made. \n", "\n", - "I went so far as contacting a prominent Television reporter who was interested in doing a report on what happened. \n", + "XX/XX/XXXX XXXX sent a letter stating that we had a shortage of {$5300.00}. Again, upon checking my bank account all monthly payments were made on time. To avoid further issues i sent a check for {$5300.00} on XX/XX/XXXX. \n", "\n", - "I have since managed to open an account at another bank but this week I had reason to go to a branch of Chase regarding another issue and a manager using my That is a very serious unsubstantiated accusation and given this information I have no choice but to submit this complaint. \n", + "XX/XX/XXXX following many joint calls with XXXX XXXX i received a letter from XXXX stating that my credit might have been affected due to processing error and that they sent XXXX XXXX a letter requesting a removal of all late payments. \n", "\n", - "I have no interest in having an account again at a disreputable bank like Chase but I can not and will not accept or tolerate a derogatory accusation be associated with my name. \n", + "XX/XX/XXXX spoke with XXXX XXXX again.. \n", + "XX/XX/XXXX spoke with XXXX at XXXX I have many more... \n", "\n", - "I hope that my complaint will hAve the desired effect of removing this derogatory unsubstantiated accusation be removed from my name. However. I will not let this unfair matter stand and Chase ought to know that I have already retained an attorney and will if necessary hold Chase responsible and liable all damage i have incurred now And in the future Enclosed, please find the letter from Chase stating that they were closing my mothers account and a similar letter was received by me too. \n", + "*** XX/XX/XXXX : SPOKE WITH XXXX XXXX ( resolution team ) at XXXX. She said they will delete the late payments from both reports ( XXXX & XXXX XXXX and will call me to follow up on XX/XX/XXXX. She also emailed me the payment activity on the account. XXXX I received the payment history but NO CALL OR RESOLUTION. \n", "\n", - "Also please find a letter from her Doctor stating that she is XXXX XXXX. \n", + "XX/XX/XXXX : SPOKE WITH XXXX FROM XXXX AND XXXX FROM XXXX XXXX ON A XXXX WAY CALL at XXXX, XXXX PERMISSION TO RECORD THE CALL, SHE AGREED, we went over all the late payments, she said she sees the error and promised that this time it will be resolved and get deleted from our credit reports. Again, nothing was resolved and we never heard back from anyone.\n", + "4. XXXX ; XXXX and Transunion are reporting ( 30 ) plus days late on the XXXX XXXX partial account number XXXX. ( Please see page 3 of the attached credit report. 
) This account was paid in XXXX, 2019 and the lates are reporting in XXXX, 2019. Please keep in mind that it is impossible to have late payments on an account that was paid off a month prior. This incorrect reporting is harming my credit score and this line item need to be removed from my credit report. I have contacted the ( 3 ) bureaus to fix this, however I have been unsuccessful.\n", + "5. My Macys account is due on the first of every month. Since I have had the card I have paid on the XXXX PRIOR to the due date. And have paid over the amount due. In XXXX my XXXX XXXX auto pay did not come out of my account and rather than calling me - on the XXXX of XXXX just 5 days late they cut my credit off and shut me out of my account so I can not even see my credit profile - I have made the payment and they still are locking me out - please look into this - you will see that is what happened and they are stating in a letter it is becasue my XXXX report shows a seriuos derogorty item which it does not and I have submitted a complaint with them as well. Macys has been the worst credit experience of my LIFE and I did read the reviews but thought it would be different for me I guess? \n", + "thank you for your help.\n", "\n", - "Thank you. \n", + "comment list 2:\n", + "1. I decided to try XXXX services for my wife and I so I purchased phones for both of us. After a day or two of trial we felt unsatisfied so we headed back to the store and we returned all items. We got charged with restocking fees and taxes. Later on I got a bill in the mail in which I was being charged {$1200.00} for the returned items. After several attempts of arguing for about XXXX months about whether I owed XXXX or not I was dismissed of such charges, but a month after I was charged by a collecting company called ERC for {$61.00}. I asked them to explain such charges weather if they were fees or taxes and they we unable to disclose information. Therefore, I asked them to send me a bill in the mail with details about the charges, as well as a dispute package and they told me they would send me a bill. About the dispute part, they said that I needed to call XXXX to discuss the charges with them but XXXX said that I had to discuss this with the collecting company. I never received a detailed statement neither a chanse to defend my self about such charges, I checked my credit score and found a red flag in it because of this. \n", + "\n", + "I am now hoping you may help me with this case. \n", + "\n", + "Thanks :\n", + "2. Over a year and a half ago we started the process of buying a home. Our mortgage guy sent us to a credit repair co. They got the collection account from Weltman , Weinberg & Reis taken off my credit, because it was unverifiable. Now it is back on my credit. I have credit reports showing the trade line on and then off and now today it is currently on my report. When I called to verify the account with WW & R they sent me a heavily redacted letter verifying absolutely nothing. I would like this unverifiable account taken off my credit and removed permanently. This should not be a loan I have to pay for if there is no verification that it is my debt. Attached are the credit reports and the letter of verification that was sent to me.\n", + "3. I recently disputed ( see attached letter ) with Receivable Management Services an account entry that they placed on my credit report without providing a dunning letter or any correspondence that would have allowed me 30 days to dispute the validity of the alleged debt. 
To date, I have not received any communication from them. They are blatantly violating my rights by reporting this inaccurate, erroneous, unverifiable entry.\n", + "\n", + "Additionally, this account entry does not reflect a payment history which should be included on any entry that is reflected on my credit report. In my previous communication to them, I specifically requested that they provide an agreement that states their authority to collect on the alleged debt, agreement with signature of the alleged debtor wherein he/she agreed to pay the creditor, alleged account number, date this alleged debt became payable, original delinquency date, and to date to no avail. \n", "\n", - "XXXX XXXX\n", - "3. U.S. Bank sent two letters containing Visa Debit Cards to our address on XX/XX/2021. One Visa Debit Card is in the name of XXXX XXXX and one Visa Debit Card is in the name of XXXX XXXX. These cards supposedly link to existing checking accounts at U.S. Bank. However : ( 1 ) Neither of us have existing checking accounts at U.S. Bank, ( 2 ) Neither of us solicited a bank account at U.S. Bank, and ( 3 ) Neither of us solicited a Visa Debit Card. We have attempted to call U.S. Bank at the phone numbers provided in the letters but are only able to access an automated system which will not proceed without us establishing accounts and activating these cards. We are concerned here that one of two things has happened : either ( 1 ) we are victims of identity theft and some third party is trying to establish accounts in our name, or ( 2 ) U.S. Bank is engaged in bank fraud. In either case, we request the assistance of the Consumer Financial Protection Bureau. Thank you.\n", - "4. I contacted my bank over 3 times about this amount, the first two times I spoke to gentleman that agreed with me that I didnt get back a certain amount of dollars back, I did the math and they refuse to see that I do not owe this amount because I never had it in the first place. I wrote out all my charges and connected it to the charges made back from the consumer and I was missing XXXX, I called XXXX they said they gave it all back which is not their fault because they showed me proof. Along the lines Capital One does not want to take responsibility for the missing money. I have wrote everything out and then its not adding up, they keep saying that they did a charge back twice which is incorrect. My balance was at XXXX before I made this purchase and it shouldve been returned back to XXXX because I return all the items and nothing is in my possession. I have proof that I returned everything.\n", - "5. CB INDIGO ( Bank ) XX/XX/2022 I just recently got off the phone with the company and they wont put in a request of removal of a fraudulent hard inquiry from Insigo Mastercard to XXXX. They dont even have my information on file, I called 3 times most of them are lazy and was giving me a hard time.\n", + "As such, since they have refused to respond to my request and not provide any documentation to substantiate their allegations, coupled with the fact that they did not provide me a dunning letter is grounds for this erroneous, inaccurate, unverifiable entry to be deleted from my credit report.\n", + "4. I accepted service from XXXX XXXX XXXX. The company did not inform me that internet was required. They also told me that the agreement was at will without penalty. They never addressed my needs as a customer. My bill is only {$230.00}. 
They placed false information regarding my bill with a collection agency who has placed information on my credit report without contacting me or giving me an opportunity to dispute the validity of the debt. The debt is not valid. The actions are unlawful and I am requesting that the actions of this collection agency be reported to the Federal Trade Commission.\n", + "5. I have continued to submit an investigation for a Bankruptcy place on my credit report. I have been trying to get this removed because it was place on my credit report in error and inaccurate. ALL THREE CREDIT BUREAUS have continue to ignore the information proving this was place in error and fail to properly investigate the dispute I have place in their office. \n", + "\n", + "1. They say they have verified this dispute with XXXX but I have a letter from XXXX stating this was removed because they were unable to verify the accuracy of the bankruptcy. I received this letter on XX/XX/XXXX. XXXX just finished an investigation on XX/XX/2019 stating the verified this with XXXX. \n", + "2.Experian Open the dispute on XX/XX/2019 and closed it on XX/XX/19 stated they verified with XXXX and the Bankruptcy court and I have a letter From XXXX stating they could not verify the accuracy of this dispute. I also, have a letter from the court house stating they do not verify information with the credit bureaus How could be this be on my XXXX file when XXXX has removed this item. \n", + "3. XXXX open and investigation XX/XX/2019 and closed it XX/XX/2019 No way they properly investigation I have submitted all information to dispute the inaccurate information. Please do a proper investigation. \n", + "\n", + "XXXX, Experian, and XXXX please do a proper investigation under 611 of the FCRA thank you very much I have attached the letter proving this this is not on my XXXX consumer report and a letter form the court house stating they do not report information to the credit bureaus from the XXXX XXXX XXXX, Clerk of Court United State Bankruptcy Court on dated XX/XX/2019 I have summited it to the credit bureaus to be ignored. I have as for a description of my investigation by section 611 of the FCRA and the information from the investigation is inaccurate.\n", "\n" ] } @@ -1547,41 +1547,61 @@ "text": [ "Please highlight the most obvious difference between the two lists of comments:\n", "comment list 1:\n", - "1. Wrongs information, selling my information to third party. Incorrect reporting\n", - "2. I TIED TO BUY CAR AT XXXX, THEY GOT APPROVAL FROM XXXX XXXX XXXX XXXX WHICH ENDED A A LIE. THEY ALSO, PULLED MANY OTHERS I DID NT AGREED TO. SOLF PULLS ONLY\n", - "3. XX/XX/XXXX and XX/XX/XXXX inaccurate information reported 30 days late.\n", - "4. Im working on myCredit and I see a couple of inquiries that I have no idea where they came from.\n", - "5. I request a copy of all dispute results and documentary evidence from XXXX, and XXXX mailed to me\n", - "comment list 2:\n", - "1. My wife and I have been sending money to XXXX via my brother-in-law to finish a building project we have been working on since XXXX with target date of completion by XX/XX/XXXX. In XXXX XXXX my brother-in-law in was contacted by his bank to confirm he was not defrauding my wife. My brother-in-law confirmed he was helping to handle the building project by organizing and paying the workers. In XXXX XXXX Bank of America reach out to my wife to update her profile to avoid account restrictions. 
My wife 's account was eventually restricted until she called and confirmed her employment and other personal information. My wife 's full account activities were then restored and we continued sending wire transfers to XXXX via her checking account. \n", - "Then I received a letter dated XXXX XXXX XXXX from Bank of America stating the money market account I share with my wife which has been opened since XXXX will be will be restricted from use in 21 days and closed in 30 days with no reason. I strongly believe this is a result of the legal international wires because there was no reason to close the Savings account which had with hardly any activity. \n", - "I agree that Bank of America has a right to close accounts but I do not agree with Bank of America closing accounts because of international transactions unless they can prove fraud, criminal activity or support for terrorism, this is discriminatory towards foreign nationals. How are foreign nationals suppose to make investments or support their family/community if they are excluded from the banking system?\n", - "2. XXXX XXXX XXXX XXXX, NY XXXX XX/XX/XXXX Consumer Financial Protection Bureau XXXX XXXX XXXX XXXX, IA XXXX Dear Sir or Madam : In XX/XX/XXXX Out of the blue JP Morgan Chase arbitrarily closed my account. This was after my mother is a XXXX survivor who is over XXXX years old and for whom I have a general power of attorney and take care of her bill paying was questioned about a transaction. She is also XXXX XXXX. \n", + "1. I currently have a home loan with my ex husband with PHH Mortgages. We filed for divorce and in the divorce decree he became liable for the home and paying the payments. He ended up missing XXXX payments which effected my credit fairly substaintailly. when I became aware of the late payments, I ensured that the account was up to date and have since. I presented to them that I have the legal documents that he is obligated to make the payments each month and that I am not responisble for the payment. I asked them to remove the XXXX dings on my credit and they would not. I offered to present the paperwork and they still would not. The home is now being sold. I even filed with XXXX as a discrepency and they would not remove it. I would have never let these become a late payment. I was not even notified as they had all of his information in the file.\n", + "2. In the course of my student loan, I have been making payments and I feel that the payments haven't been added to the debt, the company stated that I am delinquent over 180 and my payments are auto pay. This has had a negative impact on my credit score.\n", + "3. The issue is 26 late payments on me and my wife 's credit reports due to a system error on a joint mortgage account that was always paid on time using autopay. ( will attach docs to support this ). \n", + "\n", + "This is an ongoing nightmare me and my wife are going through over the past 3 years. \n", + "Sent many dispute letters to the creditor and to the 3 bureaus, was promised multiple times that all late payments will be removed, we also has a letter from the bank stating we were never late on this account, also have a recording of a phone call with bank 's permission were the representative admits there was a system error and promised again that all late payments will be deleted from both of our credit reports. \n", + "As of today, for an unknown reason XXXX reports 6x30 days late payments, XXXX reports 24 lates, and Transunion 23 lates. 
\n", "\n", - "I have reason to belief that a mentally disturbed family member for whom I have an order of protection initiated this situation. This individual has ben harassing me and other members of my family for a considerable amount of time. \n", + "We have always paid our mortgage on time for many years, enrolled in autopay and making 2 payments per month. Our mortgage is currently with XXXX XXXX, XXXX XXXX is the mortgage servicer who's collecting from us and disbursing payments to XXXX XXXX. \n", "\n", - "The bank initially was satisfied with her response. However within 2 days they closed the account of a XXXX year old XXXX XXXX person. \n", + "I will attach here our mortgage transaction history confirming payments have been made on time, letter from the mortgage servicer XXXX XXXX XXXX XXXX confirming we were never late, copy of a page from my credit report showing all the late payments, and a few bank statements showing payment made on time while showing as late on our credit reports. ( XXXX & XXXX XXXX ) PLEASE HELP us to resolve this issue and have all late payments on this account removed from XXXX & XXXX XXXX. \n", "\n", - "Soon after for no reason my account was closed as well. I tried to reach out to the corporate offices of Chase and make great effort to find out what happened and to restore my account as well as my mothers but I was unsuccessful. In addition the people I spoke to were not only unhelpful bu exceedingly rude. \n", + "Her is a small portion of our previous communication with XXXX and their response : XX/XX/XXXX : we spoke with XXXX, XXXX stated that the funds were misappropriated and went to the wrong account. Said he'll contact their Tax Dep ' and get back to me and never did. \n", "\n", - "I should add that I have had an account with Chase since XX/XX/XXXX and took care of my ailing father before he passed away for over 25 years as well. I am now taking care of my mother for over 28 years. \n", + "XX/XX/XXXX : I spoke in length with XXXX from XXXX XXXX & XXXX from XXXX at XXXX who opened an investigation. Their supervisor said he made all the necessary changes. The next month, the money was withdrawn from my account on time and i received again a late fee and 30 days late on my credit report. \n", "\n", - "I went so far as contacting a prominent Television reporter who was interested in doing a report on what happened. \n", + "XX/XX/XXXX Spoke to XXXX who sent me to XXXX who sent me to XXXX XXXX from Escalation department, she promised the issue was fixed and late payments will be removed in up to 30 days and she will email me a deletion letter. Nothing was sent! and i called and wrote 5 emails to her and never got a response. \n", + "\n", + "XX/XX/XXXX spoke with XXXX to follow up with XXXX XXXX, no response. \n", + "\n", + "XX/XX/XXXX spoke with XXXX at XXXX, she said that the transaction history is our proof that issue was corrected and she'll submit a request to delete the late payments as the system does not show that previous request was made. \n", + "\n", + "XX/XX/XXXX XXXX sent a letter stating that we had a shortage of {$5300.00}. Again, upon checking my bank account all monthly payments were made on time. To avoid further issues i sent a check for {$5300.00} on XX/XX/XXXX. \n", + "\n", + "XX/XX/XXXX following many joint calls with XXXX XXXX i received a letter from XXXX stating that my credit might have been affected due to processing error and that they sent XXXX XXXX a letter requesting a removal of all late payments. 
\n", + "\n", + "XX/XX/XXXX spoke with XXXX XXXX again.. \n", + "XX/XX/XXXX spoke with XXXX at XXXX I have many more... \n", + "\n", + "*** XX/XX/XXXX : SPOKE WITH XXXX XXXX ( resolution team ) at XXXX. She said they will delete the late payments from both reports ( XXXX & XXXX XXXX and will call me to follow up on XX/XX/XXXX. She also emailed me the payment activity on the account. XXXX I received the payment history but NO CALL OR RESOLUTION. \n", + "\n", + "XX/XX/XXXX : SPOKE WITH XXXX FROM XXXX AND XXXX FROM XXXX XXXX ON A XXXX WAY CALL at XXXX, XXXX PERMISSION TO RECORD THE CALL, SHE AGREED, we went over all the late payments, she said she sees the error and promised that this time it will be resolved and get deleted from our credit reports. Again, nothing was resolved and we never heard back from anyone.\n", + "4. XXXX ; XXXX and Transunion are reporting ( 30 ) plus days late on the XXXX XXXX partial account number XXXX. ( Please see page 3 of the attached credit report. ) This account was paid in XXXX, 2019 and the lates are reporting in XXXX, 2019. Please keep in mind that it is impossible to have late payments on an account that was paid off a month prior. This incorrect reporting is harming my credit score and this line item need to be removed from my credit report. I have contacted the ( 3 ) bureaus to fix this, however I have been unsuccessful.\n", + "5. My Macys account is due on the first of every month. Since I have had the card I have paid on the XXXX PRIOR to the due date. And have paid over the amount due. In XXXX my XXXX XXXX auto pay did not come out of my account and rather than calling me - on the XXXX of XXXX just 5 days late they cut my credit off and shut me out of my account so I can not even see my credit profile - I have made the payment and they still are locking me out - please look into this - you will see that is what happened and they are stating in a letter it is becasue my XXXX report shows a seriuos derogorty item which it does not and I have submitted a complaint with them as well. Macys has been the worst credit experience of my LIFE and I did read the reviews but thought it would be different for me I guess? \n", + "thank you for your help.\n", + "comment list 2:\n", + "1. I decided to try XXXX services for my wife and I so I purchased phones for both of us. After a day or two of trial we felt unsatisfied so we headed back to the store and we returned all items. We got charged with restocking fees and taxes. Later on I got a bill in the mail in which I was being charged {$1200.00} for the returned items. After several attempts of arguing for about XXXX months about whether I owed XXXX or not I was dismissed of such charges, but a month after I was charged by a collecting company called ERC for {$61.00}. I asked them to explain such charges weather if they were fees or taxes and they we unable to disclose information. Therefore, I asked them to send me a bill in the mail with details about the charges, as well as a dispute package and they told me they would send me a bill. About the dispute part, they said that I needed to call XXXX to discuss the charges with them but XXXX said that I had to discuss this with the collecting company. I never received a detailed statement neither a chanse to defend my self about such charges, I checked my credit score and found a red flag in it because of this. 
\n", "\n", - "I have since managed to open an account at another bank but this week I had reason to go to a branch of Chase regarding another issue and a manager using my That is a very serious unsubstantiated accusation and given this information I have no choice but to submit this complaint. \n", + "I am now hoping you may help me with this case. \n", "\n", - "I have no interest in having an account again at a disreputable bank like Chase but I can not and will not accept or tolerate a derogatory accusation be associated with my name. \n", + "Thanks :\n", + "2. Over a year and a half ago we started the process of buying a home. Our mortgage guy sent us to a credit repair co. They got the collection account from Weltman , Weinberg & Reis taken off my credit, because it was unverifiable. Now it is back on my credit. I have credit reports showing the trade line on and then off and now today it is currently on my report. When I called to verify the account with WW & R they sent me a heavily redacted letter verifying absolutely nothing. I would like this unverifiable account taken off my credit and removed permanently. This should not be a loan I have to pay for if there is no verification that it is my debt. Attached are the credit reports and the letter of verification that was sent to me.\n", + "3. I recently disputed ( see attached letter ) with Receivable Management Services an account entry that they placed on my credit report without providing a dunning letter or any correspondence that would have allowed me 30 days to dispute the validity of the alleged debt. To date, I have not received any communication from them. They are blatantly violating my rights by reporting this inaccurate, erroneous, unverifiable entry.\n", "\n", - "I hope that my complaint will hAve the desired effect of removing this derogatory unsubstantiated accusation be removed from my name. However. I will not let this unfair matter stand and Chase ought to know that I have already retained an attorney and will if necessary hold Chase responsible and liable all damage i have incurred now And in the future Enclosed, please find the letter from Chase stating that they were closing my mothers account and a similar letter was received by me too. \n", + "Additionally, this account entry does not reflect a payment history which should be included on any entry that is reflected on my credit report. In my previous communication to them, I specifically requested that they provide an agreement that states their authority to collect on the alleged debt, agreement with signature of the alleged debtor wherein he/she agreed to pay the creditor, alleged account number, date this alleged debt became payable, original delinquency date, and to date to no avail. \n", "\n", - "Also please find a letter from her Doctor stating that she is XXXX XXXX. \n", + "As such, since they have refused to respond to my request and not provide any documentation to substantiate their allegations, coupled with the fact that they did not provide me a dunning letter is grounds for this erroneous, inaccurate, unverifiable entry to be deleted from my credit report.\n", + "4. I accepted service from XXXX XXXX XXXX. The company did not inform me that internet was required. They also told me that the agreement was at will without penalty. They never addressed my needs as a customer. My bill is only {$230.00}. 
They placed false information regarding my bill with a collection agency who has placed information on my credit report without contacting me or giving me an opportunity to dispute the validity of the debt. The debt is not valid. The actions are unlawful and I am requesting that the actions of this collection agency be reported to the Federal Trade Commission.\n", + "5. I have continued to submit an investigation for a Bankruptcy place on my credit report. I have been trying to get this removed because it was place on my credit report in error and inaccurate. ALL THREE CREDIT BUREAUS have continue to ignore the information proving this was place in error and fail to properly investigate the dispute I have place in their office. \n", "\n", - "Thank you. \n", + "1. They say they have verified this dispute with XXXX but I have a letter from XXXX stating this was removed because they were unable to verify the accuracy of the bankruptcy. I received this letter on XX/XX/XXXX. XXXX just finished an investigation on XX/XX/2019 stating the verified this with XXXX. \n", + "2.Experian Open the dispute on XX/XX/2019 and closed it on XX/XX/19 stated they verified with XXXX and the Bankruptcy court and I have a letter From XXXX stating they could not verify the accuracy of this dispute. I also, have a letter from the court house stating they do not verify information with the credit bureaus How could be this be on my XXXX file when XXXX has removed this item. \n", + "3. XXXX open and investigation XX/XX/2019 and closed it XX/XX/2019 No way they properly investigation I have submitted all information to dispute the inaccurate information. Please do a proper investigation. \n", "\n", - "XXXX XXXX\n", - "3. U.S. Bank sent two letters containing Visa Debit Cards to our address on XX/XX/2021. One Visa Debit Card is in the name of XXXX XXXX and one Visa Debit Card is in the name of XXXX XXXX. These cards supposedly link to existing checking accounts at U.S. Bank. However : ( 1 ) Neither of us have existing checking accounts at U.S. Bank, ( 2 ) Neither of us solicited a bank account at U.S. Bank, and ( 3 ) Neither of us solicited a Visa Debit Card. We have attempted to call U.S. Bank at the phone numbers provided in the letters but are only able to access an automated system which will not proceed without us establishing accounts and activating these cards. We are concerned here that one of two things has happened : either ( 1 ) we are victims of identity theft and some third party is trying to establish accounts in our name, or ( 2 ) U.S. Bank is engaged in bank fraud. In either case, we request the assistance of the Consumer Financial Protection Bureau. Thank you.\n", - "4. I contacted my bank over 3 times about this amount, the first two times I spoke to gentleman that agreed with me that I didnt get back a certain amount of dollars back, I did the math and they refuse to see that I do not owe this amount because I never had it in the first place. I wrote out all my charges and connected it to the charges made back from the consumer and I was missing XXXX, I called XXXX they said they gave it all back which is not their fault because they showed me proof. Along the lines Capital One does not want to take responsibility for the missing money. I have wrote everything out and then its not adding up, they keep saying that they did a charge back twice which is incorrect. 
My balance was at XXXX before I made this purchase and it shouldve been returned back to XXXX because I return all the items and nothing is in my possession. I have proof that I returned everything.\n", - "5. CB INDIGO ( Bank ) XX/XX/2022 I just recently got off the phone with the company and they wont put in a request of removal of a fraudulent hard inquiry from Insigo Mastercard to XXXX. They dont even have my information on file, I called 3 times most of them are lazy and was giving me a hard time.\n", + "XXXX, Experian, and XXXX please do a proper investigation under 611 of the FCRA thank you very much I have attached the letter proving this this is not on my XXXX consumer report and a letter form the court house stating they do not report information to the credit bureaus from the XXXX XXXX XXXX, Clerk of Court United State Bankruptcy Court on dated XX/XX/2019 I have summited it to the credit bureaus to be ignored. I have as for a description of my investigation by section 611 of the FCRA and the information from the investigation is inaccurate.\n", "\n" ] } @@ -1613,7 +1633,7 @@ { "data": { "text/html": [ - "Query job a7ce86a7-3a18-47b9-a46f-98dbe6a5a339 is DONE. 0 Bytes processed. Open Job" + "Query job a069b4a5-5238-4ca8-a6c0-d48781d00f6c is DONE. 0 Bytes processed. Open Job" ], "text/plain": [ "" @@ -1651,7 +1671,7 @@ { "data": { "text/html": [ - "Query job d568c03d-6bbd-4c3e-b087-563b7f5135ed is DONE. 0 Bytes processed. Open Job" + "Query job 63f6e1d0-b0dc-4f5c-a001-5889c28162c5 is DONE. 0 Bytes processed. Open Job" ], "text/plain": [ "" @@ -1663,7 +1683,7 @@ { "data": { "text/html": [ - "Query job 17eaa806-51a4-4ee9-b219-75455d0095a7 is DONE. 8 Bytes processed. Open Job" + "Query job c1c9e28b-ba6d-4485-b892-0bf2428f927c is DONE. 8 Bytes processed. Open Job" ], "text/plain": [ "" @@ -1675,7 +1695,7 @@ { "data": { "text/html": [ - "Query job e6d40ded-691d-4523-94ea-dd8202bd0220 is DONE. 2 Bytes processed. Open Job" + "Query job 67402b3c-eee4-4fe4-aeaf-fb27606ecde7 is DONE. 2 Bytes processed. Open Job" ], "text/plain": [ "" @@ -1687,7 +1707,7 @@ { "data": { "text/html": [ - "Query job 200f0b88-7b6d-417b-a181-a98138e3bc95 is DONE. 193 Bytes processed. Open Job" + "Query job 83166900-0787-4a6d-b822-c3be87990e35 is DONE. 328 Bytes processed. Open Job" ], "text/plain": [ "" @@ -1699,7 +1719,7 @@ { "data": { "text/plain": [ - "'The most obvious difference between the two lists of comments is that list 1 is related to credit reporting disputes and list 2 is a collection of general consumer banking complaints.'" + "'The most obvious difference between the two lists of comments is that the first list contains comments about credit report issues related to mortgages and loans, while the second list contains comments about credit report issues related to other types of debts, such as cell phone bills, collections, and bankruptcies.'" ] }, "execution_count": 19, @@ -1753,7 +1773,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.1" + "version": "3.10.13" } }, "nbformat": 4, From 9e741543ee978a33101a73f7ff1a8ef8925abbd1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Mon, 1 Apr 2024 11:12:38 -0500 Subject: [PATCH 28/53] Revert "feat: Support max_columns in repr and make repr more efficient (#515)" (#554) This reverts commit 54e49cff89bd329852a823cd5cf5c5b41b7f9e32. 
--- bigframes/core/blocks.py | 42 +++++++-------------- bigframes/core/indexes/index.py | 10 ++--- bigframes/dataframe.py | 66 ++++++++++++++++++++------------- bigframes/series.py | 9 +++-- bigframes/session/__init__.py | 8 +--- 5 files changed, 65 insertions(+), 70 deletions(-) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index aab8b1ad4d..11899eef11 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -467,23 +467,6 @@ def to_pandas_batches(self): self._copy_index_to_pandas(df) yield df - def download_pandas_preview( - self, max_rows: int - ) -> Tuple[pd.DataFrame, bigquery.QueryJob]: - """Download one page of results and return the query job.""" - dtypes = dict(zip(self.index_columns, self.index.dtypes)) - dtypes.update(zip(self.value_columns, self.dtypes)) - results_iterator, query_job = self.session._execute( - self.expr, sorted=True, max_results=max_rows - ) - arrow_results_iterator = results_iterator.to_arrow_iterable() - arrow_table = next(arrow_results_iterator) - downloaded_df = bigframes.session._io.pandas.arrow_to_pandas( - arrow_table, dtypes - ) - self._copy_index_to_pandas(downloaded_df) - return downloaded_df, query_job - def _copy_index_to_pandas(self, df: pd.DataFrame): """Set the index on pandas DataFrame to match this block. @@ -1314,25 +1297,26 @@ def _forward_slice(self, start: int = 0, stop=None, step: int = 1): # queries. @functools.cache def retrieve_repr_request_results( - self, max_results: int, max_columns: int - ) -> Tuple[pd.DataFrame, Tuple[int, int], bigquery.QueryJob]: + self, max_results: int + ) -> Tuple[pd.DataFrame, int, bigquery.QueryJob]: """ Retrieves a pandas dataframe containing only max_results many rows for use with printing methods. - Returns a tuple of the dataframe preview for printing and the overall number - of rows and columns of the table, as well as the query job used. + Returns a tuple of the dataframe and the overall number of rows of the query. """ - pandas_df, query_job = self.download_pandas_preview(max_results) - row_count = self.session._get_table_row_count(query_job.destination) - column_count = len(self.value_columns) - - formatted_df = pandas_df.set_axis(self.column_labels, axis=1) + # TODO(swast): Select a subset of columns if max_columns is less than the + # number of columns in the schema. + count = self.shape[0] + if count > max_results: + head_block = self.slice(0, max_results) + else: + head_block = self + computed_df, query_job = head_block.to_pandas() + formatted_df = computed_df.set_axis(self.column_labels, axis=1) # we reset the axis and substitute the bf index name for the default formatted_df.index.name = self.index.name - # limit column count - formatted_df = formatted_df.iloc[:, 0:max_columns] - return formatted_df, (row_count, column_count), query_job + return formatted_df, count, query_job def promote_offsets(self, label: Label = None) -> typing.Tuple[Block, str]: result_id = guid.generate_guid() diff --git a/bigframes/core/indexes/index.py b/bigframes/core/indexes/index.py index 48988aaffe..c818b68711 100644 --- a/bigframes/core/indexes/index.py +++ b/bigframes/core/indexes/index.py @@ -205,17 +205,17 @@ def query_job(self) -> Optional[bigquery.QueryJob]: return self._query_job def __repr__(self) -> str: + # TODO(swast): Add a timeout here? If the query is taking a long time, + # maybe we just print the job metadata that we have so far? + # TODO(swast): Avoid downloading the whole series by using job + # metadata, like we do with DataFrame. 
opts = bigframes.options.display max_results = opts.max_rows - max_columns = opts.max_columns if opts.repr_mode == "deferred": return formatter.repr_query_job(self.query_job) - pandas_df, _, query_job = self._block.retrieve_repr_request_results( - max_results, max_columns - ) + pandas_df, _, query_job = self._block.retrieve_repr_request_results(max_results) self._query_job = query_job - return repr(pandas_df.index) def copy(self, name: Optional[Hashable] = None): diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 1df78dd4cd..066b082490 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -579,16 +579,28 @@ def __setattr__(self, key: str, value): object.__setattr__(self, key, value) def __repr__(self) -> str: - """Converts a DataFrame to a string using pandas dataframe __repr__. + """Converts a DataFrame to a string. Calls to_pandas. - Only represents the first `bigframes.options.display.max_rows` - and `bigframes.options.display.max_columns`. + Only represents the first `bigframes.options.display.max_rows`. """ - if bigframes.options.display.repr_mode == "deferred": + opts = bigframes.options.display + max_results = opts.max_rows + if opts.repr_mode == "deferred": return formatter.repr_query_job(self.query_job) - pandas_df, shape = self._perform_repr_request() - with display_options.pandas_repr(bigframes.options.display): + self._cached() + # TODO(swast): pass max_columns and get the true column count back. Maybe + # get 1 more column than we have requested so that pandas can add the + # ... for us? + pandas_df, row_count, query_job = self._block.retrieve_repr_request_results( + max_results + ) + + self._set_internal_query_job(query_job) + + column_count = len(pandas_df.columns) + + with display_options.pandas_repr(opts): repr_string = repr(pandas_df) # Modify the end of the string to reflect count. @@ -596,40 +608,42 @@ def __repr__(self) -> str: pattern = re.compile("\\[[0-9]+ rows x [0-9]+ columns\\]") if pattern.match(lines[-1]): lines = lines[:-2] - if shape[0] > len(lines) - 1: + + if row_count > len(lines) - 1: lines.append("...") + lines.append("") - lines.append(f"[{shape[0]} rows x {shape[1]} columns]") + lines.append(f"[{row_count} rows x {column_count} columns]") return "\n".join(lines) - def _perform_repr_request(self) -> Tuple[pandas.DataFrame, Tuple[int, int]]: - max_results = bigframes.options.display.max_rows - max_columns = bigframes.options.display.max_columns - self._cached() - pandas_df, shape, query_job = self._block.retrieve_repr_request_results( - max_results, max_columns - ) - self._set_internal_query_job(query_job) - return pandas_df, shape - def _repr_html_(self) -> str: """ Returns an html string primarily for use by notebooks for displaying - a representation of the DataFrame. Displays at most the number of rows - and columns given by `bigframes.options.display.max_rows` and - `bigframes.options.display.max_columns`. + a representation of the DataFrame. Displays 20 rows by default since + many notebooks are not configured for large tables. """ - - if bigframes.options.display.repr_mode == "deferred": + opts = bigframes.options.display + max_results = bigframes.options.display.max_rows + if opts.repr_mode == "deferred": return formatter.repr_query_job_html(self.query_job) - pandas_df, shape = self._perform_repr_request() + self._cached() + # TODO(swast): pass max_columns and get the true column count back. Maybe + # get 1 more column than we have requested so that pandas can add the + # ... for us? 
+ pandas_df, row_count, query_job = self._block.retrieve_repr_request_results( + max_results + ) + + self._set_internal_query_job(query_job) + + column_count = len(pandas_df.columns) - with display_options.pandas_repr(bigframes.options.display): + with display_options.pandas_repr(opts): # _repr_html_ stub is missing so mypy thinks it's a Series. Ignore mypy. html_string = pandas_df._repr_html_() # type:ignore - html_string += f"[{shape[0]} rows x {shape[1]} columns in total]" + html_string += f"[{row_count} rows x {column_count} columns in total]" return html_string def __setitem__(self, key: str, value: SingleItemValue): diff --git a/bigframes/series.py b/bigframes/series.py index f1ac89f514..e7b358c2fe 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -272,16 +272,17 @@ def reset_index( return bigframes.dataframe.DataFrame(block) def __repr__(self) -> str: + # TODO(swast): Add a timeout here? If the query is taking a long time, + # maybe we just print the job metadata that we have so far? + # TODO(swast): Avoid downloading the whole series by using job + # metadata, like we do with DataFrame. opts = bigframes.options.display max_results = opts.max_rows - max_columns = opts.max_columns if opts.repr_mode == "deferred": return formatter.repr_query_job(self.query_job) self._cached() - pandas_df, _, query_job = self._block.retrieve_repr_request_results( - max_results, max_columns - ) + pandas_df, _, query_job = self._block.retrieve_repr_request_results(max_results) self._set_internal_query_job(query_job) return repr(pandas_df.iloc[:, 0]) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 6573934f94..ac266da3bd 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -1832,7 +1832,6 @@ def _execute( sorted: bool = True, dry_run=False, col_id_overrides: Mapping[str, str] = {}, - max_results: Optional[int] = None, ) -> tuple[bigquery.table.RowIterator, bigquery.QueryJob]: sql = self._to_sql( array_value, sorted=sorted, col_id_overrides=col_id_overrides @@ -1842,7 +1841,8 @@ def _execute( else: job_config.dry_run = dry_run return self._start_query( - sql=sql, job_config=job_config, max_results=max_results + sql=sql, + job_config=job_config, ) def _peek( @@ -1887,10 +1887,6 @@ def _get_table_size(self, destination_table): table = self.bqclient.get_table(destination_table) return table.num_bytes - def _get_table_row_count(self, destination_table) -> int: - table = self.bqclient.get_table(destination_table) - return table.num_rows - def _rows_to_dataframe( self, row_iterator: bigquery.table.RowIterator, dtypes: Dict ) -> pandas.DataFrame: From 9ac4ed8a374b757b5b19eaa64e8ec4739866c9bd Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Mon, 1 Apr 2024 16:24:29 +0000 Subject: [PATCH 29/53] chore: fix typo `mertics` to `metrics` (#549) * chore: fix typo `mertics` to `metrics` * revert unintended change --- bigframes/ml/metrics/_metrics.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/bigframes/ml/metrics/_metrics.py b/bigframes/ml/metrics/_metrics.py index e8c7400f35..ee86798b33 100644 --- a/bigframes/ml/metrics/_metrics.py +++ b/bigframes/ml/metrics/_metrics.py @@ -19,8 +19,8 @@ import typing from typing import Tuple, Union -import bigframes_vendored.sklearn.metrics._classification as vendored_mertics_classification -import bigframes_vendored.sklearn.metrics._ranking as vendored_mertics_ranking +import bigframes_vendored.sklearn.metrics._classification as vendored_metrics_classification +import 
bigframes_vendored.sklearn.metrics._ranking as vendored_metrics_ranking import bigframes_vendored.sklearn.metrics._regression as vendored_metrics_regression import numpy as np import pandas as pd @@ -79,7 +79,7 @@ def accuracy_score( return score.sum() -accuracy_score.__doc__ = inspect.getdoc(vendored_mertics_classification.accuracy_score) +accuracy_score.__doc__ = inspect.getdoc(vendored_metrics_classification.accuracy_score) def roc_curve( @@ -149,7 +149,7 @@ def roc_curve( ) -roc_curve.__doc__ = inspect.getdoc(vendored_mertics_ranking.roc_curve) +roc_curve.__doc__ = inspect.getdoc(vendored_metrics_ranking.roc_curve) def roc_auc_score( @@ -171,7 +171,7 @@ def roc_auc_score( return (width_diff * height_avg).sum() -roc_auc_score.__doc__ = inspect.getdoc(vendored_mertics_ranking.roc_auc_score) +roc_auc_score.__doc__ = inspect.getdoc(vendored_metrics_ranking.roc_auc_score) def auc( @@ -185,7 +185,7 @@ def auc( return auc -auc.__doc__ = inspect.getdoc(vendored_mertics_ranking.auc) +auc.__doc__ = inspect.getdoc(vendored_metrics_ranking.auc) def confusion_matrix( @@ -223,7 +223,7 @@ def confusion_matrix( confusion_matrix.__doc__ = inspect.getdoc( - vendored_mertics_classification.confusion_matrix + vendored_metrics_classification.confusion_matrix ) @@ -261,7 +261,7 @@ def recall_score( return recall_score -recall_score.__doc__ = inspect.getdoc(vendored_mertics_classification.recall_score) +recall_score.__doc__ = inspect.getdoc(vendored_metrics_classification.recall_score) def precision_score( @@ -299,7 +299,7 @@ def precision_score( precision_score.__doc__ = inspect.getdoc( - vendored_mertics_classification.precision_score + vendored_metrics_classification.precision_score ) @@ -334,4 +334,4 @@ def f1_score( return f1_score -f1_score.__doc__ = inspect.getdoc(vendored_mertics_classification.f1_score) +f1_score.__doc__ = inspect.getdoc(vendored_metrics_classification.f1_score) From f207c8f16151523aa9bb37f5c6abe6c37ee5bbb3 Mon Sep 17 00:00:00 2001 From: Ashley Xu <139821907+ashleyxuu@users.noreply.github.com> Date: Mon, 1 Apr 2024 14:19:51 -0700 Subject: [PATCH 30/53] chore: address comments from technical writers for legal review (#555) --- bigframes/_config/bigquery_options.py | 2 +- bigframes/ml/compose.py | 2 +- bigframes/ml/imported.py | 6 +++--- .../bigframes_vendored/pandas/core/groupby/__init__.py | 10 +++++----- .../sklearn/compose/_column_transformer.py | 4 ++-- .../bigframes_vendored/sklearn/decomposition/_pca.py | 4 ++-- .../sklearn/linear_model/_logistic.py | 2 +- .../sklearn/preprocessing/_discretization.py | 2 +- 8 files changed, 16 insertions(+), 16 deletions(-) diff --git a/bigframes/_config/bigquery_options.py b/bigframes/_config/bigquery_options.py index 9da953a582..50e14eaf28 100644 --- a/bigframes/_config/bigquery_options.py +++ b/bigframes/_config/bigquery_options.py @@ -57,7 +57,7 @@ def application_name(self) -> Optional[str]: """The application name to amend to the user-agent sent to Google APIs. The application name to amend to the user agent sent to Google APIs. - The recommended format is ``"appplication-name/major.minor.patch_version"`` + The recommended format is ``"application-name/major.minor.patch_version"`` or ``"(gpn:PartnerName;)"`` for official Google partners. """ return self._application_name diff --git a/bigframes/ml/compose.py b/bigframes/ml/compose.py index cd233589d6..21cfba8e01 100644 --- a/bigframes/ml/compose.py +++ b/bigframes/ml/compose.py @@ -13,7 +13,7 @@ # limitations under the License. """Build composite transformers on heterogeneous data. 
This module is styled -after Scikit-Learn's compose module: +after scikit-Learn's compose module: https://scikit-learn.org/stable/modules/classes.html#module-sklearn.compose.""" from __future__ import annotations diff --git a/bigframes/ml/imported.py b/bigframes/ml/imported.py index 7f75827083..b551150050 100644 --- a/bigframes/ml/imported.py +++ b/bigframes/ml/imported.py @@ -34,7 +34,7 @@ class TensorFlowModel(base.Predictor): model_path (str): GCS path that holds the model files. session (BigQuery Session): - BQ session to create the model + BQ session to create the model. """ def __init__( @@ -113,7 +113,7 @@ class ONNXModel(base.Predictor): model_path (str): Cloud Storage path that holds the model files. session (BigQuery Session): - BQ session to create the model + BQ session to create the model. """ def __init__( @@ -207,7 +207,7 @@ class XGBoostModel(base.Predictor): and feature_types are both specified in the model file. Supported types are "bool", "string", "int64", "float64", "array", "array", "array", "array". session (BigQuery Session): - BQ session to create the model + BQ session to create the model. """ def __init__( diff --git a/third_party/bigframes_vendored/pandas/core/groupby/__init__.py b/third_party/bigframes_vendored/pandas/core/groupby/__init__.py index 8730cf0007..e1cc8c5a53 100644 --- a/third_party/bigframes_vendored/pandas/core/groupby/__init__.py +++ b/third_party/bigframes_vendored/pandas/core/groupby/__init__.py @@ -78,7 +78,7 @@ def median( Include only float, int, boolean columns. exact (bool, default False): Calculate the exact median instead of an approximation. Note: - ``exact=True`` not yet supported. + ``exact=True`` is not supported. Returns: pandas.Series or pandas.DataFrame: Median of groups. @@ -178,7 +178,7 @@ def sum( Include only float, int, boolean columns. min_count (int, default 0): The required number of valid values to perform the operation. If fewer - than ``min_count`` non-NA values are present the result will be NA. + than ``min_count`` and non-NA values are present, the result will be NA. Returns: Series or DataFrame: Computed sum of values within each group. @@ -194,7 +194,7 @@ def prod(self, numeric_only: bool = False, min_count: int = 0): Include only float, int, boolean columns. min_count (int, default 0): The required number of valid values to perform the operation. If fewer - than ``min_count`` non-NA values are present the result will be NA. + than ``min_count`` and non-NA values are present, the result will be NA. Returns: Series or DataFrame: Computed prod of values within each group. @@ -214,7 +214,7 @@ def min( Include only float, int, boolean columns. min_count (int, default 0): The required number of valid values to perform the operation. If fewer - than ``min_count`` non-NA values are present the result will be NA. + than ``min_count`` and non-NA values are present, the result will be NA. Returns: Series or DataFrame: Computed min of values within each group. @@ -234,7 +234,7 @@ def max( Include only float, int, boolean columns. min_count (int, default 0): The required number of valid values to perform the operation. If fewer - than ``min_count`` non-NA values are present the result will be NA. + than ``min_count`` and non-NA values are present, the result will be NA. Returns: Series or DataFrame: Computed max of values within each group. 
diff --git a/third_party/bigframes_vendored/sklearn/compose/_column_transformer.py b/third_party/bigframes_vendored/sklearn/compose/_column_transformer.py index b08eb10492..4b0bd42706 100644 --- a/third_party/bigframes_vendored/sklearn/compose/_column_transformer.py +++ b/third_party/bigframes_vendored/sklearn/compose/_column_transformer.py @@ -19,9 +19,9 @@ class ColumnTransformer(_BaseComposition): """Applies transformers to columns of BigQuery DataFrames. This estimator allows different columns or column subsets of the input - to be transformed separately and the features generated by each transformer + to be transformed separately, and the features generated by each transformer will be concatenated to form a single feature space. - This is useful for heterogeneous or columnar data, to combine several + This is useful for heterogeneous or columnar data to combine several feature extraction mechanisms or transformations into a single transformer. Args: diff --git a/third_party/bigframes_vendored/sklearn/decomposition/_pca.py b/third_party/bigframes_vendored/sklearn/decomposition/_pca.py index dcce75d1d9..f126e0439d 100644 --- a/third_party/bigframes_vendored/sklearn/decomposition/_pca.py +++ b/third_party/bigframes_vendored/sklearn/decomposition/_pca.py @@ -22,8 +22,8 @@ class PCA(BaseEstimator, metaclass=ABCMeta): Args: n_components (int, float or None, default None): - Number of components to keep. - If n_components is not set all components are kept. n_components = min(n_samples, n_features). + Number of components to keep. If n_components is not set all + components are kept, n_components = min(n_samples, n_features). If 0 < n_components < 1, select the number of components such that the amount of variance that needs to be explained is greater than the percentage specified by n_components. svd_solver ("full", "randomized" or "auto", default "auto"): The solver to use to calculate the principal components. Details: https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-create-pca#pca_solver. diff --git a/third_party/bigframes_vendored/sklearn/linear_model/_logistic.py b/third_party/bigframes_vendored/sklearn/linear_model/_logistic.py index 88ff32ea06..494c730a6d 100644 --- a/third_party/bigframes_vendored/sklearn/linear_model/_logistic.py +++ b/third_party/bigframes_vendored/sklearn/linear_model/_logistic.py @@ -38,7 +38,7 @@ class LogisticRegression(LinearClassifierMixin, BaseEstimator): automatically adjust weights inversely proportional to class frequencies in the input data as ``n_samples / (n_classes * np.bincount(y))``. Dict isn't - supported now. + supported. l1_reg (float or None, default None): The amount of L1 regularization applied. Default to None. Can't be set in "normal_equation" mode. If unset, value 0 is used. l2_reg (float, default 0.0): diff --git a/third_party/bigframes_vendored/sklearn/preprocessing/_discretization.py b/third_party/bigframes_vendored/sklearn/preprocessing/_discretization.py index 5fcc481573..98b9d0371f 100644 --- a/third_party/bigframes_vendored/sklearn/preprocessing/_discretization.py +++ b/third_party/bigframes_vendored/sklearn/preprocessing/_discretization.py @@ -18,7 +18,7 @@ class KBinsDiscretizer(TransformerMixin, BaseEstimator): strategy ({'uniform', 'quantile'}, default='quantile'): Strategy used to define the widths of the bins. 'uniform': All bins in each feature have identical widths. 'quantile': All bins in each - feature have the same number of points. 
Only `uniform` is supported now. + feature have the same number of points. Only `uniform` is supported. """ def fit(self, X, y=None): From d805241b7ec99fcb7579dce778d4b04778a72002 Mon Sep 17 00:00:00 2001 From: Garrett Wu <6505921+GarrettWu@users.noreply.github.com> Date: Mon, 1 Apr 2024 14:29:44 -0700 Subject: [PATCH 31/53] feat: add transformers save/load (#552) * feat: add transformers save/load * fix mypy --- bigframes/ml/base.py | 30 ++++- bigframes/ml/compose.py | 54 +------- bigframes/ml/loader.py | 11 +- bigframes/ml/preprocessing.py | 10 ++ tests/system/large/ml/test_compose.py | 1 + tests/system/large/ml/test_pipeline.py | 6 +- tests/system/small/ml/test_core.py | 2 +- tests/system/small/ml/test_llm.py | 24 ++-- tests/system/small/ml/test_preprocessing.py | 130 ++++++++++++++++---- 9 files changed, 173 insertions(+), 95 deletions(-) diff --git a/bigframes/ml/base.py b/bigframes/ml/base.py index e58ed4feef..5e7aada8de 100644 --- a/bigframes/ml/base.py +++ b/bigframes/ml/base.py @@ -178,7 +178,33 @@ def fit( return self._fit(X, y) -class Transformer(BaseEstimator): +class BaseTransformer(BaseEstimator): + """Transformer base class.""" + + def __init__(self): + self._bqml_model: Optional[core.BqmlModel] = None + + _T = TypeVar("_T", bound="BaseTransformer") + + def to_gbq(self: _T, model_name: str, replace: bool = False) -> _T: + """Save the transformer as a BigQuery model. + + Args: + model_name (str): + the name of the model. + replace (bool, default False): + whether to replace if the model already exists. Default to False. + + Returns: + Saved transformer.""" + if not self._bqml_model: + raise RuntimeError("A transformer must be fitted before it can be saved") + + new_model = self._bqml_model.copy(model_name, replace) + return new_model.session.read_gbq_model(model_name) + + +class Transformer(BaseTransformer): """A BigQuery DataFrames Transformer base class that transforms data. Also the transformers can be attached to a pipeline with a predictor.""" @@ -199,7 +225,7 @@ def fit_transform( return self.fit(X, y).transform(X) -class LabelTransformer(BaseEstimator): +class LabelTransformer(BaseTransformer): """A BigQuery DataFrames Label Transformer base class that transforms data. 
Also the transformers can be attached to a pipeline with a predictor.""" diff --git a/bigframes/ml/compose.py b/bigframes/ml/compose.py index 21cfba8e01..8638f4d182 100644 --- a/bigframes/ml/compose.py +++ b/bigframes/ml/compose.py @@ -26,21 +26,11 @@ import bigframes_vendored.sklearn.compose._column_transformer from google.cloud import bigquery -import bigframes from bigframes import constants from bigframes.core import log_adapter from bigframes.ml import base, core, globals, preprocessing, utils import bigframes.pandas as bpd -_PREPROCESSING_TYPES = Union[ - preprocessing.OneHotEncoder, - preprocessing.StandardScaler, - preprocessing.MaxAbsScaler, - preprocessing.MinMaxScaler, - preprocessing.KBinsDiscretizer, - preprocessing.LabelEncoder, -] - _BQML_TRANSFROM_TYPE_MAPPING = types.MappingProxyType( { "ML.STANDARD_SCALER": preprocessing.StandardScaler, @@ -67,7 +57,7 @@ def __init__( transformers: List[ Tuple[ str, - _PREPROCESSING_TYPES, + preprocessing.PreprocessingType, Union[str, List[str]], ] ], @@ -82,12 +72,12 @@ def __init__( @property def transformers_( self, - ) -> List[Tuple[str, _PREPROCESSING_TYPES, str,]]: + ) -> List[Tuple[str, preprocessing.PreprocessingType, str,]]: """The collection of transformers as tuples of (name, transformer, column).""" result: List[ Tuple[ str, - _PREPROCESSING_TYPES, + preprocessing.PreprocessingType, str, ] ] = [] @@ -105,15 +95,6 @@ def transformers_( return result - @classmethod - def _from_bq( - cls, session: bigframes.Session, model: bigquery.Model - ) -> ColumnTransformer: - col_transformer = cls._extract_from_bq_model(model) - col_transformer._bqml_model = core.BqmlModel(session, model) - - return col_transformer - @classmethod def _extract_from_bq_model( cls, @@ -125,7 +106,7 @@ def _extract_from_bq_model( transformers: List[ Tuple[ str, - _PREPROCESSING_TYPES, + preprocessing.PreprocessingType, Union[str, List[str]], ] ] = [] @@ -164,15 +145,7 @@ def camel_to_snake(name): def _merge( self, bq_model: bigquery.Model - ) -> Union[ - ColumnTransformer, - preprocessing.StandardScaler, - preprocessing.OneHotEncoder, - preprocessing.MaxAbsScaler, - preprocessing.MinMaxScaler, - preprocessing.KBinsDiscretizer, - preprocessing.LabelEncoder, - ]: + ) -> Union[ColumnTransformer, preprocessing.PreprocessingType,]: """Try to merge the column transformer to a simple transformer. Depends on all the columns in bq_model are transformed with the same transformer.""" transformers = self.transformers_ @@ -249,20 +222,3 @@ def transform(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: bpd.DataFrame, df[self._output_names], ) - - def to_gbq(self, model_name: str, replace: bool = False) -> ColumnTransformer: - """Save the transformer as a BigQuery model. - - Args: - model_name (str): - the name of the model. - replace (bool, default False): - whether to replace if the model already exists. Default to False. 
- - Returns: - ColumnTransformer: saved model.""" - if not self._bqml_model: - raise RuntimeError("A transformer must be fitted before it can be saved") - - new_model = self._bqml_model.copy(model_name, replace) - return new_model.session.read_gbq_model(model_name) diff --git a/bigframes/ml/loader.py b/bigframes/ml/loader.py index 508003a98d..c6e38e6534 100644 --- a/bigframes/ml/loader.py +++ b/bigframes/ml/loader.py @@ -24,6 +24,7 @@ from bigframes.ml import ( cluster, compose, + core, decomposition, ensemble, forecasting, @@ -31,6 +32,7 @@ linear_model, llm, pipeline, + preprocessing, utils, ) @@ -81,6 +83,7 @@ def from_bq( llm.PaLM2TextEmbeddingGenerator, pipeline.Pipeline, compose.ColumnTransformer, + preprocessing.PreprocessingType, ]: """Load a BQML model to BigQuery DataFrames ML. @@ -107,8 +110,12 @@ def from_bq( def _transformer_from_bq(session: bigframes.Session, bq_model: bigquery.Model): - # TODO(garrettwu): add other transformers - return compose.ColumnTransformer._from_bq(session, bq_model) + transformer = compose.ColumnTransformer._extract_from_bq_model(bq_model)._merge( + bq_model + ) + transformer._bqml_model = core.BqmlModel(session, bq_model) + + return transformer def _model_from_bq(session: bigframes.Session, bq_model: bigquery.Model): diff --git a/bigframes/ml/preprocessing.py b/bigframes/ml/preprocessing.py index 23eab42978..fd7d44f731 100644 --- a/bigframes/ml/preprocessing.py +++ b/bigframes/ml/preprocessing.py @@ -639,3 +639,13 @@ def transform(self, y: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: bpd.DataFrame, df[self._output_names], ) + + +PreprocessingType = Union[ + OneHotEncoder, + StandardScaler, + MaxAbsScaler, + MinMaxScaler, + KBinsDiscretizer, + LabelEncoder, +] diff --git a/tests/system/large/ml/test_compose.py b/tests/system/large/ml/test_compose.py index d7c49ca95a..72e016f4bb 100644 --- a/tests/system/large/ml/test_compose.py +++ b/tests/system/large/ml/test_compose.py @@ -151,3 +151,4 @@ def test_columntransformer_save_load(new_penguins_df, dataset_id): ("standard_scaler", preprocessing.StandardScaler(), "flipper_length_mm"), ] assert reloaded_transformer.transformers_ == expected + assert reloaded_transformer._bqml_model is not None diff --git a/tests/system/large/ml/test_pipeline.py b/tests/system/large/ml/test_pipeline.py index c460efa75f..c165b1e030 100644 --- a/tests/system/large/ml/test_pipeline.py +++ b/tests/system/large/ml/test_pipeline.py @@ -222,7 +222,7 @@ def test_pipeline_logistic_regression_fit_score_predict( ) -@pytest.mark.flaky(retries=2, delay=120) +@pytest.mark.flaky(retries=2) def test_pipeline_xgbregressor_fit_score_predict(session, penguins_df_default_index): """Test a supervised model with a minimal preprocessing step""" pl = pipeline.Pipeline( @@ -297,7 +297,7 @@ def test_pipeline_xgbregressor_fit_score_predict(session, penguins_df_default_in ) -@pytest.mark.flaky(retries=2, delay=120) +@pytest.mark.flaky(retries=2) def test_pipeline_random_forest_classifier_fit_score_predict( session, penguins_df_default_index ): @@ -445,7 +445,7 @@ def test_pipeline_PCA_fit_score_predict(session, penguins_df_default_index): ) -@pytest.mark.flaky(retries=2, delay=120) +@pytest.mark.flaky(retries=2) def test_pipeline_standard_scaler_kmeans_fit_score_predict( session, penguins_pandas_df_default_index ): diff --git a/tests/system/small/ml/test_core.py b/tests/system/small/ml/test_core.py index 02030cd31e..c505057d7b 100644 --- a/tests/system/small/ml/test_core.py +++ b/tests/system/small/ml/test_core.py @@ -333,7 +333,7 @@ def 
test_remote_model_predict( ) -@pytest.mark.flaky(retries=2, delay=120) +@pytest.mark.flaky(retries=2) def test_model_generate_text( bqml_palm2_text_generator_model: core.BqmlModel, llm_text_df ): diff --git a/tests/system/small/ml/test_llm.py b/tests/system/small/ml/test_llm.py index 2e135bef7b..e526d54362 100644 --- a/tests/system/small/ml/test_llm.py +++ b/tests/system/small/ml/test_llm.py @@ -49,7 +49,7 @@ def test_create_text_generator_32k_model( assert reloaded_model.connection_name == bq_connection -@pytest.mark.flaky(retries=2, delay=120) +@pytest.mark.flaky(retries=2) def test_create_text_generator_model_default_session( bq_connection, llm_text_pandas_df, bigquery_client ): @@ -76,7 +76,7 @@ def test_create_text_generator_model_default_session( assert all(series.str.len() > 20) -@pytest.mark.flaky(retries=2, delay=120) +@pytest.mark.flaky(retries=2) def test_create_text_generator_32k_model_default_session( bq_connection, llm_text_pandas_df, bigquery_client ): @@ -103,7 +103,7 @@ def test_create_text_generator_32k_model_default_session( assert all(series.str.len() > 20) -@pytest.mark.flaky(retries=2, delay=120) +@pytest.mark.flaky(retries=2) def test_create_text_generator_model_default_connection( llm_text_pandas_df, bigquery_client ): @@ -131,7 +131,7 @@ def test_create_text_generator_model_default_connection( # Marked as flaky only because BQML LLM is in preview, the service only has limited capacity, not stable enough. -@pytest.mark.flaky(retries=2, delay=120) +@pytest.mark.flaky(retries=2) def test_text_generator_predict_default_params_success( palm2_text_generator_model, llm_text_df ): @@ -142,7 +142,7 @@ def test_text_generator_predict_default_params_success( assert all(series.str.len() > 20) -@pytest.mark.flaky(retries=2, delay=120) +@pytest.mark.flaky(retries=2) def test_text_generator_predict_series_default_params_success( palm2_text_generator_model, llm_text_df ): @@ -153,7 +153,7 @@ def test_text_generator_predict_series_default_params_success( assert all(series.str.len() > 20) -@pytest.mark.flaky(retries=2, delay=120) +@pytest.mark.flaky(retries=2) def test_text_generator_predict_arbitrary_col_label_success( palm2_text_generator_model, llm_text_df ): @@ -165,7 +165,7 @@ def test_text_generator_predict_arbitrary_col_label_success( assert all(series.str.len() > 20) -@pytest.mark.flaky(retries=2, delay=120) +@pytest.mark.flaky(retries=2) def test_text_generator_predict_with_params_success( palm2_text_generator_model, llm_text_df ): @@ -255,7 +255,7 @@ def test_create_text_embedding_generator_multilingual_model_defaults(bq_connecti assert model._bqml_model is not None -@pytest.mark.flaky(retries=2, delay=120) +@pytest.mark.flaky(retries=2) def test_embedding_generator_predict_success( palm2_embedding_generator_model, llm_text_df ): @@ -267,7 +267,7 @@ def test_embedding_generator_predict_success( assert len(value) == 768 -@pytest.mark.flaky(retries=2, delay=120) +@pytest.mark.flaky(retries=2) def test_embedding_generator_multilingual_predict_success( palm2_embedding_generator_multilingual_model, llm_text_df ): @@ -279,7 +279,7 @@ def test_embedding_generator_multilingual_predict_success( assert len(value) == 768 -@pytest.mark.flaky(retries=2, delay=120) +@pytest.mark.flaky(retries=2) def test_embedding_generator_predict_series_success( palm2_embedding_generator_model, llm_text_df ): @@ -306,7 +306,7 @@ def test_create_gemini_text_generator_model( assert reloaded_model.connection_name == bq_connection -@pytest.mark.flaky(retries=2, delay=120) +@pytest.mark.flaky(retries=2) 
def test_gemini_text_generator_predict_default_params_success( gemini_text_generator_model, llm_text_df ): @@ -317,7 +317,7 @@ def test_gemini_text_generator_predict_default_params_success( assert all(series.str.len() > 20) -@pytest.mark.flaky(retries=2, delay=120) +@pytest.mark.flaky(retries=2) def test_gemini_text_generator_predict_with_params_success( gemini_text_generator_model, llm_text_df ): diff --git a/tests/system/small/ml/test_preprocessing.py b/tests/system/small/ml/test_preprocessing.py index 990795da3b..040111f38a 100644 --- a/tests/system/small/ml/test_preprocessing.py +++ b/tests/system/small/ml/test_preprocessing.py @@ -18,7 +18,7 @@ import pyarrow as pa import bigframes.features -import bigframes.ml.preprocessing +from bigframes.ml import preprocessing ONE_HOT_ENCODED_DTYPE = ( pd.ArrowDtype(pa.list_(pa.struct([("index", pa.int64()), ("value", pa.float64())]))) @@ -29,7 +29,7 @@ def test_standard_scaler_normalizes(penguins_df_default_index, new_penguins_df): # TODO(http://b/292431644): add a second test that compares output to sklearn.preprocessing.StandardScaler, when BQML's change is in prod. - scaler = bigframes.ml.preprocessing.StandardScaler() + scaler = preprocessing.StandardScaler() scaler.fit( penguins_df_default_index[ ["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"] @@ -68,7 +68,7 @@ def test_standard_scaler_normalizes(penguins_df_default_index, new_penguins_df): def test_standard_scaler_normalizeds_fit_transform(new_penguins_df): # TODO(http://b/292431644): add a second test that compares output to sklearn.preprocessing.StandardScaler, when BQML's change is in prod. - scaler = bigframes.ml.preprocessing.StandardScaler() + scaler = preprocessing.StandardScaler() result = scaler.fit_transform( new_penguins_df[["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"]] ).to_pandas() @@ -97,7 +97,7 @@ def test_standard_scaler_normalizeds_fit_transform(new_penguins_df): def test_standard_scaler_series_normalizes(penguins_df_default_index, new_penguins_df): # TODO(http://b/292431644): add a second test that compares output to sklearn.preprocessing.StandardScaler, when BQML's change is in prod. - scaler = bigframes.ml.preprocessing.StandardScaler() + scaler = preprocessing.StandardScaler() scaler.fit(penguins_df_default_index["culmen_length_mm"]) result = scaler.transform(penguins_df_default_index["culmen_length_mm"]).to_pandas() @@ -128,9 +128,22 @@ def test_standard_scaler_series_normalizes(penguins_df_default_index, new_pengui pd.testing.assert_frame_equal(result, expected, rtol=1e-3) +def test_standard_scaler_save_load(new_penguins_df, dataset_id): + transformer = preprocessing.StandardScaler() + transformer.fit( + new_penguins_df[["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"]] + ) + + reloaded_transformer = transformer.to_gbq( + f"{dataset_id}.temp_configured_model", replace=True + ) + assert isinstance(reloaded_transformer, preprocessing.StandardScaler) + assert reloaded_transformer._bqml_model is not None + + def test_max_abs_scaler_normalizes(penguins_df_default_index, new_penguins_df): # TODO(http://b/292431644): add a second test that compares output to sklearn.preprocessing.MaxAbsScaler, when BQML's change is in prod. 
- scaler = bigframes.ml.preprocessing.MaxAbsScaler() + scaler = preprocessing.MaxAbsScaler() scaler.fit( penguins_df_default_index[ ["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"] @@ -168,7 +181,7 @@ def test_max_abs_scaler_normalizes(penguins_df_default_index, new_penguins_df): def test_max_abs_scaler_normalizeds_fit_transform(new_penguins_df): - scaler = bigframes.ml.preprocessing.MaxAbsScaler() + scaler = preprocessing.MaxAbsScaler() result = scaler.fit_transform( new_penguins_df[["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"]] ).to_pandas() @@ -192,7 +205,7 @@ def test_max_abs_scaler_normalizeds_fit_transform(new_penguins_df): def test_max_abs_scaler_series_normalizes(penguins_df_default_index, new_penguins_df): - scaler = bigframes.ml.preprocessing.MaxAbsScaler() + scaler = preprocessing.MaxAbsScaler() scaler.fit(penguins_df_default_index["culmen_length_mm"]) result = scaler.transform(penguins_df_default_index["culmen_length_mm"]).to_pandas() @@ -219,8 +232,21 @@ def test_max_abs_scaler_series_normalizes(penguins_df_default_index, new_penguin pd.testing.assert_frame_equal(result, expected, rtol=1e-3) +def test_max_abs_scaler_save_load(new_penguins_df, dataset_id): + transformer = preprocessing.MaxAbsScaler() + transformer.fit( + new_penguins_df[["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"]] + ) + + reloaded_transformer = transformer.to_gbq( + f"{dataset_id}.temp_configured_model", replace=True + ) + assert isinstance(reloaded_transformer, preprocessing.MaxAbsScaler) + assert reloaded_transformer._bqml_model is not None + + def test_min_max_scaler_normalized_fit_transform(new_penguins_df): - scaler = bigframes.ml.preprocessing.MinMaxScaler() + scaler = preprocessing.MinMaxScaler() result = scaler.fit_transform( new_penguins_df[["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"]] ).to_pandas() @@ -244,7 +270,7 @@ def test_min_max_scaler_normalized_fit_transform(new_penguins_df): def test_min_max_scaler_series_normalizes(penguins_df_default_index, new_penguins_df): - scaler = bigframes.ml.preprocessing.MinMaxScaler() + scaler = preprocessing.MinMaxScaler() scaler.fit(penguins_df_default_index["culmen_length_mm"]) result = scaler.transform(penguins_df_default_index["culmen_length_mm"]).to_pandas() @@ -274,7 +300,7 @@ def test_min_max_scaler_series_normalizes(penguins_df_default_index, new_penguin def test_min_max_scaler_normalizes(penguins_df_default_index, new_penguins_df): # TODO(http://b/292431644): add a second test that compares output to sklearn.preprocessing.MinMaxScaler, when BQML's change is in prod. 
- scaler = bigframes.ml.preprocessing.MinMaxScaler() + scaler = preprocessing.MinMaxScaler() scaler.fit( penguins_df_default_index[ ["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"] @@ -312,8 +338,21 @@ def test_min_max_scaler_normalizes(penguins_df_default_index, new_penguins_df): pd.testing.assert_frame_equal(result, expected, rtol=1e-3) +def test_min_max_scaler_save_load(new_penguins_df, dataset_id): + transformer = preprocessing.MinMaxScaler() + transformer.fit( + new_penguins_df[["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"]] + ) + + reloaded_transformer = transformer.to_gbq( + f"{dataset_id}.temp_configured_model", replace=True + ) + assert isinstance(reloaded_transformer, preprocessing.MinMaxScaler) + assert reloaded_transformer._bqml_model is not None + + def test_k_bins_discretizer_normalized_fit_transform_default_params(new_penguins_df): - discretizer = bigframes.ml.preprocessing.KBinsDiscretizer(strategy="uniform") + discretizer = preprocessing.KBinsDiscretizer(strategy="uniform") result = discretizer.fit_transform( new_penguins_df[["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"]] ).to_pandas() @@ -339,7 +378,7 @@ def test_k_bins_discretizer_normalized_fit_transform_default_params(new_penguins def test_k_bins_discretizer_series_normalizes( penguins_df_default_index, new_penguins_df ): - discretizer = bigframes.ml.preprocessing.KBinsDiscretizer(strategy="uniform") + discretizer = preprocessing.KBinsDiscretizer(strategy="uniform") discretizer.fit(penguins_df_default_index["culmen_length_mm"]) result = discretizer.transform( @@ -365,7 +404,7 @@ def test_k_bins_discretizer_series_normalizes( def test_k_bins_discretizer_normalizes(penguins_df_default_index, new_penguins_df): # TODO(http://b/292431644): add a second test that compares output to sklearn.preprocessing.KBinsDiscretizer, when BQML's change is in prod. - discretizer = bigframes.ml.preprocessing.KBinsDiscretizer(strategy="uniform") + discretizer = preprocessing.KBinsDiscretizer(strategy="uniform") discretizer.fit( penguins_df_default_index[ ["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"] @@ -402,9 +441,7 @@ def test_k_bins_discretizer_normalizes_different_params( penguins_df_default_index, new_penguins_df ): # TODO(http://b/292431644): add a second test that compares output to sklearn.preprocessing.KBinsDiscretizer, when BQML's change is in prod. 
- discretizer = bigframes.ml.preprocessing.KBinsDiscretizer( - n_bins=6, strategy="uniform" - ) + discretizer = preprocessing.KBinsDiscretizer(n_bins=6, strategy="uniform") discretizer.fit( penguins_df_default_index[ ["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"] @@ -437,8 +474,23 @@ def test_k_bins_discretizer_normalizes_different_params( pd.testing.assert_frame_equal(result, expected, rtol=1e-3) +def test_k_bins_discretizer_save_load(new_penguins_df, dataset_id): + transformer = preprocessing.KBinsDiscretizer(n_bins=6, strategy="uniform") + transformer.fit( + new_penguins_df[["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"]] + ) + + reloaded_transformer = transformer.to_gbq( + f"{dataset_id}.temp_configured_model", replace=True + ) + assert isinstance(reloaded_transformer, preprocessing.KBinsDiscretizer) + assert reloaded_transformer.n_bins == transformer.n_bins + assert reloaded_transformer.strategy == transformer.strategy + assert reloaded_transformer._bqml_model is not None + + def test_one_hot_encoder_default_params(new_penguins_df): - encoder = bigframes.ml.preprocessing.OneHotEncoder() + encoder = preprocessing.OneHotEncoder() encoder.fit(new_penguins_df[["species", "sex"]]) result = encoder.transform(new_penguins_df).to_pandas() @@ -469,7 +521,7 @@ def test_one_hot_encoder_default_params(new_penguins_df): def test_one_hot_encoder_default_params_fit_transform(new_penguins_df): - encoder = bigframes.ml.preprocessing.OneHotEncoder() + encoder = preprocessing.OneHotEncoder() result = encoder.fit_transform(new_penguins_df[["species", "sex"]]).to_pandas() @@ -499,7 +551,7 @@ def test_one_hot_encoder_default_params_fit_transform(new_penguins_df): def test_one_hot_encoder_series_default_params(new_penguins_df): - encoder = bigframes.ml.preprocessing.OneHotEncoder() + encoder = preprocessing.OneHotEncoder() encoder.fit(new_penguins_df["species"]) result = encoder.transform(new_penguins_df).to_pandas() @@ -525,7 +577,7 @@ def test_one_hot_encoder_series_default_params(new_penguins_df): def test_one_hot_encoder_params(new_penguins_df): - encoder = bigframes.ml.preprocessing.OneHotEncoder("most_frequent", 100, 2) + encoder = preprocessing.OneHotEncoder("most_frequent", 100, 2) encoder.fit(new_penguins_df[["species", "sex"]]) result = encoder.transform(new_penguins_df).to_pandas() @@ -556,7 +608,7 @@ def test_one_hot_encoder_params(new_penguins_df): def test_one_hot_encoder_different_data(penguins_df_default_index, new_penguins_df): - encoder = bigframes.ml.preprocessing.OneHotEncoder() + encoder = preprocessing.OneHotEncoder() encoder.fit(penguins_df_default_index[["species", "sex"]]) result = encoder.transform(new_penguins_df).to_pandas() @@ -586,8 +638,21 @@ def test_one_hot_encoder_different_data(penguins_df_default_index, new_penguins_ pd.testing.assert_frame_equal(result, expected) +def test_one_hot_encoder_save_load(new_penguins_df, dataset_id): + transformer = preprocessing.OneHotEncoder(min_frequency=1, max_categories=10) + transformer.fit(new_penguins_df[["species", "sex"]]) + + reloaded_transformer = transformer.to_gbq( + f"{dataset_id}.temp_configured_model", replace=True + ) + assert isinstance(reloaded_transformer, preprocessing.OneHotEncoder) + assert reloaded_transformer.min_frequency == transformer.min_frequency + assert reloaded_transformer.max_categories == transformer.max_categories + assert reloaded_transformer._bqml_model is not None + + def test_label_encoder_default_params(new_penguins_df): - encoder = bigframes.ml.preprocessing.LabelEncoder() + 
encoder = preprocessing.LabelEncoder() encoder.fit(new_penguins_df["species"]) result = encoder.transform(new_penguins_df["species"]).to_pandas() @@ -613,7 +678,7 @@ def test_label_encoder_default_params(new_penguins_df): def test_label_encoder_default_params_fit_transform(new_penguins_df): - encoder = bigframes.ml.preprocessing.LabelEncoder() + encoder = preprocessing.LabelEncoder() result = encoder.fit_transform(new_penguins_df[["species"]]).to_pandas() @@ -638,7 +703,7 @@ def test_label_encoder_default_params_fit_transform(new_penguins_df): def test_label_encoder_series_default_params(new_penguins_df): - encoder = bigframes.ml.preprocessing.LabelEncoder() + encoder = preprocessing.LabelEncoder() encoder.fit(new_penguins_df["species"]) result = encoder.transform(new_penguins_df).to_pandas() @@ -664,7 +729,7 @@ def test_label_encoder_series_default_params(new_penguins_df): def test_label_encoder_params(new_penguins_df): - encoder = bigframes.ml.preprocessing.LabelEncoder(100, 2) + encoder = preprocessing.LabelEncoder(100, 2) encoder.fit(new_penguins_df[["species"]]) result = encoder.transform(new_penguins_df).to_pandas() @@ -690,7 +755,7 @@ def test_label_encoder_params(new_penguins_df): def test_label_encoder_different_data(penguins_df_default_index, new_penguins_df): - encoder = bigframes.ml.preprocessing.LabelEncoder() + encoder = preprocessing.LabelEncoder() encoder.fit(penguins_df_default_index[["species"]]) result = encoder.transform(new_penguins_df).to_pandas() @@ -715,4 +780,17 @@ def test_label_encoder_different_data(penguins_df_default_index, new_penguins_df pd.testing.assert_frame_equal(result, expected) +def test_label_encoder_save_load(new_penguins_df, dataset_id): + transformer = preprocessing.LabelEncoder(min_frequency=1, max_categories=10) + transformer.fit(new_penguins_df[["species"]]) + + reloaded_transformer = transformer.to_gbq( + f"{dataset_id}.temp_configured_model", replace=True + ) + assert isinstance(reloaded_transformer, preprocessing.LabelEncoder) + assert reloaded_transformer.min_frequency == transformer.min_frequency + assert reloaded_transformer.max_categories == transformer.max_categories + assert reloaded_transformer._bqml_model is not None + + # TODO(garrettwu): add OneHotEncoder tests to compare with sklearn. From 483390830ae0ee2fe0fb47dc7d2aea143b2dc7d8 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Mon, 1 Apr 2024 16:02:17 -0700 Subject: [PATCH 32/53] fix: Respect hard stack size limit and swallow limit change exception. (#558) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes # 🦕 --- bigframes/pandas/__init__.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py index b6476c5eb8..fc008f36e5 100644 --- a/bigframes/pandas/__init__.py +++ b/bigframes/pandas/__init__.py @@ -714,9 +714,13 @@ def to_datetime( # which the applicable limit is now hard coded. 
See: # https://github.com/python/cpython/issues/112282 sys.setrecursionlimit(max(10000000, sys.getrecursionlimit())) -resource.setrlimit( - resource.RLIMIT_STACK, (resource.RLIM_INFINITY, resource.RLIM_INFINITY) -) + +soft_limit, hard_limit = resource.getrlimit(resource.RLIMIT_STACK) +if soft_limit < hard_limit or hard_limit == resource.RLIM_INFINITY: + try: + resource.setrlimit(resource.RLIMIT_STACK, (hard_limit, hard_limit)) + except Exception: + pass # Use __all__ to let type checkers know what is part of the public API. __all___ = [ From 4995c0046265463bc5c502cbeb34c7632d5a255e Mon Sep 17 00:00:00 2001 From: Garrett Wu <6505921+GarrettWu@users.noreply.github.com> Date: Mon, 1 Apr 2024 20:47:19 -0700 Subject: [PATCH 33/53] fix: rename PaLM2TextEmbeddingGenerator.predict output columns to be backward compatible (#561) --- bigframes/ml/llm.py | 9 +- .../bq_dataframes_llm_kmeans.ipynb | 1066 ++++++++--------- tests/system/small/ml/test_llm.py | 12 +- 3 files changed, 527 insertions(+), 560 deletions(-) diff --git a/bigframes/ml/llm.py b/bigframes/ml/llm.py index 031656f1d8..ffaeb399bb 100644 --- a/bigframes/ml/llm.py +++ b/bigframes/ml/llm.py @@ -44,7 +44,7 @@ _GEMINI_PRO_ENDPOINT = "gemini-pro" _ML_GENERATE_TEXT_STATUS = "ml_generate_text_status" -_ML_EMBED_TEXT_STATUS = "ml_generate_embedding_status" +_ML_EMBED_TEXT_STATUS = "ml_embed_text_status" @log_adapter.class_logger @@ -390,6 +390,13 @@ def predict(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: } df = self._bqml_model.generate_embedding(X, options) + df = df.rename( + columns={ + "ml_generate_embedding_result": "text_embedding", + "ml_generate_embedding_statistics": "statistics", + "ml_generate_embedding_status": _ML_EMBED_TEXT_STATUS, + } + ) if (df[_ML_EMBED_TEXT_STATUS] != "").any(): warnings.warn( diff --git a/notebooks/generative_ai/bq_dataframes_llm_kmeans.ipynb b/notebooks/generative_ai/bq_dataframes_llm_kmeans.ipynb index 2c6d109ba8..ab6fd93f9a 100644 --- a/notebooks/generative_ai/bq_dataframes_llm_kmeans.ipynb +++ b/notebooks/generative_ai/bq_dataframes_llm_kmeans.ipynb @@ -289,7 +289,7 @@ { "data": { "text/html": [ - "Query job 030e5d08-f690-47e4-b7cc-342731245575 is DONE. 2.3 GB processed. Open Job" + "Query job 952b852e-7cf0-493d-8258-fe60daf45ebf is DONE. 2.3 GB processed. Open Job" ], "text/plain": [ "" @@ -301,7 +301,7 @@ { "data": { "text/html": [ - "Query job a9c5f416-c5d2-4209-b639-bccb81a25d7e is DONE. 58.8 MB processed. Open Job" + "Query job f9939880-6c66-4da5-9e90-daf8d9a9d83c is DONE. 50.3 MB processed. Open Job" ], "text/plain": [ "" @@ -336,24 +336,24 @@ " \n", " \n", " \n", - " 1053364\n", - " My Macy 's American Express account was taken ...\n", + " 1799560\n", + " Thursday, XX/XX/XXXX, unauthorized charges wer...\n", " \n", " \n", - " 1053757\n", - " I am a victim of identity theft. 
The informati...\n", + " 1800272\n", + " The credit reporting company is reporting inac...\n", " \n", " \n", - " 1053784\n", - " In XXXX 2016, Amex took out $ XXXX.+ unauthori...\n", + " 1800409\n", + " In accordance with the Fair Credit Reporting a...\n", " \n", " \n", - " 1054237\n", - " I am not for sure the exact date of my loan it...\n", + " 1800550\n", + " I told the credit bureaus to \" investigate eve...\n", " \n", " \n", - " 1054244\n", - " I entered a consumer credit transaction with t...\n", + " 1800818\n", + " Im writing in reference regarding XXXXXXXX XXX...\n", " \n", " \n", "\n", @@ -361,11 +361,11 @@ ], "text/plain": [ " consumer_complaint_narrative\n", - "1053364 My Macy 's American Express account was taken ...\n", - "1053757 I am a victim of identity theft. The informati...\n", - "1053784 In XXXX 2016, Amex took out $ XXXX.+ unauthori...\n", - "1054237 I am not for sure the exact date of my loan it...\n", - "1054244 I entered a consumer credit transaction with t..." + "1799560 Thursday, XX/XX/XXXX, unauthorized charges wer...\n", + "1800272 The credit reporting company is reporting inac...\n", + "1800409 In accordance with the Fair Credit Reporting a...\n", + "1800550 I told the credit bureaus to \" investigate eve...\n", + "1800818 Im writing in reference regarding XXXXXXXX XXX..." ] }, "execution_count": 7, @@ -418,7 +418,7 @@ { "data": { "text/html": [ - "Query job 77eee871-31eb-4939-a015-f5505c94786e is DONE. 0 Bytes processed. Open Job" + "Query job e3ff0549-f0ee-4508-bb4f-beea14bf54f5 is DONE. 0 Bytes processed. Open Job" ], "text/plain": [ "" @@ -444,7 +444,7 @@ { "data": { "text/html": [ - "Query job 63cdd004-21b6-41bf-8876-aa646f1f268e is DONE. 1.3 GB processed. Open Job" + "Query job 5b3d8f8c-9e8d-4378-b4df-e3328300f17a is DONE. 1.3 GB processed. Open Job" ], "text/plain": [ "" @@ -456,7 +456,7 @@ { "data": { "text/html": [ - "Query job cda12546-9931-48f6-8b22-74a9ab85fa28 is DONE. 80.0 kB processed. Open Job" + "Query job f35c2982-4953-45fa-84bd-d0ce04e13c5e is DONE. 80.0 kB processed. Open Job" ], "text/plain": [ "" @@ -468,7 +468,7 @@ { "data": { "text/html": [ - "Query job 759a13c5-c02f-4ae8-9b22-d7ef423ffe8d is DONE. 20.0 kB processed. Open Job" + "Query job b70c55a3-b18b-4313-86b0-31f5b3b570fb is DONE. 20.0 kB processed. Open Job" ], "text/plain": [ "" @@ -480,7 +480,19 @@ { "data": { "text/html": [ - "Query job 1bad8ef3-8103-4a98-bec4-699d97673b9a is DONE. 72.0 MB processed. Open Job" + "Query job 2b2cfd9f-c713-4411-a3ca-1916cec84ff0 is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 09cadae1-1c66-43cf-a76f-7495b0123006 is DONE. 71.9 MB processed. 
Open Job" ], "text/plain": [ "" @@ -510,188 +522,187 @@ " \n", " \n", " \n", - " ml_generate_embedding_result\n", - " ml_generate_embedding_statistics\n", - " ml_generate_embedding_status\n", + " text_embedding\n", + " statistics\n", + " ml_embed_text_status\n", " content\n", " \n", " \n", " \n", " \n", - " 357\n", - " [ 1.33585772e-02 -3.76597494e-02 -6.14452176e-...\n", - " {\"token_count\":306,\"truncated\":false}\n", + " 782\n", + " [ 2.78223325e-02 -1.71949025e-02 -5.01214415e-...\n", + " {\"token_count\":121,\"truncated\":false}\n", " \n", - " I decided to try XXXX services for my wife and...\n", + " I 've sent multiple letters to this agency abo...\n", " \n", " \n", - " 428\n", - " [ 3.10036819e-04 -3.82593311e-02 -3.41922641e-...\n", - " {\"token_count\":134,\"truncated\":false}\n", + " 795\n", + " [ 1.39164589e-02 -5.08313216e-02 -4.53360938e-...\n", + " {\"token_count\":141,\"truncated\":false}\n", " \n", - " XXXX I went to the bank in question ( XXXX XXX...\n", + " I receive social security XXXX funds in my XXX...\n", " \n", " \n", - " 1319\n", - " [ 1.97481886e-02 -1.97448786e-02 -5.13443351e-...\n", - " {\"token_count\":215,\"truncated\":false}\n", + " 861\n", + " [ 7.80681521e-03 -3.23560014e-02 -6.76454604e-...\n", + " {\"token_count\":160,\"truncated\":false}\n", " \n", - " I currently have a home loan with my ex husban...\n", + " Hello, My name is XXXX XXXX XXXX. I have a pro...\n", " \n", " \n", - " 1993\n", - " [ 9.83821880e-03 -6.55664057e-02 -5.46210706e-...\n", - " {\"token_count\":536,\"truncated\":false}\n", + " 1103\n", + " [ 4.18044440e-02 -4.28444222e-02 -6.26875088e-...\n", + " {\"token_count\":31,\"truncated\":false}\n", " \n", - " NOT MY ACCOUNT, NOT AN AUTHORIZED USER {$1800....\n", + " The debt occurred more than 7 years in the pas...\n", " \n", " \n", - " 1997\n", - " [ 0.03145148 -0.01011822 -0.02316323 -0.025078...\n", - " {\"token_count\":123,\"truncated\":false}\n", + " 1241\n", + " [ 7.74183637e-03 -6.50701150e-02 -2.13856809e-...\n", + " {\"token_count\":23,\"truncated\":false}\n", " \n", - " After a while the payments became harder and h...\n", + " UNAUTHORIZED CREDIT REPORTING NO ACCOUNT TO VA...\n", " \n", " \n", - " 2469\n", - " [ 4.74590808e-03 -4.56819348e-02 -2.49751769e-...\n", - " {\"token_count\":60,\"truncated\":false}\n", + " 1729\n", + " [ 3.15782428e-02 -1.23979878e-02 -3.70296165e-...\n", + " {\"token_count\":382,\"truncated\":false}\n", " \n", - " In the course of my student loan, I have been ...\n", + " XXXX on XXXX XX/XX/2021 I have Mr. 
Cooper mort...\n", " \n", " \n", - " 2624\n", - " [ 3.91883589e-03 -3.26644145e-02 -7.10378587e-...\n", - " {\"token_count\":254,\"truncated\":false}\n", + " 2167\n", + " [ 9.87241510e-03 -1.81103535e-02 -4.17162031e-...\n", + " {\"token_count\":556,\"truncated\":false}\n", " \n", - " In accordance with the Fair Credit Reporting A...\n", + " This is the third such complaint I have submit...\n", " \n", " \n", - " 2832\n", - " [ 8.35181400e-03 -2.91643552e-02 -4.30776961e-...\n", - " {\"token_count\":79,\"truncated\":false}\n", + " 2219\n", + " [ 2.56749280e-02 -4.92163002e-02 -5.67202382e-...\n", + " {\"token_count\":196,\"truncated\":false}\n", " \n", - " LVNV FUNDING LLC is continually placing a coll...\n", + " Found and add online for a Prepaid Credit card...\n", " \n", " \n", - " 3328\n", - " [ 2.71253809e-02 -1.77491009e-02 -5.32273464e-...\n", - " {\"token_count\":156,\"truncated\":false}\n", + " 2392\n", + " [ 2.34611966e-02 -4.74611111e-02 -3.59710641e-...\n", + " {\"token_count\":641,\"truncated\":false}\n", " \n", - " On XX/XX/2020 I sent a letter regarding inaccu...\n", + " I am furnishing this complaint against Fed Loa...\n", " \n", " \n", - " 3650\n", - " [-6.10093866e-03 -5.93599863e-02 -8.04531425e-...\n", - " {\"token_count\":175,\"truncated\":false}\n", + " 2528\n", + " [ 1.90760177e-02 -4.90266569e-02 -5.60806654e-...\n", + " {\"token_count\":176,\"truncated\":false}\n", " \n", - " Over a year and a half ago we started the proc...\n", + " Despite multiple written requests, the unverif...\n", " \n", " \n", - " 3860\n", - " [ 5.84836192e-02 -2.43354496e-03 -5.57337068e-...\n", - " {\"token_count\":1267,\"truncated\":false}\n", + " 2737\n", + " [ 1.81887485e-02 -8.74284655e-03 -2.73009986e-...\n", + " {\"token_count\":230,\"truncated\":false}\n", " \n", - " The issue is 26 late payments on me and my wif...\n", + " After unsatisfying communication in the messag...\n", " \n", " \n", - " 4464\n", - " [ 6.05084226e-02 -3.21578234e-02 -7.51668587e-...\n", - " {\"token_count\":906,\"truncated\":false}\n", + " 2859\n", + " [ 3.52482982e-02 -3.30757573e-02 -4.48422395e-...\n", + " {\"token_count\":238,\"truncated\":false}\n", " \n", - " I purchased as replacement for a lost XXXX XXX...\n", + " Good Morning. My name is XXXX XXXX. My account...\n", " \n", " \n", - " 4470\n", - " [ 1.28689921e-02 -3.25881056e-02 -6.53645024e-...\n", - " {\"token_count\":200,\"truncated\":false}\n", + " 3439\n", + " [ 3.40348878e-03 -2.72301212e-02 -2.03482248e-...\n", + " {\"token_count\":197,\"truncated\":false}\n", " \n", - " in accordance with the Fair Credit Reporting a...\n", + " I have ongoing disputes that are preventing me...\n", " \n", " \n", - " 4567\n", - " [-5.49167022e-03 -3.84587422e-02 -8.56091827e-...\n", - " {\"token_count\":110,\"truncated\":false}\n", + " 3738\n", + " [ 0.01422119 -0.01114973 -0.04438976 -0.024421...\n", + " {\"token_count\":160,\"truncated\":false}\n", " \n", - " I have submitted multiple disputes through the...\n", + " I had a loan with national Collegiate Trust. 
i...\n", " \n", " \n", - " 4713\n", - " [ 2.68485844e-02 -3.46762352e-02 -4.59849052e-...\n", - " {\"token_count\":549,\"truncated\":false}\n", + " 3805\n", + " [ 1.08179580e-02 -3.44337188e-02 -5.08812033e-...\n", + " {\"token_count\":477,\"truncated\":false}\n", " \n", - " While shopping for furniture for my home I ope...\n", + " Hi I am submitting this XXXX XXXX this isn't a...\n", " \n", " \n", - " 5181\n", - " [ 2.05754172e-02 -3.83999050e-02 -9.29225236e-...\n", - " {\"token_count\":77,\"truncated\":false}\n", + " 3915\n", + " [-7.23852217e-03 -4.69538383e-02 -5.60489520e-...\n", + " {\"token_count\":116,\"truncated\":false}\n", " \n", - " I had opened a Wells Fargo checking account wi...\n", + " portfolio is showin on my credit report with a...\n", " \n", " \n", - " 5511\n", - " [-0.00217485 -0.04031368 -0.06604777 -0.052006...\n", - " {\"token_count\":262,\"truncated\":false}\n", + " 3917\n", + " [-8.92711710e-03 -4.49132621e-02 -4.29662578e-...\n", + " {\"token_count\":71,\"truncated\":false}\n", " \n", - " I recently disputed ( see attached letter ) wi...\n", + " the company shared my information with another...\n", " \n", " \n", - " 5888\n", - " [-8.15972779e-03 -3.46563384e-02 -5.91776446e-...\n", - " {\"token_count\":176,\"truncated\":false}\n", + " 4281\n", + " [-1.69487391e-02 -1.89835522e-02 -3.80971469e-...\n", + " {\"token_count\":130,\"truncated\":false}\n", " \n", - " XXXX XXXX XXXX XXXX \n", - "I have disputed this acco...\n", + " I tried to submit a teacher loan forgiveness a...\n", " \n", " \n", - " 6299\n", - " [ 4.80043218e-02 -4.13420722e-02 -6.12363108e-...\n", - " {\"token_count\":151,\"truncated\":false}\n", + " 4470\n", + " [ 1.28689921e-02 -3.25881056e-02 -6.53645024e-...\n", + " {\"token_count\":200,\"truncated\":false}\n", " \n", - " XXXX ; XXXX and Transunion are reporting ( 30 ...\n", + " in accordance with the Fair Credit Reporting a...\n", " \n", " \n", - " 7143\n", - " [ 4.39200476e-02 -3.04005221e-02 -3.47866341e-...\n", - " {\"token_count\":234,\"truncated\":false}\n", + " 4915\n", + " [ 5.19403480e-02 -7.32436478e-02 -4.60561663e-...\n", + " {\"token_count\":23,\"truncated\":false}\n", " \n", - " My Macys account is due on the first of every ...\n", + " XXXX XXXX did not give me a receipt or a copy ...\n", " \n", " \n", - " 7219\n", - " [ 1.00224940e-02 -3.79302073e-03 -3.41785327e-...\n", - " {\"token_count\":26,\"truncated\":false}\n", + " 4928\n", + " [-4.43694415e-03 -3.66776163e-04 -9.08496231e-...\n", + " {\"token_count\":83,\"truncated\":false}\n", " \n", - " Keep getting letters and calls from collection...\n", + " This company has filed a civil suit during a g...\n", " \n", " \n", - " 7574\n", - " [-0.00149564 -0.06619431 -0.05084481 -0.048579...\n", - " {\"token_count\":129,\"truncated\":false}\n", + " 5338\n", + " [ 2.19908613e-03 -3.93951498e-02 -6.52823672e-...\n", + " {\"token_count\":1279,\"truncated\":false}\n", " \n", - " On XXXX I was on the XXXX app and there was a ...\n", + " My credit report contains errors that is keepi...\n", " \n", " \n", - " 8759\n", - " [ 0.01501553 -0.03575936 -0.050562 -0.034884...\n", - " {\"token_count\":501,\"truncated\":false}\n", + " 5582\n", + " [ 2.86326781e-02 -4.89189997e-02 -8.68150592e-...\n", + " {\"token_count\":396,\"truncated\":false}\n", " \n", - " Obviously I've been a victim of fraud, therefo...\n", + " Coast Professional, XXXX, LA contacted me by m...\n", " \n", " \n", - " 9700\n", - " [ 1.01501048e-02 -2.80565154e-02 -4.05892394e-...\n", - " {\"token_count\":48,\"truncated\":false}\n", 
+ " 6386\n", + " [ 3.33276950e-02 1.53224478e-02 -1.89354066e-...\n", + " {\"token_count\":79,\"truncated\":false}\n", " \n", - " The following item have not been properly inve...\n", + " Cares act refund requested in XXXX, called mul...\n", " \n", " \n", - " 9822\n", - " [ 2.95880195e-02 1.65440738e-02 -3.33247967e-...\n", - " {\"token_count\":2373,\"truncated\":true}\n", + " 6956\n", + " [ 1.47060463e-02 -3.36431377e-02 -6.56675845e-...\n", + " {\"token_count\":194,\"truncated\":false}\n", " \n", - " During the housing market crash I went through...\n", + " n accordance with the Fair Credit Reporting ac...\n", " \n", " \n", "\n", @@ -699,87 +710,86 @@ "[10000 rows x 4 columns in total]" ], "text/plain": [ - " ml_generate_embedding_result \\\n", - "357 [ 1.33585772e-02 -3.76597494e-02 -6.14452176e-... \n", - "428 [ 3.10036819e-04 -3.82593311e-02 -3.41922641e-... \n", - "1319 [ 1.97481886e-02 -1.97448786e-02 -5.13443351e-... \n", - "1993 [ 9.83821880e-03 -6.55664057e-02 -5.46210706e-... \n", - "1997 [ 0.03145148 -0.01011822 -0.02316323 -0.025078... \n", - "2469 [ 4.74590808e-03 -4.56819348e-02 -2.49751769e-... \n", - "2624 [ 3.91883589e-03 -3.26644145e-02 -7.10378587e-... \n", - "2832 [ 8.35181400e-03 -2.91643552e-02 -4.30776961e-... \n", - "3328 [ 2.71253809e-02 -1.77491009e-02 -5.32273464e-... \n", - "3650 [-6.10093866e-03 -5.93599863e-02 -8.04531425e-... \n", - "3860 [ 5.84836192e-02 -2.43354496e-03 -5.57337068e-... \n", - "4464 [ 6.05084226e-02 -3.21578234e-02 -7.51668587e-... \n", + " text_embedding \\\n", + "782 [ 2.78223325e-02 -1.71949025e-02 -5.01214415e-... \n", + "795 [ 1.39164589e-02 -5.08313216e-02 -4.53360938e-... \n", + "861 [ 7.80681521e-03 -3.23560014e-02 -6.76454604e-... \n", + "1103 [ 4.18044440e-02 -4.28444222e-02 -6.26875088e-... \n", + "1241 [ 7.74183637e-03 -6.50701150e-02 -2.13856809e-... \n", + "1729 [ 3.15782428e-02 -1.23979878e-02 -3.70296165e-... \n", + "2167 [ 9.87241510e-03 -1.81103535e-02 -4.17162031e-... \n", + "2219 [ 2.56749280e-02 -4.92163002e-02 -5.67202382e-... \n", + "2392 [ 2.34611966e-02 -4.74611111e-02 -3.59710641e-... \n", + "2528 [ 1.90760177e-02 -4.90266569e-02 -5.60806654e-... \n", + "2737 [ 1.81887485e-02 -8.74284655e-03 -2.73009986e-... \n", + "2859 [ 3.52482982e-02 -3.30757573e-02 -4.48422395e-... \n", + "3439 [ 3.40348878e-03 -2.72301212e-02 -2.03482248e-... \n", + "3738 [ 0.01422119 -0.01114973 -0.04438976 -0.024421... \n", + "3805 [ 1.08179580e-02 -3.44337188e-02 -5.08812033e-... \n", + "3915 [-7.23852217e-03 -4.69538383e-02 -5.60489520e-... \n", + "3917 [-8.92711710e-03 -4.49132621e-02 -4.29662578e-... \n", + "4281 [-1.69487391e-02 -1.89835522e-02 -3.80971469e-... \n", "4470 [ 1.28689921e-02 -3.25881056e-02 -6.53645024e-... \n", - "4567 [-5.49167022e-03 -3.84587422e-02 -8.56091827e-... \n", - "4713 [ 2.68485844e-02 -3.46762352e-02 -4.59849052e-... \n", - "5181 [ 2.05754172e-02 -3.83999050e-02 -9.29225236e-... \n", - "5511 [-0.00217485 -0.04031368 -0.06604777 -0.052006... \n", - "5888 [-8.15972779e-03 -3.46563384e-02 -5.91776446e-... \n", - "6299 [ 4.80043218e-02 -4.13420722e-02 -6.12363108e-... \n", - "7143 [ 4.39200476e-02 -3.04005221e-02 -3.47866341e-... \n", - "7219 [ 1.00224940e-02 -3.79302073e-03 -3.41785327e-... \n", - "7574 [-0.00149564 -0.06619431 -0.05084481 -0.048579... \n", - "8759 [ 0.01501553 -0.03575936 -0.050562 -0.034884... \n", - "9700 [ 1.01501048e-02 -2.80565154e-02 -4.05892394e-... \n", - "9822 [ 2.95880195e-02 1.65440738e-02 -3.33247967e-... \n", + "4915 [ 5.19403480e-02 -7.32436478e-02 -4.60561663e-... 
\n", + "4928 [-4.43694415e-03 -3.66776163e-04 -9.08496231e-... \n", + "5338 [ 2.19908613e-03 -3.93951498e-02 -6.52823672e-... \n", + "5582 [ 2.86326781e-02 -4.89189997e-02 -8.68150592e-... \n", + "6386 [ 3.33276950e-02 1.53224478e-02 -1.89354066e-... \n", + "6956 [ 1.47060463e-02 -3.36431377e-02 -6.56675845e-... \n", "\n", - " ml_generate_embedding_statistics ml_generate_embedding_status \\\n", - "357 {\"token_count\":306,\"truncated\":false} \n", - "428 {\"token_count\":134,\"truncated\":false} \n", - "1319 {\"token_count\":215,\"truncated\":false} \n", - "1993 {\"token_count\":536,\"truncated\":false} \n", - "1997 {\"token_count\":123,\"truncated\":false} \n", - "2469 {\"token_count\":60,\"truncated\":false} \n", - "2624 {\"token_count\":254,\"truncated\":false} \n", - "2832 {\"token_count\":79,\"truncated\":false} \n", - "3328 {\"token_count\":156,\"truncated\":false} \n", - "3650 {\"token_count\":175,\"truncated\":false} \n", - "3860 {\"token_count\":1267,\"truncated\":false} \n", - "4464 {\"token_count\":906,\"truncated\":false} \n", - "4470 {\"token_count\":200,\"truncated\":false} \n", - "4567 {\"token_count\":110,\"truncated\":false} \n", - "4713 {\"token_count\":549,\"truncated\":false} \n", - "5181 {\"token_count\":77,\"truncated\":false} \n", - "5511 {\"token_count\":262,\"truncated\":false} \n", - "5888 {\"token_count\":176,\"truncated\":false} \n", - "6299 {\"token_count\":151,\"truncated\":false} \n", - "7143 {\"token_count\":234,\"truncated\":false} \n", - "7219 {\"token_count\":26,\"truncated\":false} \n", - "7574 {\"token_count\":129,\"truncated\":false} \n", - "8759 {\"token_count\":501,\"truncated\":false} \n", - "9700 {\"token_count\":48,\"truncated\":false} \n", - "9822 {\"token_count\":2373,\"truncated\":true} \n", + " statistics ml_embed_text_status \\\n", + "782 {\"token_count\":121,\"truncated\":false} \n", + "795 {\"token_count\":141,\"truncated\":false} \n", + "861 {\"token_count\":160,\"truncated\":false} \n", + "1103 {\"token_count\":31,\"truncated\":false} \n", + "1241 {\"token_count\":23,\"truncated\":false} \n", + "1729 {\"token_count\":382,\"truncated\":false} \n", + "2167 {\"token_count\":556,\"truncated\":false} \n", + "2219 {\"token_count\":196,\"truncated\":false} \n", + "2392 {\"token_count\":641,\"truncated\":false} \n", + "2528 {\"token_count\":176,\"truncated\":false} \n", + "2737 {\"token_count\":230,\"truncated\":false} \n", + "2859 {\"token_count\":238,\"truncated\":false} \n", + "3439 {\"token_count\":197,\"truncated\":false} \n", + "3738 {\"token_count\":160,\"truncated\":false} \n", + "3805 {\"token_count\":477,\"truncated\":false} \n", + "3915 {\"token_count\":116,\"truncated\":false} \n", + "3917 {\"token_count\":71,\"truncated\":false} \n", + "4281 {\"token_count\":130,\"truncated\":false} \n", + "4470 {\"token_count\":200,\"truncated\":false} \n", + "4915 {\"token_count\":23,\"truncated\":false} \n", + "4928 {\"token_count\":83,\"truncated\":false} \n", + "5338 {\"token_count\":1279,\"truncated\":false} \n", + "5582 {\"token_count\":396,\"truncated\":false} \n", + "6386 {\"token_count\":79,\"truncated\":false} \n", + "6956 {\"token_count\":194,\"truncated\":false} \n", "\n", " content \n", - "357 I decided to try XXXX services for my wife and... \n", - "428 XXXX I went to the bank in question ( XXXX XXX... \n", - "1319 I currently have a home loan with my ex husban... \n", - "1993 NOT MY ACCOUNT, NOT AN AUTHORIZED USER {$1800.... \n", - "1997 After a while the payments became harder and h... 
\n", - "2469 In the course of my student loan, I have been ... \n", - "2624 In accordance with the Fair Credit Reporting A... \n", - "2832 LVNV FUNDING LLC is continually placing a coll... \n", - "3328 On XX/XX/2020 I sent a letter regarding inaccu... \n", - "3650 Over a year and a half ago we started the proc... \n", - "3860 The issue is 26 late payments on me and my wif... \n", - "4464 I purchased as replacement for a lost XXXX XXX... \n", + "782 I 've sent multiple letters to this agency abo... \n", + "795 I receive social security XXXX funds in my XXX... \n", + "861 Hello, My name is XXXX XXXX XXXX. I have a pro... \n", + "1103 The debt occurred more than 7 years in the pas... \n", + "1241 UNAUTHORIZED CREDIT REPORTING NO ACCOUNT TO VA... \n", + "1729 XXXX on XXXX XX/XX/2021 I have Mr. Cooper mort... \n", + "2167 This is the third such complaint I have submit... \n", + "2219 Found and add online for a Prepaid Credit card... \n", + "2392 I am furnishing this complaint against Fed Loa... \n", + "2528 Despite multiple written requests, the unverif... \n", + "2737 After unsatisfying communication in the messag... \n", + "2859 Good Morning. My name is XXXX XXXX. My account... \n", + "3439 I have ongoing disputes that are preventing me... \n", + "3738 I had a loan with national Collegiate Trust. i... \n", + "3805 Hi I am submitting this XXXX XXXX this isn't a... \n", + "3915 portfolio is showin on my credit report with a... \n", + "3917 the company shared my information with another... \n", + "4281 I tried to submit a teacher loan forgiveness a... \n", "4470 in accordance with the Fair Credit Reporting a... \n", - "4567 I have submitted multiple disputes through the... \n", - "4713 While shopping for furniture for my home I ope... \n", - "5181 I had opened a Wells Fargo checking account wi... \n", - "5511 I recently disputed ( see attached letter ) wi... \n", - "5888 XXXX XXXX XXXX XXXX \n", - "I have disputed this acco... \n", - "6299 XXXX ; XXXX and Transunion are reporting ( 30 ... \n", - "7143 My Macys account is due on the first of every ... \n", - "7219 Keep getting letters and calls from collection... \n", - "7574 On XXXX I was on the XXXX app and there was a ... \n", - "8759 Obviously I've been a victim of fraud, therefo... \n", - "9700 The following item have not been properly inve... \n", - "9822 During the housing market crash I went through... \n", + "4915 XXXX XXXX did not give me a receipt or a copy ... \n", + "4928 This company has filed a civil suit during a g... \n", + "5338 My credit report contains errors that is keepi... \n", + "5582 Coast Professional, XXXX, LA contacted me by m... \n", + "6386 Cares act refund requested in XXXX, called mul... \n", + "6956 n accordance with the Fair Credit Reporting ac... \n", "...\n", "\n", "[10000 rows x 4 columns]" @@ -812,7 +822,19 @@ { "data": { "text/html": [ - "Query job b4594edf-80e5-4476-ac06-b799001f4cb0 is DONE. 72.0 MB processed. Open Job" + "Query job 2c99b34a-1956-4de7-8330-898f1f25560b is DONE. 71.9 MB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 3ffed5f8-935a-4a3f-a560-6416445e4868 is DONE. 0 Bytes processed. Open Job" ], "text/plain": [ "" @@ -824,7 +846,7 @@ { "data": { "text/html": [ - "Query job 417e806a-2574-4b1b-8276-a95fa2df56e1 is DONE. 72.5 MB processed. Open Job" + "Query job 7b55783a-6d8f-41b9-b404-73253140029a is DONE. 72.3 MB processed. 
Open Job" ], "text/plain": [ "" @@ -854,188 +876,187 @@ " \n", " \n", " \n", - " ml_generate_embedding_result\n", - " ml_generate_embedding_statistics\n", - " ml_generate_embedding_status\n", + " text_embedding\n", + " statistics\n", + " ml_embed_text_status\n", " content\n", " \n", " \n", " \n", " \n", - " 357\n", - " [ 1.33585772e-02 -3.76597494e-02 -6.14452176e-...\n", - " {\"token_count\":306,\"truncated\":false}\n", + " 782\n", + " [ 2.78223325e-02 -1.71949025e-02 -5.01214415e-...\n", + " {\"token_count\":121,\"truncated\":false}\n", " \n", - " I decided to try XXXX services for my wife and...\n", + " I 've sent multiple letters to this agency abo...\n", " \n", " \n", - " 428\n", - " [ 3.10036819e-04 -3.82593311e-02 -3.41922641e-...\n", - " {\"token_count\":134,\"truncated\":false}\n", + " 795\n", + " [ 1.39164589e-02 -5.08313216e-02 -4.53360938e-...\n", + " {\"token_count\":141,\"truncated\":false}\n", " \n", - " XXXX I went to the bank in question ( XXXX XXX...\n", + " I receive social security XXXX funds in my XXX...\n", " \n", " \n", - " 1319\n", - " [ 1.97481886e-02 -1.97448786e-02 -5.13443351e-...\n", - " {\"token_count\":215,\"truncated\":false}\n", + " 861\n", + " [ 7.80681521e-03 -3.23560014e-02 -6.76454604e-...\n", + " {\"token_count\":160,\"truncated\":false}\n", " \n", - " I currently have a home loan with my ex husban...\n", + " Hello, My name is XXXX XXXX XXXX. I have a pro...\n", " \n", " \n", - " 1993\n", - " [ 9.83821880e-03 -6.55664057e-02 -5.46210706e-...\n", - " {\"token_count\":536,\"truncated\":false}\n", + " 1103\n", + " [ 4.18044440e-02 -4.28444222e-02 -6.26875088e-...\n", + " {\"token_count\":31,\"truncated\":false}\n", " \n", - " NOT MY ACCOUNT, NOT AN AUTHORIZED USER {$1800....\n", + " The debt occurred more than 7 years in the pas...\n", " \n", " \n", - " 1997\n", - " [ 0.03145148 -0.01011822 -0.02316323 -0.025078...\n", - " {\"token_count\":123,\"truncated\":false}\n", + " 1241\n", + " [ 7.74183637e-03 -6.50701150e-02 -2.13856809e-...\n", + " {\"token_count\":23,\"truncated\":false}\n", " \n", - " After a while the payments became harder and h...\n", + " UNAUTHORIZED CREDIT REPORTING NO ACCOUNT TO VA...\n", " \n", " \n", - " 2469\n", - " [ 4.74590808e-03 -4.56819348e-02 -2.49751769e-...\n", - " {\"token_count\":60,\"truncated\":false}\n", + " 1729\n", + " [ 3.15782428e-02 -1.23979878e-02 -3.70296165e-...\n", + " {\"token_count\":382,\"truncated\":false}\n", " \n", - " In the course of my student loan, I have been ...\n", + " XXXX on XXXX XX/XX/2021 I have Mr. 
Cooper mort...\n", " \n", " \n", - " 2624\n", - " [ 3.91883589e-03 -3.26644145e-02 -7.10378587e-...\n", - " {\"token_count\":254,\"truncated\":false}\n", + " 2167\n", + " [ 9.87241510e-03 -1.81103535e-02 -4.17162031e-...\n", + " {\"token_count\":556,\"truncated\":false}\n", " \n", - " In accordance with the Fair Credit Reporting A...\n", + " This is the third such complaint I have submit...\n", " \n", " \n", - " 2832\n", - " [ 8.35181400e-03 -2.91643552e-02 -4.30776961e-...\n", - " {\"token_count\":79,\"truncated\":false}\n", + " 2219\n", + " [ 2.56749280e-02 -4.92163002e-02 -5.67202382e-...\n", + " {\"token_count\":196,\"truncated\":false}\n", " \n", - " LVNV FUNDING LLC is continually placing a coll...\n", + " Found and add online for a Prepaid Credit card...\n", " \n", " \n", - " 3328\n", - " [ 2.71253809e-02 -1.77491009e-02 -5.32273464e-...\n", - " {\"token_count\":156,\"truncated\":false}\n", + " 2392\n", + " [ 2.34611966e-02 -4.74611111e-02 -3.59710641e-...\n", + " {\"token_count\":641,\"truncated\":false}\n", " \n", - " On XX/XX/2020 I sent a letter regarding inaccu...\n", + " I am furnishing this complaint against Fed Loa...\n", " \n", " \n", - " 3650\n", - " [-6.10093866e-03 -5.93599863e-02 -8.04531425e-...\n", - " {\"token_count\":175,\"truncated\":false}\n", + " 2528\n", + " [ 1.90760177e-02 -4.90266569e-02 -5.60806654e-...\n", + " {\"token_count\":176,\"truncated\":false}\n", " \n", - " Over a year and a half ago we started the proc...\n", + " Despite multiple written requests, the unverif...\n", " \n", " \n", - " 3860\n", - " [ 5.84836192e-02 -2.43354496e-03 -5.57337068e-...\n", - " {\"token_count\":1267,\"truncated\":false}\n", + " 2737\n", + " [ 1.81887485e-02 -8.74284655e-03 -2.73009986e-...\n", + " {\"token_count\":230,\"truncated\":false}\n", " \n", - " The issue is 26 late payments on me and my wif...\n", + " After unsatisfying communication in the messag...\n", " \n", " \n", - " 4464\n", - " [ 6.05084226e-02 -3.21578234e-02 -7.51668587e-...\n", - " {\"token_count\":906,\"truncated\":false}\n", + " 2859\n", + " [ 3.52482982e-02 -3.30757573e-02 -4.48422395e-...\n", + " {\"token_count\":238,\"truncated\":false}\n", " \n", - " I purchased as replacement for a lost XXXX XXX...\n", + " Good Morning. My name is XXXX XXXX. My account...\n", " \n", " \n", - " 4470\n", - " [ 1.28689921e-02 -3.25881056e-02 -6.53645024e-...\n", - " {\"token_count\":200,\"truncated\":false}\n", + " 3439\n", + " [ 3.40348878e-03 -2.72301212e-02 -2.03482248e-...\n", + " {\"token_count\":197,\"truncated\":false}\n", " \n", - " in accordance with the Fair Credit Reporting a...\n", + " I have ongoing disputes that are preventing me...\n", " \n", " \n", - " 4567\n", - " [-5.49167022e-03 -3.84587422e-02 -8.56091827e-...\n", - " {\"token_count\":110,\"truncated\":false}\n", + " 3738\n", + " [ 0.01422119 -0.01114973 -0.04438976 -0.024421...\n", + " {\"token_count\":160,\"truncated\":false}\n", " \n", - " I have submitted multiple disputes through the...\n", + " I had a loan with national Collegiate Trust. 
i...\n", " \n", " \n", - " 4713\n", - " [ 2.68485844e-02 -3.46762352e-02 -4.59849052e-...\n", - " {\"token_count\":549,\"truncated\":false}\n", + " 3805\n", + " [ 1.08179580e-02 -3.44337188e-02 -5.08812033e-...\n", + " {\"token_count\":477,\"truncated\":false}\n", " \n", - " While shopping for furniture for my home I ope...\n", + " Hi I am submitting this XXXX XXXX this isn't a...\n", " \n", " \n", - " 5181\n", - " [ 2.05754172e-02 -3.83999050e-02 -9.29225236e-...\n", - " {\"token_count\":77,\"truncated\":false}\n", + " 3915\n", + " [-7.23852217e-03 -4.69538383e-02 -5.60489520e-...\n", + " {\"token_count\":116,\"truncated\":false}\n", " \n", - " I had opened a Wells Fargo checking account wi...\n", + " portfolio is showin on my credit report with a...\n", " \n", " \n", - " 5511\n", - " [-0.00217485 -0.04031368 -0.06604777 -0.052006...\n", - " {\"token_count\":262,\"truncated\":false}\n", + " 3917\n", + " [-8.92711710e-03 -4.49132621e-02 -4.29662578e-...\n", + " {\"token_count\":71,\"truncated\":false}\n", " \n", - " I recently disputed ( see attached letter ) wi...\n", + " the company shared my information with another...\n", " \n", " \n", - " 5888\n", - " [-8.15972779e-03 -3.46563384e-02 -5.91776446e-...\n", - " {\"token_count\":176,\"truncated\":false}\n", + " 4281\n", + " [-1.69487391e-02 -1.89835522e-02 -3.80971469e-...\n", + " {\"token_count\":130,\"truncated\":false}\n", " \n", - " XXXX XXXX XXXX XXXX \n", - "I have disputed this acco...\n", + " I tried to submit a teacher loan forgiveness a...\n", " \n", " \n", - " 6299\n", - " [ 4.80043218e-02 -4.13420722e-02 -6.12363108e-...\n", - " {\"token_count\":151,\"truncated\":false}\n", + " 4470\n", + " [ 1.28689921e-02 -3.25881056e-02 -6.53645024e-...\n", + " {\"token_count\":200,\"truncated\":false}\n", " \n", - " XXXX ; XXXX and Transunion are reporting ( 30 ...\n", + " in accordance with the Fair Credit Reporting a...\n", " \n", " \n", - " 7143\n", - " [ 4.39200476e-02 -3.04005221e-02 -3.47866341e-...\n", - " {\"token_count\":234,\"truncated\":false}\n", + " 4915\n", + " [ 5.19403480e-02 -7.32436478e-02 -4.60561663e-...\n", + " {\"token_count\":23,\"truncated\":false}\n", " \n", - " My Macys account is due on the first of every ...\n", + " XXXX XXXX did not give me a receipt or a copy ...\n", " \n", " \n", - " 7219\n", - " [ 1.00224940e-02 -3.79302073e-03 -3.41785327e-...\n", - " {\"token_count\":26,\"truncated\":false}\n", + " 4928\n", + " [-4.43694415e-03 -3.66776163e-04 -9.08496231e-...\n", + " {\"token_count\":83,\"truncated\":false}\n", " \n", - " Keep getting letters and calls from collection...\n", + " This company has filed a civil suit during a g...\n", " \n", " \n", - " 7574\n", - " [-0.00149564 -0.06619431 -0.05084481 -0.048579...\n", - " {\"token_count\":129,\"truncated\":false}\n", + " 5338\n", + " [ 2.19908613e-03 -3.93951498e-02 -6.52823672e-...\n", + " {\"token_count\":1279,\"truncated\":false}\n", " \n", - " On XXXX I was on the XXXX app and there was a ...\n", + " My credit report contains errors that is keepi...\n", " \n", " \n", - " 8759\n", - " [ 0.01501553 -0.03575936 -0.050562 -0.034884...\n", - " {\"token_count\":501,\"truncated\":false}\n", + " 5582\n", + " [ 2.86326781e-02 -4.89189997e-02 -8.68150592e-...\n", + " {\"token_count\":396,\"truncated\":false}\n", " \n", - " Obviously I've been a victim of fraud, therefo...\n", + " Coast Professional, XXXX, LA contacted me by m...\n", " \n", " \n", - " 9700\n", - " [ 1.01501048e-02 -2.80565154e-02 -4.05892394e-...\n", - " {\"token_count\":48,\"truncated\":false}\n", 
+ " 6386\n", + " [ 3.33276950e-02 1.53224478e-02 -1.89354066e-...\n", + " {\"token_count\":79,\"truncated\":false}\n", " \n", - " The following item have not been properly inve...\n", + " Cares act refund requested in XXXX, called mul...\n", " \n", " \n", - " 9822\n", - " [ 2.95880195e-02 1.65440738e-02 -3.33247967e-...\n", - " {\"token_count\":2373,\"truncated\":true}\n", + " 6956\n", + " [ 1.47060463e-02 -3.36431377e-02 -6.56675845e-...\n", + " {\"token_count\":194,\"truncated\":false}\n", " \n", - " During the housing market crash I went through...\n", + " n accordance with the Fair Credit Reporting ac...\n", " \n", " \n", "\n", @@ -1043,87 +1064,86 @@ "[10000 rows x 4 columns in total]" ], "text/plain": [ - " ml_generate_embedding_result \\\n", - "357 [ 1.33585772e-02 -3.76597494e-02 -6.14452176e-... \n", - "428 [ 3.10036819e-04 -3.82593311e-02 -3.41922641e-... \n", - "1319 [ 1.97481886e-02 -1.97448786e-02 -5.13443351e-... \n", - "1993 [ 9.83821880e-03 -6.55664057e-02 -5.46210706e-... \n", - "1997 [ 0.03145148 -0.01011822 -0.02316323 -0.025078... \n", - "2469 [ 4.74590808e-03 -4.56819348e-02 -2.49751769e-... \n", - "2624 [ 3.91883589e-03 -3.26644145e-02 -7.10378587e-... \n", - "2832 [ 8.35181400e-03 -2.91643552e-02 -4.30776961e-... \n", - "3328 [ 2.71253809e-02 -1.77491009e-02 -5.32273464e-... \n", - "3650 [-6.10093866e-03 -5.93599863e-02 -8.04531425e-... \n", - "3860 [ 5.84836192e-02 -2.43354496e-03 -5.57337068e-... \n", - "4464 [ 6.05084226e-02 -3.21578234e-02 -7.51668587e-... \n", + " text_embedding \\\n", + "782 [ 2.78223325e-02 -1.71949025e-02 -5.01214415e-... \n", + "795 [ 1.39164589e-02 -5.08313216e-02 -4.53360938e-... \n", + "861 [ 7.80681521e-03 -3.23560014e-02 -6.76454604e-... \n", + "1103 [ 4.18044440e-02 -4.28444222e-02 -6.26875088e-... \n", + "1241 [ 7.74183637e-03 -6.50701150e-02 -2.13856809e-... \n", + "1729 [ 3.15782428e-02 -1.23979878e-02 -3.70296165e-... \n", + "2167 [ 9.87241510e-03 -1.81103535e-02 -4.17162031e-... \n", + "2219 [ 2.56749280e-02 -4.92163002e-02 -5.67202382e-... \n", + "2392 [ 2.34611966e-02 -4.74611111e-02 -3.59710641e-... \n", + "2528 [ 1.90760177e-02 -4.90266569e-02 -5.60806654e-... \n", + "2737 [ 1.81887485e-02 -8.74284655e-03 -2.73009986e-... \n", + "2859 [ 3.52482982e-02 -3.30757573e-02 -4.48422395e-... \n", + "3439 [ 3.40348878e-03 -2.72301212e-02 -2.03482248e-... \n", + "3738 [ 0.01422119 -0.01114973 -0.04438976 -0.024421... \n", + "3805 [ 1.08179580e-02 -3.44337188e-02 -5.08812033e-... \n", + "3915 [-7.23852217e-03 -4.69538383e-02 -5.60489520e-... \n", + "3917 [-8.92711710e-03 -4.49132621e-02 -4.29662578e-... \n", + "4281 [-1.69487391e-02 -1.89835522e-02 -3.80971469e-... \n", "4470 [ 1.28689921e-02 -3.25881056e-02 -6.53645024e-... \n", - "4567 [-5.49167022e-03 -3.84587422e-02 -8.56091827e-... \n", - "4713 [ 2.68485844e-02 -3.46762352e-02 -4.59849052e-... \n", - "5181 [ 2.05754172e-02 -3.83999050e-02 -9.29225236e-... \n", - "5511 [-0.00217485 -0.04031368 -0.06604777 -0.052006... \n", - "5888 [-8.15972779e-03 -3.46563384e-02 -5.91776446e-... \n", - "6299 [ 4.80043218e-02 -4.13420722e-02 -6.12363108e-... \n", - "7143 [ 4.39200476e-02 -3.04005221e-02 -3.47866341e-... \n", - "7219 [ 1.00224940e-02 -3.79302073e-03 -3.41785327e-... \n", - "7574 [-0.00149564 -0.06619431 -0.05084481 -0.048579... \n", - "8759 [ 0.01501553 -0.03575936 -0.050562 -0.034884... \n", - "9700 [ 1.01501048e-02 -2.80565154e-02 -4.05892394e-... \n", - "9822 [ 2.95880195e-02 1.65440738e-02 -3.33247967e-... \n", + "4915 [ 5.19403480e-02 -7.32436478e-02 -4.60561663e-... 
\n", + "4928 [-4.43694415e-03 -3.66776163e-04 -9.08496231e-... \n", + "5338 [ 2.19908613e-03 -3.93951498e-02 -6.52823672e-... \n", + "5582 [ 2.86326781e-02 -4.89189997e-02 -8.68150592e-... \n", + "6386 [ 3.33276950e-02 1.53224478e-02 -1.89354066e-... \n", + "6956 [ 1.47060463e-02 -3.36431377e-02 -6.56675845e-... \n", "\n", - " ml_generate_embedding_statistics ml_generate_embedding_status \\\n", - "357 {\"token_count\":306,\"truncated\":false} \n", - "428 {\"token_count\":134,\"truncated\":false} \n", - "1319 {\"token_count\":215,\"truncated\":false} \n", - "1993 {\"token_count\":536,\"truncated\":false} \n", - "1997 {\"token_count\":123,\"truncated\":false} \n", - "2469 {\"token_count\":60,\"truncated\":false} \n", - "2624 {\"token_count\":254,\"truncated\":false} \n", - "2832 {\"token_count\":79,\"truncated\":false} \n", - "3328 {\"token_count\":156,\"truncated\":false} \n", - "3650 {\"token_count\":175,\"truncated\":false} \n", - "3860 {\"token_count\":1267,\"truncated\":false} \n", - "4464 {\"token_count\":906,\"truncated\":false} \n", - "4470 {\"token_count\":200,\"truncated\":false} \n", - "4567 {\"token_count\":110,\"truncated\":false} \n", - "4713 {\"token_count\":549,\"truncated\":false} \n", - "5181 {\"token_count\":77,\"truncated\":false} \n", - "5511 {\"token_count\":262,\"truncated\":false} \n", - "5888 {\"token_count\":176,\"truncated\":false} \n", - "6299 {\"token_count\":151,\"truncated\":false} \n", - "7143 {\"token_count\":234,\"truncated\":false} \n", - "7219 {\"token_count\":26,\"truncated\":false} \n", - "7574 {\"token_count\":129,\"truncated\":false} \n", - "8759 {\"token_count\":501,\"truncated\":false} \n", - "9700 {\"token_count\":48,\"truncated\":false} \n", - "9822 {\"token_count\":2373,\"truncated\":true} \n", + " statistics ml_embed_text_status \\\n", + "782 {\"token_count\":121,\"truncated\":false} \n", + "795 {\"token_count\":141,\"truncated\":false} \n", + "861 {\"token_count\":160,\"truncated\":false} \n", + "1103 {\"token_count\":31,\"truncated\":false} \n", + "1241 {\"token_count\":23,\"truncated\":false} \n", + "1729 {\"token_count\":382,\"truncated\":false} \n", + "2167 {\"token_count\":556,\"truncated\":false} \n", + "2219 {\"token_count\":196,\"truncated\":false} \n", + "2392 {\"token_count\":641,\"truncated\":false} \n", + "2528 {\"token_count\":176,\"truncated\":false} \n", + "2737 {\"token_count\":230,\"truncated\":false} \n", + "2859 {\"token_count\":238,\"truncated\":false} \n", + "3439 {\"token_count\":197,\"truncated\":false} \n", + "3738 {\"token_count\":160,\"truncated\":false} \n", + "3805 {\"token_count\":477,\"truncated\":false} \n", + "3915 {\"token_count\":116,\"truncated\":false} \n", + "3917 {\"token_count\":71,\"truncated\":false} \n", + "4281 {\"token_count\":130,\"truncated\":false} \n", + "4470 {\"token_count\":200,\"truncated\":false} \n", + "4915 {\"token_count\":23,\"truncated\":false} \n", + "4928 {\"token_count\":83,\"truncated\":false} \n", + "5338 {\"token_count\":1279,\"truncated\":false} \n", + "5582 {\"token_count\":396,\"truncated\":false} \n", + "6386 {\"token_count\":79,\"truncated\":false} \n", + "6956 {\"token_count\":194,\"truncated\":false} \n", "\n", " content \n", - "357 I decided to try XXXX services for my wife and... \n", - "428 XXXX I went to the bank in question ( XXXX XXX... \n", - "1319 I currently have a home loan with my ex husban... \n", - "1993 NOT MY ACCOUNT, NOT AN AUTHORIZED USER {$1800.... \n", - "1997 After a while the payments became harder and h... 
\n", - "2469 In the course of my student loan, I have been ... \n", - "2624 In accordance with the Fair Credit Reporting A... \n", - "2832 LVNV FUNDING LLC is continually placing a coll... \n", - "3328 On XX/XX/2020 I sent a letter regarding inaccu... \n", - "3650 Over a year and a half ago we started the proc... \n", - "3860 The issue is 26 late payments on me and my wif... \n", - "4464 I purchased as replacement for a lost XXXX XXX... \n", + "782 I 've sent multiple letters to this agency abo... \n", + "795 I receive social security XXXX funds in my XXX... \n", + "861 Hello, My name is XXXX XXXX XXXX. I have a pro... \n", + "1103 The debt occurred more than 7 years in the pas... \n", + "1241 UNAUTHORIZED CREDIT REPORTING NO ACCOUNT TO VA... \n", + "1729 XXXX on XXXX XX/XX/2021 I have Mr. Cooper mort... \n", + "2167 This is the third such complaint I have submit... \n", + "2219 Found and add online for a Prepaid Credit card... \n", + "2392 I am furnishing this complaint against Fed Loa... \n", + "2528 Despite multiple written requests, the unverif... \n", + "2737 After unsatisfying communication in the messag... \n", + "2859 Good Morning. My name is XXXX XXXX. My account... \n", + "3439 I have ongoing disputes that are preventing me... \n", + "3738 I had a loan with national Collegiate Trust. i... \n", + "3805 Hi I am submitting this XXXX XXXX this isn't a... \n", + "3915 portfolio is showin on my credit report with a... \n", + "3917 the company shared my information with another... \n", + "4281 I tried to submit a teacher loan forgiveness a... \n", "4470 in accordance with the Fair Credit Reporting a... \n", - "4567 I have submitted multiple disputes through the... \n", - "4713 While shopping for furniture for my home I ope... \n", - "5181 I had opened a Wells Fargo checking account wi... \n", - "5511 I recently disputed ( see attached letter ) wi... \n", - "5888 XXXX XXXX XXXX XXXX \n", - "I have disputed this acco... \n", - "6299 XXXX ; XXXX and Transunion are reporting ( 30 ... \n", - "7143 My Macys account is due on the first of every ... \n", - "7219 Keep getting letters and calls from collection... \n", - "7574 On XXXX I was on the XXXX app and there was a ... \n", - "8759 Obviously I've been a victim of fraud, therefo... \n", - "9700 The following item have not been properly inve... \n", - "9822 During the housing market crash I went through... \n", + "4915 XXXX XXXX did not give me a receipt or a copy ... \n", + "4928 This company has filed a civil suit during a g... \n", + "5338 My credit report contains errors that is keepi... \n", + "5582 Coast Professional, XXXX, LA contacted me by m... \n", + "6386 Cares act refund requested in XXXX, called mul... \n", + "6956 n accordance with the Fair Credit Reporting ac... \n", "...\n", "\n", "[10000 rows x 4 columns]" @@ -1136,10 +1156,10 @@ ], "source": [ "successful_rows = (\n", - " (predicted_embeddings[\"ml_generate_embedding_status\"] == \"\")\n", + " (predicted_embeddings[\"ml_embed_text_status\"] == \"\")\n", " # Series.str.len() gives the length of an array.\n", " # See: https://stackoverflow.com/a/41340543/101923\n", - " & (predicted_embeddings[\"ml_generate_embedding_result\"].str.len() != 0)\n", + " & (predicted_embeddings[\"text_embedding\"].str.len() != 0)\n", ")\n", "predicted_embeddings = predicted_embeddings[successful_rows]\n", "predicted_embeddings\n" @@ -1194,7 +1214,7 @@ { "data": { "text/html": [ - "Query job 18aa46ee-0b10-4912-ae14-87b7e81ee447 is DONE. 61.7 MB processed. 
Open Job" + "Query job 46da96c8-c454-44d3-8b98-0e1bfeca69dd is DONE. 61.7 MB processed. Open Job" ], "text/plain": [ "" @@ -1206,7 +1226,7 @@ { "data": { "text/html": [ - "Query job fd573f97-2424-472a-969d-463f184967d9 is DONE. 0 Bytes processed. Open Job" + "Query job dc6fe7cf-329d-4274-aff9-0b8dc2e56230 is DONE. 0 Bytes processed. Open Job" ], "text/plain": [ "" @@ -1218,7 +1238,7 @@ { "data": { "text/html": [ - "Query job 9f2e0a3f-d7d6-4fb8-b558-95f39235410d is DONE. 72.7 MB processed. Open Job" + "Query job 8c25a14a-af39-40a9-add5-de0f14bce9ce is DONE. 72.4 MB processed. Open Job" ], "text/plain": [ "" @@ -1230,7 +1250,7 @@ { "data": { "text/html": [ - "Query job 786ababe-7c40-426f-bb39-154329e4c51a is DONE. 80.0 kB processed. Open Job" + "Query job 0a6a45b2-7c35-4be8-91a3-391a5381553e is DONE. 80.0 kB processed. Open Job" ], "text/plain": [ "" @@ -1242,7 +1262,7 @@ { "data": { "text/html": [ - "Query job a191fc97-baa6-4c7c-b78f-4365678caa60 is DONE. 73.2 MB processed. Open Job" + "Query job b5e00edd-de21-40c1-bf61-9f1affdea318 is DONE. 73.1 MB processed. Open Job" ], "text/plain": [ "" @@ -1274,57 +1294,57 @@ " \n", " CENTROID_ID\n", " NEAREST_CENTROIDS_DISTANCE\n", - " ml_generate_embedding_result\n", - " ml_generate_embedding_statistics\n", - " ml_generate_embedding_status\n", + " text_embedding\n", + " statistics\n", + " ml_embed_text_status\n", " content\n", " \n", " \n", " \n", " \n", - " 1244571\n", + " 1094645\n", " 1\n", - " [{'CENTROID_ID': 1, 'DISTANCE': 0.414497263076...\n", - " [ 1.10590272e-02 -2.11433582e-02 -5.66212423e-...\n", - " {\"token_count\":100,\"truncated\":false}\n", + " [{'CENTROID_ID': 1, 'DISTANCE': 0.530282685572...\n", + " [ 7.32792774e-03 -7.59598315e-02 -4.49591577e-...\n", + " {\"token_count\":10,\"truncated\":false}\n", " \n", - " Ive disputed two Bankruptcies that still exist...\n", + " I do not have an account with this creditor\n", " \n", " \n", - " 744390\n", + " 3372485\n", " 1\n", - " [{'CENTROID_ID': 1, 'DISTANCE': 0.416584344032...\n", - " [ 4.15011719e-02 -4.50705849e-02 -7.35541508e-...\n", - " {\"token_count\":100,\"truncated\":false}\n", + " [{'CENTROID_ID': 1, 'DISTANCE': 0.643931578310...\n", + " [-0.00161087 -0.04956109 -0.07371692 -0.057822...\n", + " {\"token_count\":10,\"truncated\":false}\n", " \n", - " The XXXX account was settled as a class action...\n", + " Hard inquiries in my report that I do not reco...\n", " \n", " \n", - " 127514\n", + " 2669308\n", " 1\n", - " [{'CENTROID_ID': 1, 'DISTANCE': 0.541137734253...\n", - " [ 3.54415141e-02 1.23769706e-02 -2.61783414e-...\n", + " [{'CENTROID_ID': 1, 'DISTANCE': 0.599709344244...\n", + " [ 5.50241247e-02 -1.50039541e-02 -2.08624080e-...\n", " {\"token_count\":100,\"truncated\":false}\n", " \n", - " I have late payments reported on my student lo...\n", + " I purchase {$25.00} for stock on the cash app ...\n", " \n", " \n", - " 630563\n", + " 133816\n", " 1\n", - " [{'CENTROID_ID': 1, 'DISTANCE': 0.477175150810...\n", - " [ 2.34235693e-02 -4.21241224e-02 -3.90484147e-...\n", + " [{'CENTROID_ID': 1, 'DISTANCE': 0.618444281124...\n", + " [ 1.77251529e-02 -3.89547497e-02 -3.82236368e-...\n", " {\"token_count\":100,\"truncated\":false}\n", " \n", - " A Military Star Credit card, aka Take it Home ...\n", + " BBVA fees I am in The Texas snow storm where I...\n", " \n", " \n", - " 2651231\n", + " 2697156\n", " 1\n", - " [{'CENTROID_ID': 1, 'DISTANCE': 0.489760037964...\n", - " [ 2.64898203e-02 -5.62610961e-02 -5.82714193e-...\n", - " {\"token_count\":101,\"truncated\":false}\n", + " 
[{'CENTROID_ID': 1, 'DISTANCE': 0.500398902102...\n", + " [-1.28429877e-02 -1.85956229e-02 -3.93197313e-...\n", + " {\"token_count\":1011,\"truncated\":false}\n", " \n", - " My mortgage is with Bank of America. I filed C...\n", + " After paying on my student loan for years, I o...\n", " \n", " \n", "\n", @@ -1332,32 +1352,32 @@ ], "text/plain": [ " CENTROID_ID NEAREST_CENTROIDS_DISTANCE \\\n", - "1244571 1 [{'CENTROID_ID': 1, 'DISTANCE': 0.414497263076... \n", - "744390 1 [{'CENTROID_ID': 1, 'DISTANCE': 0.416584344032... \n", - "127514 1 [{'CENTROID_ID': 1, 'DISTANCE': 0.541137734253... \n", - "630563 1 [{'CENTROID_ID': 1, 'DISTANCE': 0.477175150810... \n", - "2651231 1 [{'CENTROID_ID': 1, 'DISTANCE': 0.489760037964... \n", + "1094645 1 [{'CENTROID_ID': 1, 'DISTANCE': 0.530282685572... \n", + "3372485 1 [{'CENTROID_ID': 1, 'DISTANCE': 0.643931578310... \n", + "2669308 1 [{'CENTROID_ID': 1, 'DISTANCE': 0.599709344244... \n", + "133816 1 [{'CENTROID_ID': 1, 'DISTANCE': 0.618444281124... \n", + "2697156 1 [{'CENTROID_ID': 1, 'DISTANCE': 0.500398902102... \n", "\n", - " ml_generate_embedding_result \\\n", - "1244571 [ 1.10590272e-02 -2.11433582e-02 -5.66212423e-... \n", - "744390 [ 4.15011719e-02 -4.50705849e-02 -7.35541508e-... \n", - "127514 [ 3.54415141e-02 1.23769706e-02 -2.61783414e-... \n", - "630563 [ 2.34235693e-02 -4.21241224e-02 -3.90484147e-... \n", - "2651231 [ 2.64898203e-02 -5.62610961e-02 -5.82714193e-... \n", + " text_embedding \\\n", + "1094645 [ 7.32792774e-03 -7.59598315e-02 -4.49591577e-... \n", + "3372485 [-0.00161087 -0.04956109 -0.07371692 -0.057822... \n", + "2669308 [ 5.50241247e-02 -1.50039541e-02 -2.08624080e-... \n", + "133816 [ 1.77251529e-02 -3.89547497e-02 -3.82236368e-... \n", + "2697156 [-1.28429877e-02 -1.85956229e-02 -3.93197313e-... \n", "\n", - " ml_generate_embedding_statistics ml_generate_embedding_status \\\n", - "1244571 {\"token_count\":100,\"truncated\":false} \n", - "744390 {\"token_count\":100,\"truncated\":false} \n", - "127514 {\"token_count\":100,\"truncated\":false} \n", - "630563 {\"token_count\":100,\"truncated\":false} \n", - "2651231 {\"token_count\":101,\"truncated\":false} \n", + " statistics ml_embed_text_status \\\n", + "1094645 {\"token_count\":10,\"truncated\":false} \n", + "3372485 {\"token_count\":10,\"truncated\":false} \n", + "2669308 {\"token_count\":100,\"truncated\":false} \n", + "133816 {\"token_count\":100,\"truncated\":false} \n", + "2697156 {\"token_count\":1011,\"truncated\":false} \n", "\n", " content \n", - "1244571 Ive disputed two Bankruptcies that still exist... \n", - "744390 The XXXX account was settled as a class action... \n", - "127514 I have late payments reported on my student lo... \n", - "630563 A Military Star Credit card, aka Take it Home ... \n", - "2651231 My mortgage is with Bank of America. I filed C... " + "1094645 I do not have an account with this creditor \n", + "3372485 Hard inquiries in my report that I do not reco... \n", + "2669308 I purchase {$25.00} for stock on the cash app ... \n", + "133816 BBVA fees I am in The Texas snow storm where I... \n", + "2697156 After paying on my student loan for years, I o... " ] }, "execution_count": 13, @@ -1367,7 +1387,7 @@ ], "source": [ "# Use KMeans clustering to calculate our groups. 
Will take ~3 minutes.\n", - "cluster_model.fit(predicted_embeddings[[\"ml_generate_embedding_result\"]])\n", + "cluster_model.fit(predicted_embeddings[[\"text_embedding\"]])\n", "clustered_result = cluster_model.predict(predicted_embeddings)\n", "# Notice the CENTROID_ID column, which is the ID number of the group that\n", "# each complaint belongs to.\n", @@ -1410,7 +1430,7 @@ { "data": { "text/html": [ - "Query job 8bfc647f-b9e5-40a2-816c-d12e8f81bea3 is DONE. 10.6 MB processed. Open Job" + "Query job 8d4f24d6-dc37-47d3-8b4d-4505a55c4ccc is DONE. 10.4 MB processed. Open Job" ], "text/plain": [ "" @@ -1422,7 +1442,7 @@ { "data": { "text/html": [ - "Query job 6f834214-9cc3-4577-bb2d-980ba05df817 is DONE. 10.6 MB processed. Open Job" + "Query job c1f979ee-1f5d-4f37-8595-ee2167c06e63 is DONE. 10.4 MB processed. Open Job" ], "text/plain": [ "" @@ -1458,62 +1478,32 @@ "output_type": "stream", "text": [ "comment list 1:\n", - "1. I currently have a home loan with my ex husband with PHH Mortgages. We filed for divorce and in the divorce decree he became liable for the home and paying the payments. He ended up missing XXXX payments which effected my credit fairly substaintailly. when I became aware of the late payments, I ensured that the account was up to date and have since. I presented to them that I have the legal documents that he is obligated to make the payments each month and that I am not responisble for the payment. I asked them to remove the XXXX dings on my credit and they would not. I offered to present the paperwork and they still would not. The home is now being sold. I even filed with XXXX as a discrepency and they would not remove it. I would have never let these become a late payment. I was not even notified as they had all of his information in the file.\n", - "2. In the course of my student loan, I have been making payments and I feel that the payments haven't been added to the debt, the company stated that I am delinquent over 180 and my payments are auto pay. This has had a negative impact on my credit score.\n", - "3. The issue is 26 late payments on me and my wife 's credit reports due to a system error on a joint mortgage account that was always paid on time using autopay. ( will attach docs to support this ). \n", - "\n", - "This is an ongoing nightmare me and my wife are going through over the past 3 years. \n", - "Sent many dispute letters to the creditor and to the 3 bureaus, was promised multiple times that all late payments will be removed, we also has a letter from the bank stating we were never late on this account, also have a recording of a phone call with bank 's permission were the representative admits there was a system error and promised again that all late payments will be deleted from both of our credit reports. \n", - "As of today, for an unknown reason XXXX reports 6x30 days late payments, XXXX reports 24 lates, and Transunion 23 lates. \n", - "\n", - "We have always paid our mortgage on time for many years, enrolled in autopay and making 2 payments per month. Our mortgage is currently with XXXX XXXX, XXXX XXXX is the mortgage servicer who's collecting from us and disbursing payments to XXXX XXXX. 
\n", - "\n", - "I will attach here our mortgage transaction history confirming payments have been made on time, letter from the mortgage servicer XXXX XXXX XXXX XXXX confirming we were never late, copy of a page from my credit report showing all the late payments, and a few bank statements showing payment made on time while showing as late on our credit reports. ( XXXX & XXXX XXXX ) PLEASE HELP us to resolve this issue and have all late payments on this account removed from XXXX & XXXX XXXX. \n", - "\n", - "Her is a small portion of our previous communication with XXXX and their response : XX/XX/XXXX : we spoke with XXXX, XXXX stated that the funds were misappropriated and went to the wrong account. Said he'll contact their Tax Dep ' and get back to me and never did. \n", + "1. This is the third such complaint I have submitted regarding the same type of issue over the past 12-18 months. \n", "\n", - "XX/XX/XXXX : I spoke in length with XXXX from XXXX XXXX & XXXX from XXXX at XXXX who opened an investigation. Their supervisor said he made all the necessary changes. The next month, the money was withdrawn from my account on time and i received again a late fee and 30 days late on my credit report. \n", - "\n", - "XX/XX/XXXX Spoke to XXXX who sent me to XXXX who sent me to XXXX XXXX from Escalation department, she promised the issue was fixed and late payments will be removed in up to 30 days and she will email me a deletion letter. Nothing was sent! and i called and wrote 5 emails to her and never got a response. \n", - "\n", - "XX/XX/XXXX spoke with XXXX to follow up with XXXX XXXX, no response. \n", - "\n", - "XX/XX/XXXX spoke with XXXX at XXXX, she said that the transaction history is our proof that issue was corrected and she'll submit a request to delete the late payments as the system does not show that previous request was made. \n", - "\n", - "XX/XX/XXXX XXXX sent a letter stating that we had a shortage of {$5300.00}. Again, upon checking my bank account all monthly payments were made on time. To avoid further issues i sent a check for {$5300.00} on XX/XX/XXXX. \n", - "\n", - "XX/XX/XXXX following many joint calls with XXXX XXXX i received a letter from XXXX stating that my credit might have been affected due to processing error and that they sent XXXX XXXX a letter requesting a removal of all late payments. \n", - "\n", - "XX/XX/XXXX spoke with XXXX XXXX again.. \n", - "XX/XX/XXXX spoke with XXXX at XXXX I have many more... \n", - "\n", - "*** XX/XX/XXXX : SPOKE WITH XXXX XXXX ( resolution team ) at XXXX. She said they will delete the late payments from both reports ( XXXX & XXXX XXXX and will call me to follow up on XX/XX/XXXX. She also emailed me the payment activity on the account. XXXX I received the payment history but NO CALL OR RESOLUTION. \n", - "\n", - "XX/XX/XXXX : SPOKE WITH XXXX FROM XXXX AND XXXX FROM XXXX XXXX ON A XXXX WAY CALL at XXXX, XXXX PERMISSION TO RECORD THE CALL, SHE AGREED, we went over all the late payments, she said she sees the error and promised that this time it will be resolved and get deleted from our credit reports. Again, nothing was resolved and we never heard back from anyone.\n", - "4. XXXX ; XXXX and Transunion are reporting ( 30 ) plus days late on the XXXX XXXX partial account number XXXX. ( Please see page 3 of the attached credit report. ) This account was paid in XXXX, 2019 and the lates are reporting in XXXX, 2019. Please keep in mind that it is impossible to have late payments on an account that was paid off a month prior. 
This incorrect reporting is harming my credit score and this line item need to be removed from my credit report. I have contacted the ( 3 ) bureaus to fix this, however I have been unsuccessful.\n", - "5. My Macys account is due on the first of every month. Since I have had the card I have paid on the XXXX PRIOR to the due date. And have paid over the amount due. In XXXX my XXXX XXXX auto pay did not come out of my account and rather than calling me - on the XXXX of XXXX just 5 days late they cut my credit off and shut me out of my account so I can not even see my credit profile - I have made the payment and they still are locking me out - please look into this - you will see that is what happened and they are stating in a letter it is becasue my XXXX report shows a seriuos derogorty item which it does not and I have submitted a complaint with them as well. Macys has been the worst credit experience of my LIFE and I did read the reviews but thought it would be different for me I guess? \n", - "thank you for your help.\n", + "On XX/XX/XXXX, my co-signed account was flagged by Navient as past due. The XXXX payment was mailed priority on XX/XX/XXXX and received by Navient on XX/XX/XXXX and delivered to \" an individual '' per the post office. \n", + "I called Navient on XX/XX/XXXX to talk to them about why my account was flagged since they received the payment long before the due date. The payment is sent via XXXX money orders under the same cover. The XXXX money order ( {$160.00} ) was cashed on XX/XX/XXXX per XXXX XXXX, the second money order ( {$250.00} ) which was sent in the same priority envelope and received the same time has not been cashed. \n", + "When I called the customer service agent at Navient she told me that my account was past due and wanted me to send another payment. When I explained that they had received the payment she argued with me that if they received it, the payment would have been cashed. I asked to speak with a supervisor. \n", + "I was connected with supervisor, XXXX XXXX, who asked that I send copies of the payments to him so he could submit for a missing payment request. I faxed the proof on XXXX @ XXXX with a receipt acknowledgment. \n", + "On XX/XX/XXXX, the payment was still not applied to the account. When I called XXXX XXXX, the money order was still not cashed. I called Navient again. Because of an argumentative customer service rep again, I requested to speak with a supervisor. I spoke with XXXX XXXX. She states that payment was not received. I explained the situation again. She said the missing payment request had not been submitted. She had me upload the documents so she could request a missing payment search. I have done everything I have been asked. \n", + "This issues continues to occur. For approximately 6 months at a time, Navient gets this right then there are issues again and again. I have submitted CFPB complaints about this in the past. \n", + "I was told it would take 5-7 business days to be resolved.\n", + "2. I tried to submit a teacher loan forgiveness application and they lost my application. I submitted the application again after talking to XXXX people at Nelnet. Then when I called back to check on the status they told me that I needed to submit another form for a different school that I worked at. I had already called previously and asked if I needed to submit any other papers and they told me \" no ''. Therefore, I have been paying my loan for 5 months and it should be forgiven. 
I am still paying my loan because I have to wait for them to approve the new forgiveness paperwork.\n", + "3. PayPal continues to overcharge their currency rate. It it always inflated in their favor so that they collect the difference.\n", + "4. My government feeds are not coming on to my card and I need the problem fix today\n", + "5. Paypal Credit 's website is confusing and does not accurately reflect all activity. When speaking with representatives like XXXX, it 's confusing to them and they can barely follow along with it. I am not receiving statements, which proves it difficult to determine the due dates on the accounts. The Reps are n't knowledgeable and the only thing they repeat to you is the amount due on the screen.\n", "\n", "comment list 2:\n", - "1. I decided to try XXXX services for my wife and I so I purchased phones for both of us. After a day or two of trial we felt unsatisfied so we headed back to the store and we returned all items. We got charged with restocking fees and taxes. Later on I got a bill in the mail in which I was being charged {$1200.00} for the returned items. After several attempts of arguing for about XXXX months about whether I owed XXXX or not I was dismissed of such charges, but a month after I was charged by a collecting company called ERC for {$61.00}. I asked them to explain such charges weather if they were fees or taxes and they we unable to disclose information. Therefore, I asked them to send me a bill in the mail with details about the charges, as well as a dispute package and they told me they would send me a bill. About the dispute part, they said that I needed to call XXXX to discuss the charges with them but XXXX said that I had to discuss this with the collecting company. I never received a detailed statement neither a chanse to defend my self about such charges, I checked my credit score and found a red flag in it because of this. \n", - "\n", - "I am now hoping you may help me with this case. \n", - "\n", - "Thanks :\n", - "2. Over a year and a half ago we started the process of buying a home. Our mortgage guy sent us to a credit repair co. They got the collection account from Weltman , Weinberg & Reis taken off my credit, because it was unverifiable. Now it is back on my credit. I have credit reports showing the trade line on and then off and now today it is currently on my report. When I called to verify the account with WW & R they sent me a heavily redacted letter verifying absolutely nothing. I would like this unverifiable account taken off my credit and removed permanently. This should not be a loan I have to pay for if there is no verification that it is my debt. Attached are the credit reports and the letter of verification that was sent to me.\n", - "3. I recently disputed ( see attached letter ) with Receivable Management Services an account entry that they placed on my credit report without providing a dunning letter or any correspondence that would have allowed me 30 days to dispute the validity of the alleged debt. To date, I have not received any communication from them. They are blatantly violating my rights by reporting this inaccurate, erroneous, unverifiable entry.\n", + "1. XXXX on XXXX XX/XX/2021 I have Mr. Cooper mortgage for years now. On XXXX XXXX XXXX I made an additional payment of $ XXXX towards my principal. More than 4 days - it's not reflected in the Amortization schedule ( Amortization schedule is not even visible ). 
Even after so many additional principal payments, Payoff calculator is way off and it still shows XXXX maturity date while it should start showing something like XX/XX/XXXX/XX/XX/XXXX as the initial date. There are lots of discrepancies on their website to reflect the balance and total. When called customer service on Friday and also chatted - i was assured of fixing this - but no fix till this point of time. Customer service there is a long wait. Auto bot doesn't let customers talk to the real person. \n", + "Finally after a lots of follow-up I got the amortization schedule via email but it is not reflecting another additional principal payment of {$4700.00} made on XX/XX/2021. \n", + "I did numerous chats and phone calls. Why i should depend on inefficinent humans to see my revised amortization schedule? \n", + "Why the online amortization schedule is not visible now? \n", "\n", - "Additionally, this account entry does not reflect a payment history which should be included on any entry that is reflected on my credit report. In my previous communication to them, I specifically requested that they provide an agreement that states their authority to collect on the alleged debt, agreement with signature of the alleged debtor wherein he/she agreed to pay the creditor, alleged account number, date this alleged debt became payable, original delinquency date, and to date to no avail. \n", - "\n", - "As such, since they have refused to respond to my request and not provide any documentation to substantiate their allegations, coupled with the fact that they did not provide me a dunning letter is grounds for this erroneous, inaccurate, unverifiable entry to be deleted from my credit report.\n", - "4. I accepted service from XXXX XXXX XXXX. The company did not inform me that internet was required. They also told me that the agreement was at will without penalty. They never addressed my needs as a customer. My bill is only {$230.00}. They placed false information regarding my bill with a collection agency who has placed information on my credit report without contacting me or giving me an opportunity to dispute the validity of the debt. The debt is not valid. The actions are unlawful and I am requesting that the actions of this collection agency be reported to the Federal Trade Commission.\n", - "5. I have continued to submit an investigation for a Bankruptcy place on my credit report. I have been trying to get this removed because it was place on my credit report in error and inaccurate. ALL THREE CREDIT BUREAUS have continue to ignore the information proving this was place in error and fail to properly investigate the dispute I have place in their office. \n", - "\n", - "1. They say they have verified this dispute with XXXX but I have a letter from XXXX stating this was removed because they were unable to verify the accuracy of the bankruptcy. I received this letter on XX/XX/XXXX. XXXX just finished an investigation on XX/XX/2019 stating the verified this with XXXX. \n", - "2.Experian Open the dispute on XX/XX/2019 and closed it on XX/XX/19 stated they verified with XXXX and the Bankruptcy court and I have a letter From XXXX stating they could not verify the accuracy of this dispute. I also, have a letter from the court house stating they do not verify information with the credit bureaus How could be this be on my XXXX file when XXXX has removed this item. \n", - "3. 
XXXX open and investigation XX/XX/2019 and closed it XX/XX/2019 No way they properly investigation I have submitted all information to dispute the inaccurate information. Please do a proper investigation. \n", - "\n", - "XXXX, Experian, and XXXX please do a proper investigation under 611 of the FCRA thank you very much I have attached the letter proving this this is not on my XXXX consumer report and a letter form the court house stating they do not report information to the credit bureaus from the XXXX XXXX XXXX, Clerk of Court United State Bankruptcy Court on dated XX/XX/2019 I have summited it to the credit bureaus to be ignored. I have as for a description of my investigation by section 611 of the FCRA and the information from the investigation is inaccurate.\n", + "Worst thing, after turning on escrow account, there is no transparency. Amount of escrow account is not in sync with all the sections of my online account. It seems that there are too many internal buggy systems in Mr. Cooper and everybody from different IT department is showing a different $ $ figure at various places. \n", + "Highly inefficient organization.\n", + "2. I had a loan with national Collegiate Trust. i did n't make any payments on the loan for 6 years and due to the \" statute to limitations '' the loan collector should not be contacting me any more, by they still are in both forms phone call 's and letters. I am also trying to buy a house for my family and found out that i can not get a USDA loan for the house because National Collegiate Trust has filled the loan as a government delinquent loan. The problem with that is the loan is a private loan not a federal/Government loan. due to the way they filled the lion on my credit report i am not able to buy my first home.\n", + "3. Cares act refund requested in XXXX, called multiple times given conflicting information every time. Its now XXXX and looking like its going to be even longer potentially months before the refund arrives. Blatant disregard for the cares act signed into law and terrible customer service. This company is ill suited to service the federal government contract it enjoys.\n", + "4. In an attempt to pay off my loan with Ocwen, I sent XXXX large payments to the institution on XXXX XXXX that would have decreased my loan amount to within a couple of hundred dollars ( you can not send XXXX payment that exceeds 80 % of the loan balance so I broke the payments up into XXXX XXXX. I scheduled the payments for the same day because I did not want any interest to accrue. After a few days, I noticed where the XXXX payments were withdrawn from my bank but that my Ocwen balance had not changed to reflect my XXXX payments. I called Ocwen on XXXX XXXX to ask about the payment. The Ocwen rep explained that because the XXXX payments were scheduled for the same day, that it exceeded the 80 % max limit. I asked Ocwen to return my XXXX payments. The rep said it would take XXXX business days. I called Ocwen back on XXXX XXXX when my funds had not been returned to my bank account. I gave them my account number and routing number to my bank, information that I am sure they already had since my monthly mortgage payments are debited from my account. They asked me to wait a couple more days. I called Ocwen back on XXXX XXXX. The rep asked me to be patient and assured me that the funds would be returned by XXXX XXXX. There were no funds returned to my account on XXXX XXXX. I called Ocwen again. 
I was asked to wait 40+ minutes on hold while the Ocwen rep put me in touch with an escalation specialist ( ES ). The ES told me that my funds would be reurned within XXXX hours and that he had put it in as a priority because I had called so often. There were no funds on XXXX XXXX. I called Ocwen again to see if there was a problem. There was. After speaking to a rep and another ES, I was told that my funds could not be returned to me! The ES said that he did not see my funds! He claimed to put me in touch with someone who XXXX be able to address my concerns. So that 's where I am now, on hold waiting to speak to yet another person! This is a significant amount of money and I fear that Ocwen is trying to get away with keeping my XXXX payments!\n", + "5. In XX/XX/XXXX we received our first bill from XXXX XXXX for XXXX. ( attached ) We promptly paid the bill. Again, in XX/XX/XXXX we paid our second bill for XXXX. Again, both on time. Then when XX/XX/XXXX statement came we were billed XXXX. My husband called XXXX XXXX to find out what the issue was. We were told there was a loan shortfall caused by us paying XXXX ( the amount we were billed ) and that our loan was re-amoritized. I argued I had a fixed rate loan, had never missed a payment, had never made a late payment, and paid exactly what was billed. Well, after double checking my promissory note ( attached ) and TILA ( attached ) I was to always be billed XXXX. XXXX changed the monthly payment and thus caused a shortfall. When I told them this information they refused to correct the shortfall. Not only did they not correct the mistake they refused to return my calls or emails to provide answers for these issues. Around 90 days later and this issue still exists and they still refuse to answer. Additionally I offered to make up the shortfall myself by offering a check for the missing amount and they just applied it to interest. Thus the shortfall still exists. The extra amount would have gone directly to principal. Additionally, in XX/XX/XXXX we made an extra payment amount on top of the monthly payment. This was made all at the same time. The letter we sent contained directions to only apply extra payments beyond the monthly requirement be applied to principal and not the next months payment. This was ignored. Then XXXX \" a higher up '' as she calls herself lied and told me it went to principal when clearly it did not. We requested this be fixed and it has not been done. No one has offered to fix anything either. A certified letter is attached that I mailed. I also made dozens of calls.\n", "\n" ] } @@ -1547,61 +1537,31 @@ "text": [ "Please highlight the most obvious difference between the two lists of comments:\n", "comment list 1:\n", - "1. I currently have a home loan with my ex husband with PHH Mortgages. We filed for divorce and in the divorce decree he became liable for the home and paying the payments. He ended up missing XXXX payments which effected my credit fairly substaintailly. when I became aware of the late payments, I ensured that the account was up to date and have since. I presented to them that I have the legal documents that he is obligated to make the payments each month and that I am not responisble for the payment. I asked them to remove the XXXX dings on my credit and they would not. I offered to present the paperwork and they still would not. The home is now being sold. I even filed with XXXX as a discrepency and they would not remove it. I would have never let these become a late payment. 
I was not even notified as they had all of his information in the file.\n", - "2. In the course of my student loan, I have been making payments and I feel that the payments haven't been added to the debt, the company stated that I am delinquent over 180 and my payments are auto pay. This has had a negative impact on my credit score.\n", - "3. The issue is 26 late payments on me and my wife 's credit reports due to a system error on a joint mortgage account that was always paid on time using autopay. ( will attach docs to support this ). \n", - "\n", - "This is an ongoing nightmare me and my wife are going through over the past 3 years. \n", - "Sent many dispute letters to the creditor and to the 3 bureaus, was promised multiple times that all late payments will be removed, we also has a letter from the bank stating we were never late on this account, also have a recording of a phone call with bank 's permission were the representative admits there was a system error and promised again that all late payments will be deleted from both of our credit reports. \n", - "As of today, for an unknown reason XXXX reports 6x30 days late payments, XXXX reports 24 lates, and Transunion 23 lates. \n", - "\n", - "We have always paid our mortgage on time for many years, enrolled in autopay and making 2 payments per month. Our mortgage is currently with XXXX XXXX, XXXX XXXX is the mortgage servicer who's collecting from us and disbursing payments to XXXX XXXX. \n", - "\n", - "I will attach here our mortgage transaction history confirming payments have been made on time, letter from the mortgage servicer XXXX XXXX XXXX XXXX confirming we were never late, copy of a page from my credit report showing all the late payments, and a few bank statements showing payment made on time while showing as late on our credit reports. ( XXXX & XXXX XXXX ) PLEASE HELP us to resolve this issue and have all late payments on this account removed from XXXX & XXXX XXXX. \n", - "\n", - "Her is a small portion of our previous communication with XXXX and their response : XX/XX/XXXX : we spoke with XXXX, XXXX stated that the funds were misappropriated and went to the wrong account. Said he'll contact their Tax Dep ' and get back to me and never did. \n", - "\n", - "XX/XX/XXXX : I spoke in length with XXXX from XXXX XXXX & XXXX from XXXX at XXXX who opened an investigation. Their supervisor said he made all the necessary changes. The next month, the money was withdrawn from my account on time and i received again a late fee and 30 days late on my credit report. \n", - "\n", - "XX/XX/XXXX Spoke to XXXX who sent me to XXXX who sent me to XXXX XXXX from Escalation department, she promised the issue was fixed and late payments will be removed in up to 30 days and she will email me a deletion letter. Nothing was sent! and i called and wrote 5 emails to her and never got a response. \n", - "\n", - "XX/XX/XXXX spoke with XXXX to follow up with XXXX XXXX, no response. \n", + "1. This is the third such complaint I have submitted regarding the same type of issue over the past 12-18 months. \n", "\n", - "XX/XX/XXXX spoke with XXXX at XXXX, she said that the transaction history is our proof that issue was corrected and she'll submit a request to delete the late payments as the system does not show that previous request was made. \n", - "\n", - "XX/XX/XXXX XXXX sent a letter stating that we had a shortage of {$5300.00}. Again, upon checking my bank account all monthly payments were made on time. 
To avoid further issues i sent a check for {$5300.00} on XX/XX/XXXX. \n", - "\n", - "XX/XX/XXXX following many joint calls with XXXX XXXX i received a letter from XXXX stating that my credit might have been affected due to processing error and that they sent XXXX XXXX a letter requesting a removal of all late payments. \n", - "\n", - "XX/XX/XXXX spoke with XXXX XXXX again.. \n", - "XX/XX/XXXX spoke with XXXX at XXXX I have many more... \n", - "\n", - "*** XX/XX/XXXX : SPOKE WITH XXXX XXXX ( resolution team ) at XXXX. She said they will delete the late payments from both reports ( XXXX & XXXX XXXX and will call me to follow up on XX/XX/XXXX. She also emailed me the payment activity on the account. XXXX I received the payment history but NO CALL OR RESOLUTION. \n", - "\n", - "XX/XX/XXXX : SPOKE WITH XXXX FROM XXXX AND XXXX FROM XXXX XXXX ON A XXXX WAY CALL at XXXX, XXXX PERMISSION TO RECORD THE CALL, SHE AGREED, we went over all the late payments, she said she sees the error and promised that this time it will be resolved and get deleted from our credit reports. Again, nothing was resolved and we never heard back from anyone.\n", - "4. XXXX ; XXXX and Transunion are reporting ( 30 ) plus days late on the XXXX XXXX partial account number XXXX. ( Please see page 3 of the attached credit report. ) This account was paid in XXXX, 2019 and the lates are reporting in XXXX, 2019. Please keep in mind that it is impossible to have late payments on an account that was paid off a month prior. This incorrect reporting is harming my credit score and this line item need to be removed from my credit report. I have contacted the ( 3 ) bureaus to fix this, however I have been unsuccessful.\n", - "5. My Macys account is due on the first of every month. Since I have had the card I have paid on the XXXX PRIOR to the due date. And have paid over the amount due. In XXXX my XXXX XXXX auto pay did not come out of my account and rather than calling me - on the XXXX of XXXX just 5 days late they cut my credit off and shut me out of my account so I can not even see my credit profile - I have made the payment and they still are locking me out - please look into this - you will see that is what happened and they are stating in a letter it is becasue my XXXX report shows a seriuos derogorty item which it does not and I have submitted a complaint with them as well. Macys has been the worst credit experience of my LIFE and I did read the reviews but thought it would be different for me I guess? \n", - "thank you for your help.\n", + "On XX/XX/XXXX, my co-signed account was flagged by Navient as past due. The XXXX payment was mailed priority on XX/XX/XXXX and received by Navient on XX/XX/XXXX and delivered to \" an individual '' per the post office. \n", + "I called Navient on XX/XX/XXXX to talk to them about why my account was flagged since they received the payment long before the due date. The payment is sent via XXXX money orders under the same cover. The XXXX money order ( {$160.00} ) was cashed on XX/XX/XXXX per XXXX XXXX, the second money order ( {$250.00} ) which was sent in the same priority envelope and received the same time has not been cashed. \n", + "When I called the customer service agent at Navient she told me that my account was past due and wanted me to send another payment. When I explained that they had received the payment she argued with me that if they received it, the payment would have been cashed. I asked to speak with a supervisor. 
\n", + "I was connected with supervisor, XXXX XXXX, who asked that I send copies of the payments to him so he could submit for a missing payment request. I faxed the proof on XXXX @ XXXX with a receipt acknowledgment. \n", + "On XX/XX/XXXX, the payment was still not applied to the account. When I called XXXX XXXX, the money order was still not cashed. I called Navient again. Because of an argumentative customer service rep again, I requested to speak with a supervisor. I spoke with XXXX XXXX. She states that payment was not received. I explained the situation again. She said the missing payment request had not been submitted. She had me upload the documents so she could request a missing payment search. I have done everything I have been asked. \n", + "This issues continues to occur. For approximately 6 months at a time, Navient gets this right then there are issues again and again. I have submitted CFPB complaints about this in the past. \n", + "I was told it would take 5-7 business days to be resolved.\n", + "2. I tried to submit a teacher loan forgiveness application and they lost my application. I submitted the application again after talking to XXXX people at Nelnet. Then when I called back to check on the status they told me that I needed to submit another form for a different school that I worked at. I had already called previously and asked if I needed to submit any other papers and they told me \" no ''. Therefore, I have been paying my loan for 5 months and it should be forgiven. I am still paying my loan because I have to wait for them to approve the new forgiveness paperwork.\n", + "3. PayPal continues to overcharge their currency rate. It it always inflated in their favor so that they collect the difference.\n", + "4. My government feeds are not coming on to my card and I need the problem fix today\n", + "5. Paypal Credit 's website is confusing and does not accurately reflect all activity. When speaking with representatives like XXXX, it 's confusing to them and they can barely follow along with it. I am not receiving statements, which proves it difficult to determine the due dates on the accounts. The Reps are n't knowledgeable and the only thing they repeat to you is the amount due on the screen.\n", "comment list 2:\n", - "1. I decided to try XXXX services for my wife and I so I purchased phones for both of us. After a day or two of trial we felt unsatisfied so we headed back to the store and we returned all items. We got charged with restocking fees and taxes. Later on I got a bill in the mail in which I was being charged {$1200.00} for the returned items. After several attempts of arguing for about XXXX months about whether I owed XXXX or not I was dismissed of such charges, but a month after I was charged by a collecting company called ERC for {$61.00}. I asked them to explain such charges weather if they were fees or taxes and they we unable to disclose information. Therefore, I asked them to send me a bill in the mail with details about the charges, as well as a dispute package and they told me they would send me a bill. About the dispute part, they said that I needed to call XXXX to discuss the charges with them but XXXX said that I had to discuss this with the collecting company. I never received a detailed statement neither a chanse to defend my self about such charges, I checked my credit score and found a red flag in it because of this. \n", - "\n", - "I am now hoping you may help me with this case. \n", - "\n", - "Thanks :\n", - "2. 
Over a year and a half ago we started the process of buying a home. Our mortgage guy sent us to a credit repair co. They got the collection account from Weltman , Weinberg & Reis taken off my credit, because it was unverifiable. Now it is back on my credit. I have credit reports showing the trade line on and then off and now today it is currently on my report. When I called to verify the account with WW & R they sent me a heavily redacted letter verifying absolutely nothing. I would like this unverifiable account taken off my credit and removed permanently. This should not be a loan I have to pay for if there is no verification that it is my debt. Attached are the credit reports and the letter of verification that was sent to me.\n", - "3. I recently disputed ( see attached letter ) with Receivable Management Services an account entry that they placed on my credit report without providing a dunning letter or any correspondence that would have allowed me 30 days to dispute the validity of the alleged debt. To date, I have not received any communication from them. They are blatantly violating my rights by reporting this inaccurate, erroneous, unverifiable entry.\n", - "\n", - "Additionally, this account entry does not reflect a payment history which should be included on any entry that is reflected on my credit report. In my previous communication to them, I specifically requested that they provide an agreement that states their authority to collect on the alleged debt, agreement with signature of the alleged debtor wherein he/she agreed to pay the creditor, alleged account number, date this alleged debt became payable, original delinquency date, and to date to no avail. \n", - "\n", - "As such, since they have refused to respond to my request and not provide any documentation to substantiate their allegations, coupled with the fact that they did not provide me a dunning letter is grounds for this erroneous, inaccurate, unverifiable entry to be deleted from my credit report.\n", - "4. I accepted service from XXXX XXXX XXXX. The company did not inform me that internet was required. They also told me that the agreement was at will without penalty. They never addressed my needs as a customer. My bill is only {$230.00}. They placed false information regarding my bill with a collection agency who has placed information on my credit report without contacting me or giving me an opportunity to dispute the validity of the debt. The debt is not valid. The actions are unlawful and I am requesting that the actions of this collection agency be reported to the Federal Trade Commission.\n", - "5. I have continued to submit an investigation for a Bankruptcy place on my credit report. I have been trying to get this removed because it was place on my credit report in error and inaccurate. ALL THREE CREDIT BUREAUS have continue to ignore the information proving this was place in error and fail to properly investigate the dispute I have place in their office. \n", - "\n", - "1. They say they have verified this dispute with XXXX but I have a letter from XXXX stating this was removed because they were unable to verify the accuracy of the bankruptcy. I received this letter on XX/XX/XXXX. XXXX just finished an investigation on XX/XX/2019 stating the verified this with XXXX. \n", - "2.Experian Open the dispute on XX/XX/2019 and closed it on XX/XX/19 stated they verified with XXXX and the Bankruptcy court and I have a letter From XXXX stating they could not verify the accuracy of this dispute. 
I also, have a letter from the court house stating they do not verify information with the credit bureaus How could be this be on my XXXX file when XXXX has removed this item. \n", - "3. XXXX open and investigation XX/XX/2019 and closed it XX/XX/2019 No way they properly investigation I have submitted all information to dispute the inaccurate information. Please do a proper investigation. \n", + "1. XXXX on XXXX XX/XX/2021 I have Mr. Cooper mortgage for years now. On XXXX XXXX XXXX I made an additional payment of $ XXXX towards my principal. More than 4 days - it's not reflected in the Amortization schedule ( Amortization schedule is not even visible ). Even after so many additional principal payments, Payoff calculator is way off and it still shows XXXX maturity date while it should start showing something like XX/XX/XXXX/XX/XX/XXXX as the initial date. There are lots of discrepancies on their website to reflect the balance and total. When called customer service on Friday and also chatted - i was assured of fixing this - but no fix till this point of time. Customer service there is a long wait. Auto bot doesn't let customers talk to the real person. \n", + "Finally after a lots of follow-up I got the amortization schedule via email but it is not reflecting another additional principal payment of {$4700.00} made on XX/XX/2021. \n", + "I did numerous chats and phone calls. Why i should depend on inefficinent humans to see my revised amortization schedule? \n", + "Why the online amortization schedule is not visible now? \n", "\n", - "XXXX, Experian, and XXXX please do a proper investigation under 611 of the FCRA thank you very much I have attached the letter proving this this is not on my XXXX consumer report and a letter form the court house stating they do not report information to the credit bureaus from the XXXX XXXX XXXX, Clerk of Court United State Bankruptcy Court on dated XX/XX/2019 I have summited it to the credit bureaus to be ignored. I have as for a description of my investigation by section 611 of the FCRA and the information from the investigation is inaccurate.\n", + "Worst thing, after turning on escrow account, there is no transparency. Amount of escrow account is not in sync with all the sections of my online account. It seems that there are too many internal buggy systems in Mr. Cooper and everybody from different IT department is showing a different $ $ figure at various places. \n", + "Highly inefficient organization.\n", + "2. I had a loan with national Collegiate Trust. i did n't make any payments on the loan for 6 years and due to the \" statute to limitations '' the loan collector should not be contacting me any more, by they still are in both forms phone call 's and letters. I am also trying to buy a house for my family and found out that i can not get a USDA loan for the house because National Collegiate Trust has filled the loan as a government delinquent loan. The problem with that is the loan is a private loan not a federal/Government loan. due to the way they filled the lion on my credit report i am not able to buy my first home.\n", + "3. Cares act refund requested in XXXX, called multiple times given conflicting information every time. Its now XXXX and looking like its going to be even longer potentially months before the refund arrives. Blatant disregard for the cares act signed into law and terrible customer service. This company is ill suited to service the federal government contract it enjoys.\n", + "4. 
In an attempt to pay off my loan with Ocwen, I sent XXXX large payments to the institution on XXXX XXXX that would have decreased my loan amount to within a couple of hundred dollars ( you can not send XXXX payment that exceeds 80 % of the loan balance so I broke the payments up into XXXX XXXX. I scheduled the payments for the same day because I did not want any interest to accrue. After a few days, I noticed where the XXXX payments were withdrawn from my bank but that my Ocwen balance had not changed to reflect my XXXX payments. I called Ocwen on XXXX XXXX to ask about the payment. The Ocwen rep explained that because the XXXX payments were scheduled for the same day, that it exceeded the 80 % max limit. I asked Ocwen to return my XXXX payments. The rep said it would take XXXX business days. I called Ocwen back on XXXX XXXX when my funds had not been returned to my bank account. I gave them my account number and routing number to my bank, information that I am sure they already had since my monthly mortgage payments are debited from my account. They asked me to wait a couple more days. I called Ocwen back on XXXX XXXX. The rep asked me to be patient and assured me that the funds would be returned by XXXX XXXX. There were no funds returned to my account on XXXX XXXX. I called Ocwen again. I was asked to wait 40+ minutes on hold while the Ocwen rep put me in touch with an escalation specialist ( ES ). The ES told me that my funds would be reurned within XXXX hours and that he had put it in as a priority because I had called so often. There were no funds on XXXX XXXX. I called Ocwen again to see if there was a problem. There was. After speaking to a rep and another ES, I was told that my funds could not be returned to me! The ES said that he did not see my funds! He claimed to put me in touch with someone who XXXX be able to address my concerns. So that 's where I am now, on hold waiting to speak to yet another person! This is a significant amount of money and I fear that Ocwen is trying to get away with keeping my XXXX payments!\n", + "5. In XX/XX/XXXX we received our first bill from XXXX XXXX for XXXX. ( attached ) We promptly paid the bill. Again, in XX/XX/XXXX we paid our second bill for XXXX. Again, both on time. Then when XX/XX/XXXX statement came we were billed XXXX. My husband called XXXX XXXX to find out what the issue was. We were told there was a loan shortfall caused by us paying XXXX ( the amount we were billed ) and that our loan was re-amoritized. I argued I had a fixed rate loan, had never missed a payment, had never made a late payment, and paid exactly what was billed. Well, after double checking my promissory note ( attached ) and TILA ( attached ) I was to always be billed XXXX. XXXX changed the monthly payment and thus caused a shortfall. When I told them this information they refused to correct the shortfall. Not only did they not correct the mistake they refused to return my calls or emails to provide answers for these issues. Around 90 days later and this issue still exists and they still refuse to answer. Additionally I offered to make up the shortfall myself by offering a check for the missing amount and they just applied it to interest. Thus the shortfall still exists. The extra amount would have gone directly to principal. Additionally, in XX/XX/XXXX we made an extra payment amount on top of the monthly payment. This was made all at the same time. 
The letter we sent contained directions to only apply extra payments beyond the monthly requirement be applied to principal and not the next months payment. This was ignored. Then XXXX \" a higher up '' as she calls herself lied and told me it went to principal when clearly it did not. We requested this be fixed and it has not been done. No one has offered to fix anything either. A certified letter is attached that I mailed. I also made dozens of calls.\n", "\n" ] } @@ -1633,7 +1593,7 @@ { "data": { "text/html": [ - "Query job a069b4a5-5238-4ca8-a6c0-d48781d00f6c is DONE. 0 Bytes processed. Open Job" + "Query job de5da6c9-96b5-42a1-b199-42687392fe37 is DONE. 0 Bytes processed. Open Job" ], "text/plain": [ "" @@ -1671,7 +1631,7 @@ { "data": { "text/html": [ - "Query job 63f6e1d0-b0dc-4f5c-a001-5889c28162c5 is DONE. 0 Bytes processed. Open Job" + "Query job 1363c327-00b5-4835-a902-da84882bc996 is DONE. 0 Bytes processed. Open Job" ], "text/plain": [ "" @@ -1683,7 +1643,7 @@ { "data": { "text/html": [ - "Query job c1c9e28b-ba6d-4485-b892-0bf2428f927c is DONE. 8 Bytes processed. Open Job" + "Query job c5996f1e-a140-4e7d-8775-091e1a73d882 is DONE. 8 Bytes processed. Open Job" ], "text/plain": [ "" @@ -1695,7 +1655,7 @@ { "data": { "text/html": [ - "Query job 67402b3c-eee4-4fe4-aeaf-fb27606ecde7 is DONE. 2 Bytes processed. Open Job" + "Query job db1de3ab-2e6e-4b3f-8e6a-01bad33ac45f is DONE. 2 Bytes processed. Open Job" ], "text/plain": [ "" @@ -1707,7 +1667,7 @@ { "data": { "text/html": [ - "Query job 83166900-0787-4a6d-b822-c3be87990e35 is DONE. 328 Bytes processed. Open Job" + "Query job 38d9a9d0-7f03-4091-858b-f864da30987e is DONE. 375 Bytes processed. Open Job" ], "text/plain": [ "" @@ -1719,7 +1679,7 @@ { "data": { "text/plain": [ - "'The most obvious difference between the two lists of comments is that the first list contains comments about credit report issues related to mortgages and loans, while the second list contains comments about credit report issues related to other types of debts, such as cell phone bills, collections, and bankruptcies.'" + "'The most obvious difference between the two lists of comments is the subject matter. The first list of comments is primarily focused on issues with financial institutions, such as Navient, Nelnet, PayPal, and Mr. Cooper. 
The second list of comments is primarily focused on issues with government agencies, such as the National Collegiate Trust, the USDA, and Ocwen.'" ] }, "execution_count": 19, @@ -1773,7 +1733,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.13" + "version": "3.10.9" } }, "nbformat": 4, diff --git a/tests/system/small/ml/test_llm.py b/tests/system/small/ml/test_llm.py index e526d54362..b9e4889801 100644 --- a/tests/system/small/ml/test_llm.py +++ b/tests/system/small/ml/test_llm.py @@ -261,8 +261,8 @@ def test_embedding_generator_predict_success( ): df = palm2_embedding_generator_model.predict(llm_text_df).to_pandas() assert df.shape == (3, 4) - assert "ml_generate_embedding_result" in df.columns - series = df["ml_generate_embedding_result"] + assert "text_embedding" in df.columns + series = df["text_embedding"] value = series[0] assert len(value) == 768 @@ -273,8 +273,8 @@ def test_embedding_generator_multilingual_predict_success( ): df = palm2_embedding_generator_multilingual_model.predict(llm_text_df).to_pandas() assert df.shape == (3, 4) - assert "ml_generate_embedding_result" in df.columns - series = df["ml_generate_embedding_result"] + assert "text_embedding" in df.columns + series = df["text_embedding"] value = series[0] assert len(value) == 768 @@ -285,8 +285,8 @@ def test_embedding_generator_predict_series_success( ): df = palm2_embedding_generator_model.predict(llm_text_df["prompt"]).to_pandas() assert df.shape == (3, 4) - assert "ml_generate_embedding_result" in df.columns - series = df["ml_generate_embedding_result"] + assert "text_embedding" in df.columns + series = df["text_embedding"] value = series[0] assert len(value) == 768 From ea9576125d46f3912372f75ebe51196ba83e96db Mon Sep 17 00:00:00 2001 From: Ashley Xu <139821907+ashleyxuu@users.noreply.github.com> Date: Tue, 2 Apr 2024 11:06:17 -0700 Subject: [PATCH 34/53] docs: add opeartions in API docs (#557) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! 
That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [x] Appropriate docs were updated (if necessary) - [x] Plotting module:https://screenshot.googleplex.com/8VQbFyz4U2vzVd2 Fixes # 🦕 --- docs/templates/toc.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/docs/templates/toc.yml b/docs/templates/toc.yml index 57b0522d04..1898655535 100644 --- a/docs/templates/toc.yml +++ b/docs/templates/toc.yml @@ -60,6 +60,11 @@ name: Indexers - name: pandas uid: bigframes.pandas + - items: + - name: Plotting + uid: bigframes.operations.plotting + - name: PlotAccessor + uid: bigframes.operations.plotting.PlotAccessor - items: - name: Series uid: bigframes.series.Series From 1caac27fe95ef3eb36bad2ac351090891922858c Mon Sep 17 00:00:00 2001 From: Chelsea Lin <124939984+chelsea-lin@users.noreply.github.com> Date: Tue, 2 Apr 2024 16:29:31 -0700 Subject: [PATCH 35/53] fix: exclude list-like s parameter in plot.scatter (#568) --- bigframes/operations/_matplotlib/core.py | 6 ++++++ tests/system/small/operations/test_plotting.py | 16 ++++++++++++++++ .../bigframes_vendored/pandas/plotting/_core.py | 3 --- 3 files changed, 22 insertions(+), 3 deletions(-) diff --git a/bigframes/operations/_matplotlib/core.py b/bigframes/operations/_matplotlib/core.py index ad5abb4bca..2c1c2bc4ac 100644 --- a/bigframes/operations/_matplotlib/core.py +++ b/bigframes/operations/_matplotlib/core.py @@ -98,6 +98,12 @@ def __init__(self, data, **kwargs) -> None: f"Only support a single color string or a column name/posision. {constants.FEEDBACK_LINK}" ) + s = self.kwargs.get("s", None) + if self._is_sequence_arg(s): + raise NotImplementedError( + f"Only support a single color string or a column name/posision. {constants.FEEDBACK_LINK}" + ) + def _compute_plot_data(self): sample = self._compute_sample_data(self.data) diff --git a/tests/system/small/operations/test_plotting.py b/tests/system/small/operations/test_plotting.py index 41ea7d4ebb..824125adf2 100644 --- a/tests/system/small/operations/test_plotting.py +++ b/tests/system/small/operations/test_plotting.py @@ -240,6 +240,22 @@ def test_scatter_args_c(c): ) +@pytest.mark.parametrize( + ("arg_name"), + [ + pytest.param("c", marks=pytest.mark.xfail(raises=NotImplementedError)), + pytest.param("s", marks=pytest.mark.xfail(raises=NotImplementedError)), + ], +) +def test_scatter_sequence_arg(arg_name): + data = { + "a": [1, 2, 3], + "b": [1, 2, 3], + } + arg_value = [3, 3, 1] + bpd.DataFrame(data).plot.scatter(x="a", y="b", **{arg_name: arg_value}) + + def test_sampling_plot_args_n(): df = bpd.DataFrame(np.arange(bf_mpl.DEFAULT_SAMPLING_N * 10), columns=["one"]) ax = df.plot.line() diff --git a/third_party/bigframes_vendored/pandas/plotting/_core.py b/third_party/bigframes_vendored/pandas/plotting/_core.py index f8da9efdc0..19f56965df 100644 --- a/third_party/bigframes_vendored/pandas/plotting/_core.py +++ b/third_party/bigframes_vendored/pandas/plotting/_core.py @@ -257,9 +257,6 @@ def scatter( - A string with the name of the column to be used for marker's size. - A single scalar so all points have the same size. - - A sequence of scalars, which will be used for each point's size - recursively. For instance, when passing [2,14] all points size - will be either 2 or 14, alternatively. c (str, int or array-like, optional): The color of each point. 
Possible values are: From 4ae0262a2b1dfc35c1e4c3392b9e21456d6e964e Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Tue, 2 Apr 2024 23:40:16 -0700 Subject: [PATCH 36/53] fix: Restore string to date/time type coercion (#565) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes # 🦕 --- bigframes/dtypes.py | 23 +++++-- bigframes/operations/__init__.py | 24 +++---- bigframes/operations/type.py | 22 +++++-- .../system/small/operations/test_datetimes.py | 64 +++++++++++++++++++ 4 files changed, 107 insertions(+), 26 deletions(-) diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index 79e1456f31..3d8c06d188 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -648,6 +648,7 @@ def is_compatible(scalar: typing.Any, dtype: Dtype) -> typing.Optional[Dtype]: def lcd_type(dtype1: Dtype, dtype2: Dtype) -> Dtype: + """Get the supertype of the two types.""" if dtype1 == dtype2: return dtype1 # Implicit conversion currently only supported for numeric types @@ -664,12 +665,26 @@ def lcd_type(dtype1: Dtype, dtype2: Dtype) -> Dtype: return hierarchy[lcd_index] -def lcd_etype(etype1: ExpressionType, etype2: ExpressionType) -> ExpressionType: - if etype1 is None: +def coerce_to_common(etype1: ExpressionType, etype2: ExpressionType) -> ExpressionType: + """Coerce types to a common type or throw a TypeError""" + if etype1 is not None and etype2 is not None: + common_supertype = lcd_type(etype1, etype2) + if common_supertype is not None: + return common_supertype + if can_coerce(etype1, etype2): return etype2 - if etype2 is None: + if can_coerce(etype2, etype1): return etype1 - return lcd_type_or_throw(etype1, etype2) + raise TypeError(f"Cannot coerce {etype1} and {etype2} to a common type.") + + +def can_coerce(source_type: ExpressionType, target_type: ExpressionType) -> bool: + if source_type is None: + return True # None can be coerced to any supported type + else: + return (source_type == STRING_DTYPE) and ( + target_type in (DATETIME_DTYPE, TIMESTAMP_DTYPE, TIME_DTYPE, DATE_DTYPE) + ) def lcd_type_or_throw(dtype1: Dtype, dtype2: Dtype) -> Dtype: diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py index dcd5494626..0dcc643238 100644 --- a/bigframes/operations/__init__.py +++ b/bigframes/operations/__init__.py @@ -548,16 +548,10 @@ def output_type(self, *input_types): # Binary Ops -fillna_op = create_binary_op(name="fillna", type_signature=op_typing.COMMON_SUPERTYPE) -cliplower_op = create_binary_op( - name="clip_lower", type_signature=op_typing.COMMON_SUPERTYPE -) -clipupper_op = create_binary_op( - name="clip_upper", type_signature=op_typing.COMMON_SUPERTYPE -) -coalesce_op = create_binary_op( - name="coalesce", type_signature=op_typing.COMMON_SUPERTYPE -) +fillna_op = create_binary_op(name="fillna", type_signature=op_typing.COERCE) +cliplower_op = create_binary_op(name="clip_lower", type_signature=op_typing.COERCE) +clipupper_op = create_binary_op(name="clip_upper", 
type_signature=op_typing.COERCE) +coalesce_op = create_binary_op(name="coalesce", type_signature=op_typing.COERCE) ## Math Ops @@ -575,7 +569,7 @@ def output_type(self, *input_types): right_type is None or dtypes.is_numeric(right_type) ): # Numeric addition - return dtypes.lcd_etype(left_type, right_type) + return dtypes.coerce_to_common(left_type, right_type) # TODO: Add temporal addition once delta types supported raise TypeError(f"Cannot add dtypes {left_type} and {right_type}") @@ -592,7 +586,7 @@ def output_type(self, *input_types): right_type is None or dtypes.is_numeric(right_type) ): # Numeric subtraction - return dtypes.lcd_etype(left_type, right_type) + return dtypes.coerce_to_common(left_type, right_type) # TODO: Add temporal addition once delta types supported raise TypeError(f"Cannot subtract dtypes {left_type} and {right_type}") @@ -652,7 +646,7 @@ class WhereOp(TernaryOp): def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: if input_types[1] != dtypes.BOOL_DTYPE: raise TypeError("where condition must be a boolean") - return dtypes.lcd_etype(input_types[0], input_types[2]) + return dtypes.coerce_to_common(input_types[0], input_types[2]) where_op = WhereOp() @@ -663,8 +657,8 @@ class ClipOp(TernaryOp): name: typing.ClassVar[str] = "clip" def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: - return dtypes.lcd_etype( - input_types[0], dtypes.lcd_etype(input_types[1], input_types[2]) + return dtypes.coerce_to_common( + input_types[0], dtypes.coerce_to_common(input_types[1], input_types[2]) ) diff --git a/bigframes/operations/type.py b/bigframes/operations/type.py index a1dc8edffc..f469070805 100644 --- a/bigframes/operations/type.py +++ b/bigframes/operations/type.py @@ -118,7 +118,7 @@ def output_type( raise TypeError(f"Type {left_type} is not numeric") if (right_type is not None) and not bigframes.dtypes.is_numeric(right_type): raise TypeError(f"Type {right_type} is not numeric") - return bigframes.dtypes.lcd_etype(left_type, right_type) + return bigframes.dtypes.coerce_to_common(left_type, right_type) @dataclasses.dataclass @@ -132,7 +132,7 @@ def output_type( raise TypeError(f"Type {left_type} is not numeric") if (right_type is not None) and not bigframes.dtypes.is_numeric(right_type): raise TypeError(f"Type {right_type} is not numeric") - lcd_type = bigframes.dtypes.lcd_etype(left_type, right_type) + lcd_type = bigframes.dtypes.coerce_to_common(left_type, right_type) if lcd_type == bigframes.dtypes.INT_DTYPE: # Real numeric ops produce floats on int input return bigframes.dtypes.FLOAT_DTYPE @@ -140,13 +140,21 @@ def output_type( @dataclasses.dataclass -class Supertype(BinaryTypeSignature): - """Type signature for functions that return a the supertype of its inputs. 
Currently BigFrames just supports upcasting numerics.""" +class CoerceCommon(BinaryTypeSignature): + """Attempt to coerce inputs to a compatible type.""" def output_type( self, left_type: ExpressionType, right_type: ExpressionType ) -> ExpressionType: - return bigframes.dtypes.lcd_etype(left_type, right_type) + try: + return bigframes.dtypes.coerce_to_common(left_type, right_type) + except TypeError: + pass + if bigframes.dtypes.can_coerce(left_type, right_type): + return right_type + if bigframes.dtypes.can_coerce(right_type, left_type): + return left_type + raise TypeError(f"Cannot coerce {left_type} and {right_type} to a common type.") @dataclasses.dataclass @@ -156,7 +164,7 @@ class Comparison(BinaryTypeSignature): def output_type( self, left_type: ExpressionType, right_type: ExpressionType ) -> ExpressionType: - common_type = bigframes.dtypes.lcd_etype(left_type, right_type) + common_type = CoerceCommon().output_type(left_type, right_type) if not bigframes.dtypes.is_comparable(common_type): raise TypeError(f"Types {left_type} and {right_type} are not comparable") return bigframes.dtypes.BOOL_DTYPE @@ -188,7 +196,7 @@ def output_type( BINARY_NUMERIC = BinaryNumeric() BINARY_REAL_NUMERIC = BinaryRealNumeric() COMPARISON = Comparison() -COMMON_SUPERTYPE = Supertype() +COERCE = CoerceCommon() LOGICAL = Logical() STRING_TRANSFORM = TypePreserving( bigframes.dtypes.is_string_like, description="numeric" diff --git a/tests/system/small/operations/test_datetimes.py b/tests/system/small/operations/test_datetimes.py index b952289a72..2824e86979 100644 --- a/tests/system/small/operations/test_datetimes.py +++ b/tests/system/small/operations/test_datetimes.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import datetime + import pandas as pd import pytest @@ -303,3 +305,65 @@ def test_dt_floor(scalars_dfs, col_name, freq): pd_result.astype(scalars_df[col_name].dtype), # floor preserves type bf_result, ) + + +def test_dt_compare_coerce_str_datetime(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_series: bigframes.series.Series = scalars_df["datetime_col"] + bf_result = (bf_series >= "2024-01-01").to_pandas() + + pd_result = scalars_pandas_df["datetime_col"] >= pd.to_datetime("2024-01-01") + + # pandas produces pyarrow bool dtype + assert_series_equal(pd_result, bf_result, check_dtype=False) + + +def test_dt_clip_datetime_literals(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_series: bigframes.series.Series = scalars_df["date_col"] + bf_result = bf_series.clip( + datetime.date(2020, 1, 1), datetime.date(2024, 1, 1) + ).to_pandas() + + pd_result = scalars_pandas_df["date_col"].clip( + datetime.date(2020, 1, 1), datetime.date(2024, 1, 1) + ) + + assert_series_equal( + pd_result, + bf_result, + ) + + +def test_dt_clip_coerce_str_date(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_series: bigframes.series.Series = scalars_df["date_col"] + bf_result = bf_series.clip("2020-01-01", "2024-01-01").to_pandas() + + # Pandas can't coerce with pyarrow types so convert first + pd_result = scalars_pandas_df["date_col"].clip( + datetime.date(2020, 1, 1), datetime.date(2024, 1, 1) + ) + + assert_series_equal( + pd_result, + bf_result, + ) + + +def test_dt_clip_coerce_str_timestamp(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_series: bigframes.series.Series = scalars_df["timestamp_col"] + bf_result = bf_series.clip( + "2020-01-01T20:03:50Z", "2024-01-01T20:03:50Z" + ).to_pandas() + + pd_result = scalars_pandas_df["timestamp_col"].clip( + pd.to_datetime("2020-01-01T20:03:50Z", utc=True), + pd.to_datetime("2024-01-01T20:03:50Z", utc=True), + ) + + assert_series_equal( + pd_result, + bf_result, + ) From b188146466780e6f7a041f51f5be51a7d60719c9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Wed, 3 Apr 2024 08:43:00 -0500 Subject: [PATCH 37/53] fix: include all names in MultiIndex repr (#564) docs: include Index in table-of-contents (#564) --- bigframes/core/blocks.py | 4 +- bigframes/core/indexes/__init__.py | 2 +- bigframes/core/indexes/{index.py => base.py} | 15 +++---- docs/reference/bigframes.pandas/indexing.rst | 2 +- docs/templates/toc.yml | 2 + scripts/publish_api_coverage.py | 4 +- tests/system/small/test_index.py | 39 +++++++++++++++++++ tests/system/small/test_session.py | 2 +- .../bigframes_vendored/pandas/core/frame.py | 2 +- .../pandas/core/indexes/base.py | 12 ++++++ .../bigframes_vendored/pandas/core/series.py | 2 +- 11 files changed, 69 insertions(+), 17 deletions(-) rename bigframes/core/indexes/{index.py => base.py} (98%) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 11899eef11..04a98ac9a4 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -1314,8 +1314,8 @@ def retrieve_repr_request_results( head_block = self computed_df, query_job = head_block.to_pandas() formatted_df = computed_df.set_axis(self.column_labels, axis=1) - # we reset the axis and substitute the bf index name for the default - formatted_df.index.name = self.index.name + # we reset the axis and substitute the bf index name(s) for the default + formatted_df.index.names = self.index.names # type: ignore return formatted_df, count, query_job def promote_offsets(self, label: Label = 
None) -> typing.Tuple[Block, str]: diff --git a/bigframes/core/indexes/__init__.py b/bigframes/core/indexes/__init__.py index 6419d0985a..ae6011ffa5 100644 --- a/bigframes/core/indexes/__init__.py +++ b/bigframes/core/indexes/__init__.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from bigframes.core.indexes.index import Index +from bigframes.core.indexes.base import Index __all__ = [ "Index", diff --git a/bigframes/core/indexes/index.py b/bigframes/core/indexes/base.py similarity index 98% rename from bigframes/core/indexes/index.py rename to bigframes/core/indexes/base.py index c818b68711..daa52a02b9 100644 --- a/bigframes/core/indexes/index.py +++ b/bigframes/core/indexes/base.py @@ -88,7 +88,12 @@ def from_frame( @property def name(self) -> blocks.Label: - return self.names[0] + names = self.names + if len(names) == 1: + return self.names[0] + else: + # pandas returns None for MultiIndex.name. + return None @name.setter def name(self, value: blocks.Label): @@ -460,14 +465,6 @@ def __init__( super().__init__(series_or_dataframe._block) self._whole_frame = series_or_dataframe - @property - def name(self) -> blocks.Label: - return self.names[0] - - @name.setter - def name(self, value: blocks.Label): - self.names = [value] - @property def names(self) -> typing.Sequence[blocks.Label]: """Returns the names of the Index.""" diff --git a/docs/reference/bigframes.pandas/indexing.rst b/docs/reference/bigframes.pandas/indexing.rst index 8f7f194740..2cc1acfabf 100644 --- a/docs/reference/bigframes.pandas/indexing.rst +++ b/docs/reference/bigframes.pandas/indexing.rst @@ -3,7 +3,7 @@ Index objects ============= -.. autoclass:: bigframes.core.indexes.index.Index +.. autoclass:: bigframes.core.indexes.base.Index :members: :inherited-members: :undoc-members: diff --git a/docs/templates/toc.yml b/docs/templates/toc.yml index 1898655535..3c2c688d78 100644 --- a/docs/templates/toc.yml +++ b/docs/templates/toc.yml @@ -40,6 +40,8 @@ - name: SeriesGroupBy uid: bigframes.core.groupby.SeriesGroupBy name: Groupby + - name: Index + uid: bigframes.core.indexes.base.Index - items: - name: AtDataFrameIndexer uid: bigframes.core.indexers.AtDataFrameIndexer diff --git a/scripts/publish_api_coverage.py b/scripts/publish_api_coverage.py index 4a35ade9ef..25fbfbf988 100644 --- a/scripts/publish_api_coverage.py +++ b/scripts/publish_api_coverage.py @@ -44,6 +44,9 @@ "dataframegroupby": ( "https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.core.groupby.DataFrameGroupBy#bigframes_core_groupby_DataFrameGroupBy_" ), + "index": ( + "https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.core.indexes.base.Index#bigframes_core_indexes_base_Index_" + ), "series": ( "https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.series.Series#bigframes_series_Series_" ), @@ -59,7 +62,6 @@ "window": ( "https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.core.window.Window#bigframes_core_window_Window_" ), - # TODO: Index not documented. 
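The reworked name accessor follows the pandas convention that a multi-level index reports no single name and exposes its labels through names instead. A minimal sketch of that accessor pattern, independent of the bigframes classes:

    from typing import Hashable, Optional, Sequence

    class IndexNameMixin:
        _names: Sequence[Hashable]

        @property
        def names(self) -> Sequence[Hashable]:
            return list(self._names)

        @property
        def name(self) -> Optional[Hashable]:
            names = self.names
            # Single level: the sole label. Multiple levels: None, matching pandas.
            return names[0] if len(names) == 1 else None
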
} diff --git a/tests/system/small/test_index.py b/tests/system/small/test_index.py index 1f39ba25fe..c419dc4907 100644 --- a/tests/system/small/test_index.py +++ b/tests/system/small/test_index.py @@ -370,3 +370,42 @@ def test_index_isin(scalars_df_index, scalars_pandas_df_index): bf_series, check_names=False, ) + + +def test_multiindex_name_is_none(session): + df = pd.DataFrame( + { + "A": [0, 0, 0, 1, 1, 1], + "B": ["x", "y", "z", "x", "y", "z"], + "C": [123, 345, 789, -123, -345, -789], + "D": ["a", "b", "c", "d", "e", "f"], + }, + ) + index = session.read_pandas(df).set_index(["A", "B"]).index + assert index.name is None + + +def test_multiindex_names_not_none(session): + df = pd.DataFrame( + { + "A": [0, 0, 0, 1, 1, 1], + "B": ["x", "y", "z", "x", "y", "z"], + "C": [123, 345, 789, -123, -345, -789], + "D": ["a", "b", "c", "d", "e", "f"], + }, + ) + index = session.read_pandas(df).set_index(["A", "B"]).index + assert tuple(index.names) == ("A", "B") + + +def test_multiindex_repr_includes_all_names(session): + df = pd.DataFrame( + { + "A": [0, 0, 0, 1, 1, 1], + "B": ["x", "y", "z", "x", "y", "z"], + "C": [123, 345, 789, -123, -345, -789], + "D": ["a", "b", "c", "d", "e", "f"], + }, + ) + index = session.read_pandas(df).set_index(["A", "B"]).index + assert "names=['A', 'B']" in repr(index) diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py index d0c20f3839..28a3f03860 100644 --- a/tests/system/small/test_session.py +++ b/tests/system/small/test_session.py @@ -27,7 +27,7 @@ import pytest import bigframes -import bigframes.core.indexes.index +import bigframes.core.indexes.base import bigframes.dataframe import bigframes.dtypes import bigframes.ml.linear_model diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index 50cce1eeab..3ae5b0db2a 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -4797,7 +4797,7 @@ def index(self): MultiIndex([( 'Alice', 'Seattle'), ( 'Bob', 'New York'), ('Aritra', 'Kona')], - name='Name') + names=['Name', 'Location']) >>> df1.index.values array([('Alice', 'Seattle'), ('Bob', 'New York'), ('Aritra', 'Kona')], dtype=object) diff --git a/third_party/bigframes_vendored/pandas/core/indexes/base.py b/third_party/bigframes_vendored/pandas/core/indexes/base.py index 3ad8729271..7f5761e45b 100644 --- a/third_party/bigframes_vendored/pandas/core/indexes/base.py +++ b/third_party/bigframes_vendored/pandas/core/indexes/base.py @@ -8,6 +8,18 @@ class Index: """Immutable sequence used for indexing and alignment. The basic object storing axis labels for all objects. + + Args: + data (pandas.Series | pandas.Index | bigframes.series.Series | bigframes.core.indexes.base.Index): + Labels (1-dimensional). + dtype: + Data type for the output Index. If not specified, this will be + inferred from `data`. + name: + Name to be stored in the index. + session (Optional[bigframes.session.Session]): + BigQuery DataFrames session where queries are run. If not set, + a default session is used. 
""" @property diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index 0aebd0660f..89b39cf8a0 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -87,7 +87,7 @@ def index(self): MultiIndex([( 'Alice', 'Seattle'), ( 'Bob', 'New York'), ('Aritra', 'Kona')], - name='Name') + names=['Name', 'Location']) >>> s1.index.values array([('Alice', 'Seattle'), ('Bob', 'New York'), ('Aritra', 'Kona')], dtype=object) From 90bcec5c73f7eefeff14bbd8bdcad3a4c9d91d8f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Wed, 3 Apr 2024 09:46:18 -0500 Subject: [PATCH 38/53] docs: `bigframes.options.bigquery.project` and `location` are optional in some circumstances (#548) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Follow-up to launch feedback and https://togithub.com/googleapis/python-bigquery-dataframes/pull/451 🦕 --- samples/snippets/quickstart.py | 3 ++- samples/snippets/set_options_test.py | 7 +++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/samples/snippets/quickstart.py b/samples/snippets/quickstart.py index a15ea16853..ae3a934004 100644 --- a/samples/snippets/quickstart.py +++ b/samples/snippets/quickstart.py @@ -29,8 +29,9 @@ def run_quickstart(project_id: str): import bigframes.pandas as bpd # Set BigQuery DataFrames options + # Note: The project option is not required in all environments. + # On BigQuery Studio, the project ID is automatically detected. bpd.options.bigquery.project = your_gcp_project_id - bpd.options.bigquery.location = "us" # Create a DataFrame from a BigQuery table query_or_table = "bigquery-public-data.ml_datasets.penguins" diff --git a/samples/snippets/set_options_test.py b/samples/snippets/set_options_test.py index ef6f41ce54..f981009e9a 100644 --- a/samples/snippets/set_options_test.py +++ b/samples/snippets/set_options_test.py @@ -26,7 +26,14 @@ def test_bigquery_dataframes_set_options(): REGION = "US" # @param {type:"string"} # Set BigQuery DataFrames options + # Note: The project option is not required in all environments. + # On BigQuery Studio, the project ID is automatically detected. bpd.options.bigquery.project = PROJECT_ID + + # Note: The location option is not required. + # It defaults to the location of the first table or query + # passed to read_gbq(). For APIs where a location can't be + # auto-detected, the location defaults to the "US" location. 
bpd.options.bigquery.location = REGION # [END bigquery_dataframes_set_options] From 853c25e8023bf877f28cda4dade0694d0299a83e Mon Sep 17 00:00:00 2001 From: Garrett Wu <6505921+GarrettWu@users.noreply.github.com> Date: Wed, 3 Apr 2024 10:33:45 -0700 Subject: [PATCH 39/53] feat: add ml.metrics.mean_squared_error (#559) * feat: add ml.metrics.mean_squared_error * fix docs * fix docs --- bigframes/ml/metrics/__init__.py | 2 + bigframes/ml/metrics/_metrics.py | 14 ++ tests/system/small/ml/test_metrics.py | 121 ++++++++---------- .../sklearn/metrics/_regression.py | 27 ++++ 4 files changed, 95 insertions(+), 69 deletions(-) diff --git a/bigframes/ml/metrics/__init__.py b/bigframes/ml/metrics/__init__.py index 6b0a243426..e79b46877b 100644 --- a/bigframes/ml/metrics/__init__.py +++ b/bigframes/ml/metrics/__init__.py @@ -18,6 +18,7 @@ auc, confusion_matrix, f1_score, + mean_squared_error, precision_score, r2_score, recall_score, @@ -35,5 +36,6 @@ "confusion_matrix", "precision_score", "f1_score", + "mean_squared_error", "pairwise", ] diff --git a/bigframes/ml/metrics/_metrics.py b/bigframes/ml/metrics/_metrics.py index ee86798b33..542e6300a8 100644 --- a/bigframes/ml/metrics/_metrics.py +++ b/bigframes/ml/metrics/_metrics.py @@ -335,3 +335,17 @@ def f1_score( f1_score.__doc__ = inspect.getdoc(vendored_metrics_classification.f1_score) + + +def mean_squared_error( + y_true: Union[bpd.DataFrame, bpd.Series], + y_pred: Union[bpd.DataFrame, bpd.Series], +) -> float: + y_true_series, y_pred_series = utils.convert_to_series(y_true, y_pred) + + return (y_pred_series - y_true_series).pow(2).sum() / len(y_true_series) + + +mean_squared_error.__doc__ = inspect.getdoc( + vendored_metrics_regression.mean_squared_error +) diff --git a/tests/system/small/ml/test_metrics.py b/tests/system/small/ml/test_metrics.py index b40982e282..c4c7eb4b88 100644 --- a/tests/system/small/ml/test_metrics.py +++ b/tests/system/small/ml/test_metrics.py @@ -19,7 +19,8 @@ import pytest import sklearn.metrics as sklearn_metrics # type: ignore -import bigframes.ml.metrics +import bigframes +from bigframes.ml import metrics def test_r2_score_perfect_fit(session): @@ -32,9 +33,7 @@ def test_r2_score_perfect_fit(session): df = session.read_pandas(pd_df) assert ( - bigframes.ml.metrics.r2_score( - df[["y_true_arbitrary_name"]], df[["y_pred_arbitrary_name"]] - ) + metrics.r2_score(df[["y_true_arbitrary_name"]], df[["y_pred_arbitrary_name"]]) == 1.0 ) @@ -43,7 +42,7 @@ def test_r2_score_bad_fit(session): pd_df = pd.DataFrame({"y_true": [1, 2, 3, 4, 5], "y_pred": [5, 4, 3, 2, 1]}) df = session.read_pandas(pd_df) - assert bigframes.ml.metrics.r2_score(df[["y_true"]], df[["y_pred"]]) == -3.0 + assert metrics.r2_score(df[["y_true"]], df[["y_pred"]]) == -3.0 def test_r2_score_force_finite(session): @@ -56,23 +55,21 @@ def test_r2_score_force_finite(session): ) df = session.read_pandas(pd_df) - assert bigframes.ml.metrics.r2_score( + assert metrics.r2_score( df[["y_true"]], df[["y_pred_1"]], force_finite=False ) == float("-inf") - assert bigframes.ml.metrics.r2_score(df[["y_true"]], df[["y_pred_1"]]) == 0.0 + assert metrics.r2_score(df[["y_true"]], df[["y_pred_1"]]) == 0.0 assert math.isnan( - bigframes.ml.metrics.r2_score( - df[["y_true"]], df[["y_pred_2"]], force_finite=False - ) + metrics.r2_score(df[["y_true"]], df[["y_pred_2"]], force_finite=False) ) - assert bigframes.ml.metrics.r2_score(df[["y_true"]], df[["y_pred_2"]]) == 1.0 + assert metrics.r2_score(df[["y_true"]], df[["y_pred_2"]]) == 1.0 def 
test_r2_score_ok_fit_matches_sklearn(session): pd_df = pd.DataFrame({"y_true": [1, 2, 3, 4, 5], "y_pred": [2, 3, 4, 3, 6]}) df = session.read_pandas(pd_df) - bf_result = bigframes.ml.metrics.r2_score(df[["y_true"]], df[["y_pred"]]) + bf_result = metrics.r2_score(df[["y_true"]], df[["y_pred"]]) sklearn_result = sklearn_metrics.r2_score(pd_df[["y_true"]], pd_df[["y_pred"]]) assert math.isclose(bf_result, sklearn_result) @@ -81,7 +78,7 @@ def test_r2_score_series(session): pd_df = pd.DataFrame({"y_true": [1, 7, 3, 2, 5], "y_pred": [1, 7, 3, 2, 5]}) df = session.read_pandas(pd_df) - assert bigframes.ml.metrics.r2_score(df["y_true"], df["y_pred"]) == 1.0 + assert metrics.r2_score(df["y_true"], df["y_pred"]) == 1.0 def test_accuracy_score_perfect_fit(session): @@ -94,7 +91,7 @@ def test_accuracy_score_perfect_fit(session): df = session.read_pandas(pd_df) assert ( - bigframes.ml.metrics.accuracy_score( + metrics.accuracy_score( df[["y_true_arbitrary_name"]], df[["y_pred_arbitrary_name"]] ) == 1.0 @@ -105,26 +102,21 @@ def test_accuracy_score_bad_fit(session): pd_df = pd.DataFrame({"y_true": [0, 2, 1, 3, 4], "y_pred": [0, 1, 2, 3, 4]}) df = session.read_pandas(pd_df) - assert bigframes.ml.metrics.accuracy_score(df[["y_true"]], df[["y_pred"]]) == 0.6 + assert metrics.accuracy_score(df[["y_true"]], df[["y_pred"]]) == 0.6 def test_accuracy_score_not_normailze(session): pd_df = pd.DataFrame({"y_true": [0, 2, 1, 3, 4], "y_pred": [0, 1, 2, 3, 4]}) df = session.read_pandas(pd_df) - assert ( - bigframes.ml.metrics.accuracy_score( - df[["y_true"]], df[["y_pred"]], normalize=False - ) - == 3 - ) + assert metrics.accuracy_score(df[["y_true"]], df[["y_pred"]], normalize=False) == 3 def test_accuracy_score_fit_matches_sklearn(session): pd_df = pd.DataFrame({"y_true": [1, 2, 3, 4, 5], "y_pred": [2, 3, 4, 3, 6]}) df = session.read_pandas(pd_df) - bf_result = bigframes.ml.metrics.accuracy_score(df[["y_true"]], df[["y_pred"]]) + bf_result = metrics.accuracy_score(df[["y_true"]], df[["y_pred"]]) sklearn_result = sklearn_metrics.accuracy_score( pd_df[["y_true"]], pd_df[["y_pred"]] ) @@ -135,7 +127,7 @@ def test_accuracy_score_series(session): pd_df = pd.DataFrame({"y_true": [1, 7, 3, 2, 5], "y_pred": [1, 7, 3, 2, 5]}) df = session.read_pandas(pd_df) - assert bigframes.ml.metrics.accuracy_score(df["y_true"], df["y_pred"]) == 1.0 + assert metrics.accuracy_score(df["y_true"], df["y_pred"]) == 1.0 def test_roc_curve_binary_classification_prediction_returns_expected(session): @@ -158,7 +150,7 @@ def test_roc_curve_binary_classification_prediction_returns_expected(session): ) df = session.read_pandas(pd_df) - fpr, tpr, thresholds = bigframes.ml.metrics.roc_curve( + fpr, tpr, thresholds = metrics.roc_curve( df[["y_true_arbitrary_name"]], df[["y_score_arbitrary_name"]], drop_intermediate=False, @@ -219,7 +211,7 @@ def test_roc_curve_binary_classification_prediction_matches_sklearn(session): ) df = session.read_pandas(pd_df) - fpr, tpr, thresholds = bigframes.ml.metrics.roc_curve( + fpr, tpr, thresholds = metrics.roc_curve( df[["y_true"]], df[["y_score"]], drop_intermediate=False ) expected_fpr, expected_tpr, expected_thresholds = sklearn_metrics.roc_curve( @@ -259,7 +251,7 @@ def test_roc_curve_binary_classification_decision_returns_expected(session): ) df = session.read_pandas(pd_df) - fpr, tpr, thresholds = bigframes.ml.metrics.roc_curve( + fpr, tpr, thresholds = metrics.roc_curve( df[["y_true"]], df[["y_score"]], drop_intermediate=False ) @@ -314,7 +306,7 @@ def 
test_roc_curve_binary_classification_decision_matches_sklearn(session): ) df = session.read_pandas(pd_df) - fpr, tpr, thresholds = bigframes.ml.metrics.roc_curve( + fpr, tpr, thresholds = metrics.roc_curve( df[["y_true"]], df[["y_score"]], drop_intermediate=False ) expected_fpr, expected_tpr, expected_thresholds = sklearn_metrics.roc_curve( @@ -350,7 +342,7 @@ def test_roc_curve_binary_classification_prediction_series(session): ) df = session.read_pandas(pd_df) - fpr, tpr, thresholds = bigframes.ml.metrics.roc_curve( + fpr, tpr, thresholds = metrics.roc_curve( df["y_true"], df["y_score"], drop_intermediate=False ) @@ -420,7 +412,7 @@ def test_roc_auc_score_returns_expected(session): ) df = session.read_pandas(pd_df) - score = bigframes.ml.metrics.roc_auc_score( + score = metrics.roc_auc_score( df[["y_true_arbitrary_name"]], df[["y_score_arbitrary_name"]] ) @@ -436,7 +428,7 @@ def test_roc_auc_score_returns_matches_sklearn(session): ) df = session.read_pandas(pd_df) - score = bigframes.ml.metrics.roc_auc_score(df[["y_true"]], df[["y_score"]]) + score = metrics.roc_auc_score(df[["y_true"]], df[["y_score"]]) expected_score = sklearn_metrics.roc_auc_score( pd_df[["y_true"]], pd_df[["y_score"]] ) @@ -453,7 +445,7 @@ def test_roc_auc_score_series(session): ) df = session.read_pandas(pd_df) - score = bigframes.ml.metrics.roc_auc_score(df["y_true"], df["y_score"]) + score = metrics.roc_auc_score(df["y_true"], df["y_score"]) assert score == 0.625 @@ -462,33 +454,33 @@ def test_auc_invalid_x_size(session): pd_df = pd.DataFrame({"x_arbitrary_name": [0], "y_arbitrary_name": [0]}) df = session.read_pandas(pd_df) with pytest.raises(ValueError): - bigframes.ml.metrics.auc(df[["x_arbitrary_name"]], df[["y_arbitrary_name"]]) + metrics.auc(df[["x_arbitrary_name"]], df[["y_arbitrary_name"]]) def test_auc_nondecreasing_x(session): pd_df = pd.DataFrame({"x": [0, 0, 0.5, 0.5, 1], "y": [0, 0.5, 0.5, 1, 1]}) df = session.read_pandas(pd_df) - assert bigframes.ml.metrics.auc(df[["x"]], df[["y"]]) == 0.75 + assert metrics.auc(df[["x"]], df[["y"]]) == 0.75 def test_auc_nonincreasing_x(session): pd_df = pd.DataFrame({"x": [0, 0, -0.5, -0.5, -1], "y": [0, 0.5, 0.5, 1, 1]}) df = session.read_pandas(pd_df) - assert bigframes.ml.metrics.auc(df[["x"]], df[["y"]]) == 0.75 + assert metrics.auc(df[["x"]], df[["y"]]) == 0.75 def test_auc_nonincreasing_x_negative(session): pd_df = pd.DataFrame({"x": [0, 0, -0.5, -0.5, -1], "y": [0, -0.5, -0.5, -1, -1]}) df = session.read_pandas(pd_df) - assert bigframes.ml.metrics.auc(df[["x"]], df[["y"]]) == -0.75 + assert metrics.auc(df[["x"]], df[["y"]]) == -0.75 def test_auc_series(session): pd_df = pd.DataFrame({"x": [0, 0, 0.5, 0.5, 1], "y": [0, 0.5, 0.5, 1, 1]}) df = session.read_pandas(pd_df) - assert bigframes.ml.metrics.auc(df["x"], df["y"]) == 0.75 + assert metrics.auc(df["x"], df["y"]) == 0.75 def test_confusion_matrix(session): @@ -499,7 +491,7 @@ def test_confusion_matrix(session): } ).astype("Int64") df = session.read_pandas(pd_df) - confusion_matrix = bigframes.ml.metrics.confusion_matrix( + confusion_matrix = metrics.confusion_matrix( df[["y_true_arbitrary_name"]], df[["y_pred_arbitrary_name"]] ) expected_pd_df = pd.DataFrame( @@ -522,9 +514,7 @@ def test_confusion_matrix_column_index(session): } ).astype("Int64") df = session.read_pandas(pd_df) - confusion_matrix = bigframes.ml.metrics.confusion_matrix( - df[["y_true"]], df[["y_pred"]] - ) + confusion_matrix = metrics.confusion_matrix(df[["y_true"]], df[["y_pred"]]) expected_pd_df = ( pd.DataFrame( {1: [1, 0, 1, 0], 2: [0, 0, 
2, 0], 3: [0, 0, 0, 0], 4: [0, 1, 0, 1]} @@ -545,9 +535,7 @@ def test_confusion_matrix_matches_sklearn(session): } ).astype("Int64") df = session.read_pandas(pd_df) - confusion_matrix = bigframes.ml.metrics.confusion_matrix( - df[["y_true"]], df[["y_pred"]] - ) + confusion_matrix = metrics.confusion_matrix(df[["y_true"]], df[["y_pred"]]) expected_confusion_matrix = sklearn_metrics.confusion_matrix( pd_df[["y_true"]], pd_df[["y_pred"]] ) @@ -565,9 +553,7 @@ def test_confusion_matrix_str_matches_sklearn(session): } ).astype("str") df = session.read_pandas(pd_df) - confusion_matrix = bigframes.ml.metrics.confusion_matrix( - df[["y_true"]], df[["y_pred"]] - ) + confusion_matrix = metrics.confusion_matrix(df[["y_true"]], df[["y_pred"]]) expected_confusion_matrix = sklearn_metrics.confusion_matrix( pd_df[["y_true"]], pd_df[["y_pred"]] ) @@ -588,7 +574,7 @@ def test_confusion_matrix_series(session): } ).astype("Int64") df = session.read_pandas(pd_df) - confusion_matrix = bigframes.ml.metrics.confusion_matrix(df["y_true"], df["y_pred"]) + confusion_matrix = metrics.confusion_matrix(df["y_true"], df["y_pred"]) expected_pd_df = pd.DataFrame( { 0: [2, 0, 1], @@ -609,7 +595,7 @@ def test_recall_score(session): } ).astype("Int64") df = session.read_pandas(pd_df) - recall = bigframes.ml.metrics.recall_score( + recall = metrics.recall_score( df[["y_true_arbitrary_name"]], df[["y_pred_arbitrary_name"]], average=None ) expected_values = [1.000000, 0.000000, 0.666667] @@ -627,9 +613,7 @@ def test_recall_score_matches_sklearn(session): } ).astype("Int64") df = session.read_pandas(pd_df) - recall = bigframes.ml.metrics.recall_score( - df[["y_true"]], df[["y_pred"]], average=None - ) + recall = metrics.recall_score(df[["y_true"]], df[["y_pred"]], average=None) expected_values = sklearn_metrics.recall_score( pd_df[["y_true"]], pd_df[["y_pred"]], average=None ) @@ -646,9 +630,7 @@ def test_recall_score_str_matches_sklearn(session): } ).astype("str") df = session.read_pandas(pd_df) - recall = bigframes.ml.metrics.recall_score( - df[["y_true"]], df[["y_pred"]], average=None - ) + recall = metrics.recall_score(df[["y_true"]], df[["y_pred"]], average=None) expected_values = sklearn_metrics.recall_score( pd_df[["y_true"]], pd_df[["y_pred"]], average=None ) @@ -665,7 +647,7 @@ def test_recall_score_series(session): } ).astype("Int64") df = session.read_pandas(pd_df) - recall = bigframes.ml.metrics.recall_score(df["y_true"], df["y_pred"], average=None) + recall = metrics.recall_score(df["y_true"], df["y_pred"], average=None) expected_values = [1.000000, 0.000000, 0.666667] expected_index = [0, 1, 2] expected_recall = pd.Series(expected_values, index=expected_index) @@ -681,7 +663,7 @@ def test_precision_score(session): } ).astype("Int64") df = session.read_pandas(pd_df) - precision_score = bigframes.ml.metrics.precision_score( + precision_score = metrics.precision_score( df[["y_true_arbitrary_name"]], df[["y_pred_arbitrary_name"]], average=None ) expected_values = [0.666667, 0.000000, 0.666667] @@ -701,7 +683,7 @@ def test_precision_score_matches_sklearn(session): } ).astype("Int64") df = session.read_pandas(pd_df) - precision_score = bigframes.ml.metrics.precision_score( + precision_score = metrics.precision_score( df[["y_true"]], df[["y_pred"]], average=None ) expected_values = sklearn_metrics.precision_score( @@ -722,7 +704,7 @@ def test_precision_score_str_matches_sklearn(session): } ).astype("str") df = session.read_pandas(pd_df) - precision_score = bigframes.ml.metrics.precision_score( + precision_score = 
metrics.precision_score( df[["y_true"]], df[["y_pred"]], average=None ) expected_values = sklearn_metrics.precision_score( @@ -743,9 +725,7 @@ def test_precision_score_series(session): } ).astype("Int64") df = session.read_pandas(pd_df) - precision_score = bigframes.ml.metrics.precision_score( - df["y_true"], df["y_pred"], average=None - ) + precision_score = metrics.precision_score(df["y_true"], df["y_pred"], average=None) expected_values = [0.666667, 0.000000, 0.666667] expected_index = [0, 1, 2] expected_precision = pd.Series(expected_values, index=expected_index) @@ -763,7 +743,7 @@ def test_f1_score(session): } ).astype("Int64") df = session.read_pandas(pd_df) - f1_score = bigframes.ml.metrics.f1_score( + f1_score = metrics.f1_score( df[["y_true_arbitrary_name"]], df[["y_pred_arbitrary_name"]], average=None ) expected_values = [0.8, 0.000000, 0.666667] @@ -781,9 +761,7 @@ def test_f1_score_matches_sklearn(session): } ).astype("Int64") df = session.read_pandas(pd_df) - f1_score = bigframes.ml.metrics.f1_score( - df[["y_true"]], df[["y_pred"]], average=None - ) + f1_score = metrics.f1_score(df[["y_true"]], df[["y_pred"]], average=None) expected_values = sklearn_metrics.f1_score( pd_df[["y_true"]], pd_df[["y_pred"]], average=None ) @@ -800,9 +778,7 @@ def test_f1_score_str_matches_sklearn(session): } ).astype("str") df = session.read_pandas(pd_df) - f1_score = bigframes.ml.metrics.f1_score( - df[["y_true"]], df[["y_pred"]], average=None - ) + f1_score = metrics.f1_score(df[["y_true"]], df[["y_pred"]], average=None) expected_values = sklearn_metrics.f1_score( pd_df[["y_true"]], pd_df[["y_pred"]], average=None ) @@ -819,9 +795,16 @@ def test_f1_score_series(session): } ).astype("Int64") df = session.read_pandas(pd_df) - f1_score = bigframes.ml.metrics.f1_score(df["y_true"], df["y_pred"], average=None) + f1_score = metrics.f1_score(df["y_true"], df["y_pred"], average=None) expected_values = [0.8, 0.000000, 0.666667] expected_index = [0, 1, 2] expected_f1 = pd.Series(expected_values, index=expected_index) pd.testing.assert_series_equal(f1_score, expected_f1, check_index_type=False) + + +def test_mean_squared_error(session: bigframes.Session): + pd_df = pd.DataFrame({"y_true": [3, -0.5, 2, 7], "y_pred": [2.5, 0.0, 2, 8]}) + df = session.read_pandas(pd_df) + mse = metrics.mean_squared_error(df["y_true"], df["y_pred"]) + assert mse == 0.375 diff --git a/third_party/bigframes_vendored/sklearn/metrics/_regression.py b/third_party/bigframes_vendored/sklearn/metrics/_regression.py index be531a9b1c..c3e579bd29 100644 --- a/third_party/bigframes_vendored/sklearn/metrics/_regression.py +++ b/third_party/bigframes_vendored/sklearn/metrics/_regression.py @@ -64,3 +64,30 @@ def r2_score(y_true, y_pred, force_finite=True) -> float: float: The :math:`R^2` score. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + +def mean_squared_error(y_true, y_pred) -> float: + """Mean squared error regression loss. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> import bigframes.ml.metrics + >>> bpd.options.display.progress_bar = None + + >>> y_true = bpd.DataFrame([3, -0.5, 2, 7]) + >>> y_pred = bpd.DataFrame([2.5, 0.0, 2, 8]) + >>> mse = bigframes.ml.metrics.mean_squared_error(y_true, y_pred) + >>> mse + 0.375 + + Args: + y_true (Series or DataFrame of shape (n_samples,)): + Ground truth (correct) target values. + y_pred (Series or DataFrame of shape (n_samples,)): + Estimated target values. + + Returns: + float: Mean squared error. 
+ """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) From 512e932f360ea67b2912464e566e22c1e35fffe9 Mon Sep 17 00:00:00 2001 From: Garrett Wu <6505921+GarrettWu@users.noreply.github.com> Date: Wed, 3 Apr 2024 10:34:04 -0700 Subject: [PATCH 40/53] refactor: remove to_pandas in ml.metrics.roc_auc_score (#560) * refactor: remove to_pandas in ml.metrics.roc_auc_score * fix mypy --- bigframes/ml/metrics/_metrics.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/bigframes/ml/metrics/_metrics.py b/bigframes/ml/metrics/_metrics.py index 542e6300a8..b8c264e91b 100644 --- a/bigframes/ml/metrics/_metrics.py +++ b/bigframes/ml/metrics/_metrics.py @@ -161,14 +161,10 @@ def roc_auc_score( fpr, tpr, _ = roc_curve(y_true_series, y_score_series, drop_intermediate=False) - # TODO(bmil): remove this once bigframes supports the necessary operations - pd_fpr = fpr.to_pandas() - pd_tpr = tpr.to_pandas() - # Use the trapezoid rule to compute the area under the ROC curve - width_diff = pd_fpr.diff().iloc[1:].reset_index(drop=True) - height_avg = (pd_tpr.iloc[:-1] + pd_tpr.iloc[1:].reset_index(drop=True)) / 2 - return (width_diff * height_avg).sum() + width_diff = fpr.diff().iloc[1:].reset_index(drop=True) + height_avg = (tpr.iloc[:-1] + tpr.iloc[1:].reset_index(drop=True)) / 2 + return typing.cast(float, (width_diff * height_avg).sum()) roc_auc_score.__doc__ = inspect.getdoc(vendored_metrics_ranking.roc_auc_score) From 2fce51f820a18f54c51d25894a0ed02bf53293d2 Mon Sep 17 00:00:00 2001 From: Garrett Wu <6505921+GarrettWu@users.noreply.github.com> Date: Wed, 3 Apr 2024 10:34:32 -0700 Subject: [PATCH 41/53] test: remove transformer test column reorders (#566) --- tests/system/large/ml/test_compose.py | 12 +- tests/system/small/ml/test_preprocessing.py | 163 +++----------------- 2 files changed, 25 insertions(+), 150 deletions(-) diff --git a/tests/system/large/ml/test_compose.py b/tests/system/large/ml/test_compose.py index 72e016f4bb..0107d371cb 100644 --- a/tests/system/large/ml/test_compose.py +++ b/tests/system/large/ml/test_compose.py @@ -45,14 +45,8 @@ def test_columntransformer_standalone_fit_and_transform( ) result = transformer.transform(new_penguins_df).to_pandas() - # TODO: bug? feature columns seem to be in nondeterministic random order - # workaround: sort columns by name. Can't repro it in pantheon, so could - # be a bigframes issue... - result = result.reindex(sorted(result.columns), axis=1) - expected = pandas.DataFrame( { - "min_max_scaled_culmen_length_mm": [0.269, 0.232, 0.210], "onehotencoded_species": [ [{"index": 1, "value": 1.0}], [{"index": 1, "value": 1.0}], @@ -63,6 +57,7 @@ def test_columntransformer_standalone_fit_and_transform( -0.9945520581113803, -1.104611490204711, ], + "min_max_scaled_culmen_length_mm": [0.269, 0.232, 0.210], "standard_scaled_flipper_length_mm": [-0.350044, -1.418336, -0.9198], }, index=pandas.Index([1633, 1672, 1690], dtype="Int64", name="tag_number"), @@ -91,11 +86,6 @@ def test_columntransformer_standalone_fit_transform(new_penguins_df): new_penguins_df[["species", "culmen_length_mm", "flipper_length_mm"]] ).to_pandas() - # TODO: bug? feature columns seem to be in nondeterministic random order - # workaround: sort columns by name. Can't repro it in pantheon, so could - # be a bigframes issue... 
- result = result.reindex(sorted(result.columns), axis=1) - expected = pandas.DataFrame( { "onehotencoded_species": [ diff --git a/tests/system/small/ml/test_preprocessing.py b/tests/system/small/ml/test_preprocessing.py index 040111f38a..22c3c84959 100644 --- a/tests/system/small/ml/test_preprocessing.py +++ b/tests/system/small/ml/test_preprocessing.py @@ -48,15 +48,10 @@ def test_standard_scaler_normalizes(penguins_df_default_index, new_penguins_df): result = scaler.transform(new_penguins_df).to_pandas() - # TODO: bug? feature columns seem to be in nondeterministic random order - # workaround: sort columns by name. Can't repro it in pantheon, so could - # be a bigframes issue... - result = result.reindex(sorted(result.columns), axis=1) - expected = pd.DataFrame( { - "standard_scaled_culmen_depth_mm": [0.836148, 0.024748, 0.48116], "standard_scaled_culmen_length_mm": [-0.81112, -0.994552, -1.104611], + "standard_scaled_culmen_depth_mm": [0.836148, 0.024748, 0.48116], "standard_scaled_flipper_length_mm": [-0.350044, -1.418336, -0.9198], }, dtype="Float64", @@ -77,15 +72,10 @@ def test_standard_scaler_normalizeds_fit_transform(new_penguins_df): for column in result.columns: assert math.isclose(result[column].mean(), 0.0, abs_tol=1e-3) - # TODO: bug? feature columns seem to be in nondeterministic random order - # workaround: sort columns by name. Can't repro it in pantheon, so could - # be a bigframes issue... - result = result.reindex(sorted(result.columns), axis=1) - expected = pd.DataFrame( { - "standard_scaled_culmen_depth_mm": [1.17072, -1.272416, 0.101848], "standard_scaled_culmen_length_mm": [1.313249, -0.20198, -1.111118], + "standard_scaled_culmen_depth_mm": [1.17072, -1.272416, 0.101848], "standard_scaled_flipper_length_mm": [1.251089, -1.196588, -0.054338], }, dtype="Float64", @@ -108,11 +98,6 @@ def test_standard_scaler_series_normalizes(penguins_df_default_index, new_pengui result = scaler.transform(new_penguins_df).to_pandas() - # TODO: bug? feature columns seem to be in nondeterministic random order - # workaround: sort columns by name. Can't repro it in pantheon, so could - # be a bigframes issue... - result = result.reindex(sorted(result.columns), axis=1) - expected = pd.DataFrame( { "standard_scaled_culmen_length_mm": [ @@ -162,15 +147,10 @@ def test_max_abs_scaler_normalizes(penguins_df_default_index, new_penguins_df): result = scaler.transform(new_penguins_df).to_pandas() - # TODO: bug? feature columns seem to be in nondeterministic random order - # workaround: sort columns by name. Can't repro it in pantheon, so could - # be a bigframes issue... - result = result.reindex(sorted(result.columns), axis=1) - expected = pd.DataFrame( { - "max_abs_scaled_culmen_depth_mm": [0.874419, 0.8, 0.84186], "max_abs_scaled_culmen_length_mm": [0.662752, 0.645973, 0.635906], + "max_abs_scaled_culmen_depth_mm": [0.874419, 0.8, 0.84186], "max_abs_scaled_flipper_length_mm": [0.848485, 0.78355, 0.813853], }, dtype="Float64", @@ -186,15 +166,10 @@ def test_max_abs_scaler_normalizeds_fit_transform(new_penguins_df): new_penguins_df[["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"]] ).to_pandas() - # TODO: bug? feature columns seem to be in nondeterministic random order - # workaround: sort columns by name. Can't repro it in pantheon, so could - # be a bigframes issue... 
- result = result.reindex(sorted(result.columns), axis=1) - expected = pd.DataFrame( { - "max_abs_scaled_culmen_depth_mm": [1.0, 0.914894, 0.962766], "max_abs_scaled_culmen_length_mm": [1.0, 0.974684, 0.959494], + "max_abs_scaled_culmen_depth_mm": [1.0, 0.914894, 0.962766], "max_abs_scaled_flipper_length_mm": [1.0, 0.923469, 0.959184], }, dtype="Float64", @@ -216,11 +191,6 @@ def test_max_abs_scaler_series_normalizes(penguins_df_default_index, new_penguin result = scaler.transform(new_penguins_df).to_pandas() - # TODO: bug? feature columns seem to be in nondeterministic random order - # workaround: sort columns by name. Can't repro it in pantheon, so could - # be a bigframes issue... - result = result.reindex(sorted(result.columns), axis=1) - expected = pd.DataFrame( { "max_abs_scaled_culmen_length_mm": [0.662752, 0.645973, 0.635906], @@ -251,15 +221,10 @@ def test_min_max_scaler_normalized_fit_transform(new_penguins_df): new_penguins_df[["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"]] ).to_pandas() - # TODO: bug? feature columns seem to be in nondeterministic random order - # workaround: sort columns by name. Can't repro it in pantheon, so could - # be a bigframes issue... - result = result.reindex(sorted(result.columns), axis=1) - expected = pd.DataFrame( { - "min_max_scaled_culmen_depth_mm": [1.0, 0.0, 0.5625], "min_max_scaled_culmen_length_mm": [1.0, 0.375, 0.0], + "min_max_scaled_culmen_depth_mm": [1.0, 0.0, 0.5625], "min_max_scaled_flipper_length_mm": [1.0, 0.0, 0.466667], }, dtype="Float64", @@ -282,11 +247,6 @@ def test_min_max_scaler_series_normalizes(penguins_df_default_index, new_penguin result = scaler.transform(new_penguins_df).to_pandas() - # TODO: bug? feature columns seem to be in nondeterministic random order - # workaround: sort columns by name. Can't repro it in pantheon, so could - # be a bigframes issue... - result = result.reindex(sorted(result.columns), axis=1) - expected = pd.DataFrame( { "min_max_scaled_culmen_length_mm": [0.269091, 0.232727, 0.210909], @@ -320,15 +280,10 @@ def test_min_max_scaler_normalizes(penguins_df_default_index, new_penguins_df): result = scaler.transform(new_penguins_df).to_pandas() - # TODO: bug? feature columns seem to be in nondeterministic random order - # workaround: sort columns by name. Can't repro it in pantheon, so could - # be a bigframes issue... - result = result.reindex(sorted(result.columns), axis=1) - expected = pd.DataFrame( { - "min_max_scaled_culmen_depth_mm": [0.678571, 0.4880952, 0.595238], "min_max_scaled_culmen_length_mm": [0.269091, 0.232727, 0.210909], + "min_max_scaled_culmen_depth_mm": [0.678571, 0.4880952, 0.595238], "min_max_scaled_flipper_length_mm": [0.40678, 0.152542, 0.271186], }, dtype="Float64", @@ -357,15 +312,10 @@ def test_k_bins_discretizer_normalized_fit_transform_default_params(new_penguins new_penguins_df[["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"]] ).to_pandas() - # TODO: bug? feature columns seem to be in nondeterministic random order - # workaround: sort columns by name. Can't repro it in pantheon, so could - # be a bigframes issue... 
- result = result.reindex(sorted(result.columns), axis=1) - expected = pd.DataFrame( { - "kbinsdiscretizer_culmen_depth_mm": ["bin_5", "bin_2", "bin_4"], "kbinsdiscretizer_culmen_length_mm": ["bin_5", "bin_3", "bin_2"], + "kbinsdiscretizer_culmen_depth_mm": ["bin_5", "bin_2", "bin_4"], "kbinsdiscretizer_flipper_length_mm": ["bin_5", "bin_2", "bin_4"], }, dtype="string[pyarrow]", @@ -386,11 +336,6 @@ def test_k_bins_discretizer_series_normalizes( ).to_pandas() result = discretizer.transform(new_penguins_df).to_pandas() - # TODO: bug? feature columns seem to be in nondeterministic random order - # workaround: sort columns by name. Can't repro it in pantheon, so could - # be a bigframes issue... - result = result.reindex(sorted(result.columns), axis=1) - expected = pd.DataFrame( { "kbinsdiscretizer_culmen_length_mm": ["bin_3", "bin_3", "bin_3"], @@ -419,15 +364,10 @@ def test_k_bins_discretizer_normalizes(penguins_df_default_index, new_penguins_d result = discretizer.transform(new_penguins_df).to_pandas() - # TODO: bug? feature columns seem to be in nondeterministic random order - # workaround: sort columns by name. Can't repro it in pantheon, so could - # be a bigframes issue... - result = result.reindex(sorted(result.columns), axis=1) - expected = pd.DataFrame( { - "kbinsdiscretizer_culmen_depth_mm": ["bin_5", "bin_4", "bin_4"], "kbinsdiscretizer_culmen_length_mm": ["bin_3", "bin_3", "bin_3"], + "kbinsdiscretizer_culmen_depth_mm": ["bin_5", "bin_4", "bin_4"], "kbinsdiscretizer_flipper_length_mm": ["bin_4", "bin_2", "bin_3"], }, dtype="string[pyarrow]", @@ -456,15 +396,10 @@ def test_k_bins_discretizer_normalizes_different_params( result = discretizer.transform(new_penguins_df).to_pandas() - # TODO: bug? feature columns seem to be in nondeterministic random order - # workaround: sort columns by name. Can't repro it in pantheon, so could - # be a bigframes issue... - result = result.reindex(sorted(result.columns), axis=1) - expected = pd.DataFrame( { - "kbinsdiscretizer_culmen_depth_mm": ["bin_6", "bin_4", "bin_5"], "kbinsdiscretizer_culmen_length_mm": ["bin_3", "bin_3", "bin_3"], + "kbinsdiscretizer_culmen_depth_mm": ["bin_6", "bin_4", "bin_5"], "kbinsdiscretizer_flipper_length_mm": ["bin_4", "bin_2", "bin_3"], }, dtype="string[pyarrow]", @@ -495,22 +430,17 @@ def test_one_hot_encoder_default_params(new_penguins_df): result = encoder.transform(new_penguins_df).to_pandas() - # TODO: bug? feature columns seem to be in nondeterministic random order - # workaround: sort columns by name. Can't repro it in pantheon, so could - # be a bigframes issue... - result = result.reindex(sorted(result.columns), axis=1) - expected = pd.DataFrame( { - "onehotencoded_sex": [ - [{"index": 2, "value": 1.0}], + "onehotencoded_species": [ [{"index": 1, "value": 1.0}], [{"index": 1, "value": 1.0}], + [{"index": 2, "value": 1.0}], ], - "onehotencoded_species": [ + "onehotencoded_sex": [ + [{"index": 2, "value": 1.0}], [{"index": 1, "value": 1.0}], [{"index": 1, "value": 1.0}], - [{"index": 2, "value": 1.0}], ], }, dtype=ONE_HOT_ENCODED_DTYPE, @@ -525,22 +455,17 @@ def test_one_hot_encoder_default_params_fit_transform(new_penguins_df): result = encoder.fit_transform(new_penguins_df[["species", "sex"]]).to_pandas() - # TODO: bug? feature columns seem to be in nondeterministic random order - # workaround: sort columns by name. Can't repro it in pantheon, so could - # be a bigframes issue... 
- result = result.reindex(sorted(result.columns), axis=1) - expected = pd.DataFrame( { - "onehotencoded_sex": [ - [{"index": 2, "value": 1.0}], + "onehotencoded_species": [ [{"index": 1, "value": 1.0}], [{"index": 1, "value": 1.0}], + [{"index": 2, "value": 1.0}], ], - "onehotencoded_species": [ + "onehotencoded_sex": [ + [{"index": 2, "value": 1.0}], [{"index": 1, "value": 1.0}], [{"index": 1, "value": 1.0}], - [{"index": 2, "value": 1.0}], ], }, dtype=ONE_HOT_ENCODED_DTYPE, @@ -556,11 +481,6 @@ def test_one_hot_encoder_series_default_params(new_penguins_df): result = encoder.transform(new_penguins_df).to_pandas() - # TODO: bug? feature columns seem to be in nondeterministic random order - # workaround: sort columns by name. Can't repro it in pantheon, so could - # be a bigframes issue... - result = result.reindex(sorted(result.columns), axis=1) - expected = pd.DataFrame( { "onehotencoded_species": [ @@ -582,19 +502,14 @@ def test_one_hot_encoder_params(new_penguins_df): result = encoder.transform(new_penguins_df).to_pandas() - # TODO: bug? feature columns seem to be in nondeterministic random order - # workaround: sort columns by name. Can't repro it in pantheon, so could - # be a bigframes issue... - result = result.reindex(sorted(result.columns), axis=1) - expected = pd.DataFrame( { - "onehotencoded_sex": [ + "onehotencoded_species": [ [{"index": 0, "value": 1.0}], [{"index": 0, "value": 1.0}], [{"index": 0, "value": 1.0}], ], - "onehotencoded_species": [ + "onehotencoded_sex": [ [{"index": 0, "value": 1.0}], [{"index": 0, "value": 1.0}], [{"index": 0, "value": 1.0}], @@ -613,23 +528,18 @@ def test_one_hot_encoder_different_data(penguins_df_default_index, new_penguins_ result = encoder.transform(new_penguins_df).to_pandas() - # TODO: bug? feature columns seem to be in nondeterministic random order - # workaround: sort columns by name. Can't repro it in pantheon, so could - # be a bigframes issue... - result = result.reindex(sorted(result.columns), axis=1) - expected = pd.DataFrame( { - "onehotencoded_sex": [ - [{"index": 3, "value": 1.0}], - [{"index": 2, "value": 1.0}], - [{"index": 2, "value": 1.0}], - ], "onehotencoded_species": [ [{"index": 1, "value": 1.0}], [{"index": 1, "value": 1.0}], [{"index": 2, "value": 1.0}], ], + "onehotencoded_sex": [ + [{"index": 3, "value": 1.0}], + [{"index": 2, "value": 1.0}], + [{"index": 2, "value": 1.0}], + ], }, dtype=ONE_HOT_ENCODED_DTYPE, index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), @@ -657,11 +567,6 @@ def test_label_encoder_default_params(new_penguins_df): result = encoder.transform(new_penguins_df["species"]).to_pandas() - # TODO: bug? feature columns seem to be in nondeterministic random order - # workaround: sort columns by name. Can't repro it in pantheon, so could - # be a bigframes issue... - result = result.reindex(sorted(result.columns), axis=1) - expected = pd.DataFrame( { "labelencoded_species": [ @@ -682,11 +587,6 @@ def test_label_encoder_default_params_fit_transform(new_penguins_df): result = encoder.fit_transform(new_penguins_df[["species"]]).to_pandas() - # TODO: bug? feature columns seem to be in nondeterministic random order - # workaround: sort columns by name. Can't repro it in pantheon, so could - # be a bigframes issue... - result = result.reindex(sorted(result.columns), axis=1) - expected = pd.DataFrame( { "labelencoded_species": [ @@ -708,11 +608,6 @@ def test_label_encoder_series_default_params(new_penguins_df): result = encoder.transform(new_penguins_df).to_pandas() - # TODO: bug? 
feature columns seem to be in nondeterministic random order - # workaround: sort columns by name. Can't repro it in pantheon, so could - # be a bigframes issue... - result = result.reindex(sorted(result.columns), axis=1) - expected = pd.DataFrame( { "labelencoded_species": [ @@ -734,11 +629,6 @@ def test_label_encoder_params(new_penguins_df): result = encoder.transform(new_penguins_df).to_pandas() - # TODO: bug? feature columns seem to be in nondeterministic random order - # workaround: sort columns by name. Can't repro it in pantheon, so could - # be a bigframes issue... - result = result.reindex(sorted(result.columns), axis=1) - expected = pd.DataFrame( { "labelencoded_species": [ @@ -760,11 +650,6 @@ def test_label_encoder_different_data(penguins_df_default_index, new_penguins_df result = encoder.transform(new_penguins_df).to_pandas() - # TODO: bug? feature columns seem to be in nondeterministic random order - # workaround: sort columns by name. Can't repro it in pantheon, so could - # be a bigframes issue... - result = result.reindex(sorted(result.columns), axis=1) - expected = pd.DataFrame( { "labelencoded_species": [ From 8d3918761a17649180aa806d7b01aa103f69b4fe Mon Sep 17 00:00:00 2001 From: Chelsea Lin <124939984+chelsea-lin@users.noreply.github.com> Date: Wed, 3 Apr 2024 12:10:17 -0700 Subject: [PATCH 42/53] fix: plot.scatter s parameter cannot accept float-like column (#563) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes internal b/330574847 🦕 --- bigframes/operations/_matplotlib/core.py | 19 +++++++++----- .../system/small/operations/test_plotting.py | 26 +++++++++++++++++++ 2 files changed, 38 insertions(+), 7 deletions(-) diff --git a/bigframes/operations/_matplotlib/core.py b/bigframes/operations/_matplotlib/core.py index 2c1c2bc4ac..04534e20a9 100644 --- a/bigframes/operations/_matplotlib/core.py +++ b/bigframes/operations/_matplotlib/core.py @@ -14,7 +14,6 @@ import abc import typing -import uuid import pandas as pd @@ -115,6 +114,18 @@ def _compute_plot_data(self): if self._is_column_name(c, sample) and sample[c].dtype == dtypes.STRING_DTYPE: sample[c] = sample[c].astype("object") + # To avoid Matplotlib's automatic conversion of `Float64` or `Int64` columns + # to `object` types (which breaks float-like behavior), this code proactively + # converts the column to a compatible format. 
+ s = self.kwargs.get("s", None) + if pd.core.dtypes.common.is_integer(s): + s = self.data.columns[s] + if self._is_column_name(s, sample): + if sample[s].dtype == dtypes.INT_DTYPE: + sample[s] = sample[s].astype("int64") + elif sample[s].dtype == dtypes.FLOAT_DTYPE: + sample[s] = sample[s].astype("float64") + return sample def _is_sequence_arg(self, arg): @@ -130,9 +141,3 @@ def _is_column_name(self, arg, data): and pd.core.dtypes.common.is_hashable(arg) and arg in data.columns ) - - def _generate_new_column_name(self, data): - col_name = None - while col_name is None or col_name in data.columns: - col_name = f"plot_temp_{str(uuid.uuid4())[:8]}" - return col_name diff --git a/tests/system/small/operations/test_plotting.py b/tests/system/small/operations/test_plotting.py index 824125adf2..6542ce6de3 100644 --- a/tests/system/small/operations/test_plotting.py +++ b/tests/system/small/operations/test_plotting.py @@ -240,6 +240,32 @@ def test_scatter_args_c(c): ) +@pytest.mark.parametrize( + ("s"), + [ + pytest.param([10, 34, 50], id="int"), + pytest.param([1.0, 3.4, 5.0], id="float"), + pytest.param( + [True, True, False], id="bool", marks=pytest.mark.xfail(raises=ValueError) + ), + ], +) +def test_scatter_args_s(s): + data = { + "a": [1, 2, 3], + "b": [1, 2, 3], + } + data["s"] = s + df = bpd.DataFrame(data) + pd_df = pd.DataFrame(data) + + ax = df.plot.scatter(x="a", y="b", s="s") + pd_ax = pd_df.plot.scatter(x="a", y="b", s="s") + tm.assert_numpy_array_equal( + ax.collections[0].get_sizes(), pd_ax.collections[0].get_sizes() + ) + + @pytest.mark.parametrize( ("arg_name"), [ From f8821fe7ecf8a80532a6aab98044fad601ff939c Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Wed, 3 Apr 2024 18:20:18 -0700 Subject: [PATCH 43/53] feat: read_pandas accepts pandas Series and Index objects (#573) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes # 🦕 --- .pre-commit-config.yaml | 2 +- bigframes/pandas/__init__.py | 15 ++++++++++ bigframes/series.py | 2 +- bigframes/session/__init__.py | 44 +++++++++++++++++++++++++++--- tests/system/small/test_session.py | 15 ++++++++++ 5 files changed, 72 insertions(+), 6 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 517176da89..af05f4423c 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -38,4 +38,4 @@ repos: rev: v1.1.1 hooks: - id: mypy - additional_dependencies: [types-requests, types-tabulate] + additional_dependencies: [types-requests, types-tabulate, pandas-stubs] diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py index fc008f36e5..4b0ac4310c 100644 --- a/bigframes/pandas/__init__.py +++ b/bigframes/pandas/__init__.py @@ -577,7 +577,22 @@ def read_gbq_table( read_gbq_table.__doc__ = inspect.getdoc(bigframes.session.Session.read_gbq_table) +@typing.overload def read_pandas(pandas_dataframe: pandas.DataFrame) -> bigframes.dataframe.DataFrame: + ... 
+ + +@typing.overload +def read_pandas(pandas_dataframe: pandas.Series) -> bigframes.series.Series: + ... + + +@typing.overload +def read_pandas(pandas_dataframe: pandas.Index) -> bigframes.core.indexes.Index: + ... + + +def read_pandas(pandas_dataframe: Union[pandas.DataFrame, pandas.Series, pandas.Index]): return global_session.with_default_session( bigframes.session.Session.read_pandas, pandas_dataframe, diff --git a/bigframes/series.py b/bigframes/series.py index e7b358c2fe..7e2b0408b7 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -1514,7 +1514,7 @@ def map( map_df = map_df.rename(columns={arg.name: self.name}) elif isinstance(arg, Mapping): map_df = bigframes.dataframe.DataFrame( - {"keys": list(arg.keys()), self.name: list(arg.values())}, + {"keys": list(arg.keys()), self.name: list(arg.values())}, # type: ignore session=self._get_block().expr.session, ) map_df = map_df.set_index("keys") diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index ac266da3bd..c7605e89d7 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -95,7 +95,9 @@ # Avoid circular imports. if typing.TYPE_CHECKING: + import bigframes.core.indexes import bigframes.dataframe as dataframe + import bigframes.series _BIGFRAMES_DEFAULT_CONNECTION_ID = "bigframes-default-connection" @@ -963,7 +965,23 @@ def read_gbq_model(self, model_name: str): model = self.bqclient.get_model(model_ref) return bigframes.ml.loader.from_bq(self, model) + @typing.overload + def read_pandas( + self, pandas_dataframe: pandas.Index + ) -> bigframes.core.indexes.Index: + ... + + @typing.overload + def read_pandas(self, pandas_dataframe: pandas.Series) -> bigframes.series.Series: + ... + + @typing.overload def read_pandas(self, pandas_dataframe: pandas.DataFrame) -> dataframe.DataFrame: + ... + + def read_pandas( + self, pandas_dataframe: Union[pandas.DataFrame, pandas.Series, pandas.Index] + ): """Loads DataFrame from a pandas DataFrame. The pandas DataFrame will be persisted as a temporary BigQuery table, which can be @@ -986,13 +1004,31 @@ def read_pandas(self, pandas_dataframe: pandas.DataFrame) -> dataframe.DataFrame [2 rows x 2 columns] Args: - pandas_dataframe (pandas.DataFrame): - a pandas DataFrame object to be loaded. + pandas_dataframe (pandas.DataFrame, pandas.Series, or pandas.Index): + a pandas DataFrame/Series/Index object to be loaded. Returns: - bigframes.dataframe.DataFrame: The BigQuery DataFrame. 
+ An equivalent bigframes.pandas.(DataFrame/Series/Index) object """ - return self._read_pandas(pandas_dataframe, "read_pandas") + import bigframes.series as series + + # Try to handle non-dataframe pandas objects as well + if isinstance(pandas_dataframe, pandas.Series): + bf_df = self._read_pandas(pandas.DataFrame(pandas_dataframe), "read_pandas") + bf_series = typing.cast(series.Series, bf_df[bf_df.columns[0]]) + # wrapping into df can set name to 0 so reset to original object name + bf_series.name = pandas_dataframe.name + return bf_series + if isinstance(pandas_dataframe, pandas.Index): + return self._read_pandas( + pandas.DataFrame(index=pandas_dataframe), "read_pandas" + ).index + if isinstance(pandas_dataframe, pandas.DataFrame): + return self._read_pandas(pandas_dataframe, "read_pandas") + else: + raise ValueError( + f"read_pandas() expects a pandas.DataFrame, but got a {type(pandas_dataframe)}" + ) def _read_pandas( self, pandas_dataframe: pandas.DataFrame, api_name: str diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py index 28a3f03860..eb6a0a8dd9 100644 --- a/tests/system/small/test_session.py +++ b/tests/system/small/test_session.py @@ -421,6 +421,21 @@ def test_read_pandas(session, scalars_dfs): pd.testing.assert_frame_equal(result, expected) +def test_read_pandas_series(session): + idx = pd.Index([2, 7, 1, 2, 8], dtype=pd.Int64Dtype()) + pd_series = pd.Series([3, 1, 4, 1, 5], dtype=pd.Int64Dtype(), index=idx) + bf_series = session.read_pandas(pd_series) + + pd.testing.assert_series_equal(bf_series.to_pandas(), pd_series) + + +def test_read_pandas_index(session): + pd_idx = pd.Index([2, 7, 1, 2, 8], dtype=pd.Int64Dtype()) + bf_idx = session.read_pandas(pd_idx) + + pd.testing.assert_index_equal(bf_idx.to_pandas(), pd_idx) + + def test_read_pandas_inline_respects_location(): options = bigframes.BigQueryOptions(location="europe-west1") session = bigframes.Session(options) From 6d8f3afe28d39eb15b969f50d37c58a2c3ff1967 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Thu, 4 Apr 2024 09:12:39 -0700 Subject: [PATCH 44/53] =?UTF-8?q?feat:=20Allow=20DataFrame=20binary=20ops?= =?UTF-8?q?=20to=20align=20on=20either=20axis=20and=20with=20loc=E2=80=A6?= =?UTF-8?q?=20(#544)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- bigframes/core/convert.py | 49 +++++++++++++++ bigframes/dataframe.py | 93 +++++++++++++++++++++------- bigframes/typing.py | 0 tests/system/small/test_dataframe.py | 32 ++++++++++ 4 files changed, 151 insertions(+), 23 deletions(-) create mode 100644 bigframes/core/convert.py create mode 100644 bigframes/typing.py diff --git a/bigframes/core/convert.py b/bigframes/core/convert.py new file mode 100644 index 0000000000..98f854ad72 --- /dev/null +++ b/bigframes/core/convert.py @@ -0,0 +1,49 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
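With these overloads, read_pandas round-trips pandas Series and Index objects as well as DataFrames, preserving the name in the Series case. A usage sketch against the top-level API (assumes a configured BigQuery session):

    import pandas as pd
    import bigframes.pandas as bpd

    pd_series = pd.Series([3, 1, 4, 1, 5], name="digits")
    bf_series = bpd.read_pandas(pd_series)   # bigframes.pandas.Series, name preserved
    print(bf_series.to_pandas())

    pd_index = pd.Index([2, 7, 1, 2, 8])
    bf_index = bpd.read_pandas(pd_index)     # bigframes.core.indexes.Index
    print(bf_index.to_pandas())
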
+from __future__ import annotations + +import pandas as pd + +import bigframes.core.indexes as index +import bigframes.series as series + + +def to_bf_series(obj, default_index: index.Index) -> series.Series: + if isinstance(obj, series.Series): + return obj + if isinstance(obj, pd.Series): + return series.Series(obj) + if isinstance(obj, index.Index): + return series.Series(obj, default_index) + if isinstance(obj, pd.Index): + return series.Series(obj, default_index) + if pd.api.types.is_list_like(obj): + return series.Series(obj, default_index) + else: + raise TypeError(f"Cannot interpret {obj} as series.") + + +def to_pd_series(obj, default_index: pd.Index) -> pd.Series: + if isinstance(obj, series.Series): + return obj.to_pandas() + if isinstance(obj, pd.Series): + return obj + if isinstance(obj, index.Index): + return pd.Series(obj.to_pandas(), default_index) + if isinstance(obj, pd.Index): + return pd.Series(obj, default_index) + if pd.api.types.is_list_like(obj): + return pd.Series(obj, default_index) + else: + raise TypeError(f"Cannot interpret {obj} as series.") diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 066b082490..97a100474a 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -50,6 +50,7 @@ from bigframes.core import log_adapter import bigframes.core.block_transforms as block_ops import bigframes.core.blocks as blocks +import bigframes.core.convert import bigframes.core.expression as ex import bigframes.core.groupby as groupby import bigframes.core.guid @@ -663,22 +664,20 @@ def _apply_binop( how: str = "outer", reverse: bool = False, ): - if isinstance(other, (float, int)): + if isinstance(other, (float, int, bool)): return self._apply_scalar_binop(other, op, reverse=reverse) - elif isinstance(other, indexes.Index): - return self._apply_series_binop( - other.to_series(index=self.index), - op, - axis=axis, - how=how, - reverse=reverse, - ) - elif isinstance(other, bigframes.series.Series): - return self._apply_series_binop( - other, op, axis=axis, how=how, reverse=reverse - ) elif isinstance(other, DataFrame): return self._apply_dataframe_binop(other, op, how=how, reverse=reverse) + elif isinstance(other, pandas.DataFrame): + return self._apply_dataframe_binop( + DataFrame(other), op, how=how, reverse=reverse + ) + elif utils.get_axis_number(axis) == 0: + bf_series = bigframes.core.convert.to_bf_series(other, self.index) + return self._apply_series_binop_axis_0(bf_series, op, how, reverse) + elif utils.get_axis_number(axis) == 1: + pd_series = bigframes.core.convert.to_pd_series(other, self.columns) + return self._apply_series_binop_axis_1(pd_series, op, how, reverse) raise NotImplementedError( f"binary operation is not implemented on the second operand of type {type(other).__name__}." f"{constants.FEEDBACK_LINK}" @@ -700,22 +699,13 @@ def _apply_scalar_binop( block = block.drop_columns([column_id]) return DataFrame(block) - def _apply_series_binop( + def _apply_series_binop_axis_0( self, other: bigframes.series.Series, op: ops.BinaryOp, - axis: str | int = "columns", how: str = "outer", reverse: bool = False, ) -> DataFrame: - if axis not in ("columns", "index", 0, 1): - raise ValueError(f"Invalid input: axis {axis}.") - - if axis in ("columns", 1): - raise NotImplementedError( - f"Row Series operations haven't been supported. 
{constants.FEEDBACK_LINK}" - ) - block, (get_column_left, get_column_right) = self._block.join( other._block, how=how ) @@ -738,6 +728,63 @@ def _apply_series_binop( block = block.with_index_labels(self.index.names) return DataFrame(block) + def _apply_series_binop_axis_1( + self, + other: pandas.Series, + op: ops.BinaryOp, + how: str = "outer", + reverse: bool = False, + ) -> DataFrame: + # Somewhat different alignment than df-df so separate codepath for now. + if self.columns.equals(other.index): + columns, lcol_indexer, rcol_indexer = self.columns, None, None + else: + if not (self.columns.is_unique and other.index.is_unique): + raise ValueError("Cannot align non-unique indices") + columns, lcol_indexer, rcol_indexer = self.columns.join( + other.index, how=how, return_indexers=True + ) + + binop_result_ids = [] + + column_indices = zip( + lcol_indexer if (lcol_indexer is not None) else range(len(columns)), + rcol_indexer if (rcol_indexer is not None) else range(len(columns)), + ) + + block = self._block + for left_index, right_index in column_indices: + if left_index >= 0 and right_index >= 0: # -1 indices indicate missing + self_col_id = self._block.value_columns[left_index] + other_scalar = other.iloc[right_index] + expr = ( + op.as_expr(ex.const(other_scalar), self_col_id) + if reverse + else op.as_expr(self_col_id, ex.const(other_scalar)) + ) + elif left_index >= 0: + self_col_id = self._block.value_columns[left_index] + expr = ( + op.as_expr(ex.const(None), self_col_id) + if reverse + else op.as_expr(self_col_id, ex.const(None)) + ) + elif right_index >= 0: + other_scalar = other.iloc[right_index] + expr = ( + op.as_expr(ex.const(other_scalar), ex.const(None)) + if reverse + else op.as_expr(ex.const(None), ex.const(other_scalar)) + ) + else: + # Should not be possible + raise ValueError("No right or left index.") + block, result_col_id = block.project_expr(expr) + binop_result_ids.append(result_col_id) + + block = block.select_columns(binop_result_ids) + return DataFrame(block.with_column_labels(columns)) + def _apply_dataframe_binop( self, other: DataFrame, diff --git a/bigframes/typing.py b/bigframes/typing.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index cf907b02d6..ae80a088b5 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -27,6 +27,7 @@ import bigframes import bigframes._config.display_options as display_options +import bigframes.core.indexes as bf_indexes import bigframes.dataframe as dataframe import bigframes.series as series from tests.system.utils import ( @@ -2074,6 +2075,37 @@ def test_series_binop_axis_index( assert_pandas_df_equal(bf_result, pd_result) +@skip_legacy_pandas +@pytest.mark.parametrize( + ("input"), + [ + ((1000, 2000, 3000)), + (pd.Index([1000, 2000, 3000])), + (bf_indexes.Index([1000, 2000, 3000])), + (pd.Series((1000, 2000), index=["int64_too", "float64_col"])), + (series.Series((1000, 2000), index=["int64_too", "float64_col"])), + ], + ids=[ + "tuple", + "pd_index", + "bf_index", + "pd_series", + "bf_series", + ], +) +def test_listlike_binop_axis_1(scalars_dfs, input): + scalars_df, scalars_pandas_df = scalars_dfs + + df_columns = ["int64_col", "float64_col", "int64_too"] + + bf_result = scalars_df[df_columns].add(input, axis=1).to_pandas() + if hasattr(input, "to_pandas"): + input = input.to_pandas() + pd_result = scalars_pandas_df[df_columns].add(input, axis=1) + + assert_pandas_df_equal(bf_result, 
pd_result, check_dtype=False) + + @pytest.mark.parametrize( ("left_labels", "right_labels"), [ From 659a161a53e93f66334cd04d1c3dc1f1f47ecc16 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Thu, 4 Apr 2024 09:36:41 -0700 Subject: [PATCH 45/53] fix: Use bytes limit on frame inlining rather than element count (#576) --- bigframes/session/__init__.py | 8 ++++---- tests/system/small/test_dataframe.py | 7 +++++++ tests/system/small/test_progress_bar.py | 4 ++-- 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index c7605e89d7..671a3d65e7 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -116,9 +116,9 @@ "UTF-32LE", } -# BigQuery has 1 MB query size limit, 5000 items shouldn't take more than 10% of this depending on data type. -# TODO(tbergeron): Convert to bytes-based limit -MAX_INLINE_DF_SIZE = 5000 +# BigQuery has 1 MB query size limit. Don't want to take up more than a few % of that inlining a table. +# Also must assume that text encoding as literals is much less efficient than in-memory representation. +MAX_INLINE_DF_BYTES = 5000 logger = logging.getLogger(__name__) @@ -1051,7 +1051,7 @@ def _read_pandas_inline( ) -> Optional[dataframe.DataFrame]: import bigframes.dataframe as dataframe - if pandas_dataframe.size > MAX_INLINE_DF_SIZE: + if pandas_dataframe.memory_usage(deep=True).sum() > MAX_INLINE_DF_BYTES: return None try: diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index ae80a088b5..f28de37d68 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -66,6 +66,13 @@ def test_df_construct_pandas_default(scalars_dfs): pandas.testing.assert_frame_equal(bf_result, pd_result) +def test_df_construct_large_strings(): + data = [["hello", "w" + "o" * 50000 + "rld"]] + bf_result = dataframe.DataFrame(data).to_pandas() + pd_result = pd.DataFrame(data, dtype=pd.StringDtype(storage="pyarrow")) + pandas.testing.assert_frame_equal(bf_result, pd_result, check_index_type=False) + + def test_df_construct_pandas_load_job(scalars_dfs): # This should trigger the inlined codepath columns = [ diff --git a/tests/system/small/test_progress_bar.py b/tests/system/small/test_progress_bar.py index 1c04b580fc..ea139b9802 100644 --- a/tests/system/small/test_progress_bar.py +++ b/tests/system/small/test_progress_bar.py @@ -20,7 +20,7 @@ import bigframes as bf import bigframes.formatting_helpers as formatting_helpers -from bigframes.session import MAX_INLINE_DF_SIZE +from bigframes.session import MAX_INLINE_DF_BYTES job_load_message_regex = r"\w+ job [\w-]+ is \w+\." @@ -70,7 +70,7 @@ def test_progress_bar_load_jobs( ): # repeat the DF to be big enough to trigger the load job. 
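# [Editor's note -- illustrative sketch, not part of the patch.] The session
# change above replaces the row-count cap with an estimate of in-memory size,
# so inlining is decided by bytes rather than element count. A minimal sketch
# of the check, reusing the names from the diff:
#
#     import pandas as pd
#
#     MAX_INLINE_DF_BYTES = 5000
#
#     def _can_inline(df: pd.DataFrame) -> bool:
#         # deep=True also counts string/object payloads, so a single very
#         # long string is enough to push a tiny frame over the limit.
#         return df.memory_usage(deep=True).sum() <= MAX_INLINE_DF_BYTES
#
# In the test hunk below, the same constant is reused only as a row-count
# target that is comfortably large enough to exceed the byte limit and force
# a load job.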
df = penguins_pandas_df_default_index - while len(df) < MAX_INLINE_DF_SIZE: + while len(df) < MAX_INLINE_DF_BYTES: df = pd.DataFrame(np.repeat(df.values, 2, axis=0)) bf.options.display.progress_bar = "terminal" From 9e32f570b42c8ddae0c9b281b25beff91f0c922c Mon Sep 17 00:00:00 2001 From: Chelsea Lin <124939984+chelsea-lin@users.noreply.github.com> Date: Thu, 4 Apr 2024 09:42:36 -0700 Subject: [PATCH 46/53] feat: (Series|DataFrame).explode (#556) * feat: (Series|DataFrame).explode * fixing schema and adding tests * fixing multi-index tests * add docs and fix tests --- bigframes/core/__init__.py | 9 ++ bigframes/core/blocks.py | 30 +++++ bigframes/core/compile/compiled.py | 118 ++++++++++++++++++ bigframes/core/compile/compiler.py | 5 + bigframes/core/nodes.py | 27 ++++ bigframes/dataframe.py | 30 +++++ bigframes/dtypes.py | 15 ++- bigframes/operations/__init__.py | 2 +- bigframes/series.py | 7 ++ tests/system/small/test_dataframe.py | 71 +++++++++++ tests/system/small/test_multiindex.py | 17 +++ tests/system/small/test_series.py | 101 +++++++++++++++ .../ibis/backends/bigquery/registry.py | 6 + .../ibis/expr/operations/__init__.py | 1 + .../ibis/expr/operations/generic.py | 9 ++ .../bigframes_vendored/pandas/core/frame.py | 51 ++++++++ .../bigframes_vendored/pandas/core/series.py | 30 ++++- 17 files changed, 523 insertions(+), 6 deletions(-) create mode 100644 third_party/bigframes_vendored/ibis/expr/operations/generic.py diff --git a/bigframes/core/__init__.py b/bigframes/core/__init__.py index ce9c22132b..9358dab1b1 100644 --- a/bigframes/core/__init__.py +++ b/bigframes/core/__init__.py @@ -401,6 +401,15 @@ def join( return ArrayValue(bigframes.core.rewrite.maybe_rewrite_join(join_node)) return ArrayValue(join_node) + def explode(self, column_ids: typing.Sequence[str]) -> ArrayValue: + assert len(column_ids) > 0 + for column_id in column_ids: + assert bigframes.dtypes.is_array_like(self.get_column_type(column_id)) + + return ArrayValue( + nodes.ExplodeNode(child=self.node, column_ids=tuple(column_ids)) + ) + def _uniform_sampling(self, fraction: float) -> ArrayValue: """Sampling the table on given fraction. diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 04a98ac9a4..0b6e50cfa3 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -1162,6 +1162,36 @@ def calculate_pairwise_metric(self, op=agg_ops.CorrOp()): index_labels=self.column_labels.names, ) + def explode( + self, + column_ids: typing.Sequence[str], + ignore_index: Optional[bool], + ) -> Block: + column_ids = [ + column_id + for column_id in column_ids + if bigframes.dtypes.is_array_like(self.expr.get_column_type(column_id)) + ] + if len(column_ids) == 0: + expr = self.expr + else: + expr = self.expr.explode(column_ids) + + if ignore_index: + return Block( + expr.drop_columns(self.index_columns), + column_labels=self.column_labels, + # Initiates default index creation using the block constructor. 
+ index_columns=[], + ) + else: + return Block( + expr, + column_labels=self.column_labels, + index_columns=self.index_columns, + index_labels=self.column_labels.names, + ) + def _standard_stats(self, column_id) -> typing.Sequence[agg_ops.UnaryAggregateOp]: """ Gets a standard set of stats to preemptively fetch for a column if diff --git a/bigframes/core/compile/compiled.py b/bigframes/core/compile/compiled.py index af2d69275a..f1c5d62010 100644 --- a/bigframes/core/compile/compiled.py +++ b/bigframes/core/compile/compiled.py @@ -20,6 +20,7 @@ import typing from typing import Collection, Iterable, Literal, Optional, Sequence +import bigframes_vendored.ibis.expr.operations as vendored_ibis_ops import ibis import ibis.backends.bigquery as ibis_bigquery import ibis.common.deferred # type: ignore @@ -502,6 +503,51 @@ def _uniform_sampling(self, fraction: float) -> UnorderedIR: columns=columns, ) + def explode(self, column_ids: typing.Sequence[str]) -> UnorderedIR: + table = self._to_ibis_expr() + + # The offset array ensures null represents empty arrays after unnesting. + offset_array_id = bigframes.core.guid.generate_guid("offset_array_") + offset_array = ( + vendored_ibis_ops.GenerateArray( + ibis.greatest( + 0, + ibis.least( + *[table[column_id].length() - 1 for column_id in column_ids] + ), + ) + ) + .to_expr() + .name(offset_array_id), + ) + table_w_offset_array = table.select( + offset_array, + *self._column_names, + ) + + unnest_offset_id = bigframes.core.guid.generate_guid("unnest_offset_") + unnest_offset = ( + table_w_offset_array[offset_array_id].unnest().name(unnest_offset_id) + ) + table_w_offset = table_w_offset_array.select( + unnest_offset, + *self._column_names, + ) + + unnested_columns = [ + table_w_offset[column_id][table_w_offset[unnest_offset_id]].name(column_id) + if column_id in column_ids + else table_w_offset[column_id] + for column_id in self._column_names + ] + table_w_unnest = table_w_offset.select(*unnested_columns) + + columns = [table_w_unnest[column_name] for column_name in self._column_names] + return UnorderedIR( + table_w_unnest, + columns=columns, + ) + ## Helpers def _set_or_replace_by_id( self, id: str, new_value: ibis_types.Value @@ -719,6 +765,78 @@ def _uniform_sampling(self, fraction: float) -> OrderedIR: ordering=self._ordering, ) + def explode(self, column_ids: typing.Sequence[str]) -> OrderedIR: + table = self._to_ibis_expr(ordering_mode="unordered", expose_hidden_cols=True) + + offset_array_id = bigframes.core.guid.generate_guid("offset_array_") + offset_array = ( + vendored_ibis_ops.GenerateArray( + ibis.greatest( + 0, + ibis.least( + *[table[column_id].length() - 1 for column_id in column_ids] + ), + ) + ) + .to_expr() + .name(offset_array_id), + ) + table_w_offset_array = table.select( + offset_array, + *self._column_names, + *self._hidden_ordering_column_names, + ) + + unnest_offset_id = bigframes.core.guid.generate_guid("unnest_offset_") + unnest_offset = ( + table_w_offset_array[offset_array_id].unnest().name(unnest_offset_id) + ) + table_w_offset = table_w_offset_array.select( + unnest_offset, + *self._column_names, + *self._hidden_ordering_column_names, + ) + + unnested_columns = [ + table_w_offset[column_id][table_w_offset[unnest_offset_id]].name(column_id) + if column_id in column_ids + else table_w_offset[column_id] + for column_id in self._column_names + ] + + table_w_unnest = table_w_offset.select( + table_w_offset[unnest_offset_id], + *unnested_columns, + *self._hidden_ordering_column_names, + ) + + columns = 
[table_w_unnest[column_name] for column_name in self._column_names] + hidden_ordering_columns = [ + *[ + table_w_unnest[column_name] + for column_name in self._hidden_ordering_column_names + ], + table_w_unnest[unnest_offset_id], + ] + ordering = ExpressionOrdering( + ordering_value_columns=tuple( + [ + *self._ordering.ordering_value_columns, + ascending_over(unnest_offset_id), + ] + ), + total_ordering_columns=frozenset( + [*self._ordering.total_ordering_columns, unnest_offset_id] + ), + ) + + return OrderedIR( + table_w_unnest, + columns=columns, + hidden_ordering_columns=hidden_ordering_columns, + ordering=ordering, + ) + def promote_offsets(self, col_id: str) -> OrderedIR: """ Convenience function to promote copy of column offsets to a value column. Can be used to reset index. diff --git a/bigframes/core/compile/compiler.py b/bigframes/core/compile/compiler.py index 6f10d85f31..638e3eacdd 100644 --- a/bigframes/core/compile/compiler.py +++ b/bigframes/core/compile/compiler.py @@ -191,6 +191,11 @@ def compile_unpivot(node: nodes.UnpivotNode, ordered: bool = True): ) +@_compile_node.register +def compiler_explode(node: nodes.ExplodeNode, ordered: bool = True): + return compile_node(node.child, ordered).explode(node.column_ids) + + @_compile_node.register def compiler_random_sample(node: nodes.RandomSampleNode, ordered: bool = True): return compile_node(node.child, ordered)._uniform_sampling(node.fraction) diff --git a/bigframes/core/nodes.py b/bigframes/core/nodes.py index 8f646ac4bb..d740605a56 100644 --- a/bigframes/core/nodes.py +++ b/bigframes/core/nodes.py @@ -484,3 +484,30 @@ def row_preserving(self) -> bool: def __hash__(self): return self._node_hash + + +@dataclass(frozen=True) +class ExplodeNode(UnaryNode): + column_ids: typing.Tuple[str, ...] 
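# [Editor's note -- illustrative sketch, not part of the patch.] The compiler
# methods above lower an ExplodeNode to roughly the following BigQuery shape:
# a per-row offset array built with GENERATE_ARRAY is UNNESTed, and each list
# column is indexed by that offset, which keeps several exploded columns
# aligned and lets an empty array surface as a single NULL row (offset 0 then
# points past the end of the array). Array indexing is assumed to compile to
# SAFE_OFFSET; table and column names below are placeholders, and with several
# exploded columns the patch takes the LEAST of their lengths instead of a
# single ARRAY_LENGTH. The ordered variant also appends the offset column to
# the ordering so element order within each row is preserved.
#
#     SELECT
#       t.scalar_col,
#       t.list_col[SAFE_OFFSET(off)] AS list_col
#     FROM source_table AS t,
#       UNNEST(GENERATE_ARRAY(0, GREATEST(0, ARRAY_LENGTH(t.list_col) - 1))) AS off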
+ + @property + def row_preserving(self) -> bool: + return False + + def __hash__(self): + return self._node_hash + + @functools.cached_property + def schema(self) -> schemata.ArraySchema: + items = tuple( + schemata.SchemaItem( + name, + bigframes.dtypes.arrow_dtype_to_bigframes_dtype( + self.child.schema.get_type(name).pyarrow_dtype.value_type + ), + ) + if name in self.column_ids + else schemata.SchemaItem(name, self.child.schema.get_type(name)) + for name in self.child.schema.names + ) + return schemata.ArraySchema(items) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 97a100474a..0bb88beb2b 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -2579,6 +2579,36 @@ def sample( )[0] ) + def explode( + self, + column: typing.Union[blocks.Label, typing.Sequence[blocks.Label]], + *, + ignore_index: Optional[bool] = False, + ) -> DataFrame: + if not utils.is_list_like(column): + column_labels = typing.cast(typing.Sequence[blocks.Label], (column,)) + else: + column_labels = typing.cast(typing.Sequence[blocks.Label], tuple(column)) + + if not column_labels: + raise ValueError("column must be nonempty") + if len(column_labels) > len(set(column_labels)): + raise ValueError("column must be unique") + + column_ids = [self._resolve_label_exact(label) for label in column_labels] + missing = [ + column_labels[i] for i in range(len(column_ids)) if column_ids[i] is None + ] + if len(missing) > 0: + raise KeyError(f"None of {missing} are in the columns") + + return DataFrame( + self._block.explode( + column_ids=typing.cast(typing.Sequence[str], tuple(column_ids)), + ignore_index=ignore_index, + ) + ) + def _split( self, ns: Iterable[int] = (), diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index 3d8c06d188..c5bf5db2fe 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -129,16 +129,19 @@ def is_string_like(type: ExpressionType) -> bool: def is_array_like(type: ExpressionType) -> bool: - if isinstance(type, pd.ArrowDtype) and isinstance(type.pyarrow_dtype, pa.ListType): - return True - else: - return type in (STRING_DTYPE, BYTES_DTYPE) + return isinstance(type, pd.ArrowDtype) and isinstance( + type.pyarrow_dtype, pa.ListType + ) def is_numeric(type: ExpressionType) -> bool: return type in NUMERIC_BIGFRAMES_TYPES_PERMISSIVE +def is_iterable(type: ExpressionType) -> bool: + return type in (STRING_DTYPE, BYTES_DTYPE) or is_array_like(type) + + def is_comparable(type: ExpressionType) -> bool: return (type is not None) and (type not in UNORDERED_DTYPES) @@ -348,6 +351,10 @@ def arrow_dtype_to_ibis_dtype(arrow_dtype: pa.DataType) -> ibis_dtypes.DataType: ) +def arrow_dtype_to_bigframes_dtype(arrow_dtype: pa.DataType) -> Dtype: + return ibis_dtype_to_bigframes_dtype(arrow_dtype_to_ibis_dtype(arrow_dtype)) + + def bigframes_dtype_to_ibis_dtype( bigframes_dtype: Union[DtypeString, Dtype, np.dtype[Any]] ) -> ibis_dtypes.DataType: diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py index 0dcc643238..d631ba8508 100644 --- a/bigframes/operations/__init__.py +++ b/bigframes/operations/__init__.py @@ -212,7 +212,7 @@ def create_binary_op( len_op = create_unary_op( name="len", type_signature=op_typing.FixedOutputType( - dtypes.is_array_like, dtypes.INT_DTYPE, description="array-like" + dtypes.is_iterable, dtypes.INT_DTYPE, description="iterable" ), ) reverse_op = create_unary_op(name="reverse", type_signature=op_typing.STRING_TRANSFORM) diff --git a/bigframes/series.py b/bigframes/series.py index 7e2b0408b7..42264c35b6 100644 --- 
a/bigframes/series.py +++ b/bigframes/series.py @@ -1547,6 +1547,13 @@ def sample( )[0] ) + def explode(self, *, ignore_index: Optional[bool] = False) -> Series: + return Series( + self._block.explode( + column_ids=[self._value_column], ignore_index=ignore_index + ) + ) + def __array_ufunc__( self, ufunc: numpy.ufunc, method: str, *inputs, **kwargs ) -> Series: diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index f28de37d68..8bcdfe168b 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -20,6 +20,7 @@ from typing import Tuple import geopandas as gpd # type: ignore +import numpy as np import pandas as pd import pandas.testing import pyarrow as pa # type: ignore @@ -29,6 +30,7 @@ import bigframes._config.display_options as display_options import bigframes.core.indexes as bf_indexes import bigframes.dataframe as dataframe +import bigframes.pandas as bpd import bigframes.series as series from tests.system.utils import ( assert_pandas_df_equal, @@ -4167,3 +4169,72 @@ def test_to_gbq_and_create_dataset(session, scalars_df_index, dataset_id_not_cre loaded_scalars_df_index = session.read_gbq(result_table) assert not loaded_scalars_df_index.empty + + +@pytest.mark.parametrize( + ("col_names", "ignore_index"), + [ + pytest.param(["A"], False, id="one_array_false"), + pytest.param(["A"], True, id="one_array_true"), + pytest.param(["B"], False, id="one_float_false"), + pytest.param(["B"], True, id="one_float_true"), + pytest.param(["A", "C"], False, id="two_arrays_false"), + pytest.param(["A", "C"], True, id="two_arrays_true"), + ], +) +def test_dataframe_explode(col_names, ignore_index): + data = { + "A": [[0, 1, 2], [], [3, 4]], + "B": 3, + "C": [["a", "b", "c"], np.nan, ["d", "e"]], + } + df = bpd.DataFrame(data) + pd_df = df.to_pandas() + pd.testing.assert_frame_equal( + df.explode(col_names, ignore_index=ignore_index).to_pandas(), + pd_df.explode(col_names, ignore_index=ignore_index), + check_index_type=False, + check_dtype=False, + ) + + +@pytest.mark.parametrize( + ("ignore_index", "ordered"), + [ + pytest.param(True, True, id="include_index_ordered"), + pytest.param(True, False, id="include_index_unordered"), + pytest.param(False, True, id="ignore_index_ordered"), + ], +) +def test_dataframe_explode_reserve_order(ignore_index, ordered): + data = { + "a": [np.random.randint(0, 10, 10) for _ in range(10)], + "b": [np.random.randint(0, 10, 10) for _ in range(10)], + } + df = bpd.DataFrame(data) + pd_df = pd.DataFrame(data) + + res = df.explode(["a", "b"], ignore_index=ignore_index).to_pandas(ordered=ordered) + pd_res = pd_df.explode(["a", "b"], ignore_index=ignore_index).astype( + pd.Int64Dtype() + ) + pd.testing.assert_frame_equal( + res if ordered else res.sort_index(), + pd_res, + check_index_type=False, + ) + + +@pytest.mark.parametrize( + ("col_names"), + [ + pytest.param([], id="empty", marks=pytest.mark.xfail(raises=ValueError)), + pytest.param( + ["A", "A"], id="duplicate", marks=pytest.mark.xfail(raises=ValueError) + ), + pytest.param("unknown", id="unknown", marks=pytest.mark.xfail(raises=KeyError)), + ], +) +def test_dataframe_explode_xfail(col_names): + df = bpd.DataFrame({"A": [[0, 1, 2], [], [3, 4]]}) + df.explode(col_names) diff --git a/tests/system/small/test_multiindex.py b/tests/system/small/test_multiindex.py index d585d4f73e..6aca7628cf 100644 --- a/tests/system/small/test_multiindex.py +++ b/tests/system/small/test_multiindex.py @@ -12,6 +12,7 @@ # See the License for the specific language 
governing permissions and # limitations under the License. +import numpy as np import pandas import pytest @@ -1168,3 +1169,19 @@ def test_column_multi_index_dot_not_supported(): NotImplementedError, match="Multi-level column input is not supported" ): bf1 @ bf2 + + +def test_explode_w_multi_index(): + data = [[[1, 1], np.nan, [3, 3]], [[2], [5], []]] + multi_level_columns = pandas.MultiIndex.from_arrays( + [["col0", "col0", "col1"], ["col00", "col01", "col11"]] + ) + + df = bpd.DataFrame(data, columns=multi_level_columns) + pd_df = df.to_pandas() + pandas.testing.assert_frame_equal( + df["col0"].explode("col00").to_pandas(), + pd_df["col0"].explode("col00"), + check_dtype=False, + check_index_type=False, + ) diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index 794ab6b7a2..e15dbc6a3f 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -3416,3 +3416,104 @@ def foo(x: int, y: int, df): ) assert_series_equal(bf_result, pd_result) + + +@pytest.mark.parametrize( + ("data"), + [ + pytest.param([1, 2, 3], id="int"), + pytest.param([[1, 2, 3], [], numpy.nan, [3, 4]], id="int_array"), + pytest.param( + [["A", "AA", "AAA"], ["BB", "B"], numpy.nan, [], ["C"]], id="string_array" + ), + pytest.param( + [ + {"A": {"x": 1.0}, "B": "b"}, + {"A": {"y": 2.0}, "B": "bb"}, + {"A": {"z": 4.0}}, + {}, + numpy.nan, + ], + id="struct_array", + ), + ], +) +def test_series_explode(data): + data = [[1, 2, 3], [], numpy.nan, [3, 4]] + s = bigframes.pandas.Series(data) + pd_s = pd.Series(data) + pd.testing.assert_series_equal( + s.explode().to_pandas(), + pd_s.explode(), + check_index_type=False, + check_dtype=False, + ) + + +@pytest.mark.parametrize( + ("index", "ignore_index"), + [ + pytest.param(None, True, id="default_index"), + pytest.param(None, False, id="ignore_default_index"), + pytest.param([5, 1, 3, 2], True, id="unordered_index"), + pytest.param([5, 1, 3, 2], False, id="ignore_unordered_index"), + pytest.param(["z", "x", "a", "b"], True, id="str_index"), + pytest.param(["z", "x", "a", "b"], False, id="ignore_str_index"), + ], +) +def test_series_explode_w_index(index, ignore_index): + data = [[], [200.0, 23.12], [4.5, -9.0], [1.0]] + s = bigframes.pandas.Series(data, index=index) + pd_s = pd.Series(data, index=index) + pd.testing.assert_series_equal( + s.explode(ignore_index=ignore_index).to_pandas(), + pd_s.explode(ignore_index=ignore_index).astype(pd.Float64Dtype()), + check_index_type=False, + ) + + +@pytest.mark.parametrize( + ("ignore_index", "ordered"), + [ + pytest.param(True, True, id="include_index_ordered"), + pytest.param(True, False, id="include_index_unordered"), + pytest.param(False, True, id="ignore_index_ordered"), + ], +) +def test_series_explode_reserve_order(ignore_index, ordered): + data = [numpy.random.randint(0, 10, 10) for _ in range(10)] + s = bigframes.pandas.Series(data) + pd_s = pd.Series(data) + + res = s.explode(ignore_index=ignore_index).to_pandas(ordered=ordered) + pd_res = pd_s.explode(ignore_index=ignore_index).astype(pd.Int64Dtype()) + pd.testing.assert_series_equal( + res if ordered else res.sort_index(), + pd_res, + check_index_type=False, + ) + + +def test_series_explode_w_aggregate(): + data = [[1, 2, 3], [], numpy.nan, [3, 4]] + s = bigframes.pandas.Series(data) + pd_s = pd.Series(data) + assert s.explode().sum() == pd_s.explode().sum() + + +@pytest.mark.parametrize( + ("data"), + [ + pytest.param(numpy.nan, id="null"), + pytest.param([numpy.nan], id="null_array"), + pytest.param([[]], 
id="empty_array"), + pytest.param([numpy.nan, []], id="null_and_empty_array"), + ], +) +def test_series_explode_null(data): + s = bigframes.pandas.Series(data) + pd.testing.assert_series_equal( + s.explode().to_pandas(), + s.to_pandas().explode(), + check_dtype=False, + ) diff --git a/third_party/bigframes_vendored/ibis/backends/bigquery/registry.py b/third_party/bigframes_vendored/ibis/backends/bigquery/registry.py index 3f89feaa34..88826b31ce 100644 --- a/third_party/bigframes_vendored/ibis/backends/bigquery/registry.py +++ b/third_party/bigframes_vendored/ibis/backends/bigquery/registry.py @@ -26,11 +26,17 @@ def _to_json_string(translator, op: vendored_ibis_ops.ToJsonString): return f"TO_JSON_STRING({arg})" +def _generate_array(translator, op: vendored_ibis_ops.GenerateArray): + arg = translator.translate(op.arg) + return f"GENERATE_ARRAY(0, {arg})" + + patched_ops = { vendored_ibis_ops.ApproximateMultiQuantile: _approx_quantiles, # type:ignore vendored_ibis_ops.FirstNonNullValue: _first_non_null_value, # type:ignore vendored_ibis_ops.LastNonNullValue: _last_non_null_value, # type:ignore vendored_ibis_ops.ToJsonString: _to_json_string, # type:ignore + vendored_ibis_ops.GenerateArray: _generate_array, # type:ignore } OPERATION_REGISTRY.update(patched_ops) diff --git a/third_party/bigframes_vendored/ibis/expr/operations/__init__.py b/third_party/bigframes_vendored/ibis/expr/operations/__init__.py index 2c2efe528d..3d5a5a7fa0 100644 --- a/third_party/bigframes_vendored/ibis/expr/operations/__init__.py +++ b/third_party/bigframes_vendored/ibis/expr/operations/__init__.py @@ -2,5 +2,6 @@ from __future__ import annotations from bigframes_vendored.ibis.expr.operations.analytic import * # noqa: F401 F403 +from bigframes_vendored.ibis.expr.operations.generic import * # noqa: F401 F403 from bigframes_vendored.ibis.expr.operations.json import * # noqa: F401 F403 from bigframes_vendored.ibis.expr.operations.reductions import * # noqa: F401 F403 diff --git a/third_party/bigframes_vendored/ibis/expr/operations/generic.py b/third_party/bigframes_vendored/ibis/expr/operations/generic.py new file mode 100644 index 0000000000..82d0a13371 --- /dev/null +++ b/third_party/bigframes_vendored/ibis/expr/operations/generic.py @@ -0,0 +1,9 @@ +# Contains code from https://github.com/ibis-project/ibis/blob/master/ibis/expr/operations/generic.py +from __future__ import annotations + +import ibis.expr.datatypes as dt +from ibis.expr.operations.core import Unary + + +class GenerateArray(Unary): + dtype = dt.Array(dt.int64) diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index 3ae5b0db2a..e5aa47ad3e 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -2804,6 +2804,57 @@ def combine_first(self, other) -> DataFrame: """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def explode( + self, column: Union[str, Sequence[str]], *, ignore_index: Optional[bool] = False + ) -> DataFrame: + """ + Transform each element of an array to a row, replicating index values. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({'A': [[0, 1, 2], [], [], [3, 4]], + ... 'B': 1, + ... 
'C': [['a', 'b', 'c'], np.nan, [], ['d', 'e']]}) + >>> df.explode('A') + A B C + 0 0 1 ['a' 'b' 'c'] + 0 1 1 ['a' 'b' 'c'] + 0 2 1 ['a' 'b' 'c'] + 1 1 [] + 2 1 [] + 3 3 1 ['d' 'e'] + 3 4 1 ['d' 'e'] + + [7 rows x 3 columns] + >>> df.explode(list('AC')) + A B C + 0 0 1 a + 0 1 1 b + 0 2 1 c + 1 1 + 2 1 + 3 3 1 d + 3 4 1 e + + [7 rows x 3 columns] + + Args: + column (str, Sequence[str]): + Column(s) to explode. For multiple columns, specify a non-empty list + with each element be str or tuple, and all specified columns their + list-like data on same row of the frame must have matching length. + ignore_index (bool, default False): + If True, the resulting index will be labeled 0, 1, …, n - 1. + + Returns: + bigframes.series.DataFrame: Exploded lists to rows of the subset columns; + index will be duplicated for these rows. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def corr(self, method, min_periods, numeric_only) -> DataFrame: """ Compute pairwise correlation of columns, excluding NA/null values. diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index 89b39cf8a0..785755a562 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -3,7 +3,7 @@ """ from __future__ import annotations -from typing import Hashable, IO, Literal, Mapping, Sequence, TYPE_CHECKING +from typing import Hashable, IO, Literal, Mapping, Optional, Sequence, TYPE_CHECKING from bigframes_vendored.pandas.core.generic import NDFrame import numpy as np @@ -751,6 +751,34 @@ def round(self, decimals: int = 0) -> Series: """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def explode(self, *, ignore_index: Optional[bool] = False) -> Series: + """ + Transform each element of a list-like to a row. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series([[1, 2, 3], [], [3, 4]]) + >>> s.explode() + 0 1 + 0 2 + 0 3 + 1 + 2 3 + 2 4 + dtype: Int64 + + Args: + ignore_index (bool, default False): + If True, the resulting index will be labeled 0, 1, …, n - 1. + + Returns: + bigframes.series.Series: Exploded lists to rows; index will be duplicated for these rows. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def corr(self, other, method="pearson", min_periods=None) -> float: """ Compute the correlation with the other Series. 
Non-number values are ignored in the From 098d4443807ec1229f69760bda1dd7bf6af488aa Mon Sep 17 00:00:00 2001 From: Lily Zhang <32233490+junyazhang@users.noreply.github.com> Date: Thu, 4 Apr 2024 10:05:32 -0700 Subject: [PATCH 47/53] chore: remove outdated notebook (#567) --- .../vertex_sdk/sdk2_bigframes_pytorch.ipynb | 723 ----------------- .../vertex_sdk/sdk2_bigframes_sklearn.ipynb | 727 ------------------ .../sdk2_bigframes_tensorflow.ipynb | 646 ---------------- 3 files changed, 2096 deletions(-) delete mode 100644 notebooks/vertex_sdk/sdk2_bigframes_pytorch.ipynb delete mode 100644 notebooks/vertex_sdk/sdk2_bigframes_sklearn.ipynb delete mode 100644 notebooks/vertex_sdk/sdk2_bigframes_tensorflow.ipynb diff --git a/notebooks/vertex_sdk/sdk2_bigframes_pytorch.ipynb b/notebooks/vertex_sdk/sdk2_bigframes_pytorch.ipynb deleted file mode 100644 index 598d958f0c..0000000000 --- a/notebooks/vertex_sdk/sdk2_bigframes_pytorch.ipynb +++ /dev/null @@ -1,723 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ur8xi4C7S06n" - }, - "outputs": [], - "source": [ - "# Copyright 2023 Google LLC\n", - "#\n", - "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", - "# you may not use this file except in compliance with the License.\n", - "# You may obtain a copy of the License at\n", - "#\n", - "# https://www.apache.org/licenses/LICENSE-2.0\n", - "#\n", - "# Unless required by applicable law or agreed to in writing, software\n", - "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", - "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", - "# See the License for the specific language governing permissions and\n", - "# limitations under the License." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "JAPoU8Sm5E6e" - }, - "source": [ - "# Train a pytorch model with Vertex AI SDK 2.0 and Bigframes\n", - "\n", - "\n", - " \n", - " \n", - "
\n", - " \n", - " \"Colab Run in Colab\n", - " \n", - " \n", - " \n", - " \"GitHub\n", - " View on GitHub\n", - " \n", - " \n", - " \n", - " \"VertexOpen in Vertex AI Workbench\n", - " \n", - "
" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "tvgnzT1CKxrO" - }, - "source": [ - "## Overview\n", - "\n", - "This tutorial demonstrates how to train a pytorch model using Vertex AI local-to-remote training with Vertex AI SDK 2.0 and BigQuery Bigframes as the data source.\n", - "\n", - "Learn more about [bigframes](https://cloud.google.com/bigquery/docs/)." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "d975e698c9a4" - }, - "source": [ - "### Objective\n", - "\n", - "In this tutorial, you learn to use `Vertex AI SDK 2.0` with Bigframes as input data source.\n", - "\n", - "\n", - "This tutorial uses the following Google Cloud ML services:\n", - "\n", - "- `Vertex AI Training`\n", - "- `Vertex AI Remote Training`\n", - "\n", - "\n", - "The steps performed include:\n", - "\n", - "- Initialize a dataframe from a BigQuery table and split the dataset\n", - "- Perform transformations as a Vertex AI remote training.\n", - "- Train the model remotely and evaluate the model locally\n", - "\n", - "**Local-to-remote training**\n", - "\n", - "```\n", - "import vertexai\n", - "from my_module import MyModelClass\n", - "\n", - "vertexai.preview.init(remote=True, project=\"my-project\", location=\"my-location\", staging_bucket=\"gs://my-bucket\")\n", - "\n", - "# Wrap the model class with `vertex_ai.preview.remote`\n", - "MyModelClass = vertexai.preview.remote(MyModelClass)\n", - "\n", - "# Instantiate the class\n", - "model = MyModelClass(...)\n", - "\n", - "# Optional set remote config\n", - "model.fit.vertex.remote_config.display_name = \"MyModelClass-remote-training\"\n", - "model.fit.vertex.remote_config.staging_bucket = \"gs://my-bucket\"\n", - "\n", - "# This `fit` call will be executed remotely\n", - "model.fit(...)\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "08d289fa873f" - }, - "source": [ - "### Dataset\n", - "\n", - "This tutorial uses the IRIS dataset, which predicts the iris species." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "aed92deeb4a0" - }, - "source": [ - "### Costs\n", - "\n", - "This tutorial uses billable components of Google Cloud:\n", - "\n", - "* Vertex AI\n", - "* BigQuery\n", - "* Cloud Storage\n", - "\n", - "Learn about [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing),\n", - "[BigQuery pricing](https://cloud.google.com/bigquery/pricing),\n", - "and [Cloud Storage pricing](https://cloud.google.com/storage/pricing), \n", - "and use the [Pricing Calculator](https://cloud.google.com/products/calculator/)\n", - "to generate a cost estimate based on your projected usage." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "i7EUnXsZhAGF" - }, - "source": [ - "## Installation\n", - "\n", - "Install the following packages required to execute this notebook. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "2b4ef9b72d43" - }, - "outputs": [], - "source": [ - "# Install the packages\n", - "! pip3 install --upgrade --quiet google-cloud-aiplatform[preview]\n", - "! pip3 install --upgrade --quiet bigframes\n", - "! pip3 install --upgrade --quiet torch" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "58707a750154" - }, - "source": [ - "### Colab only: Uncomment the following cell to restart the kernel." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "f200f10a1da3" - }, - "outputs": [], - "source": [ - "# Automatically restart kernel after installs so that your environment can access the new packages\n", - "# import IPython\n", - "\n", - "# app = IPython.Application.instance()\n", - "# app.kernel.do_shutdown(True)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "BF1j6f9HApxa" - }, - "source": [ - "## Before you begin\n", - "\n", - "### Set up your Google Cloud project\n", - "\n", - "**The following steps are required, regardless of your notebook environment.**\n", - "\n", - "1. [Select or create a Google Cloud project](https://console.cloud.google.com/cloud-resource-manager). When you first create an account, you get a $300 free credit towards your compute/storage costs.\n", - "\n", - "2. [Make sure that billing is enabled for your project](https://cloud.google.com/billing/docs/how-to/modify-project).\n", - "\n", - "3. [Enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).\n", - "\n", - "4. If you are running this notebook locally, you need to install the [Cloud SDK](https://cloud.google.com/sdk)." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "WReHDGG5g0XY" - }, - "source": [ - "#### Set your project ID\n", - "\n", - "**If you don't know your project ID**, try the following:\n", - "* Run `gcloud config list`.\n", - "* Run `gcloud projects list`.\n", - "* See the support page: [Locate the project ID](https://support.google.com/googleapi/answer/7014113)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "oM1iC_MfAts1" - }, - "outputs": [], - "source": [ - "PROJECT_ID = \"[your-project-id]\" # @param {type:\"string\"}\n", - "\n", - "# Set the project id\n", - "! gcloud config set project {PROJECT_ID}" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "region" - }, - "source": [ - "#### Region\n", - "\n", - "You can also change the `REGION` variable used by Vertex AI. Learn more about [Vertex AI regions](https://cloud.google.com/vertex-ai/docs/general/locations)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "region" - }, - "outputs": [], - "source": [ - "REGION = \"us-central1\" # @param {type: \"string\"}" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "sBCra4QMA2wR" - }, - "source": [ - "### Authenticate your Google Cloud account\n", - "\n", - "Depending on your Jupyter environment, you may have to manually authenticate. Follow the relevant instructions below." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "74ccc9e52986" - }, - "source": [ - "**1. Vertex AI Workbench**\n", - "* Do nothing as you are already authenticated." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "de775a3773ba" - }, - "source": [ - "**2. Local JupyterLab instance, uncomment and run:**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "254614fa0c46" - }, - "outputs": [], - "source": [ - "# ! gcloud auth login" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ef21552ccea8" - }, - "source": [ - "**3. 
Colab, uncomment and run:**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "603adbbf0532" - }, - "outputs": [], - "source": [ - "# from google.colab import auth\n", - "# auth.authenticate_user()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "f6b2ccc891ed" - }, - "source": [ - "**4. Service account or other**\n", - "* See how to grant Cloud Storage permissions to your service account at https://cloud.google.com/storage/docs/gsutil/commands/iam#ch-examples." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "zgPO1eR3CYjk" - }, - "source": [ - "### Create a Cloud Storage bucket\n", - "\n", - "Create a storage bucket to store intermediate artifacts such as datasets." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "MzGDU7TWdts_" - }, - "outputs": [], - "source": [ - "BUCKET_URI = f\"gs://your-bucket-name-{PROJECT_ID}-unique\" # @param {type:\"string\"}" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "-EcIXiGsCePi" - }, - "source": [ - "**Only if your bucket doesn't already exist**: Run the following cell to create your Cloud Storage bucket." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "NIq7R4HZCfIc" - }, - "outputs": [], - "source": [ - "! gsutil mb -l {REGION} -p {PROJECT_ID} {BUCKET_URI}" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "960505627ddf" - }, - "source": [ - "### Import libraries and define constants" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "PyQmSRbKA8r-" - }, - "outputs": [], - "source": [ - "import bigframes.pandas as bf\n", - "import torch\n", - "import vertexai\n", - "from vertexai.preview import VertexModel\n", - "\n", - "bf.options.bigquery.location = \"us\" # Dataset is in 'us' not 'us-central1'\n", - "bf.options.bigquery.project = PROJECT_ID\n", - "\n", - "from bigframes.ml.model_selection import \\\n", - " train_test_split as bf_train_test_split" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "init_aip:mbsdk,all" - }, - "source": [ - "## Initialize Vertex AI SDK for Python\n", - "\n", - "Initialize the Vertex AI SDK for Python for your project and corresponding bucket." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "init_aip:mbsdk,all" - }, - "outputs": [], - "source": [ - "vertexai.init(\n", - " project=PROJECT_ID,\n", - " location=REGION,\n", - " staging_bucket=BUCKET_URI,\n", - ")\n", - "\n", - "REMOTE_JOB_NAME = \"sdk2-bigframes-pytorch\"\n", - "REMOTE_JOB_BUCKET = f\"{BUCKET_URI}/{REMOTE_JOB_NAME}\"" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "105334524e96" - }, - "source": [ - "## Prepare the dataset\n", - "\n", - "Now load the Iris dataset and split the data into train and test sets." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "b44cdc4e03f1" - }, - "outputs": [], - "source": [ - "df = bf.read_gbq(\"bigquery-public-data.ml_datasets.iris\")\n", - "\n", - "species_categories = {\n", - " \"versicolor\": 0,\n", - " \"virginica\": 1,\n", - " \"setosa\": 2,\n", - "}\n", - "df[\"species\"] = df[\"species\"].map(species_categories)\n", - "\n", - "# Assign an index column name\n", - "index_col = \"index\"\n", - "df.index.name = index_col" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "9cb8616b1997" - }, - "outputs": [], - "source": [ - "feature_columns = df[[\"sepal_length\", \"sepal_width\", \"petal_length\", \"petal_width\"]]\n", - "label_columns = df[[\"species\"]]\n", - "train_X, test_X, train_y, test_y = bf_train_test_split(\n", - " feature_columns, label_columns, test_size=0.2\n", - ")\n", - "\n", - "print(\"X_train size: \", train_X.size)\n", - "print(\"X_test size: \", test_X.size)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "23fe7b734b08" - }, - "outputs": [], - "source": [ - "# Switch to remote mode for training\n", - "vertexai.preview.init(remote=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "5904a0f1bb03" - }, - "source": [ - "## PyTorch remote training with CPU (Custom PyTorch model)\n", - "\n", - "First, train a PyTorch model as a remote training job:\n", - "\n", - "- Reinitialize Vertex AI for remote training.\n", - "- Set TorchLogisticRegression for the remote training job.\n", - "- Invoke TorchLogisticRegression locally which will launch the remote training job." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "2a1b85195a17" - }, - "outputs": [], - "source": [ - "# define the custom model\n", - "class TorchLogisticRegression(VertexModel, torch.nn.Module):\n", - " def __init__(self, input_size: int, output_size: int):\n", - " torch.nn.Module.__init__(self)\n", - " VertexModel.__init__(self)\n", - " self.linear = torch.nn.Linear(input_size, output_size)\n", - " self.softmax = torch.nn.Softmax(dim=1)\n", - "\n", - " def forward(self, x):\n", - " return self.softmax(self.linear(x))\n", - "\n", - " @vertexai.preview.developer.mark.train()\n", - " def train(self, X, y, num_epochs, lr):\n", - " X = X.to(torch.float32)\n", - " y = torch.flatten(y) # necessary to get 1D tensor\n", - " dataloader = torch.utils.data.DataLoader(\n", - " torch.utils.data.TensorDataset(X, y),\n", - " batch_size=10,\n", - " shuffle=True,\n", - " generator=torch.Generator(device=X.device),\n", - " )\n", - "\n", - " criterion = torch.nn.CrossEntropyLoss()\n", - " optimizer = torch.optim.SGD(self.parameters(), lr=lr)\n", - "\n", - " for t in range(num_epochs):\n", - " for batch, (X, y) in enumerate(dataloader):\n", - " optimizer.zero_grad()\n", - " pred = self(X)\n", - " loss = criterion(pred, y)\n", - " loss.backward()\n", - " optimizer.step()\n", - "\n", - " @vertexai.preview.developer.mark.predict()\n", - " def predict(self, X):\n", - " X = torch.tensor(X).to(torch.float32)\n", - " with torch.no_grad():\n", - " pred = torch.argmax(self(X), dim=1)\n", - " return pred" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "4e35593f520a" - }, - "outputs": [], - "source": [ - "# Switch to remote mode for training\n", - "vertexai.preview.init(remote=True)\n", - "\n", - "# Instantiate model\n", - "model = TorchLogisticRegression(4, 3)\n", - "\n", - "# Set training config\n", - 
"model.train.vertex.remote_config.custom_commands = [\n", - " \"pip install torchdata\",\n", - " \"pip install torcharrow\",\n", - "]\n", - "model.train.vertex.remote_config.display_name = REMOTE_JOB_NAME + \"-torch-model\"\n", - "model.train.vertex.remote_config.staging_bucket = REMOTE_JOB_BUCKET\n", - "\n", - "# Train model on Vertex\n", - "model.train(train_X, train_y, num_epochs=200, lr=0.05)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "edf4d0708f02" - }, - "source": [ - "## Remote prediction\n", - "\n", - "Obtain predictions from the trained model." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "42dfbff0ca15" - }, - "outputs": [], - "source": [ - "vertexai.preview.init(remote=True)\n", - "\n", - "# Set remote config\n", - "model.predict.vertex.remote_config.custom_commands = [\n", - " \"pip install torchdata\",\n", - " \"pip install torcharrow\",\n", - "]\n", - "model.predict.vertex.remote_config.display_name = REMOTE_JOB_NAME + \"-torch-predict\"\n", - "model.predict.vertex.remote_config.staging_bucket = REMOTE_JOB_BUCKET\n", - "\n", - "predictions = model.predict(test_X)\n", - "\n", - "print(f\"Remote predictions: {predictions}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "4340ed8316cd" - }, - "source": [ - "## Local evaluation\n", - "\n", - "Evaluate model results locally." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "eb27a31cec6f" - }, - "outputs": [], - "source": [ - "# User must convert bigframes to torch tensor for local evaluation\n", - "train_X_tensor = torch.from_numpy(\n", - " train_X.to_pandas().reset_index().drop(columns=[\"index\"]).values.astype(float)\n", - ")\n", - "train_y_tensor = torch.from_numpy(\n", - " train_y.to_pandas().reset_index().drop(columns=[\"index\"]).values.astype(float)\n", - ")\n", - "\n", - "test_X_tensor = torch.from_numpy(\n", - " test_X.to_pandas().reset_index().drop(columns=[\"index\"]).values.astype(float)\n", - ")\n", - "test_y_tensor = torch.from_numpy(\n", - " test_y.to_pandas().reset_index().drop(columns=[\"index\"]).values.astype(float)\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "7db44ad81389" - }, - "outputs": [], - "source": [ - "from sklearn.metrics import accuracy_score\n", - "\n", - "# Switch to local mode for evaluation\n", - "vertexai.preview.init(remote=False)\n", - "\n", - "# Evaluate model's accuracy score\n", - "print(\n", - " f\"Train accuracy: {accuracy_score(train_y_tensor, model.predict(train_X_tensor))}\"\n", - ")\n", - "\n", - "print(f\"Test accuracy: {accuracy_score(test_y_tensor, model.predict(test_X_tensor))}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "TpV-iwP9qw9c" - }, - "source": [ - "## Cleaning up\n", - "\n", - "To clean up all Google Cloud resources used in this project, you can [delete the Google Cloud\n", - "project](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects) you used for the tutorial.\n", - "\n", - "Otherwise, you can delete the individual resources you created in this tutorial:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "sx_vKniMq9ZX" - }, - "outputs": [], - "source": [ - "import os\n", - "\n", - "# Delete Cloud Storage objects that were created\n", - "delete_bucket = False\n", - "if delete_bucket or os.getenv(\"IS_TESTING\"):\n", - " ! 
gsutil -m rm -r $BUCKET_URI" - ] - } - ], - "metadata": { - "colab": { - "collapsed_sections": [], - "name": "sdk2_bigframes_pytorch.ipynb", - "toc_visible": true - }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} diff --git a/notebooks/vertex_sdk/sdk2_bigframes_sklearn.ipynb b/notebooks/vertex_sdk/sdk2_bigframes_sklearn.ipynb deleted file mode 100644 index 021c070753..0000000000 --- a/notebooks/vertex_sdk/sdk2_bigframes_sklearn.ipynb +++ /dev/null @@ -1,727 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ur8xi4C7S06n" - }, - "outputs": [], - "source": [ - "# Copyright 2023 Google LLC\n", - "#\n", - "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", - "# you may not use this file except in compliance with the License.\n", - "# You may obtain a copy of the License at\n", - "#\n", - "# https://www.apache.org/licenses/LICENSE-2.0\n", - "#\n", - "# Unless required by applicable law or agreed to in writing, software\n", - "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", - "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", - "# See the License for the specific language governing permissions and\n", - "# limitations under the License." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "JAPoU8Sm5E6e" - }, - "source": [ - "# Train a scikit-learn model with Vertex AI SDK 2.0 and Bigframes\n", - "\n", - "\n", - " \n", - " \n", - "
\n", - " \n", - " \"Colab Run in Colab\n", - " \n", - " \n", - " \n", - " \"GitHub\n", - " View on GitHub\n", - " \n", - " \n", - " \n", - " \"VertexOpen in Vertex AI Workbench\n", - " \n", - "
" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "tvgnzT1CKxrO" - }, - "source": [ - "## Overview\n", - "\n", - "This tutorial demonstrates how to train a scikit-learn model using Vertex AI local-to-remote training with Vertex AI SDK 2.0 and BigQuery Bigframes as the data source.\n", - "\n", - "Learn more about [bigframes](https://cloud.google.com/bigquery/docs/)." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "d975e698c9a4" - }, - "source": [ - "### Objective\n", - "\n", - "In this tutorial, you learn to use `Vertex AI SDK 2.0` with Bigframes as input data source.\n", - "\n", - "\n", - "This tutorial uses the following Google Cloud ML services:\n", - "\n", - "- `Vertex AI Training`\n", - "- `Vertex AI Remote Training`\n", - "\n", - "\n", - "The steps performed include:\n", - "\n", - "- Initialize a dataframe from a BigQuery table and split the dataset\n", - "- Perform transformations as a Vertex AI remote training.\n", - "- Train the model remotely and evaluate the model locally\n", - "\n", - "**Local-to-remote training**\n", - "\n", - "```\n", - "import vertexai\n", - "from my_module import MyModelClass\n", - "\n", - "vertexai.preview.init(remote=True, project=\"my-project\", location=\"my-location\", staging_bucket=\"gs://my-bucket\")\n", - "\n", - "# Wrap the model class with `vertex_ai.preview.remote`\n", - "MyModelClass = vertexai.preview.remote(MyModelClass)\n", - "\n", - "# Instantiate the class\n", - "model = MyModelClass(...)\n", - "\n", - "# Optional set remote config\n", - "model.fit.vertex.remote_config.display_name = \"MyModelClass-remote-training\"\n", - "model.fit.vertex.remote_config.staging_bucket = \"gs://my-bucket\"\n", - "\n", - "# This `fit` call will be executed remotely\n", - "model.fit(...)\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "08d289fa873f" - }, - "source": [ - "### Dataset\n", - "\n", - "This tutorial uses the IRIS dataset, which predicts the iris species." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "aed92deeb4a0" - }, - "source": [ - "### Costs\n", - "\n", - "This tutorial uses billable components of Google Cloud:\n", - "\n", - "* Vertex AI\n", - "* BigQuery\n", - "* Cloud Storage\n", - "\n", - "Learn about [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing),\n", - "[BigQuery pricing](https://cloud.google.com/bigquery/pricing),\n", - "and [Cloud Storage pricing](https://cloud.google.com/storage/pricing), \n", - "and use the [Pricing Calculator](https://cloud.google.com/products/calculator/)\n", - "to generate a cost estimate based on your projected usage." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "i7EUnXsZhAGF" - }, - "source": [ - "## Installation\n", - "\n", - "Install the following packages required to execute this notebook. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "2b4ef9b72d43" - }, - "outputs": [], - "source": [ - "# Install the packages\n", - "! pip3 install --upgrade --quiet google-cloud-aiplatform[preview]\n", - "! pip3 install --upgrade --quiet bigframes" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "58707a750154" - }, - "source": [ - "### Colab only: Uncomment the following cell to restart the kernel." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "f200f10a1da3" - }, - "outputs": [], - "source": [ - "# Automatically restart kernel after installs so that your environment can access the new packages\n", - "# import IPython\n", - "\n", - "# app = IPython.Application.instance()\n", - "# app.kernel.do_shutdown(True)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "BF1j6f9HApxa" - }, - "source": [ - "## Before you begin\n", - "\n", - "### Set up your Google Cloud project\n", - "\n", - "**The following steps are required, regardless of your notebook environment.**\n", - "\n", - "1. [Select or create a Google Cloud project](https://console.cloud.google.com/cloud-resource-manager). When you first create an account, you get a $300 free credit towards your compute/storage costs.\n", - "\n", - "2. [Make sure that billing is enabled for your project](https://cloud.google.com/billing/docs/how-to/modify-project).\n", - "\n", - "3. [Enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).\n", - "\n", - "4. If you are running this notebook locally, you need to install the [Cloud SDK](https://cloud.google.com/sdk)." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "WReHDGG5g0XY" - }, - "source": [ - "#### Set your project ID\n", - "\n", - "**If you don't know your project ID**, try the following:\n", - "* Run `gcloud config list`.\n", - "* Run `gcloud projects list`.\n", - "* See the support page: [Locate the project ID](https://support.google.com/googleapi/answer/7014113)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "oM1iC_MfAts1" - }, - "outputs": [], - "source": [ - "PROJECT_ID = \"[your-project-id]\" # @param {type:\"string\"}\n", - "\n", - "# Set the project id\n", - "! gcloud config set project {PROJECT_ID}" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "region" - }, - "source": [ - "#### Region\n", - "\n", - "You can also change the `REGION` variable used by Vertex AI. Learn more about [Vertex AI regions](https://cloud.google.com/vertex-ai/docs/general/locations)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "region" - }, - "outputs": [], - "source": [ - "REGION = \"us-central1\" # @param {type: \"string\"}" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "sBCra4QMA2wR" - }, - "source": [ - "### Authenticate your Google Cloud account\n", - "\n", - "Depending on your Jupyter environment, you may have to manually authenticate. Follow the relevant instructions below." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "74ccc9e52986" - }, - "source": [ - "**1. Vertex AI Workbench**\n", - "* Do nothing as you are already authenticated." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "de775a3773ba" - }, - "source": [ - "**2. Local JupyterLab instance, uncomment and run:**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "254614fa0c46" - }, - "outputs": [], - "source": [ - "# ! gcloud auth login" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ef21552ccea8" - }, - "source": [ - "**3. 
Colab, uncomment and run:**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "603adbbf0532" - }, - "outputs": [], - "source": [ - "# from google.colab import auth\n", - "# auth.authenticate_user()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "f6b2ccc891ed" - }, - "source": [ - "**4. Service account or other**\n", - "* See how to grant Cloud Storage permissions to your service account at https://cloud.google.com/storage/docs/gsutil/commands/iam#ch-examples." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "zgPO1eR3CYjk" - }, - "source": [ - "### Create a Cloud Storage bucket\n", - "\n", - "Create a storage bucket to store intermediate artifacts such as datasets." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "MzGDU7TWdts_" - }, - "outputs": [], - "source": [ - "BUCKET_URI = f\"gs://your-bucket-name-{PROJECT_ID}-unique\" # @param {type:\"string\"}" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "-EcIXiGsCePi" - }, - "source": [ - "**Only if your bucket doesn't already exist**: Run the following cell to create your Cloud Storage bucket." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "NIq7R4HZCfIc" - }, - "outputs": [], - "source": [ - "! gsutil mb -l {REGION} -p {PROJECT_ID} {BUCKET_URI}" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "960505627ddf" - }, - "source": [ - "### Import libraries and define constants" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "PyQmSRbKA8r-" - }, - "outputs": [], - "source": [ - "import bigframes.pandas as bf\n", - "import vertexai\n", - "\n", - "bf.options.bigquery.location = \"us\" # Dataset is in 'us' not 'us-central1'\n", - "bf.options.bigquery.project = PROJECT_ID\n", - "\n", - "from bigframes.ml.model_selection import \\\n", - " train_test_split as bf_train_test_split\n", - "\n", - "REMOTE_JOB_NAME = \"sdk2-bigframes-sklearn\"\n", - "REMOTE_JOB_BUCKET = f\"{BUCKET_URI}/{REMOTE_JOB_NAME}\"" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "init_aip:mbsdk,all" - }, - "source": [ - "## Initialize Vertex AI SDK for Python\n", - "\n", - "Initialize the Vertex AI SDK for Python for your project and corresponding bucket." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "init_aip:mbsdk,all" - }, - "outputs": [], - "source": [ - "vertexai.init(\n", - " project=PROJECT_ID,\n", - " location=REGION,\n", - " staging_bucket=BUCKET_URI,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "105334524e96" - }, - "source": [ - "## Prepare the dataset\n", - "\n", - "Now load the Iris dataset and split the data into train and test sets." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "b44cdc4e03f1" - }, - "outputs": [], - "source": [ - "df = bf.read_gbq(\"bigquery-public-data.ml_datasets.iris\")\n", - "\n", - "species_categories = {\n", - " \"versicolor\": 0,\n", - " \"virginica\": 1,\n", - " \"setosa\": 2,\n", - "}\n", - "df[\"species\"] = df[\"species\"].map(species_categories)\n", - "\n", - "# Assign an index column name\n", - "index_col = \"index\"\n", - "df.index.name = index_col" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "9cb8616b1997" - }, - "outputs": [], - "source": [ - "feature_columns = df[[\"sepal_length\", \"sepal_width\", \"petal_length\", \"petal_width\"]]\n", - "label_columns = df[[\"species\"]]\n", - "train_X, test_X, train_y, test_y = bf_train_test_split(\n", - " feature_columns, label_columns, test_size=0.2\n", - ")\n", - "\n", - "print(\"X_train size: \", train_X.size)\n", - "print(\"X_test size: \", test_X.size)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "8306545fcc57" - }, - "source": [ - "## Feature transformation\n", - "\n", - "Next, you do feature transformations on the data using the Vertex AI remote training service.\n", - "\n", - "First, you re-initialize Vertex AI to enable remote training." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "55e701c31036" - }, - "outputs": [], - "source": [ - "# Switch to remote mode for training\n", - "vertexai.preview.init(remote=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "4a0e9d59b273" - }, - "source": [ - "### Execute remote job for fit_transform() on training data\n", - "\n", - "Next, indicate that the `StandardScalar` class is to be executed remotely. Then set up the data transform and call the `fit_transform()` method is executed remotely." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "90333089d362" - }, - "outputs": [], - "source": [ - "from sklearn.preprocessing import StandardScaler\n", - "\n", - "# Wrap classes to enable Vertex remote execution\n", - "StandardScaler = vertexai.preview.remote(StandardScaler)\n", - "\n", - "# Instantiate transformer\n", - "transformer = StandardScaler()\n", - "\n", - "# Set training config\n", - "transformer.fit_transform.vertex.remote_config.display_name = (\n", - " f\"{REMOTE_JOB_NAME}-fit-transformer-bigframes\"\n", - ")\n", - "transformer.fit_transform.vertex.remote_config.staging_bucket = REMOTE_JOB_BUCKET\n", - "\n", - "# Execute transformer on Vertex (train_X is bigframes.dataframe.DataFrame, X_train is np.array)\n", - "X_train = transformer.fit_transform(train_X)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "6bf95574c907" - }, - "source": [ - "### Remote transform on test data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "da6eea22a89a" - }, - "outputs": [], - "source": [ - "# Transform test dataset before calculate test score\n", - "transformer.transform.vertex.remote_config.display_name = (\n", - " REMOTE_JOB_NAME + \"-transformer\"\n", - ")\n", - "transformer.transform.vertex.remote_config.staging_bucket = REMOTE_JOB_BUCKET\n", - "\n", - "# Execute transformer on Vertex (test_X is bigframes.dataframe.DataFrame, X_test is np.array)\n", - "X_test = transformer.transform(test_X)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ddf906c886e4" - }, - "source": [ - "## Remote training\n", - "\n", - "First, train the scikit-learn model as a remote training job:\n", - "\n", - "- Set LogisticRegression for the remote training job.\n", - "- Invoke LogisticRegression locally which will launch the remote training job." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "c7b0116fa60c" - }, - "outputs": [], - "source": [ - "from sklearn.linear_model import LogisticRegression\n", - "\n", - "# Wrap classes to enable Vertex remote execution\n", - "LogisticRegression = vertexai.preview.remote(LogisticRegression)\n", - "\n", - "# Instantiate model, warm_start=True for uptraining\n", - "model = LogisticRegression(warm_start=True)\n", - "\n", - "# Set training config\n", - "model.fit.vertex.remote_config.display_name = REMOTE_JOB_NAME + \"-sklearn-model\"\n", - "model.fit.vertex.remote_config.staging_bucket = REMOTE_JOB_BUCKET\n", - "\n", - "# Train model on Vertex\n", - "model.fit(train_X, train_y)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ffe1d5903bcb" - }, - "source": [ - "## Remote prediction\n", - "\n", - "Obtain predictions from the trained model." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "d00ce35920fa" - }, - "outputs": [], - "source": [ - "# Remote evaluation\n", - "vertexai.preview.init(remote=True)\n", - "\n", - "# Evaluate model's accuracy score\n", - "predictions = model.predict(test_X)\n", - "\n", - "print(f\"Remote predictions: {predictions}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "a8cd6cbd4403" - }, - "source": [ - "## Local evaluation\n", - "\n", - "Score model results locally." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "dc105dafdfb9" - }, - "outputs": [], - "source": [ - "# User must convert bigframes to pandas dataframe for local evaluation\n", - "train_X_pd = train_X.to_pandas().reset_index(drop=True)\n", - "train_y_pd = train_y.to_pandas().reset_index(drop=True)\n", - "\n", - "test_X_pd = test_X.to_pandas().reset_index(drop=True)\n", - "test_y_pd = test_y.to_pandas().reset_index(drop=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "25fec549de69" - }, - "outputs": [], - "source": [ - "# Switch to local mode for testing\n", - "vertexai.preview.init(remote=False)\n", - "\n", - "# Evaluate model's accuracy score\n", - "print(f\"Train accuracy: {model.score(train_X_pd, train_y_pd)}\")\n", - "\n", - "print(f\"Test accuracy: {model.score(test_X_pd, test_y_pd)}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "TpV-iwP9qw9c" - }, - "source": [ - "## Cleaning up\n", - "\n", - "To clean up all Google Cloud resources used in this project, you can [delete the Google Cloud\n", - "project](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects) you used for the tutorial.\n", - "\n", - "Otherwise, you can delete the individual resources you created in this tutorial:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "sx_vKniMq9ZX" - }, - "outputs": [], - "source": [ - "import os\n", - "\n", - "# Delete Cloud Storage objects that were created\n", - "delete_bucket = False\n", - "if delete_bucket or os.getenv(\"IS_TESTING\"):\n", - " ! gsutil -m rm -r $BUCKET_URI" - ] - } - ], - "metadata": { - "colab": { - "collapsed_sections": [], - "name": "sdk2_bigframes_sklearn.ipynb", - "toc_visible": true - }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} diff --git a/notebooks/vertex_sdk/sdk2_bigframes_tensorflow.ipynb b/notebooks/vertex_sdk/sdk2_bigframes_tensorflow.ipynb deleted file mode 100644 index e6843b66b5..0000000000 --- a/notebooks/vertex_sdk/sdk2_bigframes_tensorflow.ipynb +++ /dev/null @@ -1,646 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ur8xi4C7S06n" - }, - "outputs": [], - "source": [ - "# Copyright 2023 Google LLC\n", - "#\n", - "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", - "# you may not use this file except in compliance with the License.\n", - "# You may obtain a copy of the License at\n", - "#\n", - "# https://www.apache.org/licenses/LICENSE-2.0\n", - "#\n", - "# Unless required by applicable law or agreed to in writing, software\n", - "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", - "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", - "# See the License for the specific language governing permissions and\n", - "# limitations under the License." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "JAPoU8Sm5E6e" - }, - "source": [ - "# Train a Tensorflow Keras model with Vertex AI SDK 2.0 and Bigframes \n", - "\n", - "\n", - " \n", - " \n", - "
\n", - " \n", - " \"Colab Run in Colab\n", - " \n", - " \n", - " \n", - " \"GitHub\n", - " View on GitHub\n", - " \n", - " \n", - " \n", - " \"VertexOpen in Vertex AI Workbench\n", - " \n", - "
" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "tvgnzT1CKxrO" - }, - "source": [ - "## Overview\n", - "\n", - "This tutorial demonstrates how to train a tensorflow keras model using Vertex AI local-to-remote training with Vertex AI SDK 2.0 and BigQuery Bigframes as the data source.\n", - "\n", - "Learn more about [bigframes](https://cloud.google.com/bigquery/docs/)." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "d975e698c9a4" - }, - "source": [ - "### Objective\n", - "\n", - "In this tutorial, you learn to use `Vertex AI SDK 2.0` with Bigframes as input data source.\n", - "\n", - "\n", - "This tutorial uses the following Google Cloud ML services:\n", - "\n", - "- `Vertex AI Training`\n", - "- `Vertex AI Remote Training`\n", - "\n", - "\n", - "The steps performed include:\n", - "\n", - "- Initialize a dataframe from a BigQuery table and split the dataset\n", - "- Perform transformations as a Vertex AI remote training.\n", - "- Train the model remotely and evaluate the model locally\n", - "\n", - "**Local-to-remote training**\n", - "\n", - "```\n", - "import vertexai\n", - "from my_module import MyModelClass\n", - "\n", - "vertexai.preview.init(remote=True, project=\"my-project\", location=\"my-location\", staging_bucket=\"gs://my-bucket\")\n", - "\n", - "# Wrap the model class with `vertex_ai.preview.remote`\n", - "MyModelClass = vertexai.preview.remote(MyModelClass)\n", - "\n", - "# Instantiate the class\n", - "model = MyModelClass(...)\n", - "\n", - "# Optional set remote config\n", - "model.fit.vertex.remote_config.display_name = \"MyModelClass-remote-training\"\n", - "model.fit.vertex.remote_config.staging_bucket = \"gs://my-bucket\"\n", - "\n", - "# This `fit` call will be executed remotely\n", - "model.fit(...)\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "08d289fa873f" - }, - "source": [ - "### Dataset\n", - "\n", - "This tutorial uses the IRIS dataset, which predicts the iris species." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "aed92deeb4a0" - }, - "source": [ - "### Costs\n", - "\n", - "This tutorial uses billable components of Google Cloud:\n", - "\n", - "* Vertex AI\n", - "* BigQuery\n", - "* Cloud Storage\n", - "\n", - "Learn about [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing),\n", - "[BigQuery pricing](https://cloud.google.com/bigquery/pricing),\n", - "and [Cloud Storage pricing](https://cloud.google.com/storage/pricing), \n", - "and use the [Pricing Calculator](https://cloud.google.com/products/calculator/)\n", - "to generate a cost estimate based on your projected usage." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "i7EUnXsZhAGF" - }, - "source": [ - "## Installation\n", - "\n", - "Install the following packages required to execute this notebook. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "2b4ef9b72d43" - }, - "outputs": [], - "source": [ - "# Install the packages\n", - "! pip3 install --upgrade --quiet google-cloud-aiplatform[preview]\n", - "! pip3 install --upgrade --quiet bigframes\n", - "! pip3 install --upgrade --quiet tensorflow==2.12.0" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "58707a750154" - }, - "source": [ - "### Colab only: Uncomment the following cell to restart the kernel." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "f200f10a1da3" - }, - "outputs": [], - "source": [ - "# Automatically restart kernel after installs so that your environment can access the new packages\n", - "# import IPython\n", - "\n", - "# app = IPython.Application.instance()\n", - "# app.kernel.do_shutdown(True)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "BF1j6f9HApxa" - }, - "source": [ - "## Before you begin\n", - "\n", - "### Set up your Google Cloud project\n", - "\n", - "**The following steps are required, regardless of your notebook environment.**\n", - "\n", - "1. [Select or create a Google Cloud project](https://console.cloud.google.com/cloud-resource-manager). When you first create an account, you get a $300 free credit towards your compute/storage costs.\n", - "\n", - "2. [Make sure that billing is enabled for your project](https://cloud.google.com/billing/docs/how-to/modify-project).\n", - "\n", - "3. [Enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).\n", - "\n", - "4. If you are running this notebook locally, you need to install the [Cloud SDK](https://cloud.google.com/sdk)." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "WReHDGG5g0XY" - }, - "source": [ - "#### Set your project ID\n", - "\n", - "**If you don't know your project ID**, try the following:\n", - "* Run `gcloud config list`.\n", - "* Run `gcloud projects list`.\n", - "* See the support page: [Locate the project ID](https://support.google.com/googleapi/answer/7014113)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "oM1iC_MfAts1" - }, - "outputs": [], - "source": [ - "PROJECT_ID = \"[your-project-id]\" # @param {type:\"string\"}\n", - "\n", - "# Set the project id\n", - "! gcloud config set project {PROJECT_ID}" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "region" - }, - "source": [ - "#### Region\n", - "\n", - "You can also change the `REGION` variable used by Vertex AI. Learn more about [Vertex AI regions](https://cloud.google.com/vertex-ai/docs/general/locations)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "region" - }, - "outputs": [], - "source": [ - "REGION = \"us-central1\" # @param {type: \"string\"}" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "sBCra4QMA2wR" - }, - "source": [ - "### Authenticate your Google Cloud account\n", - "\n", - "Depending on your Jupyter environment, you may have to manually authenticate. Follow the relevant instructions below." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "74ccc9e52986" - }, - "source": [ - "**1. Vertex AI Workbench**\n", - "* Do nothing as you are already authenticated." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "de775a3773ba" - }, - "source": [ - "**2. Local JupyterLab instance, uncomment and run:**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "254614fa0c46" - }, - "outputs": [], - "source": [ - "# ! gcloud auth login" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ef21552ccea8" - }, - "source": [ - "**3. 
Colab, uncomment and run:**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "603adbbf0532" - }, - "outputs": [], - "source": [ - "# from google.colab import auth\n", - "# auth.authenticate_user()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "f6b2ccc891ed" - }, - "source": [ - "**4. Service account or other**\n", - "* See how to grant Cloud Storage permissions to your service account at https://cloud.google.com/storage/docs/gsutil/commands/iam#ch-examples." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "zgPO1eR3CYjk" - }, - "source": [ - "### Create a Cloud Storage bucket\n", - "\n", - "Create a storage bucket to store intermediate artifacts such as datasets." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "MzGDU7TWdts_" - }, - "outputs": [], - "source": [ - "BUCKET_URI = f\"gs://your-bucket-name-{PROJECT_ID}-unique\" # @param {type:\"string\"}" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "-EcIXiGsCePi" - }, - "source": [ - "**Only if your bucket doesn't already exist**: Run the following cell to create your Cloud Storage bucket." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "NIq7R4HZCfIc" - }, - "outputs": [], - "source": [ - "! gsutil mb -l {REGION} -p {PROJECT_ID} {BUCKET_URI}" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "960505627ddf" - }, - "source": [ - "### Import libraries and define constants" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "PyQmSRbKA8r-" - }, - "outputs": [], - "source": [ - "import bigframes.pandas as bf\n", - "import tensorflow as tf\n", - "import vertexai\n", - "from tensorflow import keras\n", - "\n", - "bf.options.bigquery.location = \"us\" # Dataset is in 'us' not 'us-central1'\n", - "bf.options.bigquery.project = PROJECT_ID\n", - "\n", - "from bigframes.ml.model_selection import \\\n", - " train_test_split as bf_train_test_split" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "init_aip:mbsdk,all" - }, - "source": [ - "## Initialize Vertex AI SDK for Python\n", - "\n", - "Initialize the Vertex AI SDK for Python for your project and corresponding bucket." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "init_aip:mbsdk,all" - }, - "outputs": [], - "source": [ - "vertexai.init(\n", - " project=PROJECT_ID,\n", - " location=REGION,\n", - " staging_bucket=BUCKET_URI,\n", - ")\n", - "\n", - "REMOTE_JOB_NAME = \"sdk2-bigframes-tensorflow\"\n", - "REMOTE_JOB_BUCKET = f\"{BUCKET_URI}/{REMOTE_JOB_NAME}\"" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "105334524e96" - }, - "source": [ - "## Prepare the dataset\n", - "\n", - "Now load the Iris dataset and split the data into train and test sets." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "94576deccd8c" - }, - "outputs": [], - "source": [ - "df = bf.read_gbq(\"bigquery-public-data.ml_datasets.iris\")\n", - "\n", - "species_categories = {\n", - " \"versicolor\": 0,\n", - " \"virginica\": 1,\n", - " \"setosa\": 2,\n", - "}\n", - "df[\"target\"] = df[\"species\"].map(species_categories)\n", - "df = df.drop(columns=[\"species\"])\n", - "\n", - "train, test = bf_train_test_split(df, test_size=0.2)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "cfcbce726efa" - }, - "source": [ - "## Remote training with GPU\n", - "\n", - "First, train a TensorFlow model as a remote training job:\n", - "\n", - "- Reinitialize Vertex AI for remote training.\n", - "- Instantiate the tensorflow keras model for the remote training job.\n", - "- Invoke the tensorflow keras model.fit() locally which will launch the remote training job." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "fd865b0c4e8b" - }, - "outputs": [], - "source": [ - "# Switch to remote mode for training\n", - "vertexai.preview.init(remote=True)\n", - "\n", - "keras.Sequential = vertexai.preview.remote(keras.Sequential)\n", - "\n", - "# Instantiate model\n", - "model = keras.Sequential(\n", - " [keras.layers.Dense(5, input_shape=(4,)), keras.layers.Softmax()]\n", - ")\n", - "\n", - "# Specify optimizer and loss function\n", - "model.compile(optimizer=\"adam\", loss=\"mean_squared_error\")\n", - "\n", - "# Set training config\n", - "model.fit.vertex.remote_config.enable_cuda = True\n", - "model.fit.vertex.remote_config.display_name = REMOTE_JOB_NAME + \"-keras-model-gpu\"\n", - "model.fit.vertex.remote_config.staging_bucket = REMOTE_JOB_BUCKET\n", - "model.fit.vertex.remote_config.custom_commands = [\"pip install tensorflow-io==0.32.0\"]\n", - "\n", - "# Manually set compute resources this time\n", - "model.fit.vertex.remote_config.machine_type = \"n1-highmem-4\"\n", - "model.fit.vertex.remote_config.accelerator_type = \"NVIDIA_TESLA_K80\"\n", - "model.fit.vertex.remote_config.accelerator_count = 4\n", - "\n", - "# Train model on Vertex\n", - "model.fit(train, epochs=10)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "f1af94ac1477" - }, - "source": [ - "## Remote prediction\n", - "\n", - "Obtain predictions from the trained model." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "1d75879948b5" - }, - "outputs": [], - "source": [ - "vertexai.preview.init(remote=True)\n", - "\n", - "# Set remote config\n", - "model.predict.vertex.remote_config.enable_cuda = False\n", - "model.predict.vertex.remote_config.display_name = REMOTE_JOB_NAME + \"-keras-predict-cpu\"\n", - "model.predict.vertex.remote_config.staging_bucket = REMOTE_JOB_BUCKET\n", - "model.predict.vertex.remote_config.custom_commands = [\n", - " \"pip install tensorflow-io==0.32.0\"\n", - "]\n", - "\n", - "predictions = model.predict(train)\n", - "\n", - "print(f\"Remote predictions: {predictions}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "798b77c95067" - }, - "source": [ - "## Local evaluation\n", - "\n", - "Evaluate model results locally." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "88e734e30791" - }, - "outputs": [], - "source": [ - "# User must convert bigframes to pandas dataframe for local evaluation\n", - "feature_columns = [\"sepal_length\", \"sepal_width\", \"petal_length\", \"petal_width\"]\n", - "label_columns = [\"target\"]\n", - "\n", - "train_X_np = train[feature_columns].to_pandas().values.astype(float)\n", - "train_y_np = train[label_columns].to_pandas().values.astype(float)\n", - "train_ds = tf.data.Dataset.from_tensor_slices((train_X_np, train_y_np))\n", - "\n", - "test_X_np = test[feature_columns].to_pandas().values.astype(float)\n", - "test_y_np = test[label_columns].to_pandas().values.astype(float)\n", - "test_ds = tf.data.Dataset.from_tensor_slices((test_X_np, test_y_np))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "cb8637f783ad" - }, - "outputs": [], - "source": [ - "# Switch to local mode for evaluation\n", - "vertexai.preview.init(remote=False)\n", - "\n", - "# Evaluate model's mean square errors\n", - "print(f\"Train loss: {model.evaluate(train_ds.batch(32))}\")\n", - "\n", - "print(f\"Test loss: {model.evaluate(test_ds.batch(32))}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "TpV-iwP9qw9c" - }, - "source": [ - "## Cleaning up\n", - "\n", - "To clean up all Google Cloud resources used in this project, you can [delete the Google Cloud\n", - "project](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects) you used for the tutorial.\n", - "\n", - "Otherwise, you can delete the individual resources you created in this tutorial:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "sx_vKniMq9ZX" - }, - "outputs": [], - "source": [ - "import os\n", - "\n", - "# Delete Cloud Storage objects that were created\n", - "delete_bucket = False\n", - "if delete_bucket or os.getenv(\"IS_TESTING\"):\n", - " ! 
gsutil -m rm -r $BUCKET_URI" - ] - } - ], - "metadata": { - "colab": { - "collapsed_sections": [], - "name": "sdk2_bigframes_tensorflow.ipynb", - "toc_visible": true - }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} From 39fe47451d24a8cf55d7dbb15c6d3b176d25ab18 Mon Sep 17 00:00:00 2001 From: Garrett Wu <6505921+GarrettWu@users.noreply.github.com> Date: Thu, 4 Apr 2024 10:19:03 -0700 Subject: [PATCH 48/53] fix: reloaded transformer .transform error (#569) * fix: reloaded transformer .transform error * fix mypy --- bigframes/ml/compose.py | 13 +- tests/system/large/ml/test_compose.py | 23 ++++ tests/system/small/ml/test_preprocessing.py | 129 ++++++++++++++++++-- 3 files changed, 149 insertions(+), 16 deletions(-) diff --git a/bigframes/ml/compose.py b/bigframes/ml/compose.py index 8638f4d182..89969f23e7 100644 --- a/bigframes/ml/compose.py +++ b/bigframes/ml/compose.py @@ -115,14 +115,17 @@ def camel_to_snake(name): name = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", name) return re.sub("([a-z0-9])([A-Z])", r"\1_\2", name).lower() + output_names = [] for transform_col in bq_model._properties["transformColumns"]: + transform_col_dict = cast(dict, transform_col) # pass the columns that are not transformed - if "transformSql" not in transform_col: + if "transformSql" not in transform_col_dict: continue - transform_sql: str = cast(dict, transform_col)["transformSql"] + transform_sql: str = transform_col_dict["transformSql"] if not transform_sql.startswith("ML."): continue + output_names.append(transform_col_dict["name"]) found_transformer = False for prefix in _BQML_TRANSFROM_TYPE_MAPPING: if transform_sql.startswith(prefix): @@ -141,7 +144,10 @@ def camel_to_snake(name): f"Unsupported transformer type. 
{constants.FEEDBACK_LINK}" ) - return cls(transformers=transformers) + transformer = cls(transformers=transformers) + transformer._output_names = output_names + + return transformer def _merge( self, bq_model: bigquery.Model @@ -164,6 +170,7 @@ def _merge( for feature_column in bq_model.feature_columns ] ) == sorted(columns): + transformer_0._output_names = self._output_names return transformer_0 return self diff --git a/tests/system/large/ml/test_compose.py b/tests/system/large/ml/test_compose.py index 0107d371cb..7513b78b29 100644 --- a/tests/system/large/ml/test_compose.py +++ b/tests/system/large/ml/test_compose.py @@ -142,3 +142,26 @@ def test_columntransformer_save_load(new_penguins_df, dataset_id): ] assert reloaded_transformer.transformers_ == expected assert reloaded_transformer._bqml_model is not None + + result = transformer.fit_transform( + new_penguins_df[["species", "culmen_length_mm", "flipper_length_mm"]] + ).to_pandas() + + expected = pandas.DataFrame( + { + "onehotencoded_species": [ + [{"index": 1, "value": 1.0}], + [{"index": 1, "value": 1.0}], + [{"index": 2, "value": 1.0}], + ], + "standard_scaled_culmen_length_mm": [ + 1.313249, + -0.20198, + -1.111118, + ], + "standard_scaled_flipper_length_mm": [1.251098, -1.196588, -0.054338], + }, + index=pandas.Index([1633, 1672, 1690], dtype="Int64", name="tag_number"), + ) + + pandas.testing.assert_frame_equal(result, expected, rtol=0.1, check_dtype=False) diff --git a/tests/system/small/ml/test_preprocessing.py b/tests/system/small/ml/test_preprocessing.py index 22c3c84959..faa0cd7bbd 100644 --- a/tests/system/small/ml/test_preprocessing.py +++ b/tests/system/small/ml/test_preprocessing.py @@ -58,7 +58,7 @@ def test_standard_scaler_normalizes(penguins_df_default_index, new_penguins_df): index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), ) - pd.testing.assert_frame_equal(result, expected, rtol=1e-3) + pd.testing.assert_frame_equal(result, expected, rtol=0.1) def test_standard_scaler_normalizeds_fit_transform(new_penguins_df): @@ -82,7 +82,7 @@ def test_standard_scaler_normalizeds_fit_transform(new_penguins_df): index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), ) - pd.testing.assert_frame_equal(result, expected, rtol=1e-3) + pd.testing.assert_frame_equal(result, expected, rtol=0.1) def test_standard_scaler_series_normalizes(penguins_df_default_index, new_penguins_df): @@ -110,7 +110,7 @@ def test_standard_scaler_series_normalizes(penguins_df_default_index, new_pengui index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), ) - pd.testing.assert_frame_equal(result, expected, rtol=1e-3) + pd.testing.assert_frame_equal(result, expected, rtol=0.1) def test_standard_scaler_save_load(new_penguins_df, dataset_id): @@ -125,6 +125,22 @@ def test_standard_scaler_save_load(new_penguins_df, dataset_id): assert isinstance(reloaded_transformer, preprocessing.StandardScaler) assert reloaded_transformer._bqml_model is not None + result = reloaded_transformer.transform( + new_penguins_df[["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"]] + ).to_pandas() + + expected = pd.DataFrame( + { + "standard_scaled_culmen_length_mm": [1.313249, -0.20198, -1.111118], + "standard_scaled_culmen_depth_mm": [1.17072, -1.272416, 0.101848], + "standard_scaled_flipper_length_mm": [1.251089, -1.196588, -0.054338], + }, + dtype="Float64", + index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), + ) + + pd.testing.assert_frame_equal(result, expected, rtol=0.1) + def 
test_max_abs_scaler_normalizes(penguins_df_default_index, new_penguins_df): # TODO(http://b/292431644): add a second test that compares output to sklearn.preprocessing.MaxAbsScaler, when BQML's change is in prod. @@ -157,7 +173,7 @@ def test_max_abs_scaler_normalizes(penguins_df_default_index, new_penguins_df): index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), ) - pd.testing.assert_frame_equal(result, expected, rtol=1e-3) + pd.testing.assert_frame_equal(result, expected, rtol=0.1) def test_max_abs_scaler_normalizeds_fit_transform(new_penguins_df): @@ -176,7 +192,7 @@ def test_max_abs_scaler_normalizeds_fit_transform(new_penguins_df): index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), ) - pd.testing.assert_frame_equal(result, expected, rtol=1e-3) + pd.testing.assert_frame_equal(result, expected, rtol=0.1) def test_max_abs_scaler_series_normalizes(penguins_df_default_index, new_penguins_df): @@ -199,7 +215,7 @@ def test_max_abs_scaler_series_normalizes(penguins_df_default_index, new_penguin index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), ) - pd.testing.assert_frame_equal(result, expected, rtol=1e-3) + pd.testing.assert_frame_equal(result, expected, rtol=0.1) def test_max_abs_scaler_save_load(new_penguins_df, dataset_id): @@ -214,6 +230,22 @@ def test_max_abs_scaler_save_load(new_penguins_df, dataset_id): assert isinstance(reloaded_transformer, preprocessing.MaxAbsScaler) assert reloaded_transformer._bqml_model is not None + result = reloaded_transformer.transform( + new_penguins_df[["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"]] + ).to_pandas() + + expected = pd.DataFrame( + { + "max_abs_scaled_culmen_length_mm": [1.0, 0.974684, 0.959494], + "max_abs_scaled_culmen_depth_mm": [1.0, 0.914894, 0.962766], + "max_abs_scaled_flipper_length_mm": [1.0, 0.923469, 0.959184], + }, + dtype="Float64", + index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), + ) + + pd.testing.assert_frame_equal(result, expected, rtol=0.1) + def test_min_max_scaler_normalized_fit_transform(new_penguins_df): scaler = preprocessing.MinMaxScaler() @@ -231,7 +263,7 @@ def test_min_max_scaler_normalized_fit_transform(new_penguins_df): index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), ) - pd.testing.assert_frame_equal(result, expected, rtol=1e-3) + pd.testing.assert_frame_equal(result, expected, rtol=0.1) def test_min_max_scaler_series_normalizes(penguins_df_default_index, new_penguins_df): @@ -255,7 +287,7 @@ def test_min_max_scaler_series_normalizes(penguins_df_default_index, new_penguin index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), ) - pd.testing.assert_frame_equal(result, expected, rtol=1e-3) + pd.testing.assert_frame_equal(result, expected, rtol=0.1) def test_min_max_scaler_normalizes(penguins_df_default_index, new_penguins_df): @@ -290,7 +322,7 @@ def test_min_max_scaler_normalizes(penguins_df_default_index, new_penguins_df): index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), ) - pd.testing.assert_frame_equal(result, expected, rtol=1e-3) + pd.testing.assert_frame_equal(result, expected, rtol=0.1) def test_min_max_scaler_save_load(new_penguins_df, dataset_id): @@ -305,6 +337,22 @@ def test_min_max_scaler_save_load(new_penguins_df, dataset_id): assert isinstance(reloaded_transformer, preprocessing.MinMaxScaler) assert reloaded_transformer._bqml_model is not None + result = reloaded_transformer.fit_transform( + 
new_penguins_df[["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"]] + ).to_pandas() + + expected = pd.DataFrame( + { + "min_max_scaled_culmen_length_mm": [1.0, 0.375, 0.0], + "min_max_scaled_culmen_depth_mm": [1.0, 0.0, 0.5625], + "min_max_scaled_flipper_length_mm": [1.0, 0.0, 0.466667], + }, + dtype="Float64", + index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), + ) + + pd.testing.assert_frame_equal(result, expected, rtol=0.1) + def test_k_bins_discretizer_normalized_fit_transform_default_params(new_penguins_df): discretizer = preprocessing.KBinsDiscretizer(strategy="uniform") @@ -322,7 +370,7 @@ def test_k_bins_discretizer_normalized_fit_transform_default_params(new_penguins index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), ) - pd.testing.assert_frame_equal(result, expected, rtol=1e-3) + pd.testing.assert_frame_equal(result, expected, rtol=0.1) def test_k_bins_discretizer_series_normalizes( @@ -344,7 +392,7 @@ def test_k_bins_discretizer_series_normalizes( index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), ) - pd.testing.assert_frame_equal(result, expected, rtol=1e-3) + pd.testing.assert_frame_equal(result, expected, rtol=0.1) def test_k_bins_discretizer_normalizes(penguins_df_default_index, new_penguins_df): @@ -374,7 +422,7 @@ def test_k_bins_discretizer_normalizes(penguins_df_default_index, new_penguins_d index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), ) - pd.testing.assert_frame_equal(result, expected, rtol=1e-3) + pd.testing.assert_frame_equal(result, expected, rtol=0.1) def test_k_bins_discretizer_normalizes_different_params( @@ -406,7 +454,7 @@ def test_k_bins_discretizer_normalizes_different_params( index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), ) - pd.testing.assert_frame_equal(result, expected, rtol=1e-3) + pd.testing.assert_frame_equal(result, expected, rtol=0.1) def test_k_bins_discretizer_save_load(new_penguins_df, dataset_id): @@ -423,6 +471,22 @@ def test_k_bins_discretizer_save_load(new_penguins_df, dataset_id): assert reloaded_transformer.strategy == transformer.strategy assert reloaded_transformer._bqml_model is not None + result = reloaded_transformer.fit_transform( + new_penguins_df[["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"]] + ).to_pandas() + + expected = pd.DataFrame( + { + "kbinsdiscretizer_culmen_length_mm": ["bin_6", "bin_4", "bin_2"], + "kbinsdiscretizer_culmen_depth_mm": ["bin_6", "bin_2", "bin_5"], + "kbinsdiscretizer_flipper_length_mm": ["bin_6", "bin_2", "bin_4"], + }, + dtype="string[pyarrow]", + index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), + ) + + pd.testing.assert_frame_equal(result, expected, rtol=0.1) + def test_one_hot_encoder_default_params(new_penguins_df): encoder = preprocessing.OneHotEncoder() @@ -560,6 +624,29 @@ def test_one_hot_encoder_save_load(new_penguins_df, dataset_id): assert reloaded_transformer.max_categories == transformer.max_categories assert reloaded_transformer._bqml_model is not None + result = reloaded_transformer.fit_transform( + new_penguins_df[["species", "sex"]] + ).to_pandas() + + expected = pd.DataFrame( + { + "onehotencoded_species": [ + [{"index": 1, "value": 1.0}], + [{"index": 1, "value": 1.0}], + [{"index": 2, "value": 1.0}], + ], + "onehotencoded_sex": [ + [{"index": 2, "value": 1.0}], + [{"index": 1, "value": 1.0}], + [{"index": 1, "value": 1.0}], + ], + }, + dtype=ONE_HOT_ENCODED_DTYPE, + index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), + ) + 
+ pd.testing.assert_frame_equal(result, expected) + def test_label_encoder_default_params(new_penguins_df): encoder = preprocessing.LabelEncoder() @@ -677,5 +764,21 @@ def test_label_encoder_save_load(new_penguins_df, dataset_id): assert reloaded_transformer.max_categories == transformer.max_categories assert reloaded_transformer._bqml_model is not None + result = reloaded_transformer.transform(new_penguins_df).to_pandas() + + expected = pd.DataFrame( + { + "labelencoded_species": [ + 1, + 1, + 2, + ], + }, + dtype="Int64", + index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), + ) + + pd.testing.assert_frame_equal(result, expected) + # TODO(garrettwu): add OneHotEncoder tests to compare with sklearn. From 9084df369bc6819edf5f57ceba85667a14371ac5 Mon Sep 17 00:00:00 2001 From: Ashley Xu <139821907+ashleyxuu@users.noreply.github.com> Date: Thu, 4 Apr 2024 10:28:16 -0700 Subject: [PATCH 49/53] docs: address more comments from technical writers to meet legal purposes (#571) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes # 🦕 --- bigframes/ml/base.py | 10 +++++----- bigframes/ml/ensemble.py | 18 +++++++++--------- bigframes/ml/forecasting.py | 6 +++--- bigframes/ml/metrics/_metrics.py | 2 +- bigframes/ml/model_selection.py | 4 ++-- bigframes/ml/pipeline.py | 2 +- bigframes/ml/preprocessing.py | 2 +- .../ml_fundamentals_bq_dataframes.ipynb | 2 +- .../regression/sklearn_linear_regression.ipynb | 2 +- .../pandas/core/config_init.py | 10 +++++----- .../bigframes_vendored/xgboost/sklearn.py | 2 +- 11 files changed, 30 insertions(+), 30 deletions(-) diff --git a/bigframes/ml/base.py b/bigframes/ml/base.py index 5e7aada8de..c57cb78791 100644 --- a/bigframes/ml/base.py +++ b/bigframes/ml/base.py @@ -77,7 +77,7 @@ def fit_transform(self, x_train: Union[DataFrame, Series], y_train: Union[DataFr """ def __repr__(self): - """Print the estimator's constructor with all non-default parameter values""" + """Print the estimator's constructor with all non-default parameter values.""" # Estimator pretty printer adapted from Sklearn's, which is in turn an adaption of # the inbuilt pretty-printer in CPython @@ -106,13 +106,13 @@ def predict(self, X): def register(self: _T, vertex_ai_model_id: Optional[str] = None) -> _T: """Register the model to Vertex AI. - After register, go to Google Cloud Console (https://console.cloud.google.com/vertex-ai/models) + After register, go to the Google Cloud console (https://console.cloud.google.com/vertex-ai/models) to manage the model registries. Refer to https://cloud.google.com/vertex-ai/docs/model-registry/introduction for more options. Args: vertex_ai_model_id (Optional[str], default None): - optional string id as model id in Vertex. If not set, will by default to 'bigframes_{bq_model_id}'. + Optional string id as model id in Vertex. 
If not set, will default to 'bigframes_{bq_model_id}'. Vertex Ai model id will be truncated to 63 characters due to its limitation. Returns: @@ -191,9 +191,9 @@ def to_gbq(self: _T, model_name: str, replace: bool = False) -> _T: Args: model_name (str): - the name of the model. + The name of the model. replace (bool, default False): - whether to replace if the model already exists. Default to False. + Whether to replace if the model already exists. Default to False. Returns: Saved transformer.""" diff --git a/bigframes/ml/ensemble.py b/bigframes/ml/ensemble.py index 72ea600c58..a8f0329145 100644 --- a/bigframes/ml/ensemble.py +++ b/bigframes/ml/ensemble.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""Ensemble models. This module is styled after Scikit-Learn's ensemble module: +"""Ensemble models. This module is styled after scikit-learn's ensemble module: https://scikit-learn.org/stable/modules/ensemble.html""" from __future__ import annotations @@ -190,9 +190,9 @@ def to_gbq(self, model_name: str, replace: bool = False) -> XGBRegressor: Args: model_name (str): - the name of the model. + The name of the model. replace (bool, default False): - whether to replace if the model already exists. Default to False. + Whether to replace if the model already exists. Default to False. Returns: saved model.""" if not self._bqml_model: @@ -343,9 +343,9 @@ def to_gbq(self, model_name: str, replace: bool = False) -> XGBClassifier: Args: model_name (str): - the name of the model. + The name of the model. replace (bool, default False): - whether to replace if the model already exists. Default to False. + Whether to replace if the model already exists. Default to False. Returns: XGBClassifier: saved model.""" @@ -506,9 +506,9 @@ def to_gbq(self, model_name: str, replace: bool = False) -> RandomForestRegresso Args: model_name (str): - the name of the model. + The name of the model. replace (bool, default False): - whether to replace if the model already exists. Default to False. + Whether to replace if the model already exists. Default to False. Returns: RandomForestRegressor: saved model.""" @@ -669,9 +669,9 @@ def to_gbq(self, model_name: str, replace: bool = False) -> RandomForestClassifi Args: model_name (str): - the name of the model. + The name of the model. replace (bool, default False): - whether to replace if the model already exists. Default to False. + Whether to replace if the model already exists. Default to False. Returns: RandomForestClassifier: saved model.""" diff --git a/bigframes/ml/forecasting.py b/bigframes/ml/forecasting.py index 7993327200..e50a8ed35b 100644 --- a/bigframes/ml/forecasting.py +++ b/bigframes/ml/forecasting.py @@ -248,12 +248,12 @@ def predict( an int value that specifies the number of time points to forecast. The default value is 3, and the maximum value is 1000. confidence_level (float, default 0.95): - a float value that specifies percentage of the future values that fall in the prediction interval. + A float value that specifies percentage of the future values that fall in the prediction interval. The valid input range is [0.0, 1.0). Returns: bigframes.dataframe.DataFrame: The predicted DataFrames. Which - contains 2 columns "forecast_timestamp" and "forecast_value". + contains 2 columns: "forecast_timestamp" and "forecast_value". 
""" if horizon < 1 or horizon > 1000: raise ValueError(f"horizon must be [1, 1000], but is {horizon}.") @@ -284,7 +284,7 @@ def detect_anomalies( Identifies the custom threshold to use for anomaly detection. The value must be in the range [0, 1), with a default value of 0.95. Returns: - bigframes.dataframe.DataFrame: detected DataFrame.""" + bigframes.dataframe.DataFrame: Detected DataFrame.""" if anomaly_prob_threshold < 0.0 or anomaly_prob_threshold >= 1.0: raise ValueError( f"anomaly_prob_threshold must be [0.0, 1.0), but is {anomaly_prob_threshold}." diff --git a/bigframes/ml/metrics/_metrics.py b/bigframes/ml/metrics/_metrics.py index b8c264e91b..2525ecd34f 100644 --- a/bigframes/ml/metrics/_metrics.py +++ b/bigframes/ml/metrics/_metrics.py @@ -13,7 +13,7 @@ # limitations under the License. """Metrics functions for evaluating models. This module is styled after -Scikit-Learn's metrics module: https://scikit-learn.org/stable/modules/metrics.html.""" +scikit-learn's metrics module: https://scikit-learn.org/stable/modules/metrics.html.""" import inspect import typing diff --git a/bigframes/ml/model_selection.py b/bigframes/ml/model_selection.py index 443b9e7be6..42c13fdb40 100644 --- a/bigframes/ml/model_selection.py +++ b/bigframes/ml/model_selection.py @@ -13,7 +13,7 @@ # limitations under the License. """Functions for test/train split and model tuning. This module is styled after -Scikit-Learn's model_selection module: +scikit-learn's model_selection module: https://scikit-learn.org/stable/modules/classes.html#module-sklearn.model_selection.""" @@ -51,7 +51,7 @@ def train_test_split( List[Union[bigframes.dataframe.DataFrame, bigframes.series.Series]]: A list of BigQuery DataFrames or Series. """ - # TODO(garrettwu): Scikit-Learn throws an error when the dataframes don't have the same + # TODO(garrettwu): scikit-learn throws an error when the dataframes don't have the same # number of rows. We probably want to do something similar. Now the implementation is based # on index. We'll move to based on ordering first. diff --git a/bigframes/ml/pipeline.py b/bigframes/ml/pipeline.py index 92a3bae77d..5df2378575 100644 --- a/bigframes/ml/pipeline.py +++ b/bigframes/ml/pipeline.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""For composing estimators together. This module is styled after Scikit-Learn's +"""For composing estimators together. This module is styled after scikit-learn's pipeline module: https://scikit-learn.org/stable/modules/pipeline.html.""" diff --git a/bigframes/ml/preprocessing.py b/bigframes/ml/preprocessing.py index fd7d44f731..673ee27db0 100644 --- a/bigframes/ml/preprocessing.py +++ b/bigframes/ml/preprocessing.py @@ -13,7 +13,7 @@ # limitations under the License. """Transformers that prepare data for other estimators. 
This module is styled after -Scikit-Learn's preprocessing module: https://scikit-learn.org/stable/modules/preprocessing.html.""" +scikit-learn's preprocessing module: https://scikit-learn.org/stable/modules/preprocessing.html.""" from __future__ import annotations diff --git a/notebooks/getting_started/ml_fundamentals_bq_dataframes.ipynb b/notebooks/getting_started/ml_fundamentals_bq_dataframes.ipynb index b3c965aded..e7b69f017b 100644 --- a/notebooks/getting_started/ml_fundamentals_bq_dataframes.ipynb +++ b/notebooks/getting_started/ml_fundamentals_bq_dataframes.ipynb @@ -1051,7 +1051,7 @@ "source": [ "## Estimators\n", "\n", - "Following Scikit-Learn, all learning components are \"estimators\"; objects that can learn from training data and then apply themselves to new data. Estimators share the following patterns:\n", + "Following scikit-learn, all learning components are \"estimators\"; objects that can learn from training data and then apply themselves to new data. Estimators share the following patterns:\n", "\n", "- a constructor that takes a list of parameters\n", "- a standard string representation that shows the class name and all non-default parameters, e.g. `LinearRegression(fit_intercept=False)`\n", diff --git a/notebooks/regression/sklearn_linear_regression.ipynb b/notebooks/regression/sklearn_linear_regression.ipynb index ec14d15cdf..2873527449 100644 --- a/notebooks/regression/sklearn_linear_regression.ipynb +++ b/notebooks/regression/sklearn_linear_regression.ipynb @@ -7,7 +7,7 @@ "source": [ "# Using ML - SKLearn linear regression\n", "\n", - "This demo shows how we can implement a linear regression in BigQuery DataFrames ML, with API that is exactly compatible with Scikit-Learn." + "This demo shows how we can implement a linear regression in BigQuery DataFrames ML, with API that is exactly compatible with scikit-learn." ] }, { diff --git a/third_party/bigframes_vendored/pandas/core/config_init.py b/third_party/bigframes_vendored/pandas/core/config_init.py index ecc103d7c8..a3178e2761 100644 --- a/third_party/bigframes_vendored/pandas/core/config_init.py +++ b/third_party/bigframes_vendored/pandas/core/config_init.py @@ -59,18 +59,18 @@ repr_mode (Literal[`head`, `deferred`]): `head`: Execute, download, and display results (limited to head) from - dataframe and series objects during repr. + Dataframe and Series objects during repr. `deferred`: - Prevent executions from repr statements in dataframe and series objects. - Instead estimated bytes processed will be shown. Dataframe and Series + Prevent executions from repr statements in DataFrame and Series objects. + Instead, estimated bytes processed will be shown. DataFrame and Series objects can still be computed with methods that explicitly execute and download results. max_info_columns (int): max_info_columns is used in DataFrame.info method to decide if - per column information will be printed. + information in each column will be printed. max_info_rows (int or None): df.info() will usually show null-counts for each column. - For large frames this can be quite slow. max_info_rows and max_info_cols + For large frames, this can be quite slow. max_info_rows and max_info_cols limit this null check only to frames with smaller dimensions than specified. 
memory_usage (bool): diff --git a/third_party/bigframes_vendored/xgboost/sklearn.py b/third_party/bigframes_vendored/xgboost/sklearn.py index 250e34dc2c..424b17a371 100644 --- a/third_party/bigframes_vendored/xgboost/sklearn.py +++ b/third_party/bigframes_vendored/xgboost/sklearn.py @@ -1,4 +1,4 @@ -"""Scikit-Learn Wrapper interface for XGBoost.""" +"""scikit-learn Wrapper interface for XGBoost.""" from typing import Any From 75dd7862e60502c97f7defe5dfefb044ea74bae8 Mon Sep 17 00:00:00 2001 From: Henry Solberg Date: Thu, 4 Apr 2024 10:40:16 -0700 Subject: [PATCH 50/53] fix: fix error in `Series.drop(0)` (#575) Due to implicit 0 non-truthfulness, 0 was getting erroneously converted to None. --- bigframes/series.py | 8 +++++--- tests/system/small/test_series.py | 14 ++++++++++---- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/bigframes/series.py b/bigframes/series.py index 42264c35b6..e4d48904b0 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -351,9 +351,11 @@ def drop( columns: Union[blocks.Label, typing.Iterable[blocks.Label]] = None, level: typing.Optional[LevelType] = None, ) -> Series: - if labels and index: - raise ValueError("Must specify exacly one of 'labels' or 'index'") - index = labels or index + if (labels is None) == (index is None): + raise ValueError("Must specify exactly one of 'labels' or 'index'") + + if labels is not None: + index = labels # ignore axis, columns params block = self._block diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index e15dbc6a3f..e350286940 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -1529,10 +1529,16 @@ def test_groupby_window_ops(scalars_df_index, scalars_pandas_df_index, operator) ) -def test_drop_label(scalars_df_index, scalars_pandas_df_index): - col_name = "int64_col" - bf_series = scalars_df_index[col_name].drop(1).to_pandas() - pd_series = scalars_pandas_df_index[col_name].drop(1) +@pytest.mark.parametrize( + ("label", "col_name"), + [ + (0, "bool_col"), + (1, "int64_col"), + ], +) +def test_drop_label(scalars_df_index, scalars_pandas_df_index, label, col_name): + bf_series = scalars_df_index[col_name].drop(label).to_pandas() + pd_series = scalars_pandas_df_index[col_name].drop(label) pd.testing.assert_series_equal( pd_series, bf_series, From d2d7e33b1f8b4e184ef3e76eedbd673a8fcee60e Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Thu, 4 Apr 2024 13:03:01 -0700 Subject: [PATCH 51/53] perf: Add multi-query execution capability for complex dataframes (#427) --- bigframes/_config/compute_options.py | 6 +- bigframes/core/blocks.py | 4 + bigframes/core/expression.py | 9 ++ bigframes/core/nodes.py | 207 ++++++++++++++++++++++++++- bigframes/core/tree_properties.py | 51 ++++++- bigframes/dataframe.py | 12 +- bigframes/series.py | 10 ++ bigframes/session/__init__.py | 52 +++++++ tests/system/conftest.py | 8 ++ tests/system/small/test_dataframe.py | 50 +++++++ 10 files changed, 403 insertions(+), 6 deletions(-) diff --git a/bigframes/_config/compute_options.py b/bigframes/_config/compute_options.py index fb708b844c..2b849c558a 100644 --- a/bigframes/_config/compute_options.py +++ b/bigframes/_config/compute_options.py @@ -40,7 +40,11 @@ class ComputeOptions: bytes billed beyond this limit will fail (without incurring a charge). If unspecified, this will be set to your project default. See `maximum_bytes_billed `_. 
- + enable_multi_query_execution (bool, Options): + If enabled, large queries may be factored into multiple smaller queries + in order to avoid generating queries that are too complex for the query + engine to handle. However this comes at the cost of increase cost and latency. """ maximum_bytes_billed: Optional[int] = None + enable_multi_query_execution: bool = False diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 0b6e50cfa3..c7b41e93eb 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -1873,6 +1873,10 @@ def cached(self, *, optimize_offsets=False, force: bool = False) -> Block: expr = self.session._cache_with_cluster_cols( self.expr, cluster_cols=self.index_columns ) + return self.swap_array_expr(expr) + + def swap_array_expr(self, expr: core.ArrayValue) -> Block: + # TODO: Validate schema unchanged return Block( expr, index_columns=self.index_columns, diff --git a/bigframes/core/expression.py b/bigframes/core/expression.py index 8c3f52d22b..4980f5369d 100644 --- a/bigframes/core/expression.py +++ b/bigframes/core/expression.py @@ -108,6 +108,11 @@ def bind_all_variables(self, bindings: Mapping[str, Expression]) -> Expression: def is_bijective(self) -> bool: return False + @property + def is_identity(self) -> bool: + """True for identity operation that does not transform input.""" + return False + @dataclasses.dataclass(frozen=True) class ScalarConstantExpression(Expression): @@ -173,6 +178,10 @@ def bind_all_variables(self, bindings: Mapping[str, Expression]) -> Expression: def is_bijective(self) -> bool: return True + @property + def is_identity(self) -> bool: + return True + @dataclasses.dataclass(frozen=True) class OpExpression(Expression): diff --git a/bigframes/core/nodes.py b/bigframes/core/nodes.py index d740605a56..a1072b0d68 100644 --- a/bigframes/core/nodes.py +++ b/bigframes/core/nodes.py @@ -15,11 +15,11 @@ from __future__ import annotations import abc -from dataclasses import dataclass, field, fields +from dataclasses import dataclass, field, fields, replace import functools import itertools import typing -from typing import Tuple +from typing import Callable, Tuple import pandas @@ -39,6 +39,10 @@ import bigframes.session +# A fixed number of variable to assume for overhead on some operations +OVERHEAD_VARIABLES = 5 + + @dataclass(frozen=True) class BigFrameNode: """ @@ -102,6 +106,60 @@ def roots(self) -> typing.Set[BigFrameNode]: def schema(self) -> schemata.ArraySchema: ... + @property + @abc.abstractmethod + def variables_introduced(self) -> int: + """ + Defines number of values created by the current node. Helps represent the "width" of a query + """ + ... + + @property + def relation_ops_created(self) -> int: + """ + Defines the number of relational ops generated by the current node. Used to estimate query planning complexity. + """ + return 1 + + @property + def joins(self) -> bool: + """ + Defines whether the node joins data. 
+ """ + return False + + @functools.cached_property + def total_variables(self) -> int: + return self.variables_introduced + sum( + map(lambda x: x.total_variables, self.child_nodes) + ) + + @functools.cached_property + def total_relational_ops(self) -> int: + return self.relation_ops_created + sum( + map(lambda x: x.total_relational_ops, self.child_nodes) + ) + + @functools.cached_property + def total_joins(self) -> int: + return int(self.joins) + sum(map(lambda x: x.total_joins, self.child_nodes)) + + @property + def planning_complexity(self) -> int: + """ + Empirical heuristic measure of planning complexity. + + Used to determine when to decompose overly complex computations. May require tuning. + """ + return self.total_variables * self.total_relational_ops * (1 + self.total_joins) + + @abc.abstractmethod + def transform_children( + self, t: Callable[[BigFrameNode], BigFrameNode] + ) -> BigFrameNode: + """Apply a function to each child node.""" + ... + @dataclass(frozen=True) class UnaryNode(BigFrameNode): @@ -115,6 +173,11 @@ def child_nodes(self) -> typing.Sequence[BigFrameNode]: def schema(self) -> schemata.ArraySchema: return self.child.schema + def transform_children( + self, t: Callable[[BigFrameNode], BigFrameNode] + ) -> BigFrameNode: + return replace(self, child=t(self.child)) + @dataclass(frozen=True) class JoinNode(BigFrameNode): @@ -154,6 +217,22 @@ def join_mapping_to_schema_item(mapping: JoinColumnMapping): ) return schemata.ArraySchema(items) + @functools.cached_property + def variables_introduced(self) -> int: + """Defines the number of variables generated by the current node. Used to estimate query planning complexity.""" + return OVERHEAD_VARIABLES + + @property + def joins(self) -> bool: + return True + + def transform_children( + self, t: Callable[[BigFrameNode], BigFrameNode] + ) -> BigFrameNode: + return replace( + self, left_child=t(self.left_child), right_child=t(self.right_child) + ) + @dataclass(frozen=True) class ConcatNode(BigFrameNode): @@ -182,6 +261,16 @@ def schema(self) -> schemata.ArraySchema: ) return schemata.ArraySchema(items) + @functools.cached_property + def variables_introduced(self) -> int: + """Defines the number of variables generated by the current node. Used to estimate query planning complexity.""" + return len(self.schema.items) + OVERHEAD_VARIABLES + + def transform_children( + self, t: Callable[[BigFrameNode], BigFrameNode] + ) -> BigFrameNode: + return replace(self, children=tuple(t(child) for child in self.children)) + # Input Nodex @dataclass(frozen=True) @@ -201,6 +290,16 @@ def roots(self) -> typing.Set[BigFrameNode]: def schema(self) -> schemata.ArraySchema: return self.data_schema + @functools.cached_property + def variables_introduced(self) -> int: + """Defines the number of variables generated by the current node. 
Used to estimate query planning complexity.""" + return len(self.schema.items) + 1 + + def transform_children( + self, t: Callable[[BigFrameNode], BigFrameNode] + ) -> BigFrameNode: + return self + # TODO: Refactor to take raw gbq object reference @dataclass(frozen=True) @@ -233,6 +332,20 @@ def schema(self) -> schemata.ArraySchema: ) return schemata.ArraySchema(items) + @functools.cached_property + def variables_introduced(self) -> int: + return len(self.columns) + len(self.hidden_ordering_columns) + + @property + def relation_ops_created(self) -> int: + # Assume worst case, where readgbq actually has baked in analytic operation to generate index + return 2 + + def transform_children( + self, t: Callable[[BigFrameNode], BigFrameNode] + ) -> BigFrameNode: + return self + # Unary nodes @dataclass(frozen=True) @@ -252,6 +365,14 @@ def schema(self) -> schemata.ArraySchema: schemata.SchemaItem(self.col_id, bigframes.dtypes.INT_DTYPE) ) + @property + def relation_ops_created(self) -> int: + return 2 + + @functools.cached_property + def variables_introduced(self) -> int: + return 1 + @dataclass(frozen=True) class FilterNode(UnaryNode): @@ -264,6 +385,10 @@ def row_preserving(self) -> bool: def __hash__(self): return self._node_hash + @property + def variables_introduced(self) -> int: + return 1 + @dataclass(frozen=True) class OrderByNode(UnaryNode): @@ -281,6 +406,15 @@ def __post_init__(self): def __hash__(self): return self._node_hash + @property + def variables_introduced(self) -> int: + return 0 + + @property + def relation_ops_created(self) -> int: + # Doesnt directly create any relational operations + return 0 + @dataclass(frozen=True) class ReversedNode(UnaryNode): @@ -290,6 +424,15 @@ class ReversedNode(UnaryNode): def __hash__(self): return self._node_hash + @property + def variables_introduced(self) -> int: + return 0 + + @property + def relation_ops_created(self) -> int: + # Doesnt directly create any relational operations + return 0 + @dataclass(frozen=True) class ProjectionNode(UnaryNode): @@ -315,6 +458,12 @@ def schema(self) -> schemata.ArraySchema: ) return schemata.ArraySchema(items) + @property + def variables_introduced(self) -> int: + # ignore passthrough expressions + new_vars = sum(1 for i in self.assignments if not i[0].is_identity) + return new_vars + # TODO: Merge RowCount into Aggregate Node? # Row count can be compute from table metadata sometimes, so it is a bit special. 
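
Editorial note on the hunks above: the per-node counters combine into `planning_complexity = total_variables * total_relational_ops * (1 + total_joins)`, and each total is a recursive sum over child nodes. A minimal standalone sketch of that roll-up, assuming a hypothetical `ToyNode` (not bigframes' actual `BigFrameNode`) with made-up counts:

    # Hedged sketch: ToyNode and its counts are hypothetical, but the roll-up
    # mirrors the planning_complexity heuristic added to nodes.py above.
    from dataclasses import dataclass
    from typing import Tuple

    @dataclass(frozen=True)
    class ToyNode:
        variables_introduced: int   # "width" added by this node
        relation_ops_created: int   # relational ops this node compiles to
        joins: bool = False
        children: Tuple["ToyNode", ...] = ()

        @property
        def total_variables(self) -> int:
            return self.variables_introduced + sum(c.total_variables for c in self.children)

        @property
        def total_relational_ops(self) -> int:
            return self.relation_ops_created + sum(c.total_relational_ops for c in self.children)

        @property
        def total_joins(self) -> int:
            return int(self.joins) + sum(c.total_joins for c in self.children)

        @property
        def planning_complexity(self) -> int:
            return self.total_variables * self.total_relational_ops * (1 + self.total_joins)

    # A 6-column scan self-joined, then a projection adding 2 columns.
    # The scan subtree is counted once per occurrence, so duplicated subtrees
    # inflate complexity quickly -- the motivation for factoring them out.
    scan = ToyNode(variables_introduced=6, relation_ops_created=2)
    join = ToyNode(variables_introduced=5, relation_ops_created=1, joins=True, children=(scan, scan))
    proj = ToyNode(variables_introduced=2, relation_ops_created=1, children=(join,))
    assert proj.planning_complexity == (6 + 6 + 5 + 2) * (2 + 2 + 1 + 1) * (1 + 1)  # 228
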
@@ -334,6 +483,10 @@ def schema(self) -> schemata.ArraySchema: (schemata.SchemaItem("count", bigframes.dtypes.INT_DTYPE),) ) + @property + def variables_introduced(self) -> int: + return 1 + @dataclass(frozen=True) class AggregateNode(UnaryNode): @@ -367,6 +520,10 @@ def schema(self) -> schemata.ArraySchema: ) return schemata.ArraySchema(tuple([*by_items, *agg_items])) + @property + def variables_introduced(self) -> int: + return len(self.aggregations) + len(self.by_column_ids) + @dataclass(frozen=True) class WindowOpNode(UnaryNode): @@ -396,12 +553,31 @@ def schema(self) -> schemata.ArraySchema: schemata.SchemaItem(self.output_name, new_item_dtype) ) + @property + def variables_introduced(self) -> int: + return 1 + + @property + def relation_ops_created(self) -> int: + # Assume that if not reprojecting, that there is a sequence of window operations sharing the same window + return 0 if self.skip_reproject_unsafe else 4 + +# TODO: Remove this op @dataclass(frozen=True) class ReprojectOpNode(UnaryNode): def __hash__(self): return self._node_hash + @property + def variables_introduced(self) -> int: + return 0 + + @property + def relation_ops_created(self) -> int: + # This op is not a real transformation, just a hint to the sql generator + return 0 + @dataclass(frozen=True) class UnpivotNode(UnaryNode): @@ -428,6 +604,10 @@ def row_preserving(self) -> bool: def non_local(self) -> bool: return True + @property + def joins(self) -> bool: + return True + @functools.cached_property def schema(self) -> schemata.ArraySchema: def infer_dtype( @@ -469,6 +649,17 @@ def infer_dtype( ] return schemata.ArraySchema((*index_items, *value_items, *passthrough_items)) + @property + def variables_introduced(self) -> int: + return ( + len(self.schema.items) - len(self.passthrough_columns) + OVERHEAD_VARIABLES + ) + + @property + def relation_ops_created(self) -> int: + # Unpivot is essentially a cross join and a projection. + return 2 + @dataclass(frozen=True) class RandomSampleNode(UnaryNode): @@ -485,6 +676,10 @@ def row_preserving(self) -> bool: def __hash__(self): return self._node_hash + @property + def variables_introduced(self) -> int: + return 1 + @dataclass(frozen=True) class ExplodeNode(UnaryNode): @@ -511,3 +706,11 @@ def schema(self) -> schemata.ArraySchema: for name in self.child.schema.names ) return schemata.ArraySchema(items) + + @property + def relation_ops_created(self) -> int: + return 3 + + @functools.cached_property + def variables_introduced(self) -> int: + return len(self.column_ids) + 1 diff --git a/bigframes/core/tree_properties.py b/bigframes/core/tree_properties.py index bc29f115f6..125a7e6bff 100644 --- a/bigframes/core/tree_properties.py +++ b/bigframes/core/tree_properties.py @@ -11,12 +11,14 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import annotations +import functools +import itertools +from typing import Dict import bigframes.core.nodes as nodes -# TODO: Convert these functions to iterative or enforce hard limit on tree depth. The below algorithms can cause stack to exceed limit. 
- def is_trivially_executable(node: nodes.BigFrameNode) -> bool: if local_only(node): @@ -36,3 +38,48 @@ def peekable(node: nodes.BigFrameNode) -> bool: children_peekable = all(peekable(child) for child in node.child_nodes) self_peekable = not node.non_local return children_peekable and self_peekable + + +def count_complex_nodes( + root: nodes.BigFrameNode, min_complexity: float, max_complexity: float +) -> Dict[nodes.BigFrameNode, int]: + @functools.cache + def _node_counts_inner( + subtree: nodes.BigFrameNode, + ) -> Dict[nodes.BigFrameNode, int]: + """Helper function to count occurences of duplicate nodes in a subtree. Considers only nodes in a complexity range""" + empty_counts: Dict[nodes.BigFrameNode, int] = {} + if subtree.planning_complexity >= min_complexity: + child_counts = [_node_counts_inner(child) for child in subtree.child_nodes] + node_counts = functools.reduce(_combine_counts, child_counts, empty_counts) + if subtree.planning_complexity <= max_complexity: + return _combine_counts(node_counts, {subtree: 1}) + else: + return node_counts + return empty_counts + + return _node_counts_inner(root) + + +def replace_nodes( + root: nodes.BigFrameNode, + to_replace: nodes.BigFrameNode, + replacemenet: nodes.BigFrameNode, +): + @functools.cache + def apply_substition(n: nodes.BigFrameNode) -> nodes.BigFrameNode: + if n == to_replace: + return replacemenet + else: + return n.transform_children(apply_substition) + + return root.transform_children(apply_substition) + + +def _combine_counts( + left: Dict[nodes.BigFrameNode, int], right: Dict[nodes.BigFrameNode, int] +) -> Dict[nodes.BigFrameNode, int]: + return { + key: left.get(key, 0) + right.get(key, 0) + for key in itertools.chain(left.keys(), right.keys()) + } diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 0bb88beb2b..460d1056a3 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -1123,7 +1123,7 @@ def to_pandas( downsampled rows and all columns of this DataFrame. """ # TODO(orrbradford): Optimize this in future. Potentially some cases where we can return the stored query job - + self._optimize_query_complexity() df, query_job = self._block.to_pandas( max_download_size=max_download_size, sampling_method=sampling_method, @@ -1135,6 +1135,7 @@ def to_pandas( def to_pandas_batches(self) -> Iterable[pandas.DataFrame]: """Stream DataFrame results to an iterable of pandas DataFrame""" + self._optimize_query_complexity() return self._block.to_pandas_batches() def _compute_dry_run(self) -> bigquery.QueryJob: @@ -3079,6 +3080,7 @@ def _run_io_query( """Executes a query job presenting this dataframe and returns the destination table.""" session = self._block.expr.session + self._optimize_query_complexity() export_array, id_overrides = self._prepare_export( index=index, ordering_id=ordering_id ) @@ -3215,6 +3217,14 @@ def _cached(self, *, force: bool = False) -> DataFrame: self._set_block(self._block.cached(force=force)) return self + def _optimize_query_complexity(self): + """Reduce query complexity by caching repeated subtrees and recursively materializing maximum-complexity subtrees. + May generate many queries and take substantial time to execute. 
+ """ + # TODO: Move all this to session + new_expr = self._session._simplify_with_caching(self._block.expr) + self._set_block(self._block.swap_array_expr(new_expr)) + _DataFrameOrSeries = typing.TypeVar("_DataFrameOrSeries") def dot(self, other: _DataFrameOrSeries) -> _DataFrameOrSeries: diff --git a/bigframes/series.py b/bigframes/series.py index e4d48904b0..185891bc01 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -176,6 +176,7 @@ def __len__(self): return self.shape[0] def __iter__(self) -> typing.Iterator: + self._optimize_query_complexity() return itertools.chain.from_iterable( map(lambda x: x.squeeze(axis=1), self._block.to_pandas_batches()) ) @@ -328,6 +329,7 @@ def to_pandas( pandas.Series: A pandas Series with all rows of this Series if the data_sampling_threshold_mb is not exceeded; otherwise, a pandas Series with downsampled rows of the DataFrame. """ + self._optimize_query_complexity() df, query_job = self._block.to_pandas( max_download_size=max_download_size, sampling_method=sampling_method, @@ -1603,6 +1605,14 @@ def _cached(self, *, force: bool = True) -> Series: self._set_block(self._block.cached(force=force)) return self + def _optimize_query_complexity(self): + """Reduce query complexity by caching repeated subtrees and recursively materializing maximum-complexity subtrees. + May generate many queries and take substantial time to execute. + """ + # TODO: Move all this to session + new_expr = self._block.session._simplify_with_caching(self._block.expr) + self._set_block(self._block.swap_array_expr(new_expr)) + def _is_list_like(obj: typing.Any) -> typing_extensions.TypeGuard[typing.Sequence]: return pandas.api.types.is_list_like(obj) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 671a3d65e7..354352f1c9 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -80,6 +80,7 @@ import bigframes.core.blocks as blocks import bigframes.core.compile import bigframes.core.guid as guid +import bigframes.core.nodes as nodes from bigframes.core.ordering import IntegerEncoding import bigframes.core.ordering as order import bigframes.core.tree_properties as traversals @@ -120,6 +121,11 @@ # Also must assume that text encoding as literals is much less efficient than in-memory representation. MAX_INLINE_DF_BYTES = 5000 +# Max complexity that should be executed as a single query +QUERY_COMPLEXITY_LIMIT = 1e7 +# Number of times to factor out subqueries before giving up. 
+MAX_SUBTREE_FACTORINGS = 5 + logger = logging.getLogger(__name__) # Excludes geography, bytes, and nested (array, struct) datatypes @@ -1851,6 +1857,52 @@ def _cache_with_offsets(self, array_value: core.ArrayValue) -> core.ArrayValue: ordering=order.ExpressionOrdering.from_offset_col("bigframes_offsets"), ) + def _simplify_with_caching(self, array_value: core.ArrayValue) -> core.ArrayValue: + """Attempts to handle the complexity by caching duplicated subtrees and breaking the query into pieces.""" + if not bigframes.options.compute.enable_multi_query_execution: + return array_value + node = array_value.node + if node.planning_complexity < QUERY_COMPLEXITY_LIMIT: + return array_value + + for _ in range(MAX_SUBTREE_FACTORINGS): + updated = self._cache_most_complex_subtree(node) + if updated is None: + return core.ArrayValue(node) + else: + node = updated + + return core.ArrayValue(node) + + def _cache_most_complex_subtree( + self, node: nodes.BigFrameNode + ) -> Optional[nodes.BigFrameNode]: + # TODO: If query fails, retry with lower complexity limit + valid_candidates = traversals.count_complex_nodes( + node, + min_complexity=(QUERY_COMPLEXITY_LIMIT / 500), + max_complexity=QUERY_COMPLEXITY_LIMIT, + ).items() + # Heuristic: subtree_compleixty * (copies of subtree)^2 + best_candidate = max( + valid_candidates, + key=lambda i: i[0].planning_complexity + (i[1] ** 2), + default=None, + ) + + if best_candidate is None: + # No good subtrees to cache, just return original tree + return None + + # TODO: Add clustering columns based on access patterns + materialized = self._cache_with_cluster_cols( + core.ArrayValue(best_candidate[0]), [] + ).node + + return traversals.replace_nodes( + node, to_replace=best_candidate[0], replacemenet=materialized + ) + def _is_trivially_executable(self, array_value: core.ArrayValue): """ Can the block be evaluated very cheaply? 
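
Editorial note on the session changes above: `_simplify_with_caching` only activates when the compute option is enabled and the plan's `planning_complexity` crosses `QUERY_COMPLEXITY_LIMIT`; candidate subtrees are then scored by their planning complexity plus the squared number of occurrences, and the best one is materialized via `_cache_with_cluster_cols`, up to `MAX_SUBTREE_FACTORINGS` times. A minimal usage sketch (hedged: the project, table, and join column are placeholders; the looped self-merge mirrors the repeated-joins test added below):

    >>> import bigframes.pandas as bpd
    >>> bpd.options.compute.enable_multi_query_execution = True
    >>> df = bpd.read_gbq("my-project.my_dataset.my_table")  # placeholder table
    >>> for _ in range(6):
    ...     df = df.merge(df, on="int64_col").head(30)  # plan complexity grows rapidly
    >>> result = df.to_pandas()  # may now be factored into several smaller queries
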
diff --git a/tests/system/conftest.py b/tests/system/conftest.py index a108ff4a8e..70ff6eee39 100644 --- a/tests/system/conftest.py +++ b/tests/system/conftest.py @@ -953,6 +953,14 @@ def restore_sampling_settings(): bigframes.options.sampling.max_download_size = max_download_size +@pytest.fixture() +def with_multiquery_execution(): + original_setting = bigframes.options.compute.enable_multi_query_execution + bigframes.options.compute.enable_multi_query_execution = True + yield + bigframes.options.compute.enable_multi_query_execution = original_setting + + @pytest.fixture() def weird_strings_pd(): df = pd.DataFrame( diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 8bcdfe168b..5d6a859c11 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -4145,6 +4145,56 @@ def test_recursion_limit(scalars_df_index): scalars_df_index.to_pandas() +def test_query_complexity_repeated_joins( + scalars_df_index, scalars_pandas_df_index, with_multiquery_execution +): + pd_df = scalars_pandas_df_index + bf_df = scalars_df_index + for _ in range(6): + # recursively join, resuling in 2^6 - 1 = 63 joins + pd_df = pd_df.merge(pd_df, on="int64_col").head(30) + pd_df = pd_df[pd_df.columns[:20]] + bf_df = bf_df.merge(bf_df, on="int64_col").head(30) + bf_df = bf_df[bf_df.columns[:20]] + + bf_result = bf_df.to_pandas() + pd_result = pd_df + assert_pandas_df_equal(bf_result, pd_result, check_index_type=False) + + +def test_query_complexity_repeated_subtrees( + scalars_df_index, scalars_pandas_df_index, with_multiquery_execution +): + # Recursively union the data, if fully inlined has 10^5 identical root tables. + pd_df = scalars_pandas_df_index + bf_df = scalars_df_index + for _ in range(5): + pd_df = pd.concat(10 * [pd_df]).head(5) + bf_df = bigframes.pandas.concat(10 * [bf_df]).head(5) + bf_result = bf_df.to_pandas() + pd_result = pd_df + assert_pandas_df_equal(bf_result, pd_result) + + +@pytest.mark.skipif( + sys.version_info >= (3, 12), + # See: https://github.com/python/cpython/issues/112282 + reason="setrecursionlimit has no effect on the Python C stack since Python 3.12.", +) +def test_query_complexity_repeated_analytic( + scalars_df_index, scalars_pandas_df_index, with_multiquery_execution +): + bf_df = scalars_df_index[["int64_col", "int64_too"]] + pd_df = scalars_pandas_df_index[["int64_col", "int64_too"]] + # Uses LAG analytic operator, each in a new SELECT + for _ in range(50): + bf_df = bf_df.diff() + pd_df = pd_df.diff() + bf_result = bf_df.to_pandas() + pd_result = pd_df + assert_pandas_df_equal(bf_result, pd_result) + + def test_to_pandas_downsampling_option_override(session): df = session.read_gbq("bigframes-dev.bigframes_tests_sys.batting") download_size = 1 From 262ff5922643039e037bd9b6c0a91b5bd20a4e08 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Thu, 4 Apr 2024 15:12:53 -0500 Subject: [PATCH 52/53] docs: add General Availability banner to README (#507) * docs: add General Availability banner to README As of 1.0.0 release, BigQuery DataFrames is available as a [Generally Available](https://github.com/googleapis/google-cloud-python/blob/main/README.rst#general-availability) Google Cloud product. 
* update release status --- README.rst | 8 ++++++++ setup.py | 2 +- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/README.rst b/README.rst index 26bbbffa88..64d1e4e72c 100644 --- a/README.rst +++ b/README.rst @@ -1,6 +1,8 @@ BigQuery DataFrames =================== +|GA| |pypi| |versions| + BigQuery DataFrames provides a Pythonic DataFrame and machine learning (ML) API powered by the BigQuery engine. @@ -10,6 +12,12 @@ powered by the BigQuery engine. BigQuery DataFrames is an open-source package. You can run ``pip install --upgrade bigframes`` to install the latest version. +.. |GA| image:: https://img.shields.io/badge/support-GA-gold.svg + :target: https://github.com/googleapis/google-cloud-python/blob/main/README.rst#general-availability +.. |pypi| image:: https://img.shields.io/pypi/v/bigframes.svg + :target: https://pypi.org/project/bigframes/ +.. |versions| image:: https://img.shields.io/pypi/pyversions/bigframes.svg + :target: https://pypi.org/project/bigframes/ Documentation ------------- diff --git a/setup.py b/setup.py index 86fb9d496c..83049f9715 100644 --- a/setup.py +++ b/setup.py @@ -30,7 +30,7 @@ # 'Development Status :: 3 - Alpha' # 'Development Status :: 4 - Beta' # 'Development Status :: 5 - Production/Stable' -release_status = "Development Status :: 3 - Alpha" +release_status = "Development Status :: 5 - Production/Stable" dependencies = [ # please keep these in sync with the minimum versions in testing/constraints-3.9.txt "cloudpickle >= 2.0.0", From 8add6b1e569c385d25d13e45afdd8ad5bd8d3294 Mon Sep 17 00:00:00 2001 From: "release-please[bot]" <55107282+release-please[bot]@users.noreply.github.com> Date: Thu, 4 Apr 2024 16:00:27 -0700 Subject: [PATCH 53/53] chore(main): release 1.1.0 (#509) Co-authored-by: release-please[bot] <55107282+release-please[bot]@users.noreply.github.com> --- CHANGELOG.md | 60 ++++++++++++++++++++++++++++++++++++++++++++ bigframes/version.py | 2 +- 2 files changed, 61 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 72d0e833bb..bcb062f08f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,66 @@ [1]: https://pypi.org/project/bigframes/#history +## [1.1.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v1.0.0...v1.1.0) (2024-04-04) + + +### Features + +* (Series|DataFrame).explode ([#556](https://github.com/googleapis/python-bigquery-dataframes/issues/556)) ([9e32f57](https://github.com/googleapis/python-bigquery-dataframes/commit/9e32f570b42c8ddae0c9b281b25beff91f0c922c)) +* Add `DataFrame.eval` and `DataFrame.query` ([#361](https://github.com/googleapis/python-bigquery-dataframes/issues/361)) ([5e28ebd](https://github.com/googleapis/python-bigquery-dataframes/commit/5e28ebd1ba3a5559e093c2ea676c0714c1434ba9)) +* Add ColumnTransformer save/load ([#541](https://github.com/googleapis/python-bigquery-dataframes/issues/541)) ([9d8cf67](https://github.com/googleapis/python-bigquery-dataframes/commit/9d8cf6792a8dbe03e03b102c454d15fcde7986af)) +* Add ml.metrics.mean_squared_error 
([#559](https://github.com/googleapis/python-bigquery-dataframes/issues/559)) ([853c25e](https://github.com/googleapis/python-bigquery-dataframes/commit/853c25e8023bf877f28cda4dade0694d0299a83e)) +* Add support for numpy expm1, log1p, floor, ceil, arctan2 ops ([#505](https://github.com/googleapis/python-bigquery-dataframes/issues/505)) ([e8e66cf](https://github.com/googleapis/python-bigquery-dataframes/commit/e8e66cf25887f64d2a7cb26081c2ef3cea10827d)) +* Add transformers save/load ([#552](https://github.com/googleapis/python-bigquery-dataframes/issues/552)) ([d805241](https://github.com/googleapis/python-bigquery-dataframes/commit/d805241b7ec99fcb7579dce778d4b04778a72002)) +* Allow DataFrame binary ops to align on either axis and with loc… ([#544](https://github.com/googleapis/python-bigquery-dataframes/issues/544)) ([6d8f3af](https://github.com/googleapis/python-bigquery-dataframes/commit/6d8f3afe28d39eb15b969f50d37c58a2c3ff1967)) +* Expose `DataFrame.bqclient` to assist in integrations ([#519](https://github.com/googleapis/python-bigquery-dataframes/issues/519)) ([0be8911](https://github.com/googleapis/python-bigquery-dataframes/commit/0be891191ed89be77494e4dcda30fb37836842ac)) +* Read_pandas accepts pandas Series and Index objects ([#573](https://github.com/googleapis/python-bigquery-dataframes/issues/573)) ([f8821fe](https://github.com/googleapis/python-bigquery-dataframes/commit/f8821fe7ecf8a80532a6aab98044fad601ff939c)) +* Support `ML.GENERATE_EMBEDDING` in `PaLM2TextEmbeddingGenerator` ([#539](https://github.com/googleapis/python-bigquery-dataframes/issues/539)) ([1156c1e](https://github.com/googleapis/python-bigquery-dataframes/commit/1156c1e3ce8c1e62898dbe68ccd6c5ab3cd4068f)) +* Support max_columns in repr and make repr more efficient ([#515](https://github.com/googleapis/python-bigquery-dataframes/issues/515)) ([54e49cf](https://github.com/googleapis/python-bigquery-dataframes/commit/54e49cff89bd329852a823cd5cf5c5b41b7f9e32)) + + +### Bug Fixes + +* Assign NaN scalar to column error. 
([#513](https://github.com/googleapis/python-bigquery-dataframes/issues/513)) ([0a4153c](https://github.com/googleapis/python-bigquery-dataframes/commit/0a4153cc71a44c09b8d691897f1e5afa58c69f25)) +* Don't download 100gb onto local python machine in load test ([#537](https://github.com/googleapis/python-bigquery-dataframes/issues/537)) ([082c58b](https://github.com/googleapis/python-bigquery-dataframes/commit/082c58bbe76821b90337dc5af0ab5fa7515682c2)) +* Exclude list-like s parameter in plot.scatter ([#568](https://github.com/googleapis/python-bigquery-dataframes/issues/568)) ([1caac27](https://github.com/googleapis/python-bigquery-dataframes/commit/1caac27fe95ef3eb36bad2ac351090891922858c)) +* Fix case where df.peek would fail to execute even with force=True ([#511](https://github.com/googleapis/python-bigquery-dataframes/issues/511)) ([8eca99a](https://github.com/googleapis/python-bigquery-dataframes/commit/8eca99a03bc4bdaccf15a979b5382f3659f2aac5)) +* Fix error in `Series.drop(0)` ([#575](https://github.com/googleapis/python-bigquery-dataframes/issues/575)) ([75dd786](https://github.com/googleapis/python-bigquery-dataframes/commit/75dd7862e60502c97f7defe5dfefb044ea74bae8)) +* Include all names in MultiIndex repr ([#564](https://github.com/googleapis/python-bigquery-dataframes/issues/564)) ([b188146](https://github.com/googleapis/python-bigquery-dataframes/commit/b188146466780e6f7a041f51f5be51a7d60719c9)) +* Plot.scatter s parameter cannot accept float-like column ([#563](https://github.com/googleapis/python-bigquery-dataframes/issues/563)) ([8d39187](https://github.com/googleapis/python-bigquery-dataframes/commit/8d3918761a17649180aa806d7b01aa103f69b4fe)) +* Product operation produces float result for all input types ([#501](https://github.com/googleapis/python-bigquery-dataframes/issues/501)) ([6873b30](https://github.com/googleapis/python-bigquery-dataframes/commit/6873b30b691a11a368308825a72013d8ec1408ed)) +* Reloaded transformer .transform error ([#569](https://github.com/googleapis/python-bigquery-dataframes/issues/569)) ([39fe474](https://github.com/googleapis/python-bigquery-dataframes/commit/39fe47451d24a8cf55d7dbb15c6d3b176d25ab18)) +* Rename PaLM2TextEmbeddingGenerator.predict output columns to be backward compatible ([#561](https://github.com/googleapis/python-bigquery-dataframes/issues/561)) ([4995c00](https://github.com/googleapis/python-bigquery-dataframes/commit/4995c0046265463bc5c502cbeb34c7632d5a255e)) +* Respect hard stack size limit and swallow limit change exception. 
([#558](https://github.com/googleapis/python-bigquery-dataframes/issues/558)) ([4833908](https://github.com/googleapis/python-bigquery-dataframes/commit/483390830ae0ee2fe0fb47dc7d2aea143b2dc7d8)) +* Restore string to date/time type coercion ([#565](https://github.com/googleapis/python-bigquery-dataframes/issues/565)) ([4ae0262](https://github.com/googleapis/python-bigquery-dataframes/commit/4ae0262a2b1dfc35c1e4c3392b9e21456d6e964e)) +* Sync the notebook with embedding changes ([#550](https://github.com/googleapis/python-bigquery-dataframes/issues/550)) ([347f2dd](https://github.com/googleapis/python-bigquery-dataframes/commit/347f2dda2298e17cd44a298f04a723f2d20c080a)) +* Use bytes limit on frame inlining rather than element count ([#576](https://github.com/googleapis/python-bigquery-dataframes/issues/576)) ([659a161](https://github.com/googleapis/python-bigquery-dataframes/commit/659a161a53e93f66334cd04d1c3dc1f1f47ecc16)) + + +### Performance Improvements + +* Add multi-query execution capability for complex dataframes ([#427](https://github.com/googleapis/python-bigquery-dataframes/issues/427)) ([d2d7e33](https://github.com/googleapis/python-bigquery-dataframes/commit/d2d7e33b1f8b4e184ef3e76eedbd673a8fcee60e)) + + +### Dependencies + +* Include `pyarrow` as a dependency ([#529](https://github.com/googleapis/python-bigquery-dataframes/issues/529)) ([9b1525a](https://github.com/googleapis/python-bigquery-dataframes/commit/9b1525a0c359455160bfbc0dc1366e37982ad01f)) + + +### Documentation + +* `bigframes.options.bigquery.project` and `location` are optional in some circumstances ([#548](https://github.com/googleapis/python-bigquery-dataframes/issues/548)) ([90bcec5](https://github.com/googleapis/python-bigquery-dataframes/commit/90bcec5c73f7eefeff14bbd8bdcad3a4c9d91d8f)) +* Add "Supported pandas APIs" reference to the documentation ([#542](https://github.com/googleapis/python-bigquery-dataframes/issues/542)) ([74c3915](https://github.com/googleapis/python-bigquery-dataframes/commit/74c391586280b55c35d66c697167122d72c13386)) +* Add General Availability banner to README ([#507](https://github.com/googleapis/python-bigquery-dataframes/issues/507)) ([262ff59](https://github.com/googleapis/python-bigquery-dataframes/commit/262ff5922643039e037bd9b6c0a91b5bd20a4e08)) +* Add opeartions in API docs ([#557](https://github.com/googleapis/python-bigquery-dataframes/issues/557)) ([ea95761](https://github.com/googleapis/python-bigquery-dataframes/commit/ea9576125d46f3912372f75ebe51196ba83e96db)) +* Add progress_bar code sample ([#508](https://github.com/googleapis/python-bigquery-dataframes/issues/508)) ([92a1af3](https://github.com/googleapis/python-bigquery-dataframes/commit/92a1af35b8de4afb6cdb5b5e89facdceb5c151d2)) +* Add the code samples for metrics{auc, roc_auc_score, roc_curve} 
([#520](https://github.com/googleapis/python-bigquery-dataframes/issues/520)) ([5f37b09](https://github.com/googleapis/python-bigquery-dataframes/commit/5f37b0902fae2c099207acf3ce2e251c09ac889d)) +* Address more comments from technical writers to meet legal purposes ([#571](https://github.com/googleapis/python-bigquery-dataframes/issues/571)) ([9084df3](https://github.com/googleapis/python-bigquery-dataframes/commit/9084df369bc6819edf5f57ceba85667a14371ac5)) +* Fix docs of ARIMAPlus.predict ([#512](https://github.com/googleapis/python-bigquery-dataframes/issues/512)) ([3b80f95](https://github.com/googleapis/python-bigquery-dataframes/commit/3b80f956755c9d7043138aab6e5687cba50be8cb)) +* Include Index in table-of-contents ([#564](https://github.com/googleapis/python-bigquery-dataframes/issues/564)) ([b188146](https://github.com/googleapis/python-bigquery-dataframes/commit/b188146466780e6f7a041f51f5be51a7d60719c9)) +* Mark Gemini model as Pre-GA ([#543](https://github.com/googleapis/python-bigquery-dataframes/issues/543)) ([769868b](https://github.com/googleapis/python-bigquery-dataframes/commit/769868b9fc7dfff2e7b1ed5cec52a5dd3dfd6ff2)) +* Migrate the overview page to Bigframes official landing page ([#536](https://github.com/googleapis/python-bigquery-dataframes/issues/536)) ([a0fb8bb](https://github.com/googleapis/python-bigquery-dataframes/commit/a0fb8bbfddd07f1e0ef03eeb4be653d1e9f06772)) + ## [1.0.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v0.26.0...v1.0.0) (2024-03-25) diff --git a/bigframes/version.py b/bigframes/version.py index 8e31592250..41a3895549 100644 --- a/bigframes/version.py +++ b/bigframes/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "1.0.0" +__version__ = "1.1.0"