Skip to content

Commit e2cf50e

Browse files
authored
deps: support pandas 2.2 (#492)
* deps: support pandas 2.2 * fix tests and loading multiindex * fix doctests * avoid circular imports * missing import * allow pandas in prerelease * avoid table.execute in ibis tests * update notebook to use native plotting
1 parent 10c0446 commit e2cf50e

File tree

19 files changed

+130
-84
lines changed

19 files changed

+130
-84
lines changed

bigframes/core/joins/merge.py

+18-10
Original file line numberDiff line numberDiff line change
@@ -18,15 +18,18 @@
1818

1919
from __future__ import annotations
2020

21+
import typing
2122
from typing import Literal, Optional
2223

23-
from bigframes.dataframe import DataFrame
24-
from bigframes.series import Series
24+
# Avoid circular imports.
25+
if typing.TYPE_CHECKING:
26+
import bigframes.dataframe
27+
import bigframes.series
2528

2629

2730
def merge(
28-
left: DataFrame,
29-
right: DataFrame,
31+
left: bigframes.dataframe.DataFrame,
32+
right: bigframes.dataframe.DataFrame,
3033
how: Literal[
3134
"inner",
3235
"left",
@@ -40,7 +43,7 @@ def merge(
4043
right_on: Optional[str] = None,
4144
sort: bool = False,
4245
suffixes: tuple[str, str] = ("_x", "_y"),
43-
) -> DataFrame:
46+
) -> bigframes.dataframe.DataFrame:
4447
left = _validate_operand(left)
4548
right = _validate_operand(right)
4649

@@ -55,14 +58,19 @@ def merge(
5558
)
5659

5760

58-
def _validate_operand(obj: DataFrame | Series) -> DataFrame:
59-
if isinstance(obj, DataFrame):
61+
def _validate_operand(
62+
obj: bigframes.dataframe.DataFrame | bigframes.series.Series,
63+
) -> bigframes.dataframe.DataFrame:
64+
import bigframes.dataframe
65+
import bigframes.series
66+
67+
if isinstance(obj, bigframes.dataframe.DataFrame):
6068
return obj
61-
elif isinstance(obj, Series):
69+
elif isinstance(obj, bigframes.series.Series):
6270
if obj.name is None:
63-
raise ValueError("Cannot merge a Series without a name")
71+
raise ValueError("Cannot merge a bigframes.series.Series without a name")
6472
return obj.to_frame()
6573
else:
6674
raise TypeError(
67-
f"Can only merge Series or DataFrame objects, a {type(obj)} was passed"
75+
f"Can only merge bigframes.series.Series or bigframes.dataframe.DataFrame objects, a {type(obj)} was passed"
6876
)

bigframes/core/utils.py

+10-3
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,9 @@ def split_index(
7070

7171

7272
def get_standardized_ids(
73-
col_labels: Iterable[Hashable], idx_labels: Iterable[Hashable] = ()
73+
col_labels: Iterable[Hashable],
74+
idx_labels: Iterable[Hashable] = (),
75+
strict: bool = False,
7476
) -> tuple[list[str], list[str]]:
7577
"""Get stardardized column ids as column_ids_list, index_ids_list.
7678
The standardized_column_id must be valid BQ SQL schema column names, can only be string type and unique.
@@ -84,11 +86,15 @@ def get_standardized_ids(
8486
Tuple of (standardized_column_ids, standardized_index_ids)
8587
"""
8688
col_ids = [
87-
UNNAMED_COLUMN_ID if col_label is None else label_to_identifier(col_label)
89+
UNNAMED_COLUMN_ID
90+
if col_label is None
91+
else label_to_identifier(col_label, strict=strict)
8892
for col_label in col_labels
8993
]
9094
idx_ids = [
91-
UNNAMED_INDEX_ID if idx_label is None else label_to_identifier(idx_label)
95+
UNNAMED_INDEX_ID
96+
if idx_label is None
97+
else label_to_identifier(idx_label, strict=strict)
9298
for idx_label in idx_labels
9399
]
94100

@@ -107,6 +113,7 @@ def label_to_identifier(label: typing.Hashable, strict: bool = False) -> str:
107113
# Column values will be loaded as null if the column name has spaces.
108114
# https://github.com/googleapis/python-bigquery/issues/1566
109115
identifier = str(label).replace(" ", "_")
116+
110117
if strict:
111118
identifier = re.sub(r"[^a-zA-Z0-9_]", "", identifier)
112119
if not identifier:

bigframes/dataframe.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -70,10 +70,10 @@
7070
if typing.TYPE_CHECKING:
7171
import bigframes.session
7272

73+
SingleItemValue = Union[bigframes.series.Series, int, float, Callable]
7374

7475
LevelType = typing.Hashable
7576
LevelsType = typing.Union[LevelType, typing.Sequence[LevelType]]
76-
SingleItemValue = Union[bigframes.series.Series, int, float, Callable]
7777

7878
ERROR_IO_ONLY_GS_PATHS = f"Only Google Cloud Storage (gs://...) paths are supported. {constants.FEEDBACK_LINK}"
7979
ERROR_IO_REQUIRES_WILDCARD = (

bigframes/session/__init__.py

+25-5
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,6 @@
8484
import bigframes.core.ordering as order
8585
import bigframes.core.traversal as traversals
8686
import bigframes.core.utils as utils
87-
import bigframes.dataframe as dataframe
8887
import bigframes.dtypes
8988
import bigframes.formatting_helpers as formatting_helpers
9089
from bigframes.functions.remote_function import read_gbq_function as bigframes_rgf
@@ -93,6 +92,10 @@
9392
import bigframes.session.clients
9493
import bigframes.version
9594

95+
# Avoid circular imports.
96+
if typing.TYPE_CHECKING:
97+
import bigframes.dataframe as dataframe
98+
9699
_BIGFRAMES_DEFAULT_CONNECTION_ID = "bigframes-default-connection"
97100

98101
_MAX_CLUSTER_COLUMNS = 4
@@ -557,6 +560,8 @@ def _read_gbq_query(
557560
api_name: str = "read_gbq_query",
558561
use_cache: Optional[bool] = None,
559562
) -> dataframe.DataFrame:
563+
import bigframes.dataframe as dataframe
564+
560565
configuration = _transform_read_gbq_configuration(configuration)
561566

562567
if "query" not in configuration:
@@ -754,6 +759,8 @@ def _read_gbq_table(
754759
api_name: str,
755760
use_cache: bool = True,
756761
) -> dataframe.DataFrame:
762+
import bigframes.dataframe as dataframe
763+
757764
if max_results and max_results <= 0:
758765
raise ValueError("`max_results` should be a positive number.")
759766

@@ -989,6 +996,8 @@ def read_pandas(self, pandas_dataframe: pandas.DataFrame) -> dataframe.DataFrame
989996
def _read_pandas(
990997
self, pandas_dataframe: pandas.DataFrame, api_name: str
991998
) -> dataframe.DataFrame:
999+
import bigframes.dataframe as dataframe
1000+
9921001
if isinstance(pandas_dataframe, dataframe.DataFrame):
9931002
raise ValueError(
9941003
"read_pandas() expects a pandas.DataFrame, but got a "
@@ -1003,6 +1012,8 @@ def _read_pandas(
10031012
def _read_pandas_inline(
10041013
self, pandas_dataframe: pandas.DataFrame
10051014
) -> Optional[dataframe.DataFrame]:
1015+
import bigframes.dataframe as dataframe
1016+
10061017
if pandas_dataframe.size > MAX_INLINE_DF_SIZE:
10071018
return None
10081019

@@ -1024,11 +1035,20 @@ def _read_pandas_inline(
10241035
def _read_pandas_load_job(
10251036
self, pandas_dataframe: pandas.DataFrame, api_name: str
10261037
) -> dataframe.DataFrame:
1038+
import bigframes.dataframe as dataframe
1039+
1040+
col_index = pandas_dataframe.columns.copy()
10271041
col_labels, idx_labels = (
1028-
pandas_dataframe.columns.to_list(),
1042+
col_index.to_list(),
10291043
pandas_dataframe.index.names,
10301044
)
1031-
new_col_ids, new_idx_ids = utils.get_standardized_ids(col_labels, idx_labels)
1045+
new_col_ids, new_idx_ids = utils.get_standardized_ids(
1046+
col_labels,
1047+
idx_labels,
1048+
# Loading parquet files into BigQuery with special column names
1049+
# is only supported under an allowlist.
1050+
strict=True,
1051+
)
10321052

10331053
# Add order column to pandas DataFrame to preserve order in BigQuery
10341054
ordering_col = "rowid"
@@ -1047,7 +1067,7 @@ def _read_pandas_load_job(
10471067

10481068
# Specify the datetime dtypes, which is auto-detected as timestamp types.
10491069
schema: list[bigquery.SchemaField] = []
1050-
for column, dtype in zip(pandas_dataframe.columns, pandas_dataframe.dtypes):
1070+
for column, dtype in zip(new_col_ids, pandas_dataframe.dtypes):
10511071
if dtype == "timestamp[us][pyarrow]":
10521072
schema.append(
10531073
bigquery.SchemaField(column, bigquery.enums.SqlTypeNames.DATETIME)
@@ -1101,7 +1121,7 @@ def _read_pandas_load_job(
11011121
block = blocks.Block(
11021122
array_value,
11031123
index_columns=new_idx_ids,
1104-
column_labels=col_labels,
1124+
column_labels=col_index,
11051125
index_labels=idx_labels,
11061126
)
11071127
return dataframe.DataFrame(block)

bigframes/session/_io/bigquery.py

+1-2
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,6 @@
2929
import bigframes
3030
from bigframes.core import log_adapter
3131
import bigframes.formatting_helpers as formatting_helpers
32-
import bigframes.session._io.bigquery as bigframes_io
3332

3433
IO_ORDERING_ID = "bqdf_row_nums"
3534
MAX_LABELS_COUNT = 64
@@ -226,7 +225,7 @@ def start_query_with_client(
226225
Starts query job and waits for results.
227226
"""
228227
api_methods = log_adapter.get_and_reset_api_methods()
229-
job_config.labels = bigframes_io.create_job_configs_labels(
228+
job_config.labels = create_job_configs_labels(
230229
job_configs_labels=job_config.labels, api_methods=api_methods
231230
)
232231

notebooks/visualization/bq_dataframes_covid_line_graphs.ipynb

+14-23
Large diffs are not rendered by default.

noxfile.py

+2-13
Original file line numberDiff line numberDiff line change
@@ -556,22 +556,11 @@ def prerelease(session: nox.sessions.Session, tests_path):
556556
"--prefer-binary",
557557
"--pre",
558558
"--upgrade",
559-
# TODO(shobs): Remove excluding version 2.1.4 after
560-
# https://github.com/pandas-dev/pandas/issues/56463 is resolved.
561-
#
562-
# TODO(shobs): Remove excluding version 2.2.0rc0 after
563-
# https://github.com/pandas-dev/pandas/issues/56646 and
564-
# https://github.com/pandas-dev/pandas/issues/56651 are resolved.
565-
#
566-
# TODO(shobs): Remove excluding version 2.2.0 after
567-
# https://github.com/googleapis/python-bigquery-dataframes/issues/341
568-
# https://github.com/googleapis/python-bigquery-dataframes/issues/337
569-
# are resolved
570-
#
571559
# We exclude each version individually so that we can continue to test
572560
# some prerelease packages. See:
573561
# https://github.com/googleapis/python-bigquery-dataframes/pull/268#discussion_r1423205172
574-
"pandas!=2.1.4, !=2.2.0rc0, !=2.2.0, !=2.2.1",
562+
# "pandas!=2.1.4, !=2.2.0rc0, !=2.2.0, !=2.2.1",
563+
"pandas",
575564
)
576565
already_installed.add("pandas")
577566

setup.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@
4646
"google-cloud-storage >=2.0.0",
4747
"ibis-framework[bigquery] >=8.0.0,<9.0.0dev",
4848
# TODO: Relax upper bound once we have fixed `system_prerelease` tests.
49-
"pandas >=1.5.0,<2.1.4",
49+
"pandas >=1.5.0",
5050
"pydata-google-auth >=1.8.2",
5151
"requests >=2.27.1",
5252
"scikit-learn >=1.2.2",

tests/system/conftest.py

+1
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535
import test_utils.prefixer
3636

3737
import bigframes
38+
import bigframes.dataframe
3839
import tests.system.utils
3940

4041
# Use this to control the number of cloud functions being deleted in a single

tests/system/large/test_remote_function.py

+14-8
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,7 @@ def bq_cf_connection() -> str:
118118
def test_remote_function_multiply_with_ibis(
119119
session,
120120
scalars_table_id,
121+
bigquery_client,
121122
ibis_client,
122123
dataset_id,
123124
bq_cf_connection,
@@ -134,20 +135,22 @@ def test_remote_function_multiply_with_ibis(
134135
def multiply(x, y):
135136
return x * y
136137

137-
project_id, dataset_name, table_name = scalars_table_id.split(".")
138+
_, dataset_name, table_name = scalars_table_id.split(".")
138139
if not ibis_client.dataset:
139140
ibis_client.dataset = dataset_name
140141

141142
col_name = "int64_col"
142143
table = ibis_client.tables[table_name]
143144
table = table.filter(table[col_name].notnull()).order_by("rowindex").head(10)
144-
pandas_df_orig = table.execute()
145+
sql = table.compile()
146+
pandas_df_orig = bigquery_client.query(sql).to_dataframe()
145147

146148
col = table[col_name]
147149
col_2x = multiply(col, 2).name("int64_col_2x")
148150
col_square = multiply(col, col).name("int64_col_square")
149151
table = table.mutate([col_2x, col_square])
150-
pandas_df_new = table.execute()
152+
sql = table.compile()
153+
pandas_df_new = bigquery_client.query(sql).to_dataframe()
151154

152155
pandas.testing.assert_series_equal(
153156
pandas_df_orig[col_name] * 2,
@@ -163,14 +166,15 @@ def multiply(x, y):
163166
finally:
164167
# clean up the gcp assets created for the remote function
165168
cleanup_remote_function_assets(
166-
session.bqclient, session.cloudfunctionsclient, multiply
169+
bigquery_client, session.cloudfunctionsclient, multiply
167170
)
168171

169172

170173
@pytest.mark.flaky(retries=2, delay=120)
171174
def test_remote_function_stringify_with_ibis(
172175
session,
173176
scalars_table_id,
177+
bigquery_client,
174178
ibis_client,
175179
dataset_id,
176180
bq_cf_connection,
@@ -187,19 +191,21 @@ def test_remote_function_stringify_with_ibis(
187191
def stringify(x):
188192
return f"I got {x}"
189193

190-
project_id, dataset_name, table_name = scalars_table_id.split(".")
194+
_, dataset_name, table_name = scalars_table_id.split(".")
191195
if not ibis_client.dataset:
192196
ibis_client.dataset = dataset_name
193197

194198
col_name = "int64_col"
195199
table = ibis_client.tables[table_name]
196200
table = table.filter(table[col_name].notnull()).order_by("rowindex").head(10)
197-
pandas_df_orig = table.execute()
201+
sql = table.compile()
202+
pandas_df_orig = bigquery_client.query(sql).to_dataframe()
198203

199204
col = table[col_name]
200205
col_2x = stringify(col).name("int64_str_col")
201206
table = table.mutate([col_2x])
202-
pandas_df_new = table.execute()
207+
sql = table.compile()
208+
pandas_df_new = bigquery_client.query(sql).to_dataframe()
203209

204210
pandas.testing.assert_series_equal(
205211
pandas_df_orig[col_name].apply(lambda x: f"I got {x}"),
@@ -209,7 +215,7 @@ def stringify(x):
209215
finally:
210216
# clean up the gcp assets created for the remote function
211217
cleanup_remote_function_assets(
212-
session.bqclient, session.cloudfunctionsclient, stringify
218+
bigquery_client, session.cloudfunctionsclient, stringify
213219
)
214220

215221

tests/system/small/test_dataframe.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -2500,6 +2500,8 @@ def test_df_pivot(scalars_dfs, values, index, columns):
25002500
pd_result = scalars_pandas_df.pivot(values=values, index=index, columns=columns)
25012501

25022502
# Pandas produces NaN, where bq dataframes produces pd.NA
2503+
bf_result = bf_result.fillna(float("nan"))
2504+
pd_result = pd_result.fillna(float("nan"))
25032505
pd.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False)
25042506

25052507

@@ -4026,7 +4028,7 @@ def test_to_pandas_downsampling_option_override(session):
40264028

40274029
total_memory_bytes = df.memory_usage(deep=True).sum()
40284030
total_memory_mb = total_memory_bytes / (1024 * 1024)
4029-
assert total_memory_mb == pytest.approx(download_size, rel=0.3)
4031+
assert total_memory_mb == pytest.approx(download_size, rel=0.5)
40304032

40314033

40324034
def test_to_gbq_and_create_dataset(session, scalars_df_index, dataset_id_not_created):

tests/system/small/test_groupby.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -228,7 +228,9 @@ def test_dataframe_groupby_multi_sum(
228228
(lambda x: x.cumsum(numeric_only=True)),
229229
(lambda x: x.cummax(numeric_only=True)),
230230
(lambda x: x.cummin(numeric_only=True)),
231-
(lambda x: x.cumprod()),
231+
# pandas 2.2 uses floating point for cumulative product even for
232+
# integer inputs.
233+
(lambda x: x.cumprod().astype("Float64")),
232234
(lambda x: x.shift(periods=2)),
233235
],
234236
ids=[

tests/system/small/test_multiindex.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -45,8 +45,9 @@ def test_read_pandas_multi_index_axes():
4545
[[1, 2], [3, 4]], index=index, columns=columns, dtype=pandas.Int64Dtype()
4646
)
4747
bf_df = bpd.DataFrame(pandas_df)
48+
bf_df_computed = bf_df.to_pandas()
4849

49-
pandas.testing.assert_frame_equal(bf_df.to_pandas(), pandas_df)
50+
pandas.testing.assert_frame_equal(bf_df_computed, pandas_df)
5051

5152

5253
# Row Multi-index tests

0 commit comments

Comments
 (0)