Skip to content

Commit 659a161

Browse files
fix: Use bytes limit on frame inlining rather than element count (#576)
1 parent 6d8f3af commit 659a161

File tree

3 files changed

+13
-6
lines changed

3 files changed

+13
-6
lines changed

bigframes/session/__init__.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -116,9 +116,9 @@
116116
"UTF-32LE",
117117
}
118118

119-
# BigQuery has 1 MB query size limit, 5000 items shouldn't take more than 10% of this depending on data type.
120-
# TODO(tbergeron): Convert to bytes-based limit
121-
MAX_INLINE_DF_SIZE = 5000
119+
# BigQuery has a 1 MB query size limit. Inlining a table should not take up more than a few percent of that.
120+
# We must also assume that encoding the data as text literals in the query is much less efficient than its in-memory representation.
121+
MAX_INLINE_DF_BYTES = 5000
122122

123123
logger = logging.getLogger(__name__)
124124

@@ -1051,7 +1051,7 @@ def _read_pandas_inline(
10511051
) -> Optional[dataframe.DataFrame]:
10521052
import bigframes.dataframe as dataframe
10531053

1054-
if pandas_dataframe.size > MAX_INLINE_DF_SIZE:
1054+
if pandas_dataframe.memory_usage(deep=True).sum() > MAX_INLINE_DF_BYTES:
10551055
return None
10561056

10571057
try:

tests/system/small/test_dataframe.py

+7
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,13 @@ def test_df_construct_pandas_default(scalars_dfs):
6666
pandas.testing.assert_frame_equal(bf_result, pd_result)
6767

6868

69+
def test_df_construct_large_strings():
70+
data = [["hello", "w" + "o" * 50000 + "rld"]]
71+
bf_result = dataframe.DataFrame(data).to_pandas()
72+
pd_result = pd.DataFrame(data, dtype=pd.StringDtype(storage="pyarrow"))
73+
pandas.testing.assert_frame_equal(bf_result, pd_result, check_index_type=False)
74+
75+
6976
def test_df_construct_pandas_load_job(scalars_dfs):
7077
# This should trigger the inlined codepath
7178
columns = [

tests/system/small/test_progress_bar.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020

2121
import bigframes as bf
2222
import bigframes.formatting_helpers as formatting_helpers
23-
from bigframes.session import MAX_INLINE_DF_SIZE
23+
from bigframes.session import MAX_INLINE_DF_BYTES
2424

2525
job_load_message_regex = r"\w+ job [\w-]+ is \w+\."
2626

@@ -70,7 +70,7 @@ def test_progress_bar_load_jobs(
7070
):
7171
# Repeat the DataFrame until it is big enough to trigger the load job.
7272
df = penguins_pandas_df_default_index
73-
while len(df) < MAX_INLINE_DF_SIZE:
73+
while len(df) < MAX_INLINE_DF_BYTES:
7474
df = pd.DataFrame(np.repeat(df.values, 2, axis=0))
7575

7676
bf.options.display.progress_bar = "terminal"

0 commit comments

Comments
 (0)