fix: Use bytes limit on frame inlining rather than element count (#576)

TrevorBergeron · web-flow · commit 659a161a53e9 · 2024-04-04T11:36:41.000-05:00
diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py
@@ -116,9 +116,9 @@
     "UTF-32LE",
 }
 
-# BigQuery has 1 MB query size limit, 5000 items shouldn't take more than 10% of this depending on data type.
-# TODO(tbergeron): Convert to bytes-based limit
-MAX_INLINE_DF_SIZE = 5000
+# BigQuery has a 1 MB query size limit; inlining a table should use no more than a few percent of it.
+# Keep the limit conservative: values encoded as SQL text literals take much more space than in memory.
+MAX_INLINE_DF_BYTES = 5000
 
 logger = logging.getLogger(__name__)
 
@@ -1051,7 +1051,7 @@ def _read_pandas_inline(
     ) -> Optional[dataframe.DataFrame]:
         import bigframes.dataframe as dataframe
 
-        if pandas_dataframe.size > MAX_INLINE_DF_SIZE:
+        if pandas_dataframe.memory_usage(deep=True).sum() > MAX_INLINE_DF_BYTES:
             return None
 
         try:
diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py
@@ -66,6 +66,13 @@ def test_df_construct_pandas_default(scalars_dfs):
     pandas.testing.assert_frame_equal(bf_result, pd_result)
 
 
+def test_df_construct_large_strings():
+    data = [["hello", "w" + "o" * 50000 + "rld"]]  # ~50 KB cell, over the inline limit
+    bf_result = dataframe.DataFrame(data).to_pandas()
+    pd_result = pd.DataFrame(data, dtype=pd.StringDtype(storage="pyarrow"))
+    pandas.testing.assert_frame_equal(bf_result, pd_result, check_index_type=False)
+
+
 def test_df_construct_pandas_load_job(scalars_dfs):
     # This should trigger the inlined codepath
     columns = [
diff --git a/tests/system/small/test_progress_bar.py b/tests/system/small/test_progress_bar.py
@@ -20,7 +20,7 @@
 
 import bigframes as bf
 import bigframes.formatting_helpers as formatting_helpers
-from bigframes.session import MAX_INLINE_DF_SIZE
+from bigframes.session import MAX_INLINE_DF_BYTES
 
 job_load_message_regex = r"\w+ job [\w-]+ is \w+\."
 
@@ -70,7 +70,7 @@ def test_progress_bar_load_jobs(
 ):
     # repeat the DF to be big enough to trigger the load job.
     df = penguins_pandas_df_default_index
-    while len(df) < MAX_INLINE_DF_SIZE:
+    while len(df) < MAX_INLINE_DF_BYTES:
         df = pd.DataFrame(np.repeat(df.values, 2, axis=0))
 
     bf.options.display.progress_bar = "terminal"