File tree 3 files changed +13
-6
lines changed
3 files changed +13
-6
lines changed Original file line number Diff line number Diff line change 116
116
"UTF-32LE" ,
117
117
}
118
118
119
- # BigQuery has 1 MB query size limit, 5000 items shouldn't take more than 10% of this depending on data type.
120
- # TODO(tbergeron): Convert to bytes-based limit
121
- MAX_INLINE_DF_SIZE = 5000
119
+ # BigQuery has 1 MB query size limit. Don't want to take up more than a few % of that inlining a table.
120
+ # Also must assume that text encoding as literals is much less efficient than in-memory representation.
121
+ MAX_INLINE_DF_BYTES = 5000
122
122
123
123
logger = logging .getLogger (__name__ )
124
124
@@ -1051,7 +1051,7 @@ def _read_pandas_inline(
1051
1051
) -> Optional [dataframe .DataFrame ]:
1052
1052
import bigframes .dataframe as dataframe
1053
1053
1054
- if pandas_dataframe .size > MAX_INLINE_DF_SIZE :
1054
+ if pandas_dataframe .memory_usage ( deep = True ). sum () > MAX_INLINE_DF_BYTES :
1055
1055
return None
1056
1056
1057
1057
try :
Original file line number Diff line number Diff line change @@ -66,6 +66,13 @@ def test_df_construct_pandas_default(scalars_dfs):
66
66
pandas .testing .assert_frame_equal (bf_result , pd_result )
67
67
68
68
69
+ def test_df_construct_large_strings ():
70
+ data = [["hello" , "w" + "o" * 50000 + "rld" ]]
71
+ bf_result = dataframe .DataFrame (data ).to_pandas ()
72
+ pd_result = pd .DataFrame (data , dtype = pd .StringDtype (storage = "pyarrow" ))
73
+ pandas .testing .assert_frame_equal (bf_result , pd_result , check_index_type = False )
74
+
75
+
69
76
def test_df_construct_pandas_load_job (scalars_dfs ):
70
77
# This should trigger the inlined codepath
71
78
columns = [
Original file line number Diff line number Diff line change 20
20
21
21
import bigframes as bf
22
22
import bigframes .formatting_helpers as formatting_helpers
23
- from bigframes .session import MAX_INLINE_DF_SIZE
23
+ from bigframes .session import MAX_INLINE_DF_BYTES
24
24
25
25
job_load_message_regex = r"\w+ job [\w-]+ is \w+\."
26
26
@@ -70,7 +70,7 @@ def test_progress_bar_load_jobs(
70
70
):
71
71
# repeat the DF to be big enough to trigger the load job.
72
72
df = penguins_pandas_df_default_index
73
- while len (df ) < MAX_INLINE_DF_SIZE :
73
+ while len (df ) < MAX_INLINE_DF_BYTES :
74
74
df = pd .DataFrame (np .repeat (df .values , 2 , axis = 0 ))
75
75
76
76
bf .options .display .progress_bar = "terminal"
You can’t perform that action at this time.
0 commit comments