Skip to content
This repository was archived by the owner on May 7, 2026. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
59 commits
Select commit Hold shift + click to select a range
992dc21
change to ai.generate
shuoweil Oct 7, 2025
74e042a
perf: Default to interactive display for SQL in anywidget mode
shuoweil Oct 4, 2025
074d4c2
fix: resolve double printing issue in anywidget mode
shuoweil Oct 4, 2025
982ea97
feat: Add test case for STRUCT column in anywidget
shuoweil Oct 7, 2025
a9116c7
fix presubmit
shuoweil Oct 9, 2025
f0992c6
Revert accidental changes to test_function.py
shuoweil Oct 15, 2025
3aefdbf
revert accidental change to blob.py
shuoweil Oct 15, 2025
7d4cfdf
change return type
shuoweil Oct 15, 2025
a951810
add todo and revert change
shuoweil Oct 20, 2025
89521d2
Revert "add todo and revert change"
shuoweil Oct 20, 2025
1c155d0
Add todo
shuoweil Oct 20, 2025
86cb692
Fix: Handle JSON dtype in anywidget display
shuoweil Oct 21, 2025
81013c6
revert a change
shuoweil Oct 21, 2025
6ea7281
revert a change
shuoweil Oct 21, 2025
63b7918
Revert: Restore bigframes/dataframe.py to state from 42da847
shuoweil Oct 21, 2025
4aa9879
remove anywidget from early return, allow execution proceeds to _repr…
shuoweil Oct 21, 2025
62d8608
remove unnecessary changes
shuoweil Oct 21, 2025
24d766d
remove redundant code change
shuoweil Oct 21, 2025
9239f20
code style change
shuoweil Oct 21, 2025
48d6c66
testcase update
shuoweil Oct 21, 2025
4cb8cd2
revert a change
shuoweil Oct 21, 2025
75a6d68
final touch of notebook
shuoweil Oct 21, 2025
8dc2171
fix presubmit error
shuoweil Oct 21, 2025
2adc426
remove invalid test with anywidget bug fix
shuoweil Oct 21, 2025
faf1bb2
fix presubmit
shuoweil Oct 21, 2025
7a83b80
fix polars compiler
shuoweil Oct 21, 2025
233e857
Revert an unnecessary change
shuoweil Oct 21, 2025
11daddb
apply the workaround to i/O layer
shuoweil Oct 27, 2025
30a9ef6
Revert scalar_op_registry.py change
shuoweil Oct 27, 2025
6895def
remove unnecessary import
shuoweil Oct 27, 2025
46444c1
Remove duplicate conversation
shuoweil Oct 27, 2025
3b8367b
revert changes to test_dataframe.py
shuoweil Oct 27, 2025
6801ca4
notebook update
shuoweil Oct 27, 2025
8e4ea88
Merge branch 'main' into shuowei-anywidget-col
shuoweil Oct 27, 2025
6c3567b
call API on local data for complier.py
shuoweil Oct 28, 2025
dba9051
add more testcase
shuoweil Oct 28, 2025
0420c64
modify polars import
shuoweil Oct 28, 2025
31b0746
Merge branch 'main' into shuowei-anywidget-col
shuoweil Oct 29, 2025
907cf2c
fix failed tests
shuoweil Oct 29, 2025
2459aa4
chore: Migrate minimum_op operator to SQLGlot (#2205)
jialuoo Oct 29, 2025
3dbee07
chore: Migrate round_op operator to SQLGlot (#2204)
jialuoo Oct 29, 2025
d99f1ef
fix: Improve error handling in blob operations (#2194)
shuoweil Oct 29, 2025
e0ac827
refactor: update geo "spec" and split geo ops in ibis compiler (#2208)
tswast Oct 29, 2025
a538c69
feat: support INFORMATION_SCHEMA views in `read_gbq` (#1895)
tswast Oct 30, 2025
cceb532
Merge branch 'main' into shuowei-anywidget-col
shuoweil Oct 30, 2025
db5d8ea
Revert: Unwanted code changes
shuoweil Oct 30, 2025
3cc643d
Revert "Revert: Unwanted code changes"
shuoweil Oct 30, 2025
39cf595
revert 1 files to match main branch
shuoweil Oct 30, 2025
8c34512
Correctly display DataFrames with JSON columns in anywidget
shuoweil Oct 30, 2025
a86d953
Correctly display DataFrames with JSON columns in anywidget
shuoweil Oct 30, 2025
3173582
add mis-deleted comment back
shuoweil Oct 30, 2025
9c10962
revert unnecessary change
shuoweil Oct 30, 2025
cc6dd64
move helper function to dtypes.py
shuoweil Oct 30, 2025
466fd06
revert unnecessary testcase change
shuoweil Oct 30, 2025
4cbaf15
Improve JSON type handling for to_gbq and to_pandas_batches
shuoweil Oct 30, 2025
2113425
Remove unnecessary comment
shuoweil Oct 31, 2025
b75cb8c
Merge branch 'main' into shuowei-anywidget-col
shuoweil Oct 31, 2025
d97cfac
Merge branch 'main' into shuowei-anywidget-col
shuoweil Nov 4, 2025
e06a9f4
Revert bigframes/dtypes.py and mypy.ini to main branch version
shuoweil Nov 4, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Correctly display DataFrames with JSON columns in anywidget
  • Loading branch information
shuoweil committed Oct 30, 2025
commit 8c3451266c28ec0da6dd57c4f9929ae68a593574
48 changes: 42 additions & 6 deletions bigframes/core/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
import warnings

import bigframes_vendored.constants as constants
import db_dtypes
import google.cloud.bigquery as bigquery
import numpy
import pandas as pd
Expand Down Expand Up @@ -134,6 +135,21 @@ class MaterializationOptions:
ordered: bool = True


def _replace_json_arrow_with_string(pa_type: pa.DataType) -> pa.DataType:
    """Return ``pa_type`` with every ``db_dtypes.JSONArrowType`` replaced by string.

    The replacement is applied recursively through list and struct types so
    that JSON values nested at any depth are swapped for ``pa.string()``.
    Types that contain no JSON extension type come back structurally
    unchanged.

    Args:
        pa_type: The Arrow data type to sanitize.

    Returns:
        An Arrow data type with the same shape as ``pa_type`` in which each
        ``db_dtypes.JSONArrowType`` has been replaced by ``pa.string()``.
    """
    if isinstance(pa_type, db_dtypes.JSONArrowType):
        return pa.string()
    if isinstance(pa_type, pa.ListType):
        # Rebuild the list from its element *field* rather than from the bare
        # value type, so the element's name, nullability and metadata are
        # carried over, the same way the struct branch treats its fields.
        element_field = pa_type.value_field
        return pa.list_(
            element_field.with_type(
                _replace_json_arrow_with_string(element_field.type)
            )
        )
    if isinstance(pa_type, pa.StructType):
        return pa.struct(
            [
                field.with_type(_replace_json_arrow_with_string(field.type))
                for field in pa_type
            ]
        )
    return pa_type


class Block:
"""A immutable 2D data structure."""

Expand Down Expand Up @@ -715,12 +731,32 @@ def to_pandas_batches(
# To reduce the number of edge cases to consider when working with the
# results of this, always return at least one DataFrame. See:
# b/428918844.
empty_val = pd.DataFrame(
{
col: pd.Series([], dtype=self.expr.get_column_type(col))
for col in itertools.chain(self.value_columns, self.index_columns)
}
)
series_map = {}
for col in itertools.chain(self.value_columns, self.index_columns):
dtype = self.expr.get_column_type(col)
if bigframes.dtypes.contains_db_dtypes_json_dtype(dtype):
# Due to a limitation in Apache Arrow (#45262), JSON columns are not
# natively supported by the to_pandas_batches() method, which is
# used by the anywidget backend.
# Workaround for https://github.com/googleapis/python-bigquery-dataframes/issues/1273
# PyArrow doesn't support creating an empty array with db_dtypes.JSONArrowType,
# especially when nested.
# Create with string type and then cast.

# MyPy doesn't automatically narrow the type of 'dtype' here,
# so we add an explicit check.
if isinstance(dtype, pd.ArrowDtype):
safe_pa_type = _replace_json_arrow_with_string(dtype.pyarrow_dtype)
safe_dtype = pd.ArrowDtype(safe_pa_type)
series_map[col] = pd.Series([], dtype=safe_dtype).astype(dtype)
else:
# This branch should ideally not be reached if
# contains_db_dtypes_json_dtype is accurate,
# but it's here for MyPy's sake.
series_map[col] = pd.Series([], dtype=dtype)
else:
series_map[col] = pd.Series([], dtype=dtype)
empty_val = pd.DataFrame(series_map)
dfs = map(
lambda a: a[0],
itertools.zip_longest(
Expand Down
2 changes: 0 additions & 2 deletions bigframes/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -783,8 +783,6 @@ def __repr__(self) -> str:

opts = bigframes.options.display
max_results = opts.max_rows
# anywdiget mode uses the same display logic as the "deferred" mode
# for faster execution
if opts.repr_mode in ("deferred", "anywidget"):
return formatter.repr_query_job(self._compute_dry_run())

Expand Down
34 changes: 34 additions & 0 deletions bigframes/session/executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,8 @@ def arrow_batches(self) -> Iterator[pyarrow.RecordBatch]:
result_rows = 0

for batch in self._arrow_batches:
# Convert JSON columns to strings before casting
batch = self._convert_json_to_string(batch)
batch = pyarrow_utils.cast_batch(batch, self.schema.to_pyarrow())
result_rows += batch.num_rows

Expand All @@ -67,6 +69,38 @@ def arrow_batches(self) -> Iterator[pyarrow.RecordBatch]:

yield batch

def _convert_json_to_string(
    self, batch: pyarrow.RecordBatch
) -> pyarrow.RecordBatch:
    """Convert JSON arrow extension columns in ``batch`` to plain strings.

    A column is treated as JSON when the entry of ``self.schema`` with the
    same name has dtype ``bigframes.dtypes.JSON_DTYPE``. For such a column
    the array is cast to ``pyarrow.string()`` if its Arrow type is
    ``bigframes.dtypes.JSON_ARROW_TYPE``, and the output field is declared
    as a string field. All other columns are passed through untouched.

    Args:
        batch: The record batch to sanitize.

    Returns:
        A new record batch with the same column order as ``batch``.
    """
    import logging

    logger = logging.getLogger(__name__)

    # Resolve the declared dtype of every column once per batch instead of
    # rescanning the schema for each field. ``setdefault`` keeps the first
    # entry for a given name, matching a first-match linear search.
    declared_dtypes = {}
    for item in self.schema.items:
        declared_dtypes.setdefault(item.column, item.dtype)

    new_arrays = []
    new_fields = []

    for i, field in enumerate(batch.schema):
        array = batch.column(i)

        is_json_column = (
            field.name in declared_dtypes
            and declared_dtypes[field.name] == bigframes.dtypes.JSON_DTYPE
        )

        if is_json_column:
            # Lazy %-style arguments avoid building the message when the
            # record is filtered out.
            logger.info("Converting JSON column: %s", field.name)
            if array.type == bigframes.dtypes.JSON_ARROW_TYPE:
                array = array.cast(pyarrow.string())
            # The field is declared as string for every JSON column, whether
            # or not the array needed the cast above.
            new_fields.append(pyarrow.field(field.name, pyarrow.string()))
        else:
            new_fields.append(field)

        new_arrays.append(array)

    return pyarrow.RecordBatch.from_arrays(
        new_arrays, schema=pyarrow.schema(new_fields)
    )

def to_arrow_table(self) -> pyarrow.Table:
# Need to provide schema if no result rows, as arrow can't infer
# If ther are rows, it is safest to infer schema from batches.
Expand Down
3 changes: 3 additions & 0 deletions mypy.ini
Original file line number Diff line number Diff line change
Expand Up @@ -44,3 +44,6 @@ ignore_missing_imports = True

[mypy-anywidget]
ignore_missing_imports = True

[mypy-db_dtypes]
ignore_missing_imports = True
119 changes: 110 additions & 9 deletions notebooks/dataframes/anywidget_mode.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,16 @@
"execution_count": 2,
"id": "ca22f059",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/usr/local/google/home/shuowei/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/google/api_core/_python_version_support.py:266: FutureWarning: You are using a Python version (3.10.15) which Google will stop supporting in new releases of google.api_core once it reaches its end of life (2026-10-04). Please upgrade to the latest Python version, or at least Python 3.11, to continue receiving updates for google.api_core past that date.\n",
" warnings.warn(message, FutureWarning)\n"
]
}
],
"source": [
"import bigframes.pandas as bpd"
]
Expand Down Expand Up @@ -142,9 +151,9 @@
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "aafd4f912b5f42e0896aa5f0c2c62620",
"model_id": "473b016aa6b24c86aafc6372352e822d",
"version_major": 2,
"version_minor": 0
"version_minor": 1
},
"text/plain": [
"TableWidget(page_size=10, row_count=5552452, table_html='<table border=\"1\" class=\"dataframe table table-stripe…"
Expand Down Expand Up @@ -205,16 +214,17 @@
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "5ec0ad9f11874d4f9d8edbc903ee7b5d",
"model_id": "339279cc312e4e7fb67923e4e6ad7779",
"version_major": 2,
"version_minor": 0
"version_minor": 1
},
"text/plain": [
"TableWidget(page_size=10, row_count=5552452, table_html='<table border=\"1\" class=\"dataframe table table-stripe…"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "display_data"
"output_type": "execute_result"
}
],
"source": [
Expand Down Expand Up @@ -304,16 +314,17 @@
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "651b5aac958c408183775152c2573a03",
"model_id": "8ff1f64c44304da0944eadbd0fb3981d",
"version_major": 2,
"version_minor": 0
"version_minor": 1
},
"text/plain": [
"TableWidget(page_size=10, row_count=5, table_html='<table border=\"1\" class=\"dataframe table table-striped tabl…"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "display_data"
"output_type": "execute_result"
}
],
"source": [
Expand All @@ -323,6 +334,96 @@
"print(f\"Small dataset pages: {math.ceil(small_widget.row_count / small_widget.page_size)}\")\n",
"small_widget"
]
},
{
"cell_type": "markdown",
"id": "added-cell-2",
"metadata": {},
"source": [
"### Displaying Generative AI results containing JSON\n",
"The `AI.GENERATE` function in BigQuery returns results in a JSON column. BigQuery's JSON type is not natively supported by Apache Arrow ([Apache Arrow issue #45262](https://github.com/apache/arrow/issues/45262)), which affects the `to_pandas_batches()` method used in anywidget mode, so BigQuery DataFrames automatically converts JSON columns to strings for display. This allows you to view the results of generative AI functions seamlessly."
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "added-cell-1",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"✅ Completed. \n",
" Query processed 85.9 kB in 15 seconds of slot time.\n",
" "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/dtypes.py:969: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n",
"instead of using `db_dtypes` in the future when available in pandas\n",
"(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n",
" warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n"
]
},
{
"data": {
"text/html": [
"✅ Completed. "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "a6d61e48cca642b7a57e6431359b4cc4",
"version_major": 2,
"version_minor": 1
},
"text/plain": [
"TableWidget(page_size=10, row_count=5, table_html='<table border=\"1\" class=\"dataframe table table-striped tabl…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [],
"text/plain": [
"Computation deferred. Computation will process 0 Bytes"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"bpd._read_gbq_colab(\"\"\"\n",
" SELECT\n",
" AI.GENERATE(\n",
" prompt=>(\\\"Extract the values.\\\", OBJ.GET_ACCESS_URL(OBJ.FETCH_METADATA(OBJ.MAKE_REF(gcs_path, \\\"us.conn\\\")), \\\"r\\\")),\n",
" connection_id=>\\\"bigframes-dev.us.bigframes-default-connection\\\",\n",
" output_schema=>\\\"publication_date string, class_international string, application_number string, filing_date string\\\") AS result,\n",
" *\n",
" FROM `bigquery-public-data.labeled_patents.extracted_data`\n",
" LIMIT 5;\n",
"\"\"\")"
]
}
],
"metadata": {
Expand Down