Skip to content

Commit 3dbf84b

Browse files
authored
feat: bigframes.bigquery.json_extract (#868)
* feat: bigframes.bigquery.json_extract * fixing tests
1 parent 8c352ce commit 3dbf84b

File tree

4 files changed

+90
-0
lines changed

4 files changed

+90
-0
lines changed

bigframes/bigquery/__init__.py

+35
Original file line numberDiff line numberDiff line change
@@ -208,6 +208,41 @@ def json_set(
208208
return series
209209

210210

211+
def json_extract(
    series: series.Series,
    json_path: str,
) -> series.Series:
    """Pull the value found at a JSONPath out of every element of a Series.

    The extracted value is returned as a SQL JSON-formatted `STRING` or `JSON`
    value. Single quotes and brackets are used to escape invalid JSONPath
    characters in JSON keys.

    **Examples:**

        >>> import bigframes.pandas as bpd
        >>> import bigframes.bigquery as bbq
        >>> bpd.options.display.progress_bar = None

        >>> s = bpd.Series(['{"class": {"students": [{"id": 5}, {"id": 12}]}}'])
        >>> bbq.json_extract(s, json_path="$.class")
        0    "{\\\"students\\\":[{\\\"id\\\":5},{\\\"id\\\":12}]}"
        dtype: string

    Args:
        series (bigframes.series.Series):
            Series holding the JSON data, either as native JSON objects or as
            JSON-formatted strings.
        json_path (str):
            JSONPath that selects the data to obtain from each input value.

    Returns:
        bigframes.series.Series: A new Series holding the extracted JSON or
        JSON-formatted STRING values.
    """
    # Build the unary op first, then let the Series apply it element-wise.
    extract_op = ops.JSONExtract(json_path=json_path)
    return series._apply_unary_op(extract_op)
240+
241+
242+
# Search functions defined from
243+
# https://cloud.google.com/bigquery/docs/reference/standard-sql/search_functions
244+
245+
211246
def vector_search(
212247
base_table: str,
213248
column_to_search: str,

bigframes/core/compile/scalar_op_compiler.py

+12
Original file line numberDiff line numberDiff line change
@@ -922,6 +922,11 @@ def json_set_op_impl(x: ibis_types.Value, y: ibis_types.Value, op: ops.JSONSet):
922922
).to_expr()
923923

924924

925+
@scalar_op_compiler.register_unary_op(ops.JSONExtract, pass_op=True)
def json_extract_op_impl(x: ibis_types.Value, op: ops.JSONExtract):
    """Compile a `JSONExtract` op into a call to the `json_extract` builtin.

    Args:
        x: The ibis value to extract from.
        op: The op instance; supplies the JSONPath via `op.json_path`.

    Returns:
        The expression produced by `json_extract` for `x` at the op's path.
    """
    path = op.json_path
    return json_extract(json_obj=x, json_path=path)
928+
929+
925930
### Binary Ops
926931
def short_circuit_nulls(type_override: typing.Optional[ibis_dtypes.DataType] = None):
927932
"""Wraps a binary operator to generate nulls of the expected type if either input is a null scalar."""
@@ -1549,6 +1554,13 @@ def json_set(
15491554
"""Produces a new SQL JSON value with the specified JSON data inserted or replaced."""
15501555

15511556

1557+
@ibis.udf.scalar.builtin(name="json_extract")
def json_extract(
    json_obj: ibis_dtypes.JSON,
    json_path: ibis_dtypes.str,
) -> ibis_dtypes.JSON:
    """Extracts a JSON value and converts it to a SQL JSON-formatted STRING or JSON value.

    Declared through `ibis.udf.scalar.builtin` under the name "json_extract";
    the function body consists of this docstring only.
    """
1562+
1563+
15521564
@ibis.udf.scalar.builtin(name="ML.DISTANCE")
15531565
def vector_distance(vector1, vector2, type: str) -> ibis_dtypes.Float64:
15541566
"""Computes the distance between two vectors using specified type ("EUCLIDEAN", "MANHATTAN", or "COSINE")"""

bigframes/operations/__init__.py

+16
Original file line numberDiff line numberDiff line change
@@ -602,6 +602,22 @@ def output_type(self, *input_types):
602602
return dtypes.STRING_DTYPE
603603

604604

605+
## JSON Ops
606+
@dataclasses.dataclass(frozen=True)
class JSONExtract(UnaryOp):
    """Unary op that extracts the value at `json_path` from a JSON-like input."""

    name: typing.ClassVar[str] = "json_extract"
    # JSONPath identifying the value to extract from each input element.
    json_path: str

    def output_type(self, *input_types):
        """Return the result dtype, which is the same as the input dtype.

        Args:
            *input_types: Operand dtypes; only the first one is inspected.

        Returns:
            The first input dtype, unchanged.

        Raises:
            TypeError: If the first input dtype is not JSON-like according to
                `dtypes.is_json_like`.
        """
        input_type = input_types[0]
        if not dtypes.is_json_like(input_type):
            raise TypeError(
                "Input type must be a valid JSON object or JSON-formatted string type."
                + f" Received type: {input_type}"
            )
        return input_type
619+
620+
605621
# Binary Ops
606622
fillna_op = create_binary_op(name="fillna", type_signature=op_typing.COERCE)
607623
maximum_op = create_binary_op(name="maximum", type_signature=op_typing.COERCE)

tests/system/small/bigquery/test_json.py

+27
Original file line numberDiff line numberDiff line change
@@ -110,3 +110,30 @@ def test_json_set_w_invalid_value_type():
110110
def test_json_set_w_invalid_series_type():
    """json_set must raise TypeError when the Series does not hold JSON data."""
    with pytest.raises(TypeError):
        non_json_series = bpd.Series([1, 2])
        bbq.json_set(non_json_series, json_path_value_pairs=[("$.a", 1)])
113+
114+
115+
def test_json_extract_from_json():
    """Extracting "$.a.b" from JSON objects yields the nested value, or null when absent."""
    json_objects = [{"a": {"b": [1, 2]}}, {"a": {"c": 1}}, {"a": {"b": 0}}]
    source = _get_series_from_json(json_objects)
    extracted = bbq.json_extract(source, "$.a.b")
    # After the introduction of the JSON type, the output should be a JSON-formatted series.
    wanted = _get_series_from_json(["[1,2]", None, "0"])
    pd.testing.assert_series_equal(extracted.to_pandas(), wanted.to_pandas())
124+
125+
126+
def test_json_extract_from_string():
    """Extracting "$.a.b" from JSON-formatted strings yields the nested value, or null when absent."""
    json_strings = ['{"a": {"b": [1, 2]}}', '{"a": {"c": 1}}', '{"a": {"b": 0}}']
    source = bpd.Series(json_strings)
    extracted = bbq.json_extract(source, "$.a.b")
    wanted = _get_series_from_json(["[1,2]", None, "0"])
    # Series names are excluded from the comparison; only values and dtype are checked.
    pd.testing.assert_series_equal(
        extracted.to_pandas(), wanted.to_pandas(), check_names=False
    )
135+
136+
137+
def test_json_extract_w_invalid_series_type():
    """json_extract must raise TypeError when the Series does not hold JSON data."""
    with pytest.raises(TypeError):
        non_json_series = bpd.Series([1, 2])
        bbq.json_extract(non_json_series, "$.a")

0 commit comments

Comments
 (0)