Skip to content

feat: add bigframes.bigquery.json_set #782

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Jun 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 54 additions & 0 deletions bigframes/bigquery/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,10 @@
import bigframes.series as series


# Array functions defined from
# https://cloud.google.com/bigquery/docs/reference/standard-sql/array_functions


def array_length(series: series.Series) -> series.Series:
"""Compute the length of each array element in the Series.

Expand Down Expand Up @@ -154,6 +158,56 @@ def array_to_string(series: series.Series, delimiter: str) -> series.Series:
return series._apply_unary_op(ops.ArrayToStringOp(delimiter=delimiter))


# JSON functions defined from
# https://cloud.google.com/bigquery/docs/reference/standard-sql/json_functions


def json_set(
    series: series.Series,
    json_path_value_pairs: typing.Sequence[typing.Tuple[str, typing.Any]],
) -> series.Series:
    """Produces a new JSON value within a Series by inserting or replacing values at
    specified paths.

    **Examples:**

        >>> import bigframes.pandas as bpd
        >>> import bigframes.bigquery as bbq
        >>> import numpy as np
        >>> bpd.options.display.progress_bar = None

        >>> s = bpd.read_gbq("SELECT JSON '{\\\"a\\\": 1}' AS data")["data"]
        >>> bbq.json_set(s, json_path_value_pairs=[("$.a", 100), ("$.b", "hi")])
        0    {"a":100,"b":"hi"}
        Name: data, dtype: string

    Args:
        series (bigframes.series.Series):
            The Series containing JSON data (as native JSON objects or JSON-formatted strings).
        json_path_value_pairs (Sequence[Tuple[str, typing.Any]]):
            Pairs of JSON path and the new value to insert/replace.

    Returns:
        bigframes.series.Series: A new Series with the transformed JSON data.

    """
    # SQLGlot parser does not support the "create_if_missing => true" syntax, so
    # create_if_missing is not currently implemented.

    result = series
    for pair in json_path_value_pairs:
        if len(pair) != 2:
            raise ValueError(
                "Incorrect format: Expected (<json_path>, <json_value>), but found: "
                + f"{pair}"
            )
        json_path, json_value = pair
        # Fold each (path, value) pair into the result, one JSONSet op at a time.
        result = result._apply_binary_op(
            json_value, ops.JSONSet(json_path=json_path), alignment="left"
        )
    return result


def vector_search(
base_table: str,
column_to_search: str,
Expand Down
32 changes: 32 additions & 0 deletions bigframes/core/compile/scalar_op_compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -894,6 +894,26 @@ def array_to_string_op_impl(x: ibis_types.Value, op: ops.ArrayToStringOp):
return typing.cast(ibis_types.ArrayValue, x).join(op.delimiter)


# JSON Ops
@scalar_op_compiler.register_binary_op(ops.JSONSet, pass_op=True)
def json_set_op_impl(x: ibis_types.Value, y: ibis_types.Value, op: ops.JSONSet):
    """Compile a JSONSet op into a JSON_SET call at ``op.json_path``."""
    if x.type().is_json():
        return json_set(json_obj=x, json_path=op.json_path, json_value=y).to_expr()
    # Non-JSON input: round-trip through PARSE_JSON / TO_JSON_STRING.
    # Enabling JSON type eliminates the need for less efficient string conversions.
    updated = json_set(
        json_obj=parse_json(x),
        json_path=op.json_path,
        json_value=y,
    )
    return vendored_ibis_ops.ToJsonString(updated).to_expr()


### Binary Ops
def short_circuit_nulls(type_override: typing.Optional[ibis_dtypes.DataType] = None):
"""Wraps a binary operator to generate nulls of the expected type if either input is a null scalar."""
Expand Down Expand Up @@ -1469,3 +1489,15 @@ def float_floor(a: float) -> float:
def float_ceil(a: float) -> float:
    """Stub whose body is never executed locally (see ``pragma: NO COVER``).

    NOTE(review): the original docstring ("Convert string to timestamp.") was
    copy-pasted from an unrelated stub and did not describe this function;
    presumably this is registered as a SQL ``CEIL`` pushdown — confirm the
    decorator above this view.
    """
    return 0  # pragma: NO COVER


@ibis.udf.scalar.builtin(name="parse_json")
def parse_json(a: str) -> ibis_dtypes.JSON:
"""Converts a JSON-formatted STRING value to a JSON value."""


@ibis.udf.scalar.builtin(name="json_set")
def json_set(
    json_obj: ibis_dtypes.JSON, json_path: ibis_dtypes.str, json_value
) -> ibis_dtypes.JSON:
    """Produces a new SQL JSON value with the specified JSON data inserted or replaced.

    Registered as the built-in ``JSON_SET``; the Python body is never executed.

    NOTE(review): ``ibis_dtypes.str`` is unusual — ibis datatypes conventionally
    expose ``String``/``string``. Confirm this attribute exists in the pinned
    ibis version; the annotation drives the udf's declared argument type.
    """
11 changes: 11 additions & 0 deletions bigframes/dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -240,6 +240,17 @@ def is_struct_like(type: ExpressionType) -> bool:
)


def is_json_like(type: ExpressionType) -> bool:
    """Whether ``type`` can carry JSON data (currently only STRING)."""
    # TODO: Add JSON type support
    return type == STRING_DTYPE


def is_json_encoding_type(type: ExpressionType) -> bool:
    """Whether ``type`` has a defined JSON encoding.

    Every supported type except GEOGRAPHY can be converted into JSON:
    https://cloud.google.com/bigquery/docs/reference/standard-sql/json_functions#json_encodings
    """
    return type != GEO_DTYPE


def is_numeric(type: ExpressionType) -> bool:
    # True when ``type`` is one of the permissive numeric BigFrames dtypes.
    return type in NUMERIC_BIGFRAMES_TYPES_PERMISSIVE

Expand Down
24 changes: 24 additions & 0 deletions bigframes/operations/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -707,6 +707,30 @@ def output_type(self, *input_types):
strconcat_op = StrConcatOp()


## JSON Ops
@dataclasses.dataclass(frozen=True)
class JSONSet(BinaryOp):
    """Binary op that writes a value into JSON data at ``json_path``.

    The left input is the JSON column (native JSON or JSON-formatted string);
    the right input is the value to insert/replace at ``json_path``.

    Raises:
        TypeError: if the left input is not JSON-like, or the right input has
            no JSON encoding.
    """

    name: typing.ClassVar[str] = "json_set"
    json_path: str

    def output_type(self, *input_types):
        left_type = input_types[0]
        right_type = input_types[1]
        if not dtypes.is_json_like(left_type):
            # Fixed grammar: "an valid" -> "a valid".
            raise TypeError(
                "Input type must be a valid JSON object or JSON-formatted string type."
                + f" Received type: {left_type}"
            )
        if not dtypes.is_json_encoding_type(right_type):
            # Fixed missing space before "Received type".
            raise TypeError(
                "The value to be assigned must be a type that can be encoded as JSON."
                + f" Received type: {right_type}"
            )

        # After JSON type implementation, ONLY return JSON data.
        return left_type


# Ternary Ops
@dataclasses.dataclass(frozen=True)
class WhereOp(TernaryOp):
Expand Down
119 changes: 119 additions & 0 deletions tests/system/small/bigquery/test_json.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json

import geopandas as gpd # type: ignore
import pandas as pd
import pytest

import bigframes.bigquery as bbq
import bigframes.pandas as bpd


def _get_series_from_json(json_data):
    """Materialize ``json_data`` (a list of JSON-able values) as a BigQuery JSON Series."""
    selects = (
        f"SELECT {row_id} AS id, JSON '{json.dumps(value)}' AS data"
        for row_id, value in enumerate(json_data)
    )
    frame = bpd.read_gbq(" UNION ALL ".join(selects)).set_index("id").sort_index()
    return frame["data"]


@pytest.mark.parametrize(
    ("json_path", "expected_json"),
    [
        pytest.param("$.a", [{"a": 10}], id="simple"),
        pytest.param("$.a.b.c", [{"a": {"b": {"c": 10, "d": []}}}], id="nested"),
    ],
)
def test_json_set_at_json_path(json_path, expected_json):
    # Writing 10 at ``json_path`` should replace the subtree rooted there.
    source = _get_series_from_json([{"a": {"b": {"c": "tester", "d": []}}}])
    result = bbq.json_set(source, json_path_value_pairs=[(json_path, 10)])
    expected = _get_series_from_json(expected_json)
    pd.testing.assert_series_equal(result.to_pandas(), expected.to_pandas())


@pytest.mark.parametrize(
    ("json_value", "expected_json"),
    [
        pytest.param(10, [{"a": {"b": 10}}, {"a": {"b": 10}}], id="int"),
        pytest.param(0.333, [{"a": {"b": 0.333}}, {"a": {"b": 0.333}}], id="float"),
        pytest.param("eng", [{"a": {"b": "eng"}}, {"a": {"b": "eng"}}], id="string"),
        pytest.param([1, 2], [{"a": {"b": 1}}, {"a": {"b": 2}}], id="series"),
    ],
)
def test_json_set_at_json_value_type(json_value, expected_json):
    # Scalars broadcast to every row; a list aligns element-wise ("series" case).
    source = _get_series_from_json([{"a": {"b": "dev"}}, {"a": {"b": [1, 2]}}])
    result = bbq.json_set(source, json_path_value_pairs=[("$.a.b", json_value)])
    expected = _get_series_from_json(expected_json)
    pd.testing.assert_series_equal(result.to_pandas(), expected.to_pandas())


def test_json_set_w_more_pairs():
    # Pairs apply in order: "$.a" is set twice, so the later element-wise
    # assignment ([3, 4, 5]) overwrites the earlier scalar 1.
    source = _get_series_from_json([{"a": 2}, {"b": 5}, {"c": 1}])
    result = bbq.json_set(
        source, json_path_value_pairs=[("$.a", 1), ("$.b", 2), ("$.a", [3, 4, 5])]
    )
    expected = _get_series_from_json(
        [{"a": 3, "b": 2}, {"a": 4, "b": 2}, {"a": 5, "b": 2, "c": 1}]
    )
    pd.testing.assert_series_equal(result.to_pandas(), expected.to_pandas())


@pytest.mark.parametrize(
    ("series", "json_path_value_pairs"),
    [
        # A 3-tuple instead of a (path, value) pair -> ValueError from json_set.
        pytest.param(
            _get_series_from_json([{"a": 10}]),
            [("$.a", 1, 100)],
            id="invalid_json_path_value_pairs",
            marks=pytest.mark.xfail(raises=ValueError),
        ),
        # GEOGRAPHY values have no JSON encoding -> TypeError from JSONSet.
        pytest.param(
            _get_series_from_json([{"a": 10}]),
            [
                (
                    "$.a",
                    bpd.read_pandas(
                        gpd.GeoSeries.from_wkt(["POINT (1 2)", "POINT (2 1)"])
                    ),
                )
            ],
            id="invalid_json_value_type",
            marks=pytest.mark.xfail(raises=TypeError),
        ),
        # Target series is INT64, not JSON-like -> TypeError from JSONSet.
        pytest.param(
            bpd.Series([1, 2]),
            [("$.a", 1)],
            id="invalid_series_type",
            marks=pytest.mark.xfail(raises=TypeError),
        ),
    ],
)
def test_json_set_w_invalid(series, json_path_value_pairs):
    # Each parametrized case is expected to raise (asserted via xfail marks).
    bbq.json_set(series, json_path_value_pairs=json_path_value_pairs)
4 changes: 2 additions & 2 deletions third_party/bigframes_vendored/ibis/expr/operations/json.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
from __future__ import annotations

import ibis.expr.datatypes as dt
from ibis.expr.operations.core import Unary
import ibis.expr.operations.core as ibis_ops_core


class ToJsonString(Unary):
class ToJsonString(ibis_ops_core.Unary):
    """Unary op whose result is the JSON-formatted STRING form of its input.

    NOTE(review): presumably compiled to BigQuery ``TO_JSON_STRING`` — confirm
    in the compiler registration (not visible in this file).
    """

    # A JSON-to-string conversion always yields a SQL STRING.
    dtype = dt.string