docs: add sample for getting started with BQML (#141)

DevStephanie · gcf-owl-bot[bot] · tswast · web-flow · commit fb14f54548e9 · 2023-12-12T16:25:00.000-06:00
* docs: add sample for getting started with BQML * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md * Creating clarifying comments * Merging comments with this branch * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md * corrections on comments * Correcting code comments from BQ docs * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md * Fixing code comments to reflect BQML documentation * Correcting code comments * Correcting documentation code * Correcting documentation errors * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md * Correcting documentation comments and correcting features * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md * Correcting documention comments for code samples * Apply suggestions from code review Correcting documentation comments Co-authored-by: Tim Swast <swast@google.com> * Correcting documentation comments * Correcting documentation comments * Apply suggestions from code review * Apply suggestions from code review * Fixtures for temporary resources * Deleting files --------- Co-authored-by: Your Name <stabd@google.com> Co-authored-by: Owl Bot <gcf-owl-bot[bot]@users.noreply.github.com> Co-authored-by: Tim Swast <swast@google.com>
diff --git a/samples/snippets/bqml_getting_started_test.py b/samples/snippets/bqml_getting_started_test.py
@@ -0,0 +1,93 @@
+# Copyright 2023 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+def test_bqml_getting_started(random_model_id):
+    """Run the BQML getting-started tutorial sample end to end.
+
+    Trains a logistic regression model on the public Google Analytics sample
+    data and saves it under the model ID supplied by the ``random_model_id``
+    fixture. The code between the START/END region tags is the sample itself;
+    the lines outside the tags only wire it up to the test fixtures.
+    """
+    your_model_id = random_model_id
+
+    # [START bigquery_dataframes_bqml_getting_started_tutorial]
+    from bigframes.ml.linear_model import LogisticRegression
+    import bigframes.pandas as bpd
+
+    # Start by selecting the data you'll use for training. `read_gbq` accepts
+    # either a SQL query or a table ID. Since this example selects from multiple
+    # tables via a wildcard, use SQL to define this data. Watch issue
+    # https://github.com/googleapis/python-bigquery-dataframes/issues/169
+    # for updates to `read_gbq` to support wildcard tables.
+
+    df = bpd.read_gbq(
+        """
+        -- Since the order of rows isn't useful for the model training,
+        -- generate a random ID to use as the index for the DataFrame.
+        SELECT GENERATE_UUID() AS rowindex, *
+        FROM
+        `bigquery-public-data.google_analytics_sample.ga_sessions_*`
+        WHERE
+        _TABLE_SUFFIX BETWEEN '20160801' AND '20170630'
+        """,
+        index_col="rowindex",
+    )
+
+    # Extract the total number of transactions within
+    # the Google Analytics session.
+    #
+    # Because the totals column is a STRUCT data type, call
+    # Series.struct.field("transactions") to extract the transactions field.
+    # See the reference documentation below:
+    # https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.operations.structs.StructAccessor#bigframes_operations_structs_StructAccessor_field
+    transactions = df["totals"].struct.field("transactions")
+
+    # The "label" values represent the outcome of the model's
+    # prediction. In this case, the model predicts if there are any
+    # ecommerce transactions within the Google Analytics session.
+    # If the number of transactions is NULL, the value in the label
+    # column is set to 0. Otherwise, it is set to 1.
+    label = transactions.notnull().map({True: 1, False: 0})
+
+    # Extract the operating system of the visitor's device.
+    operating_system = df["device"].struct.field("operatingSystem").fillna("")
+
+    # Extract whether the visitor's device is a mobile device.
+    is_mobile = df["device"].struct.field("isMobile")
+
+    # Extract the country from which the sessions originated, based on the IP address.
+    country = df["geoNetwork"].struct.field("country").fillna("")
+
+    # Extract the total number of page views within the session.
+    pageviews = df["totals"].struct.field("pageviews").fillna(0)
+
+    # Combine all the feature columns into a single DataFrame
+    # to use as training data.
+    features = bpd.DataFrame(
+        {
+            "os": operating_system,
+            "is_mobile": is_mobile,
+            "country": country,
+            "pageviews": pageviews,
+        }
+    )
+
+    # A logistic regression model splits data into two classes, giving
+    # a confidence score that the data is in one of the classes.
+    model = LogisticRegression()
+    model.fit(features, label)
+
+    # The model.fit() call above created a temporary model.
+    # Use the to_gbq() method to write to a permanent location.
+    model.to_gbq(
+        your_model_id,  # For example: "bqml_tutorial.sample_model",
+        replace=True,
+    )
+    # [END bigquery_dataframes_bqml_getting_started_tutorial]
diff --git a/samples/snippets/conftest.py b/samples/snippets/conftest.py
@@ -0,0 +1,66 @@
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Iterator
+
+from google.cloud import bigquery
+import pytest
+import test_utils.prefixer
+
+# Shared name generator for the temporary BigQuery resources these samples
+# create: the fixtures below call create_prefix() to name new resources and
+# should_cleanup() to recognize datasets that may be deleted.
+prefixer = test_utils.prefixer.Prefixer(
+    "python-bigquery-dataframes", "samples/snippets"
+)
+
+
+@pytest.fixture(scope="session", autouse=True)
+def cleanup_datasets(bigquery_client: bigquery.Client) -> None:
+    """Delete every dataset that ``prefixer.should_cleanup`` flags.
+
+    Runs automatically once per test session. Flagged datasets are removed
+    together with their contents; datasets that have already disappeared
+    are ignored.
+    """
+    for candidate in bigquery_client.list_datasets():
+        if not prefixer.should_cleanup(candidate.dataset_id):
+            continue
+        bigquery_client.delete_dataset(
+            candidate, delete_contents=True, not_found_ok=True
+        )
+
+
+@pytest.fixture(scope="session")
+def bigquery_client() -> bigquery.Client:
+    """Provide one BigQuery client, built with default settings, per session."""
+    return bigquery.Client()
+
+
+@pytest.fixture(scope="session")
+def project_id(bigquery_client: bigquery.Client) -> str:
+    """Return the project that the session's BigQuery client is bound to."""
+    client_project = bigquery_client.project
+    return client_project
+
+
+@pytest.fixture(scope="session")
+def dataset_id(bigquery_client: bigquery.Client, project_id: str) -> Iterator[str]:
+    """Create a temporary dataset for the session and yield its short ID.
+
+    The yielded value is the dataset name only (no project qualifier). After
+    the session finishes, the dataset and everything in it is deleted.
+    """
+    short_name = prefixer.create_prefix()
+    temp_dataset = bigquery.Dataset(f"{project_id}.{short_name}")
+    bigquery_client.create_dataset(temp_dataset)
+    yield short_name
+    bigquery_client.delete_dataset(
+        temp_dataset, delete_contents=True, not_found_ok=True
+    )
+
+
+@pytest.fixture
+def random_model_id(
+    bigquery_client: bigquery.Client, project_id: str, dataset_id: str
+) -> Iterator[str]:
+    """Yield a fresh, fully qualified model ID for each test.
+
+    The ID has the form ``project.dataset.model`` and lives in the session's
+    temporary dataset. Nothing is created here; the test is expected to save
+    a model under this ID. After the test, the model is deleted if it exists.
+    """
+    model_name = prefixer.create_prefix()
+    full_model_id = f"{project_id}.{dataset_id}.{model_name}"
+    yield full_model_id
+    bigquery_client.delete_model(full_model_id, not_found_ok=True)