Skip to content

Commit fb14f54

Browse files
DevStephaniegcf-owl-bot[bot]tswast
authored
docs: add sample for getting started with BQML (#141)
* docs: add sample for getting started with BQML * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md * Creating clarifying comments * Merging comments with this branch * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md * corrections on comments * Correcting code comments from BQ docs * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md * Fixing code comments to reflect BQML documentation * Correcting code comments * Correcting documentation code * Correcting documentation errors * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md * Correcting documentation comments and correcting features * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md * Correcting documention comments for code samples * Apply suggestions from code review Correcting documentation comments Co-authored-by: Tim Swast <[email protected]> * Correcting documentation comments * Correcting documentation comments * Apply suggestions from code review * Apply suggestions from code review * Fixtures for temporary resources * Deleting files --------- Co-authored-by: Your Name <[email protected]> Co-authored-by: Owl Bot <gcf-owl-bot[bot]@users.noreply.github.com> Co-authored-by: Tim Swast <[email protected]>
1 parent a133822 commit fb14f54

File tree

2 files changed

+159
-0
lines changed

2 files changed

+159
-0
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
# Copyright 2023 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
16+
def test_bqml_getting_started(random_model_id):
    """Run the BQML getting-started sample, saving the model to a test ID."""
    your_model_id = random_model_id

    # [START bigquery_dataframes_bqml_getting_started_tutorial]
    from bigframes.ml.linear_model import LogisticRegression
    import bigframes.pandas as bpd

    # Choose the training data first. `read_gbq` takes either a table ID or
    # a SQL query. This example reads from several tables through a
    # wildcard, so the data is defined with SQL. Follow
    # https://github.com/googleapis/python-bigquery-dataframes/issues/169
    # for progress on wildcard table support in `read_gbq`.
    training_query = """
        -- Since the order of rows isn't useful for the model training,
        -- generate a random ID to use as the index for the DataFrame.
        SELECT GENERATE_UUID() AS rowindex, *
        FROM
          `bigquery-public-data.google_analytics_sample.ga_sessions_*`
        WHERE
          _TABLE_SUFFIX BETWEEN '20160801' AND '20170630'
    """
    df = bpd.read_gbq(training_query, index_col="rowindex")

    # Number of transactions recorded in the Google Analytics session.
    #
    # The totals column is a STRUCT, so the transactions field is pulled out
    # with Series.struct.field("transactions"). Reference documentation:
    # https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.operations.structs.StructAccessor#bigframes_operations_structs_StructAccessor_field
    transactions = df["totals"].struct.field("transactions")

    # The label is the outcome the model learns to predict: whether the
    # Google Analytics session contains any ecommerce transaction. A NULL
    # transaction count becomes 0 in the label column; anything else
    # becomes 1.
    label = transactions.notnull().map({True: 1, False: 0})

    # Operating system of the visitor's device; missing values become "".
    operating_system = df["device"].struct.field("operatingSystem").fillna("")

    # Whether the visitor's device is a mobile device.
    is_mobile = df["device"].struct.field("isMobile")

    # Country the session originated from, based on the IP address.
    country = df["geoNetwork"].struct.field("country").fillna("")

    # Total number of page views within the session; missing values become 0.
    pageviews = df["totals"].struct.field("pageviews").fillna(0)

    # Gather every feature column into one DataFrame that serves as the
    # training data.
    feature_columns = {
        "os": operating_system,
        "is_mobile": is_mobile,
        "country": country,
        "pageviews": pageviews,
    }
    features = bpd.DataFrame(feature_columns)

    # A logistic regression model splits the data into two classes and gives
    # a confidence score that the data is in one of the classes.
    model = LogisticRegression()
    model.fit(features, label)

    # The model.fit() call above created a temporary model.
    # Call to_gbq() to write it to a permanent location.
    model.to_gbq(
        your_model_id,  # For example: "bqml_tutorial.sample_model",
        replace=True,
    )
    # [END bigquery_dataframes_bqml_getting_started_tutorial]

samples/snippets/conftest.py

+66
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
# Copyright 2020 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
from typing import Iterator
16+
17+
from google.cloud import bigquery
18+
import pytest
19+
import test_utils.prefixer
20+
21+
# Shared helper for the fixtures in this file: it generates the names used
# for temporary resources (create_prefix) and decides which existing
# datasets should be removed (should_cleanup).
prefixer = test_utils.prefixer.Prefixer(
    "python-bigquery-dataframes",
    "samples/snippets",
)
24+
25+
26+
@pytest.fixture(scope="session", autouse=True)
def cleanup_datasets(bigquery_client: bigquery.Client) -> None:
    """Delete datasets flagged for cleanup, once per test session.

    Walks every dataset the client lists and removes, together with its
    contents, each one whose ID ``prefixer.should_cleanup`` accepts.
    Because the fixture is ``autouse``, no test has to request it.
    """
    for candidate in bigquery_client.list_datasets():
        if not prefixer.should_cleanup(candidate.dataset_id):
            continue
        # not_found_ok: a dataset that has already disappeared is not an error.
        bigquery_client.delete_dataset(
            candidate, delete_contents=True, not_found_ok=True
        )
33+
34+
35+
@pytest.fixture(scope="session")
def bigquery_client() -> bigquery.Client:
    """Provide one BigQuery client, built with default arguments, per session."""
    return bigquery.Client()
39+
40+
41+
@pytest.fixture(scope="session")
def project_id(bigquery_client: bigquery.Client) -> str:
    """Return the project that the session's BigQuery client is bound to."""
    client_project = bigquery_client.project
    return client_project
44+
45+
46+
@pytest.fixture(scope="session")
def dataset_id(bigquery_client: bigquery.Client, project_id: str) -> Iterator[str]:
    """Create a dataset for the session and yield its unqualified ID.

    The dataset name comes from ``prefixer.create_prefix()`` and the dataset
    is created inside ``project_id``. On teardown the dataset is deleted
    along with everything in it.
    """
    short_id = prefixer.create_prefix()
    session_dataset = bigquery.Dataset(f"{project_id}.{short_id}")
    bigquery_client.create_dataset(session_dataset)
    yield short_id
    bigquery_client.delete_dataset(
        session_dataset, delete_contents=True, not_found_ok=True
    )
54+
55+
56+
@pytest.fixture
def random_model_id(
    bigquery_client: bigquery.Client, project_id: str, dataset_id: str
) -> Iterator[str]:
    """Yield a new, fully qualified model ID for each test.

    The ID has the form ``project.dataset.model``. This fixture does not
    create a model; it only supplies an ID that a test can save a model to.
    On teardown the model with that ID is deleted, and a model that does not
    exist is not treated as an error.
    """
    model_name = prefixer.create_prefix()
    qualified_model_id = f"{project_id}.{dataset_id}.{model_name}"
    yield qualified_model_id
    bigquery_client.delete_model(qualified_model_id, not_found_ok=True)

0 commit comments

Comments
 (0)