-
Notifications
You must be signed in to change notification settings - Fork 48
/
Copy pathlogistic_regression_prediction_test.py
137 lines (119 loc) · 6.35 KB
/
logistic_regression_prediction_test.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""BigQuery DataFrames code samples for
https://cloud.google.com/bigquery/docs/logistic-regression-prediction.
"""
def test_logistic_regression_prediction(random_model_id: str) -> None:
    """Run the logistic regression prediction samples end to end.

    The body is split into documentation regions delimited by
    ``[START ...]`` / ``[END ...]`` tags. Later regions reuse names bound in
    earlier ones (``input_data``, ``census_model``), so the regions must run
    in order.

    Args:
        random_model_id: Model ID used as the destination when the trained
            model is saved with ``to_gbq`` and when it is loaded back with
            ``read_gbq_model``. Presumably supplied by a pytest fixture
            defined outside this file.
    """
    your_model_id = random_model_id

    # [START bigquery_dataframes_logistic_regression_prediction_examine]
    import bigframes.pandas as bpd

    df = bpd.read_gbq(
        "bigquery-public-data.ml_datasets.census_adult_income",
        columns=(
            "age",
            "workclass",
            "marital_status",
            "education_num",
            "occupation",
            "hours_per_week",
            "income_bracket",
            "functional_weight",
        ),
        max_results=100,
    )
    df.peek()
    # Output:
    # age workclass marital_status education_num occupation hours_per_week income_bracket functional_weight
    # 47 Local-gov Married-civ-spouse 13 Prof-specialty 40 >50K 198660
    # 56 Private Never-married 9 Adm-clerical 40 <=50K 85018
    # 40 Private Married-civ-spouse 12 Tech-support 40 >50K 285787
    # 34 Self-emp-inc Married-civ-spouse 9 Craft-repair 54 >50K 207668
    # 23 Private Married-civ-spouse 10 Handlers-cleaners 40 <=50K 40060
    # [END bigquery_dataframes_logistic_regression_prediction_examine]

    # [START bigquery_dataframes_logistic_regression_prediction_prepare]
    import bigframes.pandas as bpd

    input_data = bpd.read_gbq(
        "bigquery-public-data.ml_datasets.census_adult_income",
        columns=(
            "age",
            "workclass",
            "marital_status",
            "education_num",
            "occupation",
            "hours_per_week",
            "income_bracket",
            "functional_weight",
        ),
    )
    # Label every row "training" by default, then relabel rows by the last
    # digit of functional_weight: 8 -> "evaluation", 9 -> "prediction".
    input_data["dataframe"] = bpd.Series("training", index=input_data.index,).case_when(
        [
            (((input_data["functional_weight"] % 10) == 8), "evaluation"),
            (((input_data["functional_weight"] % 10) == 9), "prediction"),
        ]
    )
    # functional_weight is only needed to build the split above.
    del input_data["functional_weight"]
    # [END bigquery_dataframes_logistic_regression_prediction_prepare]

    # [START bigquery_dataframes_logistic_regression_prediction_create_model]
    import bigframes.ml.linear_model

    # input_data is defined in an earlier step.
    training_data = input_data[input_data["dataframe"] == "training"]
    X = training_data.drop(columns=["income_bracket", "dataframe"])
    y = training_data["income_bracket"]

    census_model = bigframes.ml.linear_model.LogisticRegression()
    census_model.fit(X, y)

    census_model.to_gbq(
        your_model_id,  # For example: "your-project.census.census_model"
        replace=True,
    )
    # [END bigquery_dataframes_logistic_regression_prediction_create_model]

    # [START bigquery_dataframes_logistic_regression_prediction_evaluate_model]
    # Select model you'll use for predictions. `read_gbq_model` loads model
    # data from BigQuery, but you could also use the `census_model` object
    # from previous steps.
    census_model = bpd.read_gbq_model(
        your_model_id,  # For example: "your-project.census.census_model"
    )

    # input_data is defined in an earlier step.
    evaluation_data = input_data[input_data["dataframe"] == "evaluation"]
    X = evaluation_data.drop(columns=["income_bracket", "dataframe"])
    y = evaluation_data["income_bracket"]

    # The score() method evaluates how the model performs compared to the
    # actual data. Output DataFrame matches that of ML.EVALUATE().
    score = census_model.score(X, y)
    score.peek()
    # Output:
    # precision recall accuracy f1_score log_loss roc_auc
    # 0 0.685764 0.536685 0.83819 0.602134 0.350417 0.882953
    # [END bigquery_dataframes_logistic_regression_prediction_evaluate_model]

    # [START bigquery_dataframes_logistic_regression_prediction_predict_income_bracket]
    # Select model you'll use for predictions. `read_gbq_model` loads model
    # data from BigQuery, but you could also use the `census_model` object
    # from previous steps.
    census_model = bpd.read_gbq_model(
        your_model_id,  # For example: "your-project.census.census_model"
    )

    # input_data is defined in an earlier step.
    prediction_data = input_data[input_data["dataframe"] == "prediction"]

    predictions = census_model.predict(prediction_data)
    predictions.peek()
    # Output:
    # predicted_income_bracket predicted_income_bracket_probs age workclass ... occupation hours_per_week income_bracket dataframe
    # 18004 <=50K [{'label': ' >50K', 'prob': 0.0763305999358786... 75 ? ... ? 6 <=50K prediction
    # 18886 <=50K [{'label': ' >50K', 'prob': 0.0448866871906495... 73 ? ... ? 22 >50K prediction
    # 31024 <=50K [{'label': ' >50K', 'prob': 0.0362982319421936... 69 ? ... ? 1 <=50K prediction
    # 31022 <=50K [{'label': ' >50K', 'prob': 0.0787836112058324... 75 ? ... ? 5 <=50K prediction
    # 23295 <=50K [{'label': ' >50K', 'prob': 0.3385373037905673... 78 ? ... ? 32 <=50K prediction
    # [END bigquery_dataframes_logistic_regression_prediction_predict_income_bracket]

    # TODO(tswast): Implement ML.EXPLAIN_PREDICT() and corresponding sample.
    # TODO(tswast): Implement ML.GLOBAL_EXPLAIN() and corresponding sample.

    # Sanity checks, kept outside the published sample regions, so the test
    # fails on an unexpected result shape instead of only when a call raises.
    # The expected column names come from the sample outputs shown above.
    assert "roc_auc" in score.columns
    assert "predicted_income_bracket" in predictions.columns