docs: use class_weight="balanced" in the logistic regression prediction tutorial (#678)

tswast · web-flow · commit b95154908fd7 · 2024-05-16T10:34:00.000-05:00
This aligns the Python code with the SQL at https://cloud.google.com/bigquery/docs/logistic-regression-prediction#create_a_logistic_regression_model

```sql
CREATE OR REPLACE MODEL `census.census_model`
OPTIONS (
  model_type='LOGISTIC_REG',
  auto_class_weights=TRUE,
  data_split_method='NO_SPLIT',
  input_label_cols=['income_bracket'],
  max_iterations=15)
AS SELECT * EXCEPT(dataframe)
FROM `census.input_data`
WHERE dataframe = 'training'
```
diff --git a/samples/snippets/logistic_regression_prediction_test.py b/samples/snippets/logistic_regression_prediction_test.py
@@ -80,7 +80,21 @@ def test_logistic_regression_prediction(random_model_id: str) -> None:
     X = training_data.drop(columns=["income_bracket", "dataframe"])
     y = training_data["income_bracket"]
 
-    census_model = bigframes.ml.linear_model.LogisticRegression()
+    census_model = bigframes.ml.linear_model.LogisticRegression(
+        # Balance the class labels in the training data by setting
+        # class_weight="balanced".
+        #
+        # By default, the training data is unweighted. If the labels
+        # in the training data are imbalanced, the model may learn to
+        # predict the most popular class of labels more heavily. In
+        # this case, most of the respondents in the dataset are in the
+        # lower income bracket. This may lead to a model that predicts
+        # the lower income bracket too heavily. Class weights balance
+        # the class labels by calculating the weights for each class in
+        # inverse proportion to the frequency of that class.
+        class_weight="balanced",
+        max_iterations=15,
+    )
     census_model.fit(X, y)
 
     census_model.to_gbq(