+ [3 rows x 3 columns]
Args:
function_name (str):
@@ -1520,6 +1348,10 @@ def _start_query(
Starts query job and waits for results.
"""
job_config = self._prepare_job_config(job_config)
+ api_methods = log_adapter.get_and_reset_api_methods()
+ job_config.labels = bigframes_io.create_job_configs_labels(
+ job_configs_labels=job_config.labels, api_methods=api_methods
+ )
query_job = self.bqclient.query(sql, job_config=job_config)
opts = bigframes.options.display
@@ -1554,6 +1386,8 @@ def _prepare_job_config(
) -> bigquery.QueryJobConfig:
if job_config is None:
job_config = self.bqclient.default_query_job_config
+ if job_config is None:
+ job_config = bigquery.QueryJobConfig()
if bigframes.options.compute.maximum_bytes_billed is not None:
job_config.maximum_bytes_billed = (
bigframes.options.compute.maximum_bytes_billed
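
Side note on the hunk above: `_prepare_job_config` now falls back to a fresh `bigquery.QueryJobConfig()` when the client has no default config, so the compute options always have an object to attach to. A minimal sketch of setting the relevant option from user code (the byte cap and table name are illustrative, not part of this change):

```python
import bigframes
import bigframes.pandas as bpd

# Hypothetical cap; _prepare_job_config copies it onto the QueryJobConfig of
# every query the session starts.
bigframes.options.compute.maximum_bytes_billed = 10 * 1024**3  # 10 GiB

df = bpd.read_gbq("bigquery-public-data.ml_datasets.penguins")  # any table works
```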
@@ -1583,3 +1417,23 @@ def _can_cluster_bq(field: bigquery.SchemaField):
"BOOL",
"BOOLEAN",
)
+
+
+def _convert_to_string(column: ibis_types.Column) -> ibis_types.StringColumn:
+ col_type = column.type()
+ if (
+ col_type.is_numeric()
+ or col_type.is_boolean()
+ or col_type.is_binary()
+ or col_type.is_temporal()
+ ):
+ result = column.cast(ibis_dtypes.String(nullable=True))
+ elif col_type.is_geospatial():
+ result = typing.cast(ibis_types.GeoSpatialColumn, column).as_text()
+ elif col_type.is_string():
+ result = column
+ else:
+ # TO_JSON_STRING works with all data types, but isn't the most efficient
+ # Needed for JSON, STRUCT and ARRAY datatypes
+ result = vendored_ibis_ops.ToJsonString(column).to_expr() # type: ignore
+ return typing.cast(ibis_types.StringColumn, result)
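
For readers unfamiliar with the ibis branches in `_convert_to_string`, here is a rough, illustrative sketch of the same dispatch against an in-memory ibis table (assumes a recent ibis API; none of this is part of the diff):

```python
import ibis
import ibis.expr.datatypes as ibis_dtypes

# Hypothetical schema used only for illustration.
t = ibis.table([("n", "int64"), ("s", "string"), ("ts", "timestamp")], name="example")

n_text = t.n.cast(ibis_dtypes.String(nullable=True))    # numeric/bool/binary: plain CAST
ts_text = t.ts.cast(ibis_dtypes.String(nullable=True))  # temporal hits the same CAST branch
s_text = t.s                                             # already a string: returned unchanged
# GEOGRAPHY columns go through .as_text() (ST_ASTEXT), and remaining types
# (JSON, STRUCT, ARRAY) fall back to the vendored ToJsonString operation.
```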
diff --git a/bigframes/session/_io/bigquery.py b/bigframes/session/_io/bigquery.py
index 06d240fec6..dae73301e7 100644
--- a/bigframes/session/_io/bigquery.py
+++ b/bigframes/session/_io/bigquery.py
@@ -17,17 +17,36 @@
from __future__ import annotations
import datetime
+import itertools
import textwrap
import types
-from typing import Dict, Iterable, Union
+from typing import Dict, Iterable, Optional, Sequence, Union
import uuid
import google.cloud.bigquery as bigquery
IO_ORDERING_ID = "bqdf_row_nums"
+MAX_LABELS_COUNT = 64
TEMP_TABLE_PREFIX = "bqdf{date}_{random_id}"
+def create_job_configs_labels(
+ job_configs_labels: Optional[Dict[str, str]],
+ api_methods: Sequence[str],
+) -> Dict[str, str]:
+ if job_configs_labels is None:
+ job_configs_labels = {}
+
+ labels = list(
+ itertools.chain(
+ job_configs_labels.keys(),
+ (f"recent-bigframes-api-{i}" for i in range(len(api_methods))),
+ )
+ )
+ values = list(itertools.chain(job_configs_labels.values(), api_methods))
+ return dict(zip(labels[:MAX_LABELS_COUNT], values[:MAX_LABELS_COUNT]))
+
+
def create_export_csv_statement(
table_id: str, uri: str, field_delimiter: str, header: bool
) -> str:
@@ -121,11 +140,17 @@ def create_temp_table(
bqclient: bigquery.Client,
dataset: bigquery.DatasetReference,
expiration: datetime.datetime,
+ *,
+ schema: Optional[Iterable[bigquery.SchemaField]] = None,
+ cluster_columns: Optional[list[str]] = None,
) -> str:
"""Create an empty table with an expiration in the desired dataset."""
table_ref = random_table(dataset)
destination = bigquery.Table(table_ref)
destination.expires = expiration
+ destination.schema = schema
+ if cluster_columns:
+ destination.clustering_fields = cluster_columns
bqclient.create_table(destination)
return f"{table_ref.project}.{table_ref.dataset_id}.{table_ref.table_id}"
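
Two usage notes on the helpers added above. `create_job_configs_labels` keeps any existing job labels, appends one `recent-bigframes-api-<i>` label per recorded method, and truncates the result to BigQuery's 64-label limit; a small sketch with made-up label values:

```python
from bigframes.session._io.bigquery import create_job_configs_labels

labels = create_job_configs_labels(
    job_configs_labels={"team": "analytics"},        # hypothetical pre-existing label
    api_methods=["dataframe-head", "series-mean"],   # hypothetical recent API calls
)
print(labels)
# {'team': 'analytics',
#  'recent-bigframes-api-0': 'dataframe-head',
#  'recent-bigframes-api-1': 'series-mean'}
```

`create_temp_table` likewise gains keyword-only `schema` and `cluster_columns` arguments, so callers can pre-create a clustered destination table instead of a bare one.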
diff --git a/bigframes/version.py b/bigframes/version.py
index 0a5df27479..5a94f72649 100644
--- a/bigframes/version.py
+++ b/bigframes/version.py
@@ -12,4 +12,4 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-__version__ = "0.13.0"
+__version__ = "0.14.0"
diff --git a/notebooks/generative_ai/bq_dataframes_llm_kmeans.ipynb b/notebooks/generative_ai/bq_dataframes_llm_kmeans.ipynb
new file mode 100644
index 0000000000..46c4955288
--- /dev/null
+++ b/notebooks/generative_ai/bq_dataframes_llm_kmeans.ipynb
@@ -0,0 +1,690 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Copyright 2023 Google LLC\n",
+ "#\n",
+ "# Licensed under the Apache License, Version 2.0 (the \"License\");\n",
+ "# you may not use this file except in compliance with the License.\n",
+ "# You may obtain a copy of the License at\n",
+ "#\n",
+ "# https://www.apache.org/licenses/LICENSE-2.0\n",
+ "#\n",
+ "# Unless required by applicable law or agreed to in writing, software\n",
+ "# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
+ "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
+ "# See the License for the specific language governing permissions and\n",
+ "# limitations under the License."
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Use BigQuery DataFrames to cluster and characterize complaints\n",
+ "\n",
+    "\n",
+    "Run in Colab | View on GitHub | Open in Vertex AI Workbench"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Overview\n",
+ "\n",
+ "The goal of this notebook is to demonstrate a comment characterization algorithm for an online business. We will accomplish this using [Google's PaLM 2](https://ai.google/discover/palm2/) and [KMeans clustering](https://en.wikipedia.org/wiki/K-means_clustering) in three steps:\n",
+ "\n",
+ "1. Use PaLM2TextEmbeddingGenerator to [generate text embeddings](https://cloud.google.com/vertex-ai/docs/generative-ai/embeddings/get-text-embeddings) for each of 10000 complaints sent to an online bank. If you're not familiar with what a text embedding is, it's a list of numbers that are like coordinates in an imaginary \"meaning space\" for sentences. (It's like [word embeddings](https://en.wikipedia.org/wiki/Word_embedding), but for more general text.) The important point for our purposes is that similar sentences are close to each other in this imaginary space.\n",
+ "2. Use KMeans clustering to group together complaints whose text embeddings are near to eachother. This will give us sets of similar complaints, but we don't yet know _why_ these complaints are similar.\n",
+ "3. Prompt PaLM2TextGenerator in English asking what the difference is between the groups of complaints that we got. Thanks to the power of modern LLMs, the response might give us a very good idea of what these complaints are all about, but remember to [\"understand the limits of your dataset and model.\"](https://ai.google/responsibility/responsible-ai-practices/#:~:text=Understand%20the%20limitations%20of%20your%20dataset%20and%20model)\n",
+ "\n",
+ "We will tie these pieces together in Python using BigQuery DataFrames. [Click here](https://cloud.google.com/bigquery/docs/dataframes-quickstart) to learn more about BigQuery DataFrames!"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Dataset\n",
+ "\n",
+ "This notebook uses the [CFPB Consumer Complaint Database](https://console.cloud.google.com/marketplace/product/cfpb/complaint-database)."
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Costs\n",
+ "\n",
+ "This tutorial uses billable components of Google Cloud:\n",
+ "\n",
+ "* BigQuery (compute)\n",
+ "* BigQuery ML\n",
+ "* Generative AI support on Vertex AI\n",
+ "\n",
+ "Learn about [BigQuery compute pricing](https://cloud.google.com/bigquery/pricing#analysis_pricing_models), [Generative AI support on Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing#generative_ai_models),\n",
+ "and [BigQuery ML pricing](https://cloud.google.com/bigquery/pricing#bqml),\n",
+ "and use the [Pricing Calculator](https://cloud.google.com/products/calculator/)\n",
+ "to generate a cost estimate based on your projected usage."
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Before you begin\n",
+ "\n",
+ "Complete the tasks in this section to set up your environment."
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Set up your Google Cloud project\n",
+ "\n",
+ "**The following steps are required, regardless of your notebook environment.**\n",
+ "\n",
+ "1. [Select or create a Google Cloud project](https://console.cloud.google.com/cloud-resource-manager). When you first create an account, you get a $300 credit towards your compute/storage costs.\n",
+ "\n",
+ "2. [Make sure that billing is enabled for your project](https://cloud.google.com/billing/docs/how-to/modify-project).\n",
+ "\n",
+ "3. [Click here](https://console.cloud.google.com/flows/enableapi?apiid=bigquery.googleapis.com,bigqueryconnection.googleapis.com,run.googleapis.com,artifactregistry.googleapis.com,cloudbuild.googleapis.com,cloudresourcemanager.googleapis.com) to enable the following APIs:\n",
+ "\n",
+ " * BigQuery API\n",
+ " * BigQuery Connection API\n",
+ " * Cloud Run API\n",
+ " * Artifact Registry API\n",
+ " * Cloud Build API\n",
+ " * Cloud Resource Manager API\n",
+ " * Vertex AI API\n",
+ "\n",
+ "4. If you are running this notebook locally, install the [Cloud SDK](https://cloud.google.com/sdk)."
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Set your project ID\n",
+ "\n",
+ "**If you don't know your project ID**, see the support page: [Locate the project ID](https://support.google.com/googleapi/answer/7014113)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# set your project ID below\n",
+ "PROJECT_ID = \"\" # @param {type:\"string\"}\n",
+ "\n",
+ "# Set the project id in gcloud\n",
+ "! gcloud config set project {PROJECT_ID}"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Set the region\n",
+ "\n",
+ "You can also change the `REGION` variable used by BigQuery. Learn more about [BigQuery regions](https://cloud.google.com/bigquery/docs/locations#supported_locations)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "REGION = \"US\" # @param {type: \"string\"}"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Authenticate your Google Cloud account\n",
+ "\n",
+ "Depending on your Jupyter environment, you might have to manually authenticate. Follow the relevant instructions below."
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**Vertex AI Workbench**\n",
+ "\n",
+ "Do nothing, you are already authenticated."
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**Local JupyterLab instance**\n",
+ "\n",
+ "Uncomment and run the following cell:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# ! gcloud auth login"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**Colab**\n",
+ "\n",
+ "Uncomment and run the following cell:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# from google.colab import auth\n",
+ "# auth.authenticate_user()"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "If you want to reset the location of the created DataFrame or Series objects, reset the session by executing `bf.close_session()`. After that, you can reuse `bf.options.bigquery.location` to specify another location."
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Connect to Vertex AI\n",
+ "\n",
+ "In order to use PaLM2TextGenerator, we will need to set up a [cloud resource connection](https://cloud.google.com/bigquery/docs/create-cloud-resource-connection)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from google.cloud import bigquery_connection_v1 as bq_connection\n",
+ "\n",
+ "CONN_NAME = \"bqdf-llm\"\n",
+ "\n",
+ "client = bq_connection.ConnectionServiceClient()\n",
+ "new_conn_parent = f\"projects/{PROJECT_ID}/locations/{REGION}\"\n",
+ "exists_conn_parent = f\"projects/{PROJECT_ID}/locations/{REGION}/connections/{CONN_NAME}\"\n",
+ "cloud_resource_properties = bq_connection.CloudResourceProperties({})\n",
+ "\n",
+ "try:\n",
+ " request = client.get_connection(\n",
+ " request=bq_connection.GetConnectionRequest(name=exists_conn_parent)\n",
+ " )\n",
+ " CONN_SERVICE_ACCOUNT = f\"serviceAccount:{request.cloud_resource.service_account_id}\"\n",
+ "except Exception:\n",
+ " connection = bq_connection.types.Connection(\n",
+ " {\"friendly_name\": CONN_NAME, \"cloud_resource\": cloud_resource_properties}\n",
+ " )\n",
+ " request = bq_connection.CreateConnectionRequest(\n",
+ " {\n",
+ " \"parent\": new_conn_parent,\n",
+ " \"connection_id\": CONN_NAME,\n",
+ " \"connection\": connection,\n",
+ " }\n",
+ " )\n",
+ " response = client.create_connection(request)\n",
+ " CONN_SERVICE_ACCOUNT = (\n",
+ " f\"serviceAccount:{response.cloud_resource.service_account_id}\"\n",
+ " )\n",
+ "print(CONN_SERVICE_ACCOUNT)"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Set permissions for the service account\n",
+ "\n",
+ "The resource connection service account requires certain project-level permissions:\n",
+ " - `roles/aiplatform.user` and `roles/bigquery.connectionUser`: These roles are required for the connection to create a model definition using the LLM model in Vertex AI ([documentation](https://cloud.google.com/bigquery/docs/generate-text#give_the_service_account_access)).\n",
+ " - `roles/run.invoker`: This role is required for the connection to have read-only access to Cloud Run services that back custom/remote functions ([documentation](https://cloud.google.com/bigquery/docs/remote-functions#grant_permission_on_function)).\n",
+ "\n",
+ "Set these permissions by running the following `gcloud` commands:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!gcloud projects add-iam-policy-binding {PROJECT_ID} --condition=None --no-user-output-enabled --member={CONN_SERVICE_ACCOUNT} --role='roles/bigquery.connectionUser'\n",
+ "!gcloud projects add-iam-policy-binding {PROJECT_ID} --condition=None --no-user-output-enabled --member={CONN_SERVICE_ACCOUNT} --role='roles/aiplatform.user'\n",
+ "!gcloud projects add-iam-policy-binding {PROJECT_ID} --condition=None --no-user-output-enabled --member={CONN_SERVICE_ACCOUNT} --role='roles/run.invoker'"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Now we are ready to use BigQuery DataFrames!"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "xckgWno6ouHY"
+ },
+ "source": [
+ "## Step 1: Text embedding "
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Project Setup"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "R7STCS8xB5d2"
+ },
+ "outputs": [],
+ "source": [
+ "import bigframes.pandas as bf\n",
+ "\n",
+ "bf.options.bigquery.project = PROJECT_ID\n",
+ "bf.options.bigquery.location = REGION"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "v6FGschEowht"
+ },
+ "source": [
+ "Data Input - read the data from a publicly available BigQuery dataset"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "zDSwoBo1CU3G"
+ },
+ "outputs": [],
+ "source": [
+ "input_df = bf.read_gbq(\"bigquery-public-data.cfpb_complaints.complaint_database\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "tYDoaKgJChiq"
+ },
+ "outputs": [],
+ "source": [
+ "issues_df = input_df[[\"consumer_complaint_narrative\"]].dropna()\n",
+ "issues_df.head(n=5) # View the first five complaints"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Download 10000 complaints to use with PaLM2TextEmbeddingGenerator"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "OltYSUEcsSOW"
+ },
+ "outputs": [],
+ "source": [
+ "# Choose 10,000 complaints randomly and store them in a column in a DataFrame\n",
+ "downsampled_issues_df = issues_df.sample(n=10000)"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "Wl2o-NYMoygb"
+ },
+ "source": [
+ "Generate the text embeddings"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "li38q8FzDDMu"
+ },
+ "outputs": [],
+ "source": [
+ "from bigframes.ml.llm import PaLM2TextEmbeddingGenerator\n",
+ "\n",
+ "model = PaLM2TextEmbeddingGenerator() # No connection id needed"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "cOuSOQ5FDewD"
+ },
+ "outputs": [],
+ "source": [
+ "# Will take ~3 minutes to compute the embeddings\n",
+ "predicted_embeddings = model.predict(downsampled_issues_df)\n",
+ "# Notice the lists of numbers that are our text embeddings for each complaint\n",
+ "predicted_embeddings.head() "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "4H_etYfsEOFP"
+ },
+ "outputs": [],
+ "source": [
+ "# Join the complaints with their embeddings in the same DataFrame\n",
+ "combined_df = downsampled_issues_df.join(predicted_embeddings)"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "We now have the complaints and their text embeddings as two columns in our combined_df. Recall that complaints with numerically similar text embeddings should have similar meanings semantically. We will now group similar complaints together."
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "OUZ3NNbzo1Tb"
+ },
+ "source": [
+ "## Step 2: KMeans clustering"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "AhNTnEC5FRz2"
+ },
+ "outputs": [],
+ "source": [
+ "from bigframes.ml.cluster import KMeans\n",
+ "\n",
+ "cluster_model = KMeans(n_clusters=10) # We will divide our complaints into 10 groups"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Perform KMeans clustering"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "6poSxh-fGJF7"
+ },
+ "outputs": [],
+ "source": [
+ "# Use KMeans clustering to calculate our groups. Will take ~3 minutes.\n",
+ "cluster_model.fit(combined_df[[\"text_embedding\"]])\n",
+ "clustered_result = cluster_model.predict(combined_df[[\"text_embedding\"]])\n",
+ "# Notice the CENTROID_ID column, which is the ID number of the group that\n",
+ "# each complaint belongs to.\n",
+ "clustered_result.head(n=5)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Join the group number to the complaints and their text embeddings\n",
+ "combined_clustered_result = combined_df.join(clustered_result)"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Our dataframe combined_clustered_result now has three columns: the complaints, their text embeddings, and an ID from 1-10 (inclusive) indicating which semantically similar group they belong to."
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "21rNsFMHo8hO"
+ },
+ "source": [
+ "## Step 3: Summarize the complaints"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Build prompts - we will choose just two of our categories and prompt PaLM2TextGenerator to identify their salient characteristics. The prompt is natural language in a python string."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "2E7wXM_jGqo6"
+ },
+ "outputs": [],
+ "source": [
+ "# Using bigframes, with syntax identical to pandas,\n",
+ "# filter out the first and second groups\n",
+ "cluster_1_result = combined_clustered_result[\n",
+ " combined_clustered_result[\"CENTROID_ID\"] == 1\n",
+ "][[\"consumer_complaint_narrative\"]]\n",
+ "cluster_1_result_pandas = cluster_1_result.head(5).to_pandas()\n",
+ "\n",
+ "cluster_2_result = combined_clustered_result[\n",
+ " combined_clustered_result[\"CENTROID_ID\"] == 2\n",
+ "][[\"consumer_complaint_narrative\"]]\n",
+ "cluster_2_result_pandas = cluster_2_result.head(5).to_pandas()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "ZNDiueI9IP5e"
+ },
+ "outputs": [],
+ "source": [
+ "# Build plain-text prompts to send to PaLM 2. Use only 5 complaints from each group.\n",
+ "prompt1 = 'comment list 1:\\n'\n",
+ "for i in range(5):\n",
+ " prompt1 += str(i + 1) + '. ' + \\\n",
+ " cluster_1_result_pandas[\"consumer_complaint_narrative\"].iloc[i] + '\\n'\n",
+ "\n",
+ "prompt2 = 'comment list 2:\\n'\n",
+ "for i in range(5):\n",
+ " prompt2 += str(i + 1) + '. ' + \\\n",
+ " cluster_2_result_pandas[\"consumer_complaint_narrative\"].iloc[i] + '\\n'\n",
+ "\n",
+ "print(prompt1)\n",
+ "print(prompt2)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "BfHGJLirzSvH"
+ },
+ "outputs": [],
+ "source": [
+ "# The plain English request we will make of PaLM 2\n",
+ "prompt = (\n",
+ " \"Please highlight the most obvious difference between\"\n",
+ " \"the two lists of comments:\\n\" + prompt1 + prompt2\n",
+ ")\n",
+ "print(prompt)"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Get a response from PaLM 2 LLM by making a call to Vertex AI using our connection."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "mL5P0_3X04dE"
+ },
+ "outputs": [],
+ "source": [
+ "from bigframes.ml.llm import PaLM2TextGenerator\n",
+ "\n",
+ "session = bf.get_global_session()\n",
+ "connection = f\"{PROJECT_ID}.{REGION}.{CONN_NAME}\"\n",
+ "q_a_model = PaLM2TextGenerator(session=session, connection_name=connection)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "ICWHsqAW1FNk"
+ },
+ "outputs": [],
+ "source": [
+ "# Make a DataFrame containing only a single row with our prompt for PaLM 2\n",
+ "df = bf.DataFrame({\"prompt\": [prompt]})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "gB7e1LXU1pst"
+ },
+ "outputs": [],
+ "source": [
+ "# Send the request for PaLM 2 to generate a response to our prompt\n",
+ "major_difference = q_a_model.predict(df)\n",
+ "# PaLM 2's response is the only row in the dataframe result \n",
+ "major_difference[\"ml_generate_text_llm_result\"].iloc[0]"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "We now see PaLM2TextGenerator's characterization of the different comment groups. Thanks for using BigQuery DataFrames!"
+ ]
+ }
+ ],
+ "metadata": {
+ "colab": {
+ "provenance": []
+ },
+ "kernelspec": {
+ "display_name": "Python 3",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.16"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
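
For reviewers who would rather not read the full JSON, the notebook's three steps condense to roughly the following pipeline (a sketch assembled from the cells above; the project ID is a placeholder and the connection/prompting setup is omitted):

```python
import bigframes.pandas as bf
from bigframes.ml.cluster import KMeans
from bigframes.ml.llm import PaLM2TextEmbeddingGenerator

bf.options.bigquery.project = "your-project-id"  # placeholder
bf.options.bigquery.location = "US"

# Step 1: embed a random sample of complaints.
complaints = (
    bf.read_gbq("bigquery-public-data.cfpb_complaints.complaint_database")[
        ["consumer_complaint_narrative"]
    ]
    .dropna()
    .sample(n=10000)
)
embeddings = PaLM2TextEmbeddingGenerator().predict(complaints)
combined = complaints.join(embeddings)

# Step 2: group complaints whose embeddings are close together.
cluster_model = KMeans(n_clusters=10)
cluster_model.fit(combined[["text_embedding"]])
clusters = cluster_model.predict(combined[["text_embedding"]])

# Step 3 (not shown): build a plain-English prompt from a few complaints per
# cluster and send it to PaLM2TextGenerator via the BigQuery connection.
```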
diff --git a/noxfile.py b/noxfile.py
index 34b055de44..3dd23ba04f 100644
--- a/noxfile.py
+++ b/noxfile.py
@@ -609,6 +609,7 @@ def notebook(session):
# our test infrastructure.
"notebooks/getting_started/getting_started_bq_dataframes.ipynb",
"notebooks/generative_ai/bq_dataframes_llm_code_generation.ipynb",
+ "notebooks/generative_ai/bq_dataframes_llm_kmeans.ipynb",
"notebooks/regression/bq_dataframes_ml_linear_regression.ipynb",
"notebooks/generative_ai/bq_dataframes_ml_drug_name_generation.ipynb",
"notebooks/vertex_sdk/sdk2_bigframes_pytorch.ipynb",
diff --git a/owlbot.py b/owlbot.py
index be30eea5c2..082970018d 100644
--- a/owlbot.py
+++ b/owlbot.py
@@ -99,6 +99,13 @@
"BigQuery DataFrames provides DataFrame APIs on the BigQuery engine",
)
+# Update the contributing guide to reflect some differences in this repo.
+s.replace(
+ ["CONTRIBUTING.rst"],
+ re.escape("blacken"),
+ "format",
+)
+
# ----------------------------------------------------------------------------
# Samples templates
# ----------------------------------------------------------------------------
@@ -110,5 +117,3 @@
# ----------------------------------------------------------------------------
s.shell.run(["nox", "-s", "format"], hide_output=False)
-for noxfile in REPO_ROOT.glob("samples/**/noxfile.py"):
- s.shell.run(["nox", "-s", "blacken"], cwd=noxfile.parent, hide_output=False)
diff --git a/samples/snippets/pandas_methods_test.py b/samples/snippets/pandas_methods_test.py
index 1f472d6346..bd8e29c003 100644
--- a/samples/snippets/pandas_methods_test.py
+++ b/samples/snippets/pandas_methods_test.py
@@ -22,13 +22,20 @@ def test_bigquery_dataframes_pandas_methods():
bq_df = bpd.read_gbq(query_or_table)
# Inspect one of the columns (or series) of the DataFrame:
- bq_df["body_mass_g"].head(10)
+ bq_df["body_mass_g"]
# Compute the mean of this series:
average_body_mass = bq_df["body_mass_g"].mean()
print(f"average_body_mass: {average_body_mass}")
- # Calculate the mean body_mass_g by species using the groupby operation:
- bq_df["body_mass_g"].groupby(by=bq_df["species"]).mean().head()
+ # Find the heaviest species using the groupby operation to calculate the
+ # mean body_mass_g:
+ (
+ bq_df["body_mass_g"]
+ .groupby(by=bq_df["species"])
+ .mean()
+ .sort_values(ascending=False)
+ .head(10)
+ )
# [END bigquery_dataframes_pandas_methods]
assert average_body_mass is not None
diff --git a/tests/system/large/ml/test_cluster.py b/tests/system/large/ml/test_cluster.py
index eae6896669..f01116665f 100644
--- a/tests/system/large/ml/test_cluster.py
+++ b/tests/system/large/ml/test_cluster.py
@@ -16,7 +16,7 @@
import pytest
from bigframes.ml import cluster
-from tests.system.utils import assert_pandas_df_equal_ignore_ordering
+from tests.system.utils import assert_pandas_df_equal
@pytest.mark.flaky(retries=2, delay=120)
@@ -105,7 +105,7 @@ def test_cluster_configure_fit_score_predict(
index=pd.Index(["test1", "test2", "test3", "test4"], dtype="string[pyarrow]"),
)
expected.index.name = "observation"
- assert_pandas_df_equal_ignore_ordering(result, expected)
+ assert_pandas_df_equal(result, expected, ignore_order=True)
# save, load, check n_clusters to ensure configuration was kept
reloaded_model = model.to_gbq(
diff --git a/tests/system/large/ml/test_pipeline.py b/tests/system/large/ml/test_pipeline.py
index 6874a9f301..3e56954058 100644
--- a/tests/system/large/ml/test_pipeline.py
+++ b/tests/system/large/ml/test_pipeline.py
@@ -24,7 +24,7 @@
pipeline,
preprocessing,
)
-from tests.system.utils import assert_pandas_df_equal_ignore_ordering
+from tests.system.utils import assert_pandas_df_equal
def test_pipeline_linear_regression_fit_score_predict(
@@ -555,7 +555,7 @@ def test_pipeline_standard_scaler_kmeans_fit_score_predict(
),
)
expected.index.name = "observation"
- assert_pandas_df_equal_ignore_ordering(result, expected)
+ assert_pandas_df_equal(result, expected, ignore_order=True)
def test_pipeline_columntransformer_fit_predict(session, penguins_df_default_index):
diff --git a/tests/system/large/test_remote_function.py b/tests/system/large/test_remote_function.py
index c8f8f66eba..6ed3e6511a 100644
--- a/tests/system/large/test_remote_function.py
+++ b/tests/system/large/test_remote_function.py
@@ -32,7 +32,7 @@
get_cloud_function_name,
get_remote_function_locations,
)
-from tests.system.utils import assert_pandas_df_equal_ignore_ordering
+from tests.system.utils import assert_pandas_df_equal
# Use this to control the number of cloud functions being deleted in a single
# test session. This should help soften the spike of the number of mutations per
@@ -357,7 +357,7 @@ def square(x):
pd_result_col = pd_result_col.astype(pandas.Int64Dtype())
pd_result = pd_int64_col_filtered.to_frame().assign(result=pd_result_col)
- assert_pandas_df_equal_ignore_ordering(bf_result, pd_result)
+ assert_pandas_df_equal(bf_result, pd_result)
finally:
# clean up the gcp assets created for the remote function
cleanup_remote_function_assets(session.bqclient, functions_client, square)
@@ -401,7 +401,7 @@ def add_one(x):
pd_result_col = pd_result_col.astype(pandas.Int64Dtype())
pd_result = pd_int64_col_filtered.to_frame().assign(result=pd_result_col)
- assert_pandas_df_equal_ignore_ordering(bf_result, pd_result)
+ assert_pandas_df_equal(bf_result, pd_result)
finally:
# clean up the gcp assets created for the remote function
cleanup_remote_function_assets(
@@ -446,7 +446,7 @@ def square(x):
pd_result_col = pd_result_col.astype(pandas.Int64Dtype())
pd_result = pd_int64_col_filtered.to_frame().assign(result=pd_result_col)
- assert_pandas_df_equal_ignore_ordering(bf_result, pd_result)
+ assert_pandas_df_equal(bf_result, pd_result)
finally:
# clean up the gcp assets created for the remote function
cleanup_remote_function_assets(session.bqclient, functions_client, square)
@@ -497,7 +497,7 @@ def sign(num):
pd_result_col = pd_result_col.astype(pandas.Int64Dtype())
pd_result = pd_int64_col_filtered.to_frame().assign(result=pd_result_col)
- assert_pandas_df_equal_ignore_ordering(bf_result, pd_result)
+ assert_pandas_df_equal(bf_result, pd_result)
finally:
# clean up the gcp assets created for the remote function
cleanup_remote_function_assets(session.bqclient, functions_client, remote_sign)
@@ -542,7 +542,7 @@ def circumference(radius):
pd_result_col = pd_result_col.astype(pandas.Float64Dtype())
pd_result = pd_float64_col_filtered.to_frame().assign(result=pd_result_col)
- assert_pandas_df_equal_ignore_ordering(bf_result, pd_result)
+ assert_pandas_df_equal(bf_result, pd_result)
finally:
# clean up the gcp assets created for the remote function
cleanup_remote_function_assets(
@@ -591,7 +591,7 @@ def find_team(num):
pd_result_col = pd_result_col.astype(pandas.StringDtype(storage="pyarrow"))
pd_result = pd_float64_col_filtered.to_frame().assign(result=pd_result_col)
- assert_pandas_df_equal_ignore_ordering(bf_result, pd_result)
+ assert_pandas_df_equal(bf_result, pd_result)
finally:
# clean up the gcp assets created for the remote function
cleanup_remote_function_assets(
@@ -675,7 +675,7 @@ def inner_test():
pd_result_col = pd_result_col.astype(pandas.Int64Dtype())
pd_result = pd_int64_col_filtered.to_frame().assign(result=pd_result_col)
- assert_pandas_df_equal_ignore_ordering(bf_result, pd_result)
+ assert_pandas_df_equal(bf_result, pd_result)
# Test that the remote function works as expected
inner_test()
@@ -765,7 +765,7 @@ def is_odd(num):
pd_result_col = pd_int64_col.mask(is_odd)
pd_result = pd_int64_col.to_frame().assign(result=pd_result_col)
- assert_pandas_df_equal_ignore_ordering(bf_result, pd_result)
+ assert_pandas_df_equal(bf_result, pd_result)
finally:
# clean up the gcp assets created for the remote function
cleanup_remote_function_assets(
@@ -808,7 +808,7 @@ def is_odd(num):
pd_result_col = pd_int64_col[pd_int64_col.notnull()].mask(is_odd, -1)
pd_result = pd_int64_col.to_frame().assign(result=pd_result_col)
- assert_pandas_df_equal_ignore_ordering(bf_result, pd_result)
+ assert_pandas_df_equal(bf_result, pd_result)
finally:
# clean up the gcp assets created for the remote function
cleanup_remote_function_assets(
@@ -852,7 +852,7 @@ def test_remote_udf_lambda(
pd_result_col = pd_result_col.astype(pandas.Int64Dtype())
pd_result = pd_int64_col_filtered.to_frame().assign(result=pd_result_col)
- assert_pandas_df_equal_ignore_ordering(bf_result, pd_result)
+ assert_pandas_df_equal(bf_result, pd_result)
finally:
# clean up the gcp assets created for the remote function
cleanup_remote_function_assets(
@@ -909,7 +909,7 @@ def square(x):
pd_result_col = pd_result_col.astype(pandas.Int64Dtype())
pd_result = pd_int64_col.to_frame().assign(result=pd_result_col)
- assert_pandas_df_equal_ignore_ordering(bf_result, pd_result)
+ assert_pandas_df_equal(bf_result, pd_result)
finally:
# clean up the gcp assets created for the remote function
cleanup_remote_function_assets(
@@ -954,7 +954,7 @@ def pd_np_foo(x):
# comparing for the purpose of this test
pd_result.result = pd_result.result.astype(pandas.Float64Dtype())
- assert_pandas_df_equal_ignore_ordering(bf_result, pd_result)
+ assert_pandas_df_equal(bf_result, pd_result)
finally:
# clean up the gcp assets created for the remote function
cleanup_remote_function_assets(
@@ -998,7 +998,7 @@ def test_internal(rf, udf):
pd_result_col = pd_result_col.astype(pandas.Int64Dtype())
pd_result = pd_int64_col.to_frame().assign(result=pd_result_col)
- assert_pandas_df_equal_ignore_ordering(bf_result, pd_result)
+ assert_pandas_df_equal(bf_result, pd_result)
# Create an explicit name for the remote function
prefixer = test_utils.prefixer.Prefixer("foo", "")
@@ -1167,7 +1167,7 @@ def square(x):
pd_result_col = pd_result_col.astype(pandas.Int64Dtype())
pd_result = pd_int64_col_filtered.to_frame().assign(result=pd_result_col)
- assert_pandas_df_equal_ignore_ordering(bf_result, pd_result)
+ assert_pandas_df_equal(bf_result, pd_result)
finally:
# clean up the gcp assets created for the remote function
cleanup_remote_function_assets(
@@ -1204,7 +1204,7 @@ def square(x):
pd_result_col = pd_result_col.astype(pandas.Int64Dtype())
pd_result = pd_int64_col_filtered.to_frame().assign(result=pd_result_col)
- assert_pandas_df_equal_ignore_ordering(bf_result, pd_result)
+ assert_pandas_df_equal(bf_result, pd_result)
finally:
# clean up the gcp assets created for the remote function
cleanup_remote_function_assets(
diff --git a/tests/system/small/ml/test_cluster.py b/tests/system/small/ml/test_cluster.py
index d95a1e1bc2..266a38e3ee 100644
--- a/tests/system/small/ml/test_cluster.py
+++ b/tests/system/small/ml/test_cluster.py
@@ -15,7 +15,7 @@
import pandas as pd
from bigframes.ml import cluster
-from tests.system.utils import assert_pandas_df_equal_ignore_ordering
+from tests.system.utils import assert_pandas_df_equal
_PD_NEW_PENGUINS = pd.DataFrame.from_dict(
{
@@ -68,7 +68,7 @@ def test_kmeans_predict(session, penguins_kmeans_model: cluster.KMeans):
dtype="Int64",
index=pd.Index(["test1", "test2", "test3", "test4"], dtype="string[pyarrow]"),
)
- assert_pandas_df_equal_ignore_ordering(result, expected)
+ assert_pandas_df_equal(result, expected, ignore_order=True)
def test_kmeans_score(session, penguins_kmeans_model: cluster.KMeans):
@@ -89,59 +89,67 @@ def test_kmeans_score(session, penguins_kmeans_model: cluster.KMeans):
def test_kmeans_cluster_centers(penguins_kmeans_model: cluster.KMeans):
- result = penguins_kmeans_model.cluster_centers_.to_pandas()
- expected = pd.DataFrame(
- {
- "centroid_id": [1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3],
- "feature": [
- "culmen_length_mm",
- "culmen_depth_mm",
- "flipper_length_mm",
- "sex",
- ]
- * 3,
- "numerical_value": [
- 47.509677,
- 14.993548,
- 217.040123,
- pd.NA,
- 38.207813,
- 18.03125,
- 187.992188,
- pd.NA,
- 47.036346,
- 18.834808,
- 197.1612,
- pd.NA,
- ],
- "categorical_value": [
- [],
- [],
- [],
- [
- {"category": ".", "value": 0.008064516129032258},
- {"category": "MALE", "value": 0.49193548387096775},
- {"category": "FEMALE", "value": 0.47580645161290325},
- {"category": "_null_filler", "value": 0.024193548387096774},
- ],
- [],
- [],
- [],
- [
- {"category": "MALE", "value": 0.34375},
- {"category": "FEMALE", "value": 0.625},
- {"category": "_null_filler", "value": 0.03125},
+ result = (
+ penguins_kmeans_model.cluster_centers_.to_pandas()
+ .sort_values(["centroid_id", "feature"])
+ .reset_index(drop=True)
+ )
+ expected = (
+ pd.DataFrame(
+ {
+ "centroid_id": [1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3],
+ "feature": [
+ "culmen_length_mm",
+ "culmen_depth_mm",
+ "flipper_length_mm",
+ "sex",
+ ]
+ * 3,
+ "numerical_value": [
+ 47.509677,
+ 14.993548,
+ 217.040123,
+ pd.NA,
+ 38.207813,
+ 18.03125,
+ 187.992188,
+ pd.NA,
+ 47.036346,
+ 18.834808,
+ 197.1612,
+ pd.NA,
],
- [],
- [],
- [],
- [
- {"category": "MALE", "value": 0.6847826086956522},
- {"category": "FEMALE", "value": 0.2826086956521739},
- {"category": "_null_filler", "value": 0.03260869565217391},
+ "categorical_value": [
+ [],
+ [],
+ [],
+ [
+ {"category": ".", "value": 0.008064516129032258},
+ {"category": "MALE", "value": 0.49193548387096775},
+ {"category": "FEMALE", "value": 0.47580645161290325},
+ {"category": "_null_filler", "value": 0.024193548387096774},
+ ],
+ [],
+ [],
+ [],
+ [
+ {"category": "MALE", "value": 0.34375},
+ {"category": "FEMALE", "value": 0.625},
+ {"category": "_null_filler", "value": 0.03125},
+ ],
+ [],
+ [],
+ [],
+ [
+ {"category": "MALE", "value": 0.6847826086956522},
+ {"category": "FEMALE", "value": 0.2826086956521739},
+ {"category": "_null_filler", "value": 0.03260869565217391},
+ ],
],
- ],
- },
+ },
+ )
+ .sort_values(["centroid_id", "feature"])
+ .reset_index(drop=True)
)
pd.testing.assert_frame_equal(
result,
diff --git a/tests/system/small/ml/test_core.py b/tests/system/small/ml/test_core.py
index f911dd7eeb..be34a4871c 100644
--- a/tests/system/small/ml/test_core.py
+++ b/tests/system/small/ml/test_core.py
@@ -78,58 +78,62 @@ def test_model_eval_with_data(penguins_bqml_linear_model, penguins_df_default_in
def test_model_centroids(penguins_bqml_kmeans_model: core.BqmlModel):
result = penguins_bqml_kmeans_model.centroids().to_pandas()
- expected = pd.DataFrame(
- {
- "centroid_id": [1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3],
- "feature": [
- "culmen_length_mm",
- "culmen_depth_mm",
- "flipper_length_mm",
- "sex",
- ]
- * 3,
- "numerical_value": [
- 47.509677,
- 14.993548,
- 217.040123,
- pd.NA,
- 38.207813,
- 18.03125,
- 187.992188,
- pd.NA,
- 47.036346,
- 18.834808,
- 197.1612,
- pd.NA,
- ],
- "categorical_value": [
- [],
- [],
- [],
- [
- {"category": ".", "value": 0.008064516129032258},
- {"category": "MALE", "value": 0.49193548387096775},
- {"category": "FEMALE", "value": 0.47580645161290325},
- {"category": "_null_filler", "value": 0.024193548387096774},
+ expected = (
+ pd.DataFrame(
+ {
+ "centroid_id": [1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3],
+ "feature": [
+ "culmen_length_mm",
+ "culmen_depth_mm",
+ "flipper_length_mm",
+ "sex",
+ ]
+ * 3,
+ "numerical_value": [
+ 47.509677,
+ 14.993548,
+ 217.040123,
+ pd.NA,
+ 38.207813,
+ 18.03125,
+ 187.992188,
+ pd.NA,
+ 47.036346,
+ 18.834808,
+ 197.1612,
+ pd.NA,
],
- [],
- [],
- [],
- [
- {"category": "MALE", "value": 0.34375},
- {"category": "FEMALE", "value": 0.625},
- {"category": "_null_filler", "value": 0.03125},
+ "categorical_value": [
+ [],
+ [],
+ [],
+ [
+ {"category": ".", "value": 0.008064516129032258},
+ {"category": "MALE", "value": 0.49193548387096775},
+ {"category": "FEMALE", "value": 0.47580645161290325},
+ {"category": "_null_filler", "value": 0.024193548387096774},
+ ],
+ [],
+ [],
+ [],
+ [
+ {"category": "MALE", "value": 0.34375},
+ {"category": "FEMALE", "value": 0.625},
+ {"category": "_null_filler", "value": 0.03125},
+ ],
+ [],
+ [],
+ [],
+ [
+ {"category": "MALE", "value": 0.6847826086956522},
+ {"category": "FEMALE", "value": 0.2826086956521739},
+ {"category": "_null_filler", "value": 0.03260869565217391},
+ ],
],
- [],
- [],
- [],
- [
- {"category": "MALE", "value": 0.6847826086956522},
- {"category": "FEMALE", "value": 0.2826086956521739},
- {"category": "_null_filler", "value": 0.03260869565217391},
- ],
- ],
- },
+ },
+ )
+ .sort_values(["centroid_id", "feature"])
+ .reset_index(drop=True)
)
pd.testing.assert_frame_equal(
result,
@@ -148,59 +152,63 @@ def test_pca_model_principal_components(penguins_bqml_pca_model: core.BqmlModel)
# result is too long, only check the first principal component here.
result = result.head(7)
- expected = pd.DataFrame(
- {
- "principal_component_id": [0] * 7,
- "feature": [
- "species",
- "island",
- "culmen_length_mm",
- "culmen_depth_mm",
- "flipper_length_mm",
- "body_mass_g",
- "sex",
- ],
- "numerical_value": [
- pd.NA,
- pd.NA,
- 0.401489,
- -0.377482,
- 0.524052,
- 0.501174,
- pd.NA,
- ],
- "categorical_value": [
- [
- {
- "category": "Gentoo penguin (Pygoscelis papua)",
- "value": 0.25068877125667804,
- },
- {
- "category": "Adelie Penguin (Pygoscelis adeliae)",
- "value": -0.20622291900416198,
- },
- {
- "category": "Chinstrap penguin (Pygoscelis antarctica)",
- "value": -0.030161149275185855,
- },
+ expected = (
+ pd.DataFrame(
+ {
+ "principal_component_id": [0] * 7,
+ "feature": [
+ "species",
+ "island",
+ "culmen_length_mm",
+ "culmen_depth_mm",
+ "flipper_length_mm",
+ "body_mass_g",
+ "sex",
],
- [
- {"category": "Biscoe", "value": 0.19761120114410635},
- {"category": "Dream", "value": -0.11264736305259061},
- {"category": "Torgersen", "value": -0.07065913511418596},
+ "numerical_value": [
+ pd.NA,
+ pd.NA,
+ 0.401489,
+ -0.377482,
+ 0.524052,
+ 0.501174,
+ pd.NA,
],
- [],
- [],
- [],
- [],
- [
- {"category": ".", "value": 0.0015916894448071784},
- {"category": "MALE", "value": 0.06869704739750442},
- {"category": "FEMALE", "value": -0.052521171596813174},
- {"category": "_null_filler", "value": -0.0034628622681684906},
+ "categorical_value": [
+ [
+ {
+ "category": "Gentoo penguin (Pygoscelis papua)",
+ "value": 0.25068877125667804,
+ },
+ {
+ "category": "Adelie Penguin (Pygoscelis adeliae)",
+ "value": -0.20622291900416198,
+ },
+ {
+ "category": "Chinstrap penguin (Pygoscelis antarctica)",
+ "value": -0.030161149275185855,
+ },
+ ],
+ [
+ {"category": "Biscoe", "value": 0.19761120114410635},
+ {"category": "Dream", "value": -0.11264736305259061},
+ {"category": "Torgersen", "value": -0.07065913511418596},
+ ],
+ [],
+ [],
+ [],
+ [],
+ [
+ {"category": ".", "value": 0.0015916894448071784},
+ {"category": "MALE", "value": 0.06869704739750442},
+ {"category": "FEMALE", "value": -0.052521171596813174},
+ {"category": "_null_filler", "value": -0.0034628622681684906},
+ ],
],
- ],
- },
+ },
+ )
+ .sort_values(["principal_component_id", "feature"])
+ .reset_index(drop=True)
)
pd.testing.assert_frame_equal(
result,
@@ -225,7 +233,7 @@ def test_pca_model_principal_component_info(penguins_bqml_pca_model: core.BqmlMo
"cumulative_explained_variance_ratio": [0.469357, 0.651283, 0.812383],
},
)
- tests.system.utils.assert_pandas_df_equal_ignore_ordering(
+ tests.system.utils.assert_pandas_df_equal(
result,
expected,
check_exact=False,
@@ -233,6 +241,7 @@ def test_pca_model_principal_component_info(penguins_bqml_pca_model: core.BqmlMo
# int64 Index by default in pandas versus Int64 (nullable) Index in BigQuery DataFrame
check_index_type=False,
check_dtype=False,
+ ignore_order=True,
)
diff --git a/tests/system/small/ml/test_decomposition.py b/tests/system/small/ml/test_decomposition.py
index e31681f4a0..42fea66cf8 100644
--- a/tests/system/small/ml/test_decomposition.py
+++ b/tests/system/small/ml/test_decomposition.py
@@ -57,59 +57,63 @@ def test_pca_components_(penguins_pca_model: decomposition.PCA):
# result is too long, only check the first principal component here.
result = result.head(7)
- expected = pd.DataFrame(
- {
- "principal_component_id": [0] * 7,
- "feature": [
- "species",
- "island",
- "culmen_length_mm",
- "culmen_depth_mm",
- "flipper_length_mm",
- "body_mass_g",
- "sex",
- ],
- "numerical_value": [
- pd.NA,
- pd.NA,
- 0.401489,
- -0.377482,
- 0.524052,
- 0.501174,
- pd.NA,
- ],
- "categorical_value": [
- [
- {
- "category": "Gentoo penguin (Pygoscelis papua)",
- "value": 0.25068877125667804,
- },
- {
- "category": "Adelie Penguin (Pygoscelis adeliae)",
- "value": -0.20622291900416198,
- },
- {
- "category": "Chinstrap penguin (Pygoscelis antarctica)",
- "value": -0.030161149275185855,
- },
+ expected = (
+ pd.DataFrame(
+ {
+ "principal_component_id": [0] * 7,
+ "feature": [
+ "species",
+ "island",
+ "culmen_length_mm",
+ "culmen_depth_mm",
+ "flipper_length_mm",
+ "body_mass_g",
+ "sex",
],
- [
- {"category": "Biscoe", "value": 0.19761120114410635},
- {"category": "Dream", "value": -0.11264736305259061},
- {"category": "Torgersen", "value": -0.07065913511418596},
+ "numerical_value": [
+ pd.NA,
+ pd.NA,
+ 0.401489,
+ -0.377482,
+ 0.524052,
+ 0.501174,
+ pd.NA,
],
- [],
- [],
- [],
- [],
- [
- {"category": ".", "value": 0.0015916894448071784},
- {"category": "MALE", "value": 0.06869704739750442},
- {"category": "FEMALE", "value": -0.052521171596813174},
- {"category": "_null_filler", "value": -0.0034628622681684906},
+ "categorical_value": [
+ [
+ {
+ "category": "Gentoo penguin (Pygoscelis papua)",
+ "value": 0.25068877125667804,
+ },
+ {
+ "category": "Adelie Penguin (Pygoscelis adeliae)",
+ "value": -0.20622291900416198,
+ },
+ {
+ "category": "Chinstrap penguin (Pygoscelis antarctica)",
+ "value": -0.030161149275185855,
+ },
+ ],
+ [
+ {"category": "Biscoe", "value": 0.19761120114410635},
+ {"category": "Dream", "value": -0.11264736305259061},
+ {"category": "Torgersen", "value": -0.07065913511418596},
+ ],
+ [],
+ [],
+ [],
+ [],
+ [
+ {"category": ".", "value": 0.0015916894448071784},
+ {"category": "MALE", "value": 0.06869704739750442},
+ {"category": "FEMALE", "value": -0.052521171596813174},
+ {"category": "_null_filler", "value": -0.0034628622681684906},
+ ],
],
- ],
- },
+ },
+ )
+ .sort_values(["principal_component_id", "feature"])
+ .reset_index(drop=True)
)
pd.testing.assert_frame_equal(
result,
@@ -130,13 +134,14 @@ def test_pca_explained_variance_(penguins_pca_model: decomposition.PCA):
"explained_variance": [3.278657, 1.270829, 1.125354],
},
)
- tests.system.utils.assert_pandas_df_equal_ignore_ordering(
+ tests.system.utils.assert_pandas_df_equal(
result,
expected,
check_exact=False,
rtol=0.1,
check_index_type=False,
check_dtype=False,
+ ignore_order=True,
)
@@ -149,11 +154,12 @@ def test_pca_explained_variance_ratio_(penguins_pca_model: decomposition.PCA):
"explained_variance_ratio": [0.469357, 0.181926, 0.1611],
},
)
- tests.system.utils.assert_pandas_df_equal_ignore_ordering(
+ tests.system.utils.assert_pandas_df_equal(
result,
expected,
check_exact=False,
rtol=0.1,
check_index_type=False,
check_dtype=False,
+ ignore_order=True,
)
diff --git a/tests/system/small/ml/test_forecasting.py b/tests/system/small/ml/test_forecasting.py
index cb27dd388c..55079c94cf 100644
--- a/tests/system/small/ml/test_forecasting.py
+++ b/tests/system/small/ml/test_forecasting.py
@@ -36,6 +36,7 @@ def test_model_predict(time_series_arima_plus_model):
expected["forecast_timestamp"] = expected["forecast_timestamp"].astype(
pd.ArrowDtype(pa.timestamp("us", tz="UTC"))
)
+
pd.testing.assert_frame_equal(
predictions,
expected,
diff --git a/tests/system/small/operations/test_datetimes.py b/tests/system/small/operations/test_datetimes.py
index 7dc55b9367..177194c7a8 100644
--- a/tests/system/small/operations/test_datetimes.py
+++ b/tests/system/small/operations/test_datetimes.py
@@ -16,7 +16,7 @@
import pytest
import bigframes.series
-from tests.system.utils import assert_series_equal_ignoring_order
+from tests.system.utils import assert_series_equal
DATETIME_COL_NAMES = [("datetime_col",), ("timestamp_col",)]
@@ -33,7 +33,7 @@ def test_day(scalars_dfs, col_name):
bf_result = bf_series.dt.day.to_pandas()
pd_result = scalars_pandas_df[col_name].dt.day
- assert_series_equal_ignoring_order(
+ assert_series_equal(
pd_result.astype(pd.Int64Dtype()),
bf_result,
)
@@ -51,7 +51,7 @@ def test_date(scalars_dfs, col_name):
bf_result = bf_series.dt.date.to_pandas()
pd_result = scalars_pandas_df[col_name].dt.date
- assert_series_equal_ignoring_order(
+ assert_series_equal(
pd_result,
bf_result,
)
@@ -69,7 +69,7 @@ def test_dayofweek(scalars_dfs, col_name):
bf_result = bf_series.dt.dayofweek.to_pandas()
pd_result = scalars_pandas_df[col_name].dt.dayofweek
- assert_series_equal_ignoring_order(pd_result, bf_result, check_dtype=False)
+ assert_series_equal(pd_result, bf_result, check_dtype=False)
@pytest.mark.parametrize(
@@ -84,7 +84,7 @@ def test_hour(scalars_dfs, col_name):
bf_result = bf_series.dt.hour.to_pandas()
pd_result = scalars_pandas_df[col_name].dt.hour
- assert_series_equal_ignoring_order(
+ assert_series_equal(
pd_result.astype(pd.Int64Dtype()),
bf_result,
)
@@ -102,7 +102,7 @@ def test_minute(scalars_dfs, col_name):
bf_result = bf_series.dt.minute.to_pandas()
pd_result = scalars_pandas_df[col_name].dt.minute
- assert_series_equal_ignoring_order(
+ assert_series_equal(
pd_result.astype(pd.Int64Dtype()),
bf_result,
)
@@ -120,7 +120,7 @@ def test_month(scalars_dfs, col_name):
bf_result = bf_series.dt.month.to_pandas()
pd_result = scalars_pandas_df[col_name].dt.month
- assert_series_equal_ignoring_order(
+ assert_series_equal(
pd_result.astype(pd.Int64Dtype()),
bf_result,
)
@@ -138,7 +138,7 @@ def test_quarter(scalars_dfs, col_name):
bf_result = bf_series.dt.quarter.to_pandas()
pd_result = scalars_pandas_df[col_name].dt.quarter
- assert_series_equal_ignoring_order(
+ assert_series_equal(
pd_result.astype(pd.Int64Dtype()),
bf_result,
)
@@ -156,7 +156,7 @@ def test_second(scalars_dfs, col_name):
bf_result = bf_series.dt.second.to_pandas()
pd_result = scalars_pandas_df[col_name].dt.second
- assert_series_equal_ignoring_order(
+ assert_series_equal(
pd_result.astype(pd.Int64Dtype()),
bf_result,
)
@@ -174,7 +174,7 @@ def test_time(scalars_dfs, col_name):
bf_result = bf_series.dt.time.to_pandas()
pd_result = scalars_pandas_df[col_name].dt.time
- assert_series_equal_ignoring_order(
+ assert_series_equal(
pd_result,
bf_result,
)
@@ -192,7 +192,7 @@ def test_year(scalars_dfs, col_name):
bf_result = bf_series.dt.year.to_pandas()
pd_result = scalars_pandas_df[col_name].dt.year
- assert_series_equal_ignoring_order(
+ assert_series_equal(
pd_result.astype(pd.Int64Dtype()),
bf_result,
)
diff --git a/tests/system/small/operations/test_strings.py b/tests/system/small/operations/test_strings.py
index 241cbd576b..27a35134d4 100644
--- a/tests/system/small/operations/test_strings.py
+++ b/tests/system/small/operations/test_strings.py
@@ -19,7 +19,7 @@
import bigframes.series
-from ...utils import assert_series_equal_ignoring_order
+from ...utils import assert_series_equal
def test_find(scalars_dfs):
@@ -31,7 +31,7 @@ def test_find(scalars_dfs):
# One of type mismatches to be documented. Here, the `bf_result.dtype` is `Int64` but
# the `pd_result.dtype` is `float64`: https://github.com/pandas-dev/pandas/issues/51948
- assert_series_equal_ignoring_order(
+ assert_series_equal(
pd_result.astype(pd.Int64Dtype()),
bf_result,
)
@@ -173,7 +173,7 @@ def test_len(scalars_dfs):
# One of dtype mismatches to be documented. Here, the `bf_result.dtype` is `Int64` but
# the `pd_result.dtype` is `float64`: https://github.com/pandas-dev/pandas/issues/51948
- assert_series_equal_ignoring_order(
+ assert_series_equal(
pd_result.astype(pd.Int64Dtype()),
bf_result,
)
@@ -186,7 +186,7 @@ def test_lower(scalars_dfs):
bf_result = bf_series.str.lower().to_pandas()
pd_result = scalars_pandas_df[col_name].str.lower()
- assert_series_equal_ignoring_order(
+ assert_series_equal(
pd_result,
bf_result,
)
@@ -205,7 +205,7 @@ def test_reverse(scalars_dfs):
else:
pd_result.loc[i] = cell[::-1]
- assert_series_equal_ignoring_order(
+ assert_series_equal(
pd_result,
bf_result,
)
@@ -222,7 +222,7 @@ def test_slice(scalars_dfs, start, stop):
pd_series = scalars_pandas_df[col_name]
pd_result = pd_series.str.slice(start, stop)
- assert_series_equal_ignoring_order(
+ assert_series_equal(
pd_result,
bf_result,
)
@@ -235,7 +235,7 @@ def test_strip(scalars_dfs):
bf_result = bf_series.str.strip().to_pandas()
pd_result = scalars_pandas_df[col_name].str.strip()
- assert_series_equal_ignoring_order(
+ assert_series_equal(
pd_result,
bf_result,
)
@@ -248,7 +248,7 @@ def test_upper(scalars_dfs):
bf_result = bf_series.str.upper().to_pandas()
pd_result = scalars_pandas_df[col_name].str.upper()
- assert_series_equal_ignoring_order(
+ assert_series_equal(
pd_result,
bf_result,
)
@@ -330,7 +330,7 @@ def test_islower(weird_strings, weird_strings_pd):
pd_result = weird_strings_pd.str.islower()
bf_result = weird_strings.str.islower().to_pandas()
- assert_series_equal_ignoring_order(
+ assert_series_equal(
bf_result,
pd_result.astype(pd.BooleanDtype())
# the dtype here is a case of intentional diversion from pandas
@@ -342,7 +342,7 @@ def test_isupper(weird_strings, weird_strings_pd):
pd_result = weird_strings_pd.str.isupper()
bf_result = weird_strings.str.isupper().to_pandas()
- assert_series_equal_ignoring_order(
+ assert_series_equal(
bf_result,
pd_result.astype(pd.BooleanDtype())
# the dtype here is a case of intentional diversion from pandas
@@ -357,7 +357,7 @@ def test_rstrip(scalars_dfs):
bf_result = bf_series.str.rstrip().to_pandas()
pd_result = scalars_pandas_df[col_name].str.rstrip()
- assert_series_equal_ignoring_order(
+ assert_series_equal(
pd_result,
bf_result,
)
@@ -370,7 +370,7 @@ def test_lstrip(scalars_dfs):
bf_result = bf_series.str.lstrip().to_pandas()
pd_result = scalars_pandas_df[col_name].str.lstrip()
- assert_series_equal_ignoring_order(
+ assert_series_equal(
pd_result,
bf_result,
)
@@ -384,7 +384,7 @@ def test_repeat(scalars_dfs, repeats):
bf_result = bf_series.str.repeat(repeats).to_pandas()
pd_result = scalars_pandas_df[col_name].str.repeat(repeats)
- assert_series_equal_ignoring_order(
+ assert_series_equal(
pd_result,
bf_result,
)
@@ -397,7 +397,7 @@ def test_capitalize(scalars_dfs):
bf_result = bf_series.str.capitalize().to_pandas()
pd_result = scalars_pandas_df[col_name].str.capitalize()
- assert_series_equal_ignoring_order(
+ assert_series_equal(
pd_result,
bf_result,
)
@@ -415,7 +415,7 @@ def test_cat_with_series(scalars_dfs):
pd_right = scalars_pandas_df[col_name]
pd_result = pd_left.str.cat(others=pd_right)
- assert_series_equal_ignoring_order(
+ assert_series_equal(
pd_result,
bf_result,
)
@@ -429,7 +429,7 @@ def test_str_match(scalars_dfs):
bf_result = bf_series.str.match(pattern).to_pandas()
pd_result = scalars_pandas_df[col_name].str.match(pattern)
- assert_series_equal_ignoring_order(
+ assert_series_equal(
pd_result,
bf_result,
)
@@ -443,7 +443,7 @@ def test_str_fullmatch(scalars_dfs):
bf_result = bf_series.str.fullmatch(pattern).to_pandas()
pd_result = scalars_pandas_df[col_name].str.fullmatch(pattern)
- assert_series_equal_ignoring_order(
+ assert_series_equal(
pd_result,
bf_result,
)
@@ -456,7 +456,7 @@ def test_str_get(scalars_dfs):
bf_result = bf_series.str.get(8).to_pandas()
pd_result = scalars_pandas_df[col_name].str.get(8)
- assert_series_equal_ignoring_order(
+ assert_series_equal(
pd_result,
bf_result,
)
@@ -469,7 +469,7 @@ def test_str_pad(scalars_dfs):
bf_result = bf_series.str.pad(8, side="both", fillchar="%").to_pandas()
pd_result = scalars_pandas_df[col_name].str.pad(8, side="both", fillchar="%")
- assert_series_equal_ignoring_order(
+ assert_series_equal(
pd_result,
bf_result,
)
@@ -492,7 +492,7 @@ def test_str_ljust(scalars_dfs):
bf_result = bf_series.str.ljust(7, fillchar="%").to_pandas()
pd_result = scalars_pandas_df[col_name].str.ljust(7, fillchar="%")
- assert_series_equal_ignoring_order(
+ assert_series_equal(
pd_result,
bf_result,
)
@@ -505,7 +505,7 @@ def test_str_rjust(scalars_dfs):
bf_result = bf_series.str.rjust(9, fillchar="%").to_pandas()
pd_result = scalars_pandas_df[col_name].str.rjust(9, fillchar="%")
- assert_series_equal_ignoring_order(
+ assert_series_equal(
pd_result,
bf_result,
)
diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py
index bd5930e508..e522878229 100644
--- a/tests/system/small/test_dataframe.py
+++ b/tests/system/small/test_dataframe.py
@@ -28,10 +28,7 @@
import bigframes._config.display_options as display_options
import bigframes.dataframe as dataframe
import bigframes.series as series
-from tests.system.utils import (
- assert_pandas_df_equal_ignore_ordering,
- assert_series_equal_ignoring_order,
-)
+from tests.system.utils import assert_pandas_df_equal, assert_series_equal
def test_df_construct_copy(scalars_dfs):
@@ -98,7 +95,7 @@ def test_get_column(scalars_dfs):
series = scalars_df[col_name]
bf_result = series.to_pandas()
pd_result = scalars_pandas_df[col_name]
- assert_series_equal_ignoring_order(bf_result, pd_result)
+ assert_series_equal(bf_result, pd_result)
def test_get_column_nonstring(scalars_dfs):
@@ -106,7 +103,7 @@ def test_get_column_nonstring(scalars_dfs):
series = scalars_df.rename(columns={"int64_col": 123.1})[123.1]
bf_result = series.to_pandas()
pd_result = scalars_pandas_df.rename(columns={"int64_col": 123.1})[123.1]
- assert_series_equal_ignoring_order(bf_result, pd_result)
+ assert_series_equal(bf_result, pd_result)
def test_hasattr(scalars_dfs):
@@ -116,15 +113,24 @@ def test_hasattr(scalars_dfs):
assert not hasattr(scalars_df, "not_exist")
-def test_head_with_custom_column_labels(scalars_df_index, scalars_pandas_df_index):
+@pytest.mark.parametrize(
+ ("ordered"),
+ [
+ (True),
+ (False),
+ ],
+)
+def test_head_with_custom_column_labels(
+ scalars_df_index, scalars_pandas_df_index, ordered
+):
rename_mapping = {
"int64_col": "Integer Column",
"string_col": "言語列",
}
bf_df = scalars_df_index.rename(columns=rename_mapping).head(3)
- bf_result = bf_df.to_pandas()
+ bf_result = bf_df.to_pandas(ordered=ordered)
pd_result = scalars_pandas_df_index.rename(columns=rename_mapping).head(3)
- pandas.testing.assert_frame_equal(bf_result, pd_result)
+ assert_pandas_df_equal(bf_result, pd_result, ignore_order=not ordered)
def test_tail_with_custom_column_labels(scalars_df_index, scalars_pandas_df_index):
@@ -183,7 +189,7 @@ def test_get_column_by_attr(scalars_dfs):
series = scalars_df.int64_col
bf_result = series.to_pandas()
pd_result = scalars_pandas_df.int64_col
- assert_series_equal_ignoring_order(bf_result, pd_result)
+ assert_series_equal(bf_result, pd_result)
def test_get_columns(scalars_dfs):
@@ -246,7 +252,7 @@ def test_drop_with_custom_column_labels(scalars_dfs):
pd_result = scalars_pandas_df.rename(columns=rename_mapping).drop(
columns=dropped_columns
)
- assert_pandas_df_equal_ignore_ordering(bf_result, pd_result)
+ assert_pandas_df_equal(bf_result, pd_result)
def test_drop_index(scalars_dfs):
@@ -420,7 +426,7 @@ def test_filter_df(scalars_dfs):
pd_bool_series = scalars_pandas_df["bool_col"]
pd_result = scalars_pandas_df[pd_bool_series]
- assert_pandas_df_equal_ignore_ordering(bf_result, pd_result)
+ assert_pandas_df_equal(bf_result, pd_result)
def test_assign_new_column(scalars_dfs):
@@ -433,7 +439,7 @@ def test_assign_new_column(scalars_dfs):
# Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes.
pd_result["new_col"] = pd_result["new_col"].astype("Int64")
- assert_pandas_df_equal_ignore_ordering(bf_result, pd_result)
+ assert_pandas_df_equal(bf_result, pd_result)
def test_assign_new_column_w_loc(scalars_dfs):
@@ -564,17 +570,52 @@ def test_assign_existing_column(scalars_dfs):
# Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes.
pd_result["int64_col"] = pd_result["int64_col"].astype("Int64")
- assert_pandas_df_equal_ignore_ordering(bf_result, pd_result)
+ assert_pandas_df_equal(bf_result, pd_result)
+
+
+def test_assign_listlike_to_empty_df(session):
+ empty_df = dataframe.DataFrame(session=session)
+ empty_pandas_df = pd.DataFrame()
+ bf_result = empty_df.assign(new_col=[1, 2, 3])
+ pd_result = empty_pandas_df.assign(new_col=[1, 2, 3])
+
+ pd_result["new_col"] = pd_result["new_col"].astype("Int64")
+ pd_result.index = pd_result.index.astype("Int64")
+ assert_pandas_df_equal(bf_result.to_pandas(), pd_result)
-def test_assign_series(scalars_dfs):
+
+def test_assign_to_empty_df_multiindex_error(session):
+ empty_df = dataframe.DataFrame(session=session)
+ empty_pandas_df = pd.DataFrame()
+ empty_df["empty_col_1"] = []
+ empty_df["empty_col_2"] = []
+ empty_pandas_df["empty_col_1"] = []
+ empty_pandas_df["empty_col_2"] = []
+ empty_df = empty_df.set_index(["empty_col_1", "empty_col_2"])
+ empty_pandas_df = empty_pandas_df.set_index(["empty_col_1", "empty_col_2"])
+
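+ # Assigning a list whose length does not match the empty MultiIndex raises ValueError in both libraries.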
+ with pytest.raises(ValueError):
+ empty_df.assign(new_col=[1, 2, 3, 4, 5, 6, 7, 8, 9])
+ with pytest.raises(ValueError):
+ empty_pandas_df.assign(new_col=[1, 2, 3, 4, 5, 6, 7, 8, 9])
+
+
+@pytest.mark.parametrize(
+ ("ordered"),
+ [
+ (True),
+ (False),
+ ],
+)
+def test_assign_series(scalars_dfs, ordered):
scalars_df, scalars_pandas_df = scalars_dfs
column_name = "int64_col"
df = scalars_df.assign(new_col=scalars_df[column_name])
- bf_result = df.to_pandas()
+ bf_result = df.to_pandas(ordered=ordered)
pd_result = scalars_pandas_df.assign(new_col=scalars_pandas_df[column_name])
- assert_pandas_df_equal_ignore_ordering(bf_result, pd_result)
+ assert_pandas_df_equal(bf_result, pd_result, ignore_order=not ordered)
def test_assign_series_overwrite(scalars_dfs):
@@ -586,7 +627,7 @@ def test_assign_series_overwrite(scalars_dfs):
**{column_name: scalars_pandas_df[column_name] + 3}
)
- assert_pandas_df_equal_ignore_ordering(bf_result, pd_result)
+ assert_pandas_df_equal(bf_result, pd_result)
def test_assign_sequential(scalars_dfs):
@@ -601,7 +642,7 @@ def test_assign_sequential(scalars_dfs):
pd_result["new_col"] = pd_result["new_col"].astype("Int64")
pd_result["new_col2"] = pd_result["new_col2"].astype("Int64")
- assert_pandas_df_equal_ignore_ordering(bf_result, pd_result)
+ assert_pandas_df_equal(bf_result, pd_result)
# Require an index so that the self-join is consistent each time.
@@ -635,7 +676,7 @@ def test_assign_different_df(
new_col=scalars_pandas_df_index[column_name]
)
- assert_pandas_df_equal_ignore_ordering(bf_result, pd_result)
+ assert_pandas_df_equal(bf_result, pd_result)
def test_assign_different_df_w_loc(
@@ -686,7 +727,7 @@ def test_assign_callable_lambda(scalars_dfs):
# Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes.
pd_result["new_col"] = pd_result["new_col"].astype("Int64")
- assert_pandas_df_equal_ignore_ordering(bf_result, pd_result)
+ assert_pandas_df_equal(bf_result, pd_result)
@pytest.mark.parametrize(
@@ -886,6 +927,26 @@ def test_df_isin_dict(scalars_dfs):
pandas.testing.assert_frame_equal(bf_result, pd_result.astype("boolean"))
+def test_df_cross_merge(scalars_dfs):
+ scalars_df, scalars_pandas_df = scalars_dfs
+ left_columns = ["int64_col", "float64_col", "rowindex_2"]
+ right_columns = ["int64_col", "bool_col", "string_col", "rowindex_2"]
+
+ left = scalars_df[left_columns]
+ # Offset the rows so the right frame's rowindex_2 values differ from the left's.
+ right = scalars_df[right_columns].assign(rowindex_2=scalars_df["rowindex_2"] + 2)
+
+ bf_result = left.merge(right, "cross").to_pandas()
+
+ pd_result = scalars_pandas_df[left_columns].merge(
+ scalars_pandas_df[right_columns].assign(
+ rowindex_2=scalars_pandas_df["rowindex_2"] + 2
+ ),
+ "cross",
+ )
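+ # Index dtypes differ between BigQuery DataFrames and pandas here, so skip the index dtype check.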
+ pd.testing.assert_frame_equal(bf_result, pd_result, check_index_type=False)
+
+
@pytest.mark.parametrize(
("merge_how",),
[
@@ -917,7 +978,9 @@ def test_df_merge(scalars_dfs, merge_how):
sort=True,
)
- assert_pandas_df_equal_ignore_ordering(bf_result, pd_result)
+ assert_pandas_df_equal(
+ bf_result, pd_result, ignore_order=True, check_index_type=False
+ )
@pytest.mark.parametrize(
@@ -950,7 +1013,9 @@ def test_df_merge_multi_key(scalars_dfs, left_on, right_on):
sort=True,
)
- assert_pandas_df_equal_ignore_ordering(bf_result, pd_result)
+ assert_pandas_df_equal(
+ bf_result, pd_result, ignore_order=True, check_index_type=False
+ )
@pytest.mark.parametrize(
@@ -980,7 +1045,9 @@ def test_merge_custom_col_name(scalars_dfs, merge_how):
pandas_right_df = scalars_pandas_df[right_columns]
pd_result = pandas_left_df.merge(pandas_right_df, merge_how, on, sort=True)
- assert_pandas_df_equal_ignore_ordering(bf_result, pd_result)
+ assert_pandas_df_equal(
+ bf_result, pd_result, ignore_order=True, check_index_type=False
+ )
@pytest.mark.parametrize(
@@ -1013,7 +1080,9 @@ def test_merge_left_on_right_on(scalars_dfs, merge_how):
sort=True,
)
- assert_pandas_df_equal_ignore_ordering(bf_result, pd_result)
+ assert_pandas_df_equal(
+ bf_result, pd_result, ignore_order=True, check_index_type=False
+ )
def test_get_dtypes(scalars_df_default_index):
@@ -1207,6 +1276,28 @@ def test_reset_index_with_unnamed_index(
pandas.testing.assert_frame_equal(bf_result, pd_result)
+def test_reset_index_with_unnamed_multiindex(
+ scalars_df_index,
+ scalars_pandas_df_index,
+):
+ bf_df = dataframe.DataFrame(
+ ([1, 2, 3], [2, 5, 7]),
+ index=pd.MultiIndex.from_tuples([("a", "aa"), ("a", "aa")]),
+ )
+ pd_df = pd.DataFrame(
+ ([1, 2, 3], [2, 5, 7]),
+ index=pd.MultiIndex.from_tuples([("a", "aa"), ("a", "aa")]),
+ )
+
+ bf_df = bf_df.reset_index()
+ pd_df = pd_df.reset_index()
+
+ assert pd_df.columns[0] == "level_0"
+ assert bf_df.columns[0] == "level_0"
+ assert pd_df.columns[1] == "level_1"
+ assert bf_df.columns[1] == "level_1"
+
+
def test_reset_index_with_unnamed_index_and_index_column(
scalars_df_index,
scalars_pandas_df_index,
@@ -1305,7 +1396,7 @@ def test_df_abs(scalars_dfs):
bf_result = scalars_df[columns].abs().to_pandas()
pd_result = scalars_pandas_df[columns].abs()
- assert_pandas_df_equal_ignore_ordering(bf_result, pd_result)
+ assert_pandas_df_equal(bf_result, pd_result)
def test_df_isnull(scalars_dfs):
@@ -1322,7 +1413,7 @@ def test_df_isnull(scalars_dfs):
pd_result["string_col"] = pd_result["string_col"].astype(pd.BooleanDtype())
pd_result["bool_col"] = pd_result["bool_col"].astype(pd.BooleanDtype())
- assert_pandas_df_equal_ignore_ordering(bf_result, pd_result)
+ assert_pandas_df_equal(bf_result, pd_result)
def test_df_notnull(scalars_dfs):
@@ -1339,7 +1430,7 @@ def test_df_notnull(scalars_dfs):
pd_result["string_col"] = pd_result["string_col"].astype(pd.BooleanDtype())
pd_result["bool_col"] = pd_result["bool_col"].astype(pd.BooleanDtype())
- assert_pandas_df_equal_ignore_ordering(bf_result, pd_result)
+ assert_pandas_df_equal(bf_result, pd_result)
@pytest.mark.parametrize(
@@ -1559,7 +1650,7 @@ def test_scalar_binop(scalars_dfs, op, other_scalar, reverse_operands):
bf_result = maybe_reversed_op(scalars_df[columns], other_scalar).to_pandas()
pd_result = maybe_reversed_op(scalars_pandas_df[columns], other_scalar)
- assert_pandas_df_equal_ignore_ordering(bf_result, pd_result)
+ assert_pandas_df_equal(bf_result, pd_result)
@pytest.mark.parametrize(("other_scalar"), [1, -2])
@@ -1571,7 +1662,7 @@ def test_mod(scalars_dfs, other_scalar):
bf_result = (scalars_df[["int64_col", "int64_too"]] % other_scalar).to_pandas()
pd_result = scalars_pandas_df[["int64_col", "int64_too"]] % other_scalar
- assert_pandas_df_equal_ignore_ordering(bf_result, pd_result)
+ assert_pandas_df_equal(bf_result, pd_result)
def test_scalar_binop_str_exception(scalars_dfs):
@@ -1627,7 +1718,7 @@ def test_series_binop_axis_index(
bf_result = op(scalars_df[df_columns], scalars_df[series_column]).to_pandas()
pd_result = op(scalars_pandas_df[df_columns], scalars_pandas_df[series_column])
- assert_pandas_df_equal_ignore_ordering(bf_result, pd_result)
+ assert_pandas_df_equal(bf_result, pd_result)
@pytest.mark.parametrize(
@@ -1673,8 +1764,15 @@ def test_binop_df_df_binary_op(
# A different table will only work for an explicit index, since default index orders are arbitrary.
+@pytest.mark.parametrize(
+ ("ordered"),
+ [
+ (True),
+ (False),
+ ],
+)
def test_series_binop_add_different_table(
- scalars_df_index, scalars_pandas_df_index, scalars_df_2_index
+ scalars_df_index, scalars_pandas_df_index, scalars_df_2_index, ordered
):
df_columns = ["int64_col", "float64_col"]
series_column = "int64_too"
@@ -1682,25 +1780,20 @@ def test_series_binop_add_different_table(
bf_result = (
scalars_df_index[df_columns]
.add(scalars_df_2_index[series_column], axis="index")
- .to_pandas()
+ .to_pandas(ordered=ordered)
)
pd_result = scalars_pandas_df_index[df_columns].add(
scalars_pandas_df_index[series_column], axis="index"
)
- assert_pandas_df_equal_ignore_ordering(bf_result, pd_result)
+ assert_pandas_df_equal(bf_result, pd_result, ignore_order=not ordered)
# TODO(garrettwu): Test series binop with different index
all_joins = pytest.mark.parametrize(
("how",),
- (
- ("outer",),
- ("left",),
- ("right",),
- ("inner",),
- ),
+ (("outer",), ("left",), ("right",), ("inner",), ("cross",)),
)
@@ -1714,7 +1807,7 @@ def test_join_same_table(scalars_dfs, how):
pd_df_a = pd_df.set_index("int64_too")[["string_col", "int64_col"]]
pd_df_b = pd_df.set_index("int64_too")[["float64_col"]]
pd_result = pd_df_a.join(pd_df_b, how=how)
- assert_pandas_df_equal_ignore_ordering(bf_result, pd_result)
+ assert_pandas_df_equal(bf_result, pd_result, ignore_order=True)
@all_joins
@@ -1727,7 +1820,7 @@ def test_join_different_table(
pd_df_a = scalars_pandas_df_index[["string_col", "int64_col"]]
pd_df_b = scalars_pandas_df_index.dropna()[["float64_col"]]
pd_result = pd_df_a.join(pd_df_b, how=how)
- assert_pandas_df_equal_ignore_ordering(bf_result, pd_result)
+ assert_pandas_df_equal(bf_result, pd_result, ignore_order=True)
def test_join_duplicate_columns_raises_not_implemented(scalars_dfs):
@@ -1745,13 +1838,18 @@ def test_join_param_on(scalars_dfs, how):
bf_df_a = bf_df[["string_col", "int64_col", "rowindex_2"]]
bf_df_a = bf_df_a.assign(rowindex_2=bf_df_a["rowindex_2"] + 2)
bf_df_b = bf_df[["float64_col"]]
- bf_result = bf_df_a.join(bf_df_b, on="rowindex_2", how=how).to_pandas()
- pd_df_a = pd_df[["string_col", "int64_col", "rowindex_2"]]
- pd_df_a = pd_df_a.assign(rowindex_2=pd_df_a["rowindex_2"] + 2)
- pd_df_b = pd_df[["float64_col"]]
- pd_result = pd_df_a.join(pd_df_b, on="rowindex_2", how=how)
- assert_pandas_df_equal_ignore_ordering(bf_result, pd_result)
+ if how == "cross":
+ with pytest.raises(ValueError):
+ bf_df_a.join(bf_df_b, on="rowindex_2", how=how)
+ else:
+ bf_result = bf_df_a.join(bf_df_b, on="rowindex_2", how=how).to_pandas()
+
+ pd_df_a = pd_df[["string_col", "int64_col", "rowindex_2"]]
+ pd_df_a = pd_df_a.assign(rowindex_2=pd_df_a["rowindex_2"] + 2)
+ pd_df_b = pd_df[["float64_col"]]
+ pd_result = pd_df_a.join(pd_df_b, on="rowindex_2", how=how)
+ assert_pandas_df_equal(bf_result, pd_result, ignore_order=True)
@pytest.mark.parametrize(
@@ -1967,7 +2065,14 @@ def test_df_describe(scalars_dfs):
).all()
-def test_df_stack(scalars_dfs):
+@pytest.mark.parametrize(
+ ("ordered"),
+ [
+ (True),
+ (False),
+ ],
+)
+def test_df_stack(scalars_dfs, ordered):
if pandas.__version__.startswith("1.") or pandas.__version__.startswith("2.0"):
pytest.skip("pandas <2.1 uses different stack implementation")
scalars_df, scalars_pandas_df = scalars_dfs
@@ -1977,11 +2082,13 @@ def test_df_stack(scalars_dfs):
# Can only stack identically-typed columns
columns = ["int64_col", "int64_too", "rowindex_2"]
- bf_result = scalars_df[columns].stack().to_pandas()
+ bf_result = scalars_df[columns].stack().to_pandas(ordered=ordered)
pd_result = scalars_pandas_df[columns].stack(future_stack=True)
# Pandas produces NaN, where bq dataframes produces pd.NA
- pd.testing.assert_series_equal(bf_result, pd_result, check_dtype=False)
+ assert_series_equal(
+ bf_result, pd_result, check_dtype=False, ignore_order=not ordered
+ )
def test_df_melt_default(scalars_dfs):
@@ -2027,7 +2134,14 @@ def test_df_melt_parameterized(scalars_dfs):
)
-def test_df_unstack(scalars_dfs):
+@pytest.mark.parametrize(
+ ("ordered"),
+ [
+ (True),
+ (False),
+ ],
+)
+def test_df_unstack(scalars_dfs, ordered):
scalars_df, scalars_pandas_df = scalars_dfs
# To match bigquery dataframes
scalars_pandas_df = scalars_pandas_df.copy()
@@ -2040,11 +2154,13 @@ def test_df_unstack(scalars_dfs):
]
# unstack on mono-index produces series
- bf_result = scalars_df[columns].unstack().to_pandas()
+ bf_result = scalars_df[columns].unstack().to_pandas(ordered=ordered)
pd_result = scalars_pandas_df[columns].unstack()
# Pandas produces NaN, where bq dataframes produces pd.NA
- pd.testing.assert_series_equal(bf_result, pd_result, check_dtype=False)
+ assert_series_equal(
+ bf_result, pd_result, check_dtype=False, ignore_order=not ordered
+ )
@pytest.mark.parametrize(
@@ -2189,14 +2305,18 @@ def test_iloc_slice_zero_step(scalars_df_index):
scalars_df_index.iloc[0:0:0]
-def test_iloc_slice_nested(scalars_df_index, scalars_pandas_df_index):
- bf_result = scalars_df_index.iloc[1:].iloc[1:].to_pandas()
+@pytest.mark.parametrize(
+ ("ordered"),
+ [
+ (True),
+ (False),
+ ],
+)
+def test_iloc_slice_nested(scalars_df_index, scalars_pandas_df_index, ordered):
+ bf_result = scalars_df_index.iloc[1:].iloc[1:].to_pandas(ordered=ordered)
pd_result = scalars_pandas_df_index.iloc[1:].iloc[1:]
- pd.testing.assert_frame_equal(
- bf_result,
- pd_result,
- )
+ assert_pandas_df_equal(bf_result, pd_result, ignore_order=not ordered)
@pytest.mark.parametrize(
@@ -2387,6 +2507,13 @@ def test_loc_setitem_bool_series_scalar_type_error(scalars_dfs):
pd_df.loc[pd_df["int64_too"] == 1, "string_col"] = 99
+@pytest.mark.parametrize(
+ ("ordered"),
+ [
+ (True),
+ (False),
+ ],
+)
@pytest.mark.parametrize(
("op"),
[
@@ -2401,16 +2528,18 @@ def test_loc_setitem_bool_series_scalar_type_error(scalars_dfs):
],
ids=["sum", "mean", "min", "max", "std", "var", "count", "nunique"],
)
-def test_dataframe_aggregates(scalars_df_index, scalars_pandas_df_index, op):
+def test_dataframe_aggregates(scalars_df_index, scalars_pandas_df_index, op, ordered):
col_names = ["int64_too", "float64_col", "string_col", "int64_col", "bool_col"]
bf_series = op(scalars_df_index[col_names])
pd_series = op(scalars_pandas_df_index[col_names])
- bf_result = bf_series.to_pandas()
+ bf_result = bf_series.to_pandas(ordered=ordered)
# Pandas may produce narrower numeric types, but bigframes always produces Float64
pd_series = pd_series.astype("Float64")
# Pandas has object index type
- pd.testing.assert_series_equal(pd_series, bf_result, check_index_type=False)
+ assert_series_equal(
+ pd_series, bf_result, check_index_type=False, ignore_order=not ordered
+ )
@pytest.mark.parametrize(
@@ -2501,16 +2630,25 @@ def test_df_skew_too_few_values(scalars_dfs):
pd.testing.assert_series_equal(pd_result, bf_result, check_index_type=False)
-def test_df_skew(scalars_dfs):
+@pytest.mark.parametrize(
+ ("ordered"),
+ [
+ (True),
+ (False),
+ ],
+)
+def test_df_skew(scalars_dfs, ordered):
columns = ["float64_col", "int64_col"]
scalars_df, scalars_pandas_df = scalars_dfs
- bf_result = scalars_df[columns].skew().to_pandas()
+ bf_result = scalars_df[columns].skew().to_pandas(ordered=ordered)
pd_result = scalars_pandas_df[columns].skew()
# Pandas may produce narrower numeric types, but bigframes always produces Float64
pd_result = pd_result.astype("Float64")
- pd.testing.assert_series_equal(pd_result, bf_result, check_index_type=False)
+ assert_series_equal(
+ pd_result, bf_result, check_index_type=False, ignore_order=not ordered
+ )
def test_df_kurt_too_few_values(scalars_dfs):
@@ -2661,9 +2799,10 @@ def test_df_rows_filter_items(scalars_df_index, scalars_pandas_df_index):
# Pandas uses int64 instead of Int64 (nullable) dtype.
pd_result.index = pd_result.index.astype(pd.Int64Dtype())
# Ignore ordering as pandas orders differently depending on version
- assert_pandas_df_equal_ignore_ordering(
+ assert_pandas_df_equal(
bf_result,
pd_result,
+ ignore_order=True,
check_names=False,
)
diff --git a/tests/system/small/test_dataframe_io.py b/tests/system/small/test_dataframe_io.py
index 8f5d706f62..fb9fb7bb89 100644
--- a/tests/system/small/test_dataframe_io.py
+++ b/tests/system/small/test_dataframe_io.py
@@ -19,10 +19,7 @@
import pyarrow as pa
import pytest
-from tests.system.utils import (
- assert_pandas_df_equal_ignore_ordering,
- convert_pandas_dtypes,
-)
+from tests.system.utils import assert_pandas_df_equal, convert_pandas_dtypes
try:
import pandas_gbq # type: ignore
@@ -83,6 +80,24 @@ def test_to_pandas_array_struct_correct_result(session):
)
+def test_load_json(session):
+ df = session.read_gbq(
+ """SELECT
+ JSON_OBJECT('foo', 10, 'bar', TRUE) AS json_column
+ """
+ )
+
+ result = df.to_pandas()
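+ # BigQuery JSON values round-trip to pandas as their serialized JSON string form.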
+ expected = pd.DataFrame(
+ {
+ "json_column": ['{"bar":true,"foo":10}'],
+ }
+ )
+ expected.index = expected.index.astype("Int64")
+ pd.testing.assert_series_equal(result.dtypes, expected.dtypes)
+ pd.testing.assert_series_equal(result["json_column"], expected["json_column"])
+
+
def test_to_pandas_batches_w_correct_dtypes(scalars_df_default_index):
"""Verify to_pandas_batches() APIs returns the expected dtypes."""
expected = scalars_df_default_index.dtypes
@@ -380,7 +395,7 @@ def test_to_sql_query_unnamed_index_included(
pd_df = scalars_pandas_df_default_index.reset_index(drop=True)
roundtrip = session.read_gbq(sql, index_col=idx_ids)
roundtrip.index.names = [None]
- assert_pandas_df_equal_ignore_ordering(roundtrip.to_pandas(), pd_df)
+ assert_pandas_df_equal(roundtrip.to_pandas(), pd_df, check_index_type=False)
def test_to_sql_query_named_index_included(
@@ -397,7 +412,7 @@ def test_to_sql_query_named_index_included(
pd_df = scalars_pandas_df_default_index.set_index("rowindex_2", drop=True)
roundtrip = session.read_gbq(sql, index_col=idx_ids)
- assert_pandas_df_equal_ignore_ordering(roundtrip.to_pandas(), pd_df)
+ assert_pandas_df_equal(roundtrip.to_pandas(), pd_df)
def test_to_sql_query_unnamed_index_excluded(
@@ -412,7 +427,9 @@ def test_to_sql_query_unnamed_index_excluded(
pd_df = scalars_pandas_df_default_index.reset_index(drop=True)
roundtrip = session.read_gbq(sql)
- assert_pandas_df_equal_ignore_ordering(roundtrip.to_pandas(), pd_df)
+ assert_pandas_df_equal(
+ roundtrip.to_pandas(), pd_df, check_index_type=False, ignore_order=True
+ )
def test_to_sql_query_named_index_excluded(
@@ -429,4 +446,6 @@ def test_to_sql_query_named_index_excluded(
"rowindex_2", drop=True
).reset_index(drop=True)
roundtrip = session.read_gbq(sql)
- assert_pandas_df_equal_ignore_ordering(roundtrip.to_pandas(), pd_df)
+ assert_pandas_df_equal(
+ roundtrip.to_pandas(), pd_df, check_index_type=False, ignore_order=True
+ )
diff --git a/tests/system/small/test_groupby.py b/tests/system/small/test_groupby.py
index 05154f7ab7..a24713c2b3 100644
--- a/tests/system/small/test_groupby.py
+++ b/tests/system/small/test_groupby.py
@@ -16,6 +16,7 @@
import pytest
import bigframes.pandas as bpd
+from tests.system.utils import assert_pandas_df_equal
@pytest.mark.parametrize(
@@ -88,16 +89,23 @@ def test_dataframe_groupby_aggregate(
pd.testing.assert_frame_equal(pd_result, bf_result_computed, check_dtype=False)
-def test_dataframe_groupby_agg_string(scalars_df_index, scalars_pandas_df_index):
+@pytest.mark.parametrize(
+ ("ordered"),
+ [
+ (True),
+ (False),
+ ],
+)
+def test_dataframe_groupby_agg_string(
+ scalars_df_index, scalars_pandas_df_index, ordered
+):
col_names = ["int64_too", "float64_col", "int64_col", "bool_col", "string_col"]
bf_result = scalars_df_index[col_names].groupby("string_col").agg("count")
pd_result = scalars_pandas_df_index[col_names].groupby("string_col").agg("count")
- bf_result_computed = bf_result.to_pandas()
+ bf_result_computed = bf_result.to_pandas(ordered=ordered)
- pd.testing.assert_frame_equal(
- pd_result,
- bf_result_computed,
- check_dtype=False,
+ assert_pandas_df_equal(
+ pd_result, bf_result_computed, check_dtype=False, ignore_order=not ordered
)
@@ -270,13 +278,22 @@ def test_dataframe_groupby_kurt(scalars_df_index, scalars_pandas_df_index):
pd.testing.assert_frame_equal(pd_result, bf_result, check_dtype=False)
-def test_dataframe_groupby_diff(scalars_df_index, scalars_pandas_df_index):
+@pytest.mark.parametrize(
+ ("ordered"),
+ [
+ (True),
+ (False),
+ ],
+)
+def test_dataframe_groupby_diff(scalars_df_index, scalars_pandas_df_index, ordered):
col_names = ["float64_col", "int64_col", "string_col"]
bf_result = scalars_df_index[col_names].groupby("string_col").diff(-1)
pd_result = scalars_pandas_df_index[col_names].groupby("string_col").diff(-1)
- bf_result_computed = bf_result.to_pandas()
+ bf_result_computed = bf_result.to_pandas(ordered=ordered)
- pd.testing.assert_frame_equal(pd_result, bf_result_computed, check_dtype=False)
+ assert_pandas_df_equal(
+ pd_result, bf_result_computed, check_dtype=False, ignore_order=not ordered
+ )
def test_dataframe_groupby_getitem(
diff --git a/tests/system/small/test_multiindex.py b/tests/system/small/test_multiindex.py
index bc35f633fd..e7e93849c6 100644
--- a/tests/system/small/test_multiindex.py
+++ b/tests/system/small/test_multiindex.py
@@ -16,7 +16,7 @@
import pytest
import bigframes.pandas as bpd
-from tests.system.utils import assert_pandas_df_equal_ignore_ordering
+from tests.system.utils import assert_pandas_df_equal
# Row Multi-index tests
@@ -429,7 +429,7 @@ def test_multi_index_dataframe_join(scalars_dfs, how):
(["bool_col", "rowindex_2"])
)[["float64_col"]]
pd_result = pd_df_a.join(pd_df_b, how=how)
- assert_pandas_df_equal_ignore_ordering(bf_result, pd_result)
+ assert_pandas_df_equal(bf_result, pd_result, ignore_order=True)
@all_joins
@@ -450,7 +450,7 @@ def test_multi_index_dataframe_join_on(scalars_dfs, how):
pd_df_a = pd_df_a.assign(rowindex_2=pd_df_a["rowindex_2"] + 2)
pd_df_b = pd_df[["float64_col"]]
pd_result = pd_df_a.join(pd_df_b, on="rowindex_2", how=how)
- assert_pandas_df_equal_ignore_ordering(bf_result, pd_result)
+ assert_pandas_df_equal(bf_result, pd_result, ignore_order=True)
@pytest.mark.parametrize(
diff --git a/tests/system/small/test_pandas.py b/tests/system/small/test_pandas.py
index 0292ebd206..a1079288cf 100644
--- a/tests/system/small/test_pandas.py
+++ b/tests/system/small/test_pandas.py
@@ -16,16 +16,23 @@
import pytest
import bigframes.pandas as bpd
-from tests.system.utils import assert_pandas_df_equal_ignore_ordering
+from tests.system.utils import assert_pandas_df_equal
-def test_concat_dataframe(scalars_dfs):
+@pytest.mark.parametrize(
+ ("ordered"),
+ [
+ (True),
+ (False),
+ ],
+)
+def test_concat_dataframe(scalars_dfs, ordered):
scalars_df, scalars_pandas_df = scalars_dfs
bf_result = bpd.concat(11 * [scalars_df])
- bf_result = bf_result.to_pandas()
+ bf_result = bf_result.to_pandas(ordered=ordered)
pd_result = pd.concat(11 * [scalars_pandas_df])
- pd.testing.assert_frame_equal(bf_result, pd_result)
+ assert_pandas_df_equal(bf_result, pd_result, ignore_order=not ordered)
def test_concat_series(scalars_dfs):
@@ -252,7 +259,7 @@ def test_merge(scalars_dfs, merge_how):
sort=True,
)
- assert_pandas_df_equal_ignore_ordering(bf_result, pd_result)
+ assert_pandas_df_equal(bf_result, pd_result, ignore_order=True)
@pytest.mark.parametrize(
@@ -286,7 +293,28 @@ def test_merge_left_on_right_on(scalars_dfs, merge_how):
sort=True,
)
- assert_pandas_df_equal_ignore_ordering(bf_result, pd_result)
+ assert_pandas_df_equal(bf_result, pd_result, ignore_order=True)
+
+
+def test_pd_merge_cross(scalars_dfs):
+ scalars_df, scalars_pandas_df = scalars_dfs
+ left_columns = ["int64_col", "float64_col", "int64_too"]
+ right_columns = ["int64_col", "bool_col", "string_col", "rowindex_2"]
+
+ left = scalars_df[left_columns]
+ right = scalars_df[right_columns]
+
+ df = bpd.merge(left, right, "cross", sort=True)
+ bf_result = df.to_pandas()
+
+ pd_result = pd.merge(
+ scalars_pandas_df[left_columns],
+ scalars_pandas_df[right_columns],
+ "cross",
+ sort=True,
+ )
+
+ pd.testing.assert_frame_equal(bf_result, pd_result, check_index_type=False)
@pytest.mark.parametrize(
@@ -320,7 +348,7 @@ def test_merge_series(scalars_dfs, merge_how):
sort=True,
)
- assert_pandas_df_equal_ignore_ordering(bf_result, pd_result)
+ assert_pandas_df_equal(bf_result, pd_result, ignore_order=True)
def test_cut(scalars_dfs):
diff --git a/tests/system/small/test_pandas_options.py b/tests/system/small/test_pandas_options.py
index ca67710d4e..c410d70fe7 100644
--- a/tests/system/small/test_pandas_options.py
+++ b/tests/system/small/test_pandas_options.py
@@ -74,7 +74,7 @@ def test_read_gbq_start_sets_session_location(
# Now read_gbq* from another location should fail
with pytest.raises(
- google.api_core.exceptions.NotFound,
+ (google.api_core.exceptions.NotFound, ValueError),
match=dataset_id_permanent,
):
read_method(query)
@@ -99,7 +99,7 @@ def test_read_gbq_start_sets_session_location(
# Now read_gbq* from another location should fail
with pytest.raises(
- google.api_core.exceptions.NotFound,
+ (google.api_core.exceptions.NotFound, ValueError),
match=dataset_id_permanent_tokyo,
):
read_method(query_tokyo)
@@ -145,7 +145,7 @@ def test_read_gbq_after_session_start_must_comply_with_default_location(
# Doing read_gbq* from a table in another location should fail
with pytest.raises(
- google.api_core.exceptions.NotFound,
+ (google.api_core.exceptions.NotFound, ValueError),
match=dataset_id_permanent_tokyo,
):
read_method(query_tokyo)
@@ -193,7 +193,7 @@ def test_read_gbq_must_comply_with_set_location_US(
# Starting user journey with read_gbq* from another location should fail
with pytest.raises(
- google.api_core.exceptions.NotFound,
+ (google.api_core.exceptions.NotFound, ValueError),
match=dataset_id_permanent_tokyo,
):
read_method(query_tokyo)
@@ -243,7 +243,7 @@ def test_read_gbq_must_comply_with_set_location_non_US(
# Starting user journey with read_gbq* from another location should fail
with pytest.raises(
- google.api_core.exceptions.NotFound,
+ (google.api_core.exceptions.NotFound, ValueError),
match=dataset_id_permanent,
):
read_method(query)
diff --git a/tests/system/small/test_progress_bar.py b/tests/system/small/test_progress_bar.py
index 30ea63b483..bd13ac2240 100644
--- a/tests/system/small/test_progress_bar.py
+++ b/tests/system/small/test_progress_bar.py
@@ -52,14 +52,6 @@ def test_progress_bar_scalar(penguins_df_default_index: bf.dataframe.DataFrame,
assert_loading_msg_exist(capsys.readouterr().out)
-def test_progress_bar_read_gbq(session: bf.Session, penguins_table_id: str, capsys):
- bf.options.display.progress_bar = "terminal"
- capsys.readouterr() # clear output
- session.read_gbq(penguins_table_id)
-
- assert_loading_msg_exist(capsys.readouterr().out)
-
-
def test_progress_bar_extract_jobs(
penguins_df_default_index: bf.dataframe.DataFrame, gcs_folder, capsys
):
@@ -98,9 +90,6 @@ def assert_loading_msg_exist(capystOut: str, pattern=job_load_message_regex):
def test_query_job_repr_html(penguins_df_default_index: bf.dataframe.DataFrame):
bf.options.display.progress_bar = "terminal"
- penguins_df_default_index._block._expr.session.bqclient.default_query_job_config.use_query_cache = (
- False
- )
penguins_df_default_index.to_pandas()
query_job_repr = formatting_helpers.repr_query_job_html(
penguins_df_default_index.query_job
@@ -117,9 +106,6 @@ def test_query_job_repr_html(penguins_df_default_index: bf.dataframe.DataFrame):
def test_query_job_repr(penguins_df_default_index: bf.dataframe.DataFrame):
- penguins_df_default_index._block._expr.session.bqclient.default_query_job_config.use_query_cache = (
- False
- )
penguins_df_default_index.to_pandas()
query_job_repr = formatting_helpers.repr_query_job(
penguins_df_default_index.query_job
diff --git a/tests/system/small/test_remote_function.py b/tests/system/small/test_remote_function.py
index 89907a53df..3d8532a13b 100644
--- a/tests/system/small/test_remote_function.py
+++ b/tests/system/small/test_remote_function.py
@@ -18,7 +18,7 @@
import bigframes
from bigframes import remote_function as rf
-from tests.system.utils import assert_pandas_df_equal_ignore_ordering
+from tests.system.utils import assert_pandas_df_equal
@pytest.fixture(scope="module")
@@ -121,7 +121,7 @@ def square(x):
pd_result_col = pd_result_col.astype(pd.Int64Dtype())
pd_result = pd_int64_col_filtered.to_frame().assign(result=pd_result_col)
- assert_pandas_df_equal_ignore_ordering(bf_result, pd_result)
+ assert_pandas_df_equal(bf_result, pd_result)
@pytest.mark.flaky(retries=2, delay=120)
@@ -170,7 +170,7 @@ def square(x):
pd_result_col = pd_result_col.astype(pd.Int64Dtype())
pd_result = pd_int64_col_filtered.to_frame().assign(result=pd_result_col)
- assert_pandas_df_equal_ignore_ordering(bf_result, pd_result)
+ assert_pandas_df_equal(bf_result, pd_result)
@pytest.mark.flaky(retries=2, delay=120)
@@ -246,7 +246,7 @@ def square(x):
pd_result_col = pd_result_col.astype(pd.Int64Dtype())
pd_result = pd_int64_col_filtered.to_frame().assign(result=pd_result_col)
- assert_pandas_df_equal_ignore_ordering(bf_result, pd_result)
+ assert_pandas_df_equal(bf_result, pd_result)
@pytest.mark.flaky(retries=2, delay=120)
@@ -309,7 +309,7 @@ def square(x):
pd_result_col = pd_result_col.astype(pd.Int64Dtype())
pd_result = pd_int64_col_filtered.to_frame().assign(result=pd_result_col)
- assert_pandas_df_equal_ignore_ordering(bf_result, pd_result)
+ assert_pandas_df_equal(bf_result, pd_result)
@pytest.mark.flaky(retries=2, delay=120)
@@ -348,7 +348,7 @@ def square(x):
pd_result_col = pd_result_col.astype(pd.Int64Dtype())
pd_result = pd_int64_col_filtered.to_frame().assign(result=pd_result_col)
- assert_pandas_df_equal_ignore_ordering(bf_result, pd_result)
+ assert_pandas_df_equal(bf_result, pd_result)
@pytest.mark.flaky(retries=2, delay=120)
@@ -387,7 +387,7 @@ def square(x):
pd_result_col = pd_result_col.astype(pd.Int64Dtype())
pd_result = pd_int64_col_filtered.to_frame().assign(result=pd_result_col)
- assert_pandas_df_equal_ignore_ordering(bf_result, pd_result)
+ assert_pandas_df_equal(bf_result, pd_result)
@pytest.mark.flaky(retries=2, delay=120)
@@ -418,7 +418,7 @@ def add_one(x):
for col in pd_result:
pd_result[col] = pd_result[col].astype(pd_int64_df_filtered[col].dtype)
- assert_pandas_df_equal_ignore_ordering(bf_result, pd_result)
+ assert_pandas_df_equal(bf_result, pd_result)
@pytest.mark.flaky(retries=2, delay=120)
@@ -447,7 +447,7 @@ def add_one(x):
for col in pd_result:
pd_result[col] = pd_result[col].astype(pd_int64_df[col].dtype)
- assert_pandas_df_equal_ignore_ordering(bf_result, pd_result)
+ assert_pandas_df_equal(bf_result, pd_result)
@pytest.mark.flaky(retries=2, delay=120)
@@ -535,7 +535,7 @@ def square1(x):
s2_result_col = int64_col_filtered.apply(square2)
s2_result = int64_col_filtered.to_frame().assign(result=s2_result_col)
- assert_pandas_df_equal_ignore_ordering(s1_result.to_pandas(), s2_result.to_pandas())
+ assert_pandas_df_equal(s1_result.to_pandas(), s2_result.to_pandas())
@pytest.mark.flaky(retries=2, delay=120)
@@ -583,7 +583,9 @@ def test_read_gbq_function_reads_udfs(bigquery_client, dataset_id):
indirect_df = indirect_df.assign(y=indirect_df.x.apply(square))
indirect_df = indirect_df.to_pandas()
- assert_pandas_df_equal_ignore_ordering(direct_df, indirect_df)
+ assert_pandas_df_equal(
+ direct_df, indirect_df, ignore_order=True, check_index_type=False
+ )
@pytest.mark.flaky(retries=2, delay=120)
diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py
index 183ba01c0e..d9fc23fad0 100644
--- a/tests/system/small/test_series.py
+++ b/tests/system/small/test_series.py
@@ -24,10 +24,7 @@
import bigframes.pandas
import bigframes.series as series
-from tests.system.utils import (
- assert_pandas_df_equal_ignore_ordering,
- assert_series_equal_ignoring_order,
-)
+from tests.system.utils import assert_pandas_df_equal, assert_series_equal
def test_series_construct_copy(scalars_dfs):
@@ -210,7 +207,7 @@ def test_abs(scalars_dfs, col_name):
bf_result = scalars_df[col_name].abs().to_pandas()
pd_result = scalars_pandas_df[col_name].abs()
- assert_series_equal_ignoring_order(pd_result, bf_result)
+ assert_series_equal(pd_result, bf_result)
def test_fillna(scalars_dfs):
@@ -218,7 +215,7 @@ def test_fillna(scalars_dfs):
col_name = "string_col"
bf_result = scalars_df[col_name].fillna("Missing").to_pandas()
pd_result = scalars_pandas_df[col_name].fillna("Missing")
- assert_series_equal_ignoring_order(
+ assert_series_equal(
pd_result,
bf_result,
)
@@ -273,21 +270,26 @@ def test_series_replace_list_scalar(scalars_dfs):
@pytest.mark.parametrize(
- ("values",),
+ ("method",),
(
- ([None, 1, 2, None, None, 16, None],),
- ([None, None, 3.6, None],),
- ([403.2, None, 352.1, None, None, 111.9],),
+ ("linear",),
+ ("values",),
+ ("slinear",),
+ ("nearest",),
+ ("zero",),
+ ("pad",),
),
)
-def test_series_interpolate(values):
- pd_series = pd.Series(values)
+def test_series_interpolate(method):
+ values = [None, 1, 2, None, None, 16, None]
+ index = [-3.2, 11.4, 3.56, 4, 4.32, 5.55, 76.8]
+ pd_series = pd.Series(values, index)
bf_series = series.Series(pd_series)
# Pandas can only interpolate on "float64" columns
# https://github.com/pandas-dev/pandas/issues/40252
- pd_result = pd_series.astype("float64").interpolate()
- bf_result = bf_series.interpolate().to_pandas()
+ pd_result = pd_series.astype("float64").interpolate(method=method)
+ bf_result = bf_series.interpolate(method=method).to_pandas()
# pd uses non-null types, while bf uses nullable types
pd.testing.assert_series_equal(
@@ -491,7 +493,7 @@ def test_series_int_int_operators_scalar(
bf_result = maybe_reversed_op(scalars_df["int64_col"], other_scalar).to_pandas()
pd_result = maybe_reversed_op(scalars_pandas_df["int64_col"], other_scalar)
- assert_series_equal_ignoring_order(pd_result, bf_result)
+ assert_series_equal(pd_result, bf_result)
def test_series_pow_scalar(scalars_dfs):
@@ -500,7 +502,7 @@ def test_series_pow_scalar(scalars_dfs):
bf_result = (scalars_df["int64_col"] ** 2).to_pandas()
pd_result = scalars_pandas_df["int64_col"] ** 2
- assert_series_equal_ignoring_order(pd_result, bf_result)
+ assert_series_equal(pd_result, bf_result)
def test_series_pow_scalar_reverse(scalars_dfs):
@@ -509,7 +511,7 @@ def test_series_pow_scalar_reverse(scalars_dfs):
bf_result = (0.8 ** scalars_df["int64_col"]).to_pandas()
pd_result = 0.8 ** scalars_pandas_df["int64_col"]
- assert_series_equal_ignoring_order(pd_result, bf_result)
+ assert_series_equal(pd_result, bf_result)
@pytest.mark.parametrize(
@@ -535,7 +537,7 @@ def test_series_bool_bool_operators_scalar(
bf_result = maybe_reversed_op(scalars_df["bool_col"], other_scalar).to_pandas()
pd_result = maybe_reversed_op(scalars_pandas_df["bool_col"], other_scalar)
- assert_series_equal_ignoring_order(pd_result.astype(pd.BooleanDtype()), bf_result)
+ assert_series_equal(pd_result.astype(pd.BooleanDtype()), bf_result)
@pytest.mark.parametrize(
@@ -573,7 +575,7 @@ def test_series_int_int_operators_series(scalars_dfs, operator):
scalars_df, scalars_pandas_df = scalars_dfs
bf_result = operator(scalars_df["int64_col"], scalars_df["int64_too"]).to_pandas()
pd_result = operator(scalars_pandas_df["int64_col"], scalars_pandas_df["int64_too"])
- assert_series_equal_ignoring_order(pd_result, bf_result)
+ assert_series_equal(pd_result, bf_result)
@pytest.mark.parametrize(
@@ -723,7 +725,7 @@ def test_series_add_scalar(scalars_dfs, other):
bf_result = (scalars_df["float64_col"] + other).to_pandas()
pd_result = scalars_pandas_df["float64_col"] + other
- assert_series_equal_ignoring_order(pd_result, bf_result)
+ assert_series_equal(pd_result, bf_result)
@pytest.mark.parametrize(
@@ -739,7 +741,7 @@ def test_series_add_bigframes_series(scalars_dfs, left_col, right_col):
bf_result = (scalars_df[left_col] + scalars_df[right_col]).to_pandas()
pd_result = scalars_pandas_df[left_col] + scalars_pandas_df[right_col]
- assert_series_equal_ignoring_order(pd_result, bf_result)
+ assert_series_equal(pd_result, bf_result)
@pytest.mark.parametrize(
@@ -761,7 +763,7 @@ def test_series_add_bigframes_series_nested(
scalars_pandas_df[left_col] + scalars_pandas_df[right_col]
) + scalars_pandas_df[righter_col]
- assert_series_equal_ignoring_order(pd_result, bf_result)
+ assert_series_equal(pd_result, bf_result)
def test_series_add_different_table_default_index(
@@ -919,7 +921,7 @@ def test_isnull(scalars_dfs):
# One of dtype mismatches to be documented. Here, the `bf_series.dtype` is `BooleanDtype` but
# the `pd_series.dtype` is `bool`.
- assert_series_equal_ignoring_order(pd_series.astype(pd.BooleanDtype()), bf_series)
+ assert_series_equal(pd_series.astype(pd.BooleanDtype()), bf_series)
def test_notnull(scalars_dfs):
@@ -930,7 +932,7 @@ def test_notnull(scalars_dfs):
# One of dtype mismatches to be documented. Here, the `bf_series.dtype` is `BooleanDtype` but
# the `pd_series.dtype` is `bool`.
- assert_series_equal_ignoring_order(pd_series.astype(pd.BooleanDtype()), bf_series)
+ assert_series_equal(pd_series.astype(pd.BooleanDtype()), bf_series)
def test_round(scalars_dfs):
@@ -939,7 +941,7 @@ def test_round(scalars_dfs):
bf_result = scalars_df[col_name].round().to_pandas()
pd_result = scalars_pandas_df[col_name].round()
- assert_series_equal_ignoring_order(pd_result, bf_result)
+ assert_series_equal(pd_result, bf_result)
def test_eq_scalar(scalars_dfs):
@@ -948,7 +950,7 @@ def test_eq_scalar(scalars_dfs):
bf_result = scalars_df[col_name].eq(0).to_pandas()
pd_result = scalars_pandas_df[col_name].eq(0)
- assert_series_equal_ignoring_order(pd_result, bf_result)
+ assert_series_equal(pd_result, bf_result)
def test_eq_wider_type_scalar(scalars_dfs):
@@ -957,7 +959,7 @@ def test_eq_wider_type_scalar(scalars_dfs):
bf_result = scalars_df[col_name].eq(1.0).to_pandas()
pd_result = scalars_pandas_df[col_name].eq(1.0)
- assert_series_equal_ignoring_order(pd_result, bf_result)
+ assert_series_equal(pd_result, bf_result)
def test_ne_scalar(scalars_dfs):
@@ -966,7 +968,7 @@ def test_ne_scalar(scalars_dfs):
bf_result = (scalars_df[col_name] != 0).to_pandas()
pd_result = scalars_pandas_df[col_name] != 0
- assert_series_equal_ignoring_order(pd_result, bf_result)
+ assert_series_equal(pd_result, bf_result)
def test_eq_int_scalar(scalars_dfs):
@@ -975,7 +977,7 @@ def test_eq_int_scalar(scalars_dfs):
bf_result = (scalars_df[col_name] == 0).to_pandas()
pd_result = scalars_pandas_df[col_name] == 0
- assert_series_equal_ignoring_order(pd_result, bf_result)
+ assert_series_equal(pd_result, bf_result)
@pytest.mark.parametrize(
@@ -994,7 +996,7 @@ def test_eq_same_type_series(scalars_dfs, col_name):
# One of dtype mismatches to be documented. Here, the `bf_series.dtype` is `BooleanDtype` but
# the `pd_series.dtype` is `bool`.
- assert_series_equal_ignoring_order(pd_result.astype(pd.BooleanDtype()), bf_result)
+ assert_series_equal(pd_result.astype(pd.BooleanDtype()), bf_result)
def test_loc_setitem_cell(scalars_df_index, scalars_pandas_df_index):
@@ -1012,6 +1014,17 @@ def test_loc_setitem_cell(scalars_df_index, scalars_pandas_df_index):
pd.testing.assert_series_equal(bf_original.to_pandas(), pd_original)
+def test_at_setitem_row_label_scalar(scalars_dfs):
+ scalars_df, scalars_pandas_df = scalars_dfs
+ bf_series = scalars_df["int64_col"]
+ pd_series = scalars_pandas_df["int64_col"].copy()
+ bf_series.at[1] = 1000
+ pd_series.at[1] = 1000
+ bf_result = bf_series.to_pandas()
+ pd_result = pd_series.astype("Int64")
+ pd.testing.assert_series_equal(bf_result, pd_result)
+
+
def test_ne_obj_series(scalars_dfs):
scalars_df, scalars_pandas_df = scalars_dfs
col_name = "string_col"
@@ -1020,7 +1033,7 @@ def test_ne_obj_series(scalars_dfs):
# One of dtype mismatches to be documented. Here, the `bf_series.dtype` is `BooleanDtype` but
# the `pd_series.dtype` is `bool`.
- assert_series_equal_ignoring_order(pd_result.astype(pd.BooleanDtype()), bf_result)
+ assert_series_equal(pd_result.astype(pd.BooleanDtype()), bf_result)
def test_indexing_using_unselected_series(scalars_dfs):
@@ -1029,7 +1042,7 @@ def test_indexing_using_unselected_series(scalars_dfs):
bf_result = scalars_df[col_name][scalars_df["int64_too"].eq(0)].to_pandas()
pd_result = scalars_pandas_df[col_name][scalars_pandas_df["int64_too"].eq(0)]
- assert_series_equal_ignoring_order(
+ assert_series_equal(
pd_result,
bf_result,
)
@@ -1045,7 +1058,7 @@ def test_indexing_using_selected_series(scalars_dfs):
scalars_pandas_df["string_col"].eq("Hello, World!")
]
- assert_series_equal_ignoring_order(
+ assert_series_equal(
pd_result,
bf_result,
)
@@ -1067,7 +1080,7 @@ def test_nested_filter(scalars_dfs):
) # Convert from nullable bool to nonnullable bool usable as indexer
pd_result = pd_string_col[pd_int64_too == 0][~pd_bool_col]
- assert_series_equal_ignoring_order(
+ assert_series_equal(
pd_result,
bf_result,
)
@@ -1086,7 +1099,7 @@ def test_binop_repeated_application_does_row_identity_joins(scalars_dfs):
bf_result = bf_series.to_pandas()
pd_result = pd_series
- assert_series_equal_ignoring_order(
+ assert_series_equal(
bf_result,
pd_result,
)
@@ -1108,10 +1121,9 @@ def test_binop_opposite_filters(scalars_dfs):
pd_bool_col = scalars_pandas_df["bool_col"]
pd_result = pd_int64_col1[pd_bool_col] + pd_int64_col2[pd_bool_col.__invert__()]
- assert_series_equal_ignoring_order(
- bf_result,
- pd_result,
- )
+ # Passes with ignore_order=False only with some dependency sets
+ # TODO: Determine desired behavior and make test more strict
+ assert_series_equal(bf_result, pd_result, ignore_order=True)
def test_binop_left_filtered(scalars_dfs):
@@ -1126,10 +1138,9 @@ def test_binop_left_filtered(scalars_dfs):
pd_bool_col = scalars_pandas_df["bool_col"]
pd_result = pd_int64_col[pd_bool_col] + pd_float64_col
- assert_series_equal_ignoring_order(
- bf_result,
- pd_result,
- )
+ # Passes with ignore_order=False only with some dependency sets
+ # TODO: Determine desired behavior and make test more strict
+ assert_series_equal(bf_result, pd_result, ignore_order=True)
def test_binop_right_filtered(scalars_dfs):
@@ -1144,7 +1155,7 @@ def test_binop_right_filtered(scalars_dfs):
pd_bool_col = scalars_pandas_df["bool_col"]
pd_result = pd_float64_col + pd_int64_col[pd_bool_col]
- assert_series_equal_ignoring_order(
+ assert_series_equal(
bf_result,
pd_result,
)
@@ -1249,7 +1260,7 @@ def test_groupby_sum(scalars_dfs):
)
# TODO(swast): Update groupby to use index based on group by key(s).
bf_result = bf_series.to_pandas()
- assert_series_equal_ignoring_order(
+ assert_series_equal(
pd_series,
bf_result,
check_exact=False,
@@ -1267,7 +1278,7 @@ def test_groupby_std(scalars_dfs):
.astype(pd.Float64Dtype())
)
bf_result = bf_series.to_pandas()
- assert_series_equal_ignoring_order(
+ assert_series_equal(
pd_series,
bf_result,
check_exact=False,
@@ -1282,7 +1293,7 @@ def test_groupby_var(scalars_dfs):
scalars_pandas_df[col_name].groupby(scalars_pandas_df["string_col"]).var()
)
bf_result = bf_series.to_pandas()
- assert_series_equal_ignoring_order(
+ assert_series_equal(
pd_series,
bf_result,
check_exact=False,
@@ -1334,7 +1345,7 @@ def test_groupby_mean(scalars_dfs):
)
# TODO(swast): Update groupby to use index based on group by key(s).
bf_result = bf_series.to_pandas()
- assert_series_equal_ignoring_order(
+ assert_series_equal(
pd_series,
bf_result,
)
@@ -1372,7 +1383,7 @@ def test_groupby_prod(scalars_dfs):
)
# TODO(swast): Update groupby to use index based on group by key(s).
bf_result = bf_series.to_pandas()
- assert_series_equal_ignoring_order(
+ assert_series_equal(
pd_series,
bf_result,
)
@@ -1582,7 +1593,7 @@ def test_head(scalars_dfs):
bf_result = scalars_df["string_col"].head(2).to_pandas()
pd_result = scalars_pandas_df["string_col"].head(2)
- assert_series_equal_ignoring_order(
+ assert_series_equal(
pd_result,
bf_result,
)
@@ -1597,7 +1608,7 @@ def test_tail(scalars_dfs):
bf_result = scalars_df["string_col"].tail(2).to_pandas()
pd_result = scalars_pandas_df["string_col"].tail(2)
- assert_series_equal_ignoring_order(
+ assert_series_equal(
pd_result,
bf_result,
)
@@ -2065,11 +2076,7 @@ def test_series_filter_items(scalars_df_index, scalars_pandas_df_index):
# Pandas uses int64 instead of Int64 (nullable) dtype.
pd_result.index = pd_result.index.astype(pd.Int64Dtype())
# Ignore ordering as pandas orders differently depending on version
- assert_series_equal_ignoring_order(
- bf_result,
- pd_result,
- check_names=False,
- )
+ assert_series_equal(bf_result, pd_result, check_names=False, ignore_order=True)
def test_series_filter_like(scalars_df_index, scalars_pandas_df_index):
@@ -2197,21 +2204,25 @@ def test_where_with_default(scalars_df_index, scalars_pandas_df_index):
)
-def test_clip(scalars_df_index, scalars_pandas_df_index):
+@pytest.mark.parametrize(
+ ("ordered"),
+ [
+ (True),
+ (False),
+ ],
+)
+def test_clip(scalars_df_index, scalars_pandas_df_index, ordered):
col_bf = scalars_df_index["int64_col"]
lower_bf = scalars_df_index["int64_too"] - 1
upper_bf = scalars_df_index["int64_too"] + 1
- bf_result = col_bf.clip(lower_bf, upper_bf).to_pandas()
+ bf_result = col_bf.clip(lower_bf, upper_bf).to_pandas(ordered=ordered)
col_pd = scalars_pandas_df_index["int64_col"]
lower_pd = scalars_pandas_df_index["int64_too"] - 1
upper_pd = scalars_pandas_df_index["int64_too"] + 1
pd_result = col_pd.clip(lower_pd, upper_pd)
- pd.testing.assert_series_equal(
- bf_result,
- pd_result,
- )
+ assert_series_equal(bf_result, pd_result, ignore_order=not ordered)
def test_clip_filtered_two_sided(scalars_df_index, scalars_pandas_df_index):
@@ -2282,7 +2293,7 @@ def test_to_frame(scalars_dfs):
bf_result = scalars_df["int64_col"].to_frame().to_pandas()
pd_result = scalars_pandas_df["int64_col"].to_frame()
- assert_pandas_df_equal_ignore_ordering(bf_result, pd_result)
+ assert_pandas_df_equal(bf_result, pd_result)
def test_to_json(scalars_df_index, scalars_pandas_df_index):
@@ -2450,7 +2461,7 @@ def test_mask_default_value(scalars_dfs):
pd_col_masked = pd_col.mask(pd_col % 2 == 1)
pd_result = pd_col.to_frame().assign(int64_col_masked=pd_col_masked)
- assert_pandas_df_equal_ignore_ordering(bf_result, pd_result)
+ assert_pandas_df_equal(bf_result, pd_result)
def test_mask_custom_value(scalars_dfs):
@@ -2468,7 +2479,7 @@ def test_mask_custom_value(scalars_dfs):
# odd so should be left as is, but it is being masked in pandas.
# Accidentally, the bigframes behavior matches, but it should be updated
# after the resolution of https://github.com/pandas-dev/pandas/issues/52955
- assert_pandas_df_equal_ignore_ordering(bf_result, pd_result)
+ assert_pandas_df_equal(bf_result, pd_result)
@pytest.mark.parametrize(
@@ -2487,6 +2498,7 @@ def test_mask_custom_value(scalars_dfs):
# with timezone conversions, so we'll allow it.
("timestamp_col", pd.ArrowDtype(pa.timestamp("us"))),
("datetime_col", pd.ArrowDtype(pa.timestamp("us", tz="UTC"))),
+ ("date_col", "string[pyarrow]"),
# TODO(bmil): fix Ibis bug: BigQuery backend rounds to nearest int
# ("float64_col", "Int64"),
# TODO(bmil): decide whether to fix Ibis bug: BigQuery backend
@@ -2564,7 +2576,7 @@ def test_loc_bool_series_default_index(
scalars_pandas_df_default_index.bool_col
]
- assert_pandas_df_equal_ignore_ordering(
+ assert_pandas_df_equal(
bf_result.to_frame(),
pd_result.to_frame(),
)
@@ -2910,3 +2922,30 @@ def test_map_series_input_duplicates_error(scalars_dfs):
scalars_pandas_df.int64_too.map(pd_map_series)
with pytest.raises(pd.errors.InvalidIndexError):
scalars_df.int64_too.map(bf_map_series, verify_integrity=True)
+
+
+@pytest.mark.parametrize(
+ ("frac", "n", "random_state"),
+ [
+ (None, 4, None),
+ (0.5, None, None),
+ (None, 4, 10),
+ (0.5, None, 10),
+ (None, None, None),
+ ],
+ ids=[
+ "n_wo_random_state",
+ "frac_wo_random_state",
+ "n_w_random_state",
+ "frac_w_random_state",
+ "n_default",
+ ],
+)
+def test_sample(scalars_dfs, frac, n, random_state):
+ scalars_df, _ = scalars_dfs
+ df = scalars_df.int64_col.sample(frac=frac, n=n, random_state=random_state)
+ bf_result = df.to_pandas()
+
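+ # frac takes precedence over n; when both are None, sample defaults to a single row.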
+ n = 1 if n is None else n
+ expected_sample_size = round(frac * scalars_df.shape[0]) if frac is not None else n
+ assert bf_result.shape[0] == expected_sample_size
diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py
index bf72e444eb..7cd9f1dd59 100644
--- a/tests/system/small/test_session.py
+++ b/tests/system/small/test_session.py
@@ -19,7 +19,6 @@
import typing
from typing import List
-import google.api_core.exceptions
import google.cloud.bigquery as bigquery
import numpy as np
import pandas as pd
@@ -985,26 +984,3 @@ def test_read_json_gcs_default_engine(session, scalars_dfs, gcs_folder):
assert df.shape[0] == scalars_df.shape[0]
pd.testing.assert_series_equal(df.dtypes, scalars_df.dtypes)
-
-
-def test_session_id(session):
- assert session._session_id is not None
-
- # BQ client always runs query within the opened session.
- query_job = session.bqclient.query("SELECT 1")
- assert query_job.session_info.session_id == session._session_id
-
- # TODO(chelsealin): Verify the session id can be binded with a load job.
-
-
-@pytest.mark.flaky(retries=2)
-def test_to_close_session():
- session = bigframes.Session()
- assert session._session_id is not None
- session.close()
- assert session._session_id is None
-
- # Session has expired and is no longer available.
- with pytest.raises(google.api_core.exceptions.BadRequest):
- query_job = session.bqclient.query("SELECT 1")
- query_job.result() # blocks until finished
diff --git a/tests/system/utils.py b/tests/system/utils.py
index e2daf3b8bf..f7831972b8 100644
--- a/tests/system/utils.py
+++ b/tests/system/utils.py
@@ -21,29 +21,33 @@
import pyarrow as pa # type: ignore
-def assert_pandas_df_equal_ignore_ordering(df0, df1, **kwargs):
- # Sort by a column to get consistent results.
- if df0.index.name != "rowindex":
- df0 = df0.sort_values(
- list(df0.columns.drop("geography_col", errors="ignore"))
- ).reset_index(drop=True)
- df1 = df1.sort_values(
- list(df1.columns.drop("geography_col", errors="ignore"))
- ).reset_index(drop=True)
- else:
- df0 = df0.sort_index()
- df1 = df1.sort_index()
+def assert_pandas_df_equal(df0, df1, ignore_order: bool = False, **kwargs):
+ if ignore_order:
+ # Sort by a column to get consistent results.
+ if df0.index.name != "rowindex":
+ df0 = df0.sort_values(
+ list(df0.columns.drop("geography_col", errors="ignore"))
+ ).reset_index(drop=True)
+ df1 = df1.sort_values(
+ list(df1.columns.drop("geography_col", errors="ignore"))
+ ).reset_index(drop=True)
+ else:
+ df0 = df0.sort_index()
+ df1 = df1.sort_index()
pd.testing.assert_frame_equal(df0, df1, **kwargs)
-def assert_series_equal_ignoring_order(left: pd.Series, right: pd.Series, **kwargs):
- if left.index.name is None:
- left = left.sort_values().reset_index(drop=True)
- right = right.sort_values().reset_index(drop=True)
- else:
- left = left.sort_index()
- right = right.sort_index()
+def assert_series_equal(
+ left: pd.Series, right: pd.Series, ignore_order: bool = False, **kwargs
+):
+ if ignore_order:
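+ # An unnamed index is treated as arbitrary, so compare sorted values; otherwise sort by index.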
+ if left.index.name is None:
+ left = left.sort_values().reset_index(drop=True)
+ right = right.sort_values().reset_index(drop=True)
+ else:
+ left = left.sort_index()
+ right = right.sort_index()
pd.testing.assert_series_equal(left, right, **kwargs)
diff --git a/tests/unit/core/test_log_adapter.py b/tests/unit/core/test_log_adapter.py
new file mode 100644
index 0000000000..376b7f2075
--- /dev/null
+++ b/tests/unit/core/test_log_adapter.py
@@ -0,0 +1,60 @@
+# Copyright 2023 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+
+from bigframes.core import log_adapter
+
+MAX_LABELS_COUNT = 64
+
+
+@pytest.fixture
+def test_instance():
+ # Create a simple class for testing
+ @log_adapter.class_logger
+ class TestClass:
+ def method1(self):
+ pass
+
+ def method2(self):
+ pass
+
+ return TestClass()
+
+
+def test_method_logging(test_instance):
+ test_instance.method1()
+ test_instance.method2()
+
+ # Check if the methods were added to the _api_methods list
+ api_methods = log_adapter.get_and_reset_api_methods()
+ assert api_methods is not None
+ assert "method1" in api_methods
+ assert "method2" in api_methods
+
+
+def test_add_api_method_limit(test_instance):
+ # Ensure that _api_methods is capped at MAX_LABELS_COUNT even after many calls
+ for i in range(70):
+ test_instance.method2()
+ assert len(log_adapter._api_methods) == MAX_LABELS_COUNT
+
+
+def test_get_and_reset_api_methods(test_instance):
+ # Ensure that get_and_reset_api_methods returns a copy and resets the list
+ test_instance.method1()
+ test_instance.method2()
+ previous_methods = log_adapter.get_and_reset_api_methods()
+ assert previous_methods is not None
+ assert log_adapter._api_methods == []
diff --git a/tests/unit/ml/test_golden_sql.py b/tests/unit/ml/test_golden_sql.py
index 3ca7e144a5..017c96d46d 100644
--- a/tests/unit/ml/test_golden_sql.py
+++ b/tests/unit/ml/test_golden_sql.py
@@ -23,21 +23,46 @@
from bigframes.ml import core, linear_model
import bigframes.pandas as bpd
+TEMP_MODEL_ID = bigquery.ModelReference.from_string(
+ "test-project._anon123.temp_model_id"
+)
+
@pytest.fixture
def mock_session():
mock_session = mock.create_autospec(spec=bigframes.Session)
- # return values we don't care about, but need to provide to continue the program when calling session._start_query()
- mock_session._start_query.return_value = (None, mock.MagicMock())
+ mock_session._anonymous_dataset = bigquery.DatasetReference(
+ TEMP_MODEL_ID.project, TEMP_MODEL_ID.dataset_id
+ )
+
+ query_job = mock.create_autospec(bigquery.QueryJob)
+ type(query_job).destination = mock.PropertyMock(
+ return_value=bigquery.TableReference(
+ mock_session._anonymous_dataset, TEMP_MODEL_ID.model_id
+ )
+ )
+ mock_session._start_query.return_value = (None, query_job)
return mock_session
+@pytest.fixture
+def bqml_model_factory(mocker: pytest_mock.MockerFixture):
+ mocker.patch(
+ "bigframes.ml.core.BqmlModelFactory._create_model_ref",
+ return_value=TEMP_MODEL_ID,
+ )
+ bqml_model_factory = core.BqmlModelFactory()
+
+ return bqml_model_factory
+
+
@pytest.fixture
def mock_y():
mock_y = mock.create_autospec(spec=bpd.DataFrame)
mock_y.columns = pd.Index(["input_column_label"])
+ mock_y._cached.return_value = mock_y
return mock_y
@@ -57,21 +82,11 @@ def mock_X(mock_y, mock_session):
["index_column_id"],
["index_column_label"],
)
+ mock_X._cached.return_value = mock_X
return mock_X
-@pytest.fixture
-def bqml_model_factory(mocker: pytest_mock.MockerFixture):
- mocker.patch(
- "bigframes.ml.core.BqmlModelFactory._create_temp_model_id",
- return_value="temp_model_id",
- )
- bqml_model_factory = core.BqmlModelFactory()
-
- return bqml_model_factory
-
-
@pytest.fixture
def bqml_model(mock_session):
bqml_model = core.BqmlModel(
@@ -89,7 +104,7 @@ def test_linear_regression_default_fit(
model.fit(mock_X, mock_y)
mock_session._start_query.assert_called_once_with(
- 'CREATE TEMP MODEL `temp_model_id`\nOPTIONS(\n model_type="LINEAR_REG",\n data_split_method="NO_SPLIT",\n optimize_strategy="normal_equation",\n fit_intercept=True,\n l2_reg=0.0,\n max_iterations=20,\n learn_rate_strategy="line_search",\n early_stop=True,\n min_rel_progress=0.01,\n ls_init_learn_rate=0.1,\n calculate_p_values=False,\n enable_global_explain=False,\n INPUT_LABEL_COLS=["input_column_label"])\nAS input_X_y_sql'
+ 'CREATE OR REPLACE MODEL `test-project`.`_anon123`.`temp_model_id`\nOPTIONS(\n model_type="LINEAR_REG",\n data_split_method="NO_SPLIT",\n optimize_strategy="normal_equation",\n fit_intercept=True,\n l2_reg=0.0,\n max_iterations=20,\n learn_rate_strategy="line_search",\n early_stop=True,\n min_rel_progress=0.01,\n ls_init_learn_rate=0.1,\n calculate_p_values=False,\n enable_global_explain=False,\n INPUT_LABEL_COLS=["input_column_label"])\nAS input_X_y_sql'
)
@@ -99,7 +114,7 @@ def test_linear_regression_params_fit(bqml_model_factory, mock_session, mock_X,
model.fit(mock_X, mock_y)
mock_session._start_query.assert_called_once_with(
- 'CREATE TEMP MODEL `temp_model_id`\nOPTIONS(\n model_type="LINEAR_REG",\n data_split_method="NO_SPLIT",\n optimize_strategy="normal_equation",\n fit_intercept=False,\n l2_reg=0.0,\n max_iterations=20,\n learn_rate_strategy="line_search",\n early_stop=True,\n min_rel_progress=0.01,\n ls_init_learn_rate=0.1,\n calculate_p_values=False,\n enable_global_explain=False,\n INPUT_LABEL_COLS=["input_column_label"])\nAS input_X_y_sql'
+ 'CREATE OR REPLACE MODEL `test-project`.`_anon123`.`temp_model_id`\nOPTIONS(\n model_type="LINEAR_REG",\n data_split_method="NO_SPLIT",\n optimize_strategy="normal_equation",\n fit_intercept=False,\n l2_reg=0.0,\n max_iterations=20,\n learn_rate_strategy="line_search",\n early_stop=True,\n min_rel_progress=0.01,\n ls_init_learn_rate=0.1,\n calculate_p_values=False,\n enable_global_explain=False,\n INPUT_LABEL_COLS=["input_column_label"])\nAS input_X_y_sql'
)
@@ -132,7 +147,7 @@ def test_logistic_regression_default_fit(
model.fit(mock_X, mock_y)
mock_session._start_query.assert_called_once_with(
- 'CREATE TEMP MODEL `temp_model_id`\nOPTIONS(\n model_type="LOGISTIC_REG",\n data_split_method="NO_SPLIT",\n fit_intercept=True,\n auto_class_weights=False,\n INPUT_LABEL_COLS=["input_column_label"])\nAS input_X_y_sql'
+ 'CREATE OR REPLACE MODEL `test-project`.`_anon123`.`temp_model_id`\nOPTIONS(\n model_type="LOGISTIC_REG",\n data_split_method="NO_SPLIT",\n fit_intercept=True,\n auto_class_weights=False,\n INPUT_LABEL_COLS=["input_column_label"])\nAS input_X_y_sql'
)
@@ -146,7 +161,7 @@ def test_logistic_regression_params_fit(
model.fit(mock_X, mock_y)
mock_session._start_query.assert_called_once_with(
- 'CREATE TEMP MODEL `temp_model_id`\nOPTIONS(\n model_type="LOGISTIC_REG",\n data_split_method="NO_SPLIT",\n fit_intercept=False,\n auto_class_weights=True,\n INPUT_LABEL_COLS=["input_column_label"])\nAS input_X_y_sql'
+ 'CREATE OR REPLACE MODEL `test-project`.`_anon123`.`temp_model_id`\nOPTIONS(\n model_type="LOGISTIC_REG",\n data_split_method="NO_SPLIT",\n fit_intercept=False,\n auto_class_weights=True,\n INPUT_LABEL_COLS=["input_column_label"])\nAS input_X_y_sql'
)
diff --git a/tests/unit/ml/test_sql.py b/tests/unit/ml/test_sql.py
index 34a02edd42..ea16722393 100644
--- a/tests/unit/ml/test_sql.py
+++ b/tests/unit/ml/test_sql.py
@@ -14,6 +14,7 @@
from unittest import mock
+import google.cloud.bigquery as bigquery
import pytest
import bigframes.ml.sql as ml_sql
@@ -27,7 +28,7 @@ def base_sql_generator() -> ml_sql.BaseSqlGenerator:
@pytest.fixture(scope="session")
def model_creation_sql_generator() -> ml_sql.ModelCreationSqlGenerator:
- return ml_sql.ModelCreationSqlGenerator(model_id="my_model_id")
+ return ml_sql.ModelCreationSqlGenerator()
@pytest.fixture(scope="session")
@@ -126,11 +127,14 @@ def test_create_model_produces_correct_sql(
):
sql = model_creation_sql_generator.create_model(
source_df=mock_df,
+ model_ref=bigquery.ModelReference.from_string(
+ "test-proj._anonXYZ.create_model_correct_sql"
+ ),
options={"option_key1": "option_value1", "option_key2": 2},
)
assert (
sql
- == """CREATE TEMP MODEL `my_model_id`
+ == """CREATE OR REPLACE MODEL `test-proj`.`_anonXYZ`.`create_model_correct_sql`
OPTIONS(
option_key1="option_value1",
option_key2=2)
@@ -144,6 +148,9 @@ def test_create_model_transform_produces_correct_sql(
):
sql = model_creation_sql_generator.create_model(
source_df=mock_df,
+ model_ref=bigquery.ModelReference.from_string(
+ "test-proj._anonXYZ.create_model_transform"
+ ),
options={"option_key1": "option_value1", "option_key2": 2},
transforms=[
"ML.STANDARD_SCALER(col_a) OVER(col_a) AS scaled_col_a",
@@ -152,7 +159,7 @@ def test_create_model_transform_produces_correct_sql(
)
assert (
sql
- == """CREATE TEMP MODEL `my_model_id`
+ == """CREATE OR REPLACE MODEL `test-proj`.`_anonXYZ`.`create_model_transform`
TRANSFORM(
ML.STANDARD_SCALER(col_a) OVER(col_a) AS scaled_col_a,
ML.ONE_HOT_ENCODER(col_b) OVER(col_b) AS encoded_col_b)
@@ -168,11 +175,14 @@ def test_create_remote_model_produces_correct_sql(
):
sql = model_creation_sql_generator.create_remote_model(
connection_name="my_project.us.my_connection",
+ model_ref=bigquery.ModelReference.from_string(
+ "test-proj._anonXYZ.create_remote_model"
+ ),
options={"option_key1": "option_value1", "option_key2": 2},
)
assert (
sql
- == """CREATE TEMP MODEL `my_model_id`
+ == """CREATE OR REPLACE MODEL `test-proj`.`_anonXYZ`.`create_remote_model`
REMOTE WITH CONNECTION `my_project.us.my_connection`
OPTIONS(
option_key1="option_value1",
@@ -184,11 +194,14 @@ def test_create_imported_model_produces_correct_sql(
model_creation_sql_generator: ml_sql.ModelCreationSqlGenerator,
):
sql = model_creation_sql_generator.create_imported_model(
+ model_ref=bigquery.ModelReference.from_string(
+ "test-proj._anonXYZ.create_imported_model"
+ ),
options={"option_key1": "option_value1", "option_key2": 2},
)
assert (
sql
- == """CREATE TEMP MODEL `my_model_id`
+ == """CREATE OR REPLACE MODEL `test-proj`.`_anonXYZ`.`create_imported_model`
OPTIONS(
option_key1="option_value1",
option_key2=2)"""
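The fixtures above now build model references with the real ``google-cloud-bigquery`` helper instead of a bare string id. For reference, ``ModelReference.from_string`` splits a fully-qualified id into its parts (illustrative snippet, not from the patch):

import google.cloud.bigquery as bigquery

ref = bigquery.ModelReference.from_string("test-proj._anonXYZ.create_model_correct_sql")
print(ref.project)     # test-proj
print(ref.dataset_id)  # _anonXYZ
print(ref.model_id)    # create_model_correct_sql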
diff --git a/tests/unit/resources.py b/tests/unit/resources.py
index 8fc8acd175..8ba321d122 100644
--- a/tests/unit/resources.py
+++ b/tests/unit/resources.py
@@ -66,7 +66,6 @@ def create_bigquery_session(
credentials=credentials, location="test-region"
)
session = bigframes.Session(context=bqoptions, clients_provider=clients_provider)
- session._session_id = session_id
return session
diff --git a/tests/unit/session/test_io_bigquery.py b/tests/unit/session/test_io_bigquery.py
index 03470208e4..e1481d3f05 100644
--- a/tests/unit/session/test_io_bigquery.py
+++ b/tests/unit/session/test_io_bigquery.py
@@ -19,7 +19,113 @@
import google.cloud.bigquery as bigquery
import pytest
-import bigframes.session._io.bigquery
+import bigframes
+from bigframes.core import log_adapter
+import bigframes.pandas as bpd
+import bigframes.session._io.bigquery as io_bq
+
+
+def test_create_job_configs_labels_is_none():
+ api_methods = ["agg", "series-mode"]
+ labels = io_bq.create_job_configs_labels(
+ job_configs_labels=None, api_methods=api_methods
+ )
+ expected_dict = {
+ "recent-bigframes-api-0": "agg",
+ "recent-bigframes-api-1": "series-mode",
+ }
+ assert labels is not None
+ assert labels == expected_dict
+
+
+def test_create_job_configs_labels_length_limit_not_met():
+ cur_labels = {
+ "bigframes-api": "read_pandas",
+ "source": "bigquery-dataframes-temp",
+ }
+ api_methods = ["agg", "series-mode"]
+ labels = io_bq.create_job_configs_labels(
+ job_configs_labels=cur_labels, api_methods=api_methods
+ )
+ expected_dict = {
+ "bigframes-api": "read_pandas",
+ "source": "bigquery-dataframes-temp",
+ "recent-bigframes-api-0": "agg",
+ "recent-bigframes-api-1": "series-mode",
+ }
+ assert labels is not None
+ assert len(labels) == 4
+ assert labels == expected_dict
+
+
+def test_create_job_configs_labels_log_adaptor_call_method_under_length_limit():
+ cur_labels = {
+ "bigframes-api": "read_pandas",
+ "source": "bigquery-dataframes-temp",
+ }
+ df = bpd.DataFrame({"col1": [1, 2], "col2": [3, 4]})
+ # Test running two methods
+ df.head()
+ df.max()
+ api_methods = log_adapter._api_methods
+
+ labels = io_bq.create_job_configs_labels(
+ job_configs_labels=cur_labels, api_methods=api_methods
+ )
+ expected_dict = {
+ "bigframes-api": "read_pandas",
+ "source": "bigquery-dataframes-temp",
+ "recent-bigframes-api-0": "__init__",
+ "recent-bigframes-api-1": "max",
+ "recent-bigframes-api-2": "__init__",
+ "recent-bigframes-api-3": "head",
+ "recent-bigframes-api-4": "__init__",
+ }
+ assert labels is not None
+ assert len(labels) == 7
+ assert labels == expected_dict
+
+
+def test_create_job_configs_labels_length_limit_met_and_labels_is_none():
+ df = bpd.DataFrame({"col1": [1, 2], "col2": [3, 4]})
+ # Run more method calls than the 64-label limit
+ for i in range(66):
+ df.head()
+ api_methods = log_adapter._api_methods
+
+ labels = io_bq.create_job_configs_labels(
+ job_configs_labels=None, api_methods=api_methods
+ )
+ assert labels is not None
+ assert len(labels) == 64
+ assert "head" in labels.values()
+
+
+def test_create_job_configs_labels_length_limit_met():
+ cur_labels = {
+ "bigframes-api": "read_pandas",
+ "source": "bigquery-dataframes-temp",
+ }
+ for i in range(60):
+ key = f"bigframes-api-test-{i}"
+ value = f"test{i}"
+ cur_labels[key] = value
+ # cur_labels now holds 62 entries, leaving room for only two labels from api_methods
+ df = bpd.DataFrame({"col1": [1, 2], "col2": [3, 4]})
+ # Test running two methods
+ df.head()
+ df.max()
+ api_methods = log_adapter._api_methods
+
+ labels = io_bq.create_job_configs_labels(
+ job_configs_labels=cur_labels, api_methods=api_methods
+ )
+ assert labels is not None
+ assert len(labels) == 64
+ assert "max" in labels.values()
+ assert "head" not in labels.values()
+ assert "bigframes-api" in labels.keys()
+ assert "source" in labels.keys()
def test_create_snapshot_sql_doesnt_timetravel_anonymous_datasets():
@@ -125,5 +231,5 @@ def test_create_temp_table_default_expiration():
),
)
def test_bq_schema_to_sql(schema: Iterable[bigquery.SchemaField], expected: str):
- sql = bigframes.session._io.bigquery.bq_schema_to_sql(schema)
+ sql = io_bq.bq_schema_to_sql(schema)
assert sql == expected
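To make the 64-label truncation exercised above concrete, here is a small illustrative call (not from the patch, reusing the ``io_bq`` alias from these tests) that assumes 63 pre-existing labels, leaving a single slot for recent API methods:

existing = {f"user-label-{i}": f"value-{i}" for i in range(63)}
api_methods = ["head", "max", "agg"]

labels = io_bq.create_job_configs_labels(
    job_configs_labels=existing, api_methods=api_methods
)

assert len(labels) == 64                           # capped at the BigQuery limit
assert labels["recent-bigframes-api-0"] == "head"  # only one slot was left
assert "recent-bigframes-api-1" not in labels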
diff --git a/tests/unit/test_compute_options.py b/tests/unit/test_compute_options.py
index 499a0a5fef..a613bca7b9 100644
--- a/tests/unit/test_compute_options.py
+++ b/tests/unit/test_compute_options.py
@@ -18,13 +18,9 @@
def test_maximum_bytes_option():
session = resources.create_bigquery_session()
- num_query_calls = 0
with bf.option_context("compute.maximum_bytes_billed", 10000):
- # clear initial method calls
- session.bqclient.method_calls = []
+ session.bqclient.query.reset_mock()
session._start_query("query")
- for call in session.bqclient.method_calls:
- _, _, kwargs = call
- num_query_calls += 1
- assert kwargs["job_config"].maximum_bytes_billed == 10000
- assert num_query_calls > 0
+ call = session.bqclient.query.call_args
+ assert call.kwargs["job_config"].maximum_bytes_billed == 10000
+ session.bqclient.query.assert_called_once()
diff --git a/tests/unit/test_core.py b/tests/unit/test_core.py
index d9672b2635..623448b3aa 100644
--- a/tests/unit/test_core.py
+++ b/tests/unit/test_core.py
@@ -49,7 +49,7 @@ def test_arrayvalue_constructor_from_ibis_table_adds_all_columns():
ordering=ordering,
hidden_ordering_columns=(),
)
- assert actual.compile()._table is ibis_table
+ assert actual._compile_ordered()._table is ibis_table
assert len(actual.column_ids) == 3
@@ -83,7 +83,7 @@ def test_arrayvalue_with_get_column():
),
total_ordering_columns=["col1"],
)
- col1 = value.compile()._get_ibis_column("col1")
+ col1 = value._compile_ordered()._get_ibis_column("col1")
assert isinstance(col1, ibis_types.Value)
assert col1.get_name() == "col1"
assert col1.type().is_int64()
@@ -100,7 +100,7 @@ def test_arrayvalues_to_ibis_expr_with_get_column():
),
total_ordering_columns=["col1"],
)
- expr = value.compile()._get_ibis_column("col1")
+ expr = value._compile_ordered()._get_ibis_column("col1")
assert expr.get_name() == "col1"
assert expr.type().is_int64()
@@ -117,7 +117,7 @@ def test_arrayvalues_to_ibis_expr_with_concat():
total_ordering_columns=["col1"],
)
expr = value.concat([value])
- actual = expr.compile()._to_ibis_expr("unordered")
+ actual = expr._compile_ordered()._to_ibis_expr(ordering_mode="unordered")
assert len(actual.columns) == 3
# TODO(ashleyxu, b/299631930): test out the union expression
assert actual.columns[0] == "column_0"
@@ -136,8 +136,8 @@ def test_arrayvalues_to_ibis_expr_with_project_unary_op():
),
total_ordering_columns=["col1"],
)
- expr = value.project_unary_op("col1", ops.AsTypeOp("string")).compile()
- assert value.compile().columns[0].type().is_int64()
+ expr = value.project_unary_op("col1", ops.AsTypeOp("string"))._compile_ordered()
+ assert value._compile_ordered().columns[0].type().is_int64()
assert expr.columns[0].type().is_string()
@@ -152,9 +152,11 @@ def test_arrayvalues_to_ibis_expr_with_project_binary_op():
),
total_ordering_columns=["col1"],
)
- expr = value.project_binary_op("col2", "col3", ops.add_op, "col4").compile()
+ expr = value.project_binary_op(
+ "col2", "col3", ops.add_op, "col4"
+ )._compile_ordered()
assert expr.columns[3].type().is_float64()
- actual = expr._to_ibis_expr("unordered")
+ actual = expr._to_ibis_expr(ordering_mode="unordered")
assert len(expr.columns) == 4
assert actual.columns[3] == "col4"
@@ -173,9 +175,9 @@ def test_arrayvalues_to_ibis_expr_with_project_ternary_op():
)
expr = value.project_ternary_op(
"col2", "col3", "col4", ops.where_op, "col5"
- ).compile()
+ )._compile_ordered()
assert expr.columns[4].type().is_float64()
- actual = expr._to_ibis_expr("unordered")
+ actual = expr._to_ibis_expr(ordering_mode="unordered")
assert len(expr.columns) == 5
assert actual.columns[4] == "col5"
@@ -195,8 +197,8 @@ def test_arrayvalue_to_ibis_expr_with_aggregate():
aggregations=(("col1", agg_ops.sum_op, "col4"),),
by_column_ids=["col1"],
dropna=False,
- ).compile()
- actual = expr._to_ibis_expr("unordered")
+ )._compile_ordered()
+ actual = expr._to_ibis_expr(ordering_mode="unordered")
assert len(expr.columns) == 2
assert actual.columns[0] == "col1"
assert actual.columns[1] == "col4"
@@ -214,8 +216,10 @@ def test_arrayvalue_to_ibis_expr_with_corr_aggregate():
),
total_ordering_columns=["col1"],
)
- expr = value.corr_aggregate(corr_aggregations=[("col1", "col3", "col4")]).compile()
- actual = expr._to_ibis_expr("unordered")
+ expr = value.corr_aggregate(
+ corr_aggregations=[("col1", "col3", "col4")]
+ )._compile_ordered()
+ actual = expr._to_ibis_expr(ordering_mode="unordered")
assert len(expr.columns) == 1
assert actual.columns[0] == "col4"
assert expr.columns[0].type().is_float64()
diff --git a/tests/unit/test_pandas.py b/tests/unit/test_pandas.py
index 70c5441c68..4835a24dc7 100644
--- a/tests/unit/test_pandas.py
+++ b/tests/unit/test_pandas.py
@@ -17,8 +17,6 @@
import sys
import unittest.mock as mock
-import google.api_core.exceptions
-import google.cloud.bigquery
import pandas as pd
import pytest
@@ -26,8 +24,6 @@
import bigframes.pandas as bpd
import bigframes.session
-from . import resources
-
leading_whitespace = re.compile(r"^\s+", flags=re.MULTILINE)
@@ -114,37 +110,3 @@ def test_pandas_attribute():
assert bpd.Int64Dtype is pd.Int64Dtype
assert bpd.StringDtype is pd.StringDtype
assert bpd.ArrowDtype is pd.ArrowDtype
-
-
-def test_close_session_after_bq_session_ended(monkeypatch: pytest.MonkeyPatch):
- bqclient = mock.create_autospec(google.cloud.bigquery.Client, instance=True)
- bqclient.project = "test-project"
- session = resources.create_bigquery_session(
- bqclient=bqclient, session_id="JUST_A_TEST"
- )
-
- # Simulate that the session has already expired.
- # Note: this needs to be done after the Session is constructed, as the
- # initializer sends a query to start the BigQuery Session.
- query_job = mock.create_autospec(google.cloud.bigquery.QueryJob, instance=True)
- query_job.result.side_effect = google.api_core.exceptions.BadRequest(
- "Session JUST_A_TEST has expired and is no longer available."
- )
- bqclient.query.return_value = query_job
-
- # Simulate that the session has already started.
- monkeypatch.setattr(bigframes.core.global_session, "_global_session", session)
- bpd.options.bigquery._session_started = True
-
- # Confirm that as a result bigframes.pandas interface is unusable
- with pytest.raises(
- google.api_core.exceptions.BadRequest,
- match="Session JUST_A_TEST has expired and is no longer available.",
- ):
- bpd.read_gbq("SELECT 'ABC'")
-
- # Even though the query to stop the session raises an exception, we should
- # still be able to close it without raising an error to the user.
- bpd.close_session()
- assert "CALL BQ.ABORT_SESSION('JUST_A_TEST')" in bqclient.query.call_args.args[0]
- assert bigframes.core.global_session._global_session is None
diff --git a/third_party/bigframes_vendored/ibis/backends/bigquery/registry.py b/third_party/bigframes_vendored/ibis/backends/bigquery/registry.py
index a4e61ca0f9..e1b28690d7 100644
--- a/third_party/bigframes_vendored/ibis/backends/bigquery/registry.py
+++ b/third_party/bigframes_vendored/ibis/backends/bigquery/registry.py
@@ -22,10 +22,16 @@ def _last_non_null_value(translator, op: vendored_ibis_ops.LastNonNullValue):
return f"LAST_VALUE({arg} IGNORE NULLS)"
+def _to_json_string(translator, op: vendored_ibis_ops.ToJsonString):
+ arg = translator.translate(op.arg)
+ return f"TO_JSON_STRING({arg})"
+
+
patched_ops = {
- vendored_ibis_ops.ApproximateMultiQuantile: _approx_quantiles,
- vendored_ibis_ops.FirstNonNullValue: _first_non_null_value,
- vendored_ibis_ops.LastNonNullValue: _last_non_null_value,
+ vendored_ibis_ops.ApproximateMultiQuantile: _approx_quantiles, # type:ignore
+ vendored_ibis_ops.FirstNonNullValue: _first_non_null_value, # type:ignore
+ vendored_ibis_ops.LastNonNullValue: _last_non_null_value, # type:ignore
+ vendored_ibis_ops.ToJsonString: _to_json_string, # type:ignore
}
OPERATION_REGISTRY.update(patched_ops)
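A minimal check of what the new translation rule emits, using fake stand-ins for the translator and the op node (illustrative only): the rule renders the wrapped argument and surrounds it with BigQuery's TO_JSON_STRING, which accepts any type, including STRUCT and ARRAY.

class FakeTranslator:
    def translate(self, node):
        # Pretend the argument compiles to this column reference.
        return "`my_table`.`struct_col`"


class FakeOp:
    arg = object()  # stand-in for the wrapped ibis node


sql = _to_json_string(FakeTranslator(), FakeOp())
assert sql == "TO_JSON_STRING(`my_table`.`struct_col`)"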
diff --git a/third_party/bigframes_vendored/ibis/expr/operations/__init__.py b/third_party/bigframes_vendored/ibis/expr/operations/__init__.py
index 1612d9c12e..8219701392 100644
--- a/third_party/bigframes_vendored/ibis/expr/operations/__init__.py
+++ b/third_party/bigframes_vendored/ibis/expr/operations/__init__.py
@@ -1,5 +1,6 @@
# Contains code from https://github.com/ibis-project/ibis/blob/master/ibis/expr/operations/__init__.py
from __future__ import annotations
-from third_party.bigframes_vendored.ibis.expr.operations.analytic import * # noqa: F403
-from third_party.bigframes_vendored.ibis.expr.operations.reductions import * # noqa: F403
+from third_party.bigframes_vendored.ibis.expr.operations.analytic import * # noqa: F401 F403
+from third_party.bigframes_vendored.ibis.expr.operations.json import * # noqa: F401 F403
+from third_party.bigframes_vendored.ibis.expr.operations.reductions import * # noqa: F401 F403
diff --git a/third_party/bigframes_vendored/ibis/expr/operations/json.py b/third_party/bigframes_vendored/ibis/expr/operations/json.py
new file mode 100644
index 0000000000..dbb3fa3066
--- /dev/null
+++ b/third_party/bigframes_vendored/ibis/expr/operations/json.py
@@ -0,0 +1,9 @@
+# Contains code from https://github.com/ibis-project/ibis/blob/master/ibis/expr/operations/json.py
+from __future__ import annotations
+
+import ibis.expr.datatypes as dt
+from ibis.expr.operations.core import Unary
+
+
+class ToJsonString(Unary):
+ output_dtype = dt.string
diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py
index 6f4f6be35d..b35d0f3b2e 100644
--- a/third_party/bigframes_vendored/pandas/core/frame.py
+++ b/third_party/bigframes_vendored/pandas/core/frame.py
@@ -2159,8 +2159,68 @@ def map(self, func, na_action: Optional[str] = None) -> DataFrame:
In pandas 2.1.0, DataFrame.applymap is deprecated and renamed to
DataFrame.map.
+ **Examples:**
+
+ >>> import bigframes.pandas as bpd
+ >>> bpd.options.display.progress_bar = None
+
+ Let's use the ``reuse=False`` flag to make sure a new ``remote_function``
+ is created every time we run the following code, but you can skip it
+ to potentially reuse a previously deployed ``remote_function`` from
+ the same user-defined function.
+
+ >>> @bpd.remote_function([int], float, reuse=False)
+ ... def minutes_to_hours(x):
+ ... return x/60
+
+ >>> df_minutes = bpd.DataFrame(
+ ... {"system_minutes" : [0, 30, 60, 90, 120],
+ ... "user_minutes" : [0, 15, 75, 90, 6]})
+ >>> df_minutes
+ system_minutes user_minutes
+ 0 0 0
+ 1 30 15
+ 2 60 75
+ 3 90 90
+ 4 120 6
+
+ [5 rows x 2 columns]
+
+ >>> df_hours = df_minutes.map(minutes_to_hours)
+ >>> df_hours
+ system_minutes user_minutes
+ 0 0.0 0.0
+ 1 0.5 0.25
+ 2 1.0 1.25
+ 3 1.5 1.5
+ 4 2.0 0.1
+
+ [5 rows x 2 columns]
+
+ If there are ``NA``/``None`` values in the data, you can ignore
+ applying the remote function on such values by specifying
+ ``na_action='ignore'``.
+
+ >>> df_minutes = bpd.DataFrame(
+ ... {
+ ... "system_minutes" : [0, 30, 60, None, 90, 120, bpd.NA],
+ ... "user_minutes" : [0, 15, 75, 90, 6, None, bpd.NA]
+ ... }, dtype="Int64")
+ >>> df_hours = df_minutes.map(minutes_to_hours, na_action='ignore')
+ >>> df_hours
+ system_minutes user_minutes
+ 0 0.0 0.0
+ 1 0.5 0.25
+ 2 1.0 1.25
+ 3 1.5
+ 4 1.5 0.1
+ 5 2.0
+ 6
+
+ [7 rows x 2 columns]
+
Args:
- func:
+ func (function):
Python function wrapped by ``remote_function`` decorator,
returns a single value from a single value.
na_action (Optional[str], default None):
@@ -2194,6 +2254,8 @@ def join(self, other, *, on: Optional[str] = None, how: str) -> DataFrame:
and sort it lexicographically. ``inner``: form intersection of
calling frame's index (or column if on is specified) with `other`'s
index, preserving the order of the calling's one.
+ ``cross``: creates the cartesian product from both frames, preserves
+ the order of the left keys.
Returns:
bigframes.dataframe.DataFrame: A dataframe containing columns from both the caller and `other`.
@@ -2208,6 +2270,7 @@ def merge(
"left",
"outer",
"right",
+ "cross",
] = "inner",
on: Optional[str] = None,
*,
@@ -2243,6 +2306,8 @@ def merge(
join; sort keys lexicographically.
``inner``: use intersection of keys from both frames, similar to a SQL inner
join; preserve the order of the left keys.
+ ``cross``: creates the cartesian product from both frames, preserves the order
+ of the left keys.
on (label or list of labels):
Columns to join on. It must be found in both DataFrames. Either on or left_on + right_on
@@ -2867,17 +2932,6 @@ def interpolate(self, method: str = "linear"):
"""
Fill NaN values using an interpolation method.
- Args:
- method (str, default 'linear'):
- Interpolation technique to use. Only 'linear' supported.
- 'linear': Ignore the index and treat the values as equally spaced.
- This is the only method supported on MultiIndexes.
-
- Returns:
- DataFrame:
- Returns the same object type as the caller, interpolated at
- some or all ``NaN`` values
-
**Examples:**
>>> import bigframes.pandas as bpd
@@ -2886,17 +2940,41 @@ def interpolate(self, method: str = "linear"):
>>> df = bpd.DataFrame({
... 'A': [1, 2, 3, None, None, 6],
... 'B': [None, 6, None, 2, None, 3],
- ... })
+ ... }, index=[0, 0.1, 0.3, 0.7, 0.9, 1.0])
>>> df.interpolate()
- A B
- 0 1.0
- 1 2.0 6.0
- 2 3.0 4.0
- 3 4.0 2.0
- 4 5.0 2.5
- 5 6.0 3.0
+ A B
+ 0.0 1.0
+ 0.1 2.0 6.0
+ 0.3 3.0 4.0
+ 0.7 4.0 2.0
+ 0.9 5.0 2.5
+ 1.0 6.0 3.0
[6 rows x 2 columns]
+ >>> df.interpolate(method="values")
+ A B
+ 0.0 1.0
+ 0.1 2.0 6.0
+ 0.3 3.0 4.666667
+ 0.7 4.714286 2.0
+ 0.9 5.571429 2.666667
+ 1.0 6.0 3.0
+
+ [6 rows x 2 columns]
+
+ Args:
+ method (str, default 'linear'):
+ Interpolation technique to use. Only 'linear' supported.
+ 'linear': Ignore the index and treat the values as equally spaced.
+ This is the only method supported on MultiIndexes.
+ 'index', 'values': use the actual numerical values of the index.
+ 'pad': Fill in NaNs using existing values.
+ 'nearest', 'zero', 'slinear': Emulates `scipy.interpolate.interp1d`
+
+ Returns:
+ DataFrame:
+ Returns the same object type as the caller, interpolated at
+ some or all ``NaN`` values
"""
raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
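The ``method="values"`` numbers in the interpolate example come from plain linear interpolation over the numeric index. For instance, column B at index 0.3 lies between the known points (0.1, 6.0) and (0.7, 2.0); a quick check (illustrative arithmetic, not library code):

x0, y0 = 0.1, 6.0  # nearest known point below
x1, y1 = 0.7, 2.0  # nearest known point above
x = 0.3

y = y0 + (x - x0) / (x1 - x0) * (y1 - y0)
print(round(y, 6))  # 4.666667, matching the docstring output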
diff --git a/third_party/bigframes_vendored/pandas/core/reshape/merge.py b/third_party/bigframes_vendored/pandas/core/reshape/merge.py
index b03f366fca..704e50f516 100644
--- a/third_party/bigframes_vendored/pandas/core/reshape/merge.py
+++ b/third_party/bigframes_vendored/pandas/core/reshape/merge.py
@@ -49,6 +49,8 @@ def merge(
join; sort keys lexicographically.
``inner``: use intersection of keys from both frames, similar to a SQL inner
join; preserve the order of the left keys.
+ ``cross``: creates the cartesian product from both frames, preserves the order
+ of the left keys.
on (label or list of labels):
Columns to join on. It must be found in both DataFrames. Either on or left_on + right_on
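A hedged usage sketch of the newly documented ``how="cross"`` option (toy data, column names made up): every row of the left frame is paired with every row of the right frame, keeping the left order first.

import bigframes.pandas as bpd

left = bpd.DataFrame({"color": ["red", "blue"]})
right = bpd.DataFrame({"size": ["S", "M", "L"]})

# 2 x 3 = 6 rows; no join keys are needed for a cross merge.
combos = left.merge(right, how="cross")
print(combos.to_pandas())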
diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py
index b569e5699c..c6d98075f5 100644
--- a/third_party/bigframes_vendored/pandas/core/series.py
+++ b/third_party/bigframes_vendored/pandas/core/series.py
@@ -728,18 +728,74 @@ def apply(
func,
) -> DataFrame | Series:
"""
- Invoke function on values of Series.
+ Invoke function on values of a Series.
- Can be ufunc (a NumPy function that applies to the entire Series)
- or a Python function that only works on single values.
+ **Examples:**
+
+ >>> import bigframes.pandas as bpd
+ >>> bpd.options.display.progress_bar = None
+
+ Let's use the ``reuse=False`` flag to make sure a new ``remote_function``
+ is created every time we run the following code, but you can skip it
+ to potentially reuse a previously deployed ``remote_function`` from
+ the same user-defined function.
+
+ >>> @bpd.remote_function([int], float, reuse=False)
+ ... def minutes_to_hours(x):
+ ... return x/60
+
+ >>> minutes = bpd.Series([0, 30, 60, 90, 120])
+ >>> minutes
+ 0 0
+ 1 30
+ 2 60
+ 3 90
+ 4 120
+ dtype: Int64
+
+ >>> hours = minutes.apply(minutes_to_hours)
+ >>> hours
+ 0 0.0
+ 1 0.5
+ 2 1.0
+ 3 1.5
+ 4 2.0
+ dtype: Float64
+
+ You can turn a user-defined function with external package dependencies
+ into a BigQuery DataFrames remote function by providing the package
+ names via the ``packages`` parameter.
+
+ >>> @bpd.remote_function(
+ ... [str],
+ ... str,
+ ... reuse=False,
+ ... packages=["cryptography"],
+ ... )
+ ... def get_hash(input):
+ ... from cryptography.fernet import Fernet
+ ...
+ ... # handle missing value
+ ... if input is None:
+ ... input = ""
+ ...
+ ... key = Fernet.generate_key()
+ ... f = Fernet(key)
+ ... return f.encrypt(input.encode()).decode()
+
+ >>> names = bpd.Series(["Alice", "Bob"])
+ >>> hashes = names.apply(get_hash)
Args:
func (function):
- Python function or NumPy ufunc to apply.
+ A BigQuery DataFrames ``remote_function`` to apply. The function
+ should take a scalar and return a scalar. It will be applied to
+ every element in the ``Series``.
Returns:
- bigframes.series.Series: If func returns a Series object the result
- will be a DataFrame.
+ bigframes.series.Series: A new Series with values representing the
+ return value of the ``func`` applied to each element of the original
+ Series.
"""
raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
@@ -920,31 +976,49 @@ def interpolate(self, method: str = "linear"):
"""
Fill NaN values using an interpolation method.
+ **Examples:**
+
+ >>> import bigframes.pandas as bpd
+ >>> bpd.options.display.progress_bar = None
+
+ >>> df = bpd.DataFrame({
+ ... 'A': [1, 2, 3, None, None, 6],
+ ... 'B': [None, 6, None, 2, None, 3],
+ ... }, index=[0, 0.1, 0.3, 0.7, 0.9, 1.0])
+ >>> df.interpolate()
+ A B
+ 0.0 1.0
+ 0.1 2.0 6.0
+ 0.3 3.0 4.0
+ 0.7 4.0 2.0
+ 0.9 5.0 2.5
+ 1.0 6.0 3.0
+
+ [6 rows x 2 columns]
+ >>> df.interpolate(method="values")
+ A B
+ 0.0 1.0
+ 0.1 2.0 6.0
+ 0.3 3.0 4.666667
+ 0.7 4.714286 2.0
+ 0.9 5.571429 2.666667
+ 1.0 6.0 3.0
+
+ [6 rows x 2 columns]
+
+
Args:
method (str, default 'linear'):
Interpolation technique to use. Only 'linear' supported.
'linear': Ignore the index and treat the values as equally spaced.
This is the only method supported on MultiIndexes.
-
+ 'index', 'values': use the actual numerical values of the index.
+ 'pad': Fill in NaNs using existing values.
+ 'nearest', 'zero', 'slinear': Emulates `scipy.interpolate.interp1d`
Returns:
Series:
Returns the same object type as the caller, interpolated at
some or all ``NaN`` values
-
- **Examples:**
-
- >>> import bigframes.pandas as bpd
- >>> bpd.options.display.progress_bar = None
-
- >>> series = bpd.Series([1, 2, 3, None, None, 6])
- >>> series.interpolate()
- 0 1.0
- 1 2.0
- 2 3.0
- 3 4.0
- 4 5.0
- 5 6.0
- dtype: Float64
"""
raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
diff --git a/third_party/bigframes_vendored/pandas/io/gbq.py b/third_party/bigframes_vendored/pandas/io/gbq.py
index 575c501618..2161310b07 100644
--- a/third_party/bigframes_vendored/pandas/io/gbq.py
+++ b/third_party/bigframes_vendored/pandas/io/gbq.py
@@ -45,16 +45,6 @@ def read_gbq(
If the input is a table ID:
>>> df = bpd.read_gbq("bigquery-public-data.ml_datasets.penguins")
- >>> df.head(2)
- species island culmen_length_mm \\
- 0 Adelie Penguin (Pygoscelis adeliae) Dream 36.6
- 1 Adelie Penguin (Pygoscelis adeliae) Dream 39.8
-
- culmen_depth_mm flipper_length_mm body_mass_g sex
- 0 18.4 184.0 3475.0 FEMALE
- 1 19.1 184.0 4650.0 MALE
-
- [2 rows x 7 columns]
Preserve ordering in a query input.
diff --git a/third_party/bigframes_vendored/pandas/io/parquet.py b/third_party/bigframes_vendored/pandas/io/parquet.py
index f97bd386a4..0f664e70fc 100644
--- a/third_party/bigframes_vendored/pandas/io/parquet.py
+++ b/third_party/bigframes_vendored/pandas/io/parquet.py
@@ -24,12 +24,6 @@ def read_parquet(
>>> gcs_path = "gs://cloud-samples-data/bigquery/us-states/us-states.parquet"
>>> df = bpd.read_parquet(path=gcs_path)
- >>> df.head(2)
- name post_abbr
- 0 Alabama AL
- 1 Alaska AK
-
- [2 rows x 2 columns]
Args:
path (str):
diff --git a/third_party/bigframes_vendored/pandas/io/pickle.py b/third_party/bigframes_vendored/pandas/io/pickle.py
index 053ba4871c..096d9b13d6 100644
--- a/third_party/bigframes_vendored/pandas/io/pickle.py
+++ b/third_party/bigframes_vendored/pandas/io/pickle.py
@@ -32,16 +32,6 @@ def read_pickle(
>>> gcs_path = "gs://bigframes-dev-testing/test_pickle.pkl"
>>> df = bpd.read_pickle(filepath_or_buffer=gcs_path)
- >>> df.head(2)
- species island culmen_length_mm \\
- 0 Adelie Penguin (Pygoscelis adeliae) Dream 36.6
- 1 Adelie Penguin (Pygoscelis adeliae) Dream 39.8
-
- culmen_depth_mm flipper_length_mm body_mass_g sex
- 0 18.4 184.0 3475.0 FEMALE
- 1 19.1 184.0 4650.0 MALE
-
- [2 rows x 7 columns]
Args:
filepath_or_buffer (str, path object, or file-like object):