Skip to content
This repository was archived by the owner on May 7, 2026. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Previous commit
Next commit
use pk before index, fix unit test
  • Loading branch information
TrevorBergeron committed Feb 3, 2025
commit f16233b707791ebff1442a9a6414111a8c81a3e2
55 changes: 2 additions & 53 deletions bigframes/session/_io/bigquery/read_gbq_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,7 +166,7 @@ def infer_primary_key(
# Essentially, just reordering the primary key to match the index col order
return tuple(index_col for index_col in index_cols if index_col in primary_keys)

if metadata_only:
if primary_keys or metadata_only:
# Sometimes not worth scanning data to check uniqueness
return primary_keys
# TODO(b/337925142): Avoid a "SELECT *" subquery here by ensuring
Expand All @@ -179,7 +179,7 @@ def infer_primary_key(

if row["total_count"] == row["distinct_count"]:
return tuple(index_cols)
return primary_keys
return ()


def _get_primary_keys(
Expand Down Expand Up @@ -280,54 +280,3 @@ def get_index_cols(
index_cols = primary_keys

return index_cols


def get_time_travel_datetime_and_table_metadata(
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Dead code?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, dead, and now gone.

bqclient: bigquery.Client,
table_ref: bigquery.TableReference,
*,
api_name: str,
cache: Dict[bigquery.TableReference, Tuple[datetime.datetime, bigquery.Table]],
use_cache: bool = True,
) -> Tuple[datetime.datetime, bigquery.Table]:
cached_table = cache.get(table_ref)
if use_cache and cached_table is not None:
snapshot_timestamp, _ = cached_table

# Cache hit could be unexpected. See internal issue 329545805.
# Raise a warning with more information about how to avoid the
# problems with the cache.
msg = (
f"Reading cached table from {snapshot_timestamp} to avoid "
"incompatibilies with previous reads of this table. To read "
"the latest version, set `use_cache=False` or close the "
"current session with Session.close() or "
"bigframes.pandas.close_session()."
)
# There are many layers before we get to (possibly) the user's code:
# pandas.read_gbq_table
# -> with_default_session
# -> Session.read_gbq_table
# -> _read_gbq_table
# -> _get_snapshot_sql_and_primary_key
# -> get_time_travel_datetime_and_table_metadata
warnings.warn(msg, stacklevel=7)
return cached_table

# TODO(swast): It's possible that the table metadata is changed between now
# and when we run the CURRENT_TIMESTAMP() query to see when we can time
# travel to. Find a way to fetch the table metadata and BQ's current time
# atomically.
table = bqclient.get_table(table_ref)

job_config = bigquery.QueryJobConfig()
job_config.labels["bigframes-api"] = api_name
snapshot_timestamp = list(
bqclient.query(
"SELECT CURRENT_TIMESTAMP() AS `current_timestamp`",
job_config=job_config,
).result()
)[0][0]
cached_table = (snapshot_timestamp, table)
cache[table_ref] = cached_table
return cached_table
6 changes: 3 additions & 3 deletions tests/unit/session/test_session.py
Original file line number Diff line number Diff line change
Expand Up @@ -217,10 +217,10 @@ def test_read_gbq_cached_table():
table,
)

session.bqclient.get_table.return_value = table
session.bqclient.query_and_wait.return_value = (
{"total_count": 3, "distinct_count": 2},
session.bqclient.query_and_wait = mock.MagicMock(
return_value=({"total_count": 3, "distinct_count": 2},)
)
session.bqclient.get_table.return_value = table

with pytest.warns(UserWarning, match=re.escape("use_cache=False")):
df = session.read_gbq("my-project.my_dataset.my_table")
Expand Down