62
62
import ibis
63
63
import ibis .backends .bigquery as ibis_bigquery
64
64
import ibis .expr .types as ibis_types
65
+ import jellyfish
65
66
import numpy as np
66
67
import pandas
67
68
from pandas ._typing import (
@@ -339,19 +340,6 @@ def read_gbq(
339
340
elif col_order :
340
341
columns = col_order
341
342
342
- filters = list (filters )
343
- if len (filters ) != 0 or bf_io_bigquery .is_table_with_wildcard_suffix (
344
- query_or_table
345
- ):
346
- # TODO(b/338111344): This appears to be missing index_cols, which
347
- # are necessary to be selected.
348
- # TODO(b/338039517): Refactor this to be called inside both
349
- # _read_gbq_query and _read_gbq_table (after detecting primary keys)
350
- # so we can make sure index_col/index_cols reflects primary keys.
351
- query_or_table = bf_io_bigquery .to_query (
352
- query_or_table , _to_index_cols (index_col ), columns , filters
353
- )
354
-
355
343
if bf_io_bigquery .is_query (query_or_table ):
356
344
return self ._read_gbq_query (
357
345
query_or_table ,
@@ -361,6 +349,7 @@ def read_gbq(
361
349
max_results = max_results ,
362
350
api_name = "read_gbq" ,
363
351
use_cache = use_cache ,
352
+ filters = filters ,
364
353
)
365
354
else :
366
355
if configuration is not None :
@@ -377,6 +366,7 @@ def read_gbq(
377
366
max_results = max_results ,
378
367
api_name = "read_gbq" ,
379
368
use_cache = use_cache if use_cache is not None else True ,
369
+ filters = filters ,
380
370
)
381
371
382
372
def _query_to_destination (
@@ -451,6 +441,7 @@ def read_gbq_query(
451
441
max_results : Optional [int ] = None ,
452
442
use_cache : Optional [bool ] = None ,
453
443
col_order : Iterable [str ] = (),
444
+ filters : third_party_pandas_gbq .FiltersType = (),
454
445
) -> dataframe .DataFrame :
455
446
"""Turn a SQL query into a DataFrame.
456
447
@@ -517,6 +508,7 @@ def read_gbq_query(
517
508
max_results = max_results ,
518
509
api_name = "read_gbq_query" ,
519
510
use_cache = use_cache ,
511
+ filters = filters ,
520
512
)
521
513
522
514
def _read_gbq_query (
@@ -529,6 +521,7 @@ def _read_gbq_query(
529
521
max_results : Optional [int ] = None ,
530
522
api_name : str = "read_gbq_query" ,
531
523
use_cache : Optional [bool ] = None ,
524
+ filters : third_party_pandas_gbq .FiltersType = (),
532
525
) -> dataframe .DataFrame :
533
526
import bigframes .dataframe as dataframe
534
527
@@ -557,6 +550,21 @@ def _read_gbq_query(
557
550
558
551
index_cols = _to_index_cols (index_col )
559
552
553
+ filters = list (filters )
554
+ if len (filters ) != 0 or max_results is not None :
555
+ # TODO(b/338111344): If we are running a query anyway, we might as
556
+ # well generate ROW_NUMBER() at the same time.
557
+ query = bf_io_bigquery .to_query (
558
+ query ,
559
+ index_cols ,
560
+ columns ,
561
+ filters ,
562
+ max_results = max_results ,
563
+ # We're executing the query, so we don't need time travel for
564
+ # determinism.
565
+ time_travel_timestamp = None ,
566
+ )
567
+
560
568
destination , query_job = self ._query_to_destination (
561
569
query ,
562
570
index_cols ,
@@ -580,12 +588,14 @@ def _read_gbq_query(
580
588
session = self ,
581
589
)
582
590
583
- return self .read_gbq_table (
591
+ return self ._read_gbq_table (
584
592
f"{ destination .project } .{ destination .dataset_id } .{ destination .table_id } " ,
585
593
index_col = index_col ,
586
594
columns = columns ,
587
- max_results = max_results ,
588
595
use_cache = configuration ["query" ]["useQueryCache" ],
596
+ api_name = api_name ,
597
+ # max_results and filters are omitted because they are already
598
+ # handled by to_query(), above.
589
599
)
590
600
591
601
def read_gbq_table (
@@ -621,31 +631,14 @@ def read_gbq_table(
621
631
elif col_order :
622
632
columns = col_order
623
633
624
- filters = list (filters )
625
- if len (filters ) != 0 or bf_io_bigquery .is_table_with_wildcard_suffix (query ):
626
- # TODO(b/338039517): Refactor this to be called inside both
627
- # _read_gbq_query and _read_gbq_table (after detecting primary keys)
628
- # so we can make sure index_col/index_cols reflects primary keys.
629
- query = bf_io_bigquery .to_query (
630
- query , _to_index_cols (index_col ), columns , filters
631
- )
632
-
633
- return self ._read_gbq_query (
634
- query ,
635
- index_col = index_col ,
636
- columns = columns ,
637
- max_results = max_results ,
638
- api_name = "read_gbq_table" ,
639
- use_cache = use_cache ,
640
- )
641
-
642
634
return self ._read_gbq_table (
643
635
query = query ,
644
636
index_col = index_col ,
645
637
columns = columns ,
646
638
max_results = max_results ,
647
639
api_name = "read_gbq_table" ,
648
640
use_cache = use_cache ,
641
+ filters = filters ,
649
642
)
650
643
651
644
def _read_gbq_table (
@@ -657,6 +650,7 @@ def _read_gbq_table(
657
650
max_results : Optional [int ] = None ,
658
651
api_name : str ,
659
652
use_cache : bool = True ,
653
+ filters : third_party_pandas_gbq .FiltersType = (),
660
654
) -> dataframe .DataFrame :
661
655
import bigframes .dataframe as dataframe
662
656
@@ -673,6 +667,9 @@ def _read_gbq_table(
673
667
query , default_project = self .bqclient .project
674
668
)
675
669
670
+ columns = list (columns )
671
+ filters = list (filters )
672
+
676
673
# ---------------------------------
677
674
# Fetch table metadata and validate
678
675
# ---------------------------------
@@ -684,62 +681,110 @@ def _read_gbq_table(
684
681
cache = self ._df_snapshot ,
685
682
use_cache = use_cache ,
686
683
)
684
+ table_column_names = {field .name for field in table .schema }
687
685
688
686
if table .location .casefold () != self ._location .casefold ():
689
687
raise ValueError (
690
688
f"Current session is in { self ._location } but dataset '{ table .project } .{ table .dataset_id } ' is located in { table .location } "
691
689
)
692
690
693
- # -----------------------------------------
694
- # Create Ibis table expression and validate
695
- # -----------------------------------------
696
-
697
- # Use a time travel to make sure the DataFrame is deterministic, even
698
- # if the underlying table changes.
699
- table_expression = bf_read_gbq_table .get_ibis_time_travel_table (
700
- self .ibis_client ,
701
- table_ref ,
702
- time_travel_timestamp ,
703
- )
704
-
705
691
for key in columns :
706
- if key not in table_expression .columns :
692
+ if key not in table_column_names :
693
+ possibility = min (
694
+ table_column_names ,
695
+ key = lambda item : jellyfish .levenshtein_distance (key , item ),
696
+ )
707
697
raise ValueError (
708
- f"Column '{ key } ' of `columns` not found in this table."
698
+ f"Column '{ key } ' of `columns` not found in this table. Did you mean ' { possibility } '? "
709
699
)
710
700
711
- # ---------------------------------------
712
- # Create a non-default index and validate
713
- # ---------------------------------------
714
-
715
- # TODO(b/337925142): Move index_cols creation to before we create the
716
- # Ibis table expression so we don't have a "SELECT *" subquery in the
717
- # query that checks for index uniqueness.
718
-
719
- index_cols , is_index_unique = bf_read_gbq_table .get_index_cols_and_uniqueness (
720
- bqclient = self .bqclient ,
721
- ibis_client = self .ibis_client ,
701
+ # Converting index_col into a list of column names requires
702
+ # the table metadata because we might use the primary keys
703
+ # when constructing the index.
704
+ index_cols = bf_read_gbq_table .get_index_cols (
722
705
table = table ,
723
- table_expression = table_expression ,
724
706
index_col = index_col ,
725
- api_name = api_name ,
726
707
)
727
708
728
709
for key in index_cols :
729
- if key not in table_expression .columns :
710
+ if key not in table_column_names :
711
+ possibility = min (
712
+ table_column_names ,
713
+ key = lambda item : jellyfish .levenshtein_distance (key , item ),
714
+ )
730
715
raise ValueError (
731
- f"Column ` { key } ` of `index_col` not found in this table."
716
+ f"Column ' { key } ' of `index_col` not found in this table. Did you mean ' { possibility } '? "
732
717
)
733
718
734
- # TODO(b/337925142): We should push down column filters when we get the time
735
- # travel table to avoid "SELECT *" subqueries.
736
- if columns :
737
- table_expression = table_expression .select ([* index_cols , * columns ])
719
+ # -----------------------------
720
+ # Optionally, execute the query
721
+ # -----------------------------
722
+
723
+ # max_results introduces non-determinism and limits the cost on
724
+ # clustered tables, so fallback to a query. We do this here so that
725
+ # the index is consistent with tables that have primary keys, even
726
+ # when max_results is set.
727
+ # TODO(b/338419730): We don't need to fallback to a query for wildcard
728
+ # tables if we allow some non-determinism when time travel isn't supported.
729
+ if max_results is not None or bf_io_bigquery .is_table_with_wildcard_suffix (
730
+ query
731
+ ):
732
+ # TODO(b/338111344): If we are running a query anyway, we might as
733
+ # well generate ROW_NUMBER() at the same time.
734
+ query = bf_io_bigquery .to_query (
735
+ query ,
736
+ index_cols = index_cols ,
737
+ columns = columns ,
738
+ filters = filters ,
739
+ max_results = max_results ,
740
+ # We're executing the query, so we don't need time travel for
741
+ # determinism.
742
+ time_travel_timestamp = None ,
743
+ )
744
+
745
+ return self ._read_gbq_query (
746
+ query ,
747
+ index_col = index_cols ,
748
+ columns = columns ,
749
+ api_name = "read_gbq_table" ,
750
+ use_cache = use_cache ,
751
+ )
752
+
753
+ # -----------------------------------------
754
+ # Create Ibis table expression and validate
755
+ # -----------------------------------------
756
+
757
+ # Use a time travel to make sure the DataFrame is deterministic, even
758
+ # if the underlying table changes.
759
+ # TODO(b/340540991): If a dry run query fails with time travel but
760
+ # succeeds without it, omit the time travel clause and raise a warning
761
+ # about potential non-determinism if the underlying tables are modified.
762
+ table_expression = bf_read_gbq_table .get_ibis_time_travel_table (
763
+ ibis_client = self .ibis_client ,
764
+ table_ref = table_ref ,
765
+ index_cols = index_cols ,
766
+ columns = columns ,
767
+ filters = filters ,
768
+ time_travel_timestamp = time_travel_timestamp ,
769
+ )
738
770
739
771
# ----------------------------
740
772
# Create ordering and validate
741
773
# ----------------------------
742
774
775
+ # TODO(b/337925142): Generate a new subquery with just the index_cols
776
+ # in the Ibis table expression so we don't have a "SELECT *" subquery
777
+ # in the query that checks for index uniqueness.
778
+ # TODO(b/338065601): Provide a way to assume uniqueness and avoid this
779
+ # check.
780
+ is_index_unique = bf_read_gbq_table .are_index_cols_unique (
781
+ bqclient = self .bqclient ,
782
+ ibis_client = self .ibis_client ,
783
+ table = table ,
784
+ index_cols = index_cols ,
785
+ api_name = api_name ,
786
+ )
787
+
743
788
if is_index_unique :
744
789
array_value = bf_read_gbq_table .to_array_value_with_total_ordering (
745
790
session = self ,
0 commit comments