54
54
import google .api_core .gapic_v1 .client_info
55
55
import google .auth .credentials
56
56
import google .cloud .bigquery as bigquery
57
+ import google .cloud .bigquery .table
57
58
import google .cloud .bigquery_connection_v1
58
59
import google .cloud .bigquery_storage_v1
59
60
import google .cloud .functions_v2
@@ -693,7 +694,7 @@ def read_gbq_table(
693
694
694
695
def _get_snapshot_sql_and_primary_key (
695
696
self ,
696
- table_ref : bigquery .table .TableReference ,
697
+ table : google . cloud . bigquery .table .Table ,
697
698
* ,
698
699
api_name : str ,
699
700
use_cache : bool = True ,
@@ -709,7 +710,7 @@ def _get_snapshot_sql_and_primary_key(
709
710
table ,
710
711
) = bigframes_io .get_snapshot_datetime_and_table_metadata (
711
712
self .bqclient ,
712
- table_ref = table_ref ,
713
+ table_ref = table . reference ,
713
714
api_name = api_name ,
714
715
cache = self ._df_snapshot ,
715
716
use_cache = use_cache ,
@@ -735,7 +736,7 @@ def _get_snapshot_sql_and_primary_key(
735
736
736
737
try :
737
738
table_expression = self .ibis_client .sql (
738
- bigframes_io .create_snapshot_sql (table_ref , snapshot_timestamp )
739
+ bigframes_io .create_snapshot_sql (table . reference , snapshot_timestamp )
739
740
)
740
741
except google .api_core .exceptions .Forbidden as ex :
741
742
if "Drive credentials" in ex .message :
@@ -763,8 +764,9 @@ def _read_gbq_table(
763
764
query , default_project = self .bqclient .project
764
765
)
765
766
767
+ table = self .bqclient .get_table (table_ref )
766
768
(table_expression , primary_keys ,) = self ._get_snapshot_sql_and_primary_key (
767
- table_ref , api_name = api_name , use_cache = use_cache
769
+ table , api_name = api_name , use_cache = use_cache
768
770
)
769
771
total_ordering_cols = primary_keys
770
772
@@ -836,9 +838,13 @@ def _read_gbq_table(
836
838
ordering = ordering ,
837
839
)
838
840
else :
839
- array_value = self ._create_total_ordering (table_expression )
841
+ array_value = self ._create_total_ordering (
842
+ table_expression , table_rows = table .num_rows
843
+ )
840
844
else :
841
- array_value = self ._create_total_ordering (table_expression )
845
+ array_value = self ._create_total_ordering (
846
+ table_expression , table_rows = table .num_rows
847
+ )
842
848
843
849
value_columns = [col for col in array_value .column_ids if col not in index_cols ]
844
850
block = blocks .Block (
@@ -1459,10 +1465,19 @@ def _create_empty_temp_table(
1459
1465
def _create_total_ordering (
1460
1466
self ,
1461
1467
table : ibis_types .Table ,
1468
+ table_rows : Optional [int ],
1462
1469
) -> core .ArrayValue :
1463
1470
# Since this might also be used as the index, don't use the default
1464
1471
# "ordering ID" name.
1472
+
1473
+ # For small tables, a single 64-bit hash is enough to avoid collisions; two hashes (128 bits) make collisions practically impossible at any table size.
1474
+ # Assume the table is large if its row count is unknown (None) or reported as 0.
1475
+ use_double_hash = (
1476
+ (table_rows is None ) or (table_rows == 0 ) or (table_rows > 100000 )
1477
+ )
1478
+
1465
1479
ordering_hash_part = guid .generate_guid ("bigframes_ordering_" )
1480
+ ordering_hash_part2 = guid .generate_guid ("bigframes_ordering_" )
1466
1481
ordering_rand_part = guid .generate_guid ("bigframes_ordering_" )
1467
1482
1468
1483
# All inputs into hash must be non-null or resulting hash will be null
@@ -1475,25 +1490,30 @@ def _create_total_ordering(
1475
1490
else str_values [0 ]
1476
1491
)
1477
1492
full_row_hash = full_row_str .hash ().name (ordering_hash_part )
1493
+ # Appending a suffix to the row string before hashing yields a second hash that is uncorrelated with the first.
1494
+ full_row_hash_p2 = (full_row_str + "_" ).hash ().name (ordering_hash_part2 )
1478
1495
# Used to disambiguate between identical rows (which will have identical hash)
1479
1496
random_value = ibis .random ().name (ordering_rand_part )
1480
1497
1498
+ order_values = (
1499
+ [full_row_hash , full_row_hash_p2 , random_value ]
1500
+ if use_double_hash
1501
+ else [full_row_hash , random_value ]
1502
+ )
1503
+
1481
1504
original_column_ids = table .columns
1482
1505
table_with_ordering = table .select (
1483
- itertools .chain (original_column_ids , [ full_row_hash , random_value ] )
1506
+ itertools .chain (original_column_ids , order_values )
1484
1507
)
1485
1508
1486
- ordering_ref1 = order .ascending_over (ordering_hash_part )
1487
- ordering_ref2 = order .ascending_over (ordering_rand_part )
1488
1509
ordering = order .ExpressionOrdering (
1489
- ordering_value_columns = (ordering_ref1 , ordering_ref2 ),
1490
- total_ordering_columns = frozenset ([ordering_hash_part , ordering_rand_part ]),
1510
+ ordering_value_columns = tuple (
1511
+ order .ascending_over (col .get_name ()) for col in order_values
1512
+ ),
1513
+ total_ordering_columns = frozenset (col .get_name () for col in order_values ),
1491
1514
)
1492
1515
columns = [table_with_ordering [col ] for col in original_column_ids ]
1493
- hidden_columns = [
1494
- table_with_ordering [ordering_hash_part ],
1495
- table_with_ordering [ordering_rand_part ],
1496
- ]
1516
+ hidden_columns = [table_with_ordering [col .get_name ()] for col in order_values ]
1497
1517
return core .ArrayValue .from_ibis (
1498
1518
self ,
1499
1519
table_with_ordering ,
0 commit comments