@@ -1197,15 +1197,203 @@ def corr(self, method="pearson", min_periods=None, numeric_only=False) -> DataFr
1197
1197
else :
1198
1198
frame = self ._drop_non_numeric ()
1199
1199
1200
- return DataFrame (frame ._block .calculate_pairwise_metric (op = agg_ops .CorrOp ()))
1200
+ orig_columns = frame .columns
1201
+ # Replace column names with 0 to n - 1 to keep order
1202
+ # and avoid the influence of duplicated column name
1203
+ frame .columns = pandas .Index (range (len (orig_columns )))
1204
+ frame = frame .astype (bigframes .dtypes .FLOAT_DTYPE )
1205
+ block = frame ._block
1206
+
1207
+ # A new column that uniquely identifies each row
1208
+ block , ordering_col = frame ._block .promote_offsets (label = "_bigframes_idx" )
1209
+
1210
+ val_col_ids = [
1211
+ col_id for col_id in block .value_columns if col_id != ordering_col
1212
+ ]
1213
+
1214
+ block = block .melt (
1215
+ [ordering_col ], val_col_ids , ["_bigframes_variable" ], "_bigframes_value"
1216
+ )
1217
+
1218
+ block = block .merge (
1219
+ block ,
1220
+ left_join_ids = [ordering_col ],
1221
+ right_join_ids = [ordering_col ],
1222
+ how = "inner" ,
1223
+ sort = False ,
1224
+ )
1225
+
1226
+ frame = DataFrame (block ).dropna (
1227
+ subset = ["_bigframes_value_x" , "_bigframes_value_y" ]
1228
+ )
1229
+
1230
+ paired_mean_frame = (
1231
+ frame .groupby (["_bigframes_variable_x" , "_bigframes_variable_y" ])
1232
+ .agg (
1233
+ _bigframes_paired_mean_x = bigframes .pandas .NamedAgg (
1234
+ column = "_bigframes_value_x" , aggfunc = "mean"
1235
+ ),
1236
+ _bigframes_paired_mean_y = bigframes .pandas .NamedAgg (
1237
+ column = "_bigframes_value_y" , aggfunc = "mean"
1238
+ ),
1239
+ )
1240
+ .reset_index ()
1241
+ )
1242
+
1243
+ frame = frame .merge (
1244
+ paired_mean_frame , on = ["_bigframes_variable_x" , "_bigframes_variable_y" ]
1245
+ )
1246
+ frame ["_bigframes_value_x" ] -= frame ["_bigframes_paired_mean_x" ]
1247
+ frame ["_bigframes_value_y" ] -= frame ["_bigframes_paired_mean_y" ]
1248
+
1249
+ frame ["_bigframes_dividend" ] = (
1250
+ frame ["_bigframes_value_x" ] * frame ["_bigframes_value_y" ]
1251
+ )
1252
+ frame ["_bigframes_x_square" ] = (
1253
+ frame ["_bigframes_value_x" ] * frame ["_bigframes_value_x" ]
1254
+ )
1255
+ frame ["_bigframes_y_square" ] = (
1256
+ frame ["_bigframes_value_y" ] * frame ["_bigframes_value_y" ]
1257
+ )
1258
+
1259
+ result = (
1260
+ frame .groupby (["_bigframes_variable_x" , "_bigframes_variable_y" ])
1261
+ .agg (
1262
+ _bigframes_dividend_sum = bigframes .pandas .NamedAgg (
1263
+ column = "_bigframes_dividend" , aggfunc = "sum"
1264
+ ),
1265
+ _bigframes_x_square_sum = bigframes .pandas .NamedAgg (
1266
+ column = "_bigframes_x_square" , aggfunc = "sum"
1267
+ ),
1268
+ _bigframes_y_square_sum = bigframes .pandas .NamedAgg (
1269
+ column = "_bigframes_y_square" , aggfunc = "sum"
1270
+ ),
1271
+ )
1272
+ .reset_index ()
1273
+ )
1274
+ result ["_bigframes_corr" ] = result ["_bigframes_dividend_sum" ] / (
1275
+ (
1276
+ result ["_bigframes_x_square_sum" ] * result ["_bigframes_y_square_sum" ]
1277
+ )._apply_unary_op (ops .sqrt_op )
1278
+ )
1279
+ result = result ._pivot (
1280
+ index = "_bigframes_variable_x" ,
1281
+ columns = "_bigframes_variable_y" ,
1282
+ values = "_bigframes_corr" ,
1283
+ )
1284
+
1285
+ map_data = {
1286
+ f"_bigframes_level_{ i } " : orig_columns .get_level_values (i )
1287
+ for i in range (orig_columns .nlevels )
1288
+ }
1289
+ map_data ["_bigframes_keys" ] = range (len (orig_columns ))
1290
+ map_df = bigframes .dataframe .DataFrame (
1291
+ map_data ,
1292
+ session = self ._get_block ().expr .session ,
1293
+ ).set_index ("_bigframes_keys" )
1294
+ result = result .join (map_df ).sort_index ()
1295
+ index_columns = [f"_bigframes_level_{ i } " for i in range (orig_columns .nlevels )]
1296
+ result = result .set_index (index_columns )
1297
+ result .index .names = orig_columns .names
1298
+ result .columns = orig_columns
1299
+
1300
+ return result
1201
1301
1202
1302
def cov(self, *, numeric_only: bool = False) -> DataFrame:
    """Compute the pairwise covariance of the columns.

    Every pair of columns is evaluated only over the rows in which both
    values are non-null. For each pair the values are centred on the
    pair-specific means, the products of the deviations are summed, and
    the sum is divided by ``count - 1``.

    Args:
        numeric_only (bool, default False):
            When False, the frame is passed through
            ``self._raise_on_non_numeric`` (presumably rejecting frames
            that contain non-numeric columns). When True, the frame
            returned by ``self._drop_non_numeric`` is used instead.

    Returns:
        DataFrame: A frame whose index and columns are both the column
        labels of the frame that was analysed, holding the covariance of
        each column pair.
    """
    if not numeric_only:
        # Pass this method's own name; the previous "corr" was copied from
        # corr() and mislabelled the operation.
        frame = self._raise_on_non_numeric("cov")
    else:
        frame = self._drop_non_numeric()

    orig_columns = frame.columns
    # Replace the column labels with the positions 0..n-1 so that the
    # original order can be restored later and duplicated labels cannot
    # collide with each other.
    # NOTE(review): this assignment happens before `frame` is rebound. If
    # `_raise_on_non_numeric` hands back `self`, the caller's labels would
    # be replaced as well - confirm against that helper.
    frame.columns = pandas.Index(range(len(orig_columns)))
    frame = frame.astype(bigframes.dtypes.FLOAT_DTYPE)

    # A new column that uniquely identifies each row.
    block, ordering_col = frame._block.promote_offsets(label="_bigframes_idx")

    val_col_ids = [
        col_id for col_id in block.value_columns if col_id != ordering_col
    ]

    # Long format: one (row id, variable, value) record per original cell.
    block = block.melt(
        [ordering_col], val_col_ids, ["_bigframes_variable"], "_bigframes_value"
    )
    # Self-join on the row id so that each output row pairs two cells of
    # the same original row (columns suffixed `_x` and `_y`).
    block = block.merge(
        block,
        left_join_ids=[ordering_col],
        right_join_ids=[ordering_col],
        how="inner",
        sort=False,
    )

    # Keep only the observations where both members of the pair are present.
    frame = DataFrame(block).dropna(
        subset=["_bigframes_value_x", "_bigframes_value_y"]
    )

    pair_keys = ["_bigframes_variable_x", "_bigframes_variable_y"]

    # Means are computed per pair because the set of surviving rows differs
    # from pair to pair once nulls have been dropped.
    paired_mean_frame = (
        frame.groupby(pair_keys)
        .agg(
            _bigframes_paired_mean_x=bigframes.pandas.NamedAgg(
                column="_bigframes_value_x", aggfunc="mean"
            ),
            _bigframes_paired_mean_y=bigframes.pandas.NamedAgg(
                column="_bigframes_value_y", aggfunc="mean"
            ),
        )
        .reset_index()
    )

    frame = frame.merge(paired_mean_frame, on=pair_keys)
    frame["_bigframes_value_x"] -= frame["_bigframes_paired_mean_x"]
    frame["_bigframes_value_y"] -= frame["_bigframes_paired_mean_y"]

    frame["_bigframes_dividend"] = (
        frame["_bigframes_value_x"] * frame["_bigframes_value_y"]
    )

    result = (
        frame.groupby(pair_keys)
        .agg(
            _bigframes_dividend_sum=bigframes.pandas.NamedAgg(
                column="_bigframes_dividend", aggfunc="sum"
            ),
            _bigframes_dividend_count=bigframes.pandas.NamedAgg(
                column="_bigframes_dividend", aggfunc="count"
            ),
        )
        .reset_index()
    )
    # Divide by (count - 1). A pair with a single shared observation
    # divides by zero; the outcome then depends on the division semantics
    # of the underlying series.
    result["_bigframes_cov"] = result["_bigframes_dividend_sum"] / (
        result["_bigframes_dividend_count"] - 1
    )
    result = result._pivot(
        index="_bigframes_variable_x",
        columns="_bigframes_variable_y",
        values="_bigframes_cov",
    )

    # Map the positional keys back to the original labels, one helper
    # column per level of the original column index.
    index_columns = [
        f"_bigframes_level_{i}" for i in range(orig_columns.nlevels)
    ]
    map_data = {
        level_col: orig_columns.get_level_values(i)
        for i, level_col in enumerate(index_columns)
    }
    map_data["_bigframes_keys"] = range(len(orig_columns))
    map_df = bigframes.dataframe.DataFrame(
        map_data,
        session=self._get_block().expr.session,
    ).set_index("_bigframes_keys")

    # Sorting by the positional key restores the original column order
    # before the key is swapped for the real labels.
    result = result.join(map_df).sort_index()
    result = result.set_index(index_columns)
    result.index.names = orig_columns.names
    result.columns = orig_columns

    return result
1209
1397
1210
1398
def to_arrow (
1211
1399
self ,
0 commit comments