diff --git a/.github/.OwlBot.lock.yaml b/.github/.OwlBot.lock.yaml index ec696b558c..453b540c1e 100644 --- a/.github/.OwlBot.lock.yaml +++ b/.github/.OwlBot.lock.yaml @@ -13,5 +13,5 @@ # limitations under the License. docker: image: gcr.io/cloud-devrel-public-resources/owlbot-python:latest - digest: sha256:30470597773378105e239b59fce8eb27cc97375580d592699206d17d117143d0 -# created: 2023-11-03T00:57:07.335914631Z + digest: sha256:caffe0a9277daeccc4d1de5c9b55ebba0901b57c2f713ec9c876b0d4ec064f61 +# created: 2023-11-08T19:46:45.022803742Z diff --git a/.kokoro/requirements.txt b/.kokoro/requirements.txt index 16170d0ca7..8957e21104 100644 --- a/.kokoro/requirements.txt +++ b/.kokoro/requirements.txt @@ -4,91 +4,75 @@ # # pip-compile --allow-unsafe --generate-hashes requirements.in # -argcomplete==2.0.0 \ - --hash=sha256:6372ad78c89d662035101418ae253668445b391755cfe94ea52f1b9d22425b20 \ - --hash=sha256:cffa11ea77999bb0dd27bb25ff6dc142a6796142f68d45b1a26b11f58724561e +argcomplete==3.1.4 \ + --hash=sha256:72558ba729e4c468572609817226fb0a6e7e9a0a7d477b882be168c0b4a62b94 \ + --hash=sha256:fbe56f8cda08aa9a04b307d8482ea703e96a6a801611acb4be9bf3942017989f # via nox -attrs==22.1.0 \ - --hash=sha256:29adc2665447e5191d0e7c568fde78b21f9672d344281d0c6e1ab085429b22b6 \ - --hash=sha256:86efa402f67bf2df34f51a335487cf46b1ec130d02b8d39fd248abfd30da551c +attrs==23.1.0 \ + --hash=sha256:1f28b4522cdc2fb4256ac1a020c78acf9cba2c6b461ccd2c126f3aa8e8335d04 \ + --hash=sha256:6279836d581513a26f1bf235f9acd333bc9115683f14f7e8fae46c98fc50e015 # via gcp-releasetool -bleach==5.0.1 \ - --hash=sha256:085f7f33c15bd408dd9b17a4ad77c577db66d76203e5984b1bd59baeee948b2a \ - --hash=sha256:0d03255c47eb9bd2f26aa9bb7f2107732e7e8fe195ca2f64709fcf3b0a4a085c - # via readme-renderer -cachetools==5.2.0 \ - --hash=sha256:6a94c6402995a99c3970cc7e4884bb60b4a8639938157eeed436098bf9831757 \ - --hash=sha256:f9f17d2aec496a9aa6b76f53e3b614c965223c061982d434d160f930c698a9db +cachetools==5.3.2 \ + --hash=sha256:086ee420196f7b2ab9ca2db2520aca326318b68fe5ba8bc4d49cca91add450f2 \ + --hash=sha256:861f35a13a451f94e301ce2bec7cac63e881232ccce7ed67fab9b5df4d3beaa1 # via google-auth certifi==2023.7.22 \ --hash=sha256:539cc1d13202e33ca466e88b2807e29f4c13049d6d87031a3c110744495cb082 \ --hash=sha256:92d6037539857d8206b8f6ae472e8b77db8058fec5937a1ef3f54304089edbb9 # via requests -cffi==1.15.1 \ - --hash=sha256:00a9ed42e88df81ffae7a8ab6d9356b371399b91dbdf0c3cb1e84c03a13aceb5 \ - --hash=sha256:03425bdae262c76aad70202debd780501fabeaca237cdfddc008987c0e0f59ef \ - --hash=sha256:04ed324bda3cda42b9b695d51bb7d54b680b9719cfab04227cdd1e04e5de3104 \ - --hash=sha256:0e2642fe3142e4cc4af0799748233ad6da94c62a8bec3a6648bf8ee68b1c7426 \ - --hash=sha256:173379135477dc8cac4bc58f45db08ab45d228b3363adb7af79436135d028405 \ - --hash=sha256:198caafb44239b60e252492445da556afafc7d1e3ab7a1fb3f0584ef6d742375 \ - --hash=sha256:1e74c6b51a9ed6589199c787bf5f9875612ca4a8a0785fb2d4a84429badaf22a \ - --hash=sha256:2012c72d854c2d03e45d06ae57f40d78e5770d252f195b93f581acf3ba44496e \ - --hash=sha256:21157295583fe8943475029ed5abdcf71eb3911894724e360acff1d61c1d54bc \ - --hash=sha256:2470043b93ff09bf8fb1d46d1cb756ce6132c54826661a32d4e4d132e1977adf \ - --hash=sha256:285d29981935eb726a4399badae8f0ffdff4f5050eaa6d0cfc3f64b857b77185 \ - --hash=sha256:30d78fbc8ebf9c92c9b7823ee18eb92f2e6ef79b45ac84db507f52fbe3ec4497 \ - --hash=sha256:320dab6e7cb2eacdf0e658569d2575c4dad258c0fcc794f46215e1e39f90f2c3 \ - --hash=sha256:33ab79603146aace82c2427da5ca6e58f2b3f2fb5da893ceac0c42218a40be35 \ - 
--hash=sha256:3548db281cd7d2561c9ad9984681c95f7b0e38881201e157833a2342c30d5e8c \ - --hash=sha256:3799aecf2e17cf585d977b780ce79ff0dc9b78d799fc694221ce814c2c19db83 \ - --hash=sha256:39d39875251ca8f612b6f33e6b1195af86d1b3e60086068be9cc053aa4376e21 \ - --hash=sha256:3b926aa83d1edb5aa5b427b4053dc420ec295a08e40911296b9eb1b6170f6cca \ - --hash=sha256:3bcde07039e586f91b45c88f8583ea7cf7a0770df3a1649627bf598332cb6984 \ - --hash=sha256:3d08afd128ddaa624a48cf2b859afef385b720bb4b43df214f85616922e6a5ac \ - --hash=sha256:3eb6971dcff08619f8d91607cfc726518b6fa2a9eba42856be181c6d0d9515fd \ - --hash=sha256:40f4774f5a9d4f5e344f31a32b5096977b5d48560c5592e2f3d2c4374bd543ee \ - --hash=sha256:4289fc34b2f5316fbb762d75362931e351941fa95fa18789191b33fc4cf9504a \ - --hash=sha256:470c103ae716238bbe698d67ad020e1db9d9dba34fa5a899b5e21577e6d52ed2 \ - --hash=sha256:4f2c9f67e9821cad2e5f480bc8d83b8742896f1242dba247911072d4fa94c192 \ - --hash=sha256:50a74364d85fd319352182ef59c5c790484a336f6db772c1a9231f1c3ed0cbd7 \ - --hash=sha256:54a2db7b78338edd780e7ef7f9f6c442500fb0d41a5a4ea24fff1c929d5af585 \ - --hash=sha256:5635bd9cb9731e6d4a1132a498dd34f764034a8ce60cef4f5319c0541159392f \ - --hash=sha256:59c0b02d0a6c384d453fece7566d1c7e6b7bae4fc5874ef2ef46d56776d61c9e \ - --hash=sha256:5d598b938678ebf3c67377cdd45e09d431369c3b1a5b331058c338e201f12b27 \ - --hash=sha256:5df2768244d19ab7f60546d0c7c63ce1581f7af8b5de3eb3004b9b6fc8a9f84b \ - --hash=sha256:5ef34d190326c3b1f822a5b7a45f6c4535e2f47ed06fec77d3d799c450b2651e \ - --hash=sha256:6975a3fac6bc83c4a65c9f9fcab9e47019a11d3d2cf7f3c0d03431bf145a941e \ - --hash=sha256:6c9a799e985904922a4d207a94eae35c78ebae90e128f0c4e521ce339396be9d \ - --hash=sha256:70df4e3b545a17496c9b3f41f5115e69a4f2e77e94e1d2a8e1070bc0c38c8a3c \ - --hash=sha256:7473e861101c9e72452f9bf8acb984947aa1661a7704553a9f6e4baa5ba64415 \ - --hash=sha256:8102eaf27e1e448db915d08afa8b41d6c7ca7a04b7d73af6514df10a3e74bd82 \ - --hash=sha256:87c450779d0914f2861b8526e035c5e6da0a3199d8f1add1a665e1cbc6fc6d02 \ - --hash=sha256:8b7ee99e510d7b66cdb6c593f21c043c248537a32e0bedf02e01e9553a172314 \ - --hash=sha256:91fc98adde3d7881af9b59ed0294046f3806221863722ba7d8d120c575314325 \ - --hash=sha256:94411f22c3985acaec6f83c6df553f2dbe17b698cc7f8ae751ff2237d96b9e3c \ - --hash=sha256:98d85c6a2bef81588d9227dde12db8a7f47f639f4a17c9ae08e773aa9c697bf3 \ - --hash=sha256:9ad5db27f9cabae298d151c85cf2bad1d359a1b9c686a275df03385758e2f914 \ - --hash=sha256:a0b71b1b8fbf2b96e41c4d990244165e2c9be83d54962a9a1d118fd8657d2045 \ - --hash=sha256:a0f100c8912c114ff53e1202d0078b425bee3649ae34d7b070e9697f93c5d52d \ - --hash=sha256:a591fe9e525846e4d154205572a029f653ada1a78b93697f3b5a8f1f2bc055b9 \ - --hash=sha256:a5c84c68147988265e60416b57fc83425a78058853509c1b0629c180094904a5 \ - --hash=sha256:a66d3508133af6e8548451b25058d5812812ec3798c886bf38ed24a98216fab2 \ - --hash=sha256:a8c4917bd7ad33e8eb21e9a5bbba979b49d9a97acb3a803092cbc1133e20343c \ - --hash=sha256:b3bbeb01c2b273cca1e1e0c5df57f12dce9a4dd331b4fa1635b8bec26350bde3 \ - --hash=sha256:cba9d6b9a7d64d4bd46167096fc9d2f835e25d7e4c121fb2ddfc6528fb0413b2 \ - --hash=sha256:cc4d65aeeaa04136a12677d3dd0b1c0c94dc43abac5860ab33cceb42b801c1e8 \ - --hash=sha256:ce4bcc037df4fc5e3d184794f27bdaab018943698f4ca31630bc7f84a7b69c6d \ - --hash=sha256:cec7d9412a9102bdc577382c3929b337320c4c4c4849f2c5cdd14d7368c5562d \ - --hash=sha256:d400bfb9a37b1351253cb402671cea7e89bdecc294e8016a707f6d1d8ac934f9 \ - --hash=sha256:d61f4695e6c866a23a21acab0509af1cdfd2c013cf256bbf5b6b5e2695827162 \ - 
--hash=sha256:db0fbb9c62743ce59a9ff687eb5f4afbe77e5e8403d6697f7446e5f609976f76 \ - --hash=sha256:dd86c085fae2efd48ac91dd7ccffcfc0571387fe1193d33b6394db7ef31fe2a4 \ - --hash=sha256:e00b098126fd45523dd056d2efba6c5a63b71ffe9f2bbe1a4fe1716e1d0c331e \ - --hash=sha256:e229a521186c75c8ad9490854fd8bbdd9a0c9aa3a524326b55be83b54d4e0ad9 \ - --hash=sha256:e263d77ee3dd201c3a142934a086a4450861778baaeeb45db4591ef65550b0a6 \ - --hash=sha256:ed9cb427ba5504c1dc15ede7d516b84757c3e3d7868ccc85121d9310d27eed0b \ - --hash=sha256:fa6693661a4c91757f4412306191b6dc88c1703f780c8234035eac011922bc01 \ - --hash=sha256:fcd131dd944808b5bdb38e6f5b53013c5aa4f334c5cad0c72742f6eba4b73db0 +cffi==1.16.0 \ + --hash=sha256:0c9ef6ff37e974b73c25eecc13952c55bceed9112be2d9d938ded8e856138bcc \ + --hash=sha256:131fd094d1065b19540c3d72594260f118b231090295d8c34e19a7bbcf2e860a \ + --hash=sha256:1b8ebc27c014c59692bb2664c7d13ce7a6e9a629be20e54e7271fa696ff2b417 \ + --hash=sha256:2c56b361916f390cd758a57f2e16233eb4f64bcbeee88a4881ea90fca14dc6ab \ + --hash=sha256:2d92b25dbf6cae33f65005baf472d2c245c050b1ce709cc4588cdcdd5495b520 \ + --hash=sha256:31d13b0f99e0836b7ff893d37af07366ebc90b678b6664c955b54561fc36ef36 \ + --hash=sha256:32c68ef735dbe5857c810328cb2481e24722a59a2003018885514d4c09af9743 \ + --hash=sha256:3686dffb02459559c74dd3d81748269ffb0eb027c39a6fc99502de37d501faa8 \ + --hash=sha256:582215a0e9adbe0e379761260553ba11c58943e4bbe9c36430c4ca6ac74b15ed \ + --hash=sha256:5b50bf3f55561dac5438f8e70bfcdfd74543fd60df5fa5f62d94e5867deca684 \ + --hash=sha256:5bf44d66cdf9e893637896c7faa22298baebcd18d1ddb6d2626a6e39793a1d56 \ + --hash=sha256:6602bc8dc6f3a9e02b6c22c4fc1e47aa50f8f8e6d3f78a5e16ac33ef5fefa324 \ + --hash=sha256:673739cb539f8cdaa07d92d02efa93c9ccf87e345b9a0b556e3ecc666718468d \ + --hash=sha256:68678abf380b42ce21a5f2abde8efee05c114c2fdb2e9eef2efdb0257fba1235 \ + --hash=sha256:68e7c44931cc171c54ccb702482e9fc723192e88d25a0e133edd7aff8fcd1f6e \ + --hash=sha256:6b3d6606d369fc1da4fd8c357d026317fbb9c9b75d36dc16e90e84c26854b088 \ + --hash=sha256:748dcd1e3d3d7cd5443ef03ce8685043294ad6bd7c02a38d1bd367cfd968e000 \ + --hash=sha256:7651c50c8c5ef7bdb41108b7b8c5a83013bfaa8a935590c5d74627c047a583c7 \ + --hash=sha256:7b78010e7b97fef4bee1e896df8a4bbb6712b7f05b7ef630f9d1da00f6444d2e \ + --hash=sha256:7e61e3e4fa664a8588aa25c883eab612a188c725755afff6289454d6362b9673 \ + --hash=sha256:80876338e19c951fdfed6198e70bc88f1c9758b94578d5a7c4c91a87af3cf31c \ + --hash=sha256:8895613bcc094d4a1b2dbe179d88d7fb4a15cee43c052e8885783fac397d91fe \ + --hash=sha256:88e2b3c14bdb32e440be531ade29d3c50a1a59cd4e51b1dd8b0865c54ea5d2e2 \ + --hash=sha256:8f8e709127c6c77446a8c0a8c8bf3c8ee706a06cd44b1e827c3e6a2ee6b8c098 \ + --hash=sha256:9cb4a35b3642fc5c005a6755a5d17c6c8b6bcb6981baf81cea8bfbc8903e8ba8 \ + --hash=sha256:9f90389693731ff1f659e55c7d1640e2ec43ff725cc61b04b2f9c6d8d017df6a \ + --hash=sha256:a09582f178759ee8128d9270cd1344154fd473bb77d94ce0aeb2a93ebf0feaf0 \ + --hash=sha256:a6a14b17d7e17fa0d207ac08642c8820f84f25ce17a442fd15e27ea18d67c59b \ + --hash=sha256:a72e8961a86d19bdb45851d8f1f08b041ea37d2bd8d4fd19903bc3083d80c896 \ + --hash=sha256:abd808f9c129ba2beda4cfc53bde801e5bcf9d6e0f22f095e45327c038bfe68e \ + --hash=sha256:ac0f5edd2360eea2f1daa9e26a41db02dd4b0451b48f7c318e217ee092a213e9 \ + --hash=sha256:b29ebffcf550f9da55bec9e02ad430c992a87e5f512cd63388abb76f1036d8d2 \ + --hash=sha256:b2ca4e77f9f47c55c194982e10f058db063937845bb2b7a86c84a6cfe0aefa8b \ + --hash=sha256:b7be2d771cdba2942e13215c4e340bfd76398e9227ad10402a8767ab1865d2e6 \ + 
--hash=sha256:b84834d0cf97e7d27dd5b7f3aca7b6e9263c56308ab9dc8aae9784abb774d404 \ + --hash=sha256:b86851a328eedc692acf81fb05444bdf1891747c25af7529e39ddafaf68a4f3f \ + --hash=sha256:bcb3ef43e58665bbda2fb198698fcae6776483e0c4a631aa5647806c25e02cc0 \ + --hash=sha256:c0f31130ebc2d37cdd8e44605fb5fa7ad59049298b3f745c74fa74c62fbfcfc4 \ + --hash=sha256:c6a164aa47843fb1b01e941d385aab7215563bb8816d80ff3a363a9f8448a8dc \ + --hash=sha256:d8a9d3ebe49f084ad71f9269834ceccbf398253c9fac910c4fd7053ff1386936 \ + --hash=sha256:db8e577c19c0fda0beb7e0d4e09e0ba74b1e4c092e0e40bfa12fe05b6f6d75ba \ + --hash=sha256:dc9b18bf40cc75f66f40a7379f6a9513244fe33c0e8aa72e2d56b0196a7ef872 \ + --hash=sha256:e09f3ff613345df5e8c3667da1d918f9149bd623cd9070c983c013792a9a62eb \ + --hash=sha256:e4108df7fe9b707191e55f33efbcb2d81928e10cea45527879a4749cbe472614 \ + --hash=sha256:e6024675e67af929088fda399b2094574609396b1decb609c55fa58b028a32a1 \ + --hash=sha256:e70f54f1796669ef691ca07d046cd81a29cb4deb1e5f942003f401c0c4a2695d \ + --hash=sha256:e715596e683d2ce000574bae5d07bd522c781a822866c20495e52520564f0969 \ + --hash=sha256:e760191dd42581e023a68b758769e2da259b5d52e3103c6060ddc02c9edb8d7b \ + --hash=sha256:ed86a35631f7bfbb28e108dd96773b9d5a6ce4811cf6ea468bb6a359b256b1e4 \ + --hash=sha256:ee07e47c12890ef248766a6e55bd38ebfb2bb8edd4142d56db91b21ea68b7627 \ + --hash=sha256:fa3a0128b152627161ce47201262d3140edb5a5c3da88d73a1b790a959126956 \ + --hash=sha256:fcc8eb6d5902bb1cf6dc4f187ee3ea80a1eba0a89aba40a5cb20a5087d961357 # via cryptography charset-normalizer==2.1.1 \ --hash=sha256:5a3d016c7c547f69d6f81fb0db9449ce888b418b5b9952cc5e6e66843e9dd845 \ @@ -109,78 +93,74 @@ colorlog==6.7.0 \ # via # gcp-docuploader # nox -commonmark==0.9.1 \ - --hash=sha256:452f9dc859be7f06631ddcb328b6919c67984aca654e5fefb3914d54691aed60 \ - --hash=sha256:da2f38c92590f83de410ba1a3cbceafbc74fee9def35f9251ba9a971d6d66fd9 - # via rich -cryptography==41.0.4 \ - --hash=sha256:004b6ccc95943f6a9ad3142cfabcc769d7ee38a3f60fb0dddbfb431f818c3a67 \ - --hash=sha256:047c4603aeb4bbd8db2756e38f5b8bd7e94318c047cfe4efeb5d715e08b49311 \ - --hash=sha256:0d9409894f495d465fe6fda92cb70e8323e9648af912d5b9141d616df40a87b8 \ - --hash=sha256:23a25c09dfd0d9f28da2352503b23e086f8e78096b9fd585d1d14eca01613e13 \ - --hash=sha256:2ed09183922d66c4ec5fdaa59b4d14e105c084dd0febd27452de8f6f74704143 \ - --hash=sha256:35c00f637cd0b9d5b6c6bd11b6c3359194a8eba9c46d4e875a3660e3b400005f \ - --hash=sha256:37480760ae08065437e6573d14be973112c9e6dcaf5f11d00147ee74f37a3829 \ - --hash=sha256:3b224890962a2d7b57cf5eeb16ccaafba6083f7b811829f00476309bce2fe0fd \ - --hash=sha256:5a0f09cefded00e648a127048119f77bc2b2ec61e736660b5789e638f43cc397 \ - --hash=sha256:5b72205a360f3b6176485a333256b9bcd48700fc755fef51c8e7e67c4b63e3ac \ - --hash=sha256:7e53db173370dea832190870e975a1e09c86a879b613948f09eb49324218c14d \ - --hash=sha256:7febc3094125fc126a7f6fb1f420d0da639f3f32cb15c8ff0dc3997c4549f51a \ - --hash=sha256:80907d3faa55dc5434a16579952ac6da800935cd98d14dbd62f6f042c7f5e839 \ - --hash=sha256:86defa8d248c3fa029da68ce61fe735432b047e32179883bdb1e79ed9bb8195e \ - --hash=sha256:8ac4f9ead4bbd0bc8ab2d318f97d85147167a488be0e08814a37eb2f439d5cf6 \ - --hash=sha256:93530900d14c37a46ce3d6c9e6fd35dbe5f5601bf6b3a5c325c7bffc030344d9 \ - --hash=sha256:9eeb77214afae972a00dee47382d2591abe77bdae166bda672fb1e24702a3860 \ - --hash=sha256:b5f4dfe950ff0479f1f00eda09c18798d4f49b98f4e2006d644b3301682ebdca \ - --hash=sha256:c3391bd8e6de35f6f1140e50aaeb3e2b3d6a9012536ca23ab0d9c35ec18c8a91 \ - 
--hash=sha256:c880eba5175f4307129784eca96f4e70b88e57aa3f680aeba3bab0e980b0f37d \ - --hash=sha256:cecfefa17042941f94ab54f769c8ce0fe14beff2694e9ac684176a2535bf9714 \ - --hash=sha256:e40211b4923ba5a6dc9769eab704bdb3fbb58d56c5b336d30996c24fcf12aadb \ - --hash=sha256:efc8ad4e6fc4f1752ebfb58aefece8b4e3c4cae940b0994d43649bdfce8d0d4f +cryptography==41.0.5 \ + --hash=sha256:0c327cac00f082013c7c9fb6c46b7cc9fa3c288ca702c74773968173bda421bf \ + --hash=sha256:0d2a6a598847c46e3e321a7aef8af1436f11c27f1254933746304ff014664d84 \ + --hash=sha256:227ec057cd32a41c6651701abc0328135e472ed450f47c2766f23267b792a88e \ + --hash=sha256:22892cc830d8b2c89ea60148227631bb96a7da0c1b722f2aac8824b1b7c0b6b8 \ + --hash=sha256:392cb88b597247177172e02da6b7a63deeff1937fa6fec3bbf902ebd75d97ec7 \ + --hash=sha256:3be3ca726e1572517d2bef99a818378bbcf7d7799d5372a46c79c29eb8d166c1 \ + --hash=sha256:573eb7128cbca75f9157dcde974781209463ce56b5804983e11a1c462f0f4e88 \ + --hash=sha256:580afc7b7216deeb87a098ef0674d6ee34ab55993140838b14c9b83312b37b86 \ + --hash=sha256:5a70187954ba7292c7876734183e810b728b4f3965fbe571421cb2434d279179 \ + --hash=sha256:73801ac9736741f220e20435f84ecec75ed70eda90f781a148f1bad546963d81 \ + --hash=sha256:7d208c21e47940369accfc9e85f0de7693d9a5d843c2509b3846b2db170dfd20 \ + --hash=sha256:8254962e6ba1f4d2090c44daf50a547cd5f0bf446dc658a8e5f8156cae0d8548 \ + --hash=sha256:88417bff20162f635f24f849ab182b092697922088b477a7abd6664ddd82291d \ + --hash=sha256:a48e74dad1fb349f3dc1d449ed88e0017d792997a7ad2ec9587ed17405667e6d \ + --hash=sha256:b948e09fe5fb18517d99994184854ebd50b57248736fd4c720ad540560174ec5 \ + --hash=sha256:c707f7afd813478e2019ae32a7c49cd932dd60ab2d2a93e796f68236b7e1fbf1 \ + --hash=sha256:d38e6031e113b7421db1de0c1b1f7739564a88f1684c6b89234fbf6c11b75147 \ + --hash=sha256:d3977f0e276f6f5bf245c403156673db103283266601405376f075c849a0b936 \ + --hash=sha256:da6a0ff8f1016ccc7477e6339e1d50ce5f59b88905585f77193ebd5068f1e797 \ + --hash=sha256:e270c04f4d9b5671ebcc792b3ba5d4488bf7c42c3c241a3748e2599776f29696 \ + --hash=sha256:e886098619d3815e0ad5790c973afeee2c0e6e04b4da90b88e6bd06e2a0b1b72 \ + --hash=sha256:ec3b055ff8f1dce8e6ef28f626e0972981475173d7973d63f271b29c8a2897da \ + --hash=sha256:fba1e91467c65fe64a82c689dc6cf58151158993b13eb7a7f3f4b7f395636723 # via # gcp-releasetool # secretstorage -distlib==0.3.6 \ - --hash=sha256:14bad2d9b04d3a36127ac97f30b12a19268f211063d8f8ee4f47108896e11b46 \ - --hash=sha256:f35c4b692542ca110de7ef0bea44d73981caeb34ca0b9b6b2e6d7790dda8f80e +distlib==0.3.7 \ + --hash=sha256:2e24928bc811348f0feb63014e97aaae3037f2cf48712d51ae61df7fd6075057 \ + --hash=sha256:9dafe54b34a028eafd95039d5e5d4851a13734540f1331060d31c9916e7147a8 # via virtualenv -docutils==0.19 \ - --hash=sha256:33995a6753c30b7f577febfc2c50411fec6aac7f7ffeb7c4cfe5991072dcf9e6 \ - --hash=sha256:5e1de4d849fee02c63b040a4a3fd567f4ab104defd8a5511fbbc24a8a017efbc +docutils==0.20.1 \ + --hash=sha256:96f387a2c5562db4476f09f13bbab2192e764cac08ebbf3a34a95d9b1e4a59d6 \ + --hash=sha256:f08a4e276c3a1583a86dce3e34aba3fe04d02bba2dd51ed16106244e8a923e3b # via readme-renderer -filelock==3.8.0 \ - --hash=sha256:55447caa666f2198c5b6b13a26d2084d26fa5b115c00d065664b2124680c4edc \ - --hash=sha256:617eb4e5eedc82fc5f47b6d61e4d11cb837c56cb4544e39081099fa17ad109d4 +filelock==3.13.1 \ + --hash=sha256:521f5f56c50f8426f5e03ad3b281b490a87ef15bc6c526f168290f0c7148d44e \ + --hash=sha256:57dbda9b35157b05fb3e58ee91448612eb674172fab98ee235ccb0b5bee19a1c # via virtualenv -gcp-docuploader==0.6.4 \ - 
--hash=sha256:01486419e24633af78fd0167db74a2763974765ee8078ca6eb6964d0ebd388af \ - --hash=sha256:70861190c123d907b3b067da896265ead2eeb9263969d6955c9e0bb091b5ccbf +gcp-docuploader==0.6.5 \ + --hash=sha256:30221d4ac3e5a2b9c69aa52fdbef68cc3f27d0e6d0d90e220fc024584b8d2318 \ + --hash=sha256:b7458ef93f605b9d46a4bf3a8dc1755dad1f31d030c8679edf304e343b347eea # via -r requirements.in -gcp-releasetool==1.10.5 \ - --hash=sha256:174b7b102d704b254f2a26a3eda2c684fd3543320ec239baf771542a2e58e109 \ - --hash=sha256:e29d29927fe2ca493105a82958c6873bb2b90d503acac56be2c229e74de0eec9 +gcp-releasetool==1.16.0 \ + --hash=sha256:27bf19d2e87aaa884096ff941aa3c592c482be3d6a2bfe6f06afafa6af2353e3 \ + --hash=sha256:a316b197a543fd036209d0caba7a8eb4d236d8e65381c80cbc6d7efaa7606d63 # via -r requirements.in -google-api-core==2.10.2 \ - --hash=sha256:10c06f7739fe57781f87523375e8e1a3a4674bf6392cd6131a3222182b971320 \ - --hash=sha256:34f24bd1d5f72a8c4519773d99ca6bf080a6c4e041b4e9f024fe230191dda62e +google-api-core==2.12.0 \ + --hash=sha256:c22e01b1e3c4dcd90998494879612c38d0a3411d1f7b679eb89e2abe3ce1f553 \ + --hash=sha256:ec6054f7d64ad13b41e43d96f735acbd763b0f3b695dabaa2d579673f6a6e160 # via # google-cloud-core # google-cloud-storage -google-auth==2.14.1 \ - --hash=sha256:ccaa901f31ad5cbb562615eb8b664b3dd0bf5404a67618e642307f00613eda4d \ - --hash=sha256:f5d8701633bebc12e0deea4df8abd8aff31c28b355360597f7f2ee60f2e4d016 +google-auth==2.23.4 \ + --hash=sha256:79905d6b1652187def79d491d6e23d0cbb3a21d3c7ba0dbaa9c8a01906b13ff3 \ + --hash=sha256:d4bbc92fe4b8bfd2f3e8d88e5ba7085935da208ee38a134fc280e7ce682a05f2 # via # gcp-releasetool # google-api-core # google-cloud-core # google-cloud-storage -google-cloud-core==2.3.2 \ - --hash=sha256:8417acf6466be2fa85123441696c4badda48db314c607cf1e5d543fa8bdc22fe \ - --hash=sha256:b9529ee7047fd8d4bf4a2182de619154240df17fbe60ead399078c1ae152af9a +google-cloud-core==2.3.3 \ + --hash=sha256:37b80273c8d7eee1ae816b3a20ae43585ea50506cb0e60f3cf5be5f87f1373cb \ + --hash=sha256:fbd11cad3e98a7e5b0343dc07cb1039a5ffd7a5bb96e1f1e27cee4bda4a90863 # via google-cloud-storage -google-cloud-storage==2.6.0 \ - --hash=sha256:104ca28ae61243b637f2f01455cc8a05e8f15a2a18ced96cb587241cdd3820f5 \ - --hash=sha256:4ad0415ff61abdd8bb2ae81c1f8f7ec7d91a1011613f2db87c614c550f97bfe9 +google-cloud-storage==2.13.0 \ + --hash=sha256:ab0bf2e1780a1b74cf17fccb13788070b729f50c252f0c94ada2aae0ca95437d \ + --hash=sha256:f62dc4c7b6cd4360d072e3deb28035fbdad491ac3d9b0b1815a12daea10f37c7 # via gcp-docuploader google-crc32c==1.5.0 \ --hash=sha256:024894d9d3cfbc5943f8f230e23950cd4906b2fe004c72e29b209420a1e6b05a \ @@ -251,29 +231,31 @@ google-crc32c==1.5.0 \ --hash=sha256:f583edb943cf2e09c60441b910d6a20b4d9d626c75a36c8fcac01a6c96c01183 \ --hash=sha256:fd8536e902db7e365f49e7d9029283403974ccf29b13fc7028b97e2295b33556 \ --hash=sha256:fe70e325aa68fa4b5edf7d1a4b6f691eb04bbccac0ace68e34820d283b5f80d4 - # via google-resumable-media -google-resumable-media==2.4.0 \ - --hash=sha256:2aa004c16d295c8f6c33b2b4788ba59d366677c0a25ae7382436cb30f776deaa \ - --hash=sha256:8d5518502f92b9ecc84ac46779bd4f09694ecb3ba38a3e7ca737a86d15cbca1f + # via + # google-cloud-storage + # google-resumable-media +google-resumable-media==2.6.0 \ + --hash=sha256:972852f6c65f933e15a4a210c2b96930763b47197cdf4aa5f5bea435efb626e7 \ + --hash=sha256:fc03d344381970f79eebb632a3c18bb1828593a2dc5572b5f90115ef7d11e81b # via google-cloud-storage -googleapis-common-protos==1.57.0 \ - --hash=sha256:27a849d6205838fb6cc3c1c21cb9800707a661bb21c6ce7fb13e99eb1f8a0c46 \ - 
--hash=sha256:a9f4a1d7f6d9809657b7f1316a1aa527f6664891531bcfcc13b6696e685f443c +googleapis-common-protos==1.61.0 \ + --hash=sha256:22f1915393bb3245343f6efe87f6fe868532efc12aa26b391b15132e1279f1c0 \ + --hash=sha256:8a64866a97f6304a7179873a465d6eee97b7a24ec6cfd78e0f575e96b821240b # via google-api-core idna==3.4 \ --hash=sha256:814f528e8dead7d329833b91c5faa87d60bf71824cd12a7530b5526063d02cb4 \ --hash=sha256:90b77e79eaa3eba6de819a0c442c0b4ceefc341a7a2ab77d7562bf49f425c5c2 # via requests -importlib-metadata==5.0.0 \ - --hash=sha256:da31db32b304314d044d3c12c79bd59e307889b287ad12ff387b3500835fc2ab \ - --hash=sha256:ddb0e35065e8938f867ed4928d0ae5bf2a53b7773871bfe6bcc7e4fcdc7dea43 +importlib-metadata==6.8.0 \ + --hash=sha256:3ebb78df84a805d7698245025b975d9d67053cd94c79245ba4b3eb694abe68bb \ + --hash=sha256:dbace7892d8c0c4ac1ad096662232f831d4e64f4c4545bd53016a3e9d4654743 # via # -r requirements.in # keyring # twine -jaraco-classes==3.2.3 \ - --hash=sha256:2353de3288bc6b82120752201c6b1c1a14b058267fa424ed5ce5984e3b922158 \ - --hash=sha256:89559fa5c1d3c34eff6f631ad80bb21f378dbcbb35dd161fd2c6b93f5be2f98a +jaraco-classes==3.3.0 \ + --hash=sha256:10afa92b6743f25c0cf5f37c6bb6e18e2c5bb84a16527ccfc0040ea377e7aaeb \ + --hash=sha256:c063dd08e89217cee02c8d5e5ec560f2c8ce6cdc2fcdc2e68f7b2e5547ed3621 # via keyring jeepney==0.8.0 \ --hash=sha256:5efe48d255973902f6badc3ce55e2aa6c5c3b3bc642059ef3a91247bcfcc5806 \ @@ -285,75 +267,121 @@ jinja2==3.1.2 \ --hash=sha256:31351a702a408a9e7595a8fc6150fc3f43bb6bf7e319770cbc0db9df9437e852 \ --hash=sha256:6088930bfe239f0e6710546ab9c19c9ef35e29792895fed6e6e31a023a182a61 # via gcp-releasetool -keyring==23.11.0 \ - --hash=sha256:3dd30011d555f1345dec2c262f0153f2f0ca6bca041fb1dc4588349bb4c0ac1e \ - --hash=sha256:ad192263e2cdd5f12875dedc2da13534359a7e760e77f8d04b50968a821c2361 +keyring==24.2.0 \ + --hash=sha256:4901caaf597bfd3bbd78c9a0c7c4c29fcd8310dab2cffefe749e916b6527acd6 \ + --hash=sha256:ca0746a19ec421219f4d713f848fa297a661a8a8c1504867e55bfb5e09091509 # via # gcp-releasetool # twine -markupsafe==2.1.1 \ - --hash=sha256:0212a68688482dc52b2d45013df70d169f542b7394fc744c02a57374a4207003 \ - --hash=sha256:089cf3dbf0cd6c100f02945abeb18484bd1ee57a079aefd52cffd17fba910b88 \ - --hash=sha256:10c1bfff05d95783da83491be968e8fe789263689c02724e0c691933c52994f5 \ - --hash=sha256:33b74d289bd2f5e527beadcaa3f401e0df0a89927c1559c8566c066fa4248ab7 \ - --hash=sha256:3799351e2336dc91ea70b034983ee71cf2f9533cdff7c14c90ea126bfd95d65a \ - --hash=sha256:3ce11ee3f23f79dbd06fb3d63e2f6af7b12db1d46932fe7bd8afa259a5996603 \ - --hash=sha256:421be9fbf0ffe9ffd7a378aafebbf6f4602d564d34be190fc19a193232fd12b1 \ - --hash=sha256:43093fb83d8343aac0b1baa75516da6092f58f41200907ef92448ecab8825135 \ - --hash=sha256:46d00d6cfecdde84d40e572d63735ef81423ad31184100411e6e3388d405e247 \ - --hash=sha256:4a33dea2b688b3190ee12bd7cfa29d39c9ed176bda40bfa11099a3ce5d3a7ac6 \ - --hash=sha256:4b9fe39a2ccc108a4accc2676e77da025ce383c108593d65cc909add5c3bd601 \ - --hash=sha256:56442863ed2b06d19c37f94d999035e15ee982988920e12a5b4ba29b62ad1f77 \ - --hash=sha256:671cd1187ed5e62818414afe79ed29da836dde67166a9fac6d435873c44fdd02 \ - --hash=sha256:694deca8d702d5db21ec83983ce0bb4b26a578e71fbdbd4fdcd387daa90e4d5e \ - --hash=sha256:6a074d34ee7a5ce3effbc526b7083ec9731bb3cbf921bbe1d3005d4d2bdb3a63 \ - --hash=sha256:6d0072fea50feec76a4c418096652f2c3238eaa014b2f94aeb1d56a66b41403f \ - --hash=sha256:6fbf47b5d3728c6aea2abb0589b5d30459e369baa772e0f37a0320185e87c980 \ - --hash=sha256:7f91197cc9e48f989d12e4e6fbc46495c446636dfc81b9ccf50bb0ec74b91d4b \ - 
--hash=sha256:86b1f75c4e7c2ac2ccdaec2b9022845dbb81880ca318bb7a0a01fbf7813e3812 \ - --hash=sha256:8dc1c72a69aa7e082593c4a203dcf94ddb74bb5c8a731e4e1eb68d031e8498ff \ - --hash=sha256:8e3dcf21f367459434c18e71b2a9532d96547aef8a871872a5bd69a715c15f96 \ - --hash=sha256:8e576a51ad59e4bfaac456023a78f6b5e6e7651dcd383bcc3e18d06f9b55d6d1 \ - --hash=sha256:96e37a3dc86e80bf81758c152fe66dbf60ed5eca3d26305edf01892257049925 \ - --hash=sha256:97a68e6ada378df82bc9f16b800ab77cbf4b2fada0081794318520138c088e4a \ - --hash=sha256:99a2a507ed3ac881b975a2976d59f38c19386d128e7a9a18b7df6fff1fd4c1d6 \ - --hash=sha256:a49907dd8420c5685cfa064a1335b6754b74541bbb3706c259c02ed65b644b3e \ - --hash=sha256:b09bf97215625a311f669476f44b8b318b075847b49316d3e28c08e41a7a573f \ - --hash=sha256:b7bd98b796e2b6553da7225aeb61f447f80a1ca64f41d83612e6139ca5213aa4 \ - --hash=sha256:b87db4360013327109564f0e591bd2a3b318547bcef31b468a92ee504d07ae4f \ - --hash=sha256:bcb3ed405ed3222f9904899563d6fc492ff75cce56cba05e32eff40e6acbeaa3 \ - --hash=sha256:d4306c36ca495956b6d568d276ac11fdd9c30a36f1b6eb928070dc5360b22e1c \ - --hash=sha256:d5ee4f386140395a2c818d149221149c54849dfcfcb9f1debfe07a8b8bd63f9a \ - --hash=sha256:dda30ba7e87fbbb7eab1ec9f58678558fd9a6b8b853530e176eabd064da81417 \ - --hash=sha256:e04e26803c9c3851c931eac40c695602c6295b8d432cbe78609649ad9bd2da8a \ - --hash=sha256:e1c0b87e09fa55a220f058d1d49d3fb8df88fbfab58558f1198e08c1e1de842a \ - --hash=sha256:e72591e9ecd94d7feb70c1cbd7be7b3ebea3f548870aa91e2732960fa4d57a37 \ - --hash=sha256:e8c843bbcda3a2f1e3c2ab25913c80a3c5376cd00c6e8c4a86a89a28c8dc5452 \ - --hash=sha256:efc1913fd2ca4f334418481c7e595c00aad186563bbc1ec76067848c7ca0a933 \ - --hash=sha256:f121a1420d4e173a5d96e47e9a0c0dcff965afdf1626d28de1460815f7c4ee7a \ - --hash=sha256:fc7b548b17d238737688817ab67deebb30e8073c95749d55538ed473130ec0c7 +markdown-it-py==3.0.0 \ + --hash=sha256:355216845c60bd96232cd8d8c40e8f9765cc86f46880e43a8fd22dc1a1a8cab1 \ + --hash=sha256:e3f60a94fa066dc52ec76661e37c851cb232d92f9886b15cb560aaada2df8feb + # via rich +markupsafe==2.1.3 \ + --hash=sha256:05fb21170423db021895e1ea1e1f3ab3adb85d1c2333cbc2310f2a26bc77272e \ + --hash=sha256:0a4e4a1aff6c7ac4cd55792abf96c915634c2b97e3cc1c7129578aa68ebd754e \ + --hash=sha256:10bbfe99883db80bdbaff2dcf681dfc6533a614f700da1287707e8a5d78a8431 \ + --hash=sha256:134da1eca9ec0ae528110ccc9e48041e0828d79f24121a1a146161103c76e686 \ + --hash=sha256:14ff806850827afd6b07a5f32bd917fb7f45b046ba40c57abdb636674a8b559c \ + --hash=sha256:1577735524cdad32f9f694208aa75e422adba74f1baee7551620e43a3141f559 \ + --hash=sha256:1b40069d487e7edb2676d3fbdb2b0829ffa2cd63a2ec26c4938b2d34391b4ecc \ + --hash=sha256:1b8dd8c3fd14349433c79fa8abeb573a55fc0fdd769133baac1f5e07abf54aeb \ + --hash=sha256:1f67c7038d560d92149c060157d623c542173016c4babc0c1913cca0564b9939 \ + --hash=sha256:282c2cb35b5b673bbcadb33a585408104df04f14b2d9b01d4c345a3b92861c2c \ + --hash=sha256:2c1b19b3aaacc6e57b7e25710ff571c24d6c3613a45e905b1fde04d691b98ee0 \ + --hash=sha256:2ef12179d3a291be237280175b542c07a36e7f60718296278d8593d21ca937d4 \ + --hash=sha256:338ae27d6b8745585f87218a3f23f1512dbf52c26c28e322dbe54bcede54ccb9 \ + --hash=sha256:3c0fae6c3be832a0a0473ac912810b2877c8cb9d76ca48de1ed31e1c68386575 \ + --hash=sha256:3fd4abcb888d15a94f32b75d8fd18ee162ca0c064f35b11134be77050296d6ba \ + --hash=sha256:42de32b22b6b804f42c5d98be4f7e5e977ecdd9ee9b660fda1a3edf03b11792d \ + --hash=sha256:47d4f1c5f80fc62fdd7777d0d40a2e9dda0a05883ab11374334f6c4de38adffd \ + --hash=sha256:504b320cd4b7eff6f968eddf81127112db685e81f7e36e75f9f84f0df46041c3 \ + 
--hash=sha256:525808b8019e36eb524b8c68acdd63a37e75714eac50e988180b169d64480a00 \ + --hash=sha256:56d9f2ecac662ca1611d183feb03a3fa4406469dafe241673d521dd5ae92a155 \ + --hash=sha256:5bbe06f8eeafd38e5d0a4894ffec89378b6c6a625ff57e3028921f8ff59318ac \ + --hash=sha256:65c1a9bcdadc6c28eecee2c119465aebff8f7a584dd719facdd9e825ec61ab52 \ + --hash=sha256:68e78619a61ecf91e76aa3e6e8e33fc4894a2bebe93410754bd28fce0a8a4f9f \ + --hash=sha256:69c0f17e9f5a7afdf2cc9fb2d1ce6aabdb3bafb7f38017c0b77862bcec2bbad8 \ + --hash=sha256:6b2b56950d93e41f33b4223ead100ea0fe11f8e6ee5f641eb753ce4b77a7042b \ + --hash=sha256:715d3562f79d540f251b99ebd6d8baa547118974341db04f5ad06d5ea3eb8007 \ + --hash=sha256:787003c0ddb00500e49a10f2844fac87aa6ce977b90b0feaaf9de23c22508b24 \ + --hash=sha256:7ef3cb2ebbf91e330e3bb937efada0edd9003683db6b57bb108c4001f37a02ea \ + --hash=sha256:8023faf4e01efadfa183e863fefde0046de576c6f14659e8782065bcece22198 \ + --hash=sha256:8758846a7e80910096950b67071243da3e5a20ed2546e6392603c096778d48e0 \ + --hash=sha256:8afafd99945ead6e075b973fefa56379c5b5c53fd8937dad92c662da5d8fd5ee \ + --hash=sha256:8c41976a29d078bb235fea9b2ecd3da465df42a562910f9022f1a03107bd02be \ + --hash=sha256:8e254ae696c88d98da6555f5ace2279cf7cd5b3f52be2b5cf97feafe883b58d2 \ + --hash=sha256:8f9293864fe09b8149f0cc42ce56e3f0e54de883a9de90cd427f191c346eb2e1 \ + --hash=sha256:9402b03f1a1b4dc4c19845e5c749e3ab82d5078d16a2a4c2cd2df62d57bb0707 \ + --hash=sha256:962f82a3086483f5e5f64dbad880d31038b698494799b097bc59c2edf392fce6 \ + --hash=sha256:9aad3c1755095ce347e26488214ef77e0485a3c34a50c5a5e2471dff60b9dd9c \ + --hash=sha256:9dcdfd0eaf283af041973bff14a2e143b8bd64e069f4c383416ecd79a81aab58 \ + --hash=sha256:aa57bd9cf8ae831a362185ee444e15a93ecb2e344c8e52e4d721ea3ab6ef1823 \ + --hash=sha256:aa7bd130efab1c280bed0f45501b7c8795f9fdbeb02e965371bbef3523627779 \ + --hash=sha256:ab4a0df41e7c16a1392727727e7998a467472d0ad65f3ad5e6e765015df08636 \ + --hash=sha256:ad9e82fb8f09ade1c3e1b996a6337afac2b8b9e365f926f5a61aacc71adc5b3c \ + --hash=sha256:af598ed32d6ae86f1b747b82783958b1a4ab8f617b06fe68795c7f026abbdcad \ + --hash=sha256:b076b6226fb84157e3f7c971a47ff3a679d837cf338547532ab866c57930dbee \ + --hash=sha256:b7ff0f54cb4ff66dd38bebd335a38e2c22c41a8ee45aa608efc890ac3e3931bc \ + --hash=sha256:bfce63a9e7834b12b87c64d6b155fdd9b3b96191b6bd334bf37db7ff1fe457f2 \ + --hash=sha256:c011a4149cfbcf9f03994ec2edffcb8b1dc2d2aede7ca243746df97a5d41ce48 \ + --hash=sha256:c9c804664ebe8f83a211cace637506669e7890fec1b4195b505c214e50dd4eb7 \ + --hash=sha256:ca379055a47383d02a5400cb0d110cef0a776fc644cda797db0c5696cfd7e18e \ + --hash=sha256:cb0932dc158471523c9637e807d9bfb93e06a95cbf010f1a38b98623b929ef2b \ + --hash=sha256:cd0f502fe016460680cd20aaa5a76d241d6f35a1c3350c474bac1273803893fa \ + --hash=sha256:ceb01949af7121f9fc39f7d27f91be8546f3fb112c608bc4029aef0bab86a2a5 \ + --hash=sha256:d080e0a5eb2529460b30190fcfcc4199bd7f827663f858a226a81bc27beaa97e \ + --hash=sha256:dd15ff04ffd7e05ffcb7fe79f1b98041b8ea30ae9234aed2a9168b5797c3effb \ + --hash=sha256:df0be2b576a7abbf737b1575f048c23fb1d769f267ec4358296f31c2479db8f9 \ + --hash=sha256:e09031c87a1e51556fdcb46e5bd4f59dfb743061cf93c4d6831bf894f125eb57 \ + --hash=sha256:e4dd52d80b8c83fdce44e12478ad2e85c64ea965e75d66dbeafb0a3e77308fcc \ + --hash=sha256:f698de3fd0c4e6972b92290a45bd9b1536bffe8c6759c62471efaa8acb4c37bc \ + --hash=sha256:fec21693218efe39aa7f8599346e90c705afa52c5b31ae019b2e57e8f6542bb2 \ + --hash=sha256:ffcc3f7c66b5f5b7931a5aa68fc9cecc51e685ef90282f4a82f0f5e9b704ad11 # via jinja2 -more-itertools==9.0.0 \ - 
--hash=sha256:250e83d7e81d0c87ca6bd942e6aeab8cc9daa6096d12c5308f3f92fa5e5c1f41 \ - --hash=sha256:5a6257e40878ef0520b1803990e3e22303a41b5714006c32a3fd8304b26ea1ab +mdurl==0.1.2 \ + --hash=sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8 \ + --hash=sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba + # via markdown-it-py +more-itertools==10.1.0 \ + --hash=sha256:626c369fa0eb37bac0291bce8259b332fd59ac792fa5497b59837309cd5b114a \ + --hash=sha256:64e0735fcfdc6f3464ea133afe8ea4483b1c5fe3a3d69852e6503b43a0b222e6 # via jaraco-classes -nox==2022.11.21 \ - --hash=sha256:0e41a990e290e274cb205a976c4c97ee3c5234441a8132c8c3fd9ea3c22149eb \ - --hash=sha256:e21c31de0711d1274ca585a2c5fde36b1aa962005ba8e9322bf5eeed16dcd684 +nh3==0.2.14 \ + --hash=sha256:116c9515937f94f0057ef50ebcbcc10600860065953ba56f14473ff706371873 \ + --hash=sha256:18415df36db9b001f71a42a3a5395db79cf23d556996090d293764436e98e8ad \ + --hash=sha256:203cac86e313cf6486704d0ec620a992c8bc164c86d3a4fd3d761dd552d839b5 \ + --hash=sha256:2b0be5c792bd43d0abef8ca39dd8acb3c0611052ce466d0401d51ea0d9aa7525 \ + --hash=sha256:377aaf6a9e7c63962f367158d808c6a1344e2b4f83d071c43fbd631b75c4f0b2 \ + --hash=sha256:525846c56c2bcd376f5eaee76063ebf33cf1e620c1498b2a40107f60cfc6054e \ + --hash=sha256:5529a3bf99402c34056576d80ae5547123f1078da76aa99e8ed79e44fa67282d \ + --hash=sha256:7771d43222b639a4cd9e341f870cee336b9d886de1ad9bec8dddab22fe1de450 \ + --hash=sha256:88c753efbcdfc2644a5012938c6b9753f1c64a5723a67f0301ca43e7b85dcf0e \ + --hash=sha256:93a943cfd3e33bd03f77b97baa11990148687877b74193bf777956b67054dcc6 \ + --hash=sha256:9be2f68fb9a40d8440cbf34cbf40758aa7f6093160bfc7fb018cce8e424f0c3a \ + --hash=sha256:a0c509894fd4dccdff557068e5074999ae3b75f4c5a2d6fb5415e782e25679c4 \ + --hash=sha256:ac8056e937f264995a82bf0053ca898a1cb1c9efc7cd68fa07fe0060734df7e4 \ + --hash=sha256:aed56a86daa43966dd790ba86d4b810b219f75b4bb737461b6886ce2bde38fd6 \ + --hash=sha256:e8986f1dd3221d1e741fda0a12eaa4a273f1d80a35e31a1ffe579e7c621d069e \ + --hash=sha256:f99212a81c62b5f22f9e7c3e347aa00491114a5647e1f13bbebd79c3e5f08d75 + # via readme-renderer +nox==2023.4.22 \ + --hash=sha256:0b1adc619c58ab4fa57d6ab2e7823fe47a32e70202f287d78474adcc7bda1891 \ + --hash=sha256:46c0560b0dc609d7d967dc99e22cb463d3c4caf54a5fda735d6c11b5177e3a9f # via -r requirements.in -packaging==21.3 \ - --hash=sha256:dd47c42927d89ab911e606518907cc2d3a1f38bbd026385970643f9c5b8ecfeb \ - --hash=sha256:ef103e05f519cdc783ae24ea4e2e0f508a9c99b2d4969652eed6a2e1ea5bd522 +packaging==23.2 \ + --hash=sha256:048fb0e9405036518eaaf48a55953c750c11e1a1b68e0dd1a9d62ed0c092cfc5 \ + --hash=sha256:8c491190033a9af7e1d931d0b5dacc2ef47509b34dd0de67ed209b5203fc88c7 # via # gcp-releasetool # nox -pkginfo==1.8.3 \ - --hash=sha256:848865108ec99d4901b2f7e84058b6e7660aae8ae10164e015a6dcf5b242a594 \ - --hash=sha256:a84da4318dd86f870a9447a8c98340aa06216bfc6f2b7bdc4b8766984ae1867c +pkginfo==1.9.6 \ + --hash=sha256:4b7a555a6d5a22169fcc9cf7bfd78d296b0361adad412a346c1226849af5e546 \ + --hash=sha256:8fd5896e8718a4372f0ea9cc9d96f6417c9b986e23a4d116dda26b62cc29d046 # via twine -platformdirs==2.5.4 \ - --hash=sha256:1006647646d80f16130f052404c6b901e80ee4ed6bef6792e1f238a8969106f7 \ - --hash=sha256:af0276409f9a02373d540bf8480021a048711d572745aef4b7842dad245eba10 +platformdirs==3.11.0 \ + --hash=sha256:cf8ee52a3afdb965072dcc652433e0c7e3e40cf5ea1477cd4b3b1d2eb75495b3 \ + --hash=sha256:e9d171d00af68be50e9202731309c4e658fd8bc76f55c11c7dd760d023bda68e # via virtualenv protobuf==3.20.3 \ 
--hash=sha256:03038ac1cfbc41aa21f6afcbcd357281d7521b4157926f30ebecc8d4ea59dcb7 \ @@ -383,34 +411,30 @@ protobuf==3.20.3 \ # gcp-releasetool # google-api-core # googleapis-common-protos -pyasn1==0.4.8 \ - --hash=sha256:39c7e2ec30515947ff4e87fb6f456dfc6e84857d34be479c9d4a4ba4bf46aa5d \ - --hash=sha256:aef77c9fb94a3ac588e87841208bdec464471d9871bd5050a287cc9a475cd0ba +pyasn1==0.5.0 \ + --hash=sha256:87a2121042a1ac9358cabcaf1d07680ff97ee6404333bacca15f76aa8ad01a57 \ + --hash=sha256:97b7290ca68e62a832558ec3976f15cbf911bf5d7c7039d8b861c2a0ece69fde # via # pyasn1-modules # rsa -pyasn1-modules==0.2.8 \ - --hash=sha256:905f84c712230b2c592c19470d3ca8d552de726050d1d1716282a1f6146be65e \ - --hash=sha256:a50b808ffeb97cb3601dd25981f6b016cbb3d31fbf57a8b8a87428e6158d0c74 +pyasn1-modules==0.3.0 \ + --hash=sha256:5bd01446b736eb9d31512a30d46c1ac3395d676c6f3cafa4c03eb54b9925631c \ + --hash=sha256:d3ccd6ed470d9ffbc716be08bd90efbd44d0734bc9303818f7336070984a162d # via google-auth pycparser==2.21 \ --hash=sha256:8ee45429555515e1f6b185e78100aea234072576aa43ab53aefcae078162fca9 \ --hash=sha256:e644fdec12f7872f86c58ff790da456218b10f863970249516d60a5eaca77206 # via cffi -pygments==2.15.0 \ - --hash=sha256:77a3299119af881904cd5ecd1ac6a66214b6e9bed1f2db16993b54adede64094 \ - --hash=sha256:f7e36cffc4c517fbc252861b9a6e4644ca0e5abadf9a113c72d1358ad09b9500 +pygments==2.16.1 \ + --hash=sha256:13fc09fa63bc8d8671a6d247e1eb303c4b343eaee81d861f3404db2935653692 \ + --hash=sha256:1daff0494820c69bc8941e407aa20f577374ee88364ee10a98fdbe0aece96e29 # via # readme-renderer # rich -pyjwt==2.6.0 \ - --hash=sha256:69285c7e31fc44f68a1feb309e948e0df53259d579295e6cfe2b1792329f05fd \ - --hash=sha256:d83c3d892a77bbb74d3e1a2cfa90afaadb60945205d1095d9221f04466f64c14 +pyjwt==2.8.0 \ + --hash=sha256:57e28d156e3d5c10088e0c68abb90bfac3df82b40a71bd0daa20c65ccd5c23de \ + --hash=sha256:59127c392cc44c2da5bb3192169a91f429924e17aff6534d70fdc02ab3e04320 # via gcp-releasetool -pyparsing==3.0.9 \ - --hash=sha256:2b020ecf7d21b687f219b71ecad3631f644a47f01403fa1d1036b0c6416d70fb \ - --hash=sha256:5026bae9a10eeaefb61dab2f09052b9f4307d44aee4eda64b309723d8d206bbc - # via packaging pyperclip==1.8.2 \ --hash=sha256:105254a8b04934f0bc84e9c24eb360a591aaf6535c9def5f29d92af107a9bf57 # via gcp-releasetool @@ -418,9 +442,9 @@ python-dateutil==2.8.2 \ --hash=sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86 \ --hash=sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9 # via gcp-releasetool -readme-renderer==37.3 \ - --hash=sha256:cd653186dfc73055656f090f227f5cb22a046d7f71a841dfa305f55c9a513273 \ - --hash=sha256:f67a16caedfa71eef48a31b39708637a6f4664c4394801a7b0d6432d13907343 +readme-renderer==42.0 \ + --hash=sha256:13d039515c1f24de668e2c93f2e877b9dbe6c6c32328b90a40a49d8b2b85f36d \ + --hash=sha256:2d55489f83be4992fe4454939d1a051c33edbab778e82761d060c9fc6b308cd1 # via twine requests==2.31.0 \ --hash=sha256:58cd2187c01e70e6e26505bca751777aa9f2ee0b7f4300988b709f44e013003f \ @@ -431,17 +455,17 @@ requests==2.31.0 \ # google-cloud-storage # requests-toolbelt # twine -requests-toolbelt==0.10.1 \ - --hash=sha256:18565aa58116d9951ac39baa288d3adb5b3ff975c4f25eee78555d89e8f247f7 \ - --hash=sha256:62e09f7ff5ccbda92772a29f394a49c3ad6cb181d568b1337626b2abb628a63d +requests-toolbelt==1.0.0 \ + --hash=sha256:7681a0a3d047012b5bdc0ee37d7f8f07ebe76ab08caeccfc3921ce23c88d5bc6 \ + --hash=sha256:cccfdd665f0a24fcf4726e690f65639d272bb0637b9b92dfd91a5568ccf6bd06 # via twine rfc3986==2.0.0 \ 
--hash=sha256:50b1502b60e289cb37883f3dfd34532b8873c7de9f49bb546641ce9cbd256ebd \ --hash=sha256:97aacf9dbd4bfd829baad6e6309fa6573aaf1be3f6fa735c8ab05e46cecb261c # via twine -rich==12.6.0 \ - --hash=sha256:a4eb26484f2c82589bd9a17c73d32a010b1e29d89f1604cd9bf3a2097b81bb5e \ - --hash=sha256:ba3a3775974105c221d31141f2c116f4fd65c5ceb0698657a11e9f295ec93fd0 +rich==13.6.0 \ + --hash=sha256:2b38e2fe9ca72c9a00170a1a2d20c63c790d0e10ef1fe35eba76e1e7b1d7d245 \ + --hash=sha256:5c14d22737e6d5084ef4771b62d5d4363165b403455a30a1c8ca39dc7b644bef # via twine rsa==4.9 \ --hash=sha256:90260d9058e514786967344d0ef75fa8727eed8a7d2e43ce9f4bcf1b536174f7 \ @@ -455,43 +479,37 @@ six==1.16.0 \ --hash=sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926 \ --hash=sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254 # via - # bleach # gcp-docuploader - # google-auth # python-dateutil -twine==4.0.1 \ - --hash=sha256:42026c18e394eac3e06693ee52010baa5313e4811d5a11050e7d48436cf41b9e \ - --hash=sha256:96b1cf12f7ae611a4a40b6ae8e9570215daff0611828f5fe1f37a16255ab24a0 +twine==4.0.2 \ + --hash=sha256:929bc3c280033347a00f847236564d1c52a3e61b1ac2516c97c48f3ceab756d8 \ + --hash=sha256:9e102ef5fdd5a20661eb88fad46338806c3bd32cf1db729603fe3697b1bc83c8 # via -r requirements.in -typing-extensions==4.4.0 \ - --hash=sha256:1511434bb92bf8dd198c12b1cc812e800d4181cfcb867674e0f8279cc93087aa \ - --hash=sha256:16fa4864408f655d35ec496218b85f79b3437c829e93320c7c9215ccfd92489e +typing-extensions==4.8.0 \ + --hash=sha256:8f92fc8806f9a6b641eaa5318da32b44d401efaac0f6678c9bc448ba3605faa0 \ + --hash=sha256:df8e4339e9cb77357558cbdbceca33c303714cf861d1eef15e1070055ae8b7ef # via -r requirements.in -urllib3==1.26.18 \ - --hash=sha256:34b97092d7e0a3a8cf7cd10e386f401b3737364026c45e622aa02903dffe0f07 \ - --hash=sha256:f8ecc1bba5667413457c529ab955bf8c67b45db799d159066261719e328580a0 +urllib3==2.0.7 \ + --hash=sha256:c97dfde1f7bd43a71c8d2a58e369e9b2bf692d1334ea9f9cae55add7d0dd0f84 \ + --hash=sha256:fdb6d215c776278489906c2f8916e6e7d4f5a9b602ccbcfdf7f016fc8da0596e # via # requests # twine -virtualenv==20.16.7 \ - --hash=sha256:8691e3ff9387f743e00f6bb20f70121f5e4f596cae754531f2b3b3a1b1ac696e \ - --hash=sha256:efd66b00386fdb7dbe4822d172303f40cd05e50e01740b19ea42425cbe653e29 +virtualenv==20.24.6 \ + --hash=sha256:02ece4f56fbf939dbbc33c0715159951d6bf14aaf5457b092e4548e1382455af \ + --hash=sha256:520d056652454c5098a00c0f073611ccbea4c79089331f60bf9d7ba247bb7381 # via nox -webencodings==0.5.1 \ - --hash=sha256:a0af1213f3c2226497a97e2b3aa01a7e4bee4f403f95be16fc9acd2947514a78 \ - --hash=sha256:b36a1c245f2d304965eb4e0a82848379241dc04b865afcc4aab16748587e1923 - # via bleach -wheel==0.38.4 \ - --hash=sha256:965f5259b566725405b05e7cf774052044b1ed30119b5d586b2703aafe8719ac \ - --hash=sha256:b60533f3f5d530e971d6737ca6d58681ee434818fab630c83a734bb10c083ce8 +wheel==0.41.3 \ + --hash=sha256:488609bc63a29322326e05560731bf7bfea8e48ad646e1f5e40d366607de0942 \ + --hash=sha256:4d4987ce51a49370ea65c0bfd2234e8ce80a12780820d9dc462597a6e60d0841 # via -r requirements.in -zipp==3.10.0 \ - --hash=sha256:4fcb6f278987a6605757302a6e40e896257570d11c51628968ccb2a47e80c6c1 \ - --hash=sha256:7a7262fd930bd3e36c50b9a64897aec3fafff3dfdeec9623ae22b40e93f99bb8 +zipp==3.17.0 \ + --hash=sha256:0e923e726174922dce09c53c59ad483ff7bbb8e572e00c7f7c46b88556409f31 \ + --hash=sha256:84e64a1c28cf7e91ed2078bb8cc8c259cb19b76942096c8d7b84947690cabaf0 # via importlib-metadata # The following packages are considered to be unsafe in a requirements file: -setuptools==65.5.1 \ - 
--hash=sha256:d0b9a8433464d5800cbe05094acf5c6d52a91bfac9b52bcfc4d41382be5d5d31 \ - --hash=sha256:e197a19aa8ec9722928f2206f8de752def0e4c9fc6953527360d1c36d94ddb2f +setuptools==68.2.2 \ + --hash=sha256:4ac1475276d2f1c48684874089fefcd83bd7162ddaafb81fac866ba0db282a87 \ + --hash=sha256:b454a35605876da60632df1a60f736524eb73cc47bbc9f3f1ef1b644de74fd2a # via -r requirements.in diff --git a/CHANGELOG.md b/CHANGELOG.md index fc327b2e96..1f76b78272 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,41 @@ [1]: https://pypi.org/project/bigframes/#history +## [0.14.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v0.13.0...v0.14.0) (2023-11-14) + + +### Features + +* Add 'cross' join support ([#176](https://github.com/googleapis/python-bigquery-dataframes/issues/176)) ([765446a](https://github.com/googleapis/python-bigquery-dataframes/commit/765446a929abe1ac076c3037afa7892f64105356)) +* Add 'index', 'pad', 'nearest' interpolate methods ([#162](https://github.com/googleapis/python-bigquery-dataframes/issues/162)) ([6a28403](https://github.com/googleapis/python-bigquery-dataframes/commit/6a2840349a23035bdfdabacd1e231b41bbb5ed7a)) +* Add series.sample (identical to existing dataframe.sample) ([#187](https://github.com/googleapis/python-bigquery-dataframes/issues/187)) ([37914a4](https://github.com/googleapis/python-bigquery-dataframes/commit/37914a4077c681881491f5c36d1a9c9f4255e18f)) +* Add unordered sql compilation ([#156](https://github.com/googleapis/python-bigquery-dataframes/issues/156)) ([58f420c](https://github.com/googleapis/python-bigquery-dataframes/commit/58f420c91d94ca085e9810f36513ffe772bfddcf)) +* Log most recent API calls as `recent-bigframes-api-xx` labels on BigQuery jobs ([#145](https://github.com/googleapis/python-bigquery-dataframes/issues/145)) ([4ea33b7](https://github.com/googleapis/python-bigquery-dataframes/commit/4ea33b7433532ae3a386a6ffa9eb57360ea39526)) +* Read_gbq creates order deterministically without table copy ([#191](https://github.com/googleapis/python-bigquery-dataframes/issues/191)) ([8ab81de](https://github.com/googleapis/python-bigquery-dataframes/commit/8ab81dee4d0eee499094f2dd576550f0c59d7551)) +* Support `date_series.astype("string[pyarrow]")` to cast DATE to STRING ([#186](https://github.com/googleapis/python-bigquery-dataframes/issues/186)) ([aee0e8e](https://github.com/googleapis/python-bigquery-dataframes/commit/aee0e8e2518c59bd1e0b07940c3309871fde8899)) +* Support `series.at[row_label] = scalar` ([#173](https://github.com/googleapis/python-bigquery-dataframes/issues/173)) ([0c8bd33](https://github.com/googleapis/python-bigquery-dataframes/commit/0c8bd33806bb99206b8b12dbdf7d7485c6ffb759)) +* Temporary resources no longer use BigQuery Sessions ([#194](https://github.com/googleapis/python-bigquery-dataframes/issues/194)) 
([4a02cac](https://github.com/googleapis/python-bigquery-dataframes/commit/4a02cac88c7d7b46bed1fa813a862fc2ef9ef084)) + + +### Bug Fixes + +* All sort operation are now stable ([#195](https://github.com/googleapis/python-bigquery-dataframes/issues/195)) ([3a2761f](https://github.com/googleapis/python-bigquery-dataframes/commit/3a2761f3c38d0de8b8eda47fffa15b8412aa84b0)) +* Default to 7 days expiration for `read_csv`, `read_json`, `read_parquet` ([#193](https://github.com/googleapis/python-bigquery-dataframes/issues/193)) ([03606cd](https://github.com/googleapis/python-bigquery-dataframes/commit/03606cda30eb7645bfd4534460112dcca56b0ab0)) +* Deprecate the `remote_service_type` in llm model ([#180](https://github.com/googleapis/python-bigquery-dataframes/issues/180)) ([a8a409a](https://github.com/googleapis/python-bigquery-dataframes/commit/a8a409ab0bd1f99dfb442df0703bf8786e0fe58e)) +* For reset_index on unnamed multiindex, always use level_[n] label ([#182](https://github.com/googleapis/python-bigquery-dataframes/issues/182)) ([f95000d](https://github.com/googleapis/python-bigquery-dataframes/commit/f95000d3f88662be4d88c8b0152f1b838e99ec55)) +* Match pandas behavior when assigning listlike to empty dfs ([#172](https://github.com/googleapis/python-bigquery-dataframes/issues/172)) ([c1d1f42](https://github.com/googleapis/python-bigquery-dataframes/commit/c1d1f42a21cc089877f79ebb46a39ddef6958e04)) +* Use anonymous dataset instead of session dataset for temp tables ([#181](https://github.com/googleapis/python-bigquery-dataframes/issues/181)) ([800d44e](https://github.com/googleapis/python-bigquery-dataframes/commit/800d44eb5eb77da5d87b2e005f5a2ed53842e7b5)) +* Use random table for `read_pandas` ([#192](https://github.com/googleapis/python-bigquery-dataframes/issues/192)) ([741c75e](https://github.com/googleapis/python-bigquery-dataframes/commit/741c75e5797e26a1487ff3da76a07953d9537f3f)) +* Use random table when loading data for `read_csv`, `read_json`, `read_parquet` ([#175](https://github.com/googleapis/python-bigquery-dataframes/issues/175)) ([9d2e6dc](https://github.com/googleapis/python-bigquery-dataframes/commit/9d2e6dc1ae4e11e80da4aabe0daa3a6044137cc6)) + + +### Documentation + +* Add code samples for `read_gbq_function` using community UDFs ([#188](https://github.com/googleapis/python-bigquery-dataframes/issues/188)) ([7506eab](https://github.com/googleapis/python-bigquery-dataframes/commit/7506eabf2e58159507809e36abfe90c417dfe92f)) +* Add docstring code samples for `Series.apply` and `DataFrame.map` ([#185](https://github.com/googleapis/python-bigquery-dataframes/issues/185)) ([c816d84](https://github.com/googleapis/python-bigquery-dataframes/commit/c816d843e6f3c5a944cd4395ed0e1e91cec49812)) +* Add llm kmeans notebook as an included example ([#177](https://github.com/googleapis/python-bigquery-dataframes/issues/177)) 
([d49ae42](https://github.com/googleapis/python-bigquery-dataframes/commit/d49ae42a379fafd601cc94227e7f8f14b3d5f8c3)) +* Use `head()` to get top `n` results, not to preview results ([#190](https://github.com/googleapis/python-bigquery-dataframes/issues/190)) ([87f84c9](https://github.com/googleapis/python-bigquery-dataframes/commit/87f84c9e58e7d0ea521ac386c9f02791cdddd19f)) + ## [0.13.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v0.12.0...v0.13.0) (2023-11-07) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md deleted file mode 100644 index b16bd94428..0000000000 --- a/CONTRIBUTING.md +++ /dev/null @@ -1,33 +0,0 @@ -# How to contribute - -We'd love to accept your patches and contributions to this project. - -## Before you begin - -### Sign our Contributor License Agreement - -Contributions to this project must be accompanied by a -[Contributor License Agreement](https://cla.developers.google.com/about) (CLA). -You (or your employer) retain the copyright to your contribution; this simply -gives us permission to use and redistribute your contributions as part of the -project. - -If you or your current employer have already signed the Google CLA (even if it -was for a different project), you probably don't need to do it again. - -Visit to see your current agreements or to -sign a new one. - -### Review our community guidelines - -This project follows -[Google's Open Source Community Guidelines](https://opensource.google/conduct/). - -## Contribution process - -### Code reviews - -All submissions, including submissions by project members, require review. We -use GitHub pull requests for this purpose. Consult -[GitHub Help](https://help.github.com/articles/about-pull-requests/) for more -information on using pull requests. diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst index 3933152cf7..f9103bfa72 100644 --- a/CONTRIBUTING.rst +++ b/CONTRIBUTING.rst @@ -96,9 +96,9 @@ On Debian/Ubuntu:: Coding Style ************ - We use the automatic code formatter ``black``. You can run it using - the nox session ``blacken``. This will eliminate many lint errors. Run via:: + the nox session ``format``. This will eliminate many lint errors. Run via:: - $ nox -s blacken + $ nox -s format - PEP8 compliance is required, with exceptions defined in the linter configuration. 
If you have ``nox`` installed, you can test that you have not introduced diff --git a/bigframes/core/__init__.py b/bigframes/core/__init__.py index 4653f0ab6a..b476961bdc 100644 --- a/bigframes/core/__init__.py +++ b/bigframes/core/__init__.py @@ -23,7 +23,8 @@ import ibis.expr.types as ibis_types import pandas -import bigframes.core.compile as compiled +import bigframes.core.compile.compiled as compiled +import bigframes.core.compile.compiler as compiler import bigframes.core.guid import bigframes.core.nodes as nodes from bigframes.core.ordering import OrderingColumnReference @@ -32,6 +33,7 @@ import bigframes.dtypes import bigframes.operations as ops import bigframes.operations.aggregations as agg_ops +import bigframes.session._io.bigquery if typing.TYPE_CHECKING: from bigframes.session import Session @@ -77,7 +79,7 @@ def from_pandas(cls, pd_df: pandas.DataFrame): @property def column_ids(self) -> typing.Sequence[str]: - return self.compile().column_ids + return self._compile_ordered().column_ids @property def session(self) -> Session: @@ -87,15 +89,18 @@ def session(self) -> Session: return self.node.session[0] if required_session else get_global_session() def get_column_type(self, key: str) -> bigframes.dtypes.Dtype: - return self.compile().get_column_type(key) + return self._compile_ordered().get_column_type(key) - def compile(self) -> compiled.CompiledArrayValue: - return compiled.compile_node(self.node) + def _compile_ordered(self) -> compiled.OrderedIR: + return compiler.compile_ordered(self.node) + + def _compile_unordered(self) -> compiled.UnorderedIR: + return compiler.compile_unordered(self.node) def shape(self) -> typing.Tuple[int, int]: """Returns dimensions as (length, width) tuple.""" - width = len(self.compile().columns) - count_expr = self.compile()._to_ibis_expr("unordered").count() + width = len(self._compile_unordered().columns) + count_expr = self._compile_unordered()._to_ibis_expr().count() # Support in-memory engines for hermetic unit tests. 
if not self.node.session: @@ -120,11 +125,14 @@ def to_sql( col_id_overrides: typing.Mapping[str, str] = {}, sorted: bool = False, ) -> str: - return self.compile().to_sql( - offset_column=offset_column, - col_id_overrides=col_id_overrides, - sorted=sorted, - ) + if sorted or offset_column: + return self._compile_ordered().to_sql( + offset_column=offset_column, + col_id_overrides=col_id_overrides, + sorted=sorted, + ) + else: + return self._compile_unordered().to_sql(col_id_overrides=col_id_overrides) def start_query( self, @@ -153,25 +161,28 @@ def start_query( def cached(self, cluster_cols: typing.Sequence[str]) -> ArrayValue: """Write the ArrayValue to a session table and create a new block object that references it.""" - compiled = self.compile() - ibis_expr = compiled._to_ibis_expr("unordered", expose_hidden_cols=True) - destination = self.session._ibis_to_session_table( - ibis_expr, cluster_cols=cluster_cols, api_name="cache" + compiled_value = self._compile_ordered() + ibis_expr = compiled_value._to_ibis_expr( + ordering_mode="unordered", expose_hidden_cols=True + ) + tmp_table = self.session._ibis_to_temp_table( + ibis_expr, cluster_cols=cluster_cols, api_name="cached" ) + table_expression = self.session.ibis_client.table( - f"{destination.project}.{destination.dataset_id}.{destination.table_id}" + f"{tmp_table.project}.{tmp_table.dataset_id}.{tmp_table.table_id}" ) - new_columns = [table_expression[column] for column in compiled.column_ids] + new_columns = [table_expression[column] for column in compiled_value.column_ids] new_hidden_columns = [ table_expression[column] - for column in compiled._hidden_ordering_column_names + for column in compiled_value._hidden_ordering_column_names ] return ArrayValue.from_ibis( self.session, table_expression, columns=new_columns, hidden_ordering_columns=new_hidden_columns, - ordering=compiled._ordering, + ordering=compiled_value._ordering, ) # Operations @@ -189,12 +200,8 @@ def filter(self, predicate_id: str, keep_null: bool = False) -> ArrayValue: ) ) - def order_by( - self, by: Sequence[OrderingColumnReference], stable: bool = False - ) -> ArrayValue: - return ArrayValue( - nodes.OrderByNode(child=self.node, by=tuple(by), stable=stable) - ) + def order_by(self, by: Sequence[OrderingColumnReference]) -> ArrayValue: + return ArrayValue(nodes.OrderByNode(child=self.node, by=tuple(by))) def reversed(self) -> ArrayValue: return ArrayValue(nodes.ReversedNode(child=self.node)) @@ -413,6 +420,7 @@ def join( "left", "outer", "right", + "cross", ], allow_row_identity_join: bool = True, ): diff --git a/bigframes/core/block_transforms.py b/bigframes/core/block_transforms.py index 917edac0de..ce0fdd219a 100644 --- a/bigframes/core/block_transforms.py +++ b/bigframes/core/block_transforms.py @@ -22,6 +22,7 @@ import bigframes.core.blocks as blocks import bigframes.core.ordering as ordering import bigframes.core.window_spec as windows +import bigframes.dtypes as dtypes import bigframes.operations as ops import bigframes.operations.aggregations as agg_ops @@ -106,18 +107,33 @@ def indicate_duplicates( def interpolate(block: blocks.Block, method: str = "linear") -> blocks.Block: - if method != "linear": + supported_methods = [ + "linear", + "values", + "index", + "nearest", + "zero", + "slinear", + ] + if method not in supported_methods: raise NotImplementedError( - f"Only 'linear' interpolate method supported. {constants.FEEDBACK_LINK}" + f"Method {method} not supported, following interpolate methods supported: {', '.join(supported_methods)}. 
{constants.FEEDBACK_LINK}" ) - backwards_window = windows.WindowSpec(following=0) - forwards_window = windows.WindowSpec(preceding=0) - output_column_ids = [] original_columns = block.value_columns original_labels = block.column_labels - block, offsets = block.promote_offsets() + + if method == "linear": # Assumes evenly spaced, ignore index + block, xvalues = block.promote_offsets() + else: + index_columns = block.index_columns + if len(index_columns) != 1: + raise ValueError("only method 'linear' supports multi-index") + xvalues = block.index_columns[0] + if block.index_dtypes[0] not in dtypes.NUMERIC_BIGFRAMES_TYPES: + raise ValueError("Can only interpolate on numeric index.") + for column in original_columns: # null in same places column is null should_interpolate = block._column_type(column) in [ @@ -125,48 +141,25 @@ def interpolate(block: blocks.Block, method: str = "linear") -> blocks.Block: pd.Int64Dtype(), ] if should_interpolate: - block, notnull = block.apply_unary_op(column, ops.notnull_op) - block, masked_offsets = block.apply_binary_op( - offsets, notnull, ops.partial_arg3(ops.where_op, None) - ) - - block, previous_value = block.apply_window_op( - column, agg_ops.LastNonNullOp(), backwards_window - ) - block, next_value = block.apply_window_op( - column, agg_ops.FirstNonNullOp(), forwards_window - ) - block, previous_value_offset = block.apply_window_op( - masked_offsets, - agg_ops.LastNonNullOp(), - backwards_window, - skip_reproject_unsafe=True, - ) - block, next_value_offset = block.apply_window_op( - masked_offsets, - agg_ops.FirstNonNullOp(), - forwards_window, - skip_reproject_unsafe=True, - ) - - block, prediction_id = _interpolate( + interpolate_method_map = { + "linear": "linear", + "values": "linear", + "index": "linear", + "slinear": "linear", + "zero": "ffill", + "nearest": "nearest", + } + extrapolating_methods = ["linear", "values", "index"] + interpolate_method = interpolate_method_map[method] + do_extrapolate = method in extrapolating_methods + block, interpolated = _interpolate_column( block, - previous_value_offset, - previous_value, - next_value_offset, - next_value, - offsets, + column, + xvalues, + interpolate_method=interpolate_method, + do_extrapolate=do_extrapolate, ) - - block, interpolated_column = block.apply_binary_op( - column, prediction_id, ops.fillna_op - ) - # Pandas performs ffill-like behavior to extrapolate forwards - block, interpolated_and_ffilled = block.apply_binary_op( - interpolated_column, previous_value, ops.fillna_op - ) - - output_column_ids.append(interpolated_and_ffilled) + output_column_ids.append(interpolated) else: output_column_ids.append(column) @@ -175,7 +168,80 @@ def interpolate(block: blocks.Block, method: str = "linear") -> blocks.Block: return block.with_column_labels(original_labels) -def _interpolate( +def _interpolate_column( + block: blocks.Block, + column: str, + x_values: str, + interpolate_method: str, + do_extrapolate: bool = True, +) -> typing.Tuple[blocks.Block, str]: + if interpolate_method not in ["linear", "nearest", "ffill"]: + raise ValueError("interpolate method not supported") + window_ordering = (ordering.OrderingColumnReference(x_values),) + backwards_window = windows.WindowSpec(following=0, ordering=window_ordering) + forwards_window = windows.WindowSpec(preceding=0, ordering=window_ordering) + + # Note, this method may + block, notnull = block.apply_unary_op(column, ops.notnull_op) + block, masked_offsets = block.apply_binary_op( + x_values, notnull, ops.partial_arg3(ops.where_op, None) + ) + + 
block, previous_value = block.apply_window_op( + column, agg_ops.LastNonNullOp(), backwards_window + ) + block, next_value = block.apply_window_op( + column, agg_ops.FirstNonNullOp(), forwards_window + ) + block, previous_value_offset = block.apply_window_op( + masked_offsets, + agg_ops.LastNonNullOp(), + backwards_window, + skip_reproject_unsafe=True, + ) + block, next_value_offset = block.apply_window_op( + masked_offsets, + agg_ops.FirstNonNullOp(), + forwards_window, + skip_reproject_unsafe=True, + ) + + if interpolate_method == "linear": + block, prediction_id = _interpolate_points_linear( + block, + previous_value_offset, + previous_value, + next_value_offset, + next_value, + x_values, + ) + elif interpolate_method == "nearest": + block, prediction_id = _interpolate_points_nearest( + block, + previous_value_offset, + previous_value, + next_value_offset, + next_value, + x_values, + ) + else: # interpolate_method == 'ffill': + block, prediction_id = _interpolate_points_ffill( + block, + previous_value_offset, + previous_value, + next_value_offset, + next_value, + x_values, + ) + if do_extrapolate: + block, prediction_id = block.apply_binary_op( + prediction_id, previous_value, ops.fillna_op + ) + + return block.apply_binary_op(column, prediction_id, ops.fillna_op) + + +def _interpolate_points_linear( block: blocks.Block, x0_id: str, y0_id: str, @@ -196,6 +262,53 @@ def _interpolate( return block, prediction_id +def _interpolate_points_nearest( + block: blocks.Block, + x0_id: str, + y0_id: str, + x1_id: str, + y1_id: str, + xpredict_id: str, +) -> typing.Tuple[blocks.Block, str]: + """Interpolate by taking the y value of the nearest x value""" + block, left_diff = block.apply_binary_op(xpredict_id, x0_id, ops.sub_op) + block, right_diff = block.apply_binary_op(x1_id, xpredict_id, ops.sub_op) + # If diffs equal, choose left + block, choose_left = block.apply_binary_op(left_diff, right_diff, ops.le_op) + block, choose_left = block.apply_unary_op( + choose_left, ops.partial_right(ops.fillna_op, False) + ) + + block, nearest = block.apply_ternary_op(y0_id, choose_left, y1_id, ops.where_op) + + block, y0_exists = block.apply_unary_op(y0_id, ops.notnull_op) + block, y1_exists = block.apply_unary_op(y1_id, ops.notnull_op) + block, is_interpolation = block.apply_binary_op(y0_exists, y1_exists, ops.and_op) + + block, prediction_id = block.apply_binary_op( + nearest, is_interpolation, ops.partial_arg3(ops.where_op, None) + ) + + return block, prediction_id + + +def _interpolate_points_ffill( + block: blocks.Block, + x0_id: str, + y0_id: str, + x1_id: str, + y1_id: str, + xpredict_id: str, +) -> typing.Tuple[blocks.Block, str]: + """Interpolates by using the preceding values""" + # check for existance of y1, otherwise we are extrapolating instead of interpolating + block, y1_exists = block.apply_unary_op(y1_id, ops.notnull_op) + block, prediction_id = block.apply_binary_op( + y0_id, y1_exists, ops.partial_arg3(ops.where_op, None) + ) + return block, prediction_id + + def drop_duplicates( block: blocks.Block, columns: typing.Sequence[str], keep: str = "first" ) -> blocks.Block: @@ -396,7 +509,7 @@ def nsmallest( ) for col_id in column_ids ] - block = block.order_by(order_refs, stable=True) + block = block.order_by(order_refs) if keep in ("first", "last"): return block.slice(0, n) else: # keep == "all": @@ -428,7 +541,7 @@ def nlargest( ) for col_id in column_ids ] - block = block.order_by(order_refs, stable=True) + block = block.order_by(order_refs) if keep in ("first", "last"): return 
block.slice(0, n) else: # keep == "all": diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 635e7db865..f1113d938e 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -235,10 +235,9 @@ def cols_matching_label(self, partial_label: Label) -> typing.Sequence[str]: def order_by( self, by: typing.Sequence[ordering.OrderingColumnReference], - stable: bool = False, ) -> Block: return Block( - self._expr.order_by(by, stable=stable), + self._expr.order_by(by), index_columns=self.index_columns, column_labels=self.column_labels, index_labels=self.index.names, @@ -282,7 +281,7 @@ def reset_index(self, drop: bool = True) -> Block: column_labels_modified = self.column_labels for level, label in enumerate(index_labels): if label is None: - if "index" not in self.column_labels: + if "index" not in self.column_labels and len(index_labels) <= 1: label = "index" else: label = f"level_{level}" @@ -386,6 +385,8 @@ def to_pandas( max_download_size: Optional[int] = None, sampling_method: Optional[str] = None, random_state: Optional[int] = None, + *, + ordered: bool = True, ) -> Tuple[pd.DataFrame, bigquery.QueryJob]: """Run query and download results as a pandas DataFrame.""" if max_download_size is None: @@ -412,6 +413,7 @@ def to_pandas( max_download_size=max_download_size, sampling_method=sampling_method, random_state=random_state, + ordered=ordered, ) return df, query_job @@ -446,12 +448,16 @@ def _compute_and_count( max_download_size: Optional[int] = None, sampling_method: Optional[str] = None, random_state: Optional[int] = None, + *, + ordered: bool = True, ) -> Tuple[pd.DataFrame, int, bigquery.QueryJob]: """Run query and download results as a pandas DataFrame. Return the total number of results as well.""" # TODO(swast): Allow for dry run and timeout. expr = self._apply_value_keys_to_expr(value_keys=value_keys) - results_iterator, query_job = expr.start_query(max_results=max_results) + results_iterator, query_job = expr.start_query( + max_results=max_results, sorted=ordered + ) table_size = ( expr.session._get_table_size(query_job.destination) / _BYTES_TO_MEGABYTES @@ -1531,6 +1537,7 @@ def merge( "left", "outer", "right", + "cross", ], left_join_ids: typing.Sequence[str], right_join_ids: typing.Sequence[str], @@ -1588,7 +1595,6 @@ def merge( # sort uses coalesced join keys always joined_expr = joined_expr.order_by( [ordering.OrderingColumnReference(col_id) for col_id in coalesced_ids], - stable=True, ) joined_expr = joined_expr.select_columns(result_columns) diff --git a/bigframes/core/compile/__init__.py b/bigframes/core/compile/__init__.py index c86f4463dc..761fd9a465 100644 --- a/bigframes/core/compile/__init__.py +++ b/bigframes/core/compile/__init__.py @@ -12,10 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. -from bigframes.core.compile.compiled import CompiledArrayValue -from bigframes.core.compile.compiler import compile_node +from bigframes.core.compile.compiled import OrderedIR, UnorderedIR +from bigframes.core.compile.compiler import compile_ordered, compile_unordered __all__ = [ - "compile_node", - "CompiledArrayValue", + "compile_ordered", + "compile_unordered", + "OrderedIR", + "UnorderedIR", ] diff --git a/bigframes/core/compile/compiled.py b/bigframes/core/compile/compiled.py index 1134f1aab0..78050ed4f0 100644 --- a/bigframes/core/compile/compiled.py +++ b/bigframes/core/compile/compiled.py @@ -13,8 +13,8 @@ # limitations under the License. 
from __future__ import annotations +import abc import functools -import math import textwrap import typing from typing import Collection, Iterable, Literal, Optional, Sequence @@ -32,8 +32,6 @@ ExpressionOrdering, IntegerEncoding, OrderingColumnReference, - reencode_order_string, - StringEncoding, ) import bigframes.core.utils as utils from bigframes.core.window_spec import WindowSpec @@ -44,8 +42,568 @@ ORDER_ID_COLUMN = "bigframes_ordering_id" PREDICATE_COLUMN = "bigframes_predicate" +T = typing.TypeVar("T", bound="BaseIbisIR") -class CompiledArrayValue: + +class BaseIbisIR(abc.ABC): + """Implementation detail, contains common logic between ordered and unordered IR""" + + def __init__( + self, + table: ibis_types.Table, + columns: Sequence[ibis_types.Value], + predicates: Optional[Collection[ibis_types.BooleanValue]] = None, + ): + self._table = table + self._predicates = tuple(predicates) if predicates is not None else () + # Allow creating a DataFrame directly from an Ibis table expression. + # TODO(swast): Validate that each column references the same table (or + # no table for literal values). + self._columns = tuple(columns) + # To allow for more efficient lookup by column name, create a + # dictionary mapping names to column values. + self._column_names = {column.get_name(): column for column in self._columns} + + @property + def columns(self) -> typing.Tuple[ibis_types.Value, ...]: + return self._columns + + @property + def column_ids(self) -> typing.Sequence[str]: + return tuple(self._column_names.keys()) + + @property + def _reduced_predicate(self) -> typing.Optional[ibis_types.BooleanValue]: + """Returns the frame's predicates as an equivalent boolean value, useful where a single predicate value is preferred.""" + return ( + _reduce_predicate_list(self._predicates).name(PREDICATE_COLUMN) + if self._predicates + else None + ) + + @abc.abstractmethod + def select_columns(self: T, column_ids: typing.Sequence[str]) -> T: + """Creates a new expression based on this expression with new columns.""" + ... + + def drop_columns(self: T, columns: Iterable[str]) -> T: + return self.select_columns( + [col for col in self.column_ids if col not in columns] + ) + + @abc.abstractmethod + def filter(self: T, predicate_id: str, keep_null: bool = False) -> T: + """Filter the table on a given expression, the predicate must be a boolean series aligned with the table expression.""" + ... + + @abc.abstractmethod + def unpivot( + self: T, + row_labels: typing.Sequence[typing.Hashable], + unpivot_columns: typing.Sequence[ + typing.Tuple[str, typing.Sequence[typing.Optional[str]]] + ], + *, + passthrough_columns: typing.Sequence[str] = (), + index_col_ids: typing.Sequence[str] = ["index"], + dtype: typing.Union[ + bigframes.dtypes.Dtype, typing.Sequence[bigframes.dtypes.Dtype] + ] = pandas.Float64Dtype(), + how="left", + ) -> T: + """ + Unpivot ArrayValue columns. + + Args: + row_labels: Identifies the source of the row. Must be equal to length to source column list in unpivot_columns argument. + unpivot_columns: Mapping of column id to list of input column ids. Lists of input columns may use None. + passthrough_columns: Columns that will not be unpivoted. Column id will be preserved. + index_col_id (str): The column id to be used for the row labels. + dtype (dtype or list of dtype): Dtype to use for the unpivot columns. If list, must be equal in number to unpivot_columns. + + Returns: + ArrayValue: The unpivoted ArrayValue + """ + ... 
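For orientation, the `unpivot` arguments described in the docstring above relate roughly as follows; the column ids and labels here are invented for illustration and are not part of this change:

# Hypothetical inputs for BaseIbisIR.unpivot (ids and labels invented for illustration).
# Each list of source columns must have the same length as row_labels.
row_labels = [2001, 2002]
unpivot_columns = [
    ("col_a", ["col_a_2001", "col_a_2002"]),  # output column id -> source column ids
    ("col_b", ["col_b_2001", None]),          # a None source yields nulls for that label
]
index_col_ids = ["year"]  # where the row labels land in the output
# ir.unpivot(row_labels, unpivot_columns, index_col_ids=index_col_ids, how="left")
# would produce columns ["year", "col_a", "col_b"], one output row per
# (input row, row label) pair, with passthrough columns carried over unchanged.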
+ + @abc.abstractmethod + def _reproject_to_table(self: T) -> T: + """ + Internal operators that projects the internal representation into a + new ibis table expression where each value column is a direct + reference to a column in that table expression. Needed after + some operations such as window operations that cannot be used + recursively in projections. + """ + ... + + def project_unary_op( + self: T, + input_column_id: str, + op: ops.UnaryOp, + output_column_id: typing.Optional[str] = None, + ) -> T: + """Creates a new expression based on this expression with unary operation applied to one column.""" + result_id = ( + output_column_id or input_column_id + ) # overwrite input if not output id provided + value = op._as_ibis(self._get_ibis_column(input_column_id)).name(result_id) + return self._set_or_replace_by_id(result_id, value) + + def project_binary_op( + self: T, + left_column_id: str, + right_column_id: str, + op: ops.BinaryOp, + output_column_id: str, + ) -> T: + """Creates a new expression based on this expression with binary operation applied to two columns.""" + value = op( + self._get_ibis_column(left_column_id), + self._get_ibis_column(right_column_id), + ).name(output_column_id) + return self._set_or_replace_by_id(output_column_id, value) + + def project_ternary_op( + self: T, + col_id_1: str, + col_id_2: str, + col_id_3: str, + op: ops.TernaryOp, + output_column_id: str, + ) -> T: + """Creates a new expression based on this expression with ternary operation applied to three columns.""" + value = op( + self._get_ibis_column(col_id_1), + self._get_ibis_column(col_id_2), + self._get_ibis_column(col_id_3), + ).name(output_column_id) + return self._set_or_replace_by_id(output_column_id, value) + + def assign(self: T, source_id: str, destination_id: str) -> T: + return self._set_or_replace_by_id( + destination_id, self._get_ibis_column(source_id) + ) + + def assign_constant( + self: T, + destination_id: str, + value: typing.Any, + dtype: typing.Optional[bigframes.dtypes.Dtype], + ) -> T: + # TODO(b/281587571): Solve scalar constant aggregation problem w/Ibis. + ibis_value = bigframes.dtypes.literal_to_ibis_scalar(value, dtype) + if ibis_value is None: + raise NotImplementedError( + f"Type not supported as scalar value {type(value)}. {constants.FEEDBACK_LINK}" + ) + expr = self._set_or_replace_by_id(destination_id, ibis_value) + return expr._reproject_to_table() + + @abc.abstractmethod + def _set_or_replace_by_id(self: T, id: str, new_value: ibis_types.Value) -> T: + ... 
+ + def _get_ibis_column(self, key: str) -> ibis_types.Value: + """Gets the Ibis expression for a given column.""" + if key not in self.column_ids: + raise ValueError( + "Column name {} not in set of values: {}".format(key, self.column_ids) + ) + return typing.cast(ibis_types.Value, self._column_names[key]) + + def get_column_type(self, key: str) -> bigframes.dtypes.Dtype: + ibis_type = typing.cast( + bigframes.dtypes.IbisDtype, self._get_ibis_column(key).type() + ) + return typing.cast( + bigframes.dtypes.Dtype, + bigframes.dtypes.ibis_dtype_to_bigframes_dtype(ibis_type), + ) + + +# Ibis Implementations +class UnorderedIR(BaseIbisIR): + def __init__( + self, + table: ibis_types.Table, + columns: Sequence[ibis_types.Value], + predicates: Optional[Collection[ibis_types.BooleanValue]] = None, + ): + super().__init__(table, columns, predicates) + + def builder(self): + """Creates a mutable builder for expressions.""" + # Since ArrayValue is intended to be immutable (immutability offers + # potential opportunities for caching, though we might need to introduce + # more node types for that to be useful), we create a builder class. + return UnorderedIR.Builder( + self._table, + columns=self._columns, + predicates=self._predicates, + ) + + def to_sql( + self, + offset_column: typing.Optional[str] = None, + col_id_overrides: typing.Mapping[str, str] = {}, + sorted: bool = False, + ) -> str: + if offset_column or sorted: + raise ValueError("Cannot produce sorted sql in unordered mode") + sql = ibis_bigquery.Backend().compile( + self._to_ibis_expr( + col_id_overrides=col_id_overrides, + ) + ) + return typing.cast(str, sql) + + def _to_ibis_expr( + self, + *, + expose_hidden_cols: bool = False, + fraction: Optional[float] = None, + col_id_overrides: typing.Mapping[str, str] = {}, + ): + """ + Creates an Ibis table expression representing the DataFrame. + + ArrayValue objects are sorted, so the following options are available + to reflect this in the ibis expression. + + * "offset_col": Zero-based offsets are generated as a column, this will + not sort the rows however. + * "string_encoded": An ordered string column is provided in output table. + * "unordered": No ordering information will be provided in output. Only + value columns are projected. + + For offset or ordered column, order_col_name can be used to assign the + output label for the ordering column. If none is specified, the default + column name will be 'bigframes_ordering_id' + + Args: + expose_hidden_cols: + If True, include the hidden ordering columns in the results. + Only compatible with `order_by` and `unordered` + ``ordering_mode``. + col_id_overrides: + overrides the column ids for the result + Returns: + An ibis expression representing the data help by the ArrayValue object. + """ + columns = list(self._columns) + columns_to_drop: list[ + str + ] = [] # Ordering/Filtering columns that will be dropped at end + + if self._reduced_predicate is not None: + columns.append(self._reduced_predicate) + # Usually drop predicate as it is will be all TRUE after filtering + if not expose_hidden_cols: + columns_to_drop.append(self._reduced_predicate.get_name()) + + # Special case for empty tables, since we can't create an empty + # projection. + if not columns: + return ibis.memtable([]) + + # Make sure all dtypes are the "canonical" ones for BigFrames. This is + # important for operations like UNION where the schema must match. 
+ table = self._table.select( + bigframes.dtypes.ibis_value_to_canonical_type(column) for column in columns + ) + base_table = table + if self._reduced_predicate is not None: + table = table.filter(base_table[PREDICATE_COLUMN]) + table = table.drop(*columns_to_drop) + if col_id_overrides: + table = table.relabel(col_id_overrides) + if fraction is not None: + table = table.filter(ibis.random() < ibis.literal(fraction)) + return table + + def select_columns(self, column_ids: typing.Sequence[str]) -> UnorderedIR: + """Creates a new expression based on this expression with new columns.""" + columns = [self._get_ibis_column(col_id) for col_id in column_ids] + builder = self.builder() + builder.columns = list(columns) + new_expr = builder.build() + return new_expr + + def filter(self, predicate_id: str, keep_null: bool = False) -> UnorderedIR: + condition = typing.cast( + ibis_types.BooleanValue, self._get_ibis_column(predicate_id) + ) + if keep_null: + condition = typing.cast( + ibis_types.BooleanValue, + condition.fillna( + typing.cast(ibis_types.BooleanScalar, ibis_types.literal(True)) + ), + ) + return self._filter(condition) + + def _filter(self, predicate_value: ibis_types.BooleanValue) -> UnorderedIR: + """Filter the table on a given expression, the predicate must be a boolean series aligned with the table expression.""" + expr = self.builder() + expr.predicates = [*self._predicates, predicate_value] + return expr.build() + + def unpivot( + self, + row_labels: typing.Sequence[typing.Hashable], + unpivot_columns: typing.Sequence[ + typing.Tuple[str, typing.Sequence[typing.Optional[str]]] + ], + *, + passthrough_columns: typing.Sequence[str] = (), + index_col_ids: typing.Sequence[str] = ["index"], + dtype: typing.Union[ + bigframes.dtypes.Dtype, typing.Sequence[bigframes.dtypes.Dtype] + ] = pandas.Float64Dtype(), + how="left", + ) -> UnorderedIR: + if how not in ("left", "right"): + raise ValueError("'how' must be 'left' or 'right'") + table = self._to_ibis_expr() + row_n = len(row_labels) + if not all( + len(source_columns) == row_n for _, source_columns in unpivot_columns + ): + raise ValueError("Columns and row labels must all be same length.") + + unpivot_offset_id = bigframes.core.guid.generate_guid("unpivot_offsets_") + unpivot_table = table.cross_join( + ibis.memtable({unpivot_offset_id: range(row_n)}) + ) + # Use ibis memtable to infer type of rowlabels (if possible) + # TODO: Allow caller to specify dtype + if isinstance(row_labels[0], tuple): + labels_table = ibis.memtable(row_labels) + labels_ibis_types = [ + labels_table[col].type() for col in labels_table.columns + ] + else: + labels_ibis_types = [ibis.memtable({"col": row_labels})["col"].type()] + labels_dtypes = [ + bigframes.dtypes.ibis_dtype_to_bigframes_dtype(ibis_type) + for ibis_type in labels_ibis_types + ] + + label_columns = [] + for label_part, (col_id, label_dtype) in enumerate( + zip(index_col_ids, labels_dtypes) + ): + # interpret as tuples even if it wasn't originally so can apply same logic for multi-column labels + labels_as_tuples = [ + label if isinstance(label, tuple) else (label,) for label in row_labels + ] + cases = [ + ( + i, + bigframes.dtypes.literal_to_ibis_scalar( + label_tuple[label_part], # type:ignore + force_dtype=label_dtype, # type:ignore + ), + ) + for i, label_tuple in enumerate(labels_as_tuples) + ] + labels_value = ( + typing.cast(ibis_types.IntegerColumn, unpivot_table[unpivot_offset_id]) + .cases(cases, default=None) # type:ignore + .name(col_id) + ) + label_columns.append(labels_value) + 
+ unpivot_values = [] + for j in range(len(unpivot_columns)): + col_dtype = dtype[j] if utils.is_list_like(dtype) else dtype + result_col, source_cols = unpivot_columns[j] + null_value = bigframes.dtypes.literal_to_ibis_scalar( + None, force_dtype=col_dtype + ) + ibis_values = [ + ops.AsTypeOp(col_dtype)._as_ibis(unpivot_table[col]) + if col is not None + else null_value + for col in source_cols + ] + cases = [(i, ibis_values[i]) for i in range(len(ibis_values))] + unpivot_value = typing.cast( + ibis_types.IntegerColumn, unpivot_table[unpivot_offset_id] + ).cases( + cases, default=null_value # type:ignore + ) + unpivot_values.append(unpivot_value.name(result_col)) + + unpivot_table = unpivot_table.select( + passthrough_columns, + *label_columns, + *unpivot_values, + unpivot_offset_id, + ) + + value_columns = [ + unpivot_table[value_col_id] for value_col_id, _ in unpivot_columns + ] + passthrough_values = [unpivot_table[col] for col in passthrough_columns] + return UnorderedIR( + table=unpivot_table, + columns=[ + *[unpivot_table[col_id] for col_id in index_col_ids], + *value_columns, + *passthrough_values, + ], + ) + + def aggregate( + self, + aggregations: typing.Sequence[typing.Tuple[str, agg_ops.AggregateOp, str]], + by_column_ids: typing.Sequence[str] = (), + dropna: bool = True, + ) -> OrderedIR: + """ + Apply aggregations to the expression. + Arguments: + aggregations: input_column_id, operation, output_column_id tuples + by_column_id: column id of the aggregation key, this is preserved through the transform + dropna: whether null keys should be dropped + """ + table = self._to_ibis_expr() + stats = { + col_out: agg_op._as_ibis(table[col_in]) + for col_in, agg_op, col_out in aggregations + } + if by_column_ids: + result = table.group_by(by_column_ids).aggregate(**stats) + # Must have deterministic ordering, so order by the unique "by" column + ordering = ExpressionOrdering( + tuple( + [ + OrderingColumnReference(column_id=column_id) + for column_id in by_column_ids + ] + ), + total_ordering_columns=frozenset(by_column_ids), + ) + columns = tuple(result[key] for key in result.columns) + expr = OrderedIR(result, columns=columns, ordering=ordering) + if dropna: + for column_id in by_column_ids: + expr = expr._filter( + ops.notnull_op._as_ibis(expr._get_ibis_column(column_id)) + ) + # Can maybe remove this as Ordering id is redundant as by_column is unique after aggregation + return expr._project_offsets() + else: + aggregates = {**stats, ORDER_ID_COLUMN: ibis_types.literal(0)} + result = table.aggregate(**aggregates) + # Ordering is irrelevant for single-row output, but set ordering id regardless as other ops(join etc.) expect it. + ordering = ExpressionOrdering( + ordering_value_columns=tuple( + [OrderingColumnReference(ORDER_ID_COLUMN)] + ), + total_ordering_columns=frozenset([ORDER_ID_COLUMN]), + integer_encoding=IntegerEncoding(is_encoded=True, is_sequential=True), + ) + return OrderedIR( + result, + columns=[result[col_id] for col_id in [*stats.keys()]], + hidden_ordering_columns=[result[ORDER_ID_COLUMN]], + ordering=ordering, + ) + + def corr_aggregate( + self, corr_aggregations: typing.Sequence[typing.Tuple[str, str, str]] + ) -> OrderedIR: + """ + Get correlations between each lef_column_id and right_column_id, stored in the respective output_column_id. + This uses BigQuery's CORR under the hood, and thus only Pearson's method is used. 
+ Arguments: + corr_aggregations: left_column_id, right_column_id, output_column_id tuples + """ + table = self._to_ibis_expr() + stats = { + col_out: table[col_left].corr(table[col_right], how="pop") + for col_left, col_right, col_out in corr_aggregations + } + aggregates = {**stats, ORDER_ID_COLUMN: ibis_types.literal(0)} + result = table.aggregate(**aggregates) + # Ordering is irrelevant for single-row output, but set ordering id regardless as other ops(join etc.) expect it. + ordering = ExpressionOrdering( + ordering_value_columns=tuple([OrderingColumnReference(ORDER_ID_COLUMN)]), + total_ordering_columns=frozenset([ORDER_ID_COLUMN]), + integer_encoding=IntegerEncoding(is_encoded=True, is_sequential=True), + ) + return OrderedIR( + result, + columns=[result[col_id] for col_id in [*stats.keys()]], + hidden_ordering_columns=[result[ORDER_ID_COLUMN]], + ordering=ordering, + ) + + def _uniform_sampling(self, fraction: float) -> UnorderedIR: + """Sampling the table on given fraction. + + .. warning:: + The row numbers of result is non-deterministic, avoid to use. + """ + table = self._to_ibis_expr(fraction=fraction) + columns = [table[column_name] for column_name in self._column_names] + return UnorderedIR( + table, + columns=columns, + ) + + ## Helpers + def _set_or_replace_by_id( + self, id: str, new_value: ibis_types.Value + ) -> UnorderedIR: + builder = self.builder() + if id in self.column_ids: + builder.columns = [ + val if (col_id != id) else new_value.name(id) + for col_id, val in zip(self.column_ids, self._columns) + ] + else: + builder.columns = [*self.columns, new_value.name(id)] + return builder.build() + + def _reproject_to_table(self) -> UnorderedIR: + """ + Internal operators that projects the internal representation into a + new ibis table expression where each value column is a direct + reference to a column in that table expression. Needed after + some operations such as window operations that cannot be used + recursively in projections. + """ + table = self._to_ibis_expr() + columns = [table[column_name] for column_name in self._column_names] + return UnorderedIR( + table, + columns=columns, + ) + + class Builder: + def __init__( + self, + table: ibis_types.Table, + columns: Collection[ibis_types.Value] = (), + predicates: Optional[Collection[ibis_types.BooleanValue]] = None, + ): + self.table = table + self.columns = list(columns) + self.predicates = list(predicates) if predicates is not None else None + + def build(self) -> UnorderedIR: + return UnorderedIR( + table=self.table, + columns=self.columns, + predicates=self.predicates, + ) + + +class OrderedIR(BaseIbisIR): """Immutable BigQuery DataFrames expression tree. Note: Usage of this class is considered to be private and subject to change @@ -71,17 +629,11 @@ def __init__( ordering: ExpressionOrdering = ExpressionOrdering(), predicates: Optional[Collection[ibis_types.BooleanValue]] = None, ): - self._table = table - self._predicates = tuple(predicates) if predicates is not None else () + super().__init__(table, columns, predicates) # TODO: Validate ordering if not ordering.total_ordering_columns: raise ValueError("Must have total ordering defined by one or more columns") self._ordering = ordering - # Allow creating a DataFrame directly from an Ibis table expression. - # TODO(swast): Validate that each column references the same table (or - # no table for literal values). 
- self._columns = tuple(columns) - # Meta columns store ordering, or other data that doesn't correspond to dataframe columns self._hidden_ordering_columns = ( tuple(hidden_ordering_columns) @@ -111,10 +663,10 @@ def __init__( raise ValueError(f"Illegal ordering keys: {ordering.all_ordering_columns}") @classmethod - def mem_expr_from_pandas( + def from_pandas( cls, pd_df: pandas.DataFrame, - ) -> CompiledArrayValue: + ) -> OrderedIR: """ Builds an in-memory only (SQL only) expr from a pandas dataframe. """ @@ -173,27 +725,10 @@ def mem_expr_from_pandas( hidden_ordering_columns=(keys_memtable[ORDER_ID_COLUMN],), ) - @property - def columns(self) -> typing.Tuple[ibis_types.Value, ...]: - return self._columns - - @property - def column_ids(self) -> typing.Sequence[str]: - return tuple(self._column_names.keys()) - @property def _hidden_column_ids(self) -> typing.Sequence[str]: return tuple(self._hidden_ordering_column_names.keys()) - @property - def _reduced_predicate(self) -> typing.Optional[ibis_types.BooleanValue]: - """Returns the frame's predicates as an equivalent boolean value, useful where a single predicate value is preferred.""" - return ( - _reduce_predicate_list(self._predicates).name(PREDICATE_COLUMN) - if self._predicates - else None - ) - @property def _ibis_order(self) -> Sequence[ibis_types.Value]: """Returns a sequence of ibis values which can be directly used to order a table expression. Has direction modifiers applied.""" @@ -202,12 +737,15 @@ def _ibis_order(self) -> Sequence[ibis_types.Value]: self._ordering.all_ordering_columns, ) - def builder(self) -> ArrayValueBuilder: + def to_unordered(self) -> UnorderedIR: + return UnorderedIR(self._table, self._columns, self._predicates) + + def builder(self) -> OrderedIR.Builder: """Creates a mutable builder for expressions.""" # Since ArrayValue is intended to be immutable (immutability offers # potential opportunities for caching, though we might need to introduce # more node types for that to be useful), we create a builder class. - return ArrayValueBuilder( + return OrderedIR.Builder( self._table, columns=self._columns, hidden_ordering_columns=self._hidden_ordering_columns, @@ -215,160 +753,37 @@ def builder(self) -> ArrayValueBuilder: predicates=self._predicates, ) - def drop_columns(self, columns: Iterable[str]) -> CompiledArrayValue: - # Must generate offsets if we are dropping a column that ordering depends on - expr = self - for ordering_column in set(columns).intersection( - [col.column_id for col in self._ordering.ordering_value_columns] - ): - expr = self._hide_column(ordering_column) - - expr_builder = expr.builder() - remain_cols = [ - column for column in expr.columns if column.get_name() not in columns - ] - expr_builder.columns = remain_cols - return expr_builder.build() - - def get_column_type(self, key: str) -> bigframes.dtypes.Dtype: - ibis_type = typing.cast( - bigframes.dtypes.IbisDtype, self._get_any_column(key).type() - ) - return typing.cast( - bigframes.dtypes.Dtype, - bigframes.dtypes.ibis_dtype_to_bigframes_dtype(ibis_type), - ) - - def _get_ibis_column(self, key: str) -> ibis_types.Value: - """Gets the Ibis expression for a given column.""" - if key not in self.column_ids: - raise ValueError( - "Column name {} not in set of values: {}".format(key, self.column_ids) - ) - return typing.cast(ibis_types.Value, self._column_names[key]) - - def _get_any_column(self, key: str) -> ibis_types.Value: - """Gets the Ibis expression for a given column. 
Will also get hidden columns.""" - all_columns = {**self._column_names, **self._hidden_ordering_column_names} - if key not in all_columns.keys(): - raise ValueError( - "Column name {} not in set of values: {}".format( - key, all_columns.keys() - ) - ) - return typing.cast(ibis_types.Value, all_columns[key]) - - def _get_hidden_ordering_column(self, key: str) -> ibis_types.Column: - """Gets the Ibis expression for a given hidden column.""" - if key not in self._hidden_ordering_column_names.keys(): - raise ValueError( - "Column name {} not in set of values: {}".format( - key, self._hidden_ordering_column_names.keys() - ) - ) - return typing.cast(ibis_types.Column, self._hidden_ordering_column_names[key]) - - def filter(self, predicate_id: str, keep_null: bool = False) -> CompiledArrayValue: - """Filter the table on a given expression, the predicate must be a boolean series aligned with the table expression.""" - condition = typing.cast( - ibis_types.BooleanValue, self._get_ibis_column(predicate_id) - ) - if keep_null: - condition = typing.cast( - ibis_types.BooleanValue, - condition.fillna( - typing.cast(ibis_types.BooleanScalar, ibis_types.literal(True)) - ), - ) - return self._filter(condition) - - def _filter(self, predicate_value: ibis_types.BooleanValue) -> CompiledArrayValue: - """Filter the table on a given expression, the predicate must be a boolean series aligned with the table expression.""" - expr = self.builder() - expr.ordering = expr.ordering.with_non_sequential() - expr.predicates = [*self._predicates, predicate_value] - return expr.build() - - def order_by( - self, by: Sequence[OrderingColumnReference], stable: bool = False - ) -> CompiledArrayValue: + def order_by(self, by: Sequence[OrderingColumnReference]) -> OrderedIR: expr_builder = self.builder() - expr_builder.ordering = self._ordering.with_ordering_columns(by, stable=stable) + expr_builder.ordering = self._ordering.with_ordering_columns(by) return expr_builder.build() - def reversed(self) -> CompiledArrayValue: + def reversed(self) -> OrderedIR: expr_builder = self.builder() expr_builder.ordering = self._ordering.with_reverse() return expr_builder.build() - def _uniform_sampling(self, fraction: float) -> CompiledArrayValue: + def _uniform_sampling(self, fraction: float) -> OrderedIR: """Sampling the table on given fraction. .. warning:: The row numbers of result is non-deterministic, avoid to use. """ table = self._to_ibis_expr( - "unordered", expose_hidden_cols=True, fraction=fraction + ordering_mode="unordered", expose_hidden_cols=True, fraction=fraction ) columns = [table[column_name] for column_name in self._column_names] hidden_ordering_columns = [ table[column_name] for column_name in self._hidden_ordering_column_names ] - return CompiledArrayValue( + return OrderedIR( table, columns=columns, hidden_ordering_columns=hidden_ordering_columns, ordering=self._ordering, ) - @property - def _offsets(self) -> ibis_types.IntegerColumn: - if not self._ordering.is_sequential: - raise ValueError( - "Expression does not have offsets. Generate them first using project_offsets." - ) - if not self._ordering.total_order_col: - raise ValueError( - "Ordering is invalid. Marked as sequential but no total order columns." - ) - column = self._get_any_column(self._ordering.total_order_col.column_id) - return typing.cast(ibis_types.IntegerColumn, column) - - def _project_offsets(self) -> CompiledArrayValue: - """Create a new expression that contains offsets. Should only be executed when offsets are needed for an operations. 
Has no effect on expression semantics.""" - if self._ordering.is_sequential: - return self - # TODO(tbergeron): Enforce total ordering - table = self._to_ibis_expr( - ordering_mode="offset_col", order_col_name=ORDER_ID_COLUMN - ) - columns = [table[column_name] for column_name in self._column_names] - ordering = ExpressionOrdering( - ordering_value_columns=tuple([OrderingColumnReference(ORDER_ID_COLUMN)]), - total_ordering_columns=frozenset([ORDER_ID_COLUMN]), - integer_encoding=IntegerEncoding(True, is_sequential=True), - ) - return CompiledArrayValue( - table, - columns=columns, - hidden_ordering_columns=[table[ORDER_ID_COLUMN]], - ordering=ordering, - ) - - def _hide_column(self, column_id) -> CompiledArrayValue: - """Pushes columns to hidden columns list. Used to hide ordering columns that have been dropped or destructively mutated.""" - expr_builder = self.builder() - # Need to rename column as caller might be creating a new row with the same name but different values. - # Can avoid this if don't allow callers to determine ids and instead generate unique ones in this class. - new_name = bigframes.core.guid.generate_guid(prefix="bigframes_hidden_") - expr_builder.hidden_ordering_columns = [ - *self._hidden_ordering_columns, - self._get_ibis_column(column_id).name(new_name), - ] - expr_builder.ordering = self._ordering.with_column_remap({column_id: new_name}) - return expr_builder.build() - - def promote_offsets(self, col_id: str) -> CompiledArrayValue: + def promote_offsets(self, col_id: str) -> OrderedIR: """ Convenience function to promote copy of column offsets to a value column. Can be used to reset index. """ @@ -384,194 +799,21 @@ def promote_offsets(self, col_id: str) -> CompiledArrayValue: ] return expr_builder.build() - def select_columns(self, column_ids: typing.Sequence[str]) -> CompiledArrayValue: + def select_columns(self, column_ids: typing.Sequence[str]) -> OrderedIR: """Creates a new expression based on this expression with new columns.""" columns = [self._get_ibis_column(col_id) for col_id in column_ids] expr = self for ordering_column in set(self.column_ids).intersection( [col_ref.column_id for col_ref in self._ordering.ordering_value_columns] - ): - # Need to hide ordering columns that are being dropped. Alternatively, could project offsets - expr = expr._hide_column(ordering_column) - builder = expr.builder() - builder.columns = list(columns) - new_expr = builder.build() - return new_expr - - def concat(self, other: typing.Sequence[CompiledArrayValue]) -> CompiledArrayValue: - """Append together multiple ArrayValue objects.""" - if len(other) == 0: - return self - tables = [] - prefix_base = 10 - prefix_size = math.ceil(math.log(len(other) + 1, prefix_base)) - # Must normalize all ids to the same encoding size - max_encoding_size = max( - self._ordering.string_encoding.length, - *[expression._ordering.string_encoding.length for expression in other], - ) - for i, expr in enumerate([self, *other]): - ordering_prefix = str(i).zfill(prefix_size) - table = expr._to_ibis_expr( - ordering_mode="string_encoded", order_col_name=ORDER_ID_COLUMN - ) - # Rename the value columns based on horizontal offset before applying union. 
- table = table.select( - [ - table[col].name(f"column_{i}") - if col != ORDER_ID_COLUMN - else ( - ordering_prefix - + reencode_order_string( - table[ORDER_ID_COLUMN], max_encoding_size - ) - ).name(ORDER_ID_COLUMN) - for i, col in enumerate(table.columns) - ] - ) - tables.append(table) - combined_table = ibis.union(*tables) - ordering = ExpressionOrdering( - ordering_value_columns=tuple([OrderingColumnReference(ORDER_ID_COLUMN)]), - total_ordering_columns=frozenset([ORDER_ID_COLUMN]), - string_encoding=StringEncoding(True, prefix_size + max_encoding_size), - ) - return CompiledArrayValue( - combined_table, - columns=[ - combined_table[col] - for col in combined_table.columns - if col != ORDER_ID_COLUMN - ], - hidden_ordering_columns=[combined_table[ORDER_ID_COLUMN]], - ordering=ordering, - ) - - def project_unary_op( - self, column_name: str, op: ops.UnaryOp, output_name=None - ) -> CompiledArrayValue: - """Creates a new expression based on this expression with unary operation applied to one column.""" - value = op._as_ibis(self._get_ibis_column(column_name)).name( - output_name or column_name - ) - return self._set_or_replace_by_id(output_name or column_name, value) - - def project_binary_op( - self, - left_column_id: str, - right_column_id: str, - op: ops.BinaryOp, - output_column_id: str, - ) -> CompiledArrayValue: - """Creates a new expression based on this expression with binary operation applied to two columns.""" - value = op( - self._get_ibis_column(left_column_id), - self._get_ibis_column(right_column_id), - ).name(output_column_id) - return self._set_or_replace_by_id(output_column_id, value) - - def project_ternary_op( - self, - col_id_1: str, - col_id_2: str, - col_id_3: str, - op: ops.TernaryOp, - output_column_id: str, - ) -> CompiledArrayValue: - """Creates a new expression based on this expression with ternary operation applied to three columns.""" - value = op( - self._get_ibis_column(col_id_1), - self._get_ibis_column(col_id_2), - self._get_ibis_column(col_id_3), - ).name(output_column_id) - return self._set_or_replace_by_id(output_column_id, value) - - def aggregate( - self, - aggregations: typing.Sequence[typing.Tuple[str, agg_ops.AggregateOp, str]], - by_column_ids: typing.Sequence[str] = (), - dropna: bool = True, - ) -> CompiledArrayValue: - """ - Apply aggregations to the expression. 
- Arguments: - aggregations: input_column_id, operation, output_column_id tuples - by_column_id: column id of the aggregation key, this is preserved through the transform - dropna: whether null keys should be dropped - """ - table = self._to_ibis_expr("unordered") - stats = { - col_out: agg_op._as_ibis(table[col_in]) - for col_in, agg_op, col_out in aggregations - } - if by_column_ids: - result = table.group_by(by_column_ids).aggregate(**stats) - # Must have deterministic ordering, so order by the unique "by" column - ordering = ExpressionOrdering( - tuple( - [ - OrderingColumnReference(column_id=column_id) - for column_id in by_column_ids - ] - ), - total_ordering_columns=frozenset(by_column_ids), - ) - columns = tuple(result[key] for key in result.columns) - expr = CompiledArrayValue(result, columns=columns, ordering=ordering) - if dropna: - for column_id in by_column_ids: - expr = expr._filter( - ops.notnull_op._as_ibis(expr._get_ibis_column(column_id)) - ) - # Can maybe remove this as Ordering id is redundant as by_column is unique after aggregation - return expr._project_offsets() - else: - aggregates = {**stats, ORDER_ID_COLUMN: ibis_types.literal(0)} - result = table.aggregate(**aggregates) - # Ordering is irrelevant for single-row output, but set ordering id regardless as other ops(join etc.) expect it. - ordering = ExpressionOrdering( - ordering_value_columns=tuple( - [OrderingColumnReference(ORDER_ID_COLUMN)] - ), - total_ordering_columns=frozenset([ORDER_ID_COLUMN]), - integer_encoding=IntegerEncoding(is_encoded=True, is_sequential=True), - ) - return CompiledArrayValue( - result, - columns=[result[col_id] for col_id in [*stats.keys()]], - hidden_ordering_columns=[result[ORDER_ID_COLUMN]], - ordering=ordering, - ) - - def corr_aggregate( - self, corr_aggregations: typing.Sequence[typing.Tuple[str, str, str]] - ) -> CompiledArrayValue: - """ - Get correlations between each lef_column_id and right_column_id, stored in the respective output_column_id. - This uses BigQuery's CORR under the hood, and thus only Pearson's method is used. - Arguments: - corr_aggregations: left_column_id, right_column_id, output_column_id tuples - """ - table = self._to_ibis_expr("unordered") - stats = { - col_out: table[col_left].corr(table[col_right], how="pop") - for col_left, col_right, col_out in corr_aggregations - } - aggregates = {**stats, ORDER_ID_COLUMN: ibis_types.literal(0)} - result = table.aggregate(**aggregates) - # Ordering is irrelevant for single-row output, but set ordering id regardless as other ops(join etc.) expect it. - ordering = ExpressionOrdering( - ordering_value_columns=tuple([OrderingColumnReference(ORDER_ID_COLUMN)]), - total_ordering_columns=frozenset([ORDER_ID_COLUMN]), - integer_encoding=IntegerEncoding(is_encoded=True, is_sequential=True), - ) - return CompiledArrayValue( - result, - columns=[result[col_id] for col_id in [*stats.keys()]], - hidden_ordering_columns=[result[ORDER_ID_COLUMN]], - ordering=ordering, - ) + ): + # Need to hide ordering columns that are being dropped. 
Alternatively, could project offsets + expr = expr._hide_column(ordering_column) + builder = expr.builder() + builder.columns = list(columns) + new_expr = builder.build() + return new_expr + ## Methods that only work with ordering def project_window_op( self, column_name: str, @@ -581,7 +823,7 @@ def project_window_op( *, never_skip_nulls=False, skip_reproject_unsafe: bool = False, - ) -> CompiledArrayValue: + ) -> OrderedIR: """ Creates a new expression based on this expression with unary operation applied to one column. column_name: the id of the input column present in the expression @@ -625,6 +867,168 @@ def project_window_op( # TODO(tbergeron): Automatically track analytic expression usage and defer reprojection until required for valid query generation. return result._reproject_to_table() if not skip_reproject_unsafe else result + def unpivot( + self, + row_labels: typing.Sequence[typing.Hashable], + unpivot_columns: typing.Sequence[ + typing.Tuple[str, typing.Sequence[typing.Optional[str]]] + ], + *, + passthrough_columns: typing.Sequence[str] = (), + index_col_ids: typing.Sequence[str] = ["index"], + dtype: typing.Union[ + bigframes.dtypes.Dtype, typing.Sequence[bigframes.dtypes.Dtype] + ] = pandas.Float64Dtype(), + how="left", + ) -> OrderedIR: + if how not in ("left", "right"): + raise ValueError("'how' must be 'left' or 'right'") + table = self._to_ibis_expr(ordering_mode="unordered", expose_hidden_cols=True) + row_n = len(row_labels) + hidden_col_ids = self._hidden_ordering_column_names.keys() + if not all( + len(source_columns) == row_n for _, source_columns in unpivot_columns + ): + raise ValueError("Columns and row labels must all be same length.") + + unpivot_offset_id = bigframes.core.guid.generate_guid("unpivot_offsets_") + unpivot_table = table.cross_join( + ibis.memtable({unpivot_offset_id: range(row_n)}) + ) + # Use ibis memtable to infer type of rowlabels (if possible) + # TODO: Allow caller to specify dtype + if isinstance(row_labels[0], tuple): + labels_table = ibis.memtable(row_labels) + labels_ibis_types = [ + labels_table[col].type() for col in labels_table.columns + ] + else: + labels_ibis_types = [ibis.memtable({"col": row_labels})["col"].type()] + labels_dtypes = [ + bigframes.dtypes.ibis_dtype_to_bigframes_dtype(ibis_type) + for ibis_type in labels_ibis_types + ] + + label_columns = [] + for label_part, (col_id, label_dtype) in enumerate( + zip(index_col_ids, labels_dtypes) + ): + # interpret as tuples even if it wasn't originally so can apply same logic for multi-column labels + labels_as_tuples = [ + label if isinstance(label, tuple) else (label,) for label in row_labels + ] + cases = [ + ( + i, + bigframes.dtypes.literal_to_ibis_scalar( + label_tuple[label_part], # type:ignore + force_dtype=label_dtype, # type:ignore + ), + ) + for i, label_tuple in enumerate(labels_as_tuples) + ] + labels_value = ( + typing.cast(ibis_types.IntegerColumn, unpivot_table[unpivot_offset_id]) + .cases(cases, default=None) # type:ignore + .name(col_id) + ) + label_columns.append(labels_value) + + unpivot_values = [] + for j in range(len(unpivot_columns)): + col_dtype = dtype[j] if utils.is_list_like(dtype) else dtype + result_col, source_cols = unpivot_columns[j] + null_value = bigframes.dtypes.literal_to_ibis_scalar( + None, force_dtype=col_dtype + ) + ibis_values = [ + ops.AsTypeOp(col_dtype)._as_ibis(unpivot_table[col]) + if col is not None + else null_value + for col in source_cols + ] + cases = [(i, ibis_values[i]) for i in range(len(ibis_values))] + unpivot_value = 
typing.cast( + ibis_types.IntegerColumn, unpivot_table[unpivot_offset_id] + ).cases( + cases, default=null_value # type:ignore + ) + unpivot_values.append(unpivot_value.name(result_col)) + + unpivot_table = unpivot_table.select( + passthrough_columns, + *label_columns, + *unpivot_values, + *hidden_col_ids, + unpivot_offset_id, + ) + + # Extend the original ordering using unpivot_offset_id + old_ordering = self._ordering + if how == "left": + new_ordering = ExpressionOrdering( + ordering_value_columns=tuple( + [ + *old_ordering.ordering_value_columns, + OrderingColumnReference(unpivot_offset_id), + ] + ), + total_ordering_columns=frozenset( + [*old_ordering.total_ordering_columns, unpivot_offset_id] + ), + ) + else: # how=="right" + new_ordering = ExpressionOrdering( + ordering_value_columns=tuple( + [ + OrderingColumnReference(unpivot_offset_id), + *old_ordering.ordering_value_columns, + ] + ), + total_ordering_columns=frozenset( + [*old_ordering.total_ordering_columns, unpivot_offset_id] + ), + ) + value_columns = [ + unpivot_table[value_col_id] for value_col_id, _ in unpivot_columns + ] + passthrough_values = [unpivot_table[col] for col in passthrough_columns] + hidden_ordering_columns = [ + unpivot_table[unpivot_offset_id], + *[unpivot_table[hidden_col] for hidden_col in hidden_col_ids], + ] + return OrderedIR( + table=unpivot_table, + columns=[ + *[unpivot_table[col_id] for col_id in index_col_ids], + *value_columns, + *passthrough_values, + ], + hidden_ordering_columns=hidden_ordering_columns, + ordering=new_ordering, + ) + + def _reproject_to_table(self) -> OrderedIR: + table = self._to_ibis_expr( + ordering_mode="unordered", + expose_hidden_cols=True, + ) + columns = [table[column_name] for column_name in self._column_names] + ordering_col_ids = [ + ref.column_id for ref in self._ordering.all_ordering_columns + ] + hidden_ordering_columns = [ + table[column_name] + for column_name in self._hidden_ordering_column_names + if column_name in ordering_col_ids + ] + return OrderedIR( + table, + columns=columns, + hidden_ordering_columns=hidden_ordering_columns, + ordering=self._ordering, + ) + def to_sql( self, offset_column: typing.Optional[str] = None, @@ -644,21 +1048,22 @@ def to_sql( ) if sorted: sql = textwrap.dedent( - f""" - SELECT * EXCEPT (`{offsets_id}`) - FROM ({sql}) - ORDER BY `{offsets_id}` - """ + f"SELECT * EXCEPT (`{offsets_id}`)\n" + "FROM (\n" + f"{sql}\n" + ")\n" + f"ORDER BY `{offsets_id}`\n" ) return typing.cast(str, sql) def _to_ibis_expr( self, - ordering_mode: Literal["string_encoded", "offset_col", "unordered"], - order_col_name: Optional[str] = ORDER_ID_COLUMN, + *, expose_hidden_cols: bool = False, fraction: Optional[float] = None, col_id_overrides: typing.Mapping[str, str] = {}, + ordering_mode: Literal["string_encoded", "offset_col", "unordered"], + order_col_name: Optional[str] = ORDER_ID_COLUMN, ): """ Creates an Ibis table expression representing the DataFrame. @@ -677,16 +1082,16 @@ def _to_ibis_expr( column name will be 'bigframes_ordering_id' Args: + expose_hidden_cols: + If True, include the hidden ordering columns in the results. + Only compatible with `order_by` and `unordered` + ``ordering_mode``. ordering_mode: How to construct the Ibis expression from the ArrayValue. See above for details. order_col_name: If the ordering mode outputs a single ordering or offsets column, use this as the column name. - expose_hidden_cols: - If True, include the hidden ordering columns in the results. 
- Only compatible with `order_by` and `unordered` - ``ordering_mode``. col_id_overrides: overrides the column ids for the result Returns: @@ -723,20 +1128,115 @@ def _to_ibis_expr( if not columns: return ibis.memtable([]) - # Make sure all dtypes are the "canonical" ones for BigFrames. This is - # important for operations like UNION where the schema must match. - table = self._table.select( - bigframes.dtypes.ibis_value_to_canonical_type(column) for column in columns + # Make sure all dtypes are the "canonical" ones for BigFrames. This is + # important for operations like UNION where the schema must match. + table = self._table.select( + bigframes.dtypes.ibis_value_to_canonical_type(column) for column in columns + ) + base_table = table + if self._reduced_predicate is not None: + table = table.filter(base_table[PREDICATE_COLUMN]) + table = table.drop(*columns_to_drop) + if col_id_overrides: + table = table.relabel(col_id_overrides) + if fraction is not None: + table = table.filter(ibis.random() < ibis.literal(fraction)) + return table + + def filter(self, predicate_id: str, keep_null: bool = False) -> OrderedIR: + condition = typing.cast( + ibis_types.BooleanValue, self._get_ibis_column(predicate_id) + ) + if keep_null: + condition = typing.cast( + ibis_types.BooleanValue, + condition.fillna( + typing.cast(ibis_types.BooleanScalar, ibis_types.literal(True)) + ), + ) + return self._filter(condition) + + def _filter(self, predicate_value: ibis_types.BooleanValue) -> OrderedIR: + """Filter the table on a given expression, the predicate must be a boolean series aligned with the table expression.""" + expr = self.builder() + expr.ordering = expr.ordering.with_non_sequential() + expr.predicates = [*self._predicates, predicate_value] + return expr.build() + + def _set_or_replace_by_id(self, id: str, new_value: ibis_types.Value) -> OrderedIR: + """Safely assign by id while maintaining ordering integrity.""" + # TODO: Split into explicit set and replace methods + ordering_col_ids = [ + col_ref.column_id for col_ref in self._ordering.ordering_value_columns + ] + if id in ordering_col_ids: + return self._hide_column(id)._set_or_replace_by_id(id, new_value) + + builder = self.builder() + if id in self.column_ids: + builder.columns = [ + val if (col_id != id) else new_value.name(id) + for col_id, val in zip(self.column_ids, self._columns) + ] + else: + builder.columns = [*self.columns, new_value.name(id)] + return builder.build() + + ## Ordering specific helpers + def _get_any_column(self, key: str) -> ibis_types.Value: + """Gets the Ibis expression for a given column. Will also get hidden columns.""" + all_columns = {**self._column_names, **self._hidden_ordering_column_names} + if key not in all_columns.keys(): + raise ValueError( + "Column name {} not in set of values: {}".format( + key, all_columns.keys() + ) + ) + return typing.cast(ibis_types.Value, all_columns[key]) + + def _get_hidden_ordering_column(self, key: str) -> ibis_types.Column: + """Gets the Ibis expression for a given hidden column.""" + if key not in self._hidden_ordering_column_names.keys(): + raise ValueError( + "Column name {} not in set of values: {}".format( + key, self._hidden_ordering_column_names.keys() + ) + ) + return typing.cast(ibis_types.Column, self._hidden_ordering_column_names[key]) + + def _hide_column(self, column_id) -> OrderedIR: + """Pushes columns to hidden columns list. 
Used to hide ordering columns that have been dropped or destructively mutated.""" + expr_builder = self.builder() + # Need to rename column as caller might be creating a new row with the same name but different values. + # Can avoid this if don't allow callers to determine ids and instead generate unique ones in this class. + new_name = bigframes.core.guid.generate_guid(prefix="bigframes_hidden_") + expr_builder.hidden_ordering_columns = [ + *self._hidden_ordering_columns, + self._get_ibis_column(column_id).name(new_name), + ] + expr_builder.ordering = self._ordering.with_column_remap({column_id: new_name}) + return expr_builder.build() + + def _project_offsets(self) -> OrderedIR: + """Create a new expression that contains offsets. Should only be executed when offsets are needed for an operations. Has no effect on expression semantics.""" + if self._ordering.is_sequential: + return self + # TODO(tbergeron): Enforce total ordering + table = self._to_ibis_expr( + ordering_mode="offset_col", order_col_name=ORDER_ID_COLUMN + ) + columns = [table[column_name] for column_name in self._column_names] + ordering = ExpressionOrdering( + ordering_value_columns=tuple([OrderingColumnReference(ORDER_ID_COLUMN)]), + total_ordering_columns=frozenset([ORDER_ID_COLUMN]), + integer_encoding=IntegerEncoding(True, is_sequential=True), + ) + return OrderedIR( + table, + columns=columns, + hidden_ordering_columns=[table[ORDER_ID_COLUMN]], + ordering=ordering, ) - base_table = table - if self._reduced_predicate is not None: - table = table.filter(base_table[PREDICATE_COLUMN]) - table = table.drop(*columns_to_drop) - if col_id_overrides: - table = table.relabel(col_id_overrides) - if fraction is not None: - table = table.filter(ibis.random() < ibis.literal(fraction)) - return table def _create_order_columns( self, @@ -789,34 +1289,6 @@ def _create_string_ordering_column(self) -> ibis_types.StringColumn: ) return encode_order_string(row_nums) - def _reproject_to_table(self) -> CompiledArrayValue: - """ - Internal operators that projects the internal representation into a - new ibis table expression where each value column is a direct - reference to a column in that table expression. Needed after - some operations such as window operations that cannot be used - recursively in projections. 
- """ - table = self._to_ibis_expr( - "unordered", - expose_hidden_cols=True, - ) - columns = [table[column_name] for column_name in self._column_names] - ordering_col_ids = [ - ref.column_id for ref in self._ordering.all_ordering_columns - ] - hidden_ordering_columns = [ - table[column_name] - for column_name in self._hidden_ordering_column_names - if column_name in ordering_col_ids - ] - return CompiledArrayValue( - table, - columns=columns, - hidden_ordering_columns=hidden_ordering_columns, - ordering=self._ordering, - ) - def _ibis_window_from_spec(self, window_spec: WindowSpec, allow_ties: bool = False): group_by: typing.List[ibis_types.Value] = ( [ @@ -851,229 +1323,29 @@ def _ibis_window_from_spec(self, window_spec: WindowSpec, allow_ties: bool = Fal group_by=group_by, ) - def unpivot( - self, - row_labels: typing.Sequence[typing.Hashable], - unpivot_columns: typing.Sequence[ - typing.Tuple[str, typing.Sequence[typing.Optional[str]]] - ], - *, - passthrough_columns: typing.Sequence[str] = (), - index_col_ids: typing.Sequence[str] = ["index"], - dtype: typing.Union[ - bigframes.dtypes.Dtype, typing.Sequence[bigframes.dtypes.Dtype] - ] = pandas.Float64Dtype(), - how="left", - ) -> CompiledArrayValue: - """ - Unpivot ArrayValue columns. - - Args: - row_labels: Identifies the source of the row. Must be equal to length to source column list in unpivot_columns argument. - unpivot_columns: Mapping of column id to list of input column ids. Lists of input columns may use None. - passthrough_columns: Columns that will not be unpivoted. Column id will be preserved. - index_col_id (str): The column id to be used for the row labels. - dtype (dtype or list of dtype): Dtype to use for the unpivot columns. If list, must be equal in number to unpivot_columns. 
- - Returns: - ArrayValue: The unpivoted ArrayValue - """ - if how not in ("left", "right"): - raise ValueError("'how' must be 'left' or 'right'") - table = self._to_ibis_expr("unordered", expose_hidden_cols=True) - row_n = len(row_labels) - hidden_col_ids = self._hidden_ordering_column_names.keys() - if not all( - len(source_columns) == row_n for _, source_columns in unpivot_columns - ): - raise ValueError("Columns and row labels must all be same length.") - - unpivot_offset_id = bigframes.core.guid.generate_guid("unpivot_offsets_") - unpivot_table = table.cross_join( - ibis.memtable({unpivot_offset_id: range(row_n)}) - ) - # Use ibis memtable to infer type of rowlabels (if possible) - # TODO: Allow caller to specify dtype - if isinstance(row_labels[0], tuple): - labels_table = ibis.memtable(row_labels) - labels_ibis_types = [ - labels_table[col].type() for col in labels_table.columns - ] - else: - labels_ibis_types = [ibis.memtable({"col": row_labels})["col"].type()] - labels_dtypes = [ - bigframes.dtypes.ibis_dtype_to_bigframes_dtype(ibis_type) - for ibis_type in labels_ibis_types - ] - - label_columns = [] - for label_part, (col_id, label_dtype) in enumerate( - zip(index_col_ids, labels_dtypes) + class Builder: + def __init__( + self, + table: ibis_types.Table, + ordering: ExpressionOrdering, + columns: Collection[ibis_types.Value] = (), + hidden_ordering_columns: Collection[ibis_types.Value] = (), + predicates: Optional[Collection[ibis_types.BooleanValue]] = None, ): - # interpret as tuples even if it wasn't originally so can apply same logic for multi-column labels - labels_as_tuples = [ - label if isinstance(label, tuple) else (label,) for label in row_labels - ] - cases = [ - ( - i, - bigframes.dtypes.literal_to_ibis_scalar( - label_tuple[label_part], # type:ignore - force_dtype=label_dtype, # type:ignore - ), - ) - for i, label_tuple in enumerate(labels_as_tuples) - ] - labels_value = ( - typing.cast(ibis_types.IntegerColumn, unpivot_table[unpivot_offset_id]) - .cases(cases, default=None) # type:ignore - .name(col_id) - ) - label_columns.append(labels_value) - - unpivot_values = [] - for j in range(len(unpivot_columns)): - col_dtype = dtype[j] if utils.is_list_like(dtype) else dtype - result_col, source_cols = unpivot_columns[j] - null_value = bigframes.dtypes.literal_to_ibis_scalar( - None, force_dtype=col_dtype - ) - ibis_values = [ - ops.AsTypeOp(col_dtype)._as_ibis(unpivot_table[col]) - if col is not None - else null_value - for col in source_cols - ] - cases = [(i, ibis_values[i]) for i in range(len(ibis_values))] - unpivot_value = typing.cast( - ibis_types.IntegerColumn, unpivot_table[unpivot_offset_id] - ).cases( - cases, default=null_value # type:ignore - ) - unpivot_values.append(unpivot_value.name(result_col)) - - unpivot_table = unpivot_table.select( - passthrough_columns, - *label_columns, - *unpivot_values, - *hidden_col_ids, - unpivot_offset_id, - ) - - # Extend the original ordering using unpivot_offset_id - old_ordering = self._ordering - if how == "left": - new_ordering = ExpressionOrdering( - ordering_value_columns=tuple( - [ - *old_ordering.ordering_value_columns, - OrderingColumnReference(unpivot_offset_id), - ] - ), - total_ordering_columns=frozenset( - [*old_ordering.total_ordering_columns, unpivot_offset_id] - ), - ) - else: # how=="right" - new_ordering = ExpressionOrdering( - ordering_value_columns=tuple( - [ - OrderingColumnReference(unpivot_offset_id), - *old_ordering.ordering_value_columns, - ] - ), - total_ordering_columns=frozenset( - 
[*old_ordering.total_ordering_columns, unpivot_offset_id] - ), - ) - value_columns = [ - unpivot_table[value_col_id] for value_col_id, _ in unpivot_columns - ] - passthrough_values = [unpivot_table[col] for col in passthrough_columns] - hidden_ordering_columns = [ - unpivot_table[unpivot_offset_id], - *[unpivot_table[hidden_col] for hidden_col in hidden_col_ids], - ] - return CompiledArrayValue( - table=unpivot_table, - columns=[ - *[unpivot_table[col_id] for col_id in index_col_ids], - *value_columns, - *passthrough_values, - ], - hidden_ordering_columns=hidden_ordering_columns, - ordering=new_ordering, - ) - - def assign(self, source_id: str, destination_id: str) -> CompiledArrayValue: - return self._set_or_replace_by_id( - destination_id, self._get_ibis_column(source_id) - ) - - def assign_constant( - self, - destination_id: str, - value: typing.Any, - dtype: typing.Optional[bigframes.dtypes.Dtype], - ) -> CompiledArrayValue: - # TODO(b/281587571): Solve scalar constant aggregation problem w/Ibis. - ibis_value = bigframes.dtypes.literal_to_ibis_scalar(value, dtype) - if ibis_value is None: - raise NotImplementedError( - f"Type not supported as scalar value {type(value)}. {constants.FEEDBACK_LINK}" + self.table = table + self.columns = list(columns) + self.hidden_ordering_columns = list(hidden_ordering_columns) + self.ordering = ordering + self.predicates = list(predicates) if predicates is not None else None + + def build(self) -> OrderedIR: + return OrderedIR( + table=self.table, + columns=self.columns, + hidden_ordering_columns=self.hidden_ordering_columns, + ordering=self.ordering, + predicates=self.predicates, ) - expr = self._set_or_replace_by_id(destination_id, ibis_value) - return expr._reproject_to_table() - - def _set_or_replace_by_id( - self, id: str, new_value: ibis_types.Value - ) -> CompiledArrayValue: - """Safely assign by id while maintaining ordering integrity.""" - # TODO: Split into explicit set and replace methods - ordering_col_ids = [ - col_ref.column_id for col_ref in self._ordering.ordering_value_columns - ] - if id in ordering_col_ids: - return self._hide_column(id)._set_or_replace_by_id(id, new_value) - - builder = self.builder() - if id in self.column_ids: - builder.columns = [ - val if (col_id != id) else new_value.name(id) - for col_id, val in zip(self.column_ids, self._columns) - ] - else: - builder.columns = [*self.columns, new_value.name(id)] - return builder.build() - - -class ArrayValueBuilder: - """Mutable expression class. - Use ArrayValue.builder() to create from a ArrayValue object. 
- """ - - def __init__( - self, - table: ibis_types.Table, - ordering: ExpressionOrdering, - columns: Collection[ibis_types.Value] = (), - hidden_ordering_columns: Collection[ibis_types.Value] = (), - predicates: Optional[Collection[ibis_types.BooleanValue]] = None, - ): - self.table = table - self.columns = list(columns) - self.hidden_ordering_columns = list(hidden_ordering_columns) - self.ordering = ordering - self.predicates = list(predicates) if predicates is not None else None - - def build(self) -> CompiledArrayValue: - return CompiledArrayValue( - table=self.table, - columns=self.columns, - hidden_ordering_columns=self.hidden_ordering_columns, - ordering=self.ordering, - predicates=self.predicates, - ) def _reduce_predicate_list( diff --git a/bigframes/core/compile/compiler.py b/bigframes/core/compile/compiler.py index 195d830122..39892635f1 100644 --- a/bigframes/core/compile/compiler.py +++ b/bigframes/core/compile/compiler.py @@ -19,7 +19,8 @@ import pandas as pd -import bigframes.core.compile as compiled +import bigframes.core.compile.compiled as compiled +import bigframes.core.compile.concat as concat_impl import bigframes.core.compile.single_column import bigframes.core.nodes as nodes @@ -28,120 +29,167 @@ import bigframes.session +def compile_ordered(node: nodes.BigFrameNode) -> compiled.OrderedIR: + return typing.cast(compiled.OrderedIR, compile_node(node, True)) + + +def compile_unordered(node: nodes.BigFrameNode) -> compiled.UnorderedIR: + return typing.cast(compiled.UnorderedIR, compile_node(node, False)) + + @functools.cache -def compile_node(node: nodes.BigFrameNode) -> compiled.CompiledArrayValue: +def compile_node( + node: nodes.BigFrameNode, ordered: bool = True +) -> compiled.UnorderedIR | compiled.OrderedIR: """Compile node into CompileArrayValue. 
Caches result.""" - return _compile_node(node) + return _compile_node(node, ordered) @functools.singledispatch -def _compile_node(node: nodes.BigFrameNode) -> compiled.CompiledArrayValue: +def _compile_node( + node: nodes.BigFrameNode, ordered: bool = True +) -> compiled.UnorderedIR: """Defines transformation but isn't cached, always use compile_node instead""" - raise ValueError(f"Can't compile unnrecognized node: {node}") + raise ValueError(f"Can't compile unrecognized node: {node}") @_compile_node.register -def compile_join(node: nodes.JoinNode): - compiled_left = compile_node(node.left_child) - compiled_right = compile_node(node.right_child) - return bigframes.core.compile.single_column.join_by_column( - compiled_left, - node.left_column_ids, - compiled_right, - node.right_column_ids, - how=node.how, - allow_row_identity_join=node.allow_row_identity_join, - ) +def compile_join(node: nodes.JoinNode, ordered: bool = True): + if ordered: + left_ordered = compile_ordered(node.left_child) + right_ordered = compile_ordered(node.right_child) + return bigframes.core.compile.single_column.join_by_column_ordered( + left_ordered, + node.left_column_ids, + right_ordered, + node.right_column_ids, + how=node.how, + allow_row_identity_join=node.allow_row_identity_join, + ) + else: + left_unordered = compile_unordered(node.left_child) + right_unordered = compile_unordered(node.right_child) + return bigframes.core.compile.single_column.join_by_column_unordered( + left_unordered, + node.left_column_ids, + right_unordered, + node.right_column_ids, + how=node.how, + allow_row_identity_join=node.allow_row_identity_join, + ) @_compile_node.register -def compile_select(node: nodes.SelectNode): - return compile_node(node.child).select_columns(node.column_ids) +def compile_select(node: nodes.SelectNode, ordered: bool = True): + return compile_node(node.child, ordered).select_columns(node.column_ids) @_compile_node.register -def compile_drop(node: nodes.DropColumnsNode): - return compile_node(node.child).drop_columns(node.columns) +def compile_drop(node: nodes.DropColumnsNode, ordered: bool = True): + return compile_node(node.child, ordered).drop_columns(node.columns) @_compile_node.register -def compile_readlocal(node: nodes.ReadLocalNode): +def compile_readlocal(node: nodes.ReadLocalNode, ordered: bool = True): array_as_pd = pd.read_feather(io.BytesIO(node.feather_bytes)) - return compiled.CompiledArrayValue.mem_expr_from_pandas(array_as_pd) + ordered_ir = compiled.OrderedIR.from_pandas(array_as_pd) + if ordered: + return ordered_ir + else: + return ordered_ir.to_unordered() @_compile_node.register -def compile_readgbq(node: nodes.ReadGbqNode): - return compiled.CompiledArrayValue( - node.table, - node.columns, - node.hidden_ordering_columns, - node.ordering, - ) +def compile_readgbq(node: nodes.ReadGbqNode, ordered: bool = True): + if ordered: + return compiled.OrderedIR( + node.table, + node.columns, + node.hidden_ordering_columns, + node.ordering, + ) + else: + return compiled.UnorderedIR( + node.table, + node.columns, + ) @_compile_node.register -def compile_promote_offsets(node: nodes.PromoteOffsetsNode): - return compile_node(node.child).promote_offsets(node.col_id) +def compile_promote_offsets(node: nodes.PromoteOffsetsNode, ordered: bool = True): + result = compile_ordered(node.child).promote_offsets(node.col_id) + return result if ordered else result.to_unordered() @_compile_node.register -def compile_filter(node: nodes.FilterNode): - return compile_node(node.child).filter(node.predicate_id, 
node.keep_null) +def compile_filter(node: nodes.FilterNode, ordered: bool = True): + return compile_node(node.child, ordered).filter(node.predicate_id, node.keep_null) @_compile_node.register -def compile_orderby(node: nodes.OrderByNode): - return compile_node(node.child).order_by(node.by, node.stable) +def compile_orderby(node: nodes.OrderByNode, ordered: bool = True): + if ordered: + return compile_ordered(node.child).order_by(node.by) + else: + return compile_unordered(node.child) @_compile_node.register -def compile_reversed(node: nodes.ReversedNode): - return compile_node(node.child).reversed() +def compile_reversed(node: nodes.ReversedNode, ordered: bool = True): + if ordered: + return compile_ordered(node.child).reversed() + else: + return compile_unordered(node.child) @_compile_node.register -def compile_project_unary(node: nodes.ProjectUnaryOpNode): - return compile_node(node.child).project_unary_op( +def compile_project_unary(node: nodes.ProjectUnaryOpNode, ordered: bool = True): + return compile_node(node.child, ordered).project_unary_op( node.input_id, node.op, node.output_id ) @_compile_node.register -def compile_project_binary(node: nodes.ProjectBinaryOpNode): - return compile_node(node.child).project_binary_op( +def compile_project_binary(node: nodes.ProjectBinaryOpNode, ordered: bool = True): + return compile_node(node.child, ordered).project_binary_op( node.left_input_id, node.right_input_id, node.op, node.output_id ) @_compile_node.register -def compile_project_ternary(node: nodes.ProjectTernaryOpNode): - return compile_node(node.child).project_ternary_op( +def compile_project_ternary(node: nodes.ProjectTernaryOpNode, ordered: bool = True): + return compile_node(node.child, ordered).project_ternary_op( node.input_id1, node.input_id2, node.input_id3, node.op, node.output_id ) @_compile_node.register -def compile_concat(node: nodes.ConcatNode): - compiled_nodes = [compile_node(node) for node in node.children] - return compiled_nodes[0].concat(compiled_nodes[1:]) +def compile_concat(node: nodes.ConcatNode, ordered: bool = True): + if ordered: + compiled_ordered = [compile_ordered(node) for node in node.children] + return concat_impl.concat_ordered(compiled_ordered) + else: + compiled_unordered = [compile_unordered(node) for node in node.children] + return concat_impl.concat_unordered(compiled_unordered) @_compile_node.register -def compile_aggregate(node: nodes.AggregateNode): - return compile_node(node.child).aggregate( +def compile_aggregate(node: nodes.AggregateNode, ordered: bool = True): + result = compile_unordered(node.child).aggregate( node.aggregations, node.by_column_ids, node.dropna ) + return result if ordered else result.to_unordered() @_compile_node.register -def compile_corr(node: nodes.CorrNode): - return compile_node(node.child).corr_aggregate(node.corr_aggregations) +def compile_corr(node: nodes.CorrNode, ordered: bool = True): + result = compile_unordered(node.child).corr_aggregate(node.corr_aggregations) + return result if ordered else result.to_unordered() @_compile_node.register -def compile_window(node: nodes.WindowOpNode): - return compile_node(node.child).project_window_op( +def compile_window(node: nodes.WindowOpNode, ordered: bool = True): + result = compile_ordered(node.child).project_window_op( node.column_name, node.op, node.window_spec, @@ -149,16 +197,17 @@ def compile_window(node: nodes.WindowOpNode): never_skip_nulls=node.never_skip_nulls, skip_reproject_unsafe=node.skip_reproject_unsafe, ) + return result if ordered else 
result.to_unordered() @_compile_node.register -def compile_reproject(node: nodes.ReprojectOpNode): - return compile_node(node.child)._reproject_to_table() +def compile_reproject(node: nodes.ReprojectOpNode, ordered: bool = True): + return compile_node(node.child, ordered)._reproject_to_table() @_compile_node.register -def compile_unpivot(node: nodes.UnpivotNode): - return compile_node(node.child).unpivot( +def compile_unpivot(node: nodes.UnpivotNode, ordered: bool = True): + return compile_node(node.child, ordered).unpivot( node.row_labels, node.unpivot_columns, passthrough_columns=node.passthrough_columns, @@ -169,17 +218,17 @@ def compile_unpivot(node: nodes.UnpivotNode): @_compile_node.register -def compile_assign(node: nodes.AssignNode): - return compile_node(node.child).assign(node.source_id, node.destination_id) +def compile_assign(node: nodes.AssignNode, ordered: bool = True): + return compile_node(node.child, ordered).assign(node.source_id, node.destination_id) @_compile_node.register -def compile_assign_constant(node: nodes.AssignConstantNode): - return compile_node(node.child).assign_constant( +def compile_assign_constant(node: nodes.AssignConstantNode, ordered: bool = True): + return compile_node(node.child, ordered).assign_constant( node.destination_id, node.value, node.dtype ) @_compile_node.register -def compiler_random_sample(node: nodes.RandomSampleNode): - return compile_node(node.child)._uniform_sampling(node.fraction) +def compiler_random_sample(node: nodes.RandomSampleNode, ordered: bool = True): + return compile_node(node.child, ordered)._uniform_sampling(node.fraction) diff --git a/bigframes/core/compile/concat.py b/bigframes/core/compile/concat.py new file mode 100644 index 0000000000..d39569370e --- /dev/null +++ b/bigframes/core/compile/concat.py @@ -0,0 +1,100 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import annotations + +import math +import typing + +import ibis + +import bigframes.core.compile.compiled as compiled +from bigframes.core.ordering import ( + ExpressionOrdering, + OrderingColumnReference, + reencode_order_string, + StringEncoding, +) + +ORDER_ID_COLUMN = "bigframes_ordering_id" + + +def concat_unordered( + items: typing.Sequence[compiled.UnorderedIR], +) -> compiled.UnorderedIR: + """Append together multiple ArrayValue objects.""" + if len(items) == 1: + return items[0] + tables = [] + for expr in items: + table = expr._to_ibis_expr() + # Rename the value columns based on horizontal offset before applying union. 
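        # Illustrative aside, not part of the original change: ibis.union requires
        # identical column names across its inputs, so each table is relabeled
        # positionally first (for example, value columns ["a", "b"] and ["x", "y"]
        # both become ["column_0", "column_1"]). This assumes every input exposes
        # the same number of value columns in a compatible order.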
+ table = table.select( + [table[col].name(f"column_{i}") for i, col in enumerate(table.columns)] + ) + tables.append(table) + combined_table = ibis.union(*tables) + return compiled.UnorderedIR( + combined_table, + columns=[combined_table[col] for col in combined_table.columns], + ) + + +def concat_ordered( + items: typing.Sequence[compiled.OrderedIR], +) -> compiled.OrderedIR: + """Append together multiple ArrayValue objects.""" + if len(items) == 1: + return items[0] + + tables = [] + prefix_base = 10 + prefix_size = math.ceil(math.log(len(items), prefix_base)) + # Must normalize all ids to the same encoding size + max_encoding_size = max( + *[expression._ordering.string_encoding.length for expression in items], + ) + for i, expr in enumerate(items): + ordering_prefix = str(i).zfill(prefix_size) + table = expr._to_ibis_expr( + ordering_mode="string_encoded", order_col_name=ORDER_ID_COLUMN + ) + # Rename the value columns based on horizontal offset before applying union. + table = table.select( + [ + table[col].name(f"column_{i}") + if col != ORDER_ID_COLUMN + else ( + ordering_prefix + + reencode_order_string(table[ORDER_ID_COLUMN], max_encoding_size) + ).name(ORDER_ID_COLUMN) + for i, col in enumerate(table.columns) + ] + ) + tables.append(table) + combined_table = ibis.union(*tables) + ordering = ExpressionOrdering( + ordering_value_columns=tuple([OrderingColumnReference(ORDER_ID_COLUMN)]), + total_ordering_columns=frozenset([ORDER_ID_COLUMN]), + string_encoding=StringEncoding(True, prefix_size + max_encoding_size), + ) + return compiled.OrderedIR( + combined_table, + columns=[ + combined_table[col] + for col in combined_table.columns + if col != ORDER_ID_COLUMN + ], + hidden_ordering_columns=[combined_table[ORDER_ID_COLUMN]], + ordering=ordering, + ) diff --git a/bigframes/core/compile/row_identity.py b/bigframes/core/compile/row_identity.py index 2e9bc0527c..71d53f90dc 100644 --- a/bigframes/core/compile/row_identity.py +++ b/bigframes/core/compile/row_identity.py @@ -23,16 +23,76 @@ import ibis.expr.types as ibis_types import bigframes.constants as constants -import bigframes.core.compile as compiled +import bigframes.core.compile.compiled as compiled import bigframes.core.joins.name_resolution as naming import bigframes.core.ordering as orderings SUPPORTED_ROW_IDENTITY_HOW = {"outer", "left", "inner"} -def join_by_row_identity( - left: compiled.CompiledArrayValue, right: compiled.CompiledArrayValue, *, how: str -) -> compiled.CompiledArrayValue: +def join_by_row_identity_unordered( + left: compiled.UnorderedIR, + right: compiled.UnorderedIR, + *, + how: str, +) -> compiled.UnorderedIR: + """Compute join when we are joining by row identity not a specific column.""" + if how not in SUPPORTED_ROW_IDENTITY_HOW: + raise NotImplementedError( + f"Only how='outer','left','inner' currently supported. {constants.FEEDBACK_LINK}" + ) + + if not left._table.equals(right._table): + raise ValueError( + "Cannot combine objects without an explicit join/merge key. 
" + f"Left based on: {left._table.compile()}, but " + f"right based on: {right._table.compile()}" + ) + + left_predicates = left._predicates + right_predicates = right._predicates + # TODO(tbergeron): Skip generating these for inner part of join + ( + left_relative_predicates, + right_relative_predicates, + ) = _get_relative_predicates(left_predicates, right_predicates) + + combined_predicates = [] + if left_predicates or right_predicates: + joined_predicates = _join_predicates( + left_predicates, right_predicates, join_type=how + ) + combined_predicates = list(joined_predicates) # builder expects mutable list + + left_mask = left_relative_predicates if how in ["right", "outer"] else None + right_mask = right_relative_predicates if how in ["left", "outer"] else None + + # Public mapping must use JOIN_NAME_REMAPPER to stay in sync with consumers of join result + map_left_id, map_right_id = naming.JOIN_NAME_REMAPPER( + left.column_ids, right.column_ids + ) + joined_columns = [ + _mask_value(left._get_ibis_column(key), left_mask).name(map_left_id[key]) + for key in left.column_ids + ] + [ + _mask_value(right._get_ibis_column(key), right_mask).name(map_right_id[key]) + for key in right.column_ids + ] + + joined_expr = compiled.UnorderedIR( + left._table, + columns=joined_columns, + predicates=combined_predicates, + ) + return joined_expr + + +def join_by_row_identity_ordered( + left: compiled.OrderedIR, + right: compiled.OrderedIR, + *, + how: str, +) -> compiled.OrderedIR: """Compute join when we are joining by row identity not a specific column.""" if how not in SUPPORTED_ROW_IDENTITY_HOW: raise NotImplementedError( @@ -118,7 +178,7 @@ def join_by_row_identity( if key.column_id in right._hidden_ordering_column_names.keys() ] - joined_expr = compiled.CompiledArrayValue( + joined_expr = compiled.OrderedIR( left._table, columns=joined_columns, hidden_ordering_columns=hidden_ordering_columns, diff --git a/bigframes/core/compile/single_column.py b/bigframes/core/compile/single_column.py index b992aa1d1d..a9088feb49 100644 --- a/bigframes/core/compile/single_column.py +++ b/bigframes/core/compile/single_column.py @@ -23,16 +23,16 @@ import ibis.expr.datatypes as ibis_dtypes import ibis.expr.types as ibis_types -import bigframes.core.compile as compiled +import bigframes.core.compile.compiled as compiled import bigframes.core.compile.row_identity import bigframes.core.joins as joining import bigframes.core.ordering as orderings -def join_by_column( - left: compiled.CompiledArrayValue, +def join_by_column_ordered( + left: compiled.OrderedIR, left_column_ids: typing.Sequence[str], - right: compiled.CompiledArrayValue, + right: compiled.OrderedIR, right_column_ids: typing.Sequence[str], *, how: Literal[ @@ -40,9 +40,10 @@ def join_by_column( "left", "outer", "right", + "cross", ], allow_row_identity_join: bool = True, -) -> compiled.CompiledArrayValue: +) -> compiled.OrderedIR: """Join two expressions by column equality. Arguments: @@ -67,13 +68,13 @@ def join_by_column( # regards to value its possible that they both have the same names but # were modified in different ways. Ignore differences in the names. 
and all( - left._get_any_column(lcol) + left._get_ibis_column(lcol) .name("index") - .equals(right._get_any_column(rcol).name("index")) + .equals(right._get_ibis_column(rcol).name("index")) for lcol, rcol in zip(left_column_ids, right_column_ids) ) ): - return bigframes.core.compile.row_identity.join_by_row_identity( + return bigframes.core.compile.row_identity.join_by_row_identity_ordered( left, right, how=how ) else: @@ -88,12 +89,12 @@ def join_by_column( r_mapping = {**r_public_mapping, **r_hidden_mapping} left_table = left._to_ibis_expr( - "unordered", + ordering_mode="unordered", expose_hidden_cols=True, col_id_overrides=l_mapping, ) right_table = right._to_ibis_expr( - "unordered", + ordering_mode="unordered", expose_hidden_cols=True, col_id_overrides=r_mapping, ) @@ -107,7 +108,7 @@ def join_by_column( left_table, right_table, predicates=join_conditions, - how=how, + how=how, # type: ignore ) # Preserve ordering accross joins. @@ -134,7 +135,7 @@ def join_by_column( for col in right._hidden_ordering_columns ], ] - return compiled.CompiledArrayValue( + return compiled.OrderedIR( combined_table, columns=columns, hidden_ordering_columns=hidden_ordering_columns, @@ -142,6 +143,88 @@ def join_by_column( ) +def join_by_column_unordered( + left: compiled.UnorderedIR, + left_column_ids: typing.Sequence[str], + right: compiled.UnorderedIR, + right_column_ids: typing.Sequence[str], + *, + how: Literal[ + "inner", + "left", + "outer", + "right", + "cross", + ], + allow_row_identity_join: bool = True, +) -> compiled.UnorderedIR: + """Join two expressions by column equality. + + Arguments: + left: Expression for left table to join. + left_column_ids: Column IDs (not label) to join by. + right: Expression for right table to join. + right_column_ids: Column IDs (not label) to join by. + how: The type of join to perform. + allow_row_identity_join (bool): + If True, allow matching by row identity. Set to False to always + perform a true JOIN in generated SQL. + Returns: + The joined expression. The resulting columns will be, in order, + first the coalesced join keys, then, all the left columns, and + finally, all the right columns. + """ + if ( + allow_row_identity_join + and how in bigframes.core.compile.row_identity.SUPPORTED_ROW_IDENTITY_HOW + and left._table.equals(right._table) + # Make sure we're joining on exactly the same column(s), at least with + # regards to value its possible that they both have the same names but + # were modified in different ways. Ignore differences in the names. 
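        # Illustrative aside, not part of the original change: this "row identity"
        # fast path applies when both inputs wrap the same underlying ibis table
        # and the join keys are the very same expressions. join_by_row_identity_*
        # then skips emitting a SQL JOIN and instead masks and renames columns of
        # the shared table while combining the two predicate sets; otherwise the
        # code below falls back to a real ibis.join on the remapped column ids.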
+ and all( + left._get_ibis_column(lcol) + .name("index") + .equals(right._get_ibis_column(rcol).name("index")) + for lcol, rcol in zip(left_column_ids, right_column_ids) + ) + ): + return bigframes.core.compile.row_identity.join_by_row_identity_unordered( + left, right, how=how + ) + else: + # Value column mapping must use JOIN_NAME_REMAPPER to stay in sync with consumers of join result + l_mapping, r_mapping = joining.JOIN_NAME_REMAPPER( + left.column_ids, right.column_ids + ) + left_table = left._to_ibis_expr( + col_id_overrides=l_mapping, + ) + right_table = right._to_ibis_expr( + col_id_overrides=r_mapping, + ) + join_conditions = [ + value_to_join_key(left_table[l_mapping[left_index]]) + == value_to_join_key(right_table[r_mapping[right_index]]) + for left_index, right_index in zip(left_column_ids, right_column_ids) + ] + + combined_table = ibis.join( + left_table, + right_table, + predicates=join_conditions, + how=how, # type: ignore + ) + # We could filter out the original join columns, but predicates/ordering + # might still reference them in implicit joins. + columns = [ + combined_table[l_mapping[col.get_name()]] for col in left.columns + ] + [combined_table[r_mapping[col.get_name()]] for col in right.columns] + return compiled.UnorderedIR( + combined_table, + columns=columns, + ) + + def value_to_join_key(value: ibis_types.Value): """Converts nullable values to non-null string SQL will not match null keys together - but pandas does.""" if not value.type().is_string(): diff --git a/bigframes/core/groupby/__init__.py b/bigframes/core/groupby/__init__.py index 2a19a83dd5..18cb83fa18 100644 --- a/bigframes/core/groupby/__init__.py +++ b/bigframes/core/groupby/__init__.py @@ -19,6 +19,7 @@ import pandas as pd import bigframes.constants as constants +from bigframes.core import log_adapter import bigframes.core as core import bigframes.core.block_transforms as block_ops import bigframes.core.blocks as blocks @@ -33,6 +34,7 @@ import third_party.bigframes_vendored.pandas.core.groupby as vendored_pandas_groupby +@log_adapter.class_logger class DataFrameGroupBy(vendored_pandas_groupby.DataFrameGroupBy): __doc__ = vendored_pandas_groupby.GroupBy.__doc__ @@ -217,7 +219,6 @@ def rolling(self, window: int, min_periods=None) -> windows.Window: ) block = self._block.order_by( [order.OrderingColumnReference(col) for col in self._by_col_ids], - stable=True, ) return windows.Window( block, window_spec, self._selected_cols, drop_null_groups=self._dropna @@ -231,7 +232,6 @@ def expanding(self, min_periods: int = 1) -> windows.Window: ) block = self._block.order_by( [order.OrderingColumnReference(col) for col in self._by_col_ids], - stable=True, ) return windows.Window( block, window_spec, self._selected_cols, drop_null_groups=self._dropna @@ -408,6 +408,7 @@ def _resolve_label(self, label: blocks.Label) -> str: return col_ids[0] +@log_adapter.class_logger class SeriesGroupBy(vendored_pandas_groupby.SeriesGroupBy): __doc__ = vendored_pandas_groupby.GroupBy.__doc__ @@ -552,7 +553,6 @@ def rolling(self, window: int, min_periods=None) -> windows.Window: ) block = self._block.order_by( [order.OrderingColumnReference(col) for col in self._by_col_ids], - stable=True, ) return windows.Window( block, @@ -570,7 +570,6 @@ def expanding(self, min_periods: int = 1) -> windows.Window: ) block = self._block.order_by( [order.OrderingColumnReference(col) for col in self._by_col_ids], - stable=True, ) return windows.Window( block, diff --git a/bigframes/core/indexers.py b/bigframes/core/indexers.py index 
f6ce084714..69048b6845 100644 --- a/bigframes/core/indexers.py +++ b/bigframes/core/indexers.py @@ -117,6 +117,18 @@ def __getitem__( ) -> Union[bigframes.core.scalar.Scalar, bigframes.series.Series]: return self._series.loc[key] + def __setitem__( + self, + key: LocSingleKey, + value: bigframes.core.scalar.Scalar, + ): + if not pd.api.types.is_scalar(value): + raise NotImplementedError( + "series.at.__setitem__ only supports scalar right-hand values. " + f"{constants.FEEDBACK_LINK}" + ) + self._series.loc[key] = value + class LocDataFrameIndexer: def __init__(self, dataframe: bigframes.dataframe.DataFrame): diff --git a/bigframes/core/joins/merge.py b/bigframes/core/joins/merge.py index fac16b3607..c65e1bdd54 100644 --- a/bigframes/core/joins/merge.py +++ b/bigframes/core/joins/merge.py @@ -32,6 +32,7 @@ def merge( "left", "outer", "right", + "cross", ] = "inner", on: Optional[str] = None, *, diff --git a/bigframes/core/log_adapter.py b/bigframes/core/log_adapter.py new file mode 100644 index 0000000000..b790d19562 --- /dev/null +++ b/bigframes/core/log_adapter.py @@ -0,0 +1,61 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import functools +import threading +from typing import List + +_lock = threading.Lock() +MAX_LABELS_COUNT = 64 +_api_methods: List = [] + + +def class_logger(decorated_cls): + """Decorator that adds logging functionality to each method of the class.""" + for attr_name, attr_value in decorated_cls.__dict__.items(): + if callable(attr_value): + setattr(decorated_cls, attr_name, method_logger(attr_value)) + return decorated_cls + + +def method_logger(method): + """Decorator that adds logging functionality to a method.""" + + @functools.wraps(method) + def wrapper(*args, **kwargs): + api_method_name = str(method.__name__) + # Track regular and "dunder" methods + if api_method_name.startswith("__") or not api_method_name.startswith("_"): + add_api_method(api_method_name) + return method(*args, **kwargs) + + return wrapper + + +def add_api_method(api_method_name): + global _lock + global _api_methods + with _lock: + # Push the method to the front of the _api_methods list + _api_methods.insert(0, api_method_name) + # Keep the list length within the maximum limit (adjust MAX_LABELS_COUNT as needed) + _api_methods = _api_methods[:MAX_LABELS_COUNT] + + +def get_and_reset_api_methods(): + global _lock + with _lock: + previous_api_methods = list(_api_methods) + _api_methods.clear() + return previous_api_methods diff --git a/bigframes/core/nodes.py b/bigframes/core/nodes.py index 7b252b164f..44a8d808ff 100644 --- a/bigframes/core/nodes.py +++ b/bigframes/core/nodes.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
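A minimal usage sketch for the log_adapter module introduced above (illustrative only; DemoFrame and its methods are hypothetical and not part of this change). class_logger wraps every callable defined on the decorated class; public and dunder methods are recorded, most recent first, in a bounded thread-safe list that get_and_reset_api_methods later drains:

    from bigframes.core import log_adapter

    @log_adapter.class_logger
    class DemoFrame:  # hypothetical class, used only for illustration
        def head(self, n: int = 5):
            return list(range(n))

        def _internal_helper(self):  # single leading underscore: call is not recorded
            return None

    DemoFrame().head(3)
    print(log_adapter.get_and_reset_api_methods())  # most recent first, e.g. ['head']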
+ from __future__ import annotations from dataclasses import dataclass, field @@ -86,6 +87,7 @@ class JoinNode(BigFrameNode): "left", "outer", "right", + "cross", ] allow_row_identity_join: bool = True @@ -144,7 +146,6 @@ class FilterNode(UnaryNode): @dataclass(frozen=True) class OrderByNode(UnaryNode): by: Tuple[OrderingColumnReference, ...] - stable: bool = False @dataclass(frozen=True) diff --git a/bigframes/core/ordering.py b/bigframes/core/ordering.py index 2cecd2fe7b..3ab89e0213 100644 --- a/bigframes/core/ordering.py +++ b/bigframes/core/ordering.py @@ -28,8 +28,6 @@ # Sufficient to store any value up to 2^63 DEFAULT_ORDERING_ID_LENGTH: int = math.ceil(63 * math.log(2, ORDERING_ID_STRING_BASE)) -STABLE_SORTS = ["mergesort", "stable"] - class OrderingDirection(Enum): ASC = 1 @@ -113,17 +111,12 @@ def with_non_sequential(self): def with_ordering_columns( self, ordering_value_columns: Sequence[OrderingColumnReference] = (), - stable: bool = False, ) -> ExpressionOrdering: """Creates a new ordering that reorders by the given columns. Args: ordering_value_columns: In decreasing precedence order, the values used to sort the ordering - stable: - If True, will use apply a stable sorting, using the old ordering where - the new ordering produces ties. Otherwise, ties will be resolved in - a performance maximizing way, Returns: Modified ExpressionOrdering @@ -131,29 +124,33 @@ def with_ordering_columns( col_ids_new = [ ordering_ref.column_id for ordering_ref in ordering_value_columns ] - if stable: - # Only reference each column once, so discard old referenc if there is a new reference - old_ordering_keep = [ - ordering_ref - for ordering_ref in self.ordering_value_columns - if ordering_ref.column_id not in col_ids_new - ] - else: - # New ordering needs to keep all total ordering columns no matter what. - # All other old ordering references can be discarded as does not need - # to be a stable sort. - old_ordering_keep = [ - ordering_ref - for ordering_ref in self.ordering_value_columns - if (ordering_ref.column_id not in col_ids_new) - and (ordering_ref.column_id in self.total_ordering_columns) - ] - new_ordering = (*ordering_value_columns, *old_ordering_keep) + old_ordering_keep = [ + ordering_ref + for ordering_ref in self.ordering_value_columns + if ordering_ref.column_id not in col_ids_new + ] + + # Truncate to remove any unneded col references after all total order cols included + new_ordering = self._truncate_ordering( + (*ordering_value_columns, *old_ordering_keep) + ) return ExpressionOrdering( new_ordering, total_ordering_columns=self.total_ordering_columns, ) + def _truncate_ordering( + self, order_refs: tuple[OrderingColumnReference, ...] 
+ ) -> tuple[OrderingColumnReference, ...]: + total_order_cols_remaining = set(self.total_ordering_columns) + for i in range(len(order_refs)): + column = order_refs[i].column_id + if column in total_order_cols_remaining: + total_order_cols_remaining.remove(column) + if len(total_order_cols_remaining) == 0: + return order_refs[: i + 1] + raise ValueError("Ordering did not contain all total_order_cols") + def with_reverse(self): """Reverses the ordering.""" return ExpressionOrdering( diff --git a/bigframes/core/window/__init__.py b/bigframes/core/window/__init__.py index d3d081124e..240715b6df 100644 --- a/bigframes/core/window/__init__.py +++ b/bigframes/core/window/__init__.py @@ -16,12 +16,14 @@ import typing +from bigframes.core import log_adapter import bigframes.core as core import bigframes.core.blocks as blocks import bigframes.operations.aggregations as agg_ops import third_party.bigframes_vendored.pandas.core.window.rolling as vendored_pandas_rolling +@log_adapter.class_logger class Window(vendored_pandas_rolling.Window): __doc__ = vendored_pandas_rolling.Window.__doc__ diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 40f12671ae..57b4ca42cf 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -41,6 +41,7 @@ import bigframes._config.display_options as display_options import bigframes.constants as constants import bigframes.core +from bigframes.core import log_adapter import bigframes.core.block_transforms as block_ops import bigframes.core.blocks as blocks import bigframes.core.groupby as groupby @@ -81,6 +82,7 @@ # Inherits from pandas DataFrame so that we can use the same docstrings. +@log_adapter.class_logger class DataFrame(vendored_pandas_frame.DataFrame): __doc__ = vendored_pandas_frame.DataFrame.__doc__ @@ -861,6 +863,8 @@ def to_pandas( max_download_size: Optional[int] = None, sampling_method: Optional[str] = None, random_state: Optional[int] = None, + *, + ordered: bool = True, ) -> pandas.DataFrame: """Write DataFrame to pandas DataFrame. @@ -880,6 +884,9 @@ def to_pandas( The seed for the uniform downsampling algorithm. If provided, the uniform method may take longer to execute and require more computation. If set to a value other than None, this will supersede the global config. + ordered (bool, default True): + Determines whether the resulting pandas dataframe will be deterministically ordered. + In some cases, unordered may result in a faster-executing query. 
Returns: pandas.DataFrame: A pandas DataFrame with all rows and columns of this DataFrame if the @@ -891,6 +898,7 @@ def to_pandas( max_download_size=max_download_size, sampling_method=sampling_method, random_state=random_state, + ordered=ordered, ) self._set_internal_query_job(query_job) return df.set_axis(self._block.column_labels, axis=1, copy=False) @@ -1101,23 +1109,38 @@ def _assign_single_item( copy[k] = v(copy) return copy elif utils.is_list_like(v): - given_rows = len(v) - actual_rows = len(self) - if given_rows != actual_rows: - raise ValueError( - f"Length of values ({given_rows}) does not match length of index ({actual_rows})" - ) + return self._assign_single_item_listlike(k, v) + else: + return self._assign_scalar(k, v) - local_df = bigframes.dataframe.DataFrame( - {k: v}, session=self._get_block().expr.session + def _assign_single_item_listlike(self, k: str, v: Sequence) -> DataFrame: + given_rows = len(v) + actual_rows = len(self) + assigning_to_empty_df = len(self.columns) == 0 and actual_rows == 0 + if not assigning_to_empty_df and given_rows != actual_rows: + raise ValueError( + f"Length of values ({given_rows}) does not match length of index ({actual_rows})" ) - # local_df is likely (but not guarunteed) to be cached locally - # since the original list came from memory and so is probably < MAX_INLINE_DF_SIZE - new_column_block = local_df._block - original_index_column_ids = self._block.index_columns - self_block = self._block.reset_index(drop=False) - result_index, (get_column_left, get_column_right) = self_block.index.join( + local_df = bigframes.dataframe.DataFrame( + {k: v}, session=self._get_block().expr.session + ) + # local_df is likely (but not guaranteed) to be cached locally + # since the original list came from memory and so is probably < MAX_INLINE_DF_SIZE + + new_column_block = local_df._block + original_index_column_ids = self._block.index_columns + self_block = self._block.reset_index(drop=False) + if assigning_to_empty_df: + if len(self._block.index_columns) > 1: + # match error raised by pandas here + raise ValueError( + "Assigning listlike to a first column under multiindex is not supported." + ) + result_block = new_column_block.with_index_labels(self._block.index_labels) + result_block = result_block.with_column_labels([k]) + else: + result_index, (get_column_left, get_column_right,) = self_block.index.join( new_column_block.index, how="left", block_identity_join=True ) result_block = result_index._block @@ -1125,13 +1148,9 @@ def _assign_single_item( [get_column_left[col_id] for col_id in original_index_column_ids], index_labels=self._block.index_labels, ) - return DataFrame(result_block) - else: - return self._assign_scalar(k, v) + return DataFrame(result_block) def _assign_scalar(self, label: str, value: Union[int, float]) -> DataFrame: - # TODO(swast): Make sure that k is the ID / SQL name, not a label, - # which could be invalid SQL. 
col_ids = self._block.cols_matching_label(label) block, constant_col_id = self._block.create_constant(value, label) @@ -1245,9 +1264,7 @@ def sort_values( column_id, direction=direction, na_last=na_last ) ) - return DataFrame( - self._block.order_by(ordering, stable=kind in order.STABLE_SORTS) - ) + return DataFrame(self._block.order_by(ordering)) def value_counts( self, @@ -1439,6 +1456,8 @@ def reindex_like(self, other: DataFrame, *, validate: typing.Optional[bool] = No return self.reindex(index=other.index, columns=other.columns, validate=validate) def interpolate(self, method: str = "linear") -> DataFrame: + if method == "pad": + return self.ffill() result = block_ops.interpolate(self._block, method) return DataFrame(result) @@ -1922,6 +1941,7 @@ def merge( "left", "outer", "right", + "cross", ] = "inner", # TODO(garrettwu): Currently can take inner, outer, left and right. To support # cross joins @@ -1932,6 +1952,19 @@ def merge( sort: bool = False, suffixes: tuple[str, str] = ("_x", "_y"), ) -> DataFrame: + if how == "cross": + if on is not None: + raise ValueError("'on' is not supported for cross join.") + result_block = self._block.merge( + right._block, + left_join_ids=[], + right_join_ids=[], + suffixes=suffixes, + how=how, + sort=True, + ) + return DataFrame(result_block) + if on is None: if left_on is None or right_on is None: raise ValueError("Must specify `on` or `left_on` + `right_on`.") @@ -1985,6 +2018,18 @@ def join( raise NotImplementedError( f"Deduping column names is not implemented. {constants.FEEDBACK_LINK}" ) + if how == "cross": + if on is not None: + raise ValueError("'on' is not supported for cross join.") + result_block = left._block.merge( + right._block, + left_join_ids=[], + right_join_ids=[], + suffixes=("", ""), + how="cross", + sort=True, + ) + return DataFrame(result_block) # Join left columns with right index if on is not None: @@ -2674,7 +2719,8 @@ def _get_block(self) -> blocks.Block: return self._block def _cached(self) -> DataFrame: - return DataFrame(self._block.cached()) + self._set_block(self._block.cached()) + return self _DataFrameOrSeries = typing.TypeVar("_DataFrameOrSeries") diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index 079f0cc27a..cd35e380c0 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -375,7 +375,7 @@ def cast_ibis_value( ), ibis_dtypes.float64: (ibis_dtypes.string, ibis_dtypes.int64), ibis_dtypes.string: (ibis_dtypes.int64, ibis_dtypes.float64), - ibis_dtypes.date: (), + ibis_dtypes.date: (ibis_dtypes.string,), ibis_dtypes.Decimal(precision=38, scale=9): (ibis_dtypes.float64,), ibis_dtypes.Decimal(precision=76, scale=38): (ibis_dtypes.float64,), ibis_dtypes.time: (), diff --git a/bigframes/ml/core.py b/bigframes/ml/core.py index 4c5a48cf62..d8135f7085 100644 --- a/bigframes/ml/core.py +++ b/bigframes/ml/core.py @@ -16,12 +16,14 @@ from __future__ import annotations +import datetime from typing import Callable, cast, Iterable, Mapping, Optional, Union import uuid from google.cloud import bigquery import bigframes +import bigframes.constants as constants from bigframes.ml import sql as ml_sql import bigframes.pandas as bpd @@ -126,7 +128,7 @@ def generate_text_embedding( def forecast(self) -> bpd.DataFrame: sql = self._model_manipulation_sql_generator.ml_forecast() - return self._session.read_gbq(sql) + return self._session.read_gbq(sql, index_col="forecast_timestamp").reset_index() def evaluate(self, input_data: Optional[bpd.DataFrame] = None): # TODO: validate input data schema @@ -139,14 +141,18 @@ def 
centroids(self) -> bpd.DataFrame: sql = self._model_manipulation_sql_generator.ml_centroids() - return self._session.read_gbq(sql) + return self._session.read_gbq( + sql, index_col=["centroid_id", "feature"] + ).reset_index() def principal_components(self) -> bpd.DataFrame: assert self._model.model_type == "PCA" sql = self._model_manipulation_sql_generator.ml_principal_components() - return self._session.read_gbq(sql) + return self._session.read_gbq( + sql, index_col=["principal_component_id", "feature"] + ).reset_index() def principal_component_info(self) -> bpd.DataFrame: assert self._model.model_type == "PCA" @@ -188,24 +194,27 @@ def register(self, vertex_ai_model_id: Optional[str] = None) -> BqmlModel: class BqmlModelFactory: def __init__(self): - model_id = self._create_temp_model_id() - self._model_creation_sql_generator = ml_sql.ModelCreationSqlGenerator(model_id) + self._model_creation_sql_generator = ml_sql.ModelCreationSqlGenerator() - def _create_temp_model_id(self) -> str: - return uuid.uuid4().hex - - def _reset_model_id(self): - self._model_creation_sql_generator._model_id = self._create_temp_model_id() + def _create_model_ref( + self, dataset: bigquery.DatasetReference + ) -> bigquery.ModelReference: + return bigquery.ModelReference.from_string( + f"{dataset.project}.{dataset.dataset_id}.{uuid.uuid4().hex}" + ) def _create_model_with_sql(self, session: bigframes.Session, sql: str) -> BqmlModel: # fit the model, synchronously _, job = session._start_query(sql) # real model path in the session specific hidden dataset and table prefix - model_name_full = f"{job.destination.dataset_id}.{job.destination.table_id}" - model = session.bqclient.get_model(model_name_full) + model_name_full = f"{job.destination.project}.{job.destination.dataset_id}.{job.destination.table_id}" + model = bigquery.Model(model_name_full) + model.expires = ( + datetime.datetime.now(datetime.timezone.utc) + constants.DEFAULT_EXPIRATION + ) + model = session.bqclient.update_model(model, ["expires"]) - self._reset_model_id() return BqmlModel(session, model) def create_model( @@ -215,7 +224,7 @@ def create_model( transforms: Optional[Iterable[str]] = None, options: Mapping[str, Union[str, int, float, Iterable[str]]] = {}, ) -> BqmlModel: - """Create a session-temporary BQML model with the CREATE MODEL statement + """Create a session-temporary BQML model with the CREATE OR REPLACE MODEL statement Args: X_train: features columns for training @@ -228,16 +237,20 @@ def create_model( Returns: a BqmlModel, wrapping a trained model in BigQuery """ options = dict(options) + # Cache dataframes to make sure base table is not a snapshot + # cached dataframe creates a full copy, never uses snapshot if y_train is None: - input_data = X_train + input_data = X_train._cached() else: - input_data = X_train.join(y_train, how="outer") + input_data = X_train._cached().join(y_train._cached(), how="outer") options.update({"INPUT_LABEL_COLS": y_train.columns.tolist()}) session = X_train._session + model_ref = self._create_model_ref(session._anonymous_dataset) sql = self._model_creation_sql_generator.create_model( source_df=input_data, + model_ref=model_ref, transforms=transforms, options=options, ) @@ -259,14 +272,18 @@ def create_time_series_model( ), "Time stamp data input must only contain 1 column." 
options = dict(options) - input_data = X_train.join(y_train, how="outer") + # Cache dataframes to make sure base table is not a snapshot + # cached dataframe creates a full copy, never uses snapshot + input_data = X_train._cached().join(y_train._cached(), how="outer") options.update({"TIME_SERIES_TIMESTAMP_COL": X_train.columns.tolist()[0]}) options.update({"TIME_SERIES_DATA_COL": y_train.columns.tolist()[0]}) session = X_train._session + model_ref = self._create_model_ref(session._anonymous_dataset) sql = self._model_creation_sql_generator.create_model( source_df=input_data, + model_ref=model_ref, transforms=transforms, options=options, ) @@ -279,7 +296,7 @@ def create_remote_model( connection_name: str, options: Mapping[str, Union[str, int, float, Iterable[str]]] = {}, ) -> BqmlModel: - """Create a session-temporary BQML remote model with the CREATE MODEL statement + """Create a session-temporary BQML remote model with the CREATE OR REPLACE MODEL statement Args: connection_name: @@ -290,8 +307,10 @@ def create_remote_model( Returns: BqmlModel: a BqmlModel wrapping a trained model in BigQuery """ + model_ref = self._create_model_ref(session._anonymous_dataset) sql = self._model_creation_sql_generator.create_remote_model( connection_name=connection_name, + model_ref=model_ref, options=options, ) @@ -302,7 +321,7 @@ def create_imported_model( session: bigframes.Session, options: Mapping[str, Union[str, int, float, Iterable[str]]] = {}, ) -> BqmlModel: - """Create a session-temporary BQML imported model with the CREATE MODEL statement + """Create a session-temporary BQML imported model with the CREATE OR REPLACE MODEL statement Args: options: a dict of options to configure the model. Generates a BQML OPTIONS @@ -310,7 +329,9 @@ def create_imported_model( Returns: a BqmlModel, wrapping a trained model in BigQuery """ + model_ref = self._create_model_ref(session._anonymous_dataset) sql = self._model_creation_sql_generator.create_imported_model( + model_ref=model_ref, options=options, ) diff --git a/bigframes/ml/llm.py b/bigframes/ml/llm.py index 2e5a9a1e5e..3cfc28e61f 100644 --- a/bigframes/ml/llm.py +++ b/bigframes/ml/llm.py @@ -24,12 +24,14 @@ from bigframes.ml import base, core, globals, utils import bigframes.pandas as bpd -_REMOTE_TEXT_GENERATOR_MODEL_CODE = "CLOUD_AI_LARGE_LANGUAGE_MODEL_V1" -_REMOTE_TEXT_GENERATOR_32K_MODEL_CODE = "text-bison-32k" +_REMOTE_TEXT_GENERATOR_MODEL_ENDPOINT = "text-bison" +_REMOTE_TEXT_GENERATOR_32K_MODEL_ENDPOINT = "text-bison-32k" _TEXT_GENERATE_RESULT_COLUMN = "ml_generate_text_llm_result" -_REMOTE_EMBEDDING_GENERATOR_MODEL_CODE = "CLOUD_AI_TEXT_EMBEDDING_MODEL_V1" -_REMOTE_EMBEDDING_GENERATOR_MUlTILINGUAL_MODEL_CODE = "textembedding-gecko-multilingual" +_REMOTE_EMBEDDING_GENERATOR_MODEL_ENDPOINT = "textembedding-gecko" +_REMOTE_EMBEDDING_GENERATOR_MUlTILINGUAL_MODEL_ENDPOINT = ( + "textembedding-gecko-multilingual" +) _EMBED_TEXT_RESULT_COLUMN = "text_embedding" @@ -88,14 +90,18 @@ def _create_bqml_model(self): connection_id=connection_name_parts[2], iam_role="aiplatform.user", ) - if self.model_name == "text-bison": + if self.model_name == _REMOTE_TEXT_GENERATOR_MODEL_ENDPOINT: options = { - "remote_service_type": _REMOTE_TEXT_GENERATOR_MODEL_CODE, + "endpoint": _REMOTE_TEXT_GENERATOR_MODEL_ENDPOINT, } - else: + elif self.model_name == _REMOTE_TEXT_GENERATOR_32K_MODEL_ENDPOINT: options = { - "endpoint": _REMOTE_TEXT_GENERATOR_32K_MODEL_CODE, + "endpoint": _REMOTE_TEXT_GENERATOR_32K_MODEL_ENDPOINT, } + else: + raise ValueError( + f"Model name 
{self.model_name} is not supported. We only support {_REMOTE_TEXT_GENERATOR_MODEL_ENDPOINT} and {_REMOTE_TEXT_GENERATOR_32K_MODEL_ENDPOINT}." + ) return self._bqml_model_factory.create_remote_model( session=self.session, connection_name=self.connection_name, options=options ) @@ -240,12 +246,16 @@ def _create_bqml_model(self): ) if self.model_name == "textembedding-gecko": options = { - "remote_service_type": _REMOTE_EMBEDDING_GENERATOR_MODEL_CODE, + "endpoint": _REMOTE_EMBEDDING_GENERATOR_MODEL_ENDPOINT, } - else: + elif self.model_name == _REMOTE_EMBEDDING_GENERATOR_MUlTILINGUAL_MODEL_ENDPOINT: options = { - "endpoint": _REMOTE_EMBEDDING_GENERATOR_MUlTILINGUAL_MODEL_CODE, + "endpoint": _REMOTE_EMBEDDING_GENERATOR_MUlTILINGUAL_MODEL_ENDPOINT, } + else: + raise ValueError( + f"Model name {self.model_name} is not supported. We only support {_REMOTE_EMBEDDING_GENERATOR_MODEL_ENDPOINT} and {_REMOTE_EMBEDDING_GENERATOR_MUlTILINGUAL_MODEL_ENDPOINT}." + ) return self._bqml_model_factory.create_remote_model( session=self.session, connection_name=self.connection_name, options=options diff --git a/bigframes/ml/sql.py b/bigframes/ml/sql.py index 601b271099..ab051231fb 100644 --- a/bigframes/ml/sql.py +++ b/bigframes/ml/sql.py @@ -18,6 +18,8 @@ from typing import Iterable, Mapping, Optional, Union +import google.cloud.bigquery + import bigframes.constants as constants import bigframes.pandas as bpd @@ -121,22 +123,26 @@ def ml_label_encoder( class ModelCreationSqlGenerator(BaseSqlGenerator): """Sql generator for creating a model entity. Model id is the standalone id without project id and dataset id.""" - def __init__(self, model_id: str): - self._model_id = model_id + def _model_id_sql( + self, + model_ref: google.cloud.bigquery.ModelReference, + ): + return f"`{model_ref.project}`.`{model_ref.dataset_id}`.`{model_ref.model_id}`" # Model create and alter def create_model( self, source_df: bpd.DataFrame, + model_ref: google.cloud.bigquery.ModelReference, options: Mapping[str, Union[str, int, float, Iterable[str]]] = {}, transforms: Optional[Iterable[str]] = None, ) -> str: - """Encode the CREATE TEMP MODEL statement for BQML""" + """Encode the CREATE OR REPLACE MODEL statement for BQML""" source_sql = source_df.sql transform_sql = self.transform(*transforms) if transforms is not None else None options_sql = self.options(**options) - parts = [f"CREATE TEMP MODEL `{self._model_id}`"] + parts = [f"CREATE OR REPLACE MODEL {self._model_id_sql(model_ref)}"] if transform_sql: parts.append(transform_sql) if options_sql: @@ -147,12 +153,13 @@ def create_model( def create_remote_model( self, connection_name: str, + model_ref: google.cloud.bigquery.ModelReference, options: Mapping[str, Union[str, int, float, Iterable[str]]] = {}, ) -> str: - """Encode the CREATE TEMP MODEL statement for BQML remote model.""" + """Encode the CREATE OR REPLACE MODEL statement for BQML remote model.""" options_sql = self.options(**options) - parts = [f"CREATE TEMP MODEL `{self._model_id}`"] + parts = [f"CREATE OR REPLACE MODEL {self._model_id_sql(model_ref)}"] parts.append(self.connection(connection_name)) if options_sql: parts.append(options_sql) @@ -160,12 +167,13 @@ def create_remote_model( def create_imported_model( self, + model_ref: google.cloud.bigquery.ModelReference, options: Mapping[str, Union[str, int, float, Iterable[str]]] = {}, ) -> str: - """Encode the CREATE TEMP MODEL statement for BQML remote model.""" + """Encode the CREATE OR REPLACE MODEL statement for BQML remote model.""" options_sql = 
self.options(**options) - parts = [f"CREATE TEMP MODEL `{self._model_id}`"] + parts = [f"CREATE OR REPLACE MODEL {self._model_id_sql(model_ref)}"] if options_sql: parts.append(options_sql) return "\n".join(parts) diff --git a/bigframes/operations/datetimes.py b/bigframes/operations/datetimes.py index 1b20c2d593..a8a33beb57 100644 --- a/bigframes/operations/datetimes.py +++ b/bigframes/operations/datetimes.py @@ -14,12 +14,14 @@ from __future__ import annotations +from bigframes.core import log_adapter import bigframes.operations as ops import bigframes.operations.base import bigframes.series as series import third_party.bigframes_vendored.pandas.core.indexes.accessor as vendordt +@log_adapter.class_logger class DatetimeMethods( bigframes.operations.base.SeriesMethods, vendordt.DatetimeProperties ): diff --git a/bigframes/operations/strings.py b/bigframes/operations/strings.py index 0545ea34d6..201b19abe8 100644 --- a/bigframes/operations/strings.py +++ b/bigframes/operations/strings.py @@ -18,6 +18,7 @@ from typing import cast, Literal, Optional, Union import bigframes.constants as constants +from bigframes.core import log_adapter import bigframes.dataframe as df import bigframes.operations as ops import bigframes.operations.base @@ -32,6 +33,7 @@ } +@log_adapter.class_logger class StringMethods(bigframes.operations.base.SeriesMethods, vendorstr.StringMethods): __doc__ = vendorstr.StringMethods.__doc__ diff --git a/bigframes/operations/structs.py b/bigframes/operations/structs.py index 506a557709..b2ae98f378 100644 --- a/bigframes/operations/structs.py +++ b/bigframes/operations/structs.py @@ -18,6 +18,7 @@ import ibis.expr.types as ibis_types +from bigframes.core import log_adapter import bigframes.dataframe import bigframes.operations import bigframes.operations.base @@ -38,6 +39,7 @@ def _as_ibis(self, x: ibis_types.Value): return struct_value[name].name(name) +@log_adapter.class_logger class StructAccessor( bigframes.operations.base.SeriesMethods, vendoracessors.StructAccessor ): diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py index 1c52b103fb..d35f838366 100644 --- a/bigframes/pandas/__init__.py +++ b/bigframes/pandas/__init__.py @@ -332,6 +332,7 @@ def merge( "left", "outer", "right", + "cross", ] = "inner", on: Optional[str] = None, *, diff --git a/bigframes/series.py b/bigframes/series.py index 032bdf6c42..c929775a00 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -30,16 +30,13 @@ import bigframes.constants as constants import bigframes.core +from bigframes.core import log_adapter import bigframes.core.block_transforms as block_ops import bigframes.core.blocks as blocks import bigframes.core.groupby as groupby import bigframes.core.indexers import bigframes.core.indexes as indexes -from bigframes.core.ordering import ( - OrderingColumnReference, - OrderingDirection, - STABLE_SORTS, -) +from bigframes.core.ordering import OrderingColumnReference, OrderingDirection import bigframes.core.scalar as scalars import bigframes.core.utils as utils import bigframes.core.window @@ -59,6 +56,7 @@ LevelsType = typing.Union[LevelType, typing.Sequence[LevelType]] +@log_adapter.class_logger class Series(bigframes.operations.base.SeriesMethods, vendored_pandas_series.Series): def __init__(self, *args, **kwargs): self._query_job: Optional[bigquery.QueryJob] = None @@ -271,6 +269,8 @@ def to_pandas( max_download_size: Optional[int] = None, sampling_method: Optional[str] = None, random_state: Optional[int] = None, + *, + ordered: bool = True, ) -> pandas.Series: 
"""Writes Series to pandas Series. @@ -290,6 +290,10 @@ def to_pandas( The seed for the uniform downsampling algorithm. If provided, the uniform method may take longer to execute and require more computation. If set to a value other than None, this will supersede the global config. + ordered (bool, default True): + Determines whether the resulting pandas series will be deterministically ordered. + In some cases, unordered may result in a faster-executing query. + Returns: pandas.Series: A pandas Series with all rows of this Series if the data_sampling_threshold_mb @@ -300,6 +304,7 @@ def to_pandas( max_download_size=max_download_size, sampling_method=sampling_method, random_state=random_state, + ordered=ordered, ) self._set_internal_query_job(query_job) series = df[self._value_column] @@ -475,6 +480,8 @@ def replace( return Series(block.select_column(result_col)) def interpolate(self, method: str = "linear") -> Series: + if method == "pad": + return self.ffill() result = block_ops.interpolate(self._block, method) return Series(result) @@ -1058,7 +1065,6 @@ def sort_values( na_last=(na_position == "last"), ) ], - stable=kind in STABLE_SORTS, ) return Series(block) @@ -1438,6 +1444,22 @@ def map( result_df = self_df.join(map_df, on="series") return result_df[self.name] + def sample( + self, + n: Optional[int] = None, + frac: Optional[float] = None, + *, + random_state: Optional[int] = None, + ) -> Series: + if n is not None and frac is not None: + raise ValueError("Only one of 'n' or 'frac' parameter can be specified.") + + ns = (n,) if n is not None else () + fracs = (frac,) if frac is not None else () + return Series( + self._block._split(ns=ns, fracs=fracs, random_state=random_state)[0] + ) + def __array_ufunc__( self, ufunc: numpy.ufunc, method: str, *inputs, **kwargs ) -> Series: @@ -1478,7 +1500,8 @@ def _slice( ) def _cached(self) -> Series: - return Series(self._block.cached()) + self._set_block(self._block.cached()) + return self def _is_list_like(obj: typing.Any) -> typing_extensions.TypeGuard[typing.Sequence]: diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 2537e81e19..069bd5d260 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -16,10 +16,11 @@ from __future__ import annotations +import datetime +import itertools import logging import os import re -import textwrap import typing from typing import ( Any, @@ -35,7 +36,6 @@ Tuple, Union, ) -import uuid import warnings import google.api_core.client_info @@ -64,6 +64,7 @@ import bigframes._config.bigquery_options as bigquery_options import bigframes.constants as constants +from bigframes.core import log_adapter import bigframes.core as core import bigframes.core.blocks as blocks import bigframes.core.guid as guid @@ -81,6 +82,7 @@ # Even though the ibis.backends.bigquery.registry import is unused, it's needed # to register new and replacement ops with the Ibis BigQuery backend. 
import third_party.bigframes_vendored.ibis.backends.bigquery.registry # noqa +import third_party.bigframes_vendored.ibis.expr.operations as vendored_ibis_ops import third_party.bigframes_vendored.pandas.io.gbq as third_party_pandas_gbq import third_party.bigframes_vendored.pandas.io.parquet as third_party_pandas_parquet import third_party.bigframes_vendored.pandas.io.parsers.readers as third_party_pandas_readers @@ -160,7 +162,7 @@ def __init__( application_name=context.application_name, ) - self._create_and_bind_bq_session() + self._create_bq_datasets() self.ibis_client = typing.cast( ibis_bigquery.Backend, ibis.bigquery.connect( @@ -209,19 +211,12 @@ def _project(self): def __hash__(self): # Stable hash needed to use in expression tree - return hash(self._session_id) - - def _create_and_bind_bq_session(self): - """Create a BQ session and bind the session id with clients to capture BQ activities: - go/bigframes-transient-data""" - job_config = bigquery.QueryJobConfig(create_session=True) - # Make sure the session is a new one, not one associated with another query. - job_config.use_query_cache = False - query_job = self.bqclient.query( - "SELECT 1", job_config=job_config, location=self._location - ) + return hash(str(self._anonymous_dataset)) + + def _create_bq_datasets(self): + """Create and identify dataset(s) for temporary BQ resources.""" + query_job = self.bqclient.query("SELECT 1", location=self._location) query_job.result() # blocks until finished - self._session_id = query_job.session_info.session_id # The anonymous dataset is used by BigQuery to write query results and # session tables. BigQuery DataFrames also writes temp tables directly @@ -234,17 +229,6 @@ def _create_and_bind_bq_session(self): query_destination.dataset_id, ) - self.bqclient.default_query_job_config = bigquery.QueryJobConfig( - connection_properties=[ - bigquery.ConnectionProperty("session_id", self._session_id) - ] - ) - self.bqclient.default_load_job_config = bigquery.LoadJobConfig( - connection_properties=[ - bigquery.ConnectionProperty("session_id", self._session_id) - ] - ) - # Dataset for storing remote functions, which don't yet # support proper session temporary storage yet self._session_dataset = bigquery.Dataset( @@ -253,28 +237,7 @@ def _create_and_bind_bq_session(self): self._session_dataset.location = self._location def close(self): - """Terminated the BQ session, otherwises the session will be terminated automatically after - 24 hours of inactivity or after 7 days.""" - if self._session_id is not None and self.bqclient is not None: - abort_session_query = "CALL BQ.ABORT_SESSION('{}')".format(self._session_id) - try: - query_job = self.bqclient.query(abort_session_query) - query_job.result() # blocks until finished - except google.api_core.exceptions.BadRequest as exc: - # Ignore the exception when the BQ session itself has expired - # https://cloud.google.com/bigquery/docs/sessions-terminating#auto-terminate_a_session - if not exc.message.startswith( - f"Session {self._session_id} has expired and is no longer available." - ): - raise - except google.auth.exceptions.RefreshError: - # The refresh token may itself have been invalidated or expired - # https://developers.google.com/identity/protocols/oauth2#expiration - # Don't raise the exception in this case while closing the - # BigFrames session, so that the end user has a path for getting - # out of a bad session due to unusable credentials. - pass - self._session_id = None + """No-op. 
Temporary resources are deleted after 7 days.""" def read_gbq( self, @@ -325,9 +288,15 @@ def _query_to_destination( # internal issue 303057336. # Since we have a `statement_type == 'SELECT'`, schema should be populated. schema = typing.cast(Iterable[bigquery.SchemaField], dry_run_job.schema) - temp_table = self._create_session_table_empty(api_name, schema, index_cols) + cluster_cols = [ + item.name + for item in schema + if (item.name in index_cols) and _can_cluster_bq(item) + ][:_MAX_CLUSTER_COLUMNS] + temp_table = self._create_empty_temp_table(schema, cluster_cols) job_config = bigquery.QueryJobConfig() + job_config.labels["bigframes-api"] = api_name job_config.destination = temp_table try: @@ -372,12 +341,6 @@ def read_gbq_query( ... pitchSpeed, ... FROM `bigquery-public-data.baseball.games_wide` ... ''') - >>> df.head(2) - pitcherFirstName pitcherLastName pitchSpeed - 0 0 - 1 0 - - [2 rows x 3 columns] Preserve ordering in a query input. @@ -422,7 +385,7 @@ def _read_gbq_query( index_col: Iterable[str] | str = (), col_order: Iterable[str] = (), max_results: Optional[int] = None, - api_name: str, + api_name: str = "read_gbq_query", ) -> dataframe.DataFrame: if isinstance(index_col, str): index_cols = [index_col] @@ -430,9 +393,7 @@ def _read_gbq_query( index_cols = list(index_col) destination, query_job = self._query_to_destination( - query, - index_cols, - api_name=api_name, + query, index_cols, api_name=api_name ) # If there was no destination table, that means the query must have @@ -476,16 +437,6 @@ def read_gbq_table( Read a whole table, with arbitrary ordering or ordering corresponding to the primary key(s). >>> df = bpd.read_gbq_table("bigquery-public-data.ml_datasets.penguins") - >>> df.head(2) - species island culmen_length_mm \\ - 0 Adelie Penguin (Pygoscelis adeliae) Dream 36.6 - 1 Adelie Penguin (Pygoscelis adeliae) Dream 39.8 - - culmen_depth_mm flipper_length_mm body_mass_g sex - 0 18.4 184.0 3475.0 FEMALE - 1 19.1 184.0 4650.0 MALE - - [2 rows x 7 columns] See also: :meth:`Session.read_gbq`. """ @@ -499,7 +450,7 @@ def read_gbq_table( api_name="read_gbq_table", ) - def _read_gbq_table_to_ibis_with_total_ordering( + def _get_snapshot_sql_and_primary_key( self, table_ref: bigquery.table.TableReference, *, @@ -519,7 +470,6 @@ def _read_gbq_table_to_ibis_with_total_ordering( ), None, ) - table_expression = self.ibis_client.table( table_ref.table_id, database=f"{table_ref.project}.{table_ref.dataset_id}", @@ -530,6 +480,11 @@ def _read_gbq_table_to_ibis_with_total_ordering( # the same assumption and use these columns as the total ordering keys. table = self.bqclient.get_table(table_ref) + if table.location.casefold() != self._location.casefold(): + raise ValueError( + f"Current session is in {self._location} but dataset '{table.project}.{table.dataset_id}' is located in {table.location}" + ) + # TODO(b/305264153): Use public properties to fetch primary keys once # added to google-cloud-bigquery. primary_keys = ( @@ -538,22 +493,18 @@ def _read_gbq_table_to_ibis_with_total_ordering( .get("columns") ) - if not primary_keys: - return table_expression, None - else: - # Read from a snapshot since we won't have to copy the table data to create a total ordering. 
- job_config = bigquery.QueryJobConfig() - job_config.labels["bigframes-api"] = api_name - current_timestamp = list( - self.bqclient.query( - "SELECT CURRENT_TIMESTAMP() AS `current_timestamp`", - job_config=job_config, - ).result() - )[0][0] - table_expression = self.ibis_client.sql( - bigframes_io.create_snapshot_sql(table_ref, current_timestamp) - ) - return table_expression, primary_keys + job_config = bigquery.QueryJobConfig() + job_config.labels["bigframes-api"] = api_name + current_timestamp = list( + self.bqclient.query( + "SELECT CURRENT_TIMESTAMP() AS `current_timestamp`", + job_config=job_config, + ).result() + )[0][0] + table_expression = self.ibis_client.sql( + bigframes_io.create_snapshot_sql(table_ref, current_timestamp) + ) + return table_expression, primary_keys def _read_gbq_table( self, @@ -576,10 +527,7 @@ def _read_gbq_table( ( table_expression, total_ordering_cols, - ) = self._read_gbq_table_to_ibis_with_total_ordering( - table_ref, - api_name=api_name, - ) + ) = self._get_snapshot_sql_and_primary_key(table_ref, api_name=api_name) for key in col_order: if key not in table_expression.columns: @@ -592,68 +540,40 @@ def _read_gbq_table( else: index_cols = list(index_col) - hidden_cols: typing.Sequence[str] = () - for key in index_cols: if key not in table_expression.columns: raise ValueError( f"Column `{key}` of `index_col` not found in this table." ) + if col_order: + table_expression = table_expression.select([*index_cols, *col_order]) + # If the index is unique and sortable, then we don't need to generate # an ordering column. ordering = None - is_total_ordering = False - if total_ordering_cols is not None: - # Note: currently, this a table has a total ordering only when the + # Note: currently, a table has a total ordering only when the # primary key(s) are set on a table. The query engine assumes such # columns are unique, even if not enforced. - is_total_ordering = True ordering = orderings.ExpressionOrdering( ordering_value_columns=tuple( - [ - core.OrderingColumnReference(column_id) - for column_id in total_ordering_cols - ] + core.OrderingColumnReference(column_id) + for column_id in total_ordering_cols ), total_ordering_columns=frozenset(total_ordering_cols), ) - - if len(index_cols) != 0: - index_labels = typing.cast(List[Optional[str]], index_cols) - else: - # Use the total_ordering_cols to project offsets to use as the default index. 
- table_expression = table_expression.order_by(index_cols) - default_index_id = guid.generate_guid("bigframes_index_") - default_index_col = ( - ibis.row_number().cast(ibis_dtypes.int64).name(default_index_id) - ) - table_expression = table_expression.mutate( - **{default_index_id: default_index_col} - ) - index_cols = [default_index_id] - index_labels = [None] - elif len(index_cols) != 0: - index_labels = typing.cast(List[Optional[str]], index_cols) - distinct_table = table_expression.select(*index_cols).distinct() - is_unique_sql = f"""WITH full_table AS ( - {self.ibis_client.compile(table_expression)} - ), - distinct_table AS ( - {self.ibis_client.compile(distinct_table)} + column_values = [table_expression[col] for col in table_expression.columns] + array_value = core.ArrayValue.from_ibis( + self, + table_expression, + columns=column_values, + hidden_ordering_columns=[], + ordering=ordering, ) - SELECT (SELECT COUNT(*) FROM full_table) AS `total_count`, - (SELECT COUNT(*) FROM distinct_table) AS `distinct_count` - """ - results, query_job = self._start_query(is_unique_sql) - row = next(iter(results)) - - total_count = row["total_count"] - distinct_count = row["distinct_count"] - is_total_ordering = total_count == distinct_count - + elif len(index_cols) != 0: + # We have index columns, lets see if those are actually total_order_columns ordering = orderings.ExpressionOrdering( ordering_value_columns=tuple( [ @@ -663,147 +583,66 @@ def _read_gbq_table( ), total_ordering_columns=frozenset(index_cols), ) - - # We have a total ordering, so query via "time travel" so that - # the underlying data doesn't mutate. + is_total_ordering = self._check_index_uniqueness( + table_expression, index_cols + ) if is_total_ordering: - # Get the timestamp from the job metadata rather than the query - # text so that the query for determining uniqueness of the ID - # columns can be cached. - current_timestamp = query_job.started - - # The job finished, so we should have a start time. - assert current_timestamp is not None - table_expression = self.ibis_client.sql( - bigframes_io.create_snapshot_sql(table_ref, current_timestamp) + column_values = [ + table_expression[col] for col in table_expression.columns + ] + array_value = core.ArrayValue.from_ibis( + self, + table_expression, + columns=column_values, + hidden_ordering_columns=[], + ordering=ordering, ) else: - # Make sure when we generate an ordering, the row_number() - # coresponds to the index columns. - table_expression = table_expression.order_by(index_cols) - warnings.warn( - textwrap.dedent( - f""" - Got a non-unique index. A consistent ordering is not - guaranteed. DataFrame has {total_count} rows, - but only {distinct_count} distinct index values. - """, - ) - ) - - # When ordering by index columns, apply limit after ordering to - # make limit more predictable. 
-            if max_results is not None:
-                table_expression = table_expression.limit(max_results)
+                array_value = self._create_total_ordering(table_expression)
         else:
-            if max_results is not None:
-                # Apply limit before generating rownums and creating temp table
-                # This makes sure the offsets are valid and limits the number of
-                # rows for which row numbers must be generated
-                table_expression = table_expression.limit(max_results)
-            table_expression, ordering = self._create_sequential_ordering(
-                table=table_expression,
-                api_name=api_name,
-            )
-            hidden_cols = (
-                (ordering.total_order_col.column_id,)
-                if ordering.total_order_col
-                else ()
-            )
-            assert len(ordering.ordering_value_columns) > 0
-            is_total_ordering = True
-            # Block constructor will generate default index if passed empty
-            index_cols = []
-            index_labels = []
-
-        return self._read_gbq_with_ordering(
-            table_expression=table_expression,
-            col_order=col_order,
-            index_cols=index_cols,
-            index_labels=index_labels,
-            hidden_cols=hidden_cols,
-            ordering=ordering,
-            is_total_ordering=is_total_ordering,
-            api_name=api_name,
+            array_value = self._create_total_ordering(table_expression)
+
+        value_columns = [col for col in array_value.column_ids if col not in index_cols]
+        block = blocks.Block(
+            array_value,
+            index_columns=index_cols,
+            column_labels=value_columns,
+            index_labels=index_cols,
         )
+        if max_results:
+            block = block.slice(stop=max_results)
+        df = dataframe.DataFrame(block)
 
-    def _read_gbq_with_ordering(
-        self,
-        table_expression: ibis_types.Table,
-        *,
-        col_order: Iterable[str] = (),
-        col_labels: Iterable[Optional[str]] = (),
-        index_cols: Iterable[str] = (),
-        index_labels: Iterable[Optional[str]] = (),
-        hidden_cols: Iterable[str] = (),
-        ordering: orderings.ExpressionOrdering,
-        is_total_ordering: bool = False,
-        api_name: str,
-    ) -> dataframe.DataFrame:
-        """Internal helper method that loads DataFrame from Google BigQuery given an ordering column.
+        # If the user provided index columns, sort the result by them.
+        if len(index_cols) > 0:
+            df = df.sort_index()
+        return df
 
-        Args:
-            table_expression:
-                an ibis table expression to be executed in BigQuery.
-            col_order:
-                List of BigQuery column ids in the desired order for results DataFrame.
-            col_labels:
-                List of column labels as the column names.
-            index_cols:
-                List of index ids to use as the index or multi-index.
-            index_labels:
-                List of index labels as names of index.
-            hidden_cols:
-                Columns that should be hidden. Ordering columns may (not always) be hidden
-            ordering:
-                Column name to be used for ordering. If not supplied, a default ordering is generated.
-            api_name:
-                The name of the API method.
+    def _check_index_uniqueness(
+        self, table: ibis_types.Table, index_cols: List[str]
+    ) -> bool:
+        distinct_table = table.select(*index_cols).distinct()
+        is_unique_sql = f"""WITH full_table AS (
+            {self.ibis_client.compile(table)}
+        ),
+        distinct_table AS (
+            {self.ibis_client.compile(distinct_table)}
+        )
 
-        Returns:
-            A DataFrame representing results of the query or table.
+        SELECT (SELECT COUNT(*) FROM full_table) AS `total_count`,
+        (SELECT COUNT(*) FROM distinct_table) AS `distinct_count`
         """
-        index_cols, index_labels = list(index_cols), list(index_labels)
-        if len(index_cols) != len(index_labels):
-            raise ValueError(
-                "Needs same number of index labels are there are index columns. "
-                f"Got {len(index_labels)}, expected {len(index_cols)}."
- ) + results, _ = self._start_query(is_unique_sql) + row = next(iter(results)) - # Logic: - # no total ordering, index -> create sequential order, ordered by index, use for both ordering and index - # total ordering, index -> use ordering as ordering, index as index - - # This code block ensures the existence of a total ordering. - column_keys = list(col_order) - if len(column_keys) == 0: - non_value_columns = set([*index_cols, *hidden_cols]) - column_keys = [ - key for key in table_expression.columns if key not in non_value_columns - ] - if not is_total_ordering: - # Rows are not ordered, we need to generate a default ordering and materialize it - table_expression, ordering = self._create_sequential_ordering( - table=table_expression, - index_cols=index_cols, - api_name=api_name, - ) - index_col_values = [table_expression[index_id] for index_id in index_cols] - if not col_labels: - col_labels = column_keys - return self._read_ibis( - table_expression, - index_col_values, - index_labels, - column_keys, - col_labels, - ordering=ordering, - ) + total_count = row["total_count"] + distinct_count = row["distinct_count"] + return total_count == distinct_count def _read_bigquery_load_job( self, filepath_or_buffer: str | IO["bytes"], - table: bigquery.Table, + table: Union[bigquery.Table, bigquery.TableReference], *, job_config: bigquery.LoadJobConfig, index_col: Iterable[str] | str = (), @@ -833,49 +672,24 @@ def _read_bigquery_load_job( ) self._start_generic_job(load_job) + table_id = f"{table.project}.{table.dataset_id}.{table.table_id}" + + # Update the table expiration so we aren't limited to the default 24 + # hours of the anonymous dataset. + table_expiration = bigquery.Table(table_id) + table_expiration.expires = ( + datetime.datetime.now(datetime.timezone.utc) + constants.DEFAULT_EXPIRATION + ) + self.bqclient.update_table(table_expiration, ["expires"]) # The BigQuery REST API for tables.get doesn't take a session ID, so we # can't get the schema for a temp table that way. return self.read_gbq_table( - f"{table.project}.{table.dataset_id}.{table.table_id}", + table_id, index_col=index_col, col_order=col_order, ) - def _read_ibis( - self, - table_expression: ibis_types.Table, - index_cols: Iterable[ibis_types.Value], - index_labels: Iterable[blocks.Label], - column_keys: Iterable[str], - column_labels: Iterable[blocks.Label], - ordering: orderings.ExpressionOrdering, - ) -> dataframe.DataFrame: - """Turns a table expression (plus index column) into a DataFrame.""" - - columns = list(index_cols) - for key in column_keys: - if key not in table_expression.columns: - raise ValueError(f"Column '{key}' not found in this table.") - columns.append(table_expression[key]) - - non_hidden_ids = [col.get_name() for col in columns] - hidden_ordering_columns = [] - for ref in ordering.all_ordering_columns: - if ref.column_id not in non_hidden_ids: - hidden_ordering_columns.append(table_expression[ref.column_id]) - - block = blocks.Block( - core.ArrayValue.from_ibis( - self, table_expression, columns, hidden_ordering_columns, ordering - ), - index_columns=[index_col.get_name() for index_col in index_cols], - column_labels=column_labels, - index_labels=index_labels, - ) - - return dataframe.DataFrame(block) - def read_gbq_model(self, model_name: str): """Loads a BigQuery ML model from BigQuery. 
@@ -974,7 +788,7 @@ def _read_pandas( job_config.clustering_fields = cluster_cols job_config.labels = {"bigframes-api": api_name} - load_table_destination = self._create_session_table() + load_table_destination = bigframes_io.random_table(self._anonymous_dataset) load_job = self.bqclient.load_table_from_dataframe( pandas_dataframe_copy, load_table_destination, @@ -987,8 +801,9 @@ def _read_pandas( total_ordering_columns=frozenset([ordering_col]), integer_encoding=IntegerEncoding(True, is_sequential=True), ) - table_expression = self.ibis_client.sql( - f"SELECT * FROM `{load_table_destination.table_id}`" + table_expression = self.ibis_client.table( + load_table_destination.table_id, + database=f"{load_table_destination.project}.{load_table_destination.dataset_id}", ) # b/297590178 Potentially a bug in bqclient.load_table_from_dataframe(), that only when the DF is empty, the index columns disappear in table_expression. @@ -997,17 +812,26 @@ def _read_pandas( ): new_idx_ids, idx_labels = [], [] - df = self._read_gbq_with_ordering( - table_expression=table_expression, - col_labels=col_labels, - index_cols=new_idx_ids, - index_labels=idx_labels, - hidden_cols=(ordering_col,), + column_values = [ + table_expression[col] + for col in table_expression.columns + if col != ordering_col + ] + array_value = core.ArrayValue.from_ibis( + self, + table_expression, + columns=column_values, + hidden_ordering_columns=[table_expression[ordering_col]], ordering=ordering, - is_total_ordering=True, - api_name=api_name, ) - return df + + block = blocks.Block( + array_value, + index_columns=new_idx_ids, + column_labels=col_labels, + index_labels=idx_labels, + ) + return dataframe.DataFrame(block) def read_csv( self, @@ -1039,7 +863,7 @@ def read_csv( encoding: Optional[str] = None, **kwargs, ) -> dataframe.DataFrame: - table = bigquery.Table(self._create_session_table()) + table = bigframes_io.random_table(self._anonymous_dataset) if engine is not None and engine == "bigquery": if any(param is not None for param in (dtype, names)): @@ -1153,7 +977,7 @@ def read_parquet( # Note: "engine" is omitted because it is redundant. Loading a table # from a pandas DataFrame will just create another parquet file + load # job anyway. - table = bigquery.Table(self._create_session_table()) + table = bigframes_io.random_table(self._anonymous_dataset) job_config = bigquery.LoadJobConfig() job_config.create_disposition = bigquery.CreateDisposition.CREATE_IF_NEEDED @@ -1176,7 +1000,7 @@ def read_json( engine: Literal["ujson", "pyarrow", "bigquery"] = "ujson", **kwargs, ) -> dataframe.DataFrame: - table = bigquery.Table(self._create_session_table()) + table = bigframes_io.random_table(self._anonymous_dataset) if engine == "bigquery": @@ -1266,103 +1090,85 @@ def _check_file_size(self, filepath: str): "for large files to avoid loading the file into local memory." ) - def _create_session_table(self) -> bigquery.TableReference: - table_name = f"{uuid.uuid4().hex}" - dataset = bigquery.Dataset( - bigquery.DatasetReference(self.bqclient.project, "_SESSION") - ) - return dataset.table(table_name) - - def _create_session_table_empty( + def _create_empty_temp_table( self, - api_name: str, schema: Iterable[bigquery.SchemaField], cluster_cols: List[str], ) -> bigquery.TableReference: # Can't set a table in _SESSION as destination via query job API, so we # run DDL, instead. 
- table = self._create_session_table() - schema_sql = bigframes_io.bq_schema_to_sql(schema) - - clusterable_cols = [ - col.name - for col in schema - if col.name in cluster_cols and _can_cluster_bq(col) - ][:_MAX_CLUSTER_COLUMNS] - - if clusterable_cols: - cluster_cols_sql = ", ".join( - f"`{cluster_col}`" for cluster_col in clusterable_cols - ) - cluster_sql = f"CLUSTER BY {cluster_cols_sql}" - else: - cluster_sql = "" - - ddl_text = f""" - CREATE TEMP TABLE - `_SESSION`.`{table.table_id}` - ({schema_sql}) - {cluster_sql} - """ - - job_config = bigquery.QueryJobConfig() - - # Include a label so that Dataplex Lineage can identify temporary - # tables that BigQuery DataFrames creates. Googlers: See internal issue - # 296779699. We're labeling the job instead of the table because - # otherwise we get `BadRequest: 400 OPTIONS on temporary tables are not - # supported`. - job_config.labels = {"source": "bigquery-dataframes-temp"} - job_config.labels["bigframes-api"] = api_name - - _, query_job = self._start_query(ddl_text, job_config=job_config) + dataset = self._anonymous_dataset + expiration = ( + datetime.datetime.now(datetime.timezone.utc) + constants.DEFAULT_EXPIRATION + ) - # Use fully-qualified name instead of `_SESSION` name so that the - # created table can be used as the destination table. - return query_job.destination + table = bigframes_io.create_temp_table( + self.bqclient, + dataset, + expiration, + schema=schema, + cluster_columns=cluster_cols, + ) + return bigquery.TableReference.from_string(table) - def _create_sequential_ordering( + def _create_total_ordering( self, table: ibis_types.Table, - index_cols: Iterable[str] = (), - api_name: str = "", - ) -> Tuple[ibis_types.Table, orderings.ExpressionOrdering]: + ) -> core.ArrayValue: # Since this might also be used as the index, don't use the default # "ordering ID" name. 
- default_ordering_name = guid.generate_guid("bigframes_ordering_") - default_ordering_col = ( - ibis.row_number().cast(ibis_dtypes.int64).name(default_ordering_name) + ordering_hash_part = guid.generate_guid("bigframes_ordering_") + ordering_rand_part = guid.generate_guid("bigframes_ordering_") + + str_values = list( + map(lambda col: _convert_to_string(table[col]), table.columns) ) - table = table.mutate(**{default_ordering_name: default_ordering_col}) - table_ref = self._ibis_to_session_table( - table, - cluster_cols=list(index_cols) + [default_ordering_name], - api_name=api_name, + full_row_str = ( + str_values[0].concat(*str_values[1:]) + if len(str_values) > 1 + else str_values[0] ) - table = self.ibis_client.table( - f"{table_ref.project}.{table_ref.dataset_id}.{table_ref.table_id}" + full_row_hash = full_row_str.hash().name(ordering_hash_part) + # Used to disambiguate between identical rows (which will have identical hash) + random_value = ibis.random().name(ordering_rand_part) + + original_column_ids = table.columns + table_with_ordering = table.select( + itertools.chain(original_column_ids, [full_row_hash, random_value]) ) - ordering_reference = core.OrderingColumnReference(default_ordering_name) + + ordering_ref1 = core.OrderingColumnReference(ordering_hash_part) + ordering_ref2 = core.OrderingColumnReference(ordering_rand_part) ordering = orderings.ExpressionOrdering( - ordering_value_columns=tuple([ordering_reference]), - total_ordering_columns=frozenset([default_ordering_name]), - integer_encoding=IntegerEncoding(is_encoded=True, is_sequential=True), + ordering_value_columns=(ordering_ref1, ordering_ref2), + total_ordering_columns=frozenset([ordering_hash_part, ordering_rand_part]), + ) + columns = [table_with_ordering[col] for col in original_column_ids] + hidden_columns = [ + table_with_ordering[ordering_hash_part], + table_with_ordering[ordering_rand_part], + ] + return core.ArrayValue.from_ibis( + self, + table_with_ordering, + columns, + hidden_ordering_columns=hidden_columns, + ordering=ordering, ) - return table, ordering - def _ibis_to_session_table( + def _ibis_to_temp_table( self, table: ibis_types.Table, cluster_cols: Iterable[str], api_name: str, ) -> bigquery.TableReference: - desination, _ = self._query_to_destination( + destination, _ = self._query_to_destination( self.ibis_client.compile(table), index_cols=list(cluster_cols), api_name=api_name, ) # There should always be a destination table for this query type. - return typing.cast(bigquery.TableReference, desination) + return typing.cast(bigquery.TableReference, destination) def remote_function( self, @@ -1478,15 +1284,37 @@ def read_gbq_function( The return type of the function must be explicitly specified in the function's original definition even if not otherwise required. + BigQuery Utils provides many public functions under the ``bqutil`` project on Google Cloud Platform project + (See: https://github.com/GoogleCloudPlatform/bigquery-utils/tree/master/udfs#using-the-udfs). + You can checkout Community UDFs to use community-contributed functions. + (See: https://github.com/GoogleCloudPlatform/bigquery-utils/tree/master/udfs/community#community-udfs). + **Examples:** + Use the ``cw_lower_case_ascii_only`` function from Community UDFs. 
+ (https://github.com/GoogleCloudPlatform/bigquery-utils/blob/master/udfs/community/cw_lower_case_ascii_only.sqlx) + >>> import bigframes.pandas as bpd >>> bpd.options.display.progress_bar = None - >>> function_name = "bqutil.fn.cw_lower_case_ascii_only" - >>> func = bpd.read_gbq_function(function_name=function_name) - >>> func.bigframes_remote_function - 'bqutil.fn.cw_lower_case_ascii_only' + >>> df = bpd.DataFrame({'id': [1, 2, 3], 'name': ['AURÉLIE', 'CÉLESTINE', 'DAPHNÉ']}) + >>> df + id name + 0 1 AURÉLIE + 1 2 CÉLESTINE + 2 3 DAPHNÉ + + [3 rows x 2 columns] + + >>> func = bpd.read_gbq_function("bqutil.fn.cw_lower_case_ascii_only") + >>> df1 = df.assign(new_name=df['name'].apply(func)) + >>> df1 + id name new_name + 0 1 AURÉLIE aurÉlie + 1 2 CÉLESTINE cÉlestine + 2 3 DAPHNÉ daphnÉ + + [3 rows x 3 columns] Args: function_name (str): @@ -1520,6 +1348,10 @@ def _start_query( Starts query job and waits for results. """ job_config = self._prepare_job_config(job_config) + api_methods = log_adapter.get_and_reset_api_methods() + job_config.labels = bigframes_io.create_job_configs_labels( + job_configs_labels=job_config.labels, api_methods=api_methods + ) query_job = self.bqclient.query(sql, job_config=job_config) opts = bigframes.options.display @@ -1554,6 +1386,8 @@ def _prepare_job_config( ) -> bigquery.QueryJobConfig: if job_config is None: job_config = self.bqclient.default_query_job_config + if job_config is None: + job_config = bigquery.QueryJobConfig() if bigframes.options.compute.maximum_bytes_billed is not None: job_config.maximum_bytes_billed = ( bigframes.options.compute.maximum_bytes_billed @@ -1583,3 +1417,23 @@ def _can_cluster_bq(field: bigquery.SchemaField): "BOOL", "BOOLEAN", ) + + +def _convert_to_string(column: ibis_types.Column) -> ibis_types.StringColumn: + col_type = column.type() + if ( + col_type.is_numeric() + or col_type.is_boolean() + or col_type.is_binary() + or col_type.is_temporal() + ): + result = column.cast(ibis_dtypes.String(nullable=True)) + elif col_type.is_geospatial(): + result = typing.cast(ibis_types.GeoSpatialColumn, column).as_text() + elif col_type.is_string(): + result = column + else: + # TO_JSON_STRING works with all data types, but isn't the most efficient + # Needed for JSON, STRUCT and ARRAY datatypes + result = vendored_ibis_ops.ToJsonString(column).to_expr() # type: ignore + return typing.cast(ibis_types.StringColumn, result) diff --git a/bigframes/session/_io/bigquery.py b/bigframes/session/_io/bigquery.py index 06d240fec6..dae73301e7 100644 --- a/bigframes/session/_io/bigquery.py +++ b/bigframes/session/_io/bigquery.py @@ -17,17 +17,36 @@ from __future__ import annotations import datetime +import itertools import textwrap import types -from typing import Dict, Iterable, Union +from typing import Dict, Iterable, Optional, Sequence, Union import uuid import google.cloud.bigquery as bigquery IO_ORDERING_ID = "bqdf_row_nums" +MAX_LABELS_COUNT = 64 TEMP_TABLE_PREFIX = "bqdf{date}_{random_id}" +def create_job_configs_labels( + job_configs_labels: Optional[Dict[str, str]], + api_methods: Sequence[str], +) -> Dict[str, str]: + if job_configs_labels is None: + job_configs_labels = {} + + labels = list( + itertools.chain( + job_configs_labels.keys(), + (f"recent-bigframes-api-{i}" for i in range(len(api_methods))), + ) + ) + values = list(itertools.chain(job_configs_labels.values(), api_methods)) + return dict(zip(labels[:MAX_LABELS_COUNT], values[:MAX_LABELS_COUNT])) + + def create_export_csv_statement( table_id: str, 
uri: str, field_delimiter: str, header: bool ) -> str: @@ -121,11 +140,17 @@ def create_temp_table( bqclient: bigquery.Client, dataset: bigquery.DatasetReference, expiration: datetime.datetime, + *, + schema: Optional[Iterable[bigquery.SchemaField]] = None, + cluster_columns: Optional[list[str]] = None, ) -> str: """Create an empty table with an expiration in the desired dataset.""" table_ref = random_table(dataset) destination = bigquery.Table(table_ref) destination.expires = expiration + destination.schema = schema + if cluster_columns: + destination.clustering_fields = cluster_columns bqclient.create_table(destination) return f"{table_ref.project}.{table_ref.dataset_id}.{table_ref.table_id}" diff --git a/bigframes/version.py b/bigframes/version.py index 0a5df27479..5a94f72649 100644 --- a/bigframes/version.py +++ b/bigframes/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "0.13.0" +__version__ = "0.14.0" diff --git a/notebooks/generative_ai/bq_dataframes_llm_kmeans.ipynb b/notebooks/generative_ai/bq_dataframes_llm_kmeans.ipynb new file mode 100644 index 0000000000..46c4955288 --- /dev/null +++ b/notebooks/generative_ai/bq_dataframes_llm_kmeans.ipynb @@ -0,0 +1,690 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Copyright 2023 Google LLC\n", + "#\n", + "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Use BigQuery DataFrames to cluster and characterize complaints\n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + "
\n", + " \n", + " \"Colab Run in Colab\n", + " \n", + " \n", + " \n", + " \"GitHub\n", + " View on GitHub\n", + " \n", + " \n", + " \n", + " \"Vertex\n", + " Open in Vertex AI Workbench\n", + " \n", + "
" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Overview\n", + "\n", + "The goal of this notebook is to demonstrate a comment characterization algorithm for an online business. We will accomplish this using [Google's PaLM 2](https://ai.google/discover/palm2/) and [KMeans clustering](https://en.wikipedia.org/wiki/K-means_clustering) in three steps:\n", + "\n", + "1. Use PaLM2TextEmbeddingGenerator to [generate text embeddings](https://cloud.google.com/vertex-ai/docs/generative-ai/embeddings/get-text-embeddings) for each of 10000 complaints sent to an online bank. If you're not familiar with what a text embedding is, it's a list of numbers that are like coordinates in an imaginary \"meaning space\" for sentences. (It's like [word embeddings](https://en.wikipedia.org/wiki/Word_embedding), but for more general text.) The important point for our purposes is that similar sentences are close to each other in this imaginary space.\n", + "2. Use KMeans clustering to group together complaints whose text embeddings are near to eachother. This will give us sets of similar complaints, but we don't yet know _why_ these complaints are similar.\n", + "3. Prompt PaLM2TextGenerator in English asking what the difference is between the groups of complaints that we got. Thanks to the power of modern LLMs, the response might give us a very good idea of what these complaints are all about, but remember to [\"understand the limits of your dataset and model.\"](https://ai.google/responsibility/responsible-ai-practices/#:~:text=Understand%20the%20limitations%20of%20your%20dataset%20and%20model)\n", + "\n", + "We will tie these pieces together in Python using BigQuery DataFrames. [Click here](https://cloud.google.com/bigquery/docs/dataframes-quickstart) to learn more about BigQuery DataFrames!" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Dataset\n", + "\n", + "This notebook uses the [CFPB Consumer Complaint Database](https://console.cloud.google.com/marketplace/product/cfpb/complaint-database)." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Costs\n", + "\n", + "This tutorial uses billable components of Google Cloud:\n", + "\n", + "* BigQuery (compute)\n", + "* BigQuery ML\n", + "* Generative AI support on Vertex AI\n", + "\n", + "Learn about [BigQuery compute pricing](https://cloud.google.com/bigquery/pricing#analysis_pricing_models), [Generative AI support on Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing#generative_ai_models),\n", + "and [BigQuery ML pricing](https://cloud.google.com/bigquery/pricing#bqml),\n", + "and use the [Pricing Calculator](https://cloud.google.com/products/calculator/)\n", + "to generate a cost estimate based on your projected usage." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Before you begin\n", + "\n", + "Complete the tasks in this section to set up your environment." 
+ ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Set up your Google Cloud project\n", + "\n", + "**The following steps are required, regardless of your notebook environment.**\n", + "\n", + "1. [Select or create a Google Cloud project](https://console.cloud.google.com/cloud-resource-manager). When you first create an account, you get a $300 credit towards your compute/storage costs.\n", + "\n", + "2. [Make sure that billing is enabled for your project](https://cloud.google.com/billing/docs/how-to/modify-project).\n", + "\n", + "3. [Click here](https://console.cloud.google.com/flows/enableapi?apiid=bigquery.googleapis.com,bigqueryconnection.googleapis.com,run.googleapis.com,artifactregistry.googleapis.com,cloudbuild.googleapis.com,cloudresourcemanager.googleapis.com) to enable the following APIs:\n", + "\n", + " * BigQuery API\n", + " * BigQuery Connection API\n", + " * Cloud Run API\n", + " * Artifact Registry API\n", + " * Cloud Build API\n", + " * Cloud Resource Manager API\n", + " * Vertex AI API\n", + "\n", + "4. If you are running this notebook locally, install the [Cloud SDK](https://cloud.google.com/sdk)." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Set your project ID\n", + "\n", + "**If you don't know your project ID**, see the support page: [Locate the project ID](https://support.google.com/googleapi/answer/7014113)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# set your project ID below\n", + "PROJECT_ID = \"\" # @param {type:\"string\"}\n", + "\n", + "# Set the project id in gcloud\n", + "! gcloud config set project {PROJECT_ID}" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Set the region\n", + "\n", + "You can also change the `REGION` variable used by BigQuery. Learn more about [BigQuery regions](https://cloud.google.com/bigquery/docs/locations#supported_locations)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "REGION = \"US\" # @param {type: \"string\"}" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Authenticate your Google Cloud account\n", + "\n", + "Depending on your Jupyter environment, you might have to manually authenticate. Follow the relevant instructions below." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Vertex AI Workbench**\n", + "\n", + "Do nothing, you are already authenticated." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Local JupyterLab instance**\n", + "\n", + "Uncomment and run the following cell:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# ! 
gcloud auth login" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Colab**\n", + "\n", + "Uncomment and run the following cell:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# from google.colab import auth\n", + "# auth.authenticate_user()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If you want to reset the location of the created DataFrame or Series objects, reset the session by executing `bf.close_session()`. After that, you can reuse `bf.options.bigquery.location` to specify another location." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Connect to Vertex AI\n", + "\n", + "In order to use PaLM2TextGenerator, we will need to set up a [cloud resource connection](https://cloud.google.com/bigquery/docs/create-cloud-resource-connection)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from google.cloud import bigquery_connection_v1 as bq_connection\n", + "\n", + "CONN_NAME = \"bqdf-llm\"\n", + "\n", + "client = bq_connection.ConnectionServiceClient()\n", + "new_conn_parent = f\"projects/{PROJECT_ID}/locations/{REGION}\"\n", + "exists_conn_parent = f\"projects/{PROJECT_ID}/locations/{REGION}/connections/{CONN_NAME}\"\n", + "cloud_resource_properties = bq_connection.CloudResourceProperties({})\n", + "\n", + "try:\n", + " request = client.get_connection(\n", + " request=bq_connection.GetConnectionRequest(name=exists_conn_parent)\n", + " )\n", + " CONN_SERVICE_ACCOUNT = f\"serviceAccount:{request.cloud_resource.service_account_id}\"\n", + "except Exception:\n", + " connection = bq_connection.types.Connection(\n", + " {\"friendly_name\": CONN_NAME, \"cloud_resource\": cloud_resource_properties}\n", + " )\n", + " request = bq_connection.CreateConnectionRequest(\n", + " {\n", + " \"parent\": new_conn_parent,\n", + " \"connection_id\": CONN_NAME,\n", + " \"connection\": connection,\n", + " }\n", + " )\n", + " response = client.create_connection(request)\n", + " CONN_SERVICE_ACCOUNT = (\n", + " f\"serviceAccount:{response.cloud_resource.service_account_id}\"\n", + " )\n", + "print(CONN_SERVICE_ACCOUNT)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Set permissions for the service account\n", + "\n", + "The resource connection service account requires certain project-level permissions:\n", + " - `roles/aiplatform.user` and `roles/bigquery.connectionUser`: These roles are required for the connection to create a model definition using the LLM model in Vertex AI ([documentation](https://cloud.google.com/bigquery/docs/generate-text#give_the_service_account_access)).\n", + " - `roles/run.invoker`: This role is required for the connection to have read-only access to Cloud Run services that back custom/remote functions ([documentation](https://cloud.google.com/bigquery/docs/remote-functions#grant_permission_on_function)).\n", + "\n", + "Set these permissions by running the following `gcloud` commands:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!gcloud projects add-iam-policy-binding {PROJECT_ID} --condition=None --no-user-output-enabled --member={CONN_SERVICE_ACCOUNT} 
--role='roles/bigquery.connectionUser'\n", + "!gcloud projects add-iam-policy-binding {PROJECT_ID} --condition=None --no-user-output-enabled --member={CONN_SERVICE_ACCOUNT} --role='roles/aiplatform.user'\n", + "!gcloud projects add-iam-policy-binding {PROJECT_ID} --condition=None --no-user-output-enabled --member={CONN_SERVICE_ACCOUNT} --role='roles/run.invoker'" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we are ready to use BigQuery DataFrames!" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "xckgWno6ouHY" + }, + "source": [ + "## Step 1: Text embedding " + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Project Setup" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "R7STCS8xB5d2" + }, + "outputs": [], + "source": [ + "import bigframes.pandas as bf\n", + "\n", + "bf.options.bigquery.project = PROJECT_ID\n", + "bf.options.bigquery.location = REGION" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "v6FGschEowht" + }, + "source": [ + "Data Input - read the data from a publicly available BigQuery dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "zDSwoBo1CU3G" + }, + "outputs": [], + "source": [ + "input_df = bf.read_gbq(\"bigquery-public-data.cfpb_complaints.complaint_database\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "tYDoaKgJChiq" + }, + "outputs": [], + "source": [ + "issues_df = input_df[[\"consumer_complaint_narrative\"]].dropna()\n", + "issues_df.head(n=5) # View the first five complaints" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Download 10000 complaints to use with PaLM2TextEmbeddingGenerator" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "OltYSUEcsSOW" + }, + "outputs": [], + "source": [ + "# Choose 10,000 complaints randomly and store them in a column in a DataFrame\n", + "downsampled_issues_df = issues_df.sample(n=10000)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "Wl2o-NYMoygb" + }, + "source": [ + "Generate the text embeddings" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "li38q8FzDDMu" + }, + "outputs": [], + "source": [ + "from bigframes.ml.llm import PaLM2TextEmbeddingGenerator\n", + "\n", + "model = PaLM2TextEmbeddingGenerator() # No connection id needed" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "cOuSOQ5FDewD" + }, + "outputs": [], + "source": [ + "# Will take ~3 minutes to compute the embeddings\n", + "predicted_embeddings = model.predict(downsampled_issues_df)\n", + "# Notice the lists of numbers that are our text embeddings for each complaint\n", + "predicted_embeddings.head() " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "4H_etYfsEOFP" + }, + "outputs": [], + "source": [ + "# Join the complaints with their embeddings in the same DataFrame\n", + "combined_df = downsampled_issues_df.join(predicted_embeddings)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We now have the complaints and their text embeddings as two columns in our combined_df. Recall that complaints with numerically similar text embeddings should have similar meanings semantically. 
We will now group similar complaints together." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "OUZ3NNbzo1Tb" + }, + "source": [ + "## Step 2: KMeans clustering" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "AhNTnEC5FRz2" + }, + "outputs": [], + "source": [ + "from bigframes.ml.cluster import KMeans\n", + "\n", + "cluster_model = KMeans(n_clusters=10) # We will divide our complaints into 10 groups" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Perform KMeans clustering" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "6poSxh-fGJF7" + }, + "outputs": [], + "source": [ + "# Use KMeans clustering to calculate our groups. Will take ~3 minutes.\n", + "cluster_model.fit(combined_df[[\"text_embedding\"]])\n", + "clustered_result = cluster_model.predict(combined_df[[\"text_embedding\"]])\n", + "# Notice the CENTROID_ID column, which is the ID number of the group that\n", + "# each complaint belongs to.\n", + "clustered_result.head(n=5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Join the group number to the complaints and their text embeddings\n", + "combined_clustered_result = combined_df.join(clustered_result)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Our dataframe combined_clustered_result now has three columns: the complaints, their text embeddings, and an ID from 1-10 (inclusive) indicating which semantically similar group they belong to." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "21rNsFMHo8hO" + }, + "source": [ + "## Step 3: Summarize the complaints" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Build prompts - we will choose just two of our categories and prompt PaLM2TextGenerator to identify their salient characteristics. The prompt is natural language in a python string." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "2E7wXM_jGqo6" + }, + "outputs": [], + "source": [ + "# Using bigframes, with syntax identical to pandas,\n", + "# filter out the first and second groups\n", + "cluster_1_result = combined_clustered_result[\n", + " combined_clustered_result[\"CENTROID_ID\"] == 1\n", + "][[\"consumer_complaint_narrative\"]]\n", + "cluster_1_result_pandas = cluster_1_result.head(5).to_pandas()\n", + "\n", + "cluster_2_result = combined_clustered_result[\n", + " combined_clustered_result[\"CENTROID_ID\"] == 2\n", + "][[\"consumer_complaint_narrative\"]]\n", + "cluster_2_result_pandas = cluster_2_result.head(5).to_pandas()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ZNDiueI9IP5e" + }, + "outputs": [], + "source": [ + "# Build plain-text prompts to send to PaLM 2. Use only 5 complaints from each group.\n", + "prompt1 = 'comment list 1:\\n'\n", + "for i in range(5):\n", + " prompt1 += str(i + 1) + '. ' + \\\n", + " cluster_1_result_pandas[\"consumer_complaint_narrative\"].iloc[i] + '\\n'\n", + "\n", + "prompt2 = 'comment list 2:\\n'\n", + "for i in range(5):\n", + " prompt2 += str(i + 1) + '. 
' + \\\n", + " cluster_2_result_pandas[\"consumer_complaint_narrative\"].iloc[i] + '\\n'\n", + "\n", + "print(prompt1)\n", + "print(prompt2)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "BfHGJLirzSvH" + }, + "outputs": [], + "source": [ + "# The plain English request we will make of PaLM 2\n", + "prompt = (\n", + " \"Please highlight the most obvious difference between\"\n", + " \"the two lists of comments:\\n\" + prompt1 + prompt2\n", + ")\n", + "print(prompt)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Get a response from PaLM 2 LLM by making a call to Vertex AI using our connection." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "mL5P0_3X04dE" + }, + "outputs": [], + "source": [ + "from bigframes.ml.llm import PaLM2TextGenerator\n", + "\n", + "session = bf.get_global_session()\n", + "connection = f\"{PROJECT_ID}.{REGION}.{CONN_NAME}\"\n", + "q_a_model = PaLM2TextGenerator(session=session, connection_name=connection)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ICWHsqAW1FNk" + }, + "outputs": [], + "source": [ + "# Make a DataFrame containing only a single row with our prompt for PaLM 2\n", + "df = bf.DataFrame({\"prompt\": [prompt]})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "gB7e1LXU1pst" + }, + "outputs": [], + "source": [ + "# Send the request for PaLM 2 to generate a response to our prompt\n", + "major_difference = q_a_model.predict(df)\n", + "# PaLM 2's response is the only row in the dataframe result \n", + "major_difference[\"ml_generate_text_llm_result\"].iloc[0]" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We now see PaLM2TextGenerator's characterization of the different comment groups. Thanks for using BigQuery DataFrames!" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.16" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/noxfile.py b/noxfile.py index 34b055de44..3dd23ba04f 100644 --- a/noxfile.py +++ b/noxfile.py @@ -609,6 +609,7 @@ def notebook(session): # our test infrastructure. "notebooks/getting_started/getting_started_bq_dataframes.ipynb", "notebooks/generative_ai/bq_dataframes_llm_code_generation.ipynb", + "notebooks/generative_ai/bq_dataframes_llm_kmeans.ipynb", "notebooks/regression/bq_dataframes_ml_linear_regression.ipynb", "notebooks/generative_ai/bq_dataframes_ml_drug_name_generation.ipynb", "notebooks/vertex_sdk/sdk2_bigframes_pytorch.ipynb", diff --git a/owlbot.py b/owlbot.py index be30eea5c2..082970018d 100644 --- a/owlbot.py +++ b/owlbot.py @@ -99,6 +99,13 @@ "BigQuery DataFrames provides DataFrame APIs on the BigQuery engine", ) +# Update the contributing guide to reflect some differences in this repo. 
+s.replace( + ["CONTRIBUTING.rst"], + re.escape("blacken"), + "format", +) + # ---------------------------------------------------------------------------- # Samples templates # ---------------------------------------------------------------------------- @@ -110,5 +117,3 @@ # ---------------------------------------------------------------------------- s.shell.run(["nox", "-s", "format"], hide_output=False) -for noxfile in REPO_ROOT.glob("samples/**/noxfile.py"): - s.shell.run(["nox", "-s", "blacken"], cwd=noxfile.parent, hide_output=False) diff --git a/samples/snippets/pandas_methods_test.py b/samples/snippets/pandas_methods_test.py index 1f472d6346..bd8e29c003 100644 --- a/samples/snippets/pandas_methods_test.py +++ b/samples/snippets/pandas_methods_test.py @@ -22,13 +22,20 @@ def test_bigquery_dataframes_pandas_methods(): bq_df = bpd.read_gbq(query_or_table) # Inspect one of the columns (or series) of the DataFrame: - bq_df["body_mass_g"].head(10) + bq_df["body_mass_g"] # Compute the mean of this series: average_body_mass = bq_df["body_mass_g"].mean() print(f"average_body_mass: {average_body_mass}") - # Calculate the mean body_mass_g by species using the groupby operation: - bq_df["body_mass_g"].groupby(by=bq_df["species"]).mean().head() + # Find the heaviest species using the groupby operation to calculate the + # mean body_mass_g: + ( + bq_df["body_mass_g"] + .groupby(by=bq_df["species"]) + .mean() + .sort_values(ascending=False) + .head(10) + ) # [END bigquery_dataframes_pandas_methods] assert average_body_mass is not None diff --git a/tests/system/large/ml/test_cluster.py b/tests/system/large/ml/test_cluster.py index eae6896669..f01116665f 100644 --- a/tests/system/large/ml/test_cluster.py +++ b/tests/system/large/ml/test_cluster.py @@ -16,7 +16,7 @@ import pytest from bigframes.ml import cluster -from tests.system.utils import assert_pandas_df_equal_ignore_ordering +from tests.system.utils import assert_pandas_df_equal @pytest.mark.flaky(retries=2, delay=120) @@ -105,7 +105,7 @@ def test_cluster_configure_fit_score_predict( index=pd.Index(["test1", "test2", "test3", "test4"], dtype="string[pyarrow]"), ) expected.index.name = "observation" - assert_pandas_df_equal_ignore_ordering(result, expected) + assert_pandas_df_equal(result, expected, ignore_order=True) # save, load, check n_clusters to ensure configuration was kept reloaded_model = model.to_gbq( diff --git a/tests/system/large/ml/test_pipeline.py b/tests/system/large/ml/test_pipeline.py index 6874a9f301..3e56954058 100644 --- a/tests/system/large/ml/test_pipeline.py +++ b/tests/system/large/ml/test_pipeline.py @@ -24,7 +24,7 @@ pipeline, preprocessing, ) -from tests.system.utils import assert_pandas_df_equal_ignore_ordering +from tests.system.utils import assert_pandas_df_equal def test_pipeline_linear_regression_fit_score_predict( @@ -555,7 +555,7 @@ def test_pipeline_standard_scaler_kmeans_fit_score_predict( ), ) expected.index.name = "observation" - assert_pandas_df_equal_ignore_ordering(result, expected) + assert_pandas_df_equal(result, expected, ignore_order=True) def test_pipeline_columntransformer_fit_predict(session, penguins_df_default_index): diff --git a/tests/system/large/test_remote_function.py b/tests/system/large/test_remote_function.py index c8f8f66eba..6ed3e6511a 100644 --- a/tests/system/large/test_remote_function.py +++ b/tests/system/large/test_remote_function.py @@ -32,7 +32,7 @@ get_cloud_function_name, get_remote_function_locations, ) -from tests.system.utils import assert_pandas_df_equal_ignore_ordering 
+from tests.system.utils import assert_pandas_df_equal # Use this to control the number of cloud functions being deleted in a single # test session. This should help soften the spike of the number of mutations per @@ -357,7 +357,7 @@ def square(x): pd_result_col = pd_result_col.astype(pandas.Int64Dtype()) pd_result = pd_int64_col_filtered.to_frame().assign(result=pd_result_col) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) finally: # clean up the gcp assets created for the remote function cleanup_remote_function_assets(session.bqclient, functions_client, square) @@ -401,7 +401,7 @@ def add_one(x): pd_result_col = pd_result_col.astype(pandas.Int64Dtype()) pd_result = pd_int64_col_filtered.to_frame().assign(result=pd_result_col) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) finally: # clean up the gcp assets created for the remote function cleanup_remote_function_assets( @@ -446,7 +446,7 @@ def square(x): pd_result_col = pd_result_col.astype(pandas.Int64Dtype()) pd_result = pd_int64_col_filtered.to_frame().assign(result=pd_result_col) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) finally: # clean up the gcp assets created for the remote function cleanup_remote_function_assets(session.bqclient, functions_client, square) @@ -497,7 +497,7 @@ def sign(num): pd_result_col = pd_result_col.astype(pandas.Int64Dtype()) pd_result = pd_int64_col_filtered.to_frame().assign(result=pd_result_col) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) finally: # clean up the gcp assets created for the remote function cleanup_remote_function_assets(session.bqclient, functions_client, remote_sign) @@ -542,7 +542,7 @@ def circumference(radius): pd_result_col = pd_result_col.astype(pandas.Float64Dtype()) pd_result = pd_float64_col_filtered.to_frame().assign(result=pd_result_col) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) finally: # clean up the gcp assets created for the remote function cleanup_remote_function_assets( @@ -591,7 +591,7 @@ def find_team(num): pd_result_col = pd_result_col.astype(pandas.StringDtype(storage="pyarrow")) pd_result = pd_float64_col_filtered.to_frame().assign(result=pd_result_col) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) finally: # clean up the gcp assets created for the remote function cleanup_remote_function_assets( @@ -675,7 +675,7 @@ def inner_test(): pd_result_col = pd_result_col.astype(pandas.Int64Dtype()) pd_result = pd_int64_col_filtered.to_frame().assign(result=pd_result_col) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) # Test that the remote function works as expected inner_test() @@ -765,7 +765,7 @@ def is_odd(num): pd_result_col = pd_int64_col.mask(is_odd) pd_result = pd_int64_col.to_frame().assign(result=pd_result_col) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) finally: # clean up the gcp assets created for the remote function cleanup_remote_function_assets( @@ -808,7 +808,7 @@ def is_odd(num): pd_result_col = pd_int64_col[pd_int64_col.notnull()].mask(is_odd, -1) pd_result = pd_int64_col.to_frame().assign(result=pd_result_col) - 
assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) finally: # clean up the gcp assets created for the remote function cleanup_remote_function_assets( @@ -852,7 +852,7 @@ def test_remote_udf_lambda( pd_result_col = pd_result_col.astype(pandas.Int64Dtype()) pd_result = pd_int64_col_filtered.to_frame().assign(result=pd_result_col) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) finally: # clean up the gcp assets created for the remote function cleanup_remote_function_assets( @@ -909,7 +909,7 @@ def square(x): pd_result_col = pd_result_col.astype(pandas.Int64Dtype()) pd_result = pd_int64_col.to_frame().assign(result=pd_result_col) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) finally: # clean up the gcp assets created for the remote function cleanup_remote_function_assets( @@ -954,7 +954,7 @@ def pd_np_foo(x): # comparing for the purpose of this test pd_result.result = pd_result.result.astype(pandas.Float64Dtype()) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) finally: # clean up the gcp assets created for the remote function cleanup_remote_function_assets( @@ -998,7 +998,7 @@ def test_internal(rf, udf): pd_result_col = pd_result_col.astype(pandas.Int64Dtype()) pd_result = pd_int64_col.to_frame().assign(result=pd_result_col) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) # Create an explicit name for the remote function prefixer = test_utils.prefixer.Prefixer("foo", "") @@ -1167,7 +1167,7 @@ def square(x): pd_result_col = pd_result_col.astype(pandas.Int64Dtype()) pd_result = pd_int64_col_filtered.to_frame().assign(result=pd_result_col) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) finally: # clean up the gcp assets created for the remote function cleanup_remote_function_assets( @@ -1204,7 +1204,7 @@ def square(x): pd_result_col = pd_result_col.astype(pandas.Int64Dtype()) pd_result = pd_int64_col_filtered.to_frame().assign(result=pd_result_col) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) finally: # clean up the gcp assets created for the remote function cleanup_remote_function_assets( diff --git a/tests/system/small/ml/test_cluster.py b/tests/system/small/ml/test_cluster.py index d95a1e1bc2..266a38e3ee 100644 --- a/tests/system/small/ml/test_cluster.py +++ b/tests/system/small/ml/test_cluster.py @@ -15,7 +15,7 @@ import pandas as pd from bigframes.ml import cluster -from tests.system.utils import assert_pandas_df_equal_ignore_ordering +from tests.system.utils import assert_pandas_df_equal _PD_NEW_PENGUINS = pd.DataFrame.from_dict( { @@ -68,7 +68,7 @@ def test_kmeans_predict(session, penguins_kmeans_model: cluster.KMeans): dtype="Int64", index=pd.Index(["test1", "test2", "test3", "test4"], dtype="string[pyarrow]"), ) - assert_pandas_df_equal_ignore_ordering(result, expected) + assert_pandas_df_equal(result, expected, ignore_order=True) def test_kmeans_score(session, penguins_kmeans_model: cluster.KMeans): @@ -89,59 +89,67 @@ def test_kmeans_score(session, penguins_kmeans_model: cluster.KMeans): def test_kmeans_cluster_centers(penguins_kmeans_model: cluster.KMeans): - result = penguins_kmeans_model.cluster_centers_.to_pandas() - expected = pd.DataFrame( - { - 
"centroid_id": [1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3], - "feature": [ - "culmen_length_mm", - "culmen_depth_mm", - "flipper_length_mm", - "sex", - ] - * 3, - "numerical_value": [ - 47.509677, - 14.993548, - 217.040123, - pd.NA, - 38.207813, - 18.03125, - 187.992188, - pd.NA, - 47.036346, - 18.834808, - 197.1612, - pd.NA, - ], - "categorical_value": [ - [], - [], - [], - [ - {"category": ".", "value": 0.008064516129032258}, - {"category": "MALE", "value": 0.49193548387096775}, - {"category": "FEMALE", "value": 0.47580645161290325}, - {"category": "_null_filler", "value": 0.024193548387096774}, - ], - [], - [], - [], - [ - {"category": "MALE", "value": 0.34375}, - {"category": "FEMALE", "value": 0.625}, - {"category": "_null_filler", "value": 0.03125}, + result = ( + penguins_kmeans_model.cluster_centers_.to_pandas() + .sort_values(["centroid_id", "feature"]) + .reset_index(drop=True) + ) + expected = ( + pd.DataFrame( + { + "centroid_id": [1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3], + "feature": [ + "culmen_length_mm", + "culmen_depth_mm", + "flipper_length_mm", + "sex", + ] + * 3, + "numerical_value": [ + 47.509677, + 14.993548, + 217.040123, + pd.NA, + 38.207813, + 18.03125, + 187.992188, + pd.NA, + 47.036346, + 18.834808, + 197.1612, + pd.NA, ], - [], - [], - [], - [ - {"category": "MALE", "value": 0.6847826086956522}, - {"category": "FEMALE", "value": 0.2826086956521739}, - {"category": "_null_filler", "value": 0.03260869565217391}, + "categorical_value": [ + [], + [], + [], + [ + {"category": ".", "value": 0.008064516129032258}, + {"category": "MALE", "value": 0.49193548387096775}, + {"category": "FEMALE", "value": 0.47580645161290325}, + {"category": "_null_filler", "value": 0.024193548387096774}, + ], + [], + [], + [], + [ + {"category": "MALE", "value": 0.34375}, + {"category": "FEMALE", "value": 0.625}, + {"category": "_null_filler", "value": 0.03125}, + ], + [], + [], + [], + [ + {"category": "MALE", "value": 0.6847826086956522}, + {"category": "FEMALE", "value": 0.2826086956521739}, + {"category": "_null_filler", "value": 0.03260869565217391}, + ], ], - ], - }, + }, + ) + .sort_values(["centroid_id", "feature"]) + .reset_index(drop=True) ) pd.testing.assert_frame_equal( result, diff --git a/tests/system/small/ml/test_core.py b/tests/system/small/ml/test_core.py index f911dd7eeb..be34a4871c 100644 --- a/tests/system/small/ml/test_core.py +++ b/tests/system/small/ml/test_core.py @@ -78,58 +78,62 @@ def test_model_eval_with_data(penguins_bqml_linear_model, penguins_df_default_in def test_model_centroids(penguins_bqml_kmeans_model: core.BqmlModel): result = penguins_bqml_kmeans_model.centroids().to_pandas() - expected = pd.DataFrame( - { - "centroid_id": [1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3], - "feature": [ - "culmen_length_mm", - "culmen_depth_mm", - "flipper_length_mm", - "sex", - ] - * 3, - "numerical_value": [ - 47.509677, - 14.993548, - 217.040123, - pd.NA, - 38.207813, - 18.03125, - 187.992188, - pd.NA, - 47.036346, - 18.834808, - 197.1612, - pd.NA, - ], - "categorical_value": [ - [], - [], - [], - [ - {"category": ".", "value": 0.008064516129032258}, - {"category": "MALE", "value": 0.49193548387096775}, - {"category": "FEMALE", "value": 0.47580645161290325}, - {"category": "_null_filler", "value": 0.024193548387096774}, + expected = ( + pd.DataFrame( + { + "centroid_id": [1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3], + "feature": [ + "culmen_length_mm", + "culmen_depth_mm", + "flipper_length_mm", + "sex", + ] + * 3, + "numerical_value": [ + 47.509677, + 14.993548, + 217.040123, + pd.NA, + 
38.207813, + 18.03125, + 187.992188, + pd.NA, + 47.036346, + 18.834808, + 197.1612, + pd.NA, ], - [], - [], - [], - [ - {"category": "MALE", "value": 0.34375}, - {"category": "FEMALE", "value": 0.625}, - {"category": "_null_filler", "value": 0.03125}, + "categorical_value": [ + [], + [], + [], + [ + {"category": ".", "value": 0.008064516129032258}, + {"category": "MALE", "value": 0.49193548387096775}, + {"category": "FEMALE", "value": 0.47580645161290325}, + {"category": "_null_filler", "value": 0.024193548387096774}, + ], + [], + [], + [], + [ + {"category": "MALE", "value": 0.34375}, + {"category": "FEMALE", "value": 0.625}, + {"category": "_null_filler", "value": 0.03125}, + ], + [], + [], + [], + [ + {"category": "MALE", "value": 0.6847826086956522}, + {"category": "FEMALE", "value": 0.2826086956521739}, + {"category": "_null_filler", "value": 0.03260869565217391}, + ], ], - [], - [], - [], - [ - {"category": "MALE", "value": 0.6847826086956522}, - {"category": "FEMALE", "value": 0.2826086956521739}, - {"category": "_null_filler", "value": 0.03260869565217391}, - ], - ], - }, + }, + ) + .sort_values(["centroid_id", "feature"]) + .reset_index(drop=True) ) pd.testing.assert_frame_equal( result, @@ -148,59 +152,63 @@ def test_pca_model_principal_components(penguins_bqml_pca_model: core.BqmlModel) # result is too long, only check the first principal component here. result = result.head(7) - expected = pd.DataFrame( - { - "principal_component_id": [0] * 7, - "feature": [ - "species", - "island", - "culmen_length_mm", - "culmen_depth_mm", - "flipper_length_mm", - "body_mass_g", - "sex", - ], - "numerical_value": [ - pd.NA, - pd.NA, - 0.401489, - -0.377482, - 0.524052, - 0.501174, - pd.NA, - ], - "categorical_value": [ - [ - { - "category": "Gentoo penguin (Pygoscelis papua)", - "value": 0.25068877125667804, - }, - { - "category": "Adelie Penguin (Pygoscelis adeliae)", - "value": -0.20622291900416198, - }, - { - "category": "Chinstrap penguin (Pygoscelis antarctica)", - "value": -0.030161149275185855, - }, + expected = ( + pd.DataFrame( + { + "principal_component_id": [0] * 7, + "feature": [ + "species", + "island", + "culmen_length_mm", + "culmen_depth_mm", + "flipper_length_mm", + "body_mass_g", + "sex", ], - [ - {"category": "Biscoe", "value": 0.19761120114410635}, - {"category": "Dream", "value": -0.11264736305259061}, - {"category": "Torgersen", "value": -0.07065913511418596}, + "numerical_value": [ + pd.NA, + pd.NA, + 0.401489, + -0.377482, + 0.524052, + 0.501174, + pd.NA, ], - [], - [], - [], - [], - [ - {"category": ".", "value": 0.0015916894448071784}, - {"category": "MALE", "value": 0.06869704739750442}, - {"category": "FEMALE", "value": -0.052521171596813174}, - {"category": "_null_filler", "value": -0.0034628622681684906}, + "categorical_value": [ + [ + { + "category": "Gentoo penguin (Pygoscelis papua)", + "value": 0.25068877125667804, + }, + { + "category": "Adelie Penguin (Pygoscelis adeliae)", + "value": -0.20622291900416198, + }, + { + "category": "Chinstrap penguin (Pygoscelis antarctica)", + "value": -0.030161149275185855, + }, + ], + [ + {"category": "Biscoe", "value": 0.19761120114410635}, + {"category": "Dream", "value": -0.11264736305259061}, + {"category": "Torgersen", "value": -0.07065913511418596}, + ], + [], + [], + [], + [], + [ + {"category": ".", "value": 0.0015916894448071784}, + {"category": "MALE", "value": 0.06869704739750442}, + {"category": "FEMALE", "value": -0.052521171596813174}, + {"category": "_null_filler", "value": -0.0034628622681684906}, + ], ], - 
], - }, + }, + ) + .sort_values(["principal_component_id", "feature"]) + .reset_index(drop=True) ) pd.testing.assert_frame_equal( result, @@ -225,7 +233,7 @@ def test_pca_model_principal_component_info(penguins_bqml_pca_model: core.BqmlMo "cumulative_explained_variance_ratio": [0.469357, 0.651283, 0.812383], }, ) - tests.system.utils.assert_pandas_df_equal_ignore_ordering( + tests.system.utils.assert_pandas_df_equal( result, expected, check_exact=False, @@ -233,6 +241,7 @@ def test_pca_model_principal_component_info(penguins_bqml_pca_model: core.BqmlMo # int64 Index by default in pandas versus Int64 (nullable) Index in BigQuery DataFrame check_index_type=False, check_dtype=False, + ignore_order=True, ) diff --git a/tests/system/small/ml/test_decomposition.py b/tests/system/small/ml/test_decomposition.py index e31681f4a0..42fea66cf8 100644 --- a/tests/system/small/ml/test_decomposition.py +++ b/tests/system/small/ml/test_decomposition.py @@ -57,59 +57,63 @@ def test_pca_components_(penguins_pca_model: decomposition.PCA): # result is too long, only check the first principal component here. result = result.head(7) - expected = pd.DataFrame( - { - "principal_component_id": [0] * 7, - "feature": [ - "species", - "island", - "culmen_length_mm", - "culmen_depth_mm", - "flipper_length_mm", - "body_mass_g", - "sex", - ], - "numerical_value": [ - pd.NA, - pd.NA, - 0.401489, - -0.377482, - 0.524052, - 0.501174, - pd.NA, - ], - "categorical_value": [ - [ - { - "category": "Gentoo penguin (Pygoscelis papua)", - "value": 0.25068877125667804, - }, - { - "category": "Adelie Penguin (Pygoscelis adeliae)", - "value": -0.20622291900416198, - }, - { - "category": "Chinstrap penguin (Pygoscelis antarctica)", - "value": -0.030161149275185855, - }, + expected = ( + pd.DataFrame( + { + "principal_component_id": [0] * 7, + "feature": [ + "species", + "island", + "culmen_length_mm", + "culmen_depth_mm", + "flipper_length_mm", + "body_mass_g", + "sex", ], - [ - {"category": "Biscoe", "value": 0.19761120114410635}, - {"category": "Dream", "value": -0.11264736305259061}, - {"category": "Torgersen", "value": -0.07065913511418596}, + "numerical_value": [ + pd.NA, + pd.NA, + 0.401489, + -0.377482, + 0.524052, + 0.501174, + pd.NA, ], - [], - [], - [], - [], - [ - {"category": ".", "value": 0.0015916894448071784}, - {"category": "MALE", "value": 0.06869704739750442}, - {"category": "FEMALE", "value": -0.052521171596813174}, - {"category": "_null_filler", "value": -0.0034628622681684906}, + "categorical_value": [ + [ + { + "category": "Gentoo penguin (Pygoscelis papua)", + "value": 0.25068877125667804, + }, + { + "category": "Adelie Penguin (Pygoscelis adeliae)", + "value": -0.20622291900416198, + }, + { + "category": "Chinstrap penguin (Pygoscelis antarctica)", + "value": -0.030161149275185855, + }, + ], + [ + {"category": "Biscoe", "value": 0.19761120114410635}, + {"category": "Dream", "value": -0.11264736305259061}, + {"category": "Torgersen", "value": -0.07065913511418596}, + ], + [], + [], + [], + [], + [ + {"category": ".", "value": 0.0015916894448071784}, + {"category": "MALE", "value": 0.06869704739750442}, + {"category": "FEMALE", "value": -0.052521171596813174}, + {"category": "_null_filler", "value": -0.0034628622681684906}, + ], ], - ], - }, + }, + ) + .sort_values(["principal_component_id", "feature"]) + .reset_index(drop=True) ) pd.testing.assert_frame_equal( result, @@ -130,13 +134,14 @@ def test_pca_explained_variance_(penguins_pca_model: decomposition.PCA): "explained_variance": [3.278657, 1.270829, 
1.125354], }, ) - tests.system.utils.assert_pandas_df_equal_ignore_ordering( + tests.system.utils.assert_pandas_df_equal( result, expected, check_exact=False, rtol=0.1, check_index_type=False, check_dtype=False, + ignore_order=True, ) @@ -149,11 +154,12 @@ def test_pca_explained_variance_ratio_(penguins_pca_model: decomposition.PCA): "explained_variance_ratio": [0.469357, 0.181926, 0.1611], }, ) - tests.system.utils.assert_pandas_df_equal_ignore_ordering( + tests.system.utils.assert_pandas_df_equal( result, expected, check_exact=False, rtol=0.1, check_index_type=False, check_dtype=False, + ignore_order=True, ) diff --git a/tests/system/small/ml/test_forecasting.py b/tests/system/small/ml/test_forecasting.py index cb27dd388c..55079c94cf 100644 --- a/tests/system/small/ml/test_forecasting.py +++ b/tests/system/small/ml/test_forecasting.py @@ -36,6 +36,7 @@ def test_model_predict(time_series_arima_plus_model): expected["forecast_timestamp"] = expected["forecast_timestamp"].astype( pd.ArrowDtype(pa.timestamp("us", tz="UTC")) ) + pd.testing.assert_frame_equal( predictions, expected, diff --git a/tests/system/small/operations/test_datetimes.py b/tests/system/small/operations/test_datetimes.py index 7dc55b9367..177194c7a8 100644 --- a/tests/system/small/operations/test_datetimes.py +++ b/tests/system/small/operations/test_datetimes.py @@ -16,7 +16,7 @@ import pytest import bigframes.series -from tests.system.utils import assert_series_equal_ignoring_order +from tests.system.utils import assert_series_equal DATETIME_COL_NAMES = [("datetime_col",), ("timestamp_col",)] @@ -33,7 +33,7 @@ def test_day(scalars_dfs, col_name): bf_result = bf_series.dt.day.to_pandas() pd_result = scalars_pandas_df[col_name].dt.day - assert_series_equal_ignoring_order( + assert_series_equal( pd_result.astype(pd.Int64Dtype()), bf_result, ) @@ -51,7 +51,7 @@ def test_date(scalars_dfs, col_name): bf_result = bf_series.dt.date.to_pandas() pd_result = scalars_pandas_df[col_name].dt.date - assert_series_equal_ignoring_order( + assert_series_equal( pd_result, bf_result, ) @@ -69,7 +69,7 @@ def test_dayofweek(scalars_dfs, col_name): bf_result = bf_series.dt.dayofweek.to_pandas() pd_result = scalars_pandas_df[col_name].dt.dayofweek - assert_series_equal_ignoring_order(pd_result, bf_result, check_dtype=False) + assert_series_equal(pd_result, bf_result, check_dtype=False) @pytest.mark.parametrize( @@ -84,7 +84,7 @@ def test_hour(scalars_dfs, col_name): bf_result = bf_series.dt.hour.to_pandas() pd_result = scalars_pandas_df[col_name].dt.hour - assert_series_equal_ignoring_order( + assert_series_equal( pd_result.astype(pd.Int64Dtype()), bf_result, ) @@ -102,7 +102,7 @@ def test_minute(scalars_dfs, col_name): bf_result = bf_series.dt.minute.to_pandas() pd_result = scalars_pandas_df[col_name].dt.minute - assert_series_equal_ignoring_order( + assert_series_equal( pd_result.astype(pd.Int64Dtype()), bf_result, ) @@ -120,7 +120,7 @@ def test_month(scalars_dfs, col_name): bf_result = bf_series.dt.month.to_pandas() pd_result = scalars_pandas_df[col_name].dt.month - assert_series_equal_ignoring_order( + assert_series_equal( pd_result.astype(pd.Int64Dtype()), bf_result, ) @@ -138,7 +138,7 @@ def test_quarter(scalars_dfs, col_name): bf_result = bf_series.dt.quarter.to_pandas() pd_result = scalars_pandas_df[col_name].dt.quarter - assert_series_equal_ignoring_order( + assert_series_equal( pd_result.astype(pd.Int64Dtype()), bf_result, ) @@ -156,7 +156,7 @@ def test_second(scalars_dfs, col_name): bf_result = bf_series.dt.second.to_pandas() pd_result 
= scalars_pandas_df[col_name].dt.second - assert_series_equal_ignoring_order( + assert_series_equal( pd_result.astype(pd.Int64Dtype()), bf_result, ) @@ -174,7 +174,7 @@ def test_time(scalars_dfs, col_name): bf_result = bf_series.dt.time.to_pandas() pd_result = scalars_pandas_df[col_name].dt.time - assert_series_equal_ignoring_order( + assert_series_equal( pd_result, bf_result, ) @@ -192,7 +192,7 @@ def test_year(scalars_dfs, col_name): bf_result = bf_series.dt.year.to_pandas() pd_result = scalars_pandas_df[col_name].dt.year - assert_series_equal_ignoring_order( + assert_series_equal( pd_result.astype(pd.Int64Dtype()), bf_result, ) diff --git a/tests/system/small/operations/test_strings.py b/tests/system/small/operations/test_strings.py index 241cbd576b..27a35134d4 100644 --- a/tests/system/small/operations/test_strings.py +++ b/tests/system/small/operations/test_strings.py @@ -19,7 +19,7 @@ import bigframes.series -from ...utils import assert_series_equal_ignoring_order +from ...utils import assert_series_equal def test_find(scalars_dfs): @@ -31,7 +31,7 @@ def test_find(scalars_dfs): # One of type mismatches to be documented. Here, the `bf_result.dtype` is `Int64` but # the `pd_result.dtype` is `float64`: https://github.com/pandas-dev/pandas/issues/51948 - assert_series_equal_ignoring_order( + assert_series_equal( pd_result.astype(pd.Int64Dtype()), bf_result, ) @@ -173,7 +173,7 @@ def test_len(scalars_dfs): # One of dtype mismatches to be documented. Here, the `bf_result.dtype` is `Int64` but # the `pd_result.dtype` is `float64`: https://github.com/pandas-dev/pandas/issues/51948 - assert_series_equal_ignoring_order( + assert_series_equal( pd_result.astype(pd.Int64Dtype()), bf_result, ) @@ -186,7 +186,7 @@ def test_lower(scalars_dfs): bf_result = bf_series.str.lower().to_pandas() pd_result = scalars_pandas_df[col_name].str.lower() - assert_series_equal_ignoring_order( + assert_series_equal( pd_result, bf_result, ) @@ -205,7 +205,7 @@ def test_reverse(scalars_dfs): else: pd_result.loc[i] = cell[::-1] - assert_series_equal_ignoring_order( + assert_series_equal( pd_result, bf_result, ) @@ -222,7 +222,7 @@ def test_slice(scalars_dfs, start, stop): pd_series = scalars_pandas_df[col_name] pd_result = pd_series.str.slice(start, stop) - assert_series_equal_ignoring_order( + assert_series_equal( pd_result, bf_result, ) @@ -235,7 +235,7 @@ def test_strip(scalars_dfs): bf_result = bf_series.str.strip().to_pandas() pd_result = scalars_pandas_df[col_name].str.strip() - assert_series_equal_ignoring_order( + assert_series_equal( pd_result, bf_result, ) @@ -248,7 +248,7 @@ def test_upper(scalars_dfs): bf_result = bf_series.str.upper().to_pandas() pd_result = scalars_pandas_df[col_name].str.upper() - assert_series_equal_ignoring_order( + assert_series_equal( pd_result, bf_result, ) @@ -330,7 +330,7 @@ def test_islower(weird_strings, weird_strings_pd): pd_result = weird_strings_pd.str.islower() bf_result = weird_strings.str.islower().to_pandas() - assert_series_equal_ignoring_order( + assert_series_equal( bf_result, pd_result.astype(pd.BooleanDtype()) # the dtype here is a case of intentional diversion from pandas @@ -342,7 +342,7 @@ def test_isupper(weird_strings, weird_strings_pd): pd_result = weird_strings_pd.str.isupper() bf_result = weird_strings.str.isupper().to_pandas() - assert_series_equal_ignoring_order( + assert_series_equal( bf_result, pd_result.astype(pd.BooleanDtype()) # the dtype here is a case of intentional diversion from pandas 
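Note on the recurring pattern in these test hunks: the old assert_series_equal_ignoring_order / assert_pandas_df_equal_ignore_ordering helpers are replaced by assert_series_equal / assert_pandas_df_equal, with order-insensitivity now opt-in via ignore_order=True. The helpers themselves live in tests/system/utils.py, which is not part of this patch excerpt, so the following is only a guessed sketch of what such consolidated wrappers could look like (consistent with the call sites above and below), not the repository's actual implementation:

import pandas as pd

def assert_pandas_df_equal(actual, expected, ignore_order=False, **kwargs):
    # Hypothetical wrapper: sort both frames by index before delegating to
    # pandas when the caller does not care about row order.
    if ignore_order:
        actual = actual.sort_index()
        expected = expected.sort_index()
    pd.testing.assert_frame_equal(actual, expected, **kwargs)

def assert_series_equal(actual, expected, ignore_order=False, **kwargs):
    # Same idea for Series; extra keyword arguments such as check_dtype or
    # check_index_type pass straight through to pandas.
    if ignore_order:
        actual = actual.sort_index()
        expected = expected.sort_index()
    pd.testing.assert_series_equal(actual, expected, **kwargs)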
@@ -357,7 +357,7 @@ def test_rstrip(scalars_dfs): bf_result = bf_series.str.rstrip().to_pandas() pd_result = scalars_pandas_df[col_name].str.rstrip() - assert_series_equal_ignoring_order( + assert_series_equal( pd_result, bf_result, ) @@ -370,7 +370,7 @@ def test_lstrip(scalars_dfs): bf_result = bf_series.str.lstrip().to_pandas() pd_result = scalars_pandas_df[col_name].str.lstrip() - assert_series_equal_ignoring_order( + assert_series_equal( pd_result, bf_result, ) @@ -384,7 +384,7 @@ def test_repeat(scalars_dfs, repeats): bf_result = bf_series.str.repeat(repeats).to_pandas() pd_result = scalars_pandas_df[col_name].str.repeat(repeats) - assert_series_equal_ignoring_order( + assert_series_equal( pd_result, bf_result, ) @@ -397,7 +397,7 @@ def test_capitalize(scalars_dfs): bf_result = bf_series.str.capitalize().to_pandas() pd_result = scalars_pandas_df[col_name].str.capitalize() - assert_series_equal_ignoring_order( + assert_series_equal( pd_result, bf_result, ) @@ -415,7 +415,7 @@ def test_cat_with_series(scalars_dfs): pd_right = scalars_pandas_df[col_name] pd_result = pd_left.str.cat(others=pd_right) - assert_series_equal_ignoring_order( + assert_series_equal( pd_result, bf_result, ) @@ -429,7 +429,7 @@ def test_str_match(scalars_dfs): bf_result = bf_series.str.match(pattern).to_pandas() pd_result = scalars_pandas_df[col_name].str.match(pattern) - assert_series_equal_ignoring_order( + assert_series_equal( pd_result, bf_result, ) @@ -443,7 +443,7 @@ def test_str_fullmatch(scalars_dfs): bf_result = bf_series.str.fullmatch(pattern).to_pandas() pd_result = scalars_pandas_df[col_name].str.fullmatch(pattern) - assert_series_equal_ignoring_order( + assert_series_equal( pd_result, bf_result, ) @@ -456,7 +456,7 @@ def test_str_get(scalars_dfs): bf_result = bf_series.str.get(8).to_pandas() pd_result = scalars_pandas_df[col_name].str.get(8) - assert_series_equal_ignoring_order( + assert_series_equal( pd_result, bf_result, ) @@ -469,7 +469,7 @@ def test_str_pad(scalars_dfs): bf_result = bf_series.str.pad(8, side="both", fillchar="%").to_pandas() pd_result = scalars_pandas_df[col_name].str.pad(8, side="both", fillchar="%") - assert_series_equal_ignoring_order( + assert_series_equal( pd_result, bf_result, ) @@ -492,7 +492,7 @@ def test_str_ljust(scalars_dfs): bf_result = bf_series.str.ljust(7, fillchar="%").to_pandas() pd_result = scalars_pandas_df[col_name].str.ljust(7, fillchar="%") - assert_series_equal_ignoring_order( + assert_series_equal( pd_result, bf_result, ) @@ -505,7 +505,7 @@ def test_str_rjust(scalars_dfs): bf_result = bf_series.str.rjust(9, fillchar="%").to_pandas() pd_result = scalars_pandas_df[col_name].str.rjust(9, fillchar="%") - assert_series_equal_ignoring_order( + assert_series_equal( pd_result, bf_result, ) diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index bd5930e508..e522878229 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -28,10 +28,7 @@ import bigframes._config.display_options as display_options import bigframes.dataframe as dataframe import bigframes.series as series -from tests.system.utils import ( - assert_pandas_df_equal_ignore_ordering, - assert_series_equal_ignoring_order, -) +from tests.system.utils import assert_pandas_df_equal, assert_series_equal def test_df_construct_copy(scalars_dfs): @@ -98,7 +95,7 @@ def test_get_column(scalars_dfs): series = scalars_df[col_name] bf_result = series.to_pandas() pd_result = scalars_pandas_df[col_name] - 
assert_series_equal_ignoring_order(bf_result, pd_result) + assert_series_equal(bf_result, pd_result) def test_get_column_nonstring(scalars_dfs): @@ -106,7 +103,7 @@ def test_get_column_nonstring(scalars_dfs): series = scalars_df.rename(columns={"int64_col": 123.1})[123.1] bf_result = series.to_pandas() pd_result = scalars_pandas_df.rename(columns={"int64_col": 123.1})[123.1] - assert_series_equal_ignoring_order(bf_result, pd_result) + assert_series_equal(bf_result, pd_result) def test_hasattr(scalars_dfs): @@ -116,15 +113,24 @@ def test_hasattr(scalars_dfs): assert not hasattr(scalars_df, "not_exist") -def test_head_with_custom_column_labels(scalars_df_index, scalars_pandas_df_index): +@pytest.mark.parametrize( + ("ordered"), + [ + (True), + (False), + ], +) +def test_head_with_custom_column_labels( + scalars_df_index, scalars_pandas_df_index, ordered +): rename_mapping = { "int64_col": "Integer Column", "string_col": "言語列", } bf_df = scalars_df_index.rename(columns=rename_mapping).head(3) - bf_result = bf_df.to_pandas() + bf_result = bf_df.to_pandas(ordered=ordered) pd_result = scalars_pandas_df_index.rename(columns=rename_mapping).head(3) - pandas.testing.assert_frame_equal(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result, ignore_order=not ordered) def test_tail_with_custom_column_labels(scalars_df_index, scalars_pandas_df_index): @@ -183,7 +189,7 @@ def test_get_column_by_attr(scalars_dfs): series = scalars_df.int64_col bf_result = series.to_pandas() pd_result = scalars_pandas_df.int64_col - assert_series_equal_ignoring_order(bf_result, pd_result) + assert_series_equal(bf_result, pd_result) def test_get_columns(scalars_dfs): @@ -246,7 +252,7 @@ def test_drop_with_custom_column_labels(scalars_dfs): pd_result = scalars_pandas_df.rename(columns=rename_mapping).drop( columns=dropped_columns ) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) def test_drop_index(scalars_dfs): @@ -420,7 +426,7 @@ def test_filter_df(scalars_dfs): pd_bool_series = scalars_pandas_df["bool_col"] pd_result = scalars_pandas_df[pd_bool_series] - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) def test_assign_new_column(scalars_dfs): @@ -433,7 +439,7 @@ def test_assign_new_column(scalars_dfs): # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes. pd_result["new_col"] = pd_result["new_col"].astype("Int64") - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) def test_assign_new_column_w_loc(scalars_dfs): @@ -564,17 +570,52 @@ def test_assign_existing_column(scalars_dfs): # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes. 
pd_result["int64_col"] = pd_result["int64_col"].astype("Int64") - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) + + +def test_assign_listlike_to_empty_df(session): + empty_df = dataframe.DataFrame(session=session) + empty_pandas_df = pd.DataFrame() + bf_result = empty_df.assign(new_col=[1, 2, 3]) + pd_result = empty_pandas_df.assign(new_col=[1, 2, 3]) + + pd_result["new_col"] = pd_result["new_col"].astype("Int64") + pd_result.index = pd_result.index.astype("Int64") + assert_pandas_df_equal(bf_result.to_pandas(), pd_result) -def test_assign_series(scalars_dfs): + +def test_assign_to_empty_df_multiindex_error(session): + empty_df = dataframe.DataFrame(session=session) + empty_pandas_df = pd.DataFrame() + empty_df["empty_col_1"] = [] + empty_df["empty_col_2"] = [] + empty_pandas_df["empty_col_1"] = [] + empty_pandas_df["empty_col_2"] = [] + empty_df = empty_df.set_index(["empty_col_1", "empty_col_2"]) + empty_pandas_df = empty_pandas_df.set_index(["empty_col_1", "empty_col_2"]) + + with pytest.raises(ValueError): + empty_df.assign(new_col=[1, 2, 3, 4, 5, 6, 7, 8, 9]) + with pytest.raises(ValueError): + empty_pandas_df.assign(new_col=[1, 2, 3, 4, 5, 6, 7, 8, 9]) + + +@pytest.mark.parametrize( + ("ordered"), + [ + (True), + (False), + ], +) +def test_assign_series(scalars_dfs, ordered): scalars_df, scalars_pandas_df = scalars_dfs column_name = "int64_col" df = scalars_df.assign(new_col=scalars_df[column_name]) - bf_result = df.to_pandas() + bf_result = df.to_pandas(ordered=ordered) pd_result = scalars_pandas_df.assign(new_col=scalars_pandas_df[column_name]) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result, ignore_order=not ordered) def test_assign_series_overwrite(scalars_dfs): @@ -586,7 +627,7 @@ def test_assign_series_overwrite(scalars_dfs): **{column_name: scalars_pandas_df[column_name] + 3} ) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) def test_assign_sequential(scalars_dfs): @@ -601,7 +642,7 @@ def test_assign_sequential(scalars_dfs): pd_result["new_col"] = pd_result["new_col"].astype("Int64") pd_result["new_col2"] = pd_result["new_col2"].astype("Int64") - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) # Require an index so that the self-join is consistent each time. @@ -635,7 +676,7 @@ def test_assign_different_df( new_col=scalars_pandas_df_index[column_name] ) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) def test_assign_different_df_w_loc( @@ -686,7 +727,7 @@ def test_assign_callable_lambda(scalars_dfs): # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes. pd_result["new_col"] = pd_result["new_col"].astype("Int64") - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) @pytest.mark.parametrize( @@ -886,6 +927,26 @@ def test_df_isin_dict(scalars_dfs): pandas.testing.assert_frame_equal(bf_result, pd_result.astype("boolean")) +def test_df_cross_merge(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + left_columns = ["int64_col", "float64_col", "rowindex_2"] + right_columns = ["int64_col", "bool_col", "string_col", "rowindex_2"] + + left = scalars_df[left_columns] + # Offset the rows somewhat so that outer join can have an effect. 
+ right = scalars_df[right_columns].assign(rowindex_2=scalars_df["rowindex_2"] + 2) + + bf_result = left.merge(right, "cross").to_pandas() + + pd_result = scalars_pandas_df[left_columns].merge( + scalars_pandas_df[right_columns].assign( + rowindex_2=scalars_pandas_df["rowindex_2"] + 2 + ), + "cross", + ) + pd.testing.assert_frame_equal(bf_result, pd_result, check_index_type=False) + + @pytest.mark.parametrize( ("merge_how",), [ @@ -917,7 +978,9 @@ def test_df_merge(scalars_dfs, merge_how): sort=True, ) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal( + bf_result, pd_result, ignore_order=True, check_index_type=False + ) @pytest.mark.parametrize( @@ -950,7 +1013,9 @@ def test_df_merge_multi_key(scalars_dfs, left_on, right_on): sort=True, ) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal( + bf_result, pd_result, ignore_order=True, check_index_type=False + ) @pytest.mark.parametrize( @@ -980,7 +1045,9 @@ def test_merge_custom_col_name(scalars_dfs, merge_how): pandas_right_df = scalars_pandas_df[right_columns] pd_result = pandas_left_df.merge(pandas_right_df, merge_how, on, sort=True) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal( + bf_result, pd_result, ignore_order=True, check_index_type=False + ) @pytest.mark.parametrize( @@ -1013,7 +1080,9 @@ def test_merge_left_on_right_on(scalars_dfs, merge_how): sort=True, ) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal( + bf_result, pd_result, ignore_order=True, check_index_type=False + ) def test_get_dtypes(scalars_df_default_index): @@ -1207,6 +1276,28 @@ def test_reset_index_with_unnamed_index( pandas.testing.assert_frame_equal(bf_result, pd_result) +def test_reset_index_with_unnamed_multiindex( + scalars_df_index, + scalars_pandas_df_index, +): + bf_df = dataframe.DataFrame( + ([1, 2, 3], [2, 5, 7]), + index=pd.MultiIndex.from_tuples([("a", "aa"), ("a", "aa")]), + ) + pd_df = pd.DataFrame( + ([1, 2, 3], [2, 5, 7]), + index=pd.MultiIndex.from_tuples([("a", "aa"), ("a", "aa")]), + ) + + bf_df = bf_df.reset_index() + pd_df = pd_df.reset_index() + + assert pd_df.columns[0] == "level_0" + assert bf_df.columns[0] == "level_0" + assert pd_df.columns[1] == "level_1" + assert bf_df.columns[1] == "level_1" + + def test_reset_index_with_unnamed_index_and_index_column( scalars_df_index, scalars_pandas_df_index, @@ -1305,7 +1396,7 @@ def test_df_abs(scalars_dfs): bf_result = scalars_df[columns].abs().to_pandas() pd_result = scalars_pandas_df[columns].abs() - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) def test_df_isnull(scalars_dfs): @@ -1322,7 +1413,7 @@ def test_df_isnull(scalars_dfs): pd_result["string_col"] = pd_result["string_col"].astype(pd.BooleanDtype()) pd_result["bool_col"] = pd_result["bool_col"].astype(pd.BooleanDtype()) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) def test_df_notnull(scalars_dfs): @@ -1339,7 +1430,7 @@ def test_df_notnull(scalars_dfs): pd_result["string_col"] = pd_result["string_col"].astype(pd.BooleanDtype()) pd_result["bool_col"] = pd_result["bool_col"].astype(pd.BooleanDtype()) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) @pytest.mark.parametrize( @@ -1559,7 +1650,7 @@ def test_scalar_binop(scalars_dfs, op, other_scalar, reverse_operands): bf_result = 
maybe_reversed_op(scalars_df[columns], other_scalar).to_pandas() pd_result = maybe_reversed_op(scalars_pandas_df[columns], other_scalar) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) @pytest.mark.parametrize(("other_scalar"), [1, -2]) @@ -1571,7 +1662,7 @@ def test_mod(scalars_dfs, other_scalar): bf_result = (scalars_df[["int64_col", "int64_too"]] % other_scalar).to_pandas() pd_result = scalars_pandas_df[["int64_col", "int64_too"]] % other_scalar - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) def test_scalar_binop_str_exception(scalars_dfs): @@ -1627,7 +1718,7 @@ def test_series_binop_axis_index( bf_result = op(scalars_df[df_columns], scalars_df[series_column]).to_pandas() pd_result = op(scalars_pandas_df[df_columns], scalars_pandas_df[series_column]) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) @pytest.mark.parametrize( @@ -1673,8 +1764,15 @@ def test_binop_df_df_binary_op( # Differnt table will only work for explicit index, since default index orders are arbitrary. +@pytest.mark.parametrize( + ("ordered"), + [ + (True), + (False), + ], +) def test_series_binop_add_different_table( - scalars_df_index, scalars_pandas_df_index, scalars_df_2_index + scalars_df_index, scalars_pandas_df_index, scalars_df_2_index, ordered ): df_columns = ["int64_col", "float64_col"] series_column = "int64_too" @@ -1682,25 +1780,20 @@ def test_series_binop_add_different_table( bf_result = ( scalars_df_index[df_columns] .add(scalars_df_2_index[series_column], axis="index") - .to_pandas() + .to_pandas(ordered=ordered) ) pd_result = scalars_pandas_df_index[df_columns].add( scalars_pandas_df_index[series_column], axis="index" ) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result, ignore_order=not ordered) # TODO(garrettwu): Test series binop with different index all_joins = pytest.mark.parametrize( ("how",), - ( - ("outer",), - ("left",), - ("right",), - ("inner",), - ), + (("outer",), ("left",), ("right",), ("inner",), ("cross",)), ) @@ -1714,7 +1807,7 @@ def test_join_same_table(scalars_dfs, how): pd_df_a = pd_df.set_index("int64_too")[["string_col", "int64_col"]] pd_df_b = pd_df.set_index("int64_too")[["float64_col"]] pd_result = pd_df_a.join(pd_df_b, how=how) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result, ignore_order=True) @all_joins @@ -1727,7 +1820,7 @@ def test_join_different_table( pd_df_a = scalars_pandas_df_index[["string_col", "int64_col"]] pd_df_b = scalars_pandas_df_index.dropna()[["float64_col"]] pd_result = pd_df_a.join(pd_df_b, how=how) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result, ignore_order=True) def test_join_duplicate_columns_raises_not_implemented(scalars_dfs): @@ -1745,13 +1838,18 @@ def test_join_param_on(scalars_dfs, how): bf_df_a = bf_df[["string_col", "int64_col", "rowindex_2"]] bf_df_a = bf_df_a.assign(rowindex_2=bf_df_a["rowindex_2"] + 2) bf_df_b = bf_df[["float64_col"]] - bf_result = bf_df_a.join(bf_df_b, on="rowindex_2", how=how).to_pandas() - pd_df_a = pd_df[["string_col", "int64_col", "rowindex_2"]] - pd_df_a = pd_df_a.assign(rowindex_2=pd_df_a["rowindex_2"] + 2) - pd_df_b = pd_df[["float64_col"]] - pd_result = pd_df_a.join(pd_df_b, on="rowindex_2", how=how) - assert_pandas_df_equal_ignore_ordering(bf_result, 
pd_result) + if how == "cross": + with pytest.raises(ValueError): + bf_df_a.join(bf_df_b, on="rowindex_2", how=how) + else: + bf_result = bf_df_a.join(bf_df_b, on="rowindex_2", how=how).to_pandas() + + pd_df_a = pd_df[["string_col", "int64_col", "rowindex_2"]] + pd_df_a = pd_df_a.assign(rowindex_2=pd_df_a["rowindex_2"] + 2) + pd_df_b = pd_df[["float64_col"]] + pd_result = pd_df_a.join(pd_df_b, on="rowindex_2", how=how) + assert_pandas_df_equal(bf_result, pd_result, ignore_order=True) @pytest.mark.parametrize( @@ -1967,7 +2065,14 @@ def test_df_describe(scalars_dfs): ).all() -def test_df_stack(scalars_dfs): +@pytest.mark.parametrize( + ("ordered"), + [ + (True), + (False), + ], +) +def test_df_stack(scalars_dfs, ordered): if pandas.__version__.startswith("1.") or pandas.__version__.startswith("2.0"): pytest.skip("pandas <2.1 uses different stack implementation") scalars_df, scalars_pandas_df = scalars_dfs @@ -1977,11 +2082,13 @@ def test_df_stack(scalars_dfs): # Can only stack identically-typed columns columns = ["int64_col", "int64_too", "rowindex_2"] - bf_result = scalars_df[columns].stack().to_pandas() + bf_result = scalars_df[columns].stack().to_pandas(ordered=ordered) pd_result = scalars_pandas_df[columns].stack(future_stack=True) # Pandas produces NaN, where bq dataframes produces pd.NA - pd.testing.assert_series_equal(bf_result, pd_result, check_dtype=False) + assert_series_equal( + bf_result, pd_result, check_dtype=False, ignore_order=not ordered + ) def test_df_melt_default(scalars_dfs): @@ -2027,7 +2134,14 @@ def test_df_melt_parameterized(scalars_dfs): ) -def test_df_unstack(scalars_dfs): +@pytest.mark.parametrize( + ("ordered"), + [ + (True), + (False), + ], +) +def test_df_unstack(scalars_dfs, ordered): scalars_df, scalars_pandas_df = scalars_dfs # To match bigquery dataframes scalars_pandas_df = scalars_pandas_df.copy() @@ -2040,11 +2154,13 @@ def test_df_unstack(scalars_dfs): ] # unstack on mono-index produces series - bf_result = scalars_df[columns].unstack().to_pandas() + bf_result = scalars_df[columns].unstack().to_pandas(ordered=ordered) pd_result = scalars_pandas_df[columns].unstack() # Pandas produces NaN, where bq dataframes produces pd.NA - pd.testing.assert_series_equal(bf_result, pd_result, check_dtype=False) + assert_series_equal( + bf_result, pd_result, check_dtype=False, ignore_order=not ordered + ) @pytest.mark.parametrize( @@ -2189,14 +2305,18 @@ def test_iloc_slice_zero_step(scalars_df_index): scalars_df_index.iloc[0:0:0] -def test_iloc_slice_nested(scalars_df_index, scalars_pandas_df_index): - bf_result = scalars_df_index.iloc[1:].iloc[1:].to_pandas() +@pytest.mark.parametrize( + ("ordered"), + [ + (True), + (False), + ], +) +def test_iloc_slice_nested(scalars_df_index, scalars_pandas_df_index, ordered): + bf_result = scalars_df_index.iloc[1:].iloc[1:].to_pandas(ordered=ordered) pd_result = scalars_pandas_df_index.iloc[1:].iloc[1:] - pd.testing.assert_frame_equal( - bf_result, - pd_result, - ) + assert_pandas_df_equal(bf_result, pd_result, ignore_order=not ordered) @pytest.mark.parametrize( @@ -2387,6 +2507,13 @@ def test_loc_setitem_bool_series_scalar_type_error(scalars_dfs): pd_df.loc[pd_df["int64_too"] == 1, "string_col"] = 99 +@pytest.mark.parametrize( + ("ordered"), + [ + (True), + (False), + ], +) @pytest.mark.parametrize( ("op"), [ @@ -2401,16 +2528,18 @@ def test_loc_setitem_bool_series_scalar_type_error(scalars_dfs): ], ids=["sum", "mean", "min", "max", "std", "var", "count", "nunique"], ) -def test_dataframe_aggregates(scalars_df_index, 
scalars_pandas_df_index, op): +def test_dataframe_aggregates(scalars_df_index, scalars_pandas_df_index, op, ordered): col_names = ["int64_too", "float64_col", "string_col", "int64_col", "bool_col"] bf_series = op(scalars_df_index[col_names]) pd_series = op(scalars_pandas_df_index[col_names]) - bf_result = bf_series.to_pandas() + bf_result = bf_series.to_pandas(ordered=ordered) # Pandas may produce narrower numeric types, but bigframes always produces Float64 pd_series = pd_series.astype("Float64") # Pandas has object index type - pd.testing.assert_series_equal(pd_series, bf_result, check_index_type=False) + assert_series_equal( + pd_series, bf_result, check_index_type=False, ignore_order=not ordered + ) @pytest.mark.parametrize( @@ -2501,16 +2630,25 @@ def test_df_skew_too_few_values(scalars_dfs): pd.testing.assert_series_equal(pd_result, bf_result, check_index_type=False) -def test_df_skew(scalars_dfs): +@pytest.mark.parametrize( + ("ordered"), + [ + (True), + (False), + ], +) +def test_df_skew(scalars_dfs, ordered): columns = ["float64_col", "int64_col"] scalars_df, scalars_pandas_df = scalars_dfs - bf_result = scalars_df[columns].skew().to_pandas() + bf_result = scalars_df[columns].skew().to_pandas(ordered=ordered) pd_result = scalars_pandas_df[columns].skew() # Pandas may produce narrower numeric types, but bigframes always produces Float64 pd_result = pd_result.astype("Float64") - pd.testing.assert_series_equal(pd_result, bf_result, check_index_type=False) + assert_series_equal( + pd_result, bf_result, check_index_type=False, ignore_order=not ordered + ) def test_df_kurt_too_few_values(scalars_dfs): @@ -2661,9 +2799,10 @@ def test_df_rows_filter_items(scalars_df_index, scalars_pandas_df_index): # Pandas uses int64 instead of Int64 (nullable) dtype. 
pd_result.index = pd_result.index.astype(pd.Int64Dtype()) # Ignore ordering as pandas order differently depending on version - assert_pandas_df_equal_ignore_ordering( + assert_pandas_df_equal( bf_result, pd_result, + ignore_order=True, check_names=False, ) diff --git a/tests/system/small/test_dataframe_io.py b/tests/system/small/test_dataframe_io.py index 8f5d706f62..fb9fb7bb89 100644 --- a/tests/system/small/test_dataframe_io.py +++ b/tests/system/small/test_dataframe_io.py @@ -19,10 +19,7 @@ import pyarrow as pa import pytest -from tests.system.utils import ( - assert_pandas_df_equal_ignore_ordering, - convert_pandas_dtypes, -) +from tests.system.utils import assert_pandas_df_equal, convert_pandas_dtypes try: import pandas_gbq # type: ignore @@ -83,6 +80,24 @@ def test_to_pandas_array_struct_correct_result(session): ) +def test_load_json(session): + df = session.read_gbq( + """SELECT + JSON_OBJECT('foo', 10, 'bar', TRUE) AS json_column + """ + ) + + result = df.to_pandas() + expected = pd.DataFrame( + { + "json_column": ['{"bar":true,"foo":10}'], + } + ) + expected.index = expected.index.astype("Int64") + pd.testing.assert_series_equal(result.dtypes, expected.dtypes) + pd.testing.assert_series_equal(result["json_column"], expected["json_column"]) + + def test_to_pandas_batches_w_correct_dtypes(scalars_df_default_index): """Verify to_pandas_batches() APIs returns the expected dtypes.""" expected = scalars_df_default_index.dtypes @@ -380,7 +395,7 @@ def test_to_sql_query_unnamed_index_included( pd_df = scalars_pandas_df_default_index.reset_index(drop=True) roundtrip = session.read_gbq(sql, index_col=idx_ids) roundtrip.index.names = [None] - assert_pandas_df_equal_ignore_ordering(roundtrip.to_pandas(), pd_df) + assert_pandas_df_equal(roundtrip.to_pandas(), pd_df, check_index_type=False) def test_to_sql_query_named_index_included( @@ -397,7 +412,7 @@ def test_to_sql_query_named_index_included( pd_df = scalars_pandas_df_default_index.set_index("rowindex_2", drop=True) roundtrip = session.read_gbq(sql, index_col=idx_ids) - assert_pandas_df_equal_ignore_ordering(roundtrip.to_pandas(), pd_df) + assert_pandas_df_equal(roundtrip.to_pandas(), pd_df) def test_to_sql_query_unnamed_index_excluded( @@ -412,7 +427,9 @@ def test_to_sql_query_unnamed_index_excluded( pd_df = scalars_pandas_df_default_index.reset_index(drop=True) roundtrip = session.read_gbq(sql) - assert_pandas_df_equal_ignore_ordering(roundtrip.to_pandas(), pd_df) + assert_pandas_df_equal( + roundtrip.to_pandas(), pd_df, check_index_type=False, ignore_order=True + ) def test_to_sql_query_named_index_excluded( @@ -429,4 +446,6 @@ def test_to_sql_query_named_index_excluded( "rowindex_2", drop=True ).reset_index(drop=True) roundtrip = session.read_gbq(sql) - assert_pandas_df_equal_ignore_ordering(roundtrip.to_pandas(), pd_df) + assert_pandas_df_equal( + roundtrip.to_pandas(), pd_df, check_index_type=False, ignore_order=True + ) diff --git a/tests/system/small/test_groupby.py b/tests/system/small/test_groupby.py index 05154f7ab7..a24713c2b3 100644 --- a/tests/system/small/test_groupby.py +++ b/tests/system/small/test_groupby.py @@ -16,6 +16,7 @@ import pytest import bigframes.pandas as bpd +from tests.system.utils import assert_pandas_df_equal @pytest.mark.parametrize( @@ -88,16 +89,23 @@ def test_dataframe_groupby_aggregate( pd.testing.assert_frame_equal(pd_result, bf_result_computed, check_dtype=False) -def test_dataframe_groupby_agg_string(scalars_df_index, scalars_pandas_df_index): +@pytest.mark.parametrize( + ("ordered"), + [ + (True), + 
(False), + ], +) +def test_dataframe_groupby_agg_string( + scalars_df_index, scalars_pandas_df_index, ordered +): col_names = ["int64_too", "float64_col", "int64_col", "bool_col", "string_col"] bf_result = scalars_df_index[col_names].groupby("string_col").agg("count") pd_result = scalars_pandas_df_index[col_names].groupby("string_col").agg("count") - bf_result_computed = bf_result.to_pandas() + bf_result_computed = bf_result.to_pandas(ordered=ordered) - pd.testing.assert_frame_equal( - pd_result, - bf_result_computed, - check_dtype=False, + assert_pandas_df_equal( + pd_result, bf_result_computed, check_dtype=False, ignore_order=not ordered ) @@ -270,13 +278,22 @@ def test_dataframe_groupby_kurt(scalars_df_index, scalars_pandas_df_index): pd.testing.assert_frame_equal(pd_result, bf_result, check_dtype=False) -def test_dataframe_groupby_diff(scalars_df_index, scalars_pandas_df_index): +@pytest.mark.parametrize( + ("ordered"), + [ + (True), + (False), + ], +) +def test_dataframe_groupby_diff(scalars_df_index, scalars_pandas_df_index, ordered): col_names = ["float64_col", "int64_col", "string_col"] bf_result = scalars_df_index[col_names].groupby("string_col").diff(-1) pd_result = scalars_pandas_df_index[col_names].groupby("string_col").diff(-1) - bf_result_computed = bf_result.to_pandas() + bf_result_computed = bf_result.to_pandas(ordered=ordered) - pd.testing.assert_frame_equal(pd_result, bf_result_computed, check_dtype=False) + assert_pandas_df_equal( + pd_result, bf_result_computed, check_dtype=False, ignore_order=not ordered + ) def test_dataframe_groupby_getitem( diff --git a/tests/system/small/test_multiindex.py b/tests/system/small/test_multiindex.py index bc35f633fd..e7e93849c6 100644 --- a/tests/system/small/test_multiindex.py +++ b/tests/system/small/test_multiindex.py @@ -16,7 +16,7 @@ import pytest import bigframes.pandas as bpd -from tests.system.utils import assert_pandas_df_equal_ignore_ordering +from tests.system.utils import assert_pandas_df_equal # Row Multi-index tests @@ -429,7 +429,7 @@ def test_multi_index_dataframe_join(scalars_dfs, how): (["bool_col", "rowindex_2"]) )[["float64_col"]] pd_result = pd_df_a.join(pd_df_b, how=how) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result, ignore_order=True) @all_joins @@ -450,7 +450,7 @@ def test_multi_index_dataframe_join_on(scalars_dfs, how): pd_df_a = pd_df_a.assign(rowindex_2=pd_df_a["rowindex_2"] + 2) pd_df_b = pd_df[["float64_col"]] pd_result = pd_df_a.join(pd_df_b, on="rowindex_2", how=how) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result, ignore_order=True) @pytest.mark.parametrize( diff --git a/tests/system/small/test_pandas.py b/tests/system/small/test_pandas.py index 0292ebd206..a1079288cf 100644 --- a/tests/system/small/test_pandas.py +++ b/tests/system/small/test_pandas.py @@ -16,16 +16,23 @@ import pytest import bigframes.pandas as bpd -from tests.system.utils import assert_pandas_df_equal_ignore_ordering +from tests.system.utils import assert_pandas_df_equal -def test_concat_dataframe(scalars_dfs): +@pytest.mark.parametrize( + ("ordered"), + [ + (True), + (False), + ], +) +def test_concat_dataframe(scalars_dfs, ordered): scalars_df, scalars_pandas_df = scalars_dfs bf_result = bpd.concat(11 * [scalars_df]) - bf_result = bf_result.to_pandas() + bf_result = bf_result.to_pandas(ordered=ordered) pd_result = pd.concat(11 * [scalars_pandas_df]) - pd.testing.assert_frame_equal(bf_result, pd_result) + 
assert_pandas_df_equal(bf_result, pd_result, ignore_order=not ordered) def test_concat_series(scalars_dfs): @@ -252,7 +259,7 @@ def test_merge(scalars_dfs, merge_how): sort=True, ) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result, ignore_order=True) @pytest.mark.parametrize( @@ -286,7 +293,28 @@ def test_merge_left_on_right_on(scalars_dfs, merge_how): sort=True, ) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result, ignore_order=True) + + +def test_pd_merge_cross(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + left_columns = ["int64_col", "float64_col", "int64_too"] + right_columns = ["int64_col", "bool_col", "string_col", "rowindex_2"] + + left = scalars_df[left_columns] + right = scalars_df[right_columns] + + df = bpd.merge(left, right, "cross", sort=True) + bf_result = df.to_pandas() + + pd_result = pd.merge( + scalars_pandas_df[left_columns], + scalars_pandas_df[right_columns], + "cross", + sort=True, + ) + + pd.testing.assert_frame_equal(bf_result, pd_result, check_index_type=False) @pytest.mark.parametrize( @@ -320,7 +348,7 @@ def test_merge_series(scalars_dfs, merge_how): sort=True, ) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result, ignore_order=True) def test_cut(scalars_dfs): diff --git a/tests/system/small/test_pandas_options.py b/tests/system/small/test_pandas_options.py index ca67710d4e..c410d70fe7 100644 --- a/tests/system/small/test_pandas_options.py +++ b/tests/system/small/test_pandas_options.py @@ -74,7 +74,7 @@ def test_read_gbq_start_sets_session_location( # Now read_gbq* from another location should fail with pytest.raises( - google.api_core.exceptions.NotFound, + (google.api_core.exceptions.NotFound, ValueError), match=dataset_id_permanent, ): read_method(query) @@ -99,7 +99,7 @@ def test_read_gbq_start_sets_session_location( # Now read_gbq* from another location should fail with pytest.raises( - google.api_core.exceptions.NotFound, + (google.api_core.exceptions.NotFound, ValueError), match=dataset_id_permanent_tokyo, ): read_method(query_tokyo) @@ -145,7 +145,7 @@ def test_read_gbq_after_session_start_must_comply_with_default_location( # Doing read_gbq* from a table in another location should fail with pytest.raises( - google.api_core.exceptions.NotFound, + (google.api_core.exceptions.NotFound, ValueError), match=dataset_id_permanent_tokyo, ): read_method(query_tokyo) @@ -193,7 +193,7 @@ def test_read_gbq_must_comply_with_set_location_US( # Starting user journey with read_gbq* from another location should fail with pytest.raises( - google.api_core.exceptions.NotFound, + (google.api_core.exceptions.NotFound, ValueError), match=dataset_id_permanent_tokyo, ): read_method(query_tokyo) @@ -243,7 +243,7 @@ def test_read_gbq_must_comply_with_set_location_non_US( # Starting user journey with read_gbq* from another location should fail with pytest.raises( - google.api_core.exceptions.NotFound, + (google.api_core.exceptions.NotFound, ValueError), match=dataset_id_permanent, ): read_method(query) diff --git a/tests/system/small/test_progress_bar.py b/tests/system/small/test_progress_bar.py index 30ea63b483..bd13ac2240 100644 --- a/tests/system/small/test_progress_bar.py +++ b/tests/system/small/test_progress_bar.py @@ -52,14 +52,6 @@ def test_progress_bar_scalar(penguins_df_default_index: bf.dataframe.DataFrame, assert_loading_msg_exist(capsys.readouterr().out) -def 
test_progress_bar_read_gbq(session: bf.Session, penguins_table_id: str, capsys): - bf.options.display.progress_bar = "terminal" - capsys.readouterr() # clear output - session.read_gbq(penguins_table_id) - - assert_loading_msg_exist(capsys.readouterr().out) - - def test_progress_bar_extract_jobs( penguins_df_default_index: bf.dataframe.DataFrame, gcs_folder, capsys ): @@ -98,9 +90,6 @@ def assert_loading_msg_exist(capystOut: str, pattern=job_load_message_regex): def test_query_job_repr_html(penguins_df_default_index: bf.dataframe.DataFrame): bf.options.display.progress_bar = "terminal" - penguins_df_default_index._block._expr.session.bqclient.default_query_job_config.use_query_cache = ( - False - ) penguins_df_default_index.to_pandas() query_job_repr = formatting_helpers.repr_query_job_html( penguins_df_default_index.query_job @@ -117,9 +106,6 @@ def test_query_job_repr_html(penguins_df_default_index: bf.dataframe.DataFrame): def test_query_job_repr(penguins_df_default_index: bf.dataframe.DataFrame): - penguins_df_default_index._block._expr.session.bqclient.default_query_job_config.use_query_cache = ( - False - ) penguins_df_default_index.to_pandas() query_job_repr = formatting_helpers.repr_query_job( penguins_df_default_index.query_job diff --git a/tests/system/small/test_remote_function.py b/tests/system/small/test_remote_function.py index 89907a53df..3d8532a13b 100644 --- a/tests/system/small/test_remote_function.py +++ b/tests/system/small/test_remote_function.py @@ -18,7 +18,7 @@ import bigframes from bigframes import remote_function as rf -from tests.system.utils import assert_pandas_df_equal_ignore_ordering +from tests.system.utils import assert_pandas_df_equal @pytest.fixture(scope="module") @@ -121,7 +121,7 @@ def square(x): pd_result_col = pd_result_col.astype(pd.Int64Dtype()) pd_result = pd_int64_col_filtered.to_frame().assign(result=pd_result_col) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) @pytest.mark.flaky(retries=2, delay=120) @@ -170,7 +170,7 @@ def square(x): pd_result_col = pd_result_col.astype(pd.Int64Dtype()) pd_result = pd_int64_col_filtered.to_frame().assign(result=pd_result_col) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) @pytest.mark.flaky(retries=2, delay=120) @@ -246,7 +246,7 @@ def square(x): pd_result_col = pd_result_col.astype(pd.Int64Dtype()) pd_result = pd_int64_col_filtered.to_frame().assign(result=pd_result_col) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) @pytest.mark.flaky(retries=2, delay=120) @@ -309,7 +309,7 @@ def square(x): pd_result_col = pd_result_col.astype(pd.Int64Dtype()) pd_result = pd_int64_col_filtered.to_frame().assign(result=pd_result_col) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) @pytest.mark.flaky(retries=2, delay=120) @@ -348,7 +348,7 @@ def square(x): pd_result_col = pd_result_col.astype(pd.Int64Dtype()) pd_result = pd_int64_col_filtered.to_frame().assign(result=pd_result_col) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) @pytest.mark.flaky(retries=2, delay=120) @@ -387,7 +387,7 @@ def square(x): pd_result_col = pd_result_col.astype(pd.Int64Dtype()) pd_result = pd_int64_col_filtered.to_frame().assign(result=pd_result_col) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + 
assert_pandas_df_equal(bf_result, pd_result) @pytest.mark.flaky(retries=2, delay=120) @@ -418,7 +418,7 @@ def add_one(x): for col in pd_result: pd_result[col] = pd_result[col].astype(pd_int64_df_filtered[col].dtype) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) @pytest.mark.flaky(retries=2, delay=120) @@ -447,7 +447,7 @@ def add_one(x): for col in pd_result: pd_result[col] = pd_result[col].astype(pd_int64_df[col].dtype) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) @pytest.mark.flaky(retries=2, delay=120) @@ -535,7 +535,7 @@ def square1(x): s2_result_col = int64_col_filtered.apply(square2) s2_result = int64_col_filtered.to_frame().assign(result=s2_result_col) - assert_pandas_df_equal_ignore_ordering(s1_result.to_pandas(), s2_result.to_pandas()) + assert_pandas_df_equal(s1_result.to_pandas(), s2_result.to_pandas()) @pytest.mark.flaky(retries=2, delay=120) @@ -583,7 +583,9 @@ def test_read_gbq_function_reads_udfs(bigquery_client, dataset_id): indirect_df = indirect_df.assign(y=indirect_df.x.apply(square)) indirect_df = indirect_df.to_pandas() - assert_pandas_df_equal_ignore_ordering(direct_df, indirect_df) + assert_pandas_df_equal( + direct_df, indirect_df, ignore_order=True, check_index_type=False + ) @pytest.mark.flaky(retries=2, delay=120) diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index 183ba01c0e..d9fc23fad0 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -24,10 +24,7 @@ import bigframes.pandas import bigframes.series as series -from tests.system.utils import ( - assert_pandas_df_equal_ignore_ordering, - assert_series_equal_ignoring_order, -) +from tests.system.utils import assert_pandas_df_equal, assert_series_equal def test_series_construct_copy(scalars_dfs): @@ -210,7 +207,7 @@ def test_abs(scalars_dfs, col_name): bf_result = scalars_df[col_name].abs().to_pandas() pd_result = scalars_pandas_df[col_name].abs() - assert_series_equal_ignoring_order(pd_result, bf_result) + assert_series_equal(pd_result, bf_result) def test_fillna(scalars_dfs): @@ -218,7 +215,7 @@ def test_fillna(scalars_dfs): col_name = "string_col" bf_result = scalars_df[col_name].fillna("Missing").to_pandas() pd_result = scalars_pandas_df[col_name].fillna("Missing") - assert_series_equal_ignoring_order( + assert_series_equal( pd_result, bf_result, ) @@ -273,21 +270,26 @@ def test_series_replace_list_scalar(scalars_dfs): @pytest.mark.parametrize( - ("values",), + ("method",), ( - ([None, 1, 2, None, None, 16, None],), - ([None, None, 3.6, None],), - ([403.2, None, 352.1, None, None, 111.9],), + ("linear",), + ("values",), + ("slinear",), + ("nearest",), + ("zero",), + ("pad",), ), ) -def test_series_interpolate(values): - pd_series = pd.Series(values) +def test_series_interpolate(method): + values = [None, 1, 2, None, None, 16, None] + index = [-3.2, 11.4, 3.56, 4, 4.32, 5.55, 76.8] + pd_series = pd.Series(values, index) bf_series = series.Series(pd_series) # Pandas can only interpolate on "float64" columns # https://github.com/pandas-dev/pandas/issues/40252 - pd_result = pd_series.astype("float64").interpolate() - bf_result = bf_series.interpolate().to_pandas() + pd_result = pd_series.astype("float64").interpolate(method=method) + bf_result = bf_series.interpolate(method=method).to_pandas() # pd uses non-null types, while bf uses nullable types pd.testing.assert_series_equal( @@ 
-491,7 +493,7 @@ def test_series_int_int_operators_scalar( bf_result = maybe_reversed_op(scalars_df["int64_col"], other_scalar).to_pandas() pd_result = maybe_reversed_op(scalars_pandas_df["int64_col"], other_scalar) - assert_series_equal_ignoring_order(pd_result, bf_result) + assert_series_equal(pd_result, bf_result) def test_series_pow_scalar(scalars_dfs): @@ -500,7 +502,7 @@ def test_series_pow_scalar(scalars_dfs): bf_result = (scalars_df["int64_col"] ** 2).to_pandas() pd_result = scalars_pandas_df["int64_col"] ** 2 - assert_series_equal_ignoring_order(pd_result, bf_result) + assert_series_equal(pd_result, bf_result) def test_series_pow_scalar_reverse(scalars_dfs): @@ -509,7 +511,7 @@ def test_series_pow_scalar_reverse(scalars_dfs): bf_result = (0.8 ** scalars_df["int64_col"]).to_pandas() pd_result = 0.8 ** scalars_pandas_df["int64_col"] - assert_series_equal_ignoring_order(pd_result, bf_result) + assert_series_equal(pd_result, bf_result) @pytest.mark.parametrize( @@ -535,7 +537,7 @@ def test_series_bool_bool_operators_scalar( bf_result = maybe_reversed_op(scalars_df["bool_col"], other_scalar).to_pandas() pd_result = maybe_reversed_op(scalars_pandas_df["bool_col"], other_scalar) - assert_series_equal_ignoring_order(pd_result.astype(pd.BooleanDtype()), bf_result) + assert_series_equal(pd_result.astype(pd.BooleanDtype()), bf_result) @pytest.mark.parametrize( @@ -573,7 +575,7 @@ def test_series_int_int_operators_series(scalars_dfs, operator): scalars_df, scalars_pandas_df = scalars_dfs bf_result = operator(scalars_df["int64_col"], scalars_df["int64_too"]).to_pandas() pd_result = operator(scalars_pandas_df["int64_col"], scalars_pandas_df["int64_too"]) - assert_series_equal_ignoring_order(pd_result, bf_result) + assert_series_equal(pd_result, bf_result) @pytest.mark.parametrize( @@ -723,7 +725,7 @@ def test_series_add_scalar(scalars_dfs, other): bf_result = (scalars_df["float64_col"] + other).to_pandas() pd_result = scalars_pandas_df["float64_col"] + other - assert_series_equal_ignoring_order(pd_result, bf_result) + assert_series_equal(pd_result, bf_result) @pytest.mark.parametrize( @@ -739,7 +741,7 @@ def test_series_add_bigframes_series(scalars_dfs, left_col, right_col): bf_result = (scalars_df[left_col] + scalars_df[right_col]).to_pandas() pd_result = scalars_pandas_df[left_col] + scalars_pandas_df[right_col] - assert_series_equal_ignoring_order(pd_result, bf_result) + assert_series_equal(pd_result, bf_result) @pytest.mark.parametrize( @@ -761,7 +763,7 @@ def test_series_add_bigframes_series_nested( scalars_pandas_df[left_col] + scalars_pandas_df[right_col] ) + scalars_pandas_df[righter_col] - assert_series_equal_ignoring_order(pd_result, bf_result) + assert_series_equal(pd_result, bf_result) def test_series_add_different_table_default_index( @@ -919,7 +921,7 @@ def test_isnull(scalars_dfs): # One of dtype mismatches to be documented. Here, the `bf_series.dtype` is `BooleanDtype` but # the `pd_series.dtype` is `bool`. - assert_series_equal_ignoring_order(pd_series.astype(pd.BooleanDtype()), bf_series) + assert_series_equal(pd_series.astype(pd.BooleanDtype()), bf_series) def test_notnull(scalars_dfs): @@ -930,7 +932,7 @@ def test_notnull(scalars_dfs): # One of dtype mismatches to be documented. Here, the `bf_series.dtype` is `BooleanDtype` but # the `pd_series.dtype` is `bool`. 
- assert_series_equal_ignoring_order(pd_series.astype(pd.BooleanDtype()), bf_series) + assert_series_equal(pd_series.astype(pd.BooleanDtype()), bf_series) def test_round(scalars_dfs): @@ -939,7 +941,7 @@ def test_round(scalars_dfs): bf_result = scalars_df[col_name].round().to_pandas() pd_result = scalars_pandas_df[col_name].round() - assert_series_equal_ignoring_order(pd_result, bf_result) + assert_series_equal(pd_result, bf_result) def test_eq_scalar(scalars_dfs): @@ -948,7 +950,7 @@ def test_eq_scalar(scalars_dfs): bf_result = scalars_df[col_name].eq(0).to_pandas() pd_result = scalars_pandas_df[col_name].eq(0) - assert_series_equal_ignoring_order(pd_result, bf_result) + assert_series_equal(pd_result, bf_result) def test_eq_wider_type_scalar(scalars_dfs): @@ -957,7 +959,7 @@ def test_eq_wider_type_scalar(scalars_dfs): bf_result = scalars_df[col_name].eq(1.0).to_pandas() pd_result = scalars_pandas_df[col_name].eq(1.0) - assert_series_equal_ignoring_order(pd_result, bf_result) + assert_series_equal(pd_result, bf_result) def test_ne_scalar(scalars_dfs): @@ -966,7 +968,7 @@ def test_ne_scalar(scalars_dfs): bf_result = (scalars_df[col_name] != 0).to_pandas() pd_result = scalars_pandas_df[col_name] != 0 - assert_series_equal_ignoring_order(pd_result, bf_result) + assert_series_equal(pd_result, bf_result) def test_eq_int_scalar(scalars_dfs): @@ -975,7 +977,7 @@ def test_eq_int_scalar(scalars_dfs): bf_result = (scalars_df[col_name] == 0).to_pandas() pd_result = scalars_pandas_df[col_name] == 0 - assert_series_equal_ignoring_order(pd_result, bf_result) + assert_series_equal(pd_result, bf_result) @pytest.mark.parametrize( @@ -994,7 +996,7 @@ def test_eq_same_type_series(scalars_dfs, col_name): # One of dtype mismatches to be documented. Here, the `bf_series.dtype` is `BooleanDtype` but # the `pd_series.dtype` is `bool`. - assert_series_equal_ignoring_order(pd_result.astype(pd.BooleanDtype()), bf_result) + assert_series_equal(pd_result.astype(pd.BooleanDtype()), bf_result) def test_loc_setitem_cell(scalars_df_index, scalars_pandas_df_index): @@ -1012,6 +1014,17 @@ def test_loc_setitem_cell(scalars_df_index, scalars_pandas_df_index): pd.testing.assert_series_equal(bf_original.to_pandas(), pd_original) +def test_at_setitem_row_label_scalar(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_series = scalars_df["int64_col"] + pd_series = scalars_pandas_df["int64_col"].copy() + bf_series.at[1] = 1000 + pd_series.at[1] = 1000 + bf_result = bf_series.to_pandas() + pd_result = pd_series.astype("Int64") + pd.testing.assert_series_equal(bf_result, pd_result) + + def test_ne_obj_series(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs col_name = "string_col" @@ -1020,7 +1033,7 @@ def test_ne_obj_series(scalars_dfs): # One of dtype mismatches to be documented. Here, the `bf_series.dtype` is `BooleanDtype` but # the `pd_series.dtype` is `bool`. 
- assert_series_equal_ignoring_order(pd_result.astype(pd.BooleanDtype()), bf_result) + assert_series_equal(pd_result.astype(pd.BooleanDtype()), bf_result) def test_indexing_using_unselected_series(scalars_dfs): @@ -1029,7 +1042,7 @@ def test_indexing_using_unselected_series(scalars_dfs): bf_result = scalars_df[col_name][scalars_df["int64_too"].eq(0)].to_pandas() pd_result = scalars_pandas_df[col_name][scalars_pandas_df["int64_too"].eq(0)] - assert_series_equal_ignoring_order( + assert_series_equal( pd_result, bf_result, ) @@ -1045,7 +1058,7 @@ def test_indexing_using_selected_series(scalars_dfs): scalars_pandas_df["string_col"].eq("Hello, World!") ] - assert_series_equal_ignoring_order( + assert_series_equal( pd_result, bf_result, ) @@ -1067,7 +1080,7 @@ def test_nested_filter(scalars_dfs): ) # Convert from nullable bool to nonnullable bool usable as indexer pd_result = pd_string_col[pd_int64_too == 0][~pd_bool_col] - assert_series_equal_ignoring_order( + assert_series_equal( pd_result, bf_result, ) @@ -1086,7 +1099,7 @@ def test_binop_repeated_application_does_row_identity_joins(scalars_dfs): bf_result = bf_series.to_pandas() pd_result = pd_series - assert_series_equal_ignoring_order( + assert_series_equal( bf_result, pd_result, ) @@ -1108,10 +1121,9 @@ def test_binop_opposite_filters(scalars_dfs): pd_bool_col = scalars_pandas_df["bool_col"] pd_result = pd_int64_col1[pd_bool_col] + pd_int64_col2[pd_bool_col.__invert__()] - assert_series_equal_ignoring_order( - bf_result, - pd_result, - ) + # Passes with ignore_order=False only with some dependency sets + # TODO: Determine desired behavior and make test more strict + assert_series_equal(bf_result, pd_result, ignore_order=True) def test_binop_left_filtered(scalars_dfs): @@ -1126,10 +1138,9 @@ def test_binop_left_filtered(scalars_dfs): pd_bool_col = scalars_pandas_df["bool_col"] pd_result = pd_int64_col[pd_bool_col] + pd_float64_col - assert_series_equal_ignoring_order( - bf_result, - pd_result, - ) + # Passes with ignore_order=False only with some dependency sets + # TODO: Determine desired behavior and make test more strict + assert_series_equal(bf_result, pd_result, ignore_order=True) def test_binop_right_filtered(scalars_dfs): @@ -1144,7 +1155,7 @@ def test_binop_right_filtered(scalars_dfs): pd_bool_col = scalars_pandas_df["bool_col"] pd_result = pd_float64_col + pd_int64_col[pd_bool_col] - assert_series_equal_ignoring_order( + assert_series_equal( bf_result, pd_result, ) @@ -1249,7 +1260,7 @@ def test_groupby_sum(scalars_dfs): ) # TODO(swast): Update groupby to use index based on group by key(s). bf_result = bf_series.to_pandas() - assert_series_equal_ignoring_order( + assert_series_equal( pd_series, bf_result, check_exact=False, @@ -1267,7 +1278,7 @@ def test_groupby_std(scalars_dfs): .astype(pd.Float64Dtype()) ) bf_result = bf_series.to_pandas() - assert_series_equal_ignoring_order( + assert_series_equal( pd_series, bf_result, check_exact=False, @@ -1282,7 +1293,7 @@ def test_groupby_var(scalars_dfs): scalars_pandas_df[col_name].groupby(scalars_pandas_df["string_col"]).var() ) bf_result = bf_series.to_pandas() - assert_series_equal_ignoring_order( + assert_series_equal( pd_series, bf_result, check_exact=False, @@ -1334,7 +1345,7 @@ def test_groupby_mean(scalars_dfs): ) # TODO(swast): Update groupby to use index based on group by key(s). 
bf_result = bf_series.to_pandas() - assert_series_equal_ignoring_order( + assert_series_equal( pd_series, bf_result, ) @@ -1372,7 +1383,7 @@ def test_groupby_prod(scalars_dfs): ) # TODO(swast): Update groupby to use index based on group by key(s). bf_result = bf_series.to_pandas() - assert_series_equal_ignoring_order( + assert_series_equal( pd_series, bf_result, ) @@ -1582,7 +1593,7 @@ def test_head(scalars_dfs): bf_result = scalars_df["string_col"].head(2).to_pandas() pd_result = scalars_pandas_df["string_col"].head(2) - assert_series_equal_ignoring_order( + assert_series_equal( pd_result, bf_result, ) @@ -1597,7 +1608,7 @@ def test_tail(scalars_dfs): bf_result = scalars_df["string_col"].tail(2).to_pandas() pd_result = scalars_pandas_df["string_col"].tail(2) - assert_series_equal_ignoring_order( + assert_series_equal( pd_result, bf_result, ) @@ -2065,11 +2076,7 @@ def test_series_filter_items(scalars_df_index, scalars_pandas_df_index): # Pandas uses int64 instead of Int64 (nullable) dtype. pd_result.index = pd_result.index.astype(pd.Int64Dtype()) # Ignore ordering as pandas order differently depending on version - assert_series_equal_ignoring_order( - bf_result, - pd_result, - check_names=False, - ) + assert_series_equal(bf_result, pd_result, check_names=False, ignore_order=True) def test_series_filter_like(scalars_df_index, scalars_pandas_df_index): @@ -2197,21 +2204,25 @@ def test_where_with_default(scalars_df_index, scalars_pandas_df_index): ) -def test_clip(scalars_df_index, scalars_pandas_df_index): +@pytest.mark.parametrize( + ("ordered"), + [ + (True), + (False), + ], +) +def test_clip(scalars_df_index, scalars_pandas_df_index, ordered): col_bf = scalars_df_index["int64_col"] lower_bf = scalars_df_index["int64_too"] - 1 upper_bf = scalars_df_index["int64_too"] + 1 - bf_result = col_bf.clip(lower_bf, upper_bf).to_pandas() + bf_result = col_bf.clip(lower_bf, upper_bf).to_pandas(ordered=ordered) col_pd = scalars_pandas_df_index["int64_col"] lower_pd = scalars_pandas_df_index["int64_too"] - 1 upper_pd = scalars_pandas_df_index["int64_too"] + 1 pd_result = col_pd.clip(lower_pd, upper_pd) - pd.testing.assert_series_equal( - bf_result, - pd_result, - ) + assert_series_equal(bf_result, pd_result, ignore_order=not ordered) def test_clip_filtered_two_sided(scalars_df_index, scalars_pandas_df_index): @@ -2282,7 +2293,7 @@ def test_to_frame(scalars_dfs): bf_result = scalars_df["int64_col"].to_frame().to_pandas() pd_result = scalars_pandas_df["int64_col"].to_frame() - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) def test_to_json(scalars_df_index, scalars_pandas_df_index): @@ -2450,7 +2461,7 @@ def test_mask_default_value(scalars_dfs): pd_col_masked = pd_col.mask(pd_col % 2 == 1) pd_result = pd_col.to_frame().assign(int64_col_masked=pd_col_masked) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) def test_mask_custom_value(scalars_dfs): @@ -2468,7 +2479,7 @@ def test_mask_custom_value(scalars_dfs): # odd so should be left as is, but it is being masked in pandas. 
# Accidentally the bigframes bahavior matches, but it should be updated # after the resolution of https://github.com/pandas-dev/pandas/issues/52955 - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) @pytest.mark.parametrize( @@ -2487,6 +2498,7 @@ def test_mask_custom_value(scalars_dfs): # with timezone conversions, so we'll allow it. ("timestamp_col", pd.ArrowDtype(pa.timestamp("us"))), ("datetime_col", pd.ArrowDtype(pa.timestamp("us", tz="UTC"))), + ("date_col", "string[pyarrow]"), # TODO(bmil): fix Ibis bug: BigQuery backend rounds to nearest int # ("float64_col", "Int64"), # TODO(bmil): decide whether to fix Ibis bug: BigQuery backend @@ -2564,7 +2576,7 @@ def test_loc_bool_series_default_index( scalars_pandas_df_default_index.bool_col ] - assert_pandas_df_equal_ignore_ordering( + assert_pandas_df_equal( bf_result.to_frame(), pd_result.to_frame(), ) @@ -2910,3 +2922,30 @@ def test_map_series_input_duplicates_error(scalars_dfs): scalars_pandas_df.int64_too.map(pd_map_series) with pytest.raises(pd.errors.InvalidIndexError): scalars_df.int64_too.map(bf_map_series, verify_integrity=True) + + +@pytest.mark.parametrize( + ("frac", "n", "random_state"), + [ + (None, 4, None), + (0.5, None, None), + (None, 4, 10), + (0.5, None, 10), + (None, None, None), + ], + ids=[ + "n_wo_random_state", + "frac_wo_random_state", + "n_w_random_state", + "frac_w_random_state", + "n_default", + ], +) +def test_sample(scalars_dfs, frac, n, random_state): + scalars_df, _ = scalars_dfs + df = scalars_df.int64_col.sample(frac=frac, n=n, random_state=random_state) + bf_result = df.to_pandas() + + n = 1 if n is None else n + expected_sample_size = round(frac * scalars_df.shape[0]) if frac is not None else n + assert bf_result.shape[0] == expected_sample_size diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py index bf72e444eb..7cd9f1dd59 100644 --- a/tests/system/small/test_session.py +++ b/tests/system/small/test_session.py @@ -19,7 +19,6 @@ import typing from typing import List -import google.api_core.exceptions import google.cloud.bigquery as bigquery import numpy as np import pandas as pd @@ -985,26 +984,3 @@ def test_read_json_gcs_default_engine(session, scalars_dfs, gcs_folder): assert df.shape[0] == scalars_df.shape[0] pd.testing.assert_series_equal(df.dtypes, scalars_df.dtypes) - - -def test_session_id(session): - assert session._session_id is not None - - # BQ client always runs query within the opened session. - query_job = session.bqclient.query("SELECT 1") - assert query_job.session_info.session_id == session._session_id - - # TODO(chelsealin): Verify the session id can be binded with a load job. - - -@pytest.mark.flaky(retries=2) -def test_to_close_session(): - session = bigframes.Session() - assert session._session_id is not None - session.close() - assert session._session_id is None - - # Session has expired and is no longer available. - with pytest.raises(google.api_core.exceptions.BadRequest): - query_job = session.bqclient.query("SELECT 1") - query_job.result() # blocks until finished diff --git a/tests/system/utils.py b/tests/system/utils.py index e2daf3b8bf..f7831972b8 100644 --- a/tests/system/utils.py +++ b/tests/system/utils.py @@ -21,29 +21,33 @@ import pyarrow as pa # type: ignore -def assert_pandas_df_equal_ignore_ordering(df0, df1, **kwargs): - # Sort by a column to get consistent results. 
- if df0.index.name != "rowindex": - df0 = df0.sort_values( - list(df0.columns.drop("geography_col", errors="ignore")) - ).reset_index(drop=True) - df1 = df1.sort_values( - list(df1.columns.drop("geography_col", errors="ignore")) - ).reset_index(drop=True) - else: - df0 = df0.sort_index() - df1 = df1.sort_index() +def assert_pandas_df_equal(df0, df1, ignore_order: bool = False, **kwargs): + if ignore_order: + # Sort by a column to get consistent results. + if df0.index.name != "rowindex": + df0 = df0.sort_values( + list(df0.columns.drop("geography_col", errors="ignore")) + ).reset_index(drop=True) + df1 = df1.sort_values( + list(df1.columns.drop("geography_col", errors="ignore")) + ).reset_index(drop=True) + else: + df0 = df0.sort_index() + df1 = df1.sort_index() pd.testing.assert_frame_equal(df0, df1, **kwargs) -def assert_series_equal_ignoring_order(left: pd.Series, right: pd.Series, **kwargs): - if left.index.name is None: - left = left.sort_values().reset_index(drop=True) - right = right.sort_values().reset_index(drop=True) - else: - left = left.sort_index() - right = right.sort_index() +def assert_series_equal( + left: pd.Series, right: pd.Series, ignore_order: bool = False, **kwargs +): + if ignore_order: + if left.index.name is None: + left = left.sort_values().reset_index(drop=True) + right = right.sort_values().reset_index(drop=True) + else: + left = left.sort_index() + right = right.sort_index() pd.testing.assert_series_equal(left, right, **kwargs) diff --git a/tests/unit/core/test_log_adapter.py b/tests/unit/core/test_log_adapter.py new file mode 100644 index 0000000000..376b7f2075 --- /dev/null +++ b/tests/unit/core/test_log_adapter.py @@ -0,0 +1,60 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
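+
+# These unit tests exercise the log_adapter.class_logger decorator: it records
+# the name of each decorated method as it is called (these names are later
+# attached to BigQuery job labels) and caps the stored list at
+# MAX_LABELS_COUNT (64) entries.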
+ +import pytest + +from bigframes.core import log_adapter + +MAX_LABELS_COUNT = 64 + + +@pytest.fixture +def test_instance(): + # Create a simple class for testing + @log_adapter.class_logger + class TestClass: + def method1(self): + pass + + def method2(self): + pass + + return TestClass() + + +def test_method_logging(test_instance): + test_instance.method1() + test_instance.method2() + + # Check if the methods were added to the _api_methods list + api_methods = log_adapter.get_and_reset_api_methods() + assert api_methods is not None + assert "method1" in api_methods + assert "method2" in api_methods + + +def test_add_api_method_limit(test_instance): + # Ensure that add_api_method correctly adds a method to _api_methods + for i in range(70): + test_instance.method2() + assert len(log_adapter._api_methods) == MAX_LABELS_COUNT + + +def test_get_and_reset_api_methods(test_instance): + # Ensure that get_and_reset_api_methods returns a copy and resets the list + test_instance.method1() + test_instance.method2() + previous_methods = log_adapter.get_and_reset_api_methods() + assert previous_methods is not None + assert log_adapter._api_methods == [] diff --git a/tests/unit/ml/test_golden_sql.py b/tests/unit/ml/test_golden_sql.py index 3ca7e144a5..017c96d46d 100644 --- a/tests/unit/ml/test_golden_sql.py +++ b/tests/unit/ml/test_golden_sql.py @@ -23,21 +23,46 @@ from bigframes.ml import core, linear_model import bigframes.pandas as bpd +TEMP_MODEL_ID = bigquery.ModelReference.from_string( + "test-project._anon123.temp_model_id" +) + @pytest.fixture def mock_session(): mock_session = mock.create_autospec(spec=bigframes.Session) - # return values we don't care about, but need to provide to continue the program when calling session._start_query() - mock_session._start_query.return_value = (None, mock.MagicMock()) + mock_session._anonymous_dataset = bigquery.DatasetReference( + TEMP_MODEL_ID.project, TEMP_MODEL_ID.dataset_id + ) + + query_job = mock.create_autospec(bigquery.QueryJob) + type(query_job).destination = mock.PropertyMock( + return_value=bigquery.TableReference( + mock_session._anonymous_dataset, TEMP_MODEL_ID.model_id + ) + ) + mock_session._start_query.return_value = (None, query_job) return mock_session +@pytest.fixture +def bqml_model_factory(mocker: pytest_mock.MockerFixture): + mocker.patch( + "bigframes.ml.core.BqmlModelFactory._create_model_ref", + return_value=TEMP_MODEL_ID, + ) + bqml_model_factory = core.BqmlModelFactory() + + return bqml_model_factory + + @pytest.fixture def mock_y(): mock_y = mock.create_autospec(spec=bpd.DataFrame) mock_y.columns = pd.Index(["input_column_label"]) + mock_y._cached.return_value = mock_y return mock_y @@ -57,21 +82,11 @@ def mock_X(mock_y, mock_session): ["index_column_id"], ["index_column_label"], ) + mock_X._cached.return_value = mock_X return mock_X -@pytest.fixture -def bqml_model_factory(mocker: pytest_mock.MockerFixture): - mocker.patch( - "bigframes.ml.core.BqmlModelFactory._create_temp_model_id", - return_value="temp_model_id", - ) - bqml_model_factory = core.BqmlModelFactory() - - return bqml_model_factory - - @pytest.fixture def bqml_model(mock_session): bqml_model = core.BqmlModel( @@ -89,7 +104,7 @@ def test_linear_regression_default_fit( model.fit(mock_X, mock_y) mock_session._start_query.assert_called_once_with( - 'CREATE TEMP MODEL `temp_model_id`\nOPTIONS(\n model_type="LINEAR_REG",\n data_split_method="NO_SPLIT",\n optimize_strategy="normal_equation",\n fit_intercept=True,\n l2_reg=0.0,\n max_iterations=20,\n 
learn_rate_strategy="line_search",\n early_stop=True,\n min_rel_progress=0.01,\n ls_init_learn_rate=0.1,\n calculate_p_values=False,\n enable_global_explain=False,\n INPUT_LABEL_COLS=["input_column_label"])\nAS input_X_y_sql' + 'CREATE OR REPLACE MODEL `test-project`.`_anon123`.`temp_model_id`\nOPTIONS(\n model_type="LINEAR_REG",\n data_split_method="NO_SPLIT",\n optimize_strategy="normal_equation",\n fit_intercept=True,\n l2_reg=0.0,\n max_iterations=20,\n learn_rate_strategy="line_search",\n early_stop=True,\n min_rel_progress=0.01,\n ls_init_learn_rate=0.1,\n calculate_p_values=False,\n enable_global_explain=False,\n INPUT_LABEL_COLS=["input_column_label"])\nAS input_X_y_sql' ) @@ -99,7 +114,7 @@ def test_linear_regression_params_fit(bqml_model_factory, mock_session, mock_X, model.fit(mock_X, mock_y) mock_session._start_query.assert_called_once_with( - 'CREATE TEMP MODEL `temp_model_id`\nOPTIONS(\n model_type="LINEAR_REG",\n data_split_method="NO_SPLIT",\n optimize_strategy="normal_equation",\n fit_intercept=False,\n l2_reg=0.0,\n max_iterations=20,\n learn_rate_strategy="line_search",\n early_stop=True,\n min_rel_progress=0.01,\n ls_init_learn_rate=0.1,\n calculate_p_values=False,\n enable_global_explain=False,\n INPUT_LABEL_COLS=["input_column_label"])\nAS input_X_y_sql' + 'CREATE OR REPLACE MODEL `test-project`.`_anon123`.`temp_model_id`\nOPTIONS(\n model_type="LINEAR_REG",\n data_split_method="NO_SPLIT",\n optimize_strategy="normal_equation",\n fit_intercept=False,\n l2_reg=0.0,\n max_iterations=20,\n learn_rate_strategy="line_search",\n early_stop=True,\n min_rel_progress=0.01,\n ls_init_learn_rate=0.1,\n calculate_p_values=False,\n enable_global_explain=False,\n INPUT_LABEL_COLS=["input_column_label"])\nAS input_X_y_sql' ) @@ -132,7 +147,7 @@ def test_logistic_regression_default_fit( model.fit(mock_X, mock_y) mock_session._start_query.assert_called_once_with( - 'CREATE TEMP MODEL `temp_model_id`\nOPTIONS(\n model_type="LOGISTIC_REG",\n data_split_method="NO_SPLIT",\n fit_intercept=True,\n auto_class_weights=False,\n INPUT_LABEL_COLS=["input_column_label"])\nAS input_X_y_sql' + 'CREATE OR REPLACE MODEL `test-project`.`_anon123`.`temp_model_id`\nOPTIONS(\n model_type="LOGISTIC_REG",\n data_split_method="NO_SPLIT",\n fit_intercept=True,\n auto_class_weights=False,\n INPUT_LABEL_COLS=["input_column_label"])\nAS input_X_y_sql' ) @@ -146,7 +161,7 @@ def test_logistic_regression_params_fit( model.fit(mock_X, mock_y) mock_session._start_query.assert_called_once_with( - 'CREATE TEMP MODEL `temp_model_id`\nOPTIONS(\n model_type="LOGISTIC_REG",\n data_split_method="NO_SPLIT",\n fit_intercept=False,\n auto_class_weights=True,\n INPUT_LABEL_COLS=["input_column_label"])\nAS input_X_y_sql' + 'CREATE OR REPLACE MODEL `test-project`.`_anon123`.`temp_model_id`\nOPTIONS(\n model_type="LOGISTIC_REG",\n data_split_method="NO_SPLIT",\n fit_intercept=False,\n auto_class_weights=True,\n INPUT_LABEL_COLS=["input_column_label"])\nAS input_X_y_sql' ) diff --git a/tests/unit/ml/test_sql.py b/tests/unit/ml/test_sql.py index 34a02edd42..ea16722393 100644 --- a/tests/unit/ml/test_sql.py +++ b/tests/unit/ml/test_sql.py @@ -14,6 +14,7 @@ from unittest import mock +import google.cloud.bigquery as bigquery import pytest import bigframes.ml.sql as ml_sql @@ -27,7 +28,7 @@ def base_sql_generator() -> ml_sql.BaseSqlGenerator: @pytest.fixture(scope="session") def model_creation_sql_generator() -> ml_sql.ModelCreationSqlGenerator: - return ml_sql.ModelCreationSqlGenerator(model_id="my_model_id") + return 
ml_sql.ModelCreationSqlGenerator() @pytest.fixture(scope="session") @@ -126,11 +127,14 @@ def test_create_model_produces_correct_sql( ): sql = model_creation_sql_generator.create_model( source_df=mock_df, + model_ref=bigquery.ModelReference.from_string( + "test-proj._anonXYZ.create_model_correct_sql" + ), options={"option_key1": "option_value1", "option_key2": 2}, ) assert ( sql - == """CREATE TEMP MODEL `my_model_id` + == """CREATE OR REPLACE MODEL `test-proj`.`_anonXYZ`.`create_model_correct_sql` OPTIONS( option_key1="option_value1", option_key2=2) @@ -144,6 +148,9 @@ def test_create_model_transform_produces_correct_sql( ): sql = model_creation_sql_generator.create_model( source_df=mock_df, + model_ref=bigquery.ModelReference.from_string( + "test-proj._anonXYZ.create_model_transform" + ), options={"option_key1": "option_value1", "option_key2": 2}, transforms=[ "ML.STANDARD_SCALER(col_a) OVER(col_a) AS scaled_col_a", @@ -152,7 +159,7 @@ def test_create_model_transform_produces_correct_sql( ) assert ( sql - == """CREATE TEMP MODEL `my_model_id` + == """CREATE OR REPLACE MODEL `test-proj`.`_anonXYZ`.`create_model_transform` TRANSFORM( ML.STANDARD_SCALER(col_a) OVER(col_a) AS scaled_col_a, ML.ONE_HOT_ENCODER(col_b) OVER(col_b) AS encoded_col_b) @@ -168,11 +175,14 @@ def test_create_remote_model_produces_correct_sql( ): sql = model_creation_sql_generator.create_remote_model( connection_name="my_project.us.my_connection", + model_ref=bigquery.ModelReference.from_string( + "test-proj._anonXYZ.create_remote_model" + ), options={"option_key1": "option_value1", "option_key2": 2}, ) assert ( sql - == """CREATE TEMP MODEL `my_model_id` + == """CREATE OR REPLACE MODEL `test-proj`.`_anonXYZ`.`create_remote_model` REMOTE WITH CONNECTION `my_project.us.my_connection` OPTIONS( option_key1="option_value1", @@ -184,11 +194,14 @@ def test_create_imported_model_produces_correct_sql( model_creation_sql_generator: ml_sql.ModelCreationSqlGenerator, ): sql = model_creation_sql_generator.create_imported_model( + model_ref=bigquery.ModelReference.from_string( + "test-proj._anonXYZ.create_imported_model" + ), options={"option_key1": "option_value1", "option_key2": 2}, ) assert ( sql - == """CREATE TEMP MODEL `my_model_id` + == """CREATE OR REPLACE MODEL `test-proj`.`_anonXYZ`.`create_imported_model` OPTIONS( option_key1="option_value1", option_key2=2)""" diff --git a/tests/unit/resources.py b/tests/unit/resources.py index 8fc8acd175..8ba321d122 100644 --- a/tests/unit/resources.py +++ b/tests/unit/resources.py @@ -66,7 +66,6 @@ def create_bigquery_session( credentials=credentials, location="test-region" ) session = bigframes.Session(context=bqoptions, clients_provider=clients_provider) - session._session_id = session_id return session diff --git a/tests/unit/session/test_io_bigquery.py b/tests/unit/session/test_io_bigquery.py index 03470208e4..e1481d3f05 100644 --- a/tests/unit/session/test_io_bigquery.py +++ b/tests/unit/session/test_io_bigquery.py @@ -19,7 +19,113 @@ import google.cloud.bigquery as bigquery import pytest -import bigframes.session._io.bigquery +import bigframes +from bigframes.core import log_adapter +import bigframes.pandas as bpd +import bigframes.session._io.bigquery as io_bq + + +def test_create_job_configs_labels_is_none(): + api_methods = ["agg", "series-mode"] + labels = io_bq.create_job_configs_labels( + job_configs_labels=None, api_methods=api_methods + ) + expected_dict = { + "recent-bigframes-api-0": "agg", + "recent-bigframes-api-1": "series-mode", + } + assert labels is not None + 
assert labels == expected_dict + + +def test_create_job_configs_labels_length_limit_not_met(): + cur_labels = { + "bigframes-api": "read_pandas", + "source": "bigquery-dataframes-temp", + } + api_methods = ["agg", "series-mode"] + labels = io_bq.create_job_configs_labels( + job_configs_labels=cur_labels, api_methods=api_methods + ) + expected_dict = { + "bigframes-api": "read_pandas", + "source": "bigquery-dataframes-temp", + "recent-bigframes-api-0": "agg", + "recent-bigframes-api-1": "series-mode", + } + assert labels is not None + assert len(labels) == 4 + assert labels == expected_dict + + +def test_create_job_configs_labels_log_adaptor_call_method_under_length_limit(): + cur_labels = { + "bigframes-api": "read_pandas", + "source": "bigquery-dataframes-temp", + } + df = bpd.DataFrame({"col1": [1, 2], "col2": [3, 4]}) + # Test running two methods + df.head() + df.max() + api_methods = log_adapter._api_methods + + labels = io_bq.create_job_configs_labels( + job_configs_labels=cur_labels, api_methods=api_methods + ) + expected_dict = { + "bigframes-api": "read_pandas", + "source": "bigquery-dataframes-temp", + "recent-bigframes-api-0": "__init__", + "recent-bigframes-api-1": "max", + "recent-bigframes-api-2": "__init__", + "recent-bigframes-api-3": "head", + "recent-bigframes-api-4": "__init__", + } + assert labels is not None + assert len(labels) == 7 + assert labels == expected_dict + + +def test_create_job_configs_labels_length_limit_met_and_labels_is_none(): + df = bpd.DataFrame({"col1": [1, 2], "col2": [3, 4]}) + # Test running methods more than the labels' length limit + for i in range(66): + df.head() + api_methods = log_adapter._api_methods + + labels = io_bq.create_job_configs_labels( + job_configs_labels=None, api_methods=api_methods + ) + assert labels is not None + assert len(labels) == 64 + assert "head" in labels.values() + + +def test_create_job_configs_labels_length_limit_met(): + cur_labels = { + "bigframes-api": "read_pandas", + "source": "bigquery-dataframes-temp", + } + for i in range(60): + key = f"bigframes-api-test-{i}" + value = f"test{i}" + cur_labels[key] = value + # If cur_labels length is 62, we can only add one label from api_methods + df = bpd.DataFrame({"col1": [1, 2], "col2": [3, 4]}) + # Test running two methods + df.head() + df.max() + api_methods = log_adapter._api_methods + + labels = io_bq.create_job_configs_labels( + job_configs_labels=cur_labels, api_methods=api_methods + ) + assert labels is not None + assert len(labels) == 64 + assert "max" in labels.values() + assert "head" not in labels.values() + assert "bigframes-api" in labels.keys() + assert "source" in labels.keys() def test_create_snapshot_sql_doesnt_timetravel_anonymous_datasets(): @@ -125,5 +231,5 @@ def test_create_temp_table_default_expiration(): ), ) def test_bq_schema_to_sql(schema: Iterable[bigquery.SchemaField], expected: str): - sql = bigframes.session._io.bigquery.bq_schema_to_sql(schema) + sql = io_bq.bq_schema_to_sql(schema) assert sql == expected diff --git a/tests/unit/test_compute_options.py b/tests/unit/test_compute_options.py index 499a0a5fef..a613bca7b9 100644 --- a/tests/unit/test_compute_options.py +++ b/tests/unit/test_compute_options.py @@ -18,13 +18,9 @@ def test_maximum_bytes_option(): session = resources.create_bigquery_session() - num_query_calls = 0 with bf.option_context("compute.maximum_bytes_billed", 10000): - # clear initial method calls - session.bqclient.method_calls = [] + session.bqclient.query.reset_mock() session._start_query("query") - for call in 
session.bqclient.method_calls: - _, _, kwargs = call - num_query_calls += 1 - assert kwargs["job_config"].maximum_bytes_billed == 10000 - assert num_query_calls > 0 + call = session.bqclient.query.call_args + assert call.kwargs["job_config"].maximum_bytes_billed == 10000 + session.bqclient.query.assert_called_once() diff --git a/tests/unit/test_core.py b/tests/unit/test_core.py index d9672b2635..623448b3aa 100644 --- a/tests/unit/test_core.py +++ b/tests/unit/test_core.py @@ -49,7 +49,7 @@ def test_arrayvalue_constructor_from_ibis_table_adds_all_columns(): ordering=ordering, hidden_ordering_columns=(), ) - assert actual.compile()._table is ibis_table + assert actual._compile_ordered()._table is ibis_table assert len(actual.column_ids) == 3 @@ -83,7 +83,7 @@ def test_arrayvalue_with_get_column(): ), total_ordering_columns=["col1"], ) - col1 = value.compile()._get_ibis_column("col1") + col1 = value._compile_ordered()._get_ibis_column("col1") assert isinstance(col1, ibis_types.Value) assert col1.get_name() == "col1" assert col1.type().is_int64() @@ -100,7 +100,7 @@ def test_arrayvalues_to_ibis_expr_with_get_column(): ), total_ordering_columns=["col1"], ) - expr = value.compile()._get_ibis_column("col1") + expr = value._compile_ordered()._get_ibis_column("col1") assert expr.get_name() == "col1" assert expr.type().is_int64() @@ -117,7 +117,7 @@ def test_arrayvalues_to_ibis_expr_with_concat(): total_ordering_columns=["col1"], ) expr = value.concat([value]) - actual = expr.compile()._to_ibis_expr("unordered") + actual = expr._compile_ordered()._to_ibis_expr(ordering_mode="unordered") assert len(actual.columns) == 3 # TODO(ashleyxu, b/299631930): test out the union expression assert actual.columns[0] == "column_0" @@ -136,8 +136,8 @@ def test_arrayvalues_to_ibis_expr_with_project_unary_op(): ), total_ordering_columns=["col1"], ) - expr = value.project_unary_op("col1", ops.AsTypeOp("string")).compile() - assert value.compile().columns[0].type().is_int64() + expr = value.project_unary_op("col1", ops.AsTypeOp("string"))._compile_ordered() + assert value._compile_ordered().columns[0].type().is_int64() assert expr.columns[0].type().is_string() @@ -152,9 +152,11 @@ def test_arrayvalues_to_ibis_expr_with_project_binary_op(): ), total_ordering_columns=["col1"], ) - expr = value.project_binary_op("col2", "col3", ops.add_op, "col4").compile() + expr = value.project_binary_op( + "col2", "col3", ops.add_op, "col4" + )._compile_ordered() assert expr.columns[3].type().is_float64() - actual = expr._to_ibis_expr("unordered") + actual = expr._to_ibis_expr(ordering_mode="unordered") assert len(expr.columns) == 4 assert actual.columns[3] == "col4" @@ -173,9 +175,9 @@ def test_arrayvalues_to_ibis_expr_with_project_ternary_op(): ) expr = value.project_ternary_op( "col2", "col3", "col4", ops.where_op, "col5" - ).compile() + )._compile_ordered() assert expr.columns[4].type().is_float64() - actual = expr._to_ibis_expr("unordered") + actual = expr._to_ibis_expr(ordering_mode="unordered") assert len(expr.columns) == 5 assert actual.columns[4] == "col5" @@ -195,8 +197,8 @@ def test_arrayvalue_to_ibis_expr_with_aggregate(): aggregations=(("col1", agg_ops.sum_op, "col4"),), by_column_ids=["col1"], dropna=False, - ).compile() - actual = expr._to_ibis_expr("unordered") + )._compile_ordered() + actual = expr._to_ibis_expr(ordering_mode="unordered") assert len(expr.columns) == 2 assert actual.columns[0] == "col1" assert actual.columns[1] == "col4" @@ -214,8 +216,10 @@ def test_arrayvalue_to_ibis_expr_with_corr_aggregate(): ), 
total_ordering_columns=["col1"], ) - expr = value.corr_aggregate(corr_aggregations=[("col1", "col3", "col4")]).compile() - actual = expr._to_ibis_expr("unordered") + expr = value.corr_aggregate( + corr_aggregations=[("col1", "col3", "col4")] + )._compile_ordered() + actual = expr._to_ibis_expr(ordering_mode="unordered") assert len(expr.columns) == 1 assert actual.columns[0] == "col4" assert expr.columns[0].type().is_float64() diff --git a/tests/unit/test_pandas.py b/tests/unit/test_pandas.py index 70c5441c68..4835a24dc7 100644 --- a/tests/unit/test_pandas.py +++ b/tests/unit/test_pandas.py @@ -17,8 +17,6 @@ import sys import unittest.mock as mock -import google.api_core.exceptions -import google.cloud.bigquery import pandas as pd import pytest @@ -26,8 +24,6 @@ import bigframes.pandas as bpd import bigframes.session -from . import resources - leading_whitespace = re.compile(r"^\s+", flags=re.MULTILINE) @@ -114,37 +110,3 @@ def test_pandas_attribute(): assert bpd.Int64Dtype is pd.Int64Dtype assert bpd.StringDtype is pd.StringDtype assert bpd.ArrowDtype is pd.ArrowDtype - - -def test_close_session_after_bq_session_ended(monkeypatch: pytest.MonkeyPatch): - bqclient = mock.create_autospec(google.cloud.bigquery.Client, instance=True) - bqclient.project = "test-project" - session = resources.create_bigquery_session( - bqclient=bqclient, session_id="JUST_A_TEST" - ) - - # Simulate that the session has already expired. - # Note: this needs to be done after the Session is constructed, as the - # initializer sends a query to start the BigQuery Session. - query_job = mock.create_autospec(google.cloud.bigquery.QueryJob, instance=True) - query_job.result.side_effect = google.api_core.exceptions.BadRequest( - "Session JUST_A_TEST has expired and is no longer available." - ) - bqclient.query.return_value = query_job - - # Simulate that the session has already started. - monkeypatch.setattr(bigframes.core.global_session, "_global_session", session) - bpd.options.bigquery._session_started = True - - # Confirm that as a result bigframes.pandas interface is unusable - with pytest.raises( - google.api_core.exceptions.BadRequest, - match="Session JUST_A_TEST has expired and is no longer available.", - ): - bpd.read_gbq("SELECT 'ABC'") - - # Even though the query to stop the session raises an exception, we should - # still be able to close it without raising an error to the user. 
- bpd.close_session() - assert "CALL BQ.ABORT_SESSION('JUST_A_TEST')" in bqclient.query.call_args.args[0] - assert bigframes.core.global_session._global_session is None diff --git a/third_party/bigframes_vendored/ibis/backends/bigquery/registry.py b/third_party/bigframes_vendored/ibis/backends/bigquery/registry.py index a4e61ca0f9..e1b28690d7 100644 --- a/third_party/bigframes_vendored/ibis/backends/bigquery/registry.py +++ b/third_party/bigframes_vendored/ibis/backends/bigquery/registry.py @@ -22,10 +22,16 @@ def _last_non_null_value(translator, op: vendored_ibis_ops.LastNonNullValue): return f"LAST_VALUE({arg} IGNORE NULLS)" +def _to_json_string(translator, op: vendored_ibis_ops.ToJsonString): + arg = translator.translate(op.arg) + return f"TO_JSON_STRING({arg})" + + patched_ops = { - vendored_ibis_ops.ApproximateMultiQuantile: _approx_quantiles, - vendored_ibis_ops.FirstNonNullValue: _first_non_null_value, - vendored_ibis_ops.LastNonNullValue: _last_non_null_value, + vendored_ibis_ops.ApproximateMultiQuantile: _approx_quantiles, # type:ignore + vendored_ibis_ops.FirstNonNullValue: _first_non_null_value, # type:ignore + vendored_ibis_ops.LastNonNullValue: _last_non_null_value, # type:ignore + vendored_ibis_ops.ToJsonString: _to_json_string, # type:ignore } OPERATION_REGISTRY.update(patched_ops) diff --git a/third_party/bigframes_vendored/ibis/expr/operations/__init__.py b/third_party/bigframes_vendored/ibis/expr/operations/__init__.py index 1612d9c12e..8219701392 100644 --- a/third_party/bigframes_vendored/ibis/expr/operations/__init__.py +++ b/third_party/bigframes_vendored/ibis/expr/operations/__init__.py @@ -1,5 +1,6 @@ # Contains code from https://github.com/ibis-project/ibis/blob/master/ibis/expr/operations/__init__.py from __future__ import annotations -from third_party.bigframes_vendored.ibis.expr.operations.analytic import * # noqa: F403 -from third_party.bigframes_vendored.ibis.expr.operations.reductions import * # noqa: F403 +from third_party.bigframes_vendored.ibis.expr.operations.analytic import * # noqa: F401 F403 +from third_party.bigframes_vendored.ibis.expr.operations.json import * # noqa: F401 F403 +from third_party.bigframes_vendored.ibis.expr.operations.reductions import * # noqa: F401 F403 diff --git a/third_party/bigframes_vendored/ibis/expr/operations/json.py b/third_party/bigframes_vendored/ibis/expr/operations/json.py new file mode 100644 index 0000000000..dbb3fa3066 --- /dev/null +++ b/third_party/bigframes_vendored/ibis/expr/operations/json.py @@ -0,0 +1,9 @@ +# Contains code from https://github.com/ibis-project/ibis/blob/master/ibis/expr/operations/json.py +from __future__ import annotations + +import ibis.expr.datatypes as dt +from ibis.expr.operations.core import Unary + + +class ToJsonString(Unary): + output_dtype = dt.string diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index 6f4f6be35d..b35d0f3b2e 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -2159,8 +2159,68 @@ def map(self, func, na_action: Optional[str] = None) -> DataFrame: In pandas 2.1.0, DataFrame.applymap is deprecated and renamed to DataFrame.map. 
+ **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + Let's use ``reuse=False`` flag to make sure a new ``remote_function`` + is created every time we run the following code, but you can skip it + to potentially reuse a previously deployed ``remote_function`` from + the same user defined function. + + >>> @bpd.remote_function([int], float, reuse=False) + ... def minutes_to_hours(x): + ... return x/60 + + >>> df_minutes = bpd.DataFrame( + ... {"system_minutes" : [0, 30, 60, 90, 120], + ... "user_minutes" : [0, 15, 75, 90, 6]}) + >>> df_minutes + system_minutes user_minutes + 0 0 0 + 1 30 15 + 2 60 75 + 3 90 90 + 4 120 6 + + [5 rows x 2 columns] + + >>> df_hours = df_minutes.map(minutes_to_hours) + >>> df_hours + system_minutes user_minutes + 0 0.0 0.0 + 1 0.5 0.25 + 2 1.0 1.25 + 3 1.5 1.5 + 4 2.0 0.1 + + [5 rows x 2 columns] + + If there are ``NA``/``None`` values in the data, you can ignore + applying the remote function on such values by specifying + ``na_action='ignore'``. + + >>> df_minutes = bpd.DataFrame( + ... { + ... "system_minutes" : [0, 30, 60, None, 90, 120, bpd.NA], + ... "user_minutes" : [0, 15, 75, 90, 6, None, bpd.NA] + ... }, dtype="Int64") + >>> df_hours = df_minutes.map(minutes_to_hours, na_action='ignore') + >>> df_hours + system_minutes user_minutes + 0 0.0 0.0 + 1 0.5 0.25 + 2 1.0 1.25 + 3 1.5 + 4 1.5 0.1 + 5 2.0 + 6 + + [7 rows x 2 columns] + Args: - func: + func (function): Python function wrapped by ``remote_function`` decorator, returns a single value from a single value. na_action (Optional[str], default None): @@ -2194,6 +2254,8 @@ def join(self, other, *, on: Optional[str] = None, how: str) -> DataFrame: and sort it lexicographically. ``inner``: form intersection of calling frame's index (or column if on is specified) with `other`'s index, preserving the order of the calling's one. + ``cross``: creates the cartesian product from both frames, preserves + the order of the left keys. Returns: bigframes.dataframe.DataFrame: A dataframe containing columns from both the caller and `other`. @@ -2208,6 +2270,7 @@ def merge( "left", "outer", "right", + "cross", ] = "inner", on: Optional[str] = None, *, @@ -2243,6 +2306,8 @@ def merge( join; sort keys lexicographically. ``inner``: use intersection of keys from both frames, similar to a SQL inner join; preserve the order of the left keys. + ``cross``: creates the cartesian product from both frames, preserves the order + of the left keys. on (label or list of labels): Columns to join on. It must be found in both DataFrames. Either on or left_on + right_on @@ -2867,17 +2932,6 @@ def interpolate(self, method: str = "linear"): """ Fill NaN values using an interpolation method. - Args: - method (str, default 'linear'): - Interpolation technique to use. Only 'linear' supported. - 'linear': Ignore the index and treat the values as equally spaced. - This is the only method supported on MultiIndexes. - - Returns: - DataFrame: - Returns the same object type as the caller, interpolated at - some or all ``NaN`` values - **Examples:** >>> import bigframes.pandas as bpd @@ -2886,17 +2940,41 @@ def interpolate(self, method: str = "linear"): >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3, None, None, 6], ... 'B': [None, 6, None, 2, None, 3], - ... }) + ... 
}, index=[0, 0.1, 0.3, 0.7, 0.9, 1.0]) >>> df.interpolate() - A B - 0 1.0 - 1 2.0 6.0 - 2 3.0 4.0 - 3 4.0 2.0 - 4 5.0 2.5 - 5 6.0 3.0 + A B + 0.0 1.0 + 0.1 2.0 6.0 + 0.3 3.0 4.0 + 0.7 4.0 2.0 + 0.9 5.0 2.5 + 1.0 6.0 3.0 [6 rows x 2 columns] + >>> df.interpolate(method="values") + A B + 0.0 1.0 + 0.1 2.0 6.0 + 0.3 3.0 4.666667 + 0.7 4.714286 2.0 + 0.9 5.571429 2.666667 + 1.0 6.0 3.0 + + [6 rows x 2 columns] + + Args: + method (str, default 'linear'): + Interpolation technique to use. Only 'linear' supported. + 'linear': Ignore the index and treat the values as equally spaced. + This is the only method supported on MultiIndexes. + 'index', 'values': use the actual numerical values of the index. + 'pad': Fill in NaNs using existing values. + 'nearest', 'zero', 'slinear': Emulates `scipy.interpolate.interp1d` + + Returns: + DataFrame: + Returns the same object type as the caller, interpolated at + some or all ``NaN`` values """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) diff --git a/third_party/bigframes_vendored/pandas/core/reshape/merge.py b/third_party/bigframes_vendored/pandas/core/reshape/merge.py index b03f366fca..704e50f516 100644 --- a/third_party/bigframes_vendored/pandas/core/reshape/merge.py +++ b/third_party/bigframes_vendored/pandas/core/reshape/merge.py @@ -49,6 +49,8 @@ def merge( join; sort keys lexicographically. ``inner``: use intersection of keys from both frames, similar to a SQL inner join; preserve the order of the left keys. + ``cross``: creates the cartesian product from both frames, preserves the order + of the left keys. on (label or list of labels): Columns to join on. It must be found in both DataFrames. Either on or left_on + right_on diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index b569e5699c..c6d98075f5 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -728,18 +728,74 @@ def apply( func, ) -> DataFrame | Series: """ - Invoke function on values of Series. + Invoke function on values of a Series. - Can be ufunc (a NumPy function that applies to the entire Series) - or a Python function that only works on single values. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + Let's use ``reuse=False`` flag to make sure a new ``remote_function`` + is created every time we run the following code, but you can skip it + to potentially reuse a previously deployed ``remote_function`` from + the same user defined function. + + >>> @bpd.remote_function([int], float, reuse=False) + ... def minutes_to_hours(x): + ... return x/60 + + >>> minutes = bpd.Series([0, 30, 60, 90, 120]) + >>> minutes + 0 0 + 1 30 + 2 60 + 3 90 + 4 120 + dtype: Int64 + + >>> hours = minutes.apply(minutes_to_hours) + >>> hours + 0 0.0 + 1 0.5 + 2 1.0 + 3 1.5 + 4 2.0 + dtype: Float64 + + You could turn a user defined function with external package + dependencies into a BigQuery DataFrames remote function. You would + provide the names of the packages via ``packages`` param. + + >>> @bpd.remote_function( + ... [str], + ... str, + ... reuse=False, + ... packages=["cryptography"], + ... ) + ... def get_hash(input): + ... from cryptography.fernet import Fernet + ... + ... # handle missing value + ... if input is None: + ... input = "" + ... + ... key = Fernet.generate_key() + ... f = Fernet(key) + ... 
return f.encrypt(input.encode()).decode() + + >>> names = bpd.Series(["Alice", "Bob"]) + >>> hashes = names.apply(get_hash) Args: func (function): - Python function or NumPy ufunc to apply. + BigFrames DataFrames ``remote_function`` to apply. The function + should take a scalar and return a scalar. It will be applied to + every element in the ``Series``. Returns: - bigframes.series.Series: If func returns a Series object the result - will be a DataFrame. + bigframes.series.Series: A new Series with values representing the + return value of the ``func`` applied to each element of the original + Series. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -920,31 +976,49 @@ def interpolate(self, method: str = "linear"): """ Fill NaN values using an interpolation method. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({ + ... 'A': [1, 2, 3, None, None, 6], + ... 'B': [None, 6, None, 2, None, 3], + ... }, index=[0, 0.1, 0.3, 0.7, 0.9, 1.0]) + >>> df.interpolate() + A B + 0.0 1.0 + 0.1 2.0 6.0 + 0.3 3.0 4.0 + 0.7 4.0 2.0 + 0.9 5.0 2.5 + 1.0 6.0 3.0 + + [6 rows x 2 columns] + >>> df.interpolate(method="values") + A B + 0.0 1.0 + 0.1 2.0 6.0 + 0.3 3.0 4.666667 + 0.7 4.714286 2.0 + 0.9 5.571429 2.666667 + 1.0 6.0 3.0 + + [6 rows x 2 columns] + + Args: method (str, default 'linear'): Interpolation technique to use. Only 'linear' supported. 'linear': Ignore the index and treat the values as equally spaced. This is the only method supported on MultiIndexes. - + 'index', 'values': use the actual numerical values of the index. + 'pad': Fill in NaNs using existing values. + 'nearest', 'zero', 'slinear': Emulates `scipy.interpolate.interp1d` Returns: Series: Returns the same object type as the caller, interpolated at some or all ``NaN`` values - - **Examples:** - - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - - >>> series = bpd.Series([1, 2, 3, None, None, 6]) - >>> series.interpolate() - 0 1.0 - 1 2.0 - 2 3.0 - 3 4.0 - 4 5.0 - 5 6.0 - dtype: Float64 """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) diff --git a/third_party/bigframes_vendored/pandas/io/gbq.py b/third_party/bigframes_vendored/pandas/io/gbq.py index 575c501618..2161310b07 100644 --- a/third_party/bigframes_vendored/pandas/io/gbq.py +++ b/third_party/bigframes_vendored/pandas/io/gbq.py @@ -45,16 +45,6 @@ def read_gbq( If the input is a table ID: >>> df = bpd.read_gbq("bigquery-public-data.ml_datasets.penguins") - >>> df.head(2) - species island culmen_length_mm \\ - 0 Adelie Penguin (Pygoscelis adeliae) Dream 36.6 - 1 Adelie Penguin (Pygoscelis adeliae) Dream 39.8 - - culmen_depth_mm flipper_length_mm body_mass_g sex - 0 18.4 184.0 3475.0 FEMALE - 1 19.1 184.0 4650.0 MALE - - [2 rows x 7 columns] Preserve ordering in a query input. 
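The ``how="cross"`` option documented above for ``DataFrame.join``, ``DataFrame.merge`` and the top-level ``merge`` is described but never demonstrated in the touched docstrings. Below is a minimal sketch in the same doctest style used elsewhere in this change; the two frames are invented for illustration, and it assumes that, as in pandas, a cross merge is requested without any join keys:

>>> import bigframes.pandas as bpd
>>> bpd.options.display.progress_bar = None
>>> colors = bpd.DataFrame({"color": ["red", "green", "blue"]})
>>> sizes = bpd.DataFrame({"size": ["S", "M"]})
>>> # Every row of `colors` is paired with every row of `sizes` (6 rows total);
>>> # the order of the left keys is preserved, as the docstrings above note.
>>> combos = colors.merge(sizes, how="cross")

Because no keys are matched, ``on``, ``left_on`` and ``right_on`` are left unset; the result simply grows to ``len(colors) * len(sizes)`` rows.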
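The ``registry.py`` and ``operations/json.py`` hunks earlier in this diff follow a small, reusable pattern: declare a vendored ibis operation with an output dtype, write a translator that renders the corresponding BigQuery SQL, and register the pair in ``OPERATION_REGISTRY``. The sketch below adds another scalar function the same way. It is hypothetical: ``ToHexString``, ``_to_hex_string`` and the ``OPERATION_REGISTRY`` import path are assumptions made for illustration (the vendored module's own import of the registry is outside the hunk shown), not part of this change.

# Hypothetical op mirroring the ToJsonString wiring above.
import ibis.expr.datatypes as dt
from ibis.backends.bigquery.registry import OPERATION_REGISTRY  # assumed import path
from ibis.expr.operations.core import Unary


class ToHexString(Unary):
    """BYTES -> hex STRING, intended to compile to BigQuery's TO_HEX()."""

    output_dtype = dt.string


def _to_hex_string(translator, op: ToHexString):
    # Translate the single argument, then wrap it in the SQL function call,
    # exactly as _to_json_string does for TO_JSON_STRING.
    arg = translator.translate(op.arg)
    return f"TO_HEX({arg})"


# Same registration mechanism as patched_ops above.
OPERATION_REGISTRY.update({ToHexString: _to_hex_string})  # type: ignore

Once registered, expressions built from the new op compile through the stock BigQuery translator with no further changes, which is why the ToJsonString support above amounts to little more than an op class, a translator, and a registry entry (plus the re-export in operations/__init__.py).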
diff --git a/third_party/bigframes_vendored/pandas/io/parquet.py b/third_party/bigframes_vendored/pandas/io/parquet.py index f97bd386a4..0f664e70fc 100644 --- a/third_party/bigframes_vendored/pandas/io/parquet.py +++ b/third_party/bigframes_vendored/pandas/io/parquet.py @@ -24,12 +24,6 @@ def read_parquet( >>> gcs_path = "gs://cloud-samples-data/bigquery/us-states/us-states.parquet" >>> df = bpd.read_parquet(path=gcs_path) - >>> df.head(2) - name post_abbr - 0 Alabama AL - 1 Alaska AK - - [2 rows x 2 columns] Args: path (str): diff --git a/third_party/bigframes_vendored/pandas/io/pickle.py b/third_party/bigframes_vendored/pandas/io/pickle.py index 053ba4871c..096d9b13d6 100644 --- a/third_party/bigframes_vendored/pandas/io/pickle.py +++ b/third_party/bigframes_vendored/pandas/io/pickle.py @@ -32,16 +32,6 @@ def read_pickle( >>> gcs_path = "gs://bigframes-dev-testing/test_pickle.pkl" >>> df = bpd.read_pickle(filepath_or_buffer=gcs_path) - >>> df.head(2) - species island culmen_length_mm \\ - 0 Adelie Penguin (Pygoscelis adeliae) Dream 36.6 - 1 Adelie Penguin (Pygoscelis adeliae) Dream 39.8 - - culmen_depth_mm flipper_length_mm body_mass_g sex - 0 18.4 184.0 3475.0 FEMALE - 1 19.1 184.0 4650.0 MALE - - [2 rows x 7 columns] Args: filepath_or_buffer (str, path object, or file-like object):