Skip to content

Commit d8baad5

Browse files
milkshakeiiiHenry J Solberg
and
Henry J Solberg
authored
feat: add pd.get_dummies (#149)
* feat: add pd.get_dummies * remove unneeded prefix case * param/documentation fixes * be stricter about types in test * be stricter about types in series test * remove unneeded comment * adjust for type difference in pandas 1 * add example code (tested) * fix None columns and add test cases * variable names and _get_unique_values per-column * account for pandas 1 behavior difference * remove already_seen set * avoid unnecessary join/projection * fix column ordering edge case * adjust for picky examples checker * example tweak * make part of the example comments * use ellipsis in doctest comment * add <BLANKLINES> to doctest string * extract parameter standardization * extract submethods --------- Co-authored-by: Henry J Solberg <[email protected]>
1 parent 0801d96 commit d8baad5

File tree

6 files changed

+410
-3
lines changed

6 files changed

+410
-3
lines changed

bigframes/pandas/__init__.py

+177
Original file line numberDiff line numberDiff line change
@@ -45,14 +45,18 @@
4545
)
4646

4747
import bigframes._config as config
48+
import bigframes.constants as constants
49+
import bigframes.core.blocks
4850
import bigframes.core.global_session as global_session
4951
import bigframes.core.indexes
5052
import bigframes.core.reshape
5153
import bigframes.dataframe
54+
import bigframes.operations as ops
5255
import bigframes.series
5356
import bigframes.session
5457
import bigframes.session.clients
5558
import third_party.bigframes_vendored.pandas.core.reshape.concat as vendored_pandas_concat
59+
import third_party.bigframes_vendored.pandas.core.reshape.encoding as vendored_pandas_encoding
5660
import third_party.bigframes_vendored.pandas.core.reshape.merge as vendored_pandas_merge
5761
import third_party.bigframes_vendored.pandas.core.reshape.tile as vendored_pandas_tile
5862

@@ -134,6 +138,179 @@ def cut(
134138
cut.__doc__ = vendored_pandas_tile.cut.__doc__
135139

136140

141+
def get_dummies(
    data: Union[DataFrame, Series],
    prefix: Union[List, dict, str, None] = None,
    prefix_sep: Union[List, dict, str, None] = "_",
    dummy_na: bool = False,
    columns: Optional[List] = None,
    drop_first: bool = False,
    dtype: Any = None,
) -> DataFrame:
    # Convert categorical columns of `data` into one dummy (indicator) column per
    # distinct value. Public docstring is attached below from the vendored pandas
    # `get_dummies` docs, so no docstring is written here.

    # simplify input parameters into per-input-label lists
    # also raise errors for invalid parameters
    column_labels, prefixes, prefix_seps = _standardize_get_dummies_params(
        data, prefix, prefix_sep, columns, dtype
    )

    # combine prefixes into per-column-id list
    full_columns_prefixes, columns_ids = _determine_get_dummies_columns_from_labels(
        data, column_labels, prefix is not None, prefixes, prefix_seps
    )

    # run queries to compute unique values
    block = data._block
    # Budget for distinct values per encoded column: BigQuery's column-count cap
    # minus the columns the block already uses (values + index + 1), divided
    # evenly across the labels being encoded.
    max_unique_value = (
        bigframes.core.blocks._BQ_MAX_COLUMNS
        - len(block.value_columns)
        - len(block.index_columns)
        - 1
    ) // len(column_labels)
    columns_values = [
        block._get_unique_values([col_id], max_unique_value) for col_id in columns_ids
    ]

    # for each dummified column, add the content of the output columns via block operations
    intermediate_col_ids = []
    for i in range(len(columns_values)):
        # NaN is excluded here; `dummy_na` is handled separately inside
        # _perform_get_dummies_block_operations.
        level = columns_values[i].get_level_values(0).sort_values().dropna()
        if drop_first:
            # drop_first: skip the first (sorted) category, like pandas.
            level = level[1:]
        column_label = full_columns_prefixes[i]
        column_id = columns_ids[i]
        block, new_intermediate_col_ids = _perform_get_dummies_block_operations(
            block, level, column_label, column_id, dummy_na
        )
        intermediate_col_ids.extend(new_intermediate_col_ids)

    # drop dummified columns (and the intermediate columns we added)
    block = block.drop_columns(columns_ids + intermediate_col_ids)
    return DataFrame(block)


get_dummies.__doc__ = vendored_pandas_encoding.get_dummies.__doc__
192+
193+
194+
def _standardize_get_dummies_params(
    data: Union[DataFrame, Series],
    prefix: Union[List, dict, str, None],
    prefix_sep: Union[List, dict, str, None],
    columns: Optional[List],
    dtype: Any,
) -> Tuple[List, List[str], List[str]]:
    """Normalize `get_dummies` keyword arguments into parallel per-label lists.

    Returns a tuple ``(column_labels, prefixes, prefix_seps)`` where the three
    lists are index-aligned: element ``i`` of each describes the i-th column
    label to be encoded. Raises TypeError/ValueError/NotImplementedError for
    invalid parameter combinations.
    """
    block = data._block

    # A Series is treated as a one-column frame: encode its single column.
    if isinstance(data, Series):
        columns = [block.column_labels[0]]
    if columns is not None and not pandas.api.types.is_list_like(columns):
        raise TypeError("Input must be a list-like for parameter `columns`")
    # Only boolean-ish dtypes are supported for the output columns so far.
    if dtype is not None and dtype not in [
        pandas.BooleanDtype,
        bool,
        "Boolean",
        "boolean",
        "bool",
    ]:
        raise NotImplementedError(
            f"Only Boolean dtype is currently supported. {constants.FEEDBACK_LINK}"
        )

    if columns is None:
        # Default (pandas-compatible): encode all string-typed columns,
        # preserving first-seen order and de-duplicating repeated labels.
        default_dummy_types = [pandas.StringDtype, "string[pyarrow]"]
        columns = []
        columns_set = set()
        for col_id in block.value_columns:
            label = block.col_id_to_label[col_id]
            if (
                label not in columns_set
                and block.expr.get_column_type(col_id) in default_dummy_types
            ):
                columns.append(label)
                columns_set.add(label)

    column_labels: List = typing.cast(List, columns)

    def parse_prefix_kwarg(kwarg, kwarg_name) -> Optional[List[str]]:
        # Expand a str/dict/list `prefix`-style kwarg into a list aligned
        # with column_labels, or return None if the kwarg is None.
        if kwarg is None:
            return None
        if isinstance(kwarg, str):
            return [kwarg] * len(column_labels)
        if isinstance(kwarg, dict):
            # NOTE(review): a label missing from the dict raises KeyError here —
            # presumably matching pandas; confirm against upstream behavior.
            return [kwarg[column] for column in column_labels]
        kwarg = typing.cast(List, kwarg)
        if pandas.api.types.is_list_like(kwarg) and len(kwarg) != len(column_labels):
            raise ValueError(
                f"Length of '{kwarg_name}' ({len(kwarg)}) did not match "
                f"the length of the columns being encoded ({len(column_labels)})."
            )
        if pandas.api.types.is_list_like(kwarg):
            return list(map(str, kwarg))
        raise TypeError(f"{kwarg_name} kwarg must be a string, list, or dictionary")

    # `prefix_sep or "_"` means a falsy separator (e.g. None) falls back to "_".
    prefix_seps = parse_prefix_kwarg(prefix_sep or "_", "prefix_sep")
    prefix_seps = typing.cast(List, prefix_seps)
    prefixes = parse_prefix_kwarg(prefix, "prefix")
    if prefixes is None:
        # No explicit prefix: use the column labels themselves as prefixes.
        prefixes = column_labels
    prefixes = typing.cast(List, prefixes)

    return column_labels, prefixes, prefix_seps
258+
259+
260+
def _determine_get_dummies_columns_from_labels(
    data: Union[DataFrame, Series],
    column_labels: List,
    prefix_given: bool,
    prefixes: List[str],
    prefix_seps: List[str],
) -> Tuple[List[str], List[str]]:
    """Map per-label prefixes onto per-column-id prefixes.

    A label may correspond to several column ids (duplicate labels), so the
    output lists are expanded per column id. Returns
    ``(columns_prefixes, columns_ids)``, index-aligned.
    """
    block = data._block

    columns_ids = []
    columns_prefixes = []
    for i in range(len(column_labels)):
        label = column_labels[i]
        # No prefix for a None label, or for a Series when the caller did not
        # explicitly pass `prefix` (pandas names Series dummies by value only).
        empty_prefix = label is None or (isinstance(data, Series) and not prefix_given)
        full_prefix = "" if empty_prefix else prefixes[i] + prefix_seps[i]

        # One label can map to multiple column ids; all share this prefix.
        for col_id in block.label_to_col_id[label]:
            columns_ids.append(col_id)
            columns_prefixes.append(full_prefix)

    return columns_prefixes, columns_ids
281+
282+
283+
def _perform_get_dummies_block_operations(
    block: bigframes.core.blocks.Block,
    level: pandas.Index,
    column_label: str,
    column_id: str,
    dummy_na: bool,
) -> Tuple[bigframes.core.blocks.Block, List[str]]:
    """Append one boolean dummy column per value in `level` for `column_id`.

    Returns the updated block and the ids of intermediate (pre-fillna)
    columns the caller must drop afterwards.
    """
    intermediate_col_ids = []
    for value in level:
        new_column_label = f"{column_label}{value}"
        if column_label == "":
            # No prefix (e.g. unnamed Series): label is the raw value itself.
            new_column_label = value
        # First compute (col == value); NULLs propagate through eq, hence the
        # fillna(False) pass below to get a clean boolean column.
        new_block, new_id = block.apply_unary_op(
            column_id, ops.BinopPartialLeft(ops.eq_op, value)
        )
        intermediate_col_ids.append(new_id)
        block, _ = new_block.apply_unary_op(
            new_id,
            ops.BinopPartialRight(ops.fillna_op, False),
            result_label=new_column_label,
        )
    if dummy_na:
        # dummy column name for na depends on the dtype
        # (e.g. "NaT" for datetimes, "nan"/"<NA>" for other dtypes)
        na_string = str(pandas.Index([None], dtype=level.dtype)[0])
        new_column_label = f"{column_label}{na_string}"
        block, _ = block.apply_unary_op(
            column_id, ops.isnull_op, result_label=new_column_label
        )
    return block, intermediate_col_ids
312+
313+
137314
def qcut(
138315
x: bigframes.series.Series,
139316
q: int,

tests/system/small/test_pandas.py

+112
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,118 @@ def test_concat_series(scalars_dfs):
4545
pd.testing.assert_series_equal(bf_result, pd_result)
4646

4747

48+
@pytest.mark.parametrize(
    ("kwargs"),
    [
        {
            "prefix": ["prefix1", "prefix2"],
            "prefix_sep": "_",
            "dummy_na": None,
            "columns": ["bool_col", "int64_col"],
            "drop_first": False,
        },
        {
            "prefix": "prefix",
            "prefix_sep": ["_", ","],
            "dummy_na": False,
            "columns": ["int64_too", "string_col"],
            "drop_first": False,
        },
        {
            "prefix": None,
            "prefix_sep": ".",
            "dummy_na": True,
            "columns": ["time_col", "float64_col"],
            "drop_first": True,
        },
    ],
)
def test_get_dummies_dataframe(scalars_dfs, kwargs):
    """bpd.get_dummies on a DataFrame matches pandas across prefix/sep/na/drop_first combos."""
    scalars_df, scalars_pandas_df = scalars_dfs

    bf_result = bpd.get_dummies(scalars_df, **kwargs, dtype=bool)
    pd_result = pd.get_dummies(scalars_pandas_df, **kwargs, dtype=bool)
    # dtype argument above is needed for pandas v1 only

    # adjust for expected dtype differences
    # (BigFrames yields nullable "boolean", pandas yields numpy "bool")
    for (column_name, type_name) in zip(pd_result.columns, pd_result.dtypes):
        if type_name == "bool":
            pd_result[column_name] = pd_result[column_name].astype("boolean")

    pd.testing.assert_frame_equal(bf_result.to_pandas(), pd_result)
87+
88+
89+
def test_get_dummies_dataframe_duplicate_labels(scalars_dfs):
    """get_dummies handles duplicate and None column labels like pandas 2.x."""
    if pd.__version__.startswith("1."):
        pytest.skip("pandas has different behavior in 1.x")

    scalars_df, scalars_pandas_df = scalars_dfs

    # Create duplicate labels: two "int64_col" columns and two None-labeled columns.
    scalars_renamed_df = scalars_df.rename(
        columns={"int64_too": "int64_col", "float64_col": None, "string_col": None}
    )
    scalars_renamed_pandas_df = scalars_pandas_df.rename(
        columns={"int64_too": "int64_col", "float64_col": None, "string_col": None}
    )

    bf_result = bpd.get_dummies(
        scalars_renamed_df, columns=["int64_col", None], dtype=bool
    )
    pd_result = pd.get_dummies(
        scalars_renamed_pandas_df, columns=["int64_col", None], dtype=bool
    )
    # dtype argument above is needed for pandas v1 only

    # adjust for expected dtype differences
    # (BigFrames yields nullable "boolean", pandas yields numpy "bool")
    for (column_name, type_name) in zip(pd_result.columns, pd_result.dtypes):
        if type_name == "bool":
            pd_result[column_name] = pd_result[column_name].astype("boolean")

    pd.testing.assert_frame_equal(bf_result.to_pandas(), pd_result)
116+
117+
118+
def test_get_dummies_series(scalars_dfs):
    """get_dummies on a named Series matches pandas (no prefix by default)."""
    scalars_df, scalars_pandas_df = scalars_dfs
    bf_series = scalars_df.date_col
    pd_series = scalars_pandas_df.date_col

    bf_result = bpd.get_dummies(bf_series, dtype=bool)
    pd_result = pd.get_dummies(pd_series, dtype=bool)
    # dtype argument above is needed for pandas v1 only

    # adjust for expected dtype differences
    # (BigFrames yields nullable "boolean", pandas yields numpy "bool")
    for (column_name, type_name) in zip(pd_result.columns, pd_result.dtypes):
        if type_name == "bool":
            pd_result[column_name] = pd_result[column_name].astype("boolean")
    # BigFrames column labels come back as object dtype; align pandas to match.
    pd_result.columns = pd_result.columns.astype(object)

    pd.testing.assert_frame_equal(
        bf_result.to_pandas(),
        pd_result,
    )
137+
138+
139+
def test_get_dummies_series_nameless(scalars_dfs):
    """get_dummies on a Series with a None name matches pandas."""
    scalars_df, scalars_pandas_df = scalars_dfs
    bf_series = scalars_df.date_col.rename(None)
    pd_series = scalars_pandas_df.date_col.rename(None)

    bf_result = bpd.get_dummies(bf_series, dtype=bool)
    pd_result = pd.get_dummies(pd_series, dtype=bool)
    # dtype argument above is needed for pandas v1 only

    # adjust for expected dtype differences
    # (BigFrames yields nullable "boolean", pandas yields numpy "bool")
    for (column_name, type_name) in zip(pd_result.columns, pd_result.dtypes):
        if type_name == "bool":
            pd_result[column_name] = pd_result[column_name].astype("boolean")
    # BigFrames column labels come back as object dtype; align pandas to match.
    pd_result.columns = pd_result.columns.astype(object)

    pd.testing.assert_frame_equal(
        bf_result.to_pandas(),
        pd_result,
    )
158+
159+
48160
@pytest.mark.parametrize(
49161
("how"),
50162
[

third_party/bigframes_vendored/pandas/core/reshape/concat.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# Contains code from https://github.com/pandas-dev/pandas/blob/main/pandas/core/reshape/concat.py
22
"""
3-
Concat routines.
3+
Concat routines
44
"""
55
from __future__ import annotations
66

0 commit comments

Comments
 (0)