tensorflow_data_validation: 6 files changed, +2 -13 lines

This change cleans up stale TODO(pachristopher) comments across the package: most are deleted outright, and two are reattributed to tracking bugs instead.

File 1 of 6
@@ -104,8 +104,6 @@ def expand(
 def _sample_at_rate(example: pa.RecordBatch, sample_rate: float
                     ) -> Generator[pa.RecordBatch, None, None]:
   """Sample examples at input sampling rate."""
-  # TODO(pachristopher): Revisit this to decide if we need to fix a seed
-  # or add an optional seed argument.
   if random.random() <= sample_rate:
     yield example
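The deleted TODO had asked whether sampling should take a fixed or optional seed. A minimal sketch of that idea, assuming a hypothetical caller-supplied RNG argument that is not part of the actual API:

    import random
    from typing import Generator, Optional

    import pyarrow as pa


    def _sample_at_rate_seeded(
        example: pa.RecordBatch, sample_rate: float,
        rng: Optional[random.Random] = None,
    ) -> Generator[pa.RecordBatch, None, None]:
      """Like _sample_at_rate, but the sampling decision can be reproduced."""
      rng = rng or random  # fall back to the module-level generator
      if rng.random() <= sample_rate:
        yield example

    # Passing rng=random.Random(42) makes repeated runs sample identically.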
File 2 of 6
@@ -129,7 +129,6 @@ def _may_be_set_legacy_flag(schema: schema_pb2.Schema):
   schema.generate_legacy_feature_spec = False


-# TODO(pachristopher): Add support for updating only a subset of features.
 def update_schema(schema: schema_pb2.Schema,
                   statistics: statistics_pb2.DatasetFeatureStatisticsList,
                   infer_feature_shape: Optional[bool] = True,
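For context, update_schema merges newly computed statistics into an existing schema. A usage sketch built from public TFDV entry points (the dataframe contents are made up):

    import pandas as pd
    import tensorflow_data_validation as tfdv

    df = pd.DataFrame({'age': [34, 57, 41]})  # hypothetical data
    stats = tfdv.generate_statistics_from_dataframe(df)
    schema = tfdv.infer_schema(stats)

    # Fold statistics from a fresh batch of data into the schema.
    new_stats = tfdv.generate_statistics_from_dataframe(df)
    schema = tfdv.update_schema(schema, new_stats, infer_feature_shape=True)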
File 3 of 6
@@ -529,8 +529,6 @@ def compute(
         result[feature_path] = {self._custom_stats_key: 0.0}
       return stats_util.make_dataset_feature_stats_proto(result)

-    # TODO(pachristopher): Currently encoded examples operate on lists. Consider
-    # using ndarrays and vectorizing the operations.
     encoded_examples = _encode_examples(examples_record_batch,
                                         self._multivalent_features,
                                         self._categorical_features,
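The deleted TODO suggested moving the encoded examples from Python lists to vectorized ndarray operations. A toy illustration of the trade-off it refers to (not TFDV code):

    import numpy as np

    encoded = [[1, 0, 2], [0, 3, 1], [2, 2, 0]]  # stand-in encoded examples

    # List-based: one Python-level loop iteration per example.
    totals_lists = [sum(row) for row in encoded]

    # ndarray-based: a single vectorized reduction over the same data.
    totals_vec = np.asarray(encoded).sum(axis=1)

    assert totals_lists == totals_vec.tolist()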
File 4 of 6
@@ -83,8 +83,6 @@ def expand(
             slicing_util.GenerateSlicesSqlDoFn(
                 slice_sqls=self._options.experimental_slice_sqls)))
     else:
-      # TODO(pachristopher): Remove this special case if this doesn't give any
-      # performance improvement.
       dataset = (dataset
                  | 'KeyWithVoid' >> beam.Map(lambda v: (None, v)))
     _ = dataset | 'TrackDistinctSliceKeys' >> _TrackDistinctSliceKeys()  # pylint: disable=no-value-for-parameter
@@ -551,9 +549,6 @@ def __init__(
     else:
       self._desired_batch_size = constants.DEFAULT_DESIRED_INPUT_BATCH_SIZE

-    # TODO(pachristopher): Understand the cost of incrementing beam counters
-    # for every input batch. The other option is to update the counters during
-    # teardown.
     # Metrics
     self._combine_batch_size = beam.metrics.Metrics.distribution(
         constants.METRICS_NAMESPACE, 'combine_batch_size')
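The deleted TODO weighed per-batch counter increments against deferring the update to teardown. A sketch of the deferred pattern for a Beam DoFn, with hypothetical names (the class touched here is a CombineFn, not this DoFn):

    import apache_beam as beam


    class _CountBatchesDoFn(beam.DoFn):
      """Accumulates locally; flushes one counter update per bundle."""

      def __init__(self):
        self._counter = beam.metrics.Metrics.counter(
            'example_namespace', 'num_input_batches')  # assumed names
        self._num_batches = 0

      def process(self, batch):
        self._num_batches += 1  # cheap local increment, no metrics call
        yield batch

      def finish_bundle(self):
        self._counter.inc(self._num_batches)  # one metrics update per bundle
        self._num_batches = 0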
File 5 of 6
@@ -28,7 +28,7 @@
 from tfx_bsl.coders import batch_util


-# TODO(pachristopher): Deprecate this.
+# TODO(b/221152546): Deprecate this.
 @beam.ptransform_fn
 def BatchExamplesToArrowRecordBatches(
     examples: beam.PCollection[types.Example],
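A usage sketch for the (soon-to-be-deprecated) transform, assuming examples are dicts of numpy arrays as in TFDV's types.Example; the feature name is invented:

    import apache_beam as beam
    import numpy as np

    with beam.Pipeline() as p:
      _ = (
          p
          | 'CreateExamples' >> beam.Create([{'age': np.array([34])},
                                             {'age': np.array([57])}])
          | 'Batch' >> BatchExamplesToArrowRecordBatches())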
File 6 of 6
@@ -225,8 +225,7 @@ def generate_statistics_from_dataframe(
     merged_partial_stats = _generate_partial_statistics_from_df(
         dataframe, stats_options, stats_generators)
   else:
-    # TODO(pachristopher): Investigate why we don't observe linear speedup after
-    # a certain number of processes.
+    # TODO(b/144580609): Consider using Beam for inmemory mode as well.
     splits = np.array_split(dataframe, n_jobs)
     partial_stats = Parallel(n_jobs=n_jobs)(
         delayed(_generate_partial_statistics_from_df)(
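The branch above fans the dataframe out with joblib. A self-contained sketch of the same split/apply/merge pattern, with a placeholder standing in for _generate_partial_statistics_from_df:

    import numpy as np
    import pandas as pd
    from joblib import Parallel, delayed


    def partial_stats(split: pd.DataFrame) -> dict:
      # Placeholder for _generate_partial_statistics_from_df.
      return {'count': len(split), 'sum': int(split['x'].sum())}


    df = pd.DataFrame({'x': range(100)})
    n_jobs = 4
    splits = np.array_split(df, n_jobs)
    partials = Parallel(n_jobs=n_jobs)(
        delayed(partial_stats)(s) for s in splits)
    merged = {k: sum(p[k] for p in partials) for k in partials[0]}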