Skip to content

Commit 38f3679

Browse files
paulgc17 and tfx-copybara
authored and committed
Add ptransforms to write statistics to text and tfrecord files.
PiperOrigin-RevId: 311405240
1 parent 8818451 commit 38f3679

File tree

8 files changed

+164
-35
lines changed

8 files changed

+164
-35
lines changed

RELEASE.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,11 @@
1414
* Add utility methods `tfdv.get_slice_stats` to get statistics for a slice and
1515
`tfdv.compare_slices` to compare statistics of two slices using Facets.
1616
* Make `tfdv.load_stats_text` and `tfdv.write_stats_text` public.
17+
* Add PTransforms `tfdv.WriteStatisticsToText` and
18+
`tfdv.WriteStatisticsToTFRecord` to write statistics proto to text and
19+
tfrecord files respectively.
20+
* Modify `tfdv.load_statistics` to handle reading statistics from TFRecord and
21+
text files.
1722
* Requires `pyarrow>=0.16,<1`.
1823

1924
## Known Issues

tensorflow_data_validation/__init__.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,8 @@
2222

2323
# Import stats API.
2424
from tensorflow_data_validation.api.stats_api import GenerateStatistics
25+
from tensorflow_data_validation.api.stats_api import WriteStatisticsToText
26+
from tensorflow_data_validation.api.stats_api import WriteStatisticsToTFRecord
2527

2628
# Import validation API.
2729
from tensorflow_data_validation.api.validation_api import infer_schema
@@ -69,10 +71,10 @@
6971
from tensorflow_data_validation.utils.stats_gen_lib import generate_statistics_from_csv
7072
from tensorflow_data_validation.utils.stats_gen_lib import generate_statistics_from_dataframe
7173
from tensorflow_data_validation.utils.stats_gen_lib import generate_statistics_from_tfrecord
72-
from tensorflow_data_validation.utils.stats_gen_lib import load_statistics
7374

7475
# Import stats utilities.
7576
from tensorflow_data_validation.utils.stats_util import get_slice_stats
77+
from tensorflow_data_validation.utils.stats_util import load_statistics
7678
from tensorflow_data_validation.utils.stats_util import load_stats_text
7779
from tensorflow_data_validation.utils.stats_util import write_stats_text
7880

tensorflow_data_validation/api/stats_api.py

Lines changed: 46 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@
5050
from tensorflow_data_validation import constants
5151
from tensorflow_data_validation.statistics import stats_impl
5252
from tensorflow_data_validation.statistics import stats_options
53-
from typing import Generator
53+
from typing import Generator, Text
5454

5555
from tensorflow_metadata.proto.v0 import statistics_pb2
5656

@@ -130,3 +130,48 @@ def _sample_at_rate(example: pa.RecordBatch, sample_rate: float
130130
# or add an optional seed argument.
131131
if random.random() <= sample_rate:
132132
yield example
133+
134+
135+
@beam.typehints.with_input_types(statistics_pb2.DatasetFeatureStatisticsList)
136+
@beam.typehints.with_output_types(beam.pvalue.PDone)
137+
class WriteStatisticsToText(beam.PTransform):
138+
"""API for writing serialized data statistics to text file."""
139+
140+
def __init__(self, output_path: Text) -> None:
141+
"""Initializes the transform.
142+
143+
Args:
144+
output_path: Output path for writing data statistics.
145+
"""
146+
self._output_path = output_path
147+
148+
def expand(self, stats: beam.pvalue.PCollection) -> beam.pvalue.PDone:
149+
return (stats
150+
| 'WriteStats' >> beam.io.WriteToText(
151+
self._output_path,
152+
shard_name_template='',
153+
append_trailing_newlines=False,
154+
coder=beam.coders.ProtoCoder(
155+
statistics_pb2.DatasetFeatureStatisticsList)))
156+
157+
158+
@beam.typehints.with_input_types(statistics_pb2.DatasetFeatureStatisticsList)
159+
@beam.typehints.with_output_types(beam.pvalue.PDone)
160+
class WriteStatisticsToTFRecord(beam.PTransform):
161+
"""API for writing serialized data statistics to TFRecord file."""
162+
163+
def __init__(self, output_path: Text) -> None:
164+
"""Initializes the transform.
165+
166+
Args:
167+
output_path: Output path for writing data statistics.
168+
"""
169+
self._output_path = output_path
170+
171+
def expand(self, stats: beam.pvalue.PCollection) -> beam.pvalue.PDone:
172+
return (stats
173+
| 'WriteStats' >> beam.io.WriteToTFRecord(
174+
self._output_path,
175+
shard_name_template='',
176+
coder=beam.coders.ProtoCoder(
177+
statistics_pb2.DatasetFeatureStatisticsList)))

tensorflow_data_validation/api/stats_api_test.py

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,13 +19,17 @@
1919

2020
from __future__ import print_function
2121

22+
import os
23+
import tempfile
2224
from absl.testing import absltest
2325
import apache_beam as beam
2426
from apache_beam.testing import util
2527
import numpy as np
2628
import pyarrow as pa
2729
from tensorflow_data_validation.api import stats_api
2830
from tensorflow_data_validation.statistics import stats_options
31+
from tensorflow_data_validation.utils import io_util
32+
from tensorflow_data_validation.utils import stats_util
2933
from tensorflow_data_validation.utils import test_util
3034

3135
from google.protobuf import text_format
@@ -34,6 +38,9 @@
3438

3539
class StatsAPITest(absltest.TestCase):
3640

41+
def _get_temp_dir(self):
42+
return tempfile.mkdtemp()
43+
3744
def test_stats_pipeline(self):
3845
record_batches = [
3946
pa.RecordBatch.from_arrays([
@@ -636,6 +643,42 @@ def test_invalid_stats_options(self):
636643
p | beam.Create(record_batches)
637644
| stats_api.GenerateStatistics(options={}))
638645

646+
def test_write_stats_to_text(self):
647+
stats = text_format.Parse(
648+
"""
649+
datasets {
650+
name: 'x'
651+
num_examples: 100
652+
}
653+
""", statistics_pb2.DatasetFeatureStatisticsList())
654+
output_path = os.path.join(self._get_temp_dir(), 'stats')
655+
with beam.Pipeline() as p:
656+
_ = (p | beam.Create([stats]) | stats_api.WriteStatisticsToText(
657+
output_path))
658+
stats_from_file = statistics_pb2.DatasetFeatureStatisticsList()
659+
serialized_stats = io_util.read_file_to_string(
660+
output_path, binary_mode=True)
661+
stats_from_file.ParseFromString(serialized_stats)
662+
self.assertLen(stats_from_file.datasets, 1)
663+
test_util.assert_dataset_feature_stats_proto_equal(
664+
self, stats_from_file.datasets[0], stats.datasets[0])
665+
666+
def test_write_stats_to_tfrecord(self):
667+
stats = text_format.Parse(
668+
"""
669+
datasets {
670+
name: 'x'
671+
num_examples: 100
672+
}
673+
""", statistics_pb2.DatasetFeatureStatisticsList())
674+
output_path = os.path.join(self._get_temp_dir(), 'stats')
675+
with beam.Pipeline() as p:
676+
_ = (p | beam.Create([stats]) | stats_api.WriteStatisticsToTFRecord(
677+
output_path))
678+
stats_from_file = stats_util.load_statistics(output_path)
679+
self.assertLen(stats_from_file.datasets, 1)
680+
test_util.assert_dataset_feature_stats_proto_equal(
681+
self, stats_from_file.datasets[0], stats.datasets[0])
639682

640683
if __name__ == '__main__':
641684
absltest.main()

tensorflow_data_validation/utils/stats_gen_lib.py

Lines changed: 7 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@
4242
from tensorflow_data_validation.statistics import stats_impl
4343
from tensorflow_data_validation.statistics import stats_options as options
4444
from tensorflow_data_validation.statistics.generators import stats_generator
45+
from tensorflow_data_validation.utils import stats_util
4546
from tfx_bsl.arrow import array_util
4647
from typing import Any, List, Optional, Text
4748

@@ -120,12 +121,9 @@ def generate_statistics_from_tfrecord(
120121
desired_batch_size=batch_size)
121122
| 'GenerateStatistics' >> stats_api.GenerateStatistics(stats_options)
122123
# TODO(b/112014711) Implement a custom sink to write the stats proto.
123-
| 'WriteStatsOutput' >> beam.io.WriteToTFRecord(
124-
output_path,
125-
shard_name_template='',
126-
coder=beam.coders.ProtoCoder(
127-
statistics_pb2.DatasetFeatureStatisticsList)))
128-
return load_statistics(output_path)
124+
| 'WriteStatsOutput' >> stats_api.WriteStatisticsToTFRecord(
125+
output_path))
126+
return stats_util.load_statistics(output_path)
129127

130128

131129
def generate_statistics_from_csv(
@@ -204,12 +202,9 @@ def generate_statistics_from_csv(
204202
desired_batch_size=batch_size)
205203
| 'GenerateStatistics' >> stats_api.GenerateStatistics(stats_options)
206204
# TODO(b/112014711) Implement a custom sink to write the stats proto.
207-
| 'WriteStatsOutput' >> beam.io.WriteToTFRecord(
208-
output_path,
209-
shard_name_template='',
210-
coder=beam.coders.ProtoCoder(
211-
statistics_pb2.DatasetFeatureStatisticsList)))
212-
return load_statistics(output_path)
205+
| 'WriteStatsOutput' >> stats_api.WriteStatisticsToTFRecord(
206+
output_path))
207+
return stats_util.load_statistics(output_path)
213208

214209

215210
def generate_statistics_from_dataframe(
@@ -348,19 +343,3 @@ def get_csv_header(data_location: Text,
348343
'Found empty file when reading the header line: %s' % filename)
349344

350345
return result
351-
352-
353-
def load_statistics(
354-
input_path: Text) -> statistics_pb2.DatasetFeatureStatisticsList:
355-
"""Loads data statistics proto from file.
356-
357-
Args:
358-
input_path: Data statistics file path.
359-
360-
Returns:
361-
A DatasetFeatureStatisticsList proto.
362-
"""
363-
serialized_stats = next(tf.compat.v1.io.tf_record_iterator(input_path))
364-
result = statistics_pb2.DatasetFeatureStatisticsList()
365-
result.ParseFromString(serialized_stats)
366-
return result

tensorflow_data_validation/utils/stats_util.py

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,10 @@
1818

1919
from __future__ import print_function
2020

21+
import logging
2122
import numpy as np
2223
import pyarrow as pa
24+
import tensorflow as tf
2325
from tensorflow_data_validation import types
2426
from tensorflow_data_validation.arrow import arrow_util
2527
from tensorflow_data_validation.utils import io_util
@@ -212,6 +214,22 @@ def load_stats_text(
212214
return stats_proto
213215

214216

217+
def load_stats_tfrecord(
218+
input_path: Text) -> statistics_pb2.DatasetFeatureStatisticsList:
219+
"""Loads data statistics proto from TFRecord file.
220+
221+
Args:
222+
input_path: Data statistics file path.
223+
224+
Returns:
225+
A DatasetFeatureStatisticsList proto.
226+
"""
227+
serialized_stats = next(tf.compat.v1.io.tf_record_iterator(input_path))
228+
result = statistics_pb2.DatasetFeatureStatisticsList()
229+
result.ParseFromString(serialized_stats)
230+
return result
231+
232+
215233
def get_feature_stats(stats: statistics_pb2.DatasetFeatureStatistics,
216234
feature_path: types.FeaturePath
217235
) -> statistics_pb2.FeatureNameStatistics:
@@ -295,3 +313,27 @@ def get_slice_stats(statistics: statistics_pb2.DatasetFeatureStatisticsList,
295313
result.datasets.add().CopyFrom(slice_stats)
296314
return result
297315
raise ValueError('Invalid slice key.')
316+
317+
318+
def load_statistics(
319+
input_path: Text) -> statistics_pb2.DatasetFeatureStatisticsList:
320+
"""Loads data statistics proto from file.
321+
322+
Args:
323+
input_path: Data statistics file path. The file should be a one-record
324+
TFRecord file or a plain file containing the serialized statistics proto.
325+
326+
Returns:
327+
A DatasetFeatureStatisticsList proto.
328+
329+
Raises:
330+
IOError: If the input path does not exist.
331+
"""
332+
if not tf.io.gfile.exists(input_path):
333+
raise IOError('Invalid input path {}.'.format(input_path))
334+
try:
335+
return load_stats_tfrecord(input_path)
336+
except Exception: # pylint: disable=broad-except
337+
logging.info('File %s did not look like a TFRecord. Try reading as a plain '
338+
'file.', input_path)
339+
return load_stats_text(input_path)

tensorflow_data_validation/utils/stats_util_test.py

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
from absl import flags
2323
from absl.testing import absltest
2424
import numpy as np
25+
import tensorflow as tf
2526
from tensorflow_data_validation import types
2627
from tensorflow_data_validation.utils import stats_util
2728

@@ -129,12 +130,23 @@ def test_get_utf8(self):
129130

130131
def test_write_load_stats_text(self):
131132
stats = text_format.Parse("""
132-
datasets {}
133+
datasets { name: 'abc' }
133134
""", statistics_pb2.DatasetFeatureStatisticsList())
134135
stats_path = os.path.join(FLAGS.test_tmpdir, 'stats.pbtxt')
135136
stats_util.write_stats_text(stats=stats, output_path=stats_path)
136-
loaded_stats = stats_util.load_stats_text(input_path=stats_path)
137-
self.assertEqual(stats, loaded_stats)
137+
self.assertEqual(stats, stats_util.load_stats_text(input_path=stats_path))
138+
self.assertEqual(stats, stats_util.load_statistics(input_path=stats_path))
139+
140+
def test_load_stats_tfrecord(self):
141+
stats = text_format.Parse("""
142+
datasets { name: 'abc' }
143+
""", statistics_pb2.DatasetFeatureStatisticsList())
144+
stats_path = os.path.join(FLAGS.test_tmpdir, 'stats.tfrecord')
145+
with tf.io.TFRecordWriter(stats_path) as writer:
146+
writer.write(stats.SerializeToString())
147+
self.assertEqual(stats,
148+
stats_util.load_stats_tfrecord(input_path=stats_path))
149+
self.assertEqual(stats, stats_util.load_statistics(input_path=stats_path))
138150

139151
def test_write_stats_text_invalid_stats_input(self):
140152
with self.assertRaisesRegexp(

tensorflow_data_validation/utils/validation_lib.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
from tensorflow_data_validation.statistics import stats_impl
3232
from tensorflow_data_validation.statistics import stats_options as options
3333
from tensorflow_data_validation.utils import stats_gen_lib
34+
from tensorflow_data_validation.utils import stats_util
3435
from typing import List, Optional, Text
3536

3637
from tensorflow_metadata.proto.v0 import statistics_pb2
@@ -104,7 +105,7 @@ def validate_examples_in_tfrecord(
104105
coder=beam.coders.ProtoCoder(
105106
statistics_pb2.DatasetFeatureStatisticsList)))
106107

107-
return stats_gen_lib.load_statistics(output_path)
108+
return stats_util.load_statistics(output_path)
108109

109110

110111
def validate_examples_in_csv(
@@ -193,4 +194,4 @@ def validate_examples_in_csv(
193194
coder=beam.coders.ProtoCoder(
194195
statistics_pb2.DatasetFeatureStatisticsList)))
195196

196-
return stats_gen_lib.load_statistics(output_path)
197+
return stats_util.load_statistics(output_path)

0 commit comments

Comments (0)