-
Notifications
You must be signed in to change notification settings - Fork 51
/
Copy pathstatistics.proto
310 lines (264 loc) · 10.3 KB
/
statistics.proto
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
// Definitions for aggregated feature statistics for datasets.
syntax = "proto3";

package tensorflow.metadata.v0;

// Enable arena allocation in the generated C++ code.
option cc_enable_arenas = true;
// Emit one Java source file per top-level type.
option java_multiple_files = true;
option java_package = "org.tensorflow.metadata.v0";
// Copied from Facets feature_statistics.proto
// Must be kept binary-compatible with the original, until all usages
// are updated to use this version, or we write a proto-to-proto converter.
// A list of feature statistics for different datasets. To compare datasets
// through this list, every DatasetFeatureStatistics entry should contain the
// same list of features.
message DatasetFeatureStatisticsList {
  // The feature statistics, one entry per dataset.
  repeated DatasetFeatureStatistics datasets = 1;
}
// Feature statistics computed over one dataset.
message DatasetFeatureStatistics {
  // Name of the dataset.
  string name = 1;

  // Count of examples contained in the dataset.
  uint64 num_examples = 2;

  // Only valid if a weight feature was specified. A missing weighted feature
  // is treated as zero.
  double weighted_num_examples = 4;

  // Statistics for the features of the dataset.
  repeated FeatureNameStatistics features = 3;
}
// All statistics gathered for one feature name within a dataset.
message FeatureNameStatistics {
  // Data types the feature statistics support. When aggregating tf.Examples
  // whose bytelist holds a string, it is recommended to record it as STRING
  // rather than BYTES so that string-specific statistical measures can be
  // calculated.
  enum Type {
    INT = 0;
    FLOAT = 1;
    STRING = 2;
    BYTES = 3;
    STRUCT = 4;
  }

  // Name of the feature.
  string name = 1;

  // Data type of the feature.
  Type type = 2;

  // Statistics over the feature's values; at most one member is set.
  oneof stats {
    NumericStatistics num_stats = 3;
    StringStatistics string_stats = 4;
    BytesStatistics bytes_stats = 5;
    StructStatistics struct_stats = 7;
  }

  // Holds any custom statistics for the feature.
  repeated CustomStatistic custom_stats = 6;
}
// Weighted statistics shared by all feature types. An example whose weighted
// column is missing is counted with a weight of 1.
message WeightedCommonStatistics {
  // Weighted count of examples that are not missing.
  double num_non_missing = 1;

  // Weighted count of examples that are missing. A weighted column equal to
  // zero does not count as missing.
  double num_missing = 2;

  // Average number of values, weighted by the number of examples.
  double avg_num_values = 3;

  // Equals avg_num_values * num_non_missing, but is calculated directly, so
  // it should carry less numerical error.
  double tot_num_values = 4;
}
// A named custom statistic. Its value is one of: a double, a string, a
// histogram, or a rank histogram.
message CustomStatistic {
  // Name of the statistic.
  string name = 1;

  // Value of the statistic; at most one member is set.
  oneof val {
    double num = 2;
    string str = 3;
    Histogram histogram = 4;
    RankHistogram rank_histogram = 5;
  }
}
// Statistics describing a numeric feature of a dataset.
message NumericStatistics {
  // Statistics shared by all feature types.
  CommonStatistics common_stats = 1;

  // Mean of the values.
  double mean = 2;

  // Standard deviation of the values.
  double std_dev = 3;

  // Count of values equal to 0.
  uint64 num_zeros = 4;

  // Minimum value.
  double min = 5;

  // Median value.
  double median = 6;

  // Maximum value.
  double max = 7;

  // Histogram(s) of the feature values.
  repeated Histogram histograms = 8;

  // Weighted statistics for the feature, if the values have weights.
  WeightedNumericStatistics weighted_numeric_stats = 9;
}
// Statistics describing a string feature of a dataset.
message StringStatistics {
  // A value paired with the number of times it occurs.
  message FreqAndValue {
    // Field number 1 belonged to a deleted field and must not be reused.
    reserved 1;

    // The value.
    string value = 2;

    // Number of times the value occurs. A double so that weighted features
    // can be handled.
    double frequency = 3;
  }

  // Statistics shared by all feature types.
  CommonStatistics common_stats = 1;

  // Count of unique values.
  uint64 unique = 2;

  // The most frequent values and their frequencies, sorted so that the most
  // frequent value comes first.
  repeated FreqAndValue top_values = 3;

  // Average length of the values.
  float avg_length = 4;

  // Rank histogram of the feature's values. A value's rank measures how
  // commonly it is found in the dataset: the most common value has rank 1,
  // the second-most common has rank 2, and so on.
  RankHistogram rank_histogram = 5;

  // Weighted statistics for the feature, if the values have weights.
  WeightedStringStatistics weighted_string_stats = 6;
}
// Statistics describing a weighted numeric feature of a dataset.
message WeightedNumericStatistics {
  // Weighted mean of the values.
  double mean = 1;

  // Weighted standard deviation of the values.
  double std_dev = 2;

  // Weighted median of the values.
  double median = 3;

  // Histogram(s) of the weighted feature values.
  repeated Histogram histograms = 4;
}
// Statistics describing a weighted string feature of a dataset.
message WeightedStringStatistics {
  // The most frequent values and their weighted frequencies, sorted so that
  // the most frequent value comes first.
  repeated StringStatistics.FreqAndValue top_values = 1;

  // Rank histogram of the weighted values of the feature.
  RankHistogram rank_histogram = 2;
}
// Statistics describing a bytes feature of a dataset.
message BytesStatistics {
  // Statistics shared by all feature types.
  CommonStatistics common_stats = 1;

  // Count of unique values.
  uint64 unique = 2;

  // Average number of bytes in a value.
  float avg_num_bytes = 3;

  // Minimum number of bytes in a value.
  float min_num_bytes = 4;

  // Maximum number of bytes in a value.
  float max_num_bytes = 5;
}
// Statistics for a struct feature in a dataset.
message StructStatistics {
  // Statistics shared by all feature types.
  // NOTE(review): the other *Statistics messages name this field
  // `common_stats`; the name is kept here because renaming it would change
  // generated accessors and JSON keys.
  CommonStatistics common_statistics = 1;
}
// Statistics shared by all feature types.
message CommonStatistics {
  // Count of examples that have at least one value for this feature.
  uint64 num_non_missing = 1;

  // Count of examples that have no values for this feature.
  uint64 num_missing = 2;

  // Minimum number of values a single example holds for this feature.
  uint64 min_num_values = 3;

  // Maximum number of values a single example holds for this feature.
  uint64 max_num_values = 4;

  // Average number of values a single example holds for this feature.
  float avg_num_values = 5;

  // Equals avg_num_values * num_non_missing, but is calculated directly, so
  // it should carry less numerical error.
  uint64 tot_num_values = 8;

  // Quantiles histogram of the number of values in this feature.
  Histogram num_values_histogram = 6;

  // Weighted common statistics for this feature.
  WeightedCommonStatistics weighted_common_stats = 7;

  // Histogram of the number of features in the feature list. Only set when
  // this feature is a non-context feature from a tf.SequenceExample.
  // Unlike num_values_histogram, which tracks the count of all values for a
  // feature in an example, this tracks the length of the feature list for
  // this feature in an example (each feature list can contain multiple
  // values).
  Histogram feature_list_length_histogram = 9;
}
// Data from which a histogram of a numeric feature of a dataset is built.
message Histogram {
  // One bucket: its low and high values plus its count. The low and high
  // values must each be a real number or positive or negative infinity; they
  // cannot be NaN or undefined. Counts of those special values are found in
  // the num_nan and num_undefined fields of the enclosing Histogram.
  message Bucket {
    // Field number 3 belonged to a deleted field and must not be reused.
    reserved 3;

    // Low value of the bucket, inclusive.
    double low_value = 1;

    // High value of the bucket, exclusive unless high_value is positive
    // infinity.
    double high_value = 2;

    // Number of items in the bucket. A double so that weighted histograms can
    // be handled.
    double sample_count = 4;
  }

  // Kind of histogram stored in this message.
  enum HistogramType {
    // Standard histogram with equal-width buckets.
    STANDARD = 0;
    // Quantile information, stored as equal-count buckets of variable width.
    QUANTILES = 1;
  }

  // Count of NaN values in the dataset.
  uint64 num_nan = 1;

  // Count of undefined values in the dataset.
  uint64 num_undefined = 2;

  // Buckets of the histogram, sorted from the lowest bucket to the highest.
  repeated Bucket buckets = 3;

  // The type of the histogram.
  HistogramType type = 4;

  // Optional descriptive name of the histogram, to be used for labeling.
  string name = 5;
}
// Data from which a rank histogram of a non-numeric feature of a dataset is
// built. A value's rank within a feature can serve as a measure of how
// commonly the value is found in the entire dataset. With bucket sizes of
// one, this becomes a distribution function of all feature values.
message RankHistogram {
  // One bucket: its start and end ranks plus its count.
  message Bucket {
    // Field number 3 belonged to a deleted field and must not be reused.
    reserved 3;

    // Low rank of the bucket, inclusive.
    uint64 low_rank = 1;

    // High rank of the bucket, exclusive.
    uint64 high_rank = 2;

    // Label for the bucket. Can be used to list or summarize the values in
    // this rank bucket.
    string label = 4;

    // Number of items in the bucket. A double so that weighted histograms can
    // be handled.
    double sample_count = 5;
  }

  // Buckets of the histogram, sorted from the lowest-ranked bucket to the
  // highest-ranked bucket.
  repeated Bucket buckets = 1;

  // Optional descriptive name of the histogram, to be used for labeling.
  string name = 2;
}