-
Notifications
You must be signed in to change notification settings - Fork 4.4k
/
Copy pathdataset.ts
977 lines (902 loc) · 33.4 KB
/
dataset.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
/**
* @license
* Copyright 2019 Google LLC. All Rights Reserved.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* =============================================================================
*/
import * as tf from '@tensorflow/tfjs-core';
import * as tfd from '@tensorflow/tfjs-data';
import {normalize} from './browser_fft_utils';
import {arrayBuffer2String, concatenateArrayBuffers, getRandomInteger, getUID, string2ArrayBuffer} from './generic_utils';
import {balancedTrainValSplitNumArrays} from './training_utils';
import {AudioDataAugmentationOptions, Example, SpectrogramData} from './types';
// Descriptor for serialized dataset files. Stands for:
// TensorFlow.js Speech-Commands Dataset.
// It is written at the very beginning of every serialized dataset and is
// checked when a dataset is deserialized.
// DO NOT EVER CHANGE THIS!
export const DATASET_SERIALIZATION_DESCRIPTOR = 'TFJSSCDS';
// A version number for the serialization format. Since it is written into
// the serialized header as a single Uint32 value, it must be
// 1. a positive integer.
// 2. monotonically increasing over its change history.
// Item 1 is checked by unit tests.
export const DATASET_SERIALIZATION_VERSION = 1;
/**
* Specification for an `Example` (the type imported from `./types`).
*
* Used for serialization of `Example`: it records the label and the sizes
* needed to locate the example's spectrogram (and optional raw audio) inside
* the flattened binary data.
*/
export interface ExampleSpec {
/** A label for the example. */
label: string;
/** Number of frames in the spectrogram. */
spectrogramNumFrames: number;
/** The length of each frame in the spectrogram. */
spectrogramFrameSize: number;
/** The key frame index of the spectrogram (omitted if none is set). */
spectrogramKeyFrameIndex?: number;
/** Number of samples in the raw PCM-format audio (if any). */
rawAudioNumSamples?: number;
/** Sampling rate of the raw audio (if any). */
rawAudioSampleRateHz?: number;
}
/**
* Serialized Dataset, containing a number of `Example`s in their
* serialized format.
*
* This format consists of a plain-old JSON object as the manifest,
* along with a flattened binary `ArrayBuffer`. The format facilitates
* storage and transmission.
*/
export interface SerializedExamples {
/**
* Specifications of the serialized `Example`s, one entry per example, in
* the same order as the examples appear in `data`.
*/
manifest: ExampleSpec[];
/**
* Serialized binary data from the `Example`s.
*
* Including the spectrograms and the raw audio (if any).
*
* For example, assuming `manifest.length` is `N`, the format of the
* `ArrayBuffer` is as follows:
*
* [spectrogramData1, rawAudio1 (if any),
* spectrogramData2, rawAudio2 (if any),
* ...
* spectrogramDataN, rawAudioN (if any)]
*/
data: ArrayBuffer;
}
/**
* Label reserved for background-noise examples.
*
* `Dataset.getData()` windows examples carrying this label without a focus
* frame, and the noise-mixing augmentation draws its noise from them.
*/
export const BACKGROUND_NOISE_TAG = '_background_noise_';
/**
* Configuration for getting spectrograms as tensors.
*/
export interface GetDataConfig extends AudioDataAugmentationOptions {
/**
* Number of frames.
*
* This must be smaller than or equal to the # of frames of each
* example held by the dataset.
*
* If the # of frames of an example is greater than this number,
* the following heuristics will be used to extract >= 1 examples
* of length numFrames from the original example:
*
* - If the label of the example is `BACKGROUND_NOISE_TAG`,
* the example will be split into multiple examples using the
* `hopFrames` parameter (see below).
* - If the label of the example is not `BACKGROUND_NOISE_TAG`,
* the example will be split into multiple examples that
* all contain the maximum-intensity frame using the `hopFrames`
* parameter.
*/
numFrames?: number;
/**
* Hop length in number of frames.
*
* Used when splitting a long example into multiple shorter ones.
*
* Must be provided if any such long examples exist.
*/
hopFrames?: number;
/**
* Whether the spectrogram of each example will be normalized.
*
* Normalization means:
* - Subtracting the mean, and
* - Dividing the result by the standard deviation.
*
* Default: `true`.
*/
normalize?: boolean;
/**
* Whether the examples will be shuffled prior to being merged into
* `tf.Tensor`s.
*
* Default: `true`.
*/
shuffle?: boolean;
/**
* Whether to obtain a `tf.data.Dataset` object.
*
* Default: `false`.
*/
getDataset?: boolean;
/**
* Batch size for dataset.
*
* Applicable only if `getDataset === true`.
*
* Default: 32.
*/
datasetBatchSize?: number;
/**
* Validation split for the dataset.
*
* Applicable only if `getDataset === true`.
*
* The data will be divided into two fractions of relative sizes
* `[1 - datasetValidationSplit, datasetValidationSplit]`, for the
* training and validation `tf.data.Dataset` objects, respectively.
*
* Must be a number strictly between 0 and 1.
* Default: 0.15.
*/
datasetValidationSplit?: number;
}
/**
* Type of the `tf.data` datasets returned by `Dataset.getData()` when
* `config.getDataset` is `true`. Their elements are batched `{xs, ys}` pairs
* of spectrograms and one-hot targets.
*/
// tslint:disable-next-line:no-any
export type SpectrogramAndTargetsTfDataset = tfd.Dataset<{}>;
/**
 * A serializable, mutable set of speech/audio `Example`s.
 *
 * Examples are stored by a generated UID and are additionally indexed by
 * label, preserving the order in which they were added.
 */
export class Dataset {
  /** All examples currently held, keyed by UID. */
  private examples: {[id: string]: Example};
  /** UIDs of the examples under each label, in insertion order. */
  private label2Ids: {[label: string]: string[]};

  /**
   * Constructor of `Dataset`.
   *
   * If called with no arguments (i.e., `serialized` == null), an empty
   * dataset will be constructed.
   *
   * Else, the dataset will be deserialized from `serialized`.
   *
   * @param serialized Optional serialization artifacts to deserialize, as
   *   produced by `serialize()`.
   */
  constructor(serialized?: ArrayBuffer) {
    this.examples = {};
    this.label2Ids = {};
    if (serialized != null) {
      // Deserialize from the provided artifacts.
      const artifacts = arrayBuffer2SerializedExamples(serialized);
      let offset = 0;
      for (let i = 0; i < artifacts.manifest.length; ++i) {
        const spec = artifacts.manifest[i];
        // Number of float32 values belonging to this example: the spectrogram
        // followed by the raw audio (if any).
        let byteLen = spec.spectrogramNumFrames * spec.spectrogramFrameSize;
        if (spec.rawAudioNumSamples != null) {
          byteLen += spec.rawAudioNumSamples;
        }
        // Each value is a float32, i.e., 4 bytes.
        byteLen *= 4;
        this.addExample(deserializeExample(
            {spec, data: artifacts.data.slice(offset, offset + byteLen)}));
        offset += byteLen;
      }
    }
  }

  /**
   * Add an `Example` to the `Dataset`
   *
   * @param example A `Example`, with a label. The label must be a non-empty
   *   string.
   * @returns The UID for the added `Example`.
   */
  addExample(example: Example): string {
    tf.util.assert(example != null, () => 'Got null or undefined example');
    tf.util.assert(
        example.label != null && example.label.length > 0,
        () => `Expected label to be a non-empty string, ` +
            `but got ${JSON.stringify(example.label)}`);
    const uid = getUID();
    this.examples[uid] = example;
    if (!(example.label in this.label2Ids)) {
      this.label2Ids[example.label] = [];
    }
    this.label2Ids[example.label].push(uid);
    return uid;
  }

  /**
   * Merge the incoming dataset into this dataset
   *
   * The `Example` objects of the incoming dataset are added by reference
   * (they are not copied) and receive new UIDs in this dataset.
   *
   * @param dataset The incoming dataset to be merged into this dataset.
   */
  merge(dataset: Dataset): void {
    tf.util.assert(
        dataset !== this, () => 'Cannot merge a dataset into itself');
    const vocab = dataset.getVocabulary();
    for (const word of vocab) {
      const examples = dataset.getExamples(word);
      for (const example of examples) {
        this.addExample(example.example);
      }
    }
  }

  /**
   * Get a map from `Example` label to number of `Example`s with the label.
   *
   * @returns A map from label to number of example counts under that label.
   */
  getExampleCounts(): {[label: string]: number} {
    const counts: {[label: string]: number} = {};
    for (const uid in this.examples) {
      const example = this.examples[uid];
      if (!(example.label in counts)) {
        counts[example.label] = 0;
      }
      counts[example.label]++;
    }
    return counts;
  }

  /**
   * Get all examples of a given label, with their UIDs.
   *
   * @param label The requested label.
   * @return All examples of the given `label`, along with their UIDs.
   *   The examples are sorted in the order in which they are added to the
   *   `Dataset`.
   * @throws Error if label is `null` or `undefined`, or if no example with
   *   the label exists in the dataset.
   */
  getExamples(label: string): Array<{uid: string, example: Example}> {
    tf.util.assert(
        label != null,
        () =>
            `Expected label to be a string, but got ${JSON.stringify(label)}`);
    tf.util.assert(
        label in this.label2Ids,
        () => `No example of label "${label}" exists in dataset`);
    const output: Array<{uid: string, example: Example}> = [];
    this.label2Ids[label].forEach(id => {
      output.push({uid: id, example: this.examples[id]});
    });
    return output;
  }

  /**
   * Get all examples and labels as tensors.
   *
   * - If `label` is provided and exists in the vocabulary of the `Dataset`,
   *   the spectrograms of all `Example`s under the `label` will be returned
   *   as a 4D `tf.Tensor` as `xs`. The shape of the `tf.Tensor` will be
   *     `[numExamples, numFrames, frameSize, 1]`
   *   where
   *     - `numExamples` is the number of `Example`s with the label
   *     - `numFrames` is the number of frames in each spectrogram
   *     - `frameSize` is the size of each spectrogram frame.
   *   No label Tensor will be returned.
   * - If `label` is not provided, all `Example`s will be returned as `xs`.
   *   In addition, `ys` will contain a one-hot encoded list of labels.
   *   - The shape of `xs` will be: `[numExamples, numFrames, frameSize, 1]`
   *   - The shape of `ys` will be: `[numExamples, vocabularySize]`.
   *
   * @param label Optional label. If provided, only the examples under this
   *   label are used.
   * @param config Optional configuration; see `GetDataConfig`.
   * @returns If `config.getDataset` is `true`, returns two `tf.data.Dataset`
   *   objects, one for training and one for validation.
   *   Else, `xs` and `ys` tensors. See description above.
   * @throws Error
   *   - if not all the involved spectrograms have matching `numFrames` and
   *     `frameSize`, or
   *   - if `label` is provided and is not present in the vocabulary of the
   *     `Dataset`, or
   *   - if the `Dataset` is currently empty.
   */
  getData(label?: string, config?: GetDataConfig): {
    xs: tf.Tensor4D,
    ys?: tf.Tensor2D
  }|[SpectrogramAndTargetsTfDataset, SpectrogramAndTargetsTfDataset] {
    tf.util.assert(
        this.size() > 0,
        () =>
            `Cannot get spectrograms as tensors because the dataset is empty`);
    const vocab = this.getVocabulary();
    if (label != null) {
      tf.util.assert(
          vocab.indexOf(label) !== -1,
          () => `Label ${label} is not in the vocabulary ` +
              `(${JSON.stringify(vocab)})`);
    } else {
      // If all words are requested, there must be at least two words in the
      // vocabulary to make one-hot encoding possible.
      tf.util.assert(
          vocab.length > 1,
          () => `One-hot encoding of labels requires the vocabulary to have ` +
              `at least two words, but it has only ${vocab.length} word.`);
    }
    if (config == null) {
      config = {};
    }

    // Get the numFrames lengths of all the examples currently held by the
    // dataset.
    const sortedUniqueNumFrames = this.getSortedUniqueNumFrames();
    let numFrames: number;
    let hopFrames: number;
    if (sortedUniqueNumFrames.length === 1) {
      // All examples have the same length: both parameters are optional.
      numFrames = config.numFrames == null ? sortedUniqueNumFrames[0] :
                                             config.numFrames;
      hopFrames = config.hopFrames == null ? 1 : config.hopFrames;
    } else {
      // Examples of differing lengths: numFrames and hopFrames are required,
      // and numFrames must not exceed the shortest example.
      numFrames = config.numFrames;
      tf.util.assert(
          numFrames != null && Number.isInteger(numFrames) && numFrames > 0,
          () => `There are ${sortedUniqueNumFrames.length} unique lengths ` +
              `among ` +
              `the ${this.size()} examples of this Dataset, hence numFrames ` +
              `is required. But it is not provided.`);
      tf.util.assert(
          numFrames <= sortedUniqueNumFrames[0],
          () => `numFrames (${numFrames}) exceeds the minimum numFrames ` +
              `(${sortedUniqueNumFrames[0]}) among the examples of ` +
              `the Dataset.`);
      hopFrames = config.hopFrames;
      tf.util.assert(
          hopFrames != null && Number.isInteger(hopFrames) && hopFrames > 0,
          () => `There are ${sortedUniqueNumFrames.length} unique lengths ` +
              `among ` +
              `the ${this.size()} examples of this Dataset, hence hopFrames ` +
              `is required. But it is not provided.`);
    }

    // Normalization is performed by default.
    const toNormalize = config.normalize == null ? true : config.normalize;

    return tf.tidy(() => {
      // Only one of xTensors / xArrays is populated, depending on whether a
      // tf.data.Dataset (typed arrays) or plain tensors are requested.
      let xTensors: tf.Tensor3D[] = [];
      let xArrays: Float32Array[] = [];
      let labelIndices: number[] = [];
      let uniqueFrameSize: number;
      for (let i = 0; i < vocab.length; ++i) {
        const currentLabel = vocab[i];
        if (label != null && currentLabel !== label) {
          continue;
        }
        const ids = this.label2Ids[currentLabel];
        for (const id of ids) {
          const example = this.examples[id];
          const spectrogram = example.spectrogram;
          const frameSize = spectrogram.frameSize;
          if (uniqueFrameSize == null) {
            uniqueFrameSize = frameSize;
          } else {
            tf.util.assert(
                frameSize === uniqueFrameSize,
                () => `Mismatch in frameSize ` +
                    `(${frameSize} vs ${uniqueFrameSize})`);
          }

          const snippetLength = spectrogram.data.length / frameSize;
          // Background-noise examples have no focus frame; for all other
          // examples, the windows are centered around the key frame (or the
          // maximum-intensity frame if no key frame is set).
          let focusIndex = null;
          if (currentLabel !== BACKGROUND_NOISE_TAG) {
            focusIndex = spectrogram.keyFrameIndex == null ?
                getMaxIntensityFrameIndex(spectrogram).dataSync()[0] :
                spectrogram.keyFrameIndex;
          }
          // TODO(cais): See if we can get rid of dataSync();

          const snippet =
              tf.tensor3d(spectrogram.data, [snippetLength, frameSize, 1]);
          const windows =
              getValidWindows(snippetLength, focusIndex, numFrames, hopFrames);
          for (const window of windows) {
            const windowedSnippet = tf.tidy(() => {
              const output = tf.slice(
                  snippet, [window[0], 0, 0],
                  [window[1] - window[0], -1, -1]);
              return toNormalize ? normalize(output) : output;
            });
            if (config.getDataset) {
              // TODO(cais): See if we can do away with dataSync();
              // TODO(cais): Shuffling?
              xArrays.push(windowedSnippet.dataSync() as Float32Array);
            } else {
              xTensors.push(windowedSnippet as tf.Tensor3D);
            }
            if (label == null) {
              labelIndices.push(i);
            }
          }
          tf.dispose(snippet);  // For memory saving.
        }
      }

      if (config.augmentByMixingNoiseRatio != null) {
        this.augmentByMixingNoise(
            config.getDataset ? xArrays :
                                xTensors as Array<Float32Array|tf.Tensor>,
            labelIndices, config.augmentByMixingNoiseRatio);
      }

      const shuffle = config.shuffle == null ? true : config.shuffle;
      if (config.getDataset) {
        const batchSize =
            config.datasetBatchSize == null ? 32 : config.datasetBatchSize;

        // Split the data into two splits: training and validation.
        const valSplit = config.datasetValidationSplit == null ?
            0.15 :
            config.datasetValidationSplit;
        tf.util.assert(
            valSplit > 0 && valSplit < 1,
            () => `Invalid dataset validation split: ${valSplit}`);

        const zippedXandYArrays =
            xArrays.map((xArray, i) => [xArray, labelIndices[i]]);
        tf.util.shuffle(
            zippedXandYArrays);  // Shuffle the data before splitting.
        xArrays = zippedXandYArrays.map(item => item[0]) as Float32Array[];
        const yArrays = zippedXandYArrays.map(item => item[1]) as number[];
        const {trainXs, trainYs, valXs, valYs} =
            balancedTrainValSplitNumArrays(xArrays, yArrays, valSplit);

        // TODO(cais): The typing around Float32Array is not working properly
        // for tf.data currently. Tighten the types when the tf.data bug is
        // fixed.
        // tslint:disable:no-any
        const xTrain =
            tfd.array(trainXs as any).map(x => tf.tensor3d(x as any, [
              numFrames, uniqueFrameSize, 1
            ]));
        const yTrain = tfd.array(trainYs).map(
            y => tf.squeeze(tf.oneHot([y], vocab.length), [0]));
        // TODO(cais): See if we can tighten the typing.
        let trainDataset = tfd.zip({xs: xTrain, ys: yTrain});
        if (shuffle) {
          // Shuffle the dataset.
          trainDataset = trainDataset.shuffle(xArrays.length);
        }
        trainDataset = trainDataset.batch(batchSize).prefetch(4);

        const xVal =
            tfd.array(valXs as any).map(x => tf.tensor3d(x as any, [
              numFrames, uniqueFrameSize, 1
            ]));
        const yVal = tfd.array(valYs).map(
            y => tf.squeeze(tf.oneHot([y], vocab.length), [0]));
        let valDataset = tfd.zip({xs: xVal, ys: yVal});
        valDataset = valDataset.batch(batchSize).prefetch(4);
        // tslint:enable:no-any

        // tslint:disable-next-line:no-any
        return [trainDataset, valDataset] as any;
      } else {
        if (shuffle) {
          // Shuffle the data, keeping each x paired with its label index.
          const zipped: Array<{x: tf.Tensor3D, y: number}> = [];
          xTensors.forEach((xTensor, i) => {
            zipped.push({x: xTensor, y: labelIndices[i]});
          });
          tf.util.shuffle(zipped);
          xTensors = zipped.map(item => item.x);
          labelIndices = zipped.map(item => item.y);
        }

        // Targets are only produced when all labels are requested.
        const targets = label == null ?
            tf.cast(
                tf.oneHot(tf.tensor1d(labelIndices, 'int32'), vocab.length),
                'float32') :
            undefined;
        return {
          xs: tf.stack(xTensors) as tf.Tensor4D,
          ys: targets as tf.Tensor2D
        };
      }
    });
  }

  /**
   * Augment the data by mixing word examples with background noise.
   *
   * For every example whose label is not `BACKGROUND_NOISE_TAG`, a noise
   * example is sampled at random (with replacement), scaled by `ratio`, added
   * to the word example, and the sum is normalized. The mixed examples and
   * their label indices are appended to `xs` and `labelIndices` in place.
   *
   * @param xs The examples, either all `Float32Array`s or all `tf.Tensor`s.
   *   Mutated: the mixed examples are appended.
   * @param labelIndices Index into the vocabulary for each element of `xs`.
   *   Mutated: the label indices of the mixed examples are appended.
   * @param ratio Factor by which the noise is multiplied before being added.
   * @throws Error if `xs` is null or empty, or if none of the examples has
   *   the label `BACKGROUND_NOISE_TAG`.
   */
  private augmentByMixingNoise<T extends tf.Tensor|Float32Array>(
      xs: T[], labelIndices: number[], ratio: number): void {
    if (xs == null || xs.length === 0) {
      throw new Error(
          `Cannot perform augmentation because data is null or empty`);
    }
    const isTypedArray = xs[0] instanceof Float32Array;

    const vocab = this.getVocabulary();
    const noiseExampleIndices: number[] = [];
    const wordExampleIndices: number[] = [];
    for (let i = 0; i < labelIndices.length; ++i) {
      if (vocab[labelIndices[i]] === BACKGROUND_NOISE_TAG) {
        noiseExampleIndices.push(i);
      } else {
        wordExampleIndices.push(i);
      }
    }
    if (noiseExampleIndices.length === 0) {
      throw new Error(
          `Cannot perform augmentation by mixing with noise when ` +
          `there is no example with label ${BACKGROUND_NOISE_TAG}`);
    }

    const mixedXTensors: Array<tf.Tensor|Float32Array> = [];
    const mixedLabelIndices: number[] = [];
    for (const index of wordExampleIndices) {
      // Randomly sample from the noises, with replacement.
      // NOTE(review): presumably the upper bound of getRandomInteger() is
      // exclusive -- confirm against generic_utils.
      const noiseIndex =
          noiseExampleIndices[getRandomInteger(0, noiseExampleIndices.length)];
      const signalTensor = isTypedArray ?
          tf.tensor1d(xs[index] as Float32Array) :
          xs[index] as tf.Tensor;
      const noiseTensor = isTypedArray ?
          tf.tensor1d(xs[noiseIndex] as Float32Array) :
          xs[noiseIndex] as tf.Tensor;
      const mixed: tf.Tensor =
          tf.tidy(() => normalize(
                      tf.add(signalTensor, tf.mul(noiseTensor, ratio))));
      if (isTypedArray) {
        mixedXTensors.push(mixed.dataSync() as Float32Array);
      } else {
        mixedXTensors.push(mixed);
      }
      mixedLabelIndices.push(labelIndices[index]);
    }
    console.log(
        `Data augmentation: mixing noise: added ${mixedXTensors.length} ` +
        `examples`);
    mixedXTensors.forEach(tensor => xs.push(tensor as T));
    labelIndices.push(...mixedLabelIndices);
  }

  /**
   * Get the distinct spectrogram lengths (in frames) among all examples.
   *
   * @returns The unique number-of-frames values, sorted in ascending numeric
   *   order, so that element 0 is the length of the shortest example.
   */
  private getSortedUniqueNumFrames(): number[] {
    const numFramesSet = new Set<number>();
    const vocab = this.getVocabulary();
    for (let i = 0; i < vocab.length; ++i) {
      const label = vocab[i];
      const ids = this.label2Ids[label];
      for (const id of ids) {
        const spectrogram = this.examples[id].spectrogram;
        const numFrames = spectrogram.data.length / spectrogram.frameSize;
        numFramesSet.add(numFrames);
      }
    }
    const uniqueNumFrames = [...numFramesSet];
    // An explicit numeric comparator is required: the default sort compares
    // the elements as strings, which would place e.g. 100 before 20.
    uniqueNumFrames.sort((a, b) => a - b);
    return uniqueNumFrames;
  }

  /**
   * Remove an example from the `Dataset`.
   *
   * If the removed example was the last one under its label, the label is
   * removed from the vocabulary as well.
   *
   * @param uid The UID of the example to remove.
   * @throws Error if the UID doesn't exist in the `Dataset`.
   */
  removeExample(uid: string): void {
    if (!(uid in this.examples)) {
      throw new Error(`Nonexistent example UID: ${uid}`);
    }
    const label = this.examples[uid].label;
    delete this.examples[uid];
    const index = this.label2Ids[label].indexOf(uid);
    this.label2Ids[label].splice(index, 1);
    if (this.label2Ids[label].length === 0) {
      delete this.label2Ids[label];
    }
  }

  /**
   * Set the key frame index of a given example.
   *
   * @param uid The UID of the example of which the `keyFrameIndex` is to be
   *   set.
   * @param keyFrameIndex The desired value of the `keyFrameIndex`. Must
   *   be >= 0, < the number of frames of the example, and an integer.
   * @throws Error If the UID and/or the `keyFrameIndex` value is invalid.
   */
  setExampleKeyFrameIndex(uid: string, keyFrameIndex: number) {
    if (!(uid in this.examples)) {
      throw new Error(`Nonexistent example UID: ${uid}`);
    }
    const spectrogram = this.examples[uid].spectrogram;
    const numFrames = spectrogram.data.length / spectrogram.frameSize;
    tf.util.assert(
        keyFrameIndex >= 0 && keyFrameIndex < numFrames &&
            Number.isInteger(keyFrameIndex),
        () => `Invalid keyFrameIndex: ${keyFrameIndex}. ` +
            `Must be >= 0, < ${numFrames}, and an integer.`);
    spectrogram.keyFrameIndex = keyFrameIndex;
  }

  /**
   * Get the total number of `Example` currently held by the `Dataset`.
   *
   * @returns Total `Example` count.
   */
  size(): number {
    return Object.keys(this.examples).length;
  }

  /**
   * Get the total duration of the `Example` currently held by `Dataset`,
   * in milliseconds.
   *
   * The duration of an example is its number of spectrogram frames times the
   * frame duration. If a spectrogram does not specify `frameDurationMillis`,
   * a default of 23.22 ms per frame is assumed.
   *
   * @return Total duration in milliseconds.
   */
  durationMillis(): number {
    let durMillis = 0;
    const DEFAULT_FRAME_DUR_MILLIS = 23.22;
    for (const key in this.examples) {
      const spectrogram = this.examples[key].spectrogram;
      // Fall back to the default only when no duration is provided. (A
      // bitwise OR here would truncate both operands to integers.)
      const frameDurMillis = spectrogram.frameDurationMillis == null ?
          DEFAULT_FRAME_DUR_MILLIS :
          spectrogram.frameDurationMillis;
      durMillis +=
          spectrogram.data.length / spectrogram.frameSize * frameDurMillis;
    }
    return durMillis;
  }

  /**
   * Query whether the `Dataset` is currently empty.
   *
   * I.e., holds zero examples.
   *
   * @returns Whether the `Dataset` is currently empty.
   */
  empty(): boolean {
    return this.size() === 0;
  }

  /**
   * Remove all `Example`s from the `Dataset`.
   */
  clear(): void {
    this.examples = {};
    // The label index must be reset together with the examples; otherwise it
    // would keep pointing at UIDs that no longer exist.
    this.label2Ids = {};
  }

  /**
   * Get the list of labels among all `Example`s the `Dataset` currently holds.
   *
   * @returns A sorted Array of labels, for the unique labels that belong to all
   *   `Example`s currently held by the `Dataset`.
   */
  getVocabulary(): string[] {
    const vocab = new Set<string>();
    for (const uid in this.examples) {
      const example = this.examples[uid];
      vocab.add(example.label);
    }
    const sortedVocab = [...vocab];
    sortedVocab.sort();
    return sortedVocab;
  }

  /**
   * Serialize the `Dataset`.
   *
   * The `Examples` are sorted in the following order:
   * - First, the labels in the vocabulary are sorted.
   * - Second, the `Example`s for every label are sorted by the order in
   *   which they are added to this `Dataset`.
   *
   * @param wordLabels Optional word label(s) to serialize. If specified, only
   *   the examples with labels matching the argument will be serialized. If
   *   any specified word label does not exist in the vocabulary of this
   *   dataset, an Error will be thrown.
   * @returns A `ArrayBuffer` object amenable to transmission and storage.
   * @throws Error if the `Dataset` is empty.
   */
  serialize(wordLabels?: string|string[]): ArrayBuffer {
    const vocab = this.getVocabulary();
    tf.util.assert(!this.empty(), () => `Cannot serialize empty Dataset`);

    if (wordLabels != null) {
      if (!Array.isArray(wordLabels)) {
        wordLabels = [wordLabels];
      }
      wordLabels.forEach(wordLabel => {
        if (vocab.indexOf(wordLabel) === -1) {
          throw new Error(
              `Word label "${wordLabel}" does not exist in the ` +
              `vocabulary of this dataset. The vocabulary is: ` +
              `${JSON.stringify(vocab)}.`);
        }
      });
    }

    const manifest: ExampleSpec[] = [];
    const buffers: ArrayBuffer[] = [];
    for (const label of vocab) {
      if (wordLabels != null && wordLabels.indexOf(label) === -1) {
        continue;
      }
      const ids = this.label2Ids[label];
      for (const id of ids) {
        const artifact = serializeExample(this.examples[id]);
        manifest.push(artifact.spec);
        buffers.push(artifact.data);
      }
    }
    return serializedExamples2ArrayBuffer(
        {manifest, data: concatenateArrayBuffers(buffers)});
  }
}
/**
 * Serialize an `Example`.
 *
 * The binary data consist of the float32 values of the spectrogram, followed
 * by the float32 values of the raw audio (if the example has any).
 *
 * Only the bytes actually covered by each `Float32Array` are serialized, so
 * that arrays which are views into a larger `ArrayBuffer` are handled
 * correctly.
 *
 * @param example The `Example` to serialize.
 * @returns The `ExampleSpec` describing the example, and its binary data.
 */
export function serializeExample(example: Example):
    {spec: ExampleSpec, data: ArrayBuffer} {
  const hasRawAudio = example.rawAudio != null;
  const spectrogramData = example.spectrogram.data;
  const spec: ExampleSpec = {
    label: example.label,
    spectrogramNumFrames:
        spectrogramData.length / example.spectrogram.frameSize,
    spectrogramFrameSize: example.spectrogram.frameSize,
  };
  if (example.spectrogram.keyFrameIndex != null) {
    spec.spectrogramKeyFrameIndex = example.spectrogram.keyFrameIndex;
  }

  // Copy exactly the bytes of the typed array, not the whole backing buffer.
  let data = spectrogramData.buffer.slice(
      spectrogramData.byteOffset,
      spectrogramData.byteOffset + spectrogramData.byteLength);
  if (hasRawAudio) {
    const rawAudioData = example.rawAudio.data;
    spec.rawAudioNumSamples = rawAudioData.length;
    spec.rawAudioSampleRateHz = example.rawAudio.sampleRateHz;

    // The raw audio is appended right after the spectrogram.
    data = concatenateArrayBuffers([
      data,
      rawAudioData.buffer.slice(
          rawAudioData.byteOffset,
          rawAudioData.byteOffset + rawAudioData.byteLength)
    ]);
  }
  return {spec, data};
}
/**
 * Deserialize an `Example`.
 *
 * The first `4 * spectrogramFrameSize * spectrogramNumFrames` bytes of
 * `artifact.data` are read as the float32 spectrogram. If the spec declares
 * raw audio, all remaining bytes are read as the float32 raw-audio samples.
 *
 * @param artifact The spec of the example along with its binary data.
 * @returns The reconstructed `Example`.
 */
export function deserializeExample(
    artifact: {spec: ExampleSpec, data: ArrayBuffer}): Example {
  const {spec, data: bytes} = artifact;
  // All values are float32, i.e., 4 bytes each.
  const spectrogramByteLength =
      4 * spec.spectrogramFrameSize * spec.spectrogramNumFrames;

  const spectrogram: SpectrogramData = {
    frameSize: spec.spectrogramFrameSize,
    data: new Float32Array(bytes.slice(0, spectrogramByteLength))
  };
  if (spec.spectrogramKeyFrameIndex != null) {
    spectrogram.keyFrameIndex = spec.spectrogramKeyFrameIndex;
  }

  const example: Example = {label: spec.label, spectrogram};
  if (spec.rawAudioNumSamples != null) {
    example.rawAudio = {
      sampleRateHz: spec.rawAudioSampleRateHz,
      data: new Float32Array(bytes.slice(spectrogramByteLength))
    };
  }
  return example;
}
/**
 * Encode intermediate serialization format as an ArrayBuffer.
 *
 * Layout of the resulting binary ArrayBuffer:
 *   1. The encoded `DATASET_SERIALIZATION_DESCRIPTOR` string.
 *   2. The serialization version, as a 4-byte Uint32.
 *   3. The byte length of the encoded JSON manifest, as a 4-byte Uint32.
 *   4. The encoded JSON manifest.
 *   5. The binary data of the spectrograms, and raw audio (if any).
 *
 * @param serialized: Intermediate serialization format of a dataset.
 * @returns The binary conversion result as an ArrayBuffer.
 */
function serializedExamples2ArrayBuffer(serialized: SerializedExamples):
    ArrayBuffer {
  const manifestJson = JSON.stringify(serialized.manifest);
  const manifestBytes = string2ArrayBuffer(manifestJson);
  const descriptorBytes = string2ArrayBuffer(DATASET_SERIALIZATION_DESCRIPTOR);

  // Header: descriptor, then version, then manifest byte length.
  const header = concatenateArrayBuffers([
    descriptorBytes,
    new Uint32Array([DATASET_SERIALIZATION_VERSION]).buffer,
    new Uint32Array([manifestBytes.byteLength]).buffer
  ]);

  const payload = [header, manifestBytes, serialized.data];
  return concatenateArrayBuffers(payload);
}
/**
 * Decode an ArrayBuffer as intermediate serialization format.
 *
 * The buffer is expected to start with the serialization descriptor,
 * followed by a 4-byte version (currently ignored), a 4-byte Uint32 holding
 * the byte length of the JSON manifest, the manifest itself, and finally the
 * binary data of the examples.
 *
 * @param buffer The binary buffer to decode.
 * @returns The decoded manifest and the remaining binary data.
 * @throws Error if `buffer` is null or undefined, or if it does not start
 *   with the expected descriptor.
 */
export function arrayBuffer2SerializedExamples(buffer: ArrayBuffer):
    SerializedExamples {
  tf.util.assert(buffer != null, () => 'Received null or undefined buffer');

  // Check descriptor.
  const descriptorLength = DATASET_SERIALIZATION_DESCRIPTOR.length;
  const descriptor = arrayBuffer2String(buffer.slice(0, descriptorLength));
  tf.util.assert(
      descriptor === DATASET_SERIALIZATION_DESCRIPTOR,
      () => `Deserialization error: Invalid descriptor`);

  // Skip the 4-byte version part for now. It may be used in the future.
  const manifestLengthOffset = descriptorLength + 4;

  // Extract the length of the encoded manifest JSON as a Uint32.
  const manifestByteLength =
      new Uint32Array(buffer, manifestLengthOffset, 1)[0];
  const manifestBegin = manifestLengthOffset + 4;
  const manifestEnd = manifestBegin + manifestByteLength;

  const manifest =
      JSON.parse(arrayBuffer2String(buffer.slice(manifestBegin, manifestEnd)));
  // Everything after the manifest is the binary data of the examples.
  const data = buffer.slice(manifestEnd);
  return {manifest, data};
}
/**
 * Get valid windows in a long snippet.
 *
 * Each window is represented by an inclusive left index and an exclusive
 * right index.
 *
 * @param snippetLength Length of the entire snippet. Must be a positive
 *   integer.
 * @param focusIndex Optional. If `null` or `undefined`, an array of
 *   evenly-spaced windows will be generated. The array of windows will
 *   start from the first possible location (i.e., [0, windowLength]).
 *   If not `null` or `undefined`, must be an integer >= 0 and < snippetLength;
 *   every returned window then contains this index.
 * @param windowLength Length of each window. Must be a positive integer and
 *   <= snippetLength.
 * @param windowHop Hops between successive windows. Must be a positive
 *   integer.
 * @returns An array of [beginIndex, endIndex] pairs.
 */
export function getValidWindows(
    snippetLength: number, focusIndex: number, windowLength: number,
    windowHop: number): Array<[number, number]> {
  tf.util.assert(
      Number.isInteger(snippetLength) && snippetLength > 0,
      () =>
          `snippetLength must be a positive integer, but got ${snippetLength}`);
  const hasFocus = focusIndex != null;
  if (hasFocus) {
    tf.util.assert(
        Number.isInteger(focusIndex) && focusIndex >= 0,
        () =>
            `focusIndex must be a non-negative integer, but got ${focusIndex}`);
  }
  tf.util.assert(
      Number.isInteger(windowLength) && windowLength > 0,
      () => `windowLength must be a positive integer, but got ${windowLength}`);
  tf.util.assert(
      Number.isInteger(windowHop) && windowHop > 0,
      () => `windowHop must be a positive integer, but got ${windowHop}`);
  tf.util.assert(
      windowLength <= snippetLength,
      () => `windowLength (${windowLength}) exceeds snippetLength ` +
          `(${snippetLength})`);
  tf.util.assert(
      focusIndex < snippetLength,
      () => `focusIndex (${focusIndex}) equals or exceeds snippetLength ` +
          `(${snippetLength})`);

  // The window spans the whole snippet: there is only one possibility.
  if (windowLength === snippetLength) {
    return [[0, snippetLength]];
  }

  const result: Array<[number, number]> = [];

  if (!hasFocus) {
    // No focus frame: evenly-spaced windows, starting from the first
    // possible location.
    for (let begin = 0; begin + windowLength <= snippetLength;
         begin += windowHop) {
      result.push([begin, begin + windowLength]);
    }
    return result;
  }

  // Start with a window roughly centered on the focus frame, shifted as
  // needed so that it lies entirely within the snippet.
  const lastValidStart = snippetLength - windowLength;
  let start = Math.max(
      0, Math.min(focusIndex - Math.floor(windowLength / 2), lastValidStart));

  // Step backward by whole hops for as long as the window stays inside the
  // snippet and still contains the focus frame.
  while (start - windowHop >= 0 &&
         focusIndex < start - windowHop + windowLength) {
    start -= windowHop;
  }

  // Step forward, collecting every window that fits in the snippet and
  // contains the focus frame.
  for (; start + windowLength <= snippetLength && focusIndex >= start;
       start += windowHop) {
    result.push([start, start + windowLength]);
  }
  return result;
}
/**
 * Calculate an intensity profile from a spectrogram.
 *
 * The intensity at each time frame is calculated by simply averaging all the
 * spectral values that belong to that time frame.
 *
 * @param spectrogram The input spectrogram.
 * @returns The temporal profile of the intensity as a 1D tf.Tensor of shape
 *   `[numFrames]`.
 */
export function spectrogram2IntensityCurve(spectrogram: SpectrogramData):
    tf.Tensor {
  const {data, frameSize} = spectrogram;
  return tf.tidy(() => {
    // One row per time frame, so the mean over the last axis gives one
    // intensity value per frame.
    const frames = tf.tensor2d(data, [data.length / frameSize, frameSize]);
    return tf.mean(frames, -1);
  });
}
/**
 * Get the index to the maximum intensity frame.
 *
 * The intensity of each time frame is calculated as the arithmetic mean of
 * all the spectral values belonging to that time frame.
 *
 * @param spectrogram The input spectrogram.
 * @returns The index to the time frame containing the maximum intensity, as
 *   a scalar tensor.
 */
export function getMaxIntensityFrameIndex(spectrogram: SpectrogramData):
    tf.Scalar {
  return tf.tidy(() => {
    const intensityCurve = spectrogram2IntensityCurve(spectrogram);
    return tf.argMax(intensityCurve);
  });
}