syntax = "proto3";
package tensorflow;
import "xla/tsl/protobuf/coordination_config.proto";
import "tensorflow/core/framework/cost_graph.proto";
import "tensorflow/core/framework/graph.proto";
import "tensorflow/core/framework/step_stats.proto";
import "tensorflow/core/protobuf/cluster.proto";
import "tensorflow/core/protobuf/debug.proto";
import "tensorflow/core/protobuf/rewriter_config.proto";
import "tensorflow/core/protobuf/rpc_options.proto";
option cc_enable_arenas = true;
option java_outer_classname = "ConfigProtos";
option java_multiple_files = true;
option java_package = "org.tensorflow.framework";
option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/protobuf/for_core_protos_go_proto";
message GPUOptions {
// Fraction of the total GPU memory to allocate for each process.
// 1 means to allocate all of the GPU memory, 0.5 means the process
// allocates up to ~50% of the total GPU memory.
//
// GPU memory is pre-allocated unless the allow_growth option is enabled.
//
// If greater than 1.0, uses CUDA unified memory to potentially oversubscribe
// the amount of memory available on the GPU device by using host memory as a
// swap space. Accessing memory not available on the device will be
// significantly slower as that would require memory transfer between the host
// and the device. Options to reduce the memory requirement should be
// considered before enabling this option as this may come with a negative
// performance impact. Oversubscription using the unified memory requires
// Pascal class or newer GPUs and it is currently only supported on the Linux
// operating system. See
// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#um-requirements
// for the detailed requirements.
double per_process_gpu_memory_fraction = 1;
// If true, the allocator does not pre-allocate the entire specified
// GPU memory region, instead starting small and growing as needed.
bool allow_growth = 4;
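// For illustration only (the values below are arbitrary examples, not
// recommendations), the two fields above could be combined as:
// per_process_gpu_memory_fraction: 0.4
// allow_growth: true
// which caps the process at roughly 40% of GPU memory and lets the allocator
// grow on demand within that cap instead of pre-allocating it.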
// The type of GPU allocation strategy to use.
//
// Allowed values:
// "": The empty string (default) uses a system-chosen default
// which may change over time.
//
// "BFC": A "Best-fit with coalescing" algorithm, simplified from a
// version of dlmalloc.
string allocator_type = 2;
// Delay deletion of up to this many bytes to reduce the number of
// interactions with gpu driver code. If 0, the system chooses
// a reasonable default (several MBs).
int64 deferred_deletion_bytes = 3;
// A comma-separated list of GPU ids that determines the 'visible'
// to 'virtual' mapping of GPU devices. For example, if TensorFlow
// can see 8 GPU devices in the process, and one wanted to map
// visible GPU devices 5 and 3 as "/device:GPU:0", and "/device:GPU:1",
// then one would specify this field as "5,3". This field is similar in
// spirit to the CUDA_VISIBLE_DEVICES environment variable, except
// it applies to the visible GPU devices in the process.
//
// NOTE:
// 1. The GPU driver provides the process with the visible GPUs
// in an order which is not guaranteed to have any correlation to
// the *physical* GPU id in the machine. This field is used for
// remapping "visible" to "virtual", which means this operates only
// after the process starts. Users are required to use vendor
// specific mechanisms (e.g., CUDA_VISIBLE_DEVICES) to control the
// physical to visible device mapping prior to invoking TensorFlow.
// 2. In the code, the ids in this list are also called "platform GPU id"s,
// and the 'virtual' ids of GPU devices (i.e. the ids in the device
// name "/device:GPU:<id>") are also called "TF GPU id"s. Please
// refer to third_party/tensorflow/core/common_runtime/gpu/gpu_id.h
// for more information.
// 3. The visible_device_list is also used for PluggableDevice, and
// different types of PluggableDevices share this field. In that case,
// the pluggable_device_type is used to distinguish them, making the
// visible_device_list a list of <pluggable_device_type>:<device_index>,
// e.g. "PluggableDeviceA:0,PluggableDeviceA:1,PluggableDeviceB:0".
string visible_device_list = 5;
// In the event polling loop, sleep this many microseconds between
// PollEvents calls when the queue is not empty. If the value is not
// set or is set to 0, it gets set to a non-zero default.
int32 polling_active_delay_usecs = 6;
// This field is deprecated and ignored.
int32 polling_inactive_delay_msecs = 7;
// Force all tensors to be gpu_compatible. On a GPU-enabled TensorFlow,
// enabling this option forces all CPU tensors to be allocated with CUDA
// pinned memory. Normally, TensorFlow will infer which tensors should be
// allocated as pinned memory. But in cases where the inference is
// incomplete, this option can significantly speed up cross-device memory
// copy performance, as long as it fits in memory.
// Note that this option should not be enabled by default for unknown or
// very large models, since all CUDA pinned memory is unpageable; having too
// much pinned memory might negatively impact overall host system
// performance.
bool force_gpu_compatible = 8;
message Experimental {
// Configuration for breaking down a visible GPU into multiple "virtual"
// devices.
message VirtualDevices {
// Per "virtual" device memory limit, in MB. The number of elements in
// the list is the number of virtual devices to create on the
// corresponding visible GPU (see "virtual_devices" below).
// If empty and `num_virtual_devices_per_gpu` is not set, it will create a
// single virtual device taking all available memory from the device.
//
// For the concept of "visible" and "virtual" GPU, see the comments for
// "visible_device_list" above for more information.
repeated float memory_limit_mb = 1;
// Priority values to use with the virtual devices. Use the CUDA function
// cudaDeviceGetStreamPriorityRange to query the valid range of values for
// priority.
//
// On a P4000 GPU with CUDA 10.1, the priority range reported was 0 for
// least priority and -1 for greatest priority.
//
// If this field is not specified, then the virtual devices will be
// created with the default priority. If this field has values set, then its
// size must match that of memory_limit_mb above.
repeated int32 priority = 2;
// The virtual device ordinal number determines the device ID of the device.
// A virtual device with a lower ordinal number always receives a smaller
// device ID. The physical device ID and the location in the virtual device
// list are used to break ties.
repeated int32 device_ordinal = 3;
}
// The multi virtual device settings. If empty (not set), it will create a
// single virtual device on each visible GPU, according to the settings
// in "visible_device_list" above. Otherwise, the number of elements in the
// list must be the same as the number of visible GPUs (after
// "visible_device_list" filtering if it is set), and the string-represented
// device names (e.g. /device:GPU:<id>) will refer to the virtual
// devices and have the <id> field assigned sequentially starting from 0,
// according to the order of the virtual devices determined by
// device_ordinal and the location in the virtual device list.
//
// For example,
// visible_device_list = "1,0"
// virtual_devices { memory_limit: 1GB memory_limit: 2GB }
// virtual_devices { memory_limit: 3GB memory_limit: 4GB }
// will create 4 virtual devices as:
// /device:GPU:0 -> visible GPU 1 with 1GB memory
// /device:GPU:1 -> visible GPU 1 with 2GB memory
// /device:GPU:2 -> visible GPU 0 with 3GB memory
// /device:GPU:3 -> visible GPU 0 with 4GB memory
//
// but
// visible_device_list = "1,0"
// virtual_devices { memory_limit: 1GB memory_limit: 2GB
// device_ordinal: 10 device_ordinal: 20}
// virtual_devices { memory_limit: 3GB memory_limit: 4GB
// device_ordinal: 10 device_ordinal: 20}
// will create 4 virtual devices as:
// /device:GPU:0 -> visible GPU 1 with 1GB memory (ordinal 10)
// /device:GPU:1 -> visible GPU 0 with 3GB memory (ordinal 10)
// /device:GPU:2 -> visible GPU 1 with 2GB memory (ordinal 20)
// /device:GPU:3 -> visible GPU 0 with 4GB memory (ordinal 20)
//
// NOTE:
// 1. It's invalid to set both this and "per_process_gpu_memory_fraction"
// at the same time.
// 2. Currently this setting is per-process, not per-session. Using
// different settings in different sessions within same process will
// result in undefined behavior.
repeated VirtualDevices virtual_devices = 1;
// The number of virtual devices to create on each visible GPU. The
// available memory will be split equally among all virtual devices. If the
// field `memory_limit_mb` in `VirtualDevices` is not empty, this field will
// be ignored.
int32 num_virtual_devices_per_gpu = 15;
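// For illustration (an assumed single-GPU machine), setting
// num_virtual_devices_per_gpu: 2
// with memory_limit_mb left empty would create /device:GPU:0 and
// /device:GPU:1 on that GPU, splitting its available memory equally between
// the two virtual devices.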
// If true, uses CUDA unified memory for memory allocations. If
// per_process_gpu_memory_fraction option is greater than 1.0, then unified
// memory is used regardless of the value for this field. See comments for
// per_process_gpu_memory_fraction field for more details and requirements
// of the unified memory. This option is useful to oversubscribe memory if
// multiple processes are sharing a single GPU while individually using less
// than 1.0 per process memory fraction.
bool use_unified_memory = 2;
// If > 1, the number of device-to-device copy streams to create
// for each GPUDevice. Default value is 0, which is automatically
// converted to 1.
int32 num_dev_to_dev_copy_streams = 3;
// If non-empty, defines a good GPU ring order on a single worker based on
// device interconnect. This assumes that all workers have the same GPU
// topology. Specify as a comma-separated string, e.g. "3,2,1,0,7,6,5,4".
// This ring order is used by the RingReducer implementation of
// CollectiveReduce, and serves as an override to automatic ring order
// generation in OrderTaskDeviceMap() during CollectiveParam resolution.
string collective_ring_order = 4;
// If true then extra work is done by GPUDevice and GPUBFCAllocator to
// keep track of when GPU memory is freed and when kernels actually
// complete so that we can know when a nominally free memory chunk
// is really not subject to pending use.
bool timestamped_allocator = 5;
// reserved id: 6
// Parameters for GPUKernelTracker. By default no kernel tracking is done.
// Note that timestamped_allocator is only effective if some tracking is
// specified.
//
// If kernel_tracker_max_interval = n > 0, then a tracking event
// is inserted after every n kernels without an event.
int32 kernel_tracker_max_interval = 7;
// If kernel_tracker_max_bytes = n > 0, then a tracking event is
// inserted after every series of kernels allocating a sum of
// memory >= n. If one kernel allocates b * n bytes, then one
// event will be inserted after it, but it will count as b against
// the pending limit.
int32 kernel_tracker_max_bytes = 8;
// If kernel_tracker_max_pending > 0 then no more than this many
// tracking events can be outstanding at a time. An attempt to
// launch an additional kernel will stall until an event
// completes.
int32 kernel_tracker_max_pending = 9;
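// For illustration (arbitrary example values), a sketch that enables the
// timestamped allocator together with interval-based kernel tracking:
// timestamped_allocator: true
// kernel_tracker_max_interval: 32
// kernel_tracker_max_pending: 4
// Here a tracking event is inserted after every 32 kernels, and at most 4
// such events may be outstanding before kernel launches stall.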
// The BFC allocator can return an allocated chunk of memory up to 2x the
// requested size. For virtual devices with tight memory constraints, and
// proportionately large allocation requests, this can lead to a significant
// reduction in available memory. The threshold below controls when a chunk
// should be split if the chunk size exceeds the requested memory size. It is
// expressed as a fraction of total available memory for the TF device. For
// example, setting it to 0.05 means a chunk needs to be split if its size
// exceeds the requested memory by 5% of the total virtual device/GPU
// memory size.
double internal_fragmentation_fraction = 10;
// When true, use the CUDA cudaMallocAsync API instead of the TF GPU allocator.
bool use_cuda_malloc_async = 11;
// By default, BFCAllocator may sleep when it runs out of memory, in the
// hopes that another thread will free up memory in the meantime. Setting
// this to true disables the sleep; instead we'll OOM immediately.
bool disallow_retry_on_allocation_failure = 12;
// Memory limit for "GPU host allocator", aka pinned memory allocator. This
// can also be set via the envvar TF_GPU_HOST_MEM_LIMIT_IN_MB.
float gpu_host_mem_limit_in_mb = 13;
// If true, then the host allocator allocates its max memory all upfront and
// never grows. This can be useful for latency-sensitive systems, because
// growing the GPU host memory pool can be expensive.
//
// You probably only want to use this in combination with
// gpu_host_mem_limit_in_mb, because the default GPU host memory limit is
// quite high.
bool gpu_host_mem_disallow_growth = 14;
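// For illustration (the 4096 MB limit is an arbitrary example), a fixed-size
// pinned host pool allocated upfront could be described as:
// gpu_host_mem_limit_in_mb: 4096
// gpu_host_mem_disallow_growth: true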
// Memory limit for the GPU system. This can also be set by
// TF_DEVICE_MIN_SYS_MEMORY_IN_MB, which takes precedence over
// gpu_system_memory_size_in_mb. With this, users can configure the GPU
// system memory size for better resource estimation in multi-tenancy (one
// GPU with multiple models) use cases.
int32 gpu_system_memory_size_in_mb = 16;
// If true, save the information needed to create a PjRt GPU client for
// creating a client with remote devices.
bool populate_pjrt_gpu_client_creation_info = 17;
// node_id for use when creating a PjRt GPU client with remote devices,
// which enumerates jobs*tasks from a ServerDef.
int32 node_id = 18;
// Whether to merge data transfer streams into the compute stream in the
// same stream group. Stream merging helps reduce the overhead caused by
// stream synchronization, especially when data transfers are frequent. For
// example, setting "merge_host_to_device_stream = true" will make the
// compute stream responsible for both computation and host to device memory
// copy.
message StreamMergeOptions {
// If true, the compute stream will be used for host_to_device copy as
// well. It's no longer necessary to record an event before the copy to
// let the copy stream wait for the compute stream to finish. There is
// also no need to wait for the copy to complete before executing the
// callback function.
bool merge_host_to_device_stream = 1;
// If true, the compute stream will be used for device_to_host copy as
// well. It's no longer necessary to record an event before the copy to
// let the copy stream wait for the compute stream to finish.
bool merge_device_to_host_stream = 2;
// If true, the compute stream will be used for device_to_device copy as
// well. It's no longer necessary to record an event before the copy to
// let the copy stream wait for the compute stream of the sending device
// to finish. There is also no need to wait for the compute stream of the
// receiving device to finish if the copy is within the same device.
bool merge_device_to_device_stream = 3;
}
StreamMergeOptions stream_merge_options = 19;
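// For illustration, a sketch that merges only the host-to-device copy stream
// into the compute stream, leaving the other copy streams separate:
// stream_merge_options {
// merge_host_to_device_stream: true
// }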
}
// Everything inside experimental is subject to change and is not subject
// to API stability guarantees in
// https://www.tensorflow.org/guide/versions.
Experimental experimental = 9;
}
// Options passed to the graph optimizer
message OptimizerOptions {
// If true, optimize the graph using common subexpression elimination.
// Note: the optimization Level L1 will override this setting to true. So in
// order to disable common subexpression elimination the opt_level has to be
// set to L0.
bool do_common_subexpression_elimination = 1;
// If true, perform constant folding optimization on the graph.
// Note: the optimization Level L1 will override this setting to true. So in
// order to disable constant folding the opt_level has to be set to L0.
bool do_constant_folding = 2;
// Constant folding optimization replaces tensors whose values can be
// predetermined, with constant nodes. To avoid inserting too large constants,
// the size of each constant created can be limited. If this value is zero, a
// default limit of 10 MiB will be applied. If constant folding optimization
// is disabled, this value is ignored.
int64 max_folded_constant_in_bytes = 6;
// If true, perform function inlining on the graph.
bool do_function_inlining = 4;
// Optimization level
enum Level {
// L1 is the default level.
// Optimization performed at L1 :
// 1. Common subexpression elimination
// 2. Constant folding
L1 = 0;
// No optimizations
L0 = -1;
}
// Overall optimization level. The actual optimizations applied will be the
// logical OR of the flags that this level implies and any flags already set.
Level opt_level = 3;
// Control the use of the compiler/jit. Experimental.
enum GlobalJitLevel {
DEFAULT = 0; // Default setting ("off" now, but later expected to be "on")
OFF = -1;
// The following settings turn on compilation, with higher values being
// more aggressive. Higher values may reduce opportunities for parallelism
// and may use more memory. (At present, there is no distinction, but this
// is expected to change.)
ON_1 = 1;
ON_2 = 2;
}
GlobalJitLevel global_jit_level = 5;
// CPU code will be autoclustered only if global_jit_level >= ON_1 and either:
// - this flag is true, or
// - TF_XLA_FLAGS contains --tf_xla_cpu_global_jit=true.
bool cpu_global_jit = 7;
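// For illustration (a sketch, not a recommendation), an OptimizerOptions that
// turns off the L1 optimizations while enabling XLA auto-clustering:
// opt_level: L0
// global_jit_level: ON_1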
}
message GraphOptions {
// Removed, use optimizer_options below.
reserved "skip_common_subexpression_elimination";
reserved 1;
// If true, use control flow to schedule the activation of Recv nodes.
// (Currently ignored.)
bool enable_recv_scheduling = 2;
// Options controlling how graph is optimized.
OptimizerOptions optimizer_options = 3;
// The number of steps to run before returning a cost model detailing
// the memory usage and performance of each node of the graph. 0 means
// no cost model.
int64 build_cost_model = 4;
// The number of steps to skip before collecting statistics for the
// cost model.
int64 build_cost_model_after = 9;
// Annotate each Node with Op output shape data, to the extent it can
// be statically inferred.
bool infer_shapes = 5;
// Only place the subgraphs that are run, rather than the entire graph.
//
// This is useful for interactive graph building, where one might
// produce graphs that cannot be placed during the debugging
// process. In particular, it allows the client to continue work in
// a session after adding a node to a graph whose placement
// constraints are unsatisfiable.
bool place_pruned_graph = 6;
// If true, transfer float values between processes as bfloat16.
bool enable_bfloat16_sendrecv = 7;
// If > 0, record a timeline every this many steps.
// EXPERIMENTAL: This currently has no effect in MasterSession.
int32 timeline_step = 8;
// Options that control the type and amount of graph rewriting.
// Not currently configurable via the public Python API (i.e. there is no API
// stability guarantee if you import RewriterConfig explicitly).
RewriterConfig rewrite_options = 10;
}
message ThreadPoolOptionProto {
// The number of threads in the pool.
//
// 0 means the system picks a value based on where this option proto is used
// (see the declaration of the specific field for more info).
int32 num_threads = 1;
// The global name of the threadpool.
//
// If empty, then the threadpool is made and used according to the scope it's
// in - e.g., for a session threadpool, it is used by that session only.
//
// If non-empty, then:
// - a global threadpool associated with this name is looked
// up or created. This allows, for example, sharing one threadpool across
// many sessions (e.g., like the default behavior, if
// inter_op_parallelism_threads is not configured), but still partitioning
// into a large and small pool.
// - if the threadpool for this global_name already exists, then it is an
// error if the existing pool was created using a different num_threads
// value than is specified on this call.
// - threadpools created this way are never garbage collected.
string global_name = 2;
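// For illustration (the pool name is an arbitrary example), a threadpool that
// can be shared across sessions via its global name:
// num_threads: 8
// global_name: "shared_inter_op_pool"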
}
// Metadata about the session.
//
// This can be used by the runtime and the Ops for debugging, monitoring, etc.
//
// The (name, version) tuple is expected to be a unique identifier for
// sessions within the same process.
//
// NOTE: This is currently used and propagated only by the direct session.
message SessionMetadata {
string name = 1;
// The version is optional. If set, needs to be >= 0.
int64 version = 2;
}
// Session configuration parameters.
// The system picks appropriate values for fields that are not set.
message ConfigProto {
// Map from device type name (e.g., "CPU" or "GPU" ) to maximum
// number of devices of that type to use. If a particular device
// type is not found in the map, the system picks an appropriate
// number.
map<string, int32> device_count = 1;
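// For illustration (an assumed intent, not a recommendation), hiding all GPU
// devices while keeping a single CPU device could be expressed as:
// device_count { key: "CPU" value: 1 }
// device_count { key: "GPU" value: 0 }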
// The execution of an individual op (for some op types) can be
// parallelized on a pool of intra_op_parallelism_threads.
// 0 means the system picks an appropriate number.
//
// If you create an ordinary session, e.g., from Python or C++,
// then there is exactly one intra op thread pool per process.
// The first session created determines the number of threads in this pool.
// All subsequent sessions reuse/share this one global pool.
//
// There are notable exceptions to the default behavior described above:
// 1. There is an environment variable for overriding this thread pool,
// named TF_OVERRIDE_GLOBAL_THREADPOOL.
// 2. When connecting to a server, such as a remote `tf.train.Server`
// instance, then this option will be ignored altogether.
int32 intra_op_parallelism_threads = 2;
// Nodes that perform blocking operations are enqueued on a pool of
// inter_op_parallelism_threads available in each process.
//
// 0 means the system picks an appropriate number.
// Negative means all operations are performed in the caller's thread.
//
// Note that the first Session created in the process sets the
// number of threads for all future sessions unless use_per_session_threads is
// true or session_inter_op_thread_pool is configured.
int32 inter_op_parallelism_threads = 5;
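// For illustration (arbitrary example values), pinning both pools explicitly
// in the first session created in the process:
// intra_op_parallelism_threads: 4
// inter_op_parallelism_threads: 2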
// If true, use a new set of threads for this session rather than the global
// pool of threads. Only supported by direct sessions.
//
// If false, use the global threads created by the first session, or the
// per-session thread pools configured by session_inter_op_thread_pool.
//
// This option is deprecated. The same effect can be achieved by setting
// session_inter_op_thread_pool to have one element, whose num_threads equals
// inter_op_parallelism_threads.
bool use_per_session_threads = 9;
// This option is experimental - it may be replaced with a different mechanism
// in the future.
//
// Configures session thread pools. If this is configured, then RunOptions for
// a Run call can select the thread pool to use.
//
// The intended use is for when some session invocations need to run in a
// background pool limited to a small number of threads:
// - For example, a session may be configured to have one large pool (for
// regular compute) and one small pool (for periodic, low priority work);
// using the small pool is currently the mechanism for limiting the inter-op
// parallelism of the low priority work. Note that it does not limit the
// parallelism of work spawned by a single op kernel implementation.
// - Using this setting is normally not needed in training, but may help some
// serving use cases.
// - It is also generally recommended to set the global_name field of this
// proto, to avoid creating multiple large pools. It is typically better to
// run the non-low-priority work, even across sessions, in a single large
// pool.
repeated ThreadPoolOptionProto session_inter_op_thread_pool = 12;
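// For illustration (pool sizes and names are arbitrary examples), the
// large-pool/small-pool setup described above could be sketched as:
// session_inter_op_thread_pool { num_threads: 0 global_name: "large_pool" }
// session_inter_op_thread_pool { num_threads: 1 global_name: "small_pool" }
// A Run call that sets RunOptions.inter_op_thread_pool to 1 would then be
// limited to the small pool.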
// Assignment of Nodes to Devices is recomputed every placement_period
// steps until the system warms up (at which point the recomputation
// typically slows down automatically).
int32 placement_period = 3;
// When any filters are present, sessions will ignore all devices that do not
// match the filters. Each filter can be partially specified, e.g. "/job:ps",
// "/job:worker/replica:3", etc.
repeated string device_filters = 4;
// Options that apply to all GPUs.
GPUOptions gpu_options = 6;
// Options that apply to pluggable devices.
GPUOptions pluggable_device_options = 18;
// Whether soft placement is allowed. If allow_soft_placement is true,
// an op will be placed on CPU if
// 1. there's no GPU implementation for the op,
// or
// 2. no GPU devices are known or registered,
// or
// 3. it needs to be co-located with reftype input(s) which are from CPU.
bool allow_soft_placement = 7;
// Whether device placements should be logged.
bool log_device_placement = 8;
// Options that apply to all graphs.
GraphOptions graph_options = 10;
// Global timeout for all blocking operations in this session. If non-zero,
// and not overridden on a per-operation basis, this value will be used as the
// deadline for all blocking operations.
int64 operation_timeout_in_ms = 11;
// Options that apply when this session uses the distributed runtime.
RPCOptions rpc_options = 13;
// Optional list of all workers to use in this session.
ClusterDef cluster_def = 14;
// If true, any resources such as Variables used in the session will not be
// shared with other sessions. However, when clusterspec propagation is
// enabled, this field is ignored and sessions are always isolated.
bool isolate_session_state = 15;
// When true, WorkerSessions are created with device attributes from the
// full cluster.
// This is helpful when a worker wants to partition a graph
// (for example during a PartitionedCallOp).
bool share_cluster_devices_in_session = 17;
// Everything inside Experimental is subject to change and is not subject
// to API stability guarantees in
// https://www.tensorflow.org/guide/versions.
message Experimental {
// Task name for group resolution.
string collective_group_leader = 1;
// We removed the flag client_handles_error_formatting. Marking the tag
// number as reserved.
// TODO(shikharagarwal): Should we just remove this tag so that it can be
// used in future for other purpose?
reserved 2;
// Which executor to use. The default executor will be used
// if this is an empty string or "DEFAULT".
string executor_type = 3;
// Guidance for formatting large RecvBuf fields for transfer.
// Any positive value sets the max chunk size. 0 defaults to 4096.
// Any negative value indicates no max, i.e. one chunk only.
int32 recv_buf_max_chunk = 4;
// If true, and supported by the platform, the runtime will attempt to
// use NUMA affinity where applicable. One consequence will be the
// existence of as many CPU devices as there are available NUMA nodes.
bool use_numa_affinity = 5;
// If true, make collective op execution order sequential and deterministic
// for potentially concurrent collective instances.
bool collective_deterministic_sequential_execution = 6;
// If true, use NCCL for CollectiveOps. This feature is highly
// experimental.
bool collective_nccl = 7;
// In the following, session state means the value of a variable, elements
// in a hash table, or any other resource, accessible by worker sessions
// held by a TF server.
//
// When ClusterSpec propagation is enabled, the value of
// isolate_session_state is ignored when deciding whether to share session
// states in a TF server (for backwards compatibility reasons).
// - If share_session_state_in_clusterspec_propagation is true, the session
// states are shared.
// - If share_session_state_in_clusterspec_propagation is false, session
// states are isolated.
//
// When clusterspec propagation is not used, the value of
// share_session_state_in_clusterspec_propagation is ignored when deciding
// whether to share session states in a TF server.
// - If isolate_session_state is true, session states are isolated.
// - If isolate_session_state is false, session states are shared.
//
// TODO(b/129330037): Add a single API that consistently treats
// isolate_session_state and ClusterSpec propagation.
bool share_session_state_in_clusterspec_propagation = 8;
// If using a direct session, disable spinning while waiting for work in
// the thread pool. This may result in higher latency for completing ops,
// but in cases where there is a lot of spinning it may result in lower
// CPU usage.
bool disable_thread_spinning = 9;
// This was promoted to a non-experimental API. Please use
// ConfigProto.share_cluster_devices_in_session instead.
bool share_cluster_devices_in_session = 10;
// Metadata about the session.
//
// If set, this can be used by the runtime and the Ops for debugging,
// monitoring, etc.
//
// NOTE: This is currently used and propagated only by the direct session
// and EagerContext.
SessionMetadata session_metadata = 11;
// If true, the session may treat the graph as being static for optimization
// purposes.
//
// If this option is set to true when a session is created, the full
// GraphDef must be passed in a single call to Session::Create(), and
// Session::Extend() may not be supported.
bool optimize_for_static_graph = 12;
// Whether to enable the MLIR-based TF->XLA bridge. This is only used if set
// to true; the default value or false is ignored. Use mlir_bridge_rollout for
// finer control.
//
// If this option is set to true when a session is created, MLIR is used to
// perform the set of graph transformations to put the graph in a form that
// can be executed with delegation of some computations to an accelerator.
// This builds on the model of XLA where a subset of the graph is
// encapsulated and attached to a "compile" operation, whose result is fed
// to an "execute" operation. The kernel for these operations is responsible
// for lowering the encapsulated graph to a particular device.
bool enable_mlir_bridge = 13;
// An enum that describes the state of the MLIR bridge rollout.
enum MlirBridgeRollout {
// If this field is left unspecified, the MLIR bridge may be selectively
// enabled on a per graph basis.
MLIR_BRIDGE_ROLLOUT_UNSPECIFIED = 0;
// Enabling the MLIR bridge enables it for all graphs in this session.
MLIR_BRIDGE_ROLLOUT_ENABLED = 1;
// Disabling the MLIR bridge disables it for all graphs in this session.
MLIR_BRIDGE_ROLLOUT_DISABLED = 2;
reserved 3, 4;
reserved "MLIR_BRIDGE_ROLLOUT_SAFE_MODE_ENABLED",
"MLIR_BRIDGE_ROLLOUT_SAFE_MODE_FALLBACK_ENABLED";
}
// Whether to enable the MLIR-based TF->XLA bridge.
MlirBridgeRollout mlir_bridge_rollout = 17;
// Whether to enable the MLIR-based Graph optimizations.
//
// This will become a part of the standard TensorFlow graph optimization
// pipeline; currently it is only used for gradual migration and testing of
// new passes that are replacing existing optimizations in Grappler.
bool enable_mlir_graph_optimization = 16;
// If true, the session will not store an additional copy of the graph for
// each subgraph.
//
// If this option is set to true when a session is created, the
// `RunOptions.output_partition_graphs` options must not be set.
bool disable_output_partition_graphs = 14;
// Minimum number of batches run through the XLA graph before XLA fusion
// autotuner is enabled. Default value of zero disables the autotuner.
//
// The XLA fusion autotuner can improve performance by executing a heuristic
// search on the compiler parameters.
int64 xla_fusion_autotuner_thresh = 15;
// Whether runtime execution uses TFRT.
bool use_tfrt = 18;
// If true, use Pathways with TFRT API for multi host support.
bool enable_multi_host = 27;
// If true, use ifrt as the backend for TFRT. This is only used when
// `use_tfrt` is true.
bool tfrt_use_ifrt = 32;
// Port for the Pathways server. Ignored if enable_multi_host=false.
int32 backend_server_port = 28;
// If true, TFRT will use TPU specific compiler passes and perform TPU
// specific initialization.
bool target_tpu = 29;
// If true, TFRT will use GPU specific compiler passes and perform GPU
// specific initialization.
bool target_gpu = 30;
// The threshold for merging small streams in TFRT. Streams with cost
// smaller than the threshold will be merged. Setting it to 1
// disables all merges.
int32 stream_merge_threshold = 31;
// The field "coordination_service was previously specified as a string;
// this has been replaced with a message below.
reserved 19;
// We removed the flag fetch_remote_devices_in_multi_client. Marking the tag
// number as reserved.
reserved 20;
// Whether functional control flow op lowering should be disabled. This is
// useful when executing within a portable runtime where control flow op
// kernels may not be loaded due to selective registration.
bool disable_functional_ops_lowering = 21;
// Provides a hint to XLA auto clustering to prefer forming a single large
// cluster that encompasses most of the graph.
bool xla_prefer_single_graph_cluster = 22;
// Distributed coordination service configurations.
CoordinationServiceConfig coordination_config = 23;
// If true, the session will treat the graph as being non-static for
// optimization purposes.
//
// If this option is set to true when a session is created, the full
// GraphDef will be retained to enable calls to Session::Extend().
// Calling Extend() without setting this flag will result in errors.
//
// This option is meant to replace `optimize_for_static_graph` and it
// aims to negate its value.
bool disable_optimize_for_static_graph = 24;
// Whether eager remote execution will stream all the function calls or
// allow them to happen in parallel. When true, streaming execution is
// disabled, and parallel execution is allowed.
bool disable_eager_executor_streaming_enqueue = 26;
// If true, the function library runtime will be finalized when the session
// is finalized.
bool finalize_function_library_runtime = 33;
// If true, the resource manager will be finalized when the session
// is finalized.
bool finalize_resource_manager = 34;
reserved 25;
// Next: 35
}
Experimental experimental = 16;
// Next: 19
}
// Options for a single Run() call.
message RunOptions {
// TODO(pbar) Turn this into a TraceOptions proto which allows
// tracing to be controlled in a more orthogonal manner?
enum TraceLevel {
NO_TRACE = 0;
SOFTWARE_TRACE = 1;
HARDWARE_TRACE = 2;
FULL_TRACE = 3;
}
TraceLevel trace_level = 1;
// Time to wait for operation to complete in milliseconds.
int64 timeout_in_ms = 2;
// The thread pool to use, if session_inter_op_thread_pool is configured.
// To use the caller thread, set this to -1; this uses the caller thread
// to execute Session::Run() and thus avoids a context switch. Using the
// caller thread to execute Session::Run() should be done ONLY for simple
// graphs, where the overhead of an additional context switch is
// comparable with the overhead of Session::Run().
int32 inter_op_thread_pool = 3;
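// For illustration (a sketch), a Run call that executes on the caller's
// thread and collects a full trace could be described as:
// trace_level: FULL_TRACE
// inter_op_thread_pool: -1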
// Whether the partition graph(s) executed by the executor(s) should be
// output via RunMetadata.
bool output_partition_graphs = 5;
// EXPERIMENTAL. Options used to initialize DebuggerState, if enabled.
DebugOptions debug_options = 6;
// When enabled, causes tensor allocation information to be included in
// the error message when the Run() call fails because the allocator ran
// out of memory (OOM).
//
// Enabling this option can slow down the Run() call.
bool report_tensor_allocations_upon_oom = 7;
// Everything inside Experimental is subject to change and is not subject
// to API stability guarantees in
// https://www.tensorflow.org/guide/version_compat.
message Experimental {
// If non-zero, declares that this graph is going to use collective
// ops and must synchronize step_ids with any other graph with this
// same group_key value (in a distributed computation where tasks
// run disjoint graphs).
int64 collective_graph_key = 1;
// If true, then operations (using the inter-op pool) across all
// session::run() calls will be centrally scheduled, optimizing for (median
// and tail) latency.
// Consider using this option for CPU-bound workloads like inference.
bool use_run_handler_pool = 2;
// Options for run handler thread pool.
message RunHandlerPoolOptions {
// Priority of the request. The run handler thread pool will schedule ops
// based on the priority number. A larger number means higher priority.
int64 priority = 1;
}
RunHandlerPoolOptions run_handler_pool_options = 3;
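// For illustration (the priority value is an arbitrary example), enabling
// the run handler pool with an elevated priority:
// use_run_handler_pool: true
// run_handler_pool_options { priority: 2 }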
}
Experimental experimental = 8;
reserved 4;
}
// Metadata output (i.e., non-Tensor) for a single Run() call.
message RunMetadata {
// Statistics traced for this step. Populated if tracing is turned on via the
// "RunOptions" proto.
// EXPERIMENTAL: The format and set of events may change in future versions.
StepStats step_stats = 1;
// The cost graph for the computation defined by the run call.
CostGraphDef cost_graph = 2;
// Graphs of the partitions executed by executors.
repeated GraphDef partition_graphs = 3;
message FunctionGraphs {
// TODO(nareshmodi): Include some sort of function/cache-key identifier?
repeated GraphDef partition_graphs = 1;
GraphDef pre_optimization_graph = 2;
GraphDef post_optimization_graph = 3;
}
// This is only populated for graphs that are run as functions in TensorFlow
// V2. There will be an entry below for each function that is traced.
// The main use case of the post_optimization_graph and the partition_graphs
// is to give the caller insight into the graphs that were actually run by the
// runtime. Additional information (such as that in step_stats) will match
// these graphs.
// We also include the pre_optimization_graph since it is usually easier to
// read, and is helpful in situations where the caller wants to get a high
// level idea of what the built graph looks like (since the various graph
// optimization passes might change the structure of the graph significantly).
repeated FunctionGraphs function_graphs = 4;
// Metadata about the session.
SessionMetadata session_metadata = 5;
}
// Defines a connection between two tensors in a `GraphDef`.
message TensorConnection {
// A tensor name. The value of this tensor will be substituted for
// the tensor named in `to_tensor`.
string from_tensor = 1;
// A tensor name. The value of this tensor will be bound to the
// value of the tensor named in `from_tensor`.
string to_tensor = 2;
}
// Defines a subgraph in another `GraphDef` as a set of feed points and nodes
// to be fetched or executed.
//
// Compare with the arguments to `Session::Run()`.
message CallableOptions {
// Tensors to be fed in the callable. Each feed is the name of a tensor.
repeated string feed = 1;
// Fetches. A list of tensor names. The caller of the callable expects a
// tensor to be returned for each fetch[i] (see RunStepResponse.tensor). The
// order of specified fetches does not change the execution order.
repeated string fetch = 2;
// Target Nodes. A list of node names. The named nodes will be run by the
// callable but their outputs will not be returned.
repeated string target = 3;
// Options that will be applied to each run.
RunOptions run_options = 4;
// Tensors to be connected in the callable. Each TensorConnection denotes
// a pair of tensors in the graph, between which an edge will be created
// in the callable.
repeated TensorConnection tensor_connection = 5;
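// For illustration (the tensor names "c:0" and "a:0" are hypothetical),
// substituting the value of one tensor for another in the callable:
// tensor_connection {
// from_tensor: "c:0"
// to_tensor: "a:0"
// }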
// The Tensor objects fed in the callable and fetched from the callable
// are expected to be backed by host (CPU) memory by default.
//
// The options below allow changing that - feeding tensors backed by
// device memory, or returning tensors that are backed by device memory.
//
// The maps below map the name of a feed/fetch tensor (which appears in
// 'feed' or 'fetch' fields above), to the fully qualified name of the device
// owning the memory backing the contents of the tensor.
//
// For example, creating a callable with the following options:
//
// CallableOptions {
// feed: "a:0"
// feed: "b:0"
//
// fetch: "x:0"
// fetch: "y:0"
//
// feed_devices: {
// "a:0": "/job:localhost/replica:0/task:0/device:GPU:0"
// }
//
// fetch_devices: {
// "y:0": "/job:localhost/replica:0/task:0/device:GPU:0"
// }
// }
//
// means that the Callable expects:
// - The first argument ("a:0") is a Tensor backed by GPU memory.
// - The second argument ("b:0") is a Tensor backed by host memory.
// and of its return values:
// - The first output ("x:0") will be backed by host memory.
// - The second output ("y:0") will be backed by GPU memory.
//
// FEEDS:
// It is the responsibility of the caller to ensure that the memory of the fed
// tensors will be correctly initialized and synchronized before it is
// accessed by operations executed during the call to Session::RunCallable().
//
// This is typically ensured by using the TensorFlow memory allocators
// (Device::GetAllocator()) to create the Tensor to be fed.
//
// Alternatively, for CUDA-enabled GPU devices, this typically means that the
// operation that produced the contents of the tensor has completed, i.e., the
// CUDA stream has been synchronized (e.g., via cuCtxSynchronize() or
// cuStreamSynchronize()).
map<string, string> feed_devices = 6;
map<string, string> fetch_devices = 7;
// By default, RunCallable() will synchronize the GPU stream before returning
// fetched tensors on a GPU device, to ensure that the values in those tensors
// have been produced. This simplifies interacting with the tensors, but
// potentially incurs a performance hit.
//
// If this option is set to true, the caller is responsible for ensuring
// that the values in the fetched tensors have been produced before they are
// used. The caller can do this by invoking `Device::Sync()` on the underlying
// device(s), or by feeding the tensors back to the same Session using
// `feed_devices` with the same corresponding device name.
bool fetch_skip_sync = 8;
// Next: 9
}
message BatchingOptions {