syntax = "proto3";
package tensorflow;
import "xla/tsl/protobuf/coordination_config.proto";
import "tensorflow/core/framework/cost_graph.proto";
import "tensorflow/core/framework/graph.proto";
import "tensorflow/core/framework/step_stats.proto";
import "tensorflow/core/protobuf/cluster.proto";
import "tensorflow/core/protobuf/debug.proto";
import "tensorflow/core/protobuf/rewriter_config.proto";
import "tensorflow/core/protobuf/rpc_options.proto";
option cc_enable_arenas = true;
option java_outer_classname = "ConfigProtos";
option java_multiple_files = true;
option java_package = "org.tensorflow.framework";
option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/protobuf/for_core_protos_go_proto";
message GPUOptions {
// Fraction of the total GPU memory to allocate for each process.
// 1 means to allocate all of the GPU memory, 0.5 means the process
// allocates up to ~50% of the total GPU memory.
//
// GPU memory is pre-allocated unless the allow_growth option is enabled.
//
// If greater than 1.0, uses CUDA unified memory to potentially oversubscribe
// the amount of memory available on the GPU device by using host memory as a
// swap space. Accessing memory not available on the device will be
// significantly slower as that would require memory transfer between the host
// and the device. Options to reduce the memory requirement should be
// considered before enabling this option as this may come with a negative
// performance impact. Oversubscription using the unified memory requires
// Pascal class or newer GPUs and it is currently only supported on the Linux
// operating system. See
// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#um-requirements
// for the detailed requirements.
double per_process_gpu_memory_fraction = 1;
// If true, the allocator does not pre-allocate the entire specified
// GPU memory region, instead starting small and growing as needed.
bool allow_growth = 4;
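// For illustration only (the values below are arbitrary examples, not
// recommendations), the two fields above could be combined as:
// per_process_gpu_memory_fraction: 0.4
// allow_growth: true
// which caps the process at roughly 40% of GPU memory and lets the allocator
// grow on demand within that cap instead of pre-allocating it.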
// The type of GPU allocation strategy to use.
//
// Allowed values:
// "": The empty string (default) uses a system-chosen default
// which may change over time.
//
// "BFC": A "Best-fit with coalescing" algorithm, simplified from a
// version of dlmalloc.
string allocator_type = 2;
// Delay deletion of up to this many bytes to reduce the number of
// interactions with gpu driver code. If 0, the system chooses
// a reasonable default (several MBs).
int64 deferred_deletion_bytes = 3;
// A comma-separated list of GPU ids that determines the 'visible'
// to 'virtual' mapping of GPU devices. For example, if TensorFlow
// can see 8 GPU devices in the process, and one wanted to map
// visible GPU devices 5 and 3 as "/device:GPU:0", and "/device:GPU:1",
// then one would specify this field as "5,3". This field is similar in
// spirit to the CUDA_VISIBLE_DEVICES environment variable, except
// it applies to the visible GPU devices in the process.
//
// NOTE:
// 1. The GPU driver provides the process with the visible GPUs
// in an order which is not guaranteed to have any correlation to
// the *physical* GPU id in the machine. This field is used for
// remapping "visible" to "virtual", which means this operates only
// after the process starts. Users are required to use vendor
// specific mechanisms (e.g., CUDA_VISIBLE_DEVICES) to control the
// physical to visible device mapping prior to invoking TensorFlow.
// 2. In the code, the ids in this list are also called "platform GPU id"s,
// and the 'virtual' ids of GPU devices (i.e. the ids in the device
// name "/device:GPU:<id>") are also called "TF GPU id"s. Please
// refer to third_party/tensorflow/core/common_runtime/gpu/gpu_id.h
// for more information.
// 3. The visible_device_list is also used for PluggableDevice, and
// different types of PluggableDevices share this field. In that case,
// the pluggable_device_type is used to distinguish them, making the
// visible_device_list a list of <pluggable_device_type>:<device_index>,
// e.g. "PluggableDeviceA:0,PluggableDeviceA:1,PluggableDeviceB:0".
string visible_device_list = 5;
// In the event polling loop, sleep this many microseconds between
// PollEvents calls when the queue is not empty. If the value is not
// set or is set to 0, it gets set to a non-zero default.
int32 polling_active_delay_usecs = 6;
// This field is deprecated and ignored.
int32 polling_inactive_delay_msecs = 7;
// Force all tensors to be gpu_compatible. On a GPU-enabled TensorFlow,
// enabling this option forces all CPU tensors to be allocated with CUDA
// pinned memory. Normally, TensorFlow will infer which tensors should be
// allocated as pinned memory. But in cases where the inference is
// incomplete, this option can significantly speed up cross-device memory
// copy performance, as long as it fits in memory.
// Note that this option should not be enabled by default for unknown or
// very large models, since all CUDA pinned memory is unpageable; having too
// much pinned memory might negatively impact overall host system
// performance.
bool force_gpu_compatible = 8;
message Experimental {
// Configuration for breaking down a visible GPU into multiple "virtual"
// devices.
message VirtualDevices {
// Per "virtual" device memory limit, in MB. The number of elements in
// the list is the number of virtual devices to create on the
// corresponding visible GPU (see "virtual_devices" below).
// If empty and `num_virtual_devices_per_gpu` is not set, it will create a
// single virtual device taking all available memory from the device.
//
// For the concept of "visible" and "virtual" GPU, see the comments for
// "visible_device_list" above for more information.
repeated float memory_limit_mb = 1;
// Priority values to use with the virtual devices. Use the CUDA function
// cudaDeviceGetStreamPriorityRange to query the valid range of values for
// priority.
//
// On a P4000 GPU with CUDA 10.1, the priority range reported was 0 for
// least priority and -1 for greatest priority.
//
// If this field is not specified, then the virtual devices will be
// created with the default priority. If this field has values set, then its
// size must match that of memory_limit_mb above.
repeated int32 priority = 2;
// The virtual device ordinal number determines the device ID of the device.
// A virtual device with a lower ordinal number always receives a smaller
// device ID. The physical device ID and the location in the virtual device
// list are used to break ties.
repeated int32 device_ordinal = 3;
}
// The multi virtual device settings. If empty (not set), it will create a
// single virtual device on each visible GPU, according to the settings
// in "visible_device_list" above. Otherwise, the number of elements in the
// list must be the same as the number of visible GPUs (after
// "visible_device_list" filtering if it is set), and the string-represented
// device names (e.g. /device:GPU:<id>) will refer to the virtual
// devices and have the <id> field assigned sequentially starting from 0,
// according to the order of the virtual devices determined by
// device_ordinal and the location in the virtual device list.
//
// For example,
// visible_device_list = "1,0"
// virtual_devices { memory_limit: 1GB memory_limit: 2GB }
// virtual_devices { memory_limit: 3GB memory_limit: 4GB }
// will create 4 virtual devices as:
// /device:GPU:0 -> visible GPU 1 with 1GB memory
// /device:GPU:1 -> visible GPU 1 with 2GB memory
// /device:GPU:2 -> visible GPU 0 with 3GB memory
// /device:GPU:3 -> visible GPU 0 with 4GB memory
//
// but
// visible_device_list = "1,0"
// virtual_devices { memory_limit: 1GB memory_limit: 2GB
// device_ordinal: 10 device_ordinal: 20}
// virtual_devices { memory_limit: 3GB memory_limit: 4GB
// device_ordinal: 10 device_ordinal: 20}
// will create 4 virtual devices as:
// /device:GPU:0 -> visible GPU 1 with 1GB memory (ordinal 10)
// /device:GPU:1 -> visible GPU 0 with 3GB memory (ordinal 10)
// /device:GPU:2 -> visible GPU 1 with 2GB memory (ordinal 20)
// /device:GPU:3 -> visible GPU 0 with 4GB memory (ordinal 20)
//
// NOTE:
// 1. It's invalid to set both this and "per_process_gpu_memory_fraction"
// at the same time.
// 2. Currently this setting is per-process, not per-session. Using
// different settings in different sessions within same process will
// result in undefined behavior.
repeated VirtualDevices virtual_devices = 1;
// The number of virtual devices to create on each visible GPU. The
// available memory will be split equally among all virtual devices. If the
// field `memory_limit_mb` in `VirtualDevices` is not empty, this field will
// be ignored.
int32 num_virtual_devices_per_gpu = 15;
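// For illustration (an assumed single-GPU machine), setting
// num_virtual_devices_per_gpu: 2
// with memory_limit_mb left empty would create /device:GPU:0 and
// /device:GPU:1 on that GPU, splitting its available memory equally between
// the two virtual devices.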
// If true, uses CUDA unified memory for memory allocations. If
// per_process_gpu_memory_fraction option is greater than 1.0, then unified
// memory is used regardless of the value for this field. See comments for
// per_process_gpu_memory_fraction field for more details and requirements
// of the unified memory. This option is useful to oversubscribe memory if
// multiple processes are sharing a single GPU while individually using less
// than 1.0 per process memory fraction.
bool use_unified_memory = 2;
// If > 1, the number of device-to-device copy streams to create
// for each GPUDevice. Default value is 0, which is automatically
// converted to 1.
int32 num_dev_to_dev_copy_streams = 3;
// If non-empty, defines a good GPU ring order on a single worker based on
// device interconnect. This assumes that all workers have the same GPU
// topology. Specify as a comma-separated string, e.g. "3,2,1,0,7,6,5,4".
// This ring order is used by the RingReducer implementation of
// CollectiveReduce, and serves as an override to automatic ring order
// generation in OrderTaskDeviceMap() during CollectiveParam resolution.
string collective_ring_order = 4;
// If true then extra work is done by GPUDevice and GPUBFCAllocator to
// keep track of when GPU memory is freed and when kernels actually
// complete so that we can know when a nominally free memory chunk
// is really not subject to pending use.
bool timestamped_allocator = 5;
// reserved id: 6
// Parameters for GPUKernelTracker. By default no kernel tracking is done.
// Note that timestamped_allocator is only effective if some tracking is
// specified.
//
// If kernel_tracker_max_interval = n > 0, then a tracking event
// is inserted after every n kernels without an event.
int32 kernel_tracker_max_interval = 7;
// If kernel_tracker_max_bytes = n > 0, then a tracking event is
// inserted after every series of kernels allocating a sum of
// memory >= n. If one kernel allocates b * n bytes, then one
// event will be inserted after it, but it will count as b against
// the pending limit.
int32 kernel_tracker_max_bytes = 8;
// If kernel_tracker_max_pending > 0 then no more than this many
// tracking events can be outstanding at a time. An attempt to
// launch an additional kernel will stall until an event
// completes.
int32 kernel_tracker_max_pending = 9;
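// For illustration (arbitrary example values), a sketch that enables the
// timestamped allocator together with interval-based kernel tracking:
// timestamped_allocator: true
// kernel_tracker_max_interval: 32
// kernel_tracker_max_pending: 4
// Here a tracking event is inserted after every 32 kernels, and at most 4
// such events may be outstanding before kernel launches stall.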
// The BFC allocator can return an allocated chunk of memory up to 2x the
// requested size. For virtual devices with tight memory constraints, and
// proportionately large allocation requests, this can lead to a significant
// reduction in available memory. The threshold below controls when a chunk
// should be split if the chunk size exceeds the requested memory size. It is
// expressed as a fraction of total available memory for the TF device. For
// example, setting it to 0.05 means a chunk needs to be split if its size
// exceeds the requested memory by 5% of the total virtual device/GPU
// memory size.
double internal_fragmentation_fraction = 10;
// When true, use the CUDA cudaMallocAsync API instead of the TF GPU allocator.
bool use_cuda_malloc_async = 11;
// By default, BFCAllocator may sleep when it runs out of memory, in the
// hopes that another thread will free up memory in the meantime. Setting
// this to true disables the sleep; instead we'll OOM immediately.
bool disallow_retry_on_allocation_failure = 12;
// Memory limit for "GPU host allocator", aka pinned memory allocator. This
// can also be set via the envvar TF_GPU_HOST_MEM_LIMIT_IN_MB.
float gpu_host_mem_limit_in_mb = 13;
// If true, then the host allocator allocates its max memory all upfront and
// never grows. This can be useful for latency-sensitive systems, because
// growing the GPU host memory pool can be expensive.
//
// You probably only want to use this in combination with
// gpu_host_mem_limit_in_mb, because the default GPU host memory limit is
// quite high.
bool gpu_host_mem_disallow_growth = 14;
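// For illustration (the 4096 MB limit is an arbitrary example), a fixed-size
// pinned host pool allocated upfront could be described as:
// gpu_host_mem_limit_in_mb: 4096
// gpu_host_mem_disallow_growth: true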
// Memory limit for the GPU system. This can also be set by
// TF_DEVICE_MIN_SYS_MEMORY_IN_MB, which takes precedence over
// gpu_system_memory_size_in_mb. With this, users can configure the GPU
// system memory size for better resource estimation in multi-tenancy (one
// GPU with multiple models) use cases.
int32 gpu_system_memory_size_in_mb = 16;
// If true, save the information needed to create a PjRt GPU client for
// creating a client with remote devices.
bool populate_pjrt_gpu_client_creation_info = 17;
// node_id for use when creating a PjRt GPU client with remote devices,
// which enumerates jobs*tasks from a ServerDef.
int32 node_id = 18;
// Whether to merge data transfer streams into the compute stream in the
// same stream group. Stream merging helps reduce the overhead caused by
// stream synchronization, especially when data transfers are frequent. For
// example, setting "merge_host_to_device_stream = true" will make the
// compute stream responsible for both computation and host to device memory
// copy.
message StreamMergeOptions {
// If true, the compute stream will be used for host_to_device copy as
// well. It's no longer necessary to record an event before the copy to
// let the copy stream wait for the compute stream to finish. There is
// also no need to wait for the copy to complete before executing the
// callback function.
bool merge_host_to_device_stream = 1;
// If true, the compute stream will be used for device_to_host copy as
// well. It's no longer necessary to record an event before the copy to
// let the copy stream wait for the compute stream to finish.
bool merge_device_to_host_stream = 2;
// If true, the compute stream will be used for device_to_device copy as
// well. It's no longer necessary to record an event before the copy to
// let the copy stream wait for the compute stream of the sending device
// to finish. There is also no need to wait for the compute stream of the
// receiving device to finish if the copy is within the same device.
bool merge_device_to_device_stream = 3;
}
StreamMergeOptions stream_merge_options = 19;
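// For illustration, a sketch that merges only the host-to-device copy stream
// into the compute stream, leaving the other copy streams separate:
// stream_merge_options {
// merge_host_to_device_stream: true
// }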
}
// Everything inside experimental is subject to change and is not subject
// to API stability guarantees in
// https://www.tensorflow.org/guide/versions.
Experimental experimental = 9;
}
// Options passed to the graph optimizer
message OptimizerOptions {
// If true, optimize the graph using common subexpression elimination.
// Note: the optimization Level L1 will override this setting to true. So in
// order to disable common subexpression elimination the opt_level has to be
// set to L0.
bool do_common_subexpression_elimination = 1;
// If true, perform constant folding optimization on the graph.
// Note: the optimization Level L1 will override this setting to true. So in
// order to disable constant folding the opt_level has to be set to L0.
bool do_constant_folding = 2;
// Constant folding optimization replaces tensors whose values can be
// predetermined, with constant nodes. To avoid inserting too large constants,
// the size of each constant created can be limited. If this value is zero, a
// default limit of 10 MiB will be applied. If constant folding optimization
// is disabled, this value is ignored.
int64 max_folded_constant_in_bytes = 6;
// If true, perform function inlining on the graph.
bool do_function_inlining = 4;
// Optimization level
enum Level {
// L1 is the default level.
// Optimization performed at L1 :
// 1. Common subexpression elimination
// 2. Constant folding
L1 = 0;
// No optimizations
L0 = -1;
}
// Overall optimization level. The actual optimizations applied will be the
// logical OR of the flags that this level implies and any flags already set.
Level opt_level = 3;
// Control the use of the compiler/jit. Experimental.
enum GlobalJitLevel {
DEFAULT = 0; // Default setting ("off" now, but later expected to be "on")
OFF = -1;
// The following settings turn on compilation, with higher values being
// more aggressive. Higher values may reduce opportunities for parallelism
// and may use more memory. (At present, there is no distinction, but this
// is expected to change.)
ON_1 = 1;
ON_2 = 2;
}
GlobalJitLevel global_jit_level = 5;
// CPU code will be autoclustered only if global_jit_level >= ON_1 and either:
// - this flag is true, or
// - TF_XLA_FLAGS contains --tf_xla_cpu_global_jit=true.
bool cpu_global_jit = 7;
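// For illustration (a sketch, not a recommendation), an OptimizerOptions that
// turns off the L1 optimizations while enabling XLA auto-clustering:
// opt_level: L0
// global_jit_level: ON_1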
}
message GraphOptions {
// Removed, use optimizer_options below.
reserved "skip_common_subexpression_elimination";
reserved 1;
// If true, use control flow to schedule the activation of Recv nodes.
// (Currently ignored.)
bool enable_recv_scheduling = 2;
// Options controlling how graph is optimized.
OptimizerOptions optimizer_options = 3;
// The number of steps to run before returning a cost model detailing
// the memory usage and performance of each node of the graph. 0 means
// no cost model.
int64 build_cost_model = 4;
// The number of steps to skip before collecting statistics for the
// cost model.
int64 build_cost_model_after = 9;
// Annotate each Node with Op output shape data, to the extent it can
// be statically inferred.
bool infer_shapes = 5;
// Only place the subgraphs that are run, rather than the entire graph.
//
// This is useful for interactive graph building, where one might
// produce graphs that cannot be placed during the debugging
// process. In particular, it allows the client to continue work in
// a session after adding a node to a graph whose placement
// constraints are unsatisfiable.
bool place_pruned_graph = 6;
// If true, transfer float values between processes as bfloat16.
bool enable_bfloat16_sendrecv = 7;
// If > 0, record a timeline every this many steps.
// EXPERIMENTAL: This currently has no effect in MasterSession.
int32 timeline_step = 8;
// Options that control the type and amount of graph rewriting.
// Not currently configurable via the public Python API (i.e. there is no API
// stability guarantee if you import RewriterConfig explicitly).
RewriterConfig rewrite_options = 10;
}
message ThreadPoolOptionProto {
// The number of threads in the pool.
//
// 0 means the system picks a value based on where this option proto is used
// (see the declaration of the specific field for more info).
int32 num_threads = 1;
// The global name of the threadpool.
//
// If empty, then the threadpool is made and used according to the scope it's
// in - e.g., for a session threadpool, it is used by that session only.
//
// If non-empty, then:
// - a global threadpool associated with this name is looked
// up or created. This allows, for example, sharing one threadpool across
// many sessions (e.g., like the default behavior, if
// inter_op_parallelism_threads is not configured), but still partitioning
// into a large and small pool.
// - if the threadpool for this global_name already exists, then it is an
// error if the existing pool was created using a different num_threads
// value than is specified on this call.
// - threadpools created this way are never garbage collected.
string global_name = 2;
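// For illustration (the pool name is an arbitrary example), a threadpool that
// can be shared across sessions via its global name:
// num_threads: 8
// global_name: "shared_inter_op_pool"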
}
// Metadata about the session.
//
// This can be used by the runtime and the Ops for debugging, monitoring, etc.
//
// The (name, version) tuple is expected to be a unique identifier for
// sessions within the same process.
//
// NOTE: This is currently used and propagated only by the direct session.
message SessionMetadata {
string name = 1;
// The version is optional. If set, needs to be >= 0.
int64 version = 2;
}
// Session configuration parameters.
// The system picks appropriate values for fields that are not set.
message ConfigProto {
// Map from device type name (e.g., "CPU" or "GPU" ) to maximum
// number of devices of that type to use. If a particular device
// type is not found in the map, the system picks an appropriate
// number.
map<string, int32> device_count = 1;
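// For illustration (an assumed intent, not a recommendation), hiding all GPU
// devices while keeping a single CPU device could be expressed as:
// device_count { key: "CPU" value: 1 }
// device_count { key: "GPU" value: 0 }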
// The execution of an individual op (for some op types) can be
// parallelized on a pool of intra_op_parallelism_threads.
// 0 means the system picks an appropriate number.
//
// If you create an ordinary session, e.g., from Python or C++,
// then there is exactly one intra op thread pool per process.
// The first session created determines the number of threads in this pool.
// All subsequent sessions reuse/share this one global pool.
//
// There are notable exceptions to the default behavior described above:
// 1. There is an environment variable for overriding this thread pool,
// named TF_OVERRIDE_GLOBAL_THREADPOOL.
// 2. When connecting to a server, such as a remote `tf.train.Server`
// instance, then this option will be ignored altogether.
int32 intra_op_parallelism_threads = 2;
// Nodes that perform blocking operations are enqueued on a pool of
// inter_op_parallelism_threads available in each process.
//
// 0 means the system picks an appropriate number.
// Negative means all operations are performed in the caller's thread.
//
// Note that the first Session created in the process sets the
// number of threads for all future sessions unless use_per_session_threads is
// true or session_inter_op_thread_pool is configured.
int32 inter_op_parallelism_threads = 5;
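// For illustration (arbitrary example values), pinning both pools explicitly
// in the first session created in the process:
// intra_op_parallelism_threads: 4
// inter_op_parallelism_threads: 2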
// If true, use a new set of threads for this session rather than the global
// pool of threads. Only supported by direct sessions.
//
// If false, use the global threads created by the first session, or the
// per-session thread pools configured by session_inter_op_thread_pool.
//
// This option is deprecated. The same effect can be achieved by setting
// session_inter_op_thread_pool to have one element, whose num_threads equals
// inter_op_parallelism_threads.
bool use_per_session_threads = 9;
// This option is experimental - it may be replaced with a different mechanism
// in the future.
//
// Configures session thread pools. If this is configured, then RunOptions for
// a Run call can select the thread pool to use.
//
// The intended use is for when some session invocations need to run in a
// background pool limited to a small number of threads:
// - For example, a session may be configured to have one large pool (for
// regular compute) and one small pool (for periodic, low priority work);
// using the small pool is currently the mechanism for limiting the inter-op
// parallelism of the low priority work. Note that it does not limit the
// parallelism of work spawned by a single op kernel implementation.
// - Using this setting is normally not needed in training, but may help some
// serving use cases.
// - It is also generally recommended to set the global_name field of this
// proto, to avoid creating multiple large pools. It is typically better to
// run the non-low-priority work, even across sessions, in a single large
// pool.
repeated ThreadPoolOptionProto session_inter_op_thread_pool = 12;
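// For illustration (pool sizes and names are arbitrary examples), the
// large-pool/small-pool setup described above could be sketched as:
// session_inter_op_thread_pool { num_threads: 0 global_name: "large_pool" }
// session_inter_op_thread_pool { num_threads: 1 global_name: "small_pool" }
// A Run call that sets RunOptions.inter_op_thread_pool to 1 would then be
// limited to the small pool.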
// Assignment of Nodes to Devices is recomputed every placement_period
// steps until the system warms up (at which point the recomputation
// typically slows down automatically).
int32 placement_period = 3;
// When any filters are present, sessions will ignore all devices that do not
// match the filters. Each filter can be partially specified, e.g. "/job:ps",
// "/job:worker/replica:3", etc.
repeated string device_filters = 4;
// Options that apply to all GPUs.
GPUOptions gpu_options = 6;
// Options that apply to pluggable devices.
GPUOptions pluggable_device_options = 18;
// Whether soft placement is allowed. If allow_soft_placement is true,
// an op will be placed on CPU if
// 1. there's no GPU implementation for the op,
// or
// 2. no GPU devices are known or registered,
// or
// 3. it needs to be co-located with reftype input(s) which are from CPU.
bool allow_soft_placement = 7;
// Whether device placements should be logged.
bool log_device_placement = 8;
// Options that apply to all graphs.
GraphOptions graph_options = 10;
// Global timeout for all blocking operations in this session. If non-zero,
// and not overridden on a per-operation basis, this value will be used as the
// deadline for all blocking operations.
int64 operation_timeout_in_ms = 11;
// Options that apply when this session uses the distributed runtime.
RPCOptions rpc_options = 13;
// Optional list of all workers to use in this session.
ClusterDef cluster_def = 14;
// If true, any resources such as Variables used in the session will not be
// shared with other sessions. However, when clusterspec propagation is
// enabled, this field is ignored and sessions are always isolated.
bool isolate_session_state = 15;
// When true, WorkerSessions are created with device attributes from the
// full cluster.
// This is helpful when a worker wants to partition a graph
// (for example during a PartitionedCallOp).
bool share_cluster_devices_in_session = 17;
// Everything inside Experimental is subject to change and is not subject
// to API stability guarantees in
// https://www.tensorflow.org/guide/versions.
message Experimental {
// Task name for group resolution.
string collective_group_leader = 1;
// We removed the flag client_handles_error_formatting. Marking the tag
// number as reserved.
// TODO(shikharagarwal): Should we just remove this tag so that it can be
// used in future for other purpose?
reserved 2;
// Which executor to use. The default executor will be used
// if this is an empty string or "DEFAULT".
string executor_type = 3;
// Guidance for formatting large RecvBuf fields for transfer.
// Any positive value sets the max chunk size. 0 defaults to 4096.
// Any negative value indicates no max, i.e. one chunk only.
int32 recv_buf_max_chunk = 4;
// If true, and supported by the platform, the runtime will attempt to
// use NUMA affinity where applicable. One consequence will be the
// existence of as many CPU devices as there are available NUMA nodes.
bool use_numa_affinity = 5;
// If true, make collective op execution order sequential and deterministic
// for potentially concurrent collective instances.
bool collective_deterministic_sequential_execution = 6;
// If true, use NCCL for CollectiveOps. This feature is highly
// experimental.
bool collective_nccl = 7;
// In the following, session state means the value of a variable, elements
// in a hash table, or any other resource, accessible by worker sessions
// held by a TF server.
//
// When ClusterSpec propagation is enabled, the value of
// isolate_session_state is ignored when deciding whether to share session
// states in a TF server (for backwards compatibility reasons).
// - If share_session_state_in_clusterspec_propagation is true, the session
// states are shared.
// - If share_session_state_in_clusterspec_propagation is false, session
// states are isolated.
//
// When clusterspec propagation is not used, the value of
// share_session_state_in_clusterspec_propagation is ignored when deciding
// whether to share session states in a TF server.
// - If isolate_session_state is true, session states are isolated.
// - If isolate_session_state is false, session states are shared.
//
// TODO(b/129330037): Add a single API that consistently treats
// isolate_session_state and ClusterSpec propagation.
bool share_session_state_in_clusterspec_propagation = 8;
// If using a direct session, disable spinning while waiting for work in
// the thread pool. This may result in higher latency for completing ops,
// but in cases where there is a lot of spinning it may result in lower
// CPU usage.
bool disable_thread_spinning = 9;
// This was promoted to a non-experimental API. Please use
// ConfigProto.share_cluster_devices_in_session instead.
bool share_cluster_devices_in_session = 10;
// Metadata about the session.
//
// If set, this can be used by the runtime and the Ops for debugging,
// monitoring, etc.
//
// NOTE: This is currently used and propagated only by the direct session
// and EagerContext.
SessionMetadata session_metadata = 11;
// If true, the session may treat the graph as being static for optimization
// purposes.
//
// If this option is set to true when a session is created, the full
// GraphDef must be passed in a single call to Session::Create(), and
// Session::Extend() may not be supported.
bool optimize_for_static_graph = 12;
// Whether to enable the MLIR-based TF->XLA bridge. This is only used if set
// to true; the default value or false is ignored. Use mlir_bridge_rollout for
// finer control.
//
// If this option is set to true when a session is created, MLIR is used to
// perform the set of graph transformations to put the graph in a form that
// can be executed with delegation of some computations to an accelerator.
// This builds on the model of XLA where a subset of the graph is
// encapsulated and attached to a "compile" operation, whose result is fed
// to an "execute" operation. The kernel for these operations is responsible
// for lowering the encapsulated graph to a particular device.
bool enable_mlir_bridge = 13;
// An enum that describes the state of the MLIR bridge rollout.
enum MlirBridgeRollout {
// If this field is left unspecified, the MLIR bridge may be selectively
// enabled on a per graph basis.
MLIR_BRIDGE_ROLLOUT_UNSPECIFIED = 0;
// Enabling the MLIR bridge enables it for all graphs in this session.
MLIR_BRIDGE_ROLLOUT_ENABLED = 1;
// Disabling the MLIR bridge disables it for all graphs in this session.
MLIR_BRIDGE_ROLLOUT_DISABLED = 2;
reserved 3, 4;
reserved "MLIR_BRIDGE_ROLLOUT_SAFE_MODE_ENABLED",
"MLIR_BRIDGE_ROLLOUT_SAFE_MODE_FALLBACK_ENABLED";
}
// Whether to enable the MLIR-based TF->XLA bridge.
MlirBridgeRollout mlir_bridge_rollout = 17;
// Whether to enable the MLIR-based Graph optimizations.
//
// This will become a part of the standard TensorFlow graph optimization
// pipeline; currently it is only used for gradual migration and testing of
// new passes that are replacing existing optimizations in Grappler.
bool enable_mlir_graph_optimization = 16;
// If true, the session will not store an additional copy of the graph for
// each subgraph.
//
// If this option is set to true when a session is created, the
// `RunOptions.output_partition_graphs` options must not be set.
bool disable_output_partition_graphs = 14;
// Minimum number of batches run through the XLA graph before XLA fusion
// autotuner is enabled. Default value of zero disables the autotuner.
//
// The XLA fusion autotuner can improve performance by executing a heuristic
// search on the compiler parameters.
int64 xla_fusion_autotuner_thresh = 15;
// Whether runtime execution uses TFRT.
bool use_tfrt = 18;
// If true, use Pathways with TFRT API for multi host support.
bool enable_multi_host = 27;
// If true, use ifrt as the backend for TFRT. This is only used when
// `use_tfrt` is true.
bool tfrt_use_ifrt = 32;
// Port for the Pathways server. Ignored if enable_multi_host=false.
int32 backend_server_port = 28;
// If true, TFRT will use TPU specific compiler passes and perform TPU
// specific initialization.
bool target_tpu = 29;
// If true, TFRT will use GPU specific compiler passes and perform GPU
// specific initialization.
bool target_gpu = 30;
// The threshold for merging small streams in TFRT. Streams with cost
// smaller than the threshold will be merged. Setting it to 1
// disables all merges.
int32 stream_merge_threshold = 31;
// The field "coordination_service was previously specified as a string;
// this has been replaced with a message below.
reserved 19;
// We removed the flag fetch_remote_devices_in_multi_client. Marking the tag
// number as reserved.
reserved 20;
// Whether functional control flow op lowering should be disabled. This is
// useful when executing within a portable runtime where control flow op
// kernels may not be loaded due to selective registration.
bool disable_functional_ops_lowering = 21;
// Provides a hint to XLA auto clustering to prefer forming a single large
// cluster that encompasses most of the graph.
bool xla_prefer_single_graph_cluster = 22;
// Distributed coordination service configurations.
CoordinationServiceConfig coordination_config = 23;
// If true, the session will treat the graph as being non-static for
// optimization purposes.
//
// If this option is set to true when a session is created, the full
// GraphDef will be retained to enable calls to Session::Extend().
// Calling Extend() without setting this flag will result in errors.
//
// This option is meant to replace `optimize_for_static_graph` and it
// aims to negate its value.
bool disable_optimize_for_static_graph = 24;
// Whether eager remote execution will stream all the function calls or
// allow them to happen in parallel. When true, streaming execution is
// disabled, and parallel execution is allowed.
bool disable_eager_executor_streaming_enqueue = 26;
// If true, the function library runtime will be finalized when the session
// is finalized.
bool finalize_function_library_runtime = 33;
// If true, the resource manager will be finalized when the session
// is finalized.
bool finalize_resource_manager = 34;
reserved 25;
// Next: 35
}
Experimental experimental = 16;
// Next: 19
}
// Options for a single Run() call.
message RunOptions {
// TODO(pbar) Turn this into a TraceOptions proto which allows
// tracing to be controlled in a more orthogonal manner?
enum TraceLevel {
NO_TRACE = 0;
SOFTWARE_TRACE = 1;
HARDWARE_TRACE = 2;
FULL_TRACE = 3;
}
TraceLevel trace_level = 1;
// Time to wait for operation to complete in milliseconds.
int64 timeout_in_ms = 2;
// The thread pool to use, if session_inter_op_thread_pool is configured.
// To use the caller thread, set this to -1; this uses the caller thread
// to execute Session::Run() and thus avoids a context switch. Using the
// caller thread to execute Session::Run() should be done ONLY for simple
// graphs, where the overhead of an additional context switch is
// comparable with the overhead of Session::Run().
int32 inter_op_thread_pool = 3;
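// For illustration (a sketch), a Run call that executes on the caller's
// thread and collects a full trace could be described as:
// trace_level: FULL_TRACE
// inter_op_thread_pool: -1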
// Whether the partition graph(s) executed by the executor(s) should be
// output via RunMetadata.
bool output_partition_graphs = 5;
// EXPERIMENTAL. Options used to initialize DebuggerState, if enabled.
DebugOptions debug_options = 6;
// When enabled, causes tensor allocation information to be included in
// the error message when the Run() call fails because the allocator ran
// out of memory (OOM).
//
// Enabling this option can slow down the Run() call.
bool report_tensor_allocations_upon_oom = 7;
// Everything inside Experimental is subject to change and is not subject
// to API stability guarantees in
// https://www.tensorflow.org/guide/version_compat.
message Experimental {
// If non-zero, declares that this graph is going to use collective
// ops and must synchronize step_ids with any other graph with this
// same group_key value (in a distributed computation where tasks
// run disjoint graphs).
int64 collective_graph_key = 1;
// If true, then operations (using the inter-op pool) across all
// session::run() calls will be centrally scheduled, optimizing for (median
// and tail) latency.
// Consider using this option for CPU-bound workloads like inference.
bool use_run_handler_pool = 2;
// Options for run handler thread pool.
message RunHandlerPoolOptions {
// Priority of the request. The run handler thread pool will schedule ops
// based on the priority number. A larger number means higher priority.
int64 priority = 1;
}
RunHandlerPoolOptions run_handler_pool_options = 3;
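// For illustration (the priority value is an arbitrary example), enabling
// the run handler pool with an elevated priority:
// use_run_handler_pool: true
// run_handler_pool_options { priority: 2 }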
}
Experimental experimental = 8;
reserved 4;
}
// Metadata output (i.e., non-Tensor) for a single Run() call.
message RunMetadata {
// Statistics traced for this step. Populated if tracing is turned on via the
// "RunOptions" proto.
// EXPERIMENTAL: The format and set of events may change in future versions.
StepStats step_stats = 1;
// The cost graph for the computation defined by the run call.
CostGraphDef cost_graph = 2;
// Graphs of the partitions executed by executors.
repeated GraphDef partition_graphs = 3;
message FunctionGraphs {
// TODO(nareshmodi): Include some sort of function/cache-key identifier?
repeated GraphDef partition_graphs = 1;
GraphDef pre_optimization_graph = 2;
GraphDef post_optimization_graph = 3;
}
// This is only populated for graphs that are run as functions in TensorFlow
// V2. There will be an entry below for each function that is traced.
// The main use case of the post_optimization_graph and the partition_graphs
// is to give the caller insight into the graphs that were actually run by the
// runtime. Additional information (such as that in step_stats) will match
// these graphs.
// We also include the pre_optimization_graph since it is usually easier to
// read, and is helpful in situations where the caller wants to get a high
// level idea of what the built graph looks like (since the various graph
// optimization passes might change the structure of the graph significantly).
repeated FunctionGraphs function_graphs = 4;
// Metadata about the session.
SessionMetadata session_metadata = 5;
}
// Defines a connection between two tensors in a `GraphDef`.
message TensorConnection {
// A tensor name. The value of this tensor will be substituted for
// the tensor named in `to_tensor`.
string from_tensor = 1;
// A tensor name. The value of this tensor will be bound to the
// value of the tensor named in `from_tensor`.
string to_tensor = 2;
}
// Defines a subgraph in another `GraphDef` as a set of feed points and nodes
// to be fetched or executed.
//
// Compare with the arguments to `Session::Run()`.
message CallableOptions {
// Tensors to be fed in the callable. Each feed is the name of a tensor.
repeated string feed = 1;
// Fetches. A list of tensor names. The caller of the callable expects a
// tensor to be returned for each fetch[i] (see RunStepResponse.tensor). The
// order of specified fetches does not change the execution order.
repeated string fetch = 2;
// Target Nodes. A list of node names. The named nodes will be run by the
// callable but their outputs will not be returned.
repeated string target = 3;
// Options that will be applied to each run.
RunOptions run_options = 4;
// Tensors to be connected in the callable. Each TensorConnection denotes
// a pair of tensors in the graph, between which an edge will be created
// in the callable.
repeated TensorConnection tensor_connection = 5;
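// For illustration (the tensor names "c:0" and "a:0" are hypothetical),
// substituting the value of one tensor for another in the callable:
// tensor_connection {
// from_tensor: "c:0"
// to_tensor: "a:0"
// }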
// The Tensor objects fed in the callable and fetched from the callable
// are expected to be backed by host (CPU) memory by default.
//
// The options below allow changing that - feeding tensors backed by
// device memory, or returning tensors that are backed by device memory.
//
// The maps below map the name of a feed/fetch tensor (which appears in
// 'feed' or 'fetch' fields above), to the fully qualified name of the device
// owning the memory backing the contents of the tensor.
//
// For example, creating a callable with the following options:
//
// CallableOptions {
// feed: "a:0"
// feed: "b:0"
//
// fetch: "x:0"
// fetch: "y:0"
//
// feed_devices: {
// "a:0": "/job:localhost/replica:0/task:0/device:GPU:0"
// }
//
// fetch_devices: {
// "y:0": "/job:localhost/replica:0/task:0/device:GPU:0"
// }
// }
//
// means that the Callable expects:
// - The first argument ("a:0") is a Tensor backed by GPU memory.
// - The second argument ("b:0") is a Tensor backed by host memory.
// and of its return values:
// - The first output ("x:0") will be backed by host memory.
// - The second output ("y:0") will be backed by GPU memory.
//
// FEEDS:
// It is the responsibility of the caller to ensure that the memory of the fed
// tensors will be correctly initialized and synchronized before it is
// accessed by operations executed during the call to Session::RunCallable().
//
// This is typically ensured by using the TensorFlow memory allocators
// (Device::GetAllocator()) to create the Tensor to be fed.
//
// Alternatively, for CUDA-enabled GPU devices, this typically means that the
// operation that produced the contents of the tensor has completed, i.e., the
// CUDA stream has been synchronized (e.g., via cuCtxSynchronize() or
// cuStreamSynchronize()).
map<string, string> feed_devices = 6;
map<string, string> fetch_devices = 7;
// By default, RunCallable() will synchronize the GPU stream before returning
// fetched tensors on a GPU device, to ensure that the values in those tensors
// have been produced. This simplifies interacting with the tensors, but
// potentially incurs a performance hit.
//
// If this option is set to true, the caller is responsible for ensuring
// that the values in the fetched tensors have been produced before they are
// used. The caller can do this by invoking `Device::Sync()` on the underlying
// device(s), or by feeding the tensors back to the same Session using
// `feed_devices` with the same corresponding device name.
bool fetch_skip_sync = 8;
// Next: 9
}
message BatchingOptions {