本文目的是以Tensorfl中的MatMul这个具有代表性又比较简单的ops为例介绍一下TensorFlow中的图的节点是怎么实现的。我个人认为TensorFlow中的ops是整个TensorFlow的核心,如果理解了这个,那么对TensorFlow就有了比较深的认识。
在阅读这段代码前看一下官方文档中的添加新的op会很有帮助:
中文翻译:http://www.tensorfly.cn/tfdoc/how_tos/adding_an_op.html
英文原文:http://www.tensorflow.org/how_tos/adding_an_op/index.html#adding-a-new-op
好了对添加一个新的op的方法有一定的了解后我们看真实的可实用的例子:MatMul
一个ops其实包含两个计算节点,一个是正向计算节点,一个是梯度计算节点。
正向计算节点在源代码的:core/kernels/matmul_op.cc这个文件里面。
/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
// See docs in ../ops/math_ops.cc.
#define EIGEN_USE_THREADS
#include "tensorflow/core/kernels/matmul_op.h"
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/kernels/fill_functor.h"
#if GOOGLE_CUDA
#include "cuda/include/cuda.h"
#include "tensorflow/core/platform/stream_executor.h"
#endif // GOOGLE_CUDA
namespace tensorflow {
#if GOOGLE_CUDA
namespace {
template <typename T>
perftools::gputools::DeviceMemory<T> AsDeviceMemory(const T* cuda_memory) {
perftools::gputools::DeviceMemoryBase wrapped(const_cast<T*>(cuda_memory));
perftools::gputools::DeviceMemory<T> typed(wrapped);
return typed;
}
} // namespace
#endif // GOOGLE_CUDA
typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;
#ifdef TENSORFLOW_USE_SYCL
typedef Eigen::SyclDevice SYCLDevice;
#endif // TENSORFLOW_USE_SYCL
template <typename Device, typename T, bool USE_CUBLAS>
struct LaunchMatMul;
namespace {
// Converts a TensorFlow Tensor to an Eigen Matrix.
template <typename T>
Eigen::Map<
const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>
ToEigenMatrix(const Tensor& tensor) {
auto matrix = tensor.matrix<T>();
return Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>::Map(
matrix.data(), matrix.dimension(0), matrix.dimension(1));
}
// Converts a TensorFlow Tensor to an Eigen Vector.
template <typename T>
Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, 1>> ToEigenVector(Tensor* tensor) {
auto v = tensor->flat<T>();
return Eigen::Matrix<T, Eigen::Dynamic, 1>::Map(v.data(), v.dimension(0));
}
template <typename T>
Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, 1>> ToEigenVector(
const Tensor& tensor) {
auto v = tensor.flat<T>();
return Eigen::Matrix<T, Eigen::Dynamic, 1>::Map(v.data(), v.dimension(0));
}
} // namespace
// If either side can be represented as a vector, do an explicit vector
// matrix multiply and return true; else return false.
//
// Note: this uses plain Eigen and not Eigen Tensor because it is more
// efficient.
template <typename T>
bool ExplicitVectorMatrixOptimization(
const Tensor& a, const Tensor& b,
const Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1>& dim_pair,
Tensor* out) {
if (out->dim_size(0) == 1) {
if (dim_pair[0].second == 0) {
// Note: this case is optimized in Eigen Tensors.
return false;
} else {
auto out_v = ToEigenVector<T>(out);
auto a_v = ToEigenVector<T>(a);
auto b_m = ToEigenMatrix<T>(b);
out_v.noalias() = b_m * a_v;
}
return true;
} else if (out->dim_size(1) == 1) {
auto out_v = ToEigenVector<T>(out);
auto a_m = ToEigenMatrix<T>(a);
auto b_v = ToEigenVector<T>(b);
if (dim_pair[0].first == 0) {
out_v.noalias() = a_m.transpose() * b_v;
} else {
out_v.noalias() = a_m * b_v;
}
return true;
}
return false;
}
// Half is not supported.
template <>
bool ExplicitVectorMatrixOptimization<Eigen::half>(
const Tensor& a, const Tensor& b,
const Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1>& dim_pair,
Tensor* out) {
return false;
}
template <typename Device, typename T>
struct LaunchMatMulBase {
static void launch(
OpKernelContext* ctx, OpKernel* kernel, const Tensor& a, const Tensor& b,
const Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1>& dim_pair,
Tensor* out) {
#ifndef TENSORFLOW_USE_SYCL
// An explicit vector-matrix multiply is much better optimized than an
// implicit one and this is a bottleneck during non-batched inference.
bool was_vector = ExplicitVectorMatrixOptimization<T>(a, b, dim_pair, out);
if (!was_vector) {
#endif // TENSORFLOW_USE_SYCL
functor::MatMulFunctor<