Commit 4dffee7

Author: Vijay Vasudevan
TensorFlow: Minor updates to docs, BUILD, GPU config / perf, etc.
Changes:
- Updates to op documentation and index by Josh.
- More changes to BUILD files for Python 3 support by @girving.
- Fix to Eigen to use DenseIndex everywhere by @jiayq.
- Enable configuration of the Cuda compute capability by @zheng-xq, including updates to docs.
- Route the aggregation method through the optimizer by schuster.
- Updates to install instructions for Bazel 0.1.1.

Base CL: 107702099
Parent: f2102f4


48 files changed: +811 -694 lines. (Only a subset of the 48 changed files is shown below.)

configure (+64)

@@ -76,6 +76,70 @@ CUDA_TOOLKIT_PATH="$CUDA_TOOLKIT_PATH"
 CUDNN_INSTALL_PATH="$CUDNN_INSTALL_PATH"
 EOF
 
+function UnofficialSetting() {
+  echo -e "\nWARNING: You are configuring unofficial settings in TensorFlow. Because some external libraries are not backward compatible, these settings are largely untested and unsupported. \n"
+
+  # Configure the compute capabilities that TensorFlow builds for.
+  # Since the Cuda toolkit is not backward-compatible, this is not guaranteed to work.
+  while true; do
+    fromuser=""
+    if [ -z "$TF_CUDA_COMPUTE_CAPABILITIES" ]; then
+      cat << EOF
+Please specify a list of comma-separated Cuda compute capabilities you want to build with.
+You can find the compute capability of your device at: https://developer.nvidia.com/cuda-gpus.
+Please note that each additional compute capability significantly increases your build time and binary size.
+EOF
+      read -p "[Default is: \"3.5,5.2\"]: " TF_CUDA_COMPUTE_CAPABILITIES
+      fromuser=1
+    fi
+    # Check whether all capabilities from the input are valid.
+    COMPUTE_CAPABILITIES=${TF_CUDA_COMPUTE_CAPABILITIES//,/ }
+    ALL_VALID=1
+    for CAPABILITY in $COMPUTE_CAPABILITIES; do
+      if [[ ! "$CAPABILITY" =~ [0-9]+.[0-9]+ ]]; then
+        echo "Invalid compute capability: " $CAPABILITY
+        ALL_VALID=0
+        break
+      fi
+    done
+    if [ "$ALL_VALID" == "0" ]; then
+      if [ -z "$fromuser" ]; then
+        exit 1
+      fi
+    else
+      break
+    fi
+    TF_CUDA_COMPUTE_CAPABILITIES=""
+  done
+
+  if [ ! -z "$TF_CUDA_COMPUTE_CAPABILITIES" ]; then
+    export WARNING="Unofficial setting. DO NOT"" SUBMIT!!!"
+    function CudaGenCodeOpts() {
+      OUTPUT=""
+      for CAPABILITY in $@; do
+        OUTPUT=${OUTPUT}" \"${CAPABILITY}\", "
+      done
+      echo $OUTPUT
+    }
+    export CUDA_GEN_CODES_OPTS=$(CudaGenCodeOpts ${TF_CUDA_COMPUTE_CAPABILITIES//,/ })
+    perl -pi -0 -e 's,\n( *)([^\n]*supported_cuda_compute_capabilities\s*=\s*\[).*?(\]),\n\1# $ENV{WARNING}\n\1\2$ENV{CUDA_GEN_CODES_OPTS}\3,s' third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_is_not_gcc
+    function CudaVersionOpts() {
+      OUTPUT=""
+      for CAPABILITY in $@; do
+        OUTPUT=$OUTPUT"CudaVersion(\"${CAPABILITY}\"), "
+      done
+      echo $OUTPUT
+    }
+    export CUDA_VERSION_OPTS=$(CudaVersionOpts ${TF_CUDA_COMPUTE_CAPABILITIES//,/ })
+    perl -pi -0 -e 's,\n( *)([^\n]*supported_cuda_compute_capabilities\s*=\s*\{).*?(\}),\n\1// $ENV{WARNING}\n\1\2$ENV{CUDA_VERSION_OPTS}\3,s' tensorflow/core/common_runtime/gpu/gpu_device.cc
+  fi
+}
+
+# Only run the unofficial settings when users explicitly choose to.
+if [ "$TF_UNOFFICIAL_SETTING" == "1" ]; then
+  UnofficialSetting
+fi
+
 # Invoke the cuda_config.sh and set up the TensorFlow's canonical view of the Cuda libraries
 (cd third_party/gpus/cuda; ./cuda_config.sh;) || exit -1
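
For a concrete (hypothetical) illustration of what the second perl substitution produces: running the script with TF_UNOFFICIAL_SETTING=1 and entering, say, "3.0,5.2" at the prompt would rewrite the capability list in tensorflow/core/common_runtime/gpu/gpu_device.cc to roughly the following (the real substitution emits it on a single line; whitespace here is for readability):

    // Illustrative result only; the actual values depend on what the user enters.
    // Unofficial setting. DO NOT SUBMIT!!!
    std::vector<CudaVersion> supported_cuda_compute_capabilities = {
        CudaVersion("3.0"), CudaVersion("5.2"),};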

six.BUILD (+1)

@@ -9,4 +9,5 @@ py_library(
     name = "six",
     srcs = ["six.py"],
     visibility = ["//visibility:public"],
+    srcs_version = "PY2AND3",
 )

tensorflow/core/common_runtime/executor.cc (+25)

@@ -294,6 +294,31 @@ Status ExecutorImpl::InferAllocAttr(
     const DeviceNameUtils::ParsedName& local_dev_name,
     AllocatorAttributes* attr) {
   Status s;
+  // Note that it's possible for *n to be a Recv and *dst to be a Send,
+  // so these two cases are not mutually exclusive.
+  if (IsRecv(n)) {
+    string src_name;
+    s = GetNodeAttr(n->def(), "send_device", &src_name);
+    if (!s.ok()) return s;
+    DeviceNameUtils::ParsedName parsed_src_name;
+    if (!DeviceNameUtils::ParseFullName(src_name, &parsed_src_name)) {
+      s = errors::Internal("Bad send_device attr '", src_name, "' in node ",
+                           n->name());
+      return s;
+    }
+    if (!DeviceNameUtils::IsSameAddressSpace(parsed_src_name, local_dev_name)) {
+      // Value is going to be the sink of an RPC.
+      attr->set_nic_compatible(true);
+      VLOG(2) << "node " << n->name() << " is the sink of an RPC in";
+    } else if (local_dev_name.type == "CPU" && parsed_src_name.type == "GPU") {
+      // Value is going to be the sink of a local DMA from GPU to CPU.
+      attr->set_gpu_compatible(true);
+      VLOG(2) << "node " << n->name() << " is the sink of a gpu->cpu copy";
+    } else {
+      VLOG(2) << "default alloc case local type " << local_dev_name.type
+              << " remote type " << parsed_src_name.type;
+    }
+  }
   if (IsSend(dst)) {
     string dst_name;
     s = GetNodeAttr(dst->def(), "recv_device", &dst_name);
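
The Recv branch added here is essentially a small decision table: a tensor arriving over RPC needs NIC-friendly memory, a GPU-to-CPU copy within one process needs GPU-friendly (e.g. pinned) memory, and everything else uses the default allocator. A minimal standalone sketch of that logic follows; Device, AllocAttrs, and DecideRecvAllocAttrs are simplified stand-ins invented for this sketch, not TensorFlow types:

    #include <iostream>
    #include <string>

    // Hypothetical, pared-down stand-ins for the types involved.
    struct Device { std::string address_space; std::string type; };
    struct AllocAttrs { bool nic_compatible = false; bool gpu_compatible = false; };

    // Mirrors the new Recv branch in InferAllocAttr.
    AllocAttrs DecideRecvAllocAttrs(const Device& src, const Device& local) {
      AllocAttrs attr;
      if (src.address_space != local.address_space) {
        attr.nic_compatible = true;   // value is the sink of an RPC
      } else if (local.type == "CPU" && src.type == "GPU") {
        attr.gpu_compatible = true;   // sink of a local GPU->CPU DMA
      }
      return attr;
    }

    int main() {
      // Same process, GPU -> CPU: gpu_compatible.
      AllocAttrs a = DecideRecvAllocAttrs({"proc0", "GPU"}, {"proc0", "CPU"});
      // Different process: nic_compatible.
      AllocAttrs b = DecideRecvAllocAttrs({"proc1", "GPU"}, {"proc0", "CPU"});
      std::cout << a.gpu_compatible << " " << b.nic_compatible << "\n";  // 1 1
    }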

tensorflow/core/common_runtime/gpu/gpu_device.cc (+50 -7)

@@ -8,6 +8,7 @@
 
 #include <stdlib.h>
 #include <string.h>
+#include <algorithm>
 
 //#include "base/commandlineflags.h"
 #include "tensorflow/stream_executor/cuda/cuda_activation.h"
@@ -590,10 +591,50 @@ static int GetMinGPUMultiprocessorCount() {
   return kDefaultMinGPUMultiprocessorCount;
 }
 
+namespace {
+
+struct CudaVersion {
+  // Initialize from version_name in the form of "3.5"
+  explicit CudaVersion(const std::string& version_name) {
+    size_t dot_pos = version_name.find('.');
+    CHECK(dot_pos != string::npos);
+    string major_str = version_name.substr(0, dot_pos);
+    CHECK(strings::safe_strto32(major_str.c_str(), &major_part));
+    string minor_str = version_name.substr(dot_pos + 1);
+    CHECK(strings::safe_strto32(minor_str.c_str(), &minor_part));
+  }
+  CudaVersion() {}
+  bool operator<(const CudaVersion& other) const {
+    if (this->major_part != other.major_part) {
+      return this->major_part < other.major_part;
+    }
+    return this->minor_part < other.minor_part;
+  }
+  friend std::ostream& operator<<(std::ostream& os,
+                                  const CudaVersion& version) {
+    os << version.major_part << "." << version.minor_part;
+    return os;
+  }
+  int major_part = -1;
+  int minor_part = -1;
+};
+
+// "configure" uses the specific name to substitute the following string.
+// If you change it, make sure you modify "configure" as well.
+std::vector<CudaVersion> supported_cuda_compute_capabilities = {
+    CudaVersion("3.5"), CudaVersion("5.2")};
+
+}  // namespace
+
 void BaseGPUDeviceFactory::GetValidDeviceIds(std::vector<int>* ids) {
   auto gpu_manager = GPUMachineManager();
   int min_gpu_core_count = GetMinGPUMultiprocessorCount();
   if (gpu_manager) {
+    CHECK(!supported_cuda_compute_capabilities.empty());
+    CudaVersion min_supported_capability =
+        *std::min_element(supported_cuda_compute_capabilities.begin(),
+                          supported_cuda_compute_capabilities.end());
+
     auto visible_device_count = gpu_manager->VisibleDeviceCount();
     for (int i = 0; i < gpu_manager->VisibleDeviceCount(); ++i) {
       auto exec_status = gpu_manager->ExecutorForDevice(i);
@@ -602,17 +643,19 @@ void BaseGPUDeviceFactory::GetValidDeviceIds(std::vector<int>* ids) {
       }
       gpu::StreamExecutor* se = exec_status.ValueOrDie();
       const gpu::DeviceDescription& desc = se->GetDeviceDescription();
-      int major, minor;
-      if (!desc.cuda_compute_capability(&major, &minor)) {
+      CudaVersion device_capability;
+      if (!desc.cuda_compute_capability(&device_capability.major_part,
+                                        &device_capability.minor_part)) {
         continue;
       }
-      // Only consider GPUs with compute capability >= 3.5 (Kepler or
-      // higher)
-      if (major < 3 || (major == 3 && minor < 5)) {
+      // Only GPUs with at least the minimum supported compute capability
+      // are accepted.
+      if (device_capability < min_supported_capability) {
         LOG(INFO) << "Ignoring gpu device "
                   << "(" << GetShortDeviceDescription(i, desc) << ") "
-                  << "with Cuda compute capability " << major << "." << minor
-                  << ". The minimum required Cuda capability is 3.5.";
+                  << "with Cuda compute capability " << device_capability
+                  << ". The minimum required Cuda capability is "
+                  << min_supported_capability << ".";
         continue;
       }
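
As a sanity check on the ordering logic, here is a standalone sketch using a pared-down copy of the comparison (Version is a stand-in for CudaVersion; this is not the TensorFlow build): std::min_element picks the lowest configured capability, and any device below it takes the LOG(INFO) skip path above.

    #include <algorithm>
    #include <iostream>
    #include <vector>

    // Pared-down copy of the CudaVersion comparison from the diff above.
    struct Version {
      int major_part;
      int minor_part;
      bool operator<(const Version& o) const {
        if (major_part != o.major_part) return major_part < o.major_part;
        return minor_part < o.minor_part;
      }
    };

    int main() {
      std::vector<Version> supported = {{3, 5}, {5, 2}};   // default "3.5,5.2"
      Version min_supported =
          *std::min_element(supported.begin(), supported.end());
      Version device = {3, 0};  // e.g. a card below the configured minimum
      std::cout << (device < min_supported ? "rejected" : "accepted") << "\n";
      // Prints "rejected": 3.0 < 3.5, mirroring the skip path.
    }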

tensorflow/core/framework/rendezvous.cc (+1 -1)

@@ -188,9 +188,9 @@ class LocalRendezvousImpl : public Rendezvous {
     // message arrives.
     Item* item = new Item;
     item->waiter = done;
+    item->recv_alloc_attrs = recv_args.alloc_attrs;
     if (recv_args.device_context) {
       item->recv_dev_context = recv_args.device_context;
-      item->recv_alloc_attrs = recv_args.alloc_attrs;
       item->recv_dev_context->Ref();
     }
     CHECK(table_.insert({key, item}).second);

tensorflow/core/framework/tensor_slice.h (+6 -5)

@@ -98,9 +98,10 @@ class TensorSlice {
   // We allow NDIMS to be greater than dims(), in which case we will pad the
   // higher dimensions with trivial dimensions.
   template <int NDIMS>
-  void FillIndicesAndSizes(const TensorShape& shape,
-                           Eigen::DSizes<ptrdiff_t, NDIMS>* indices,
-                           Eigen::DSizes<ptrdiff_t, NDIMS>* sizes) const;
+  void FillIndicesAndSizes(
+      const TensorShape& shape,
+      Eigen::DSizes<Eigen::DenseIndex, NDIMS>* indices,
+      Eigen::DSizes<Eigen::DenseIndex, NDIMS>* sizes) const;
 
   // Interaction with other TensorSlices.
 
@@ -162,8 +163,8 @@ class TensorSlice {
 
 template <int NDIMS>
 void TensorSlice::FillIndicesAndSizes(
-    const TensorShape& shape, Eigen::DSizes<ptrdiff_t, NDIMS>* indices,
-    Eigen::DSizes<ptrdiff_t, NDIMS>* sizes) const {
+    const TensorShape& shape, Eigen::DSizes<Eigen::DenseIndex, NDIMS>* indices,
+    Eigen::DSizes<Eigen::DenseIndex, NDIMS>* sizes) const {
   CHECK_EQ(shape.dims(), dims()) << "Incompatible dimensions between shape "
                                  << "slices: shape = " << shape.DebugString()
                                  << ", slice = " << DebugString();

tensorflow/core/kernels/concat_op_gpu.cu.cc (+2 -2)

@@ -18,9 +18,9 @@ void ConcatGPU(const GPUDevice& d,
     const std::vector<
         std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>& inputs,
     typename TTypes<T, 2>::Matrix* output) {
-  Eigen::array<ptrdiff_t, 2> offset(0, 0);
+  Eigen::array<Eigen::DenseIndex, 2> offset(0, 0);
   for (int i = 0; i < inputs.size(); ++i) {
-    Eigen::array<ptrdiff_t, 2> size = inputs[i]->dimensions();
+    Eigen::array<Eigen::DenseIndex, 2> size = inputs[i]->dimensions();
     output->slice(offset, size).device(d) = *inputs[i];
     offset[1] += size[1];
   }
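
For background on the type swap, a small self-contained example (plain Eigen Tensor code on the CPU, not TensorFlow's kernels) that does the same offset/size bookkeeping as ConcatGPU using Eigen::DenseIndex, the index type Eigen's Tensor module expects for slice offsets and extents:

    #include <iostream>
    #include <unsupported/Eigen/CXX11/Tensor>

    int main() {
      // Two 2x2 inputs concatenated along dimension 1 into a 2x4 output,
      // mirroring the offset/size bookkeeping in ConcatGPU above.
      Eigen::Tensor<float, 2> a(2, 2), b(2, 2), out(2, 4);
      a.setConstant(1.0f);
      b.setConstant(2.0f);

      Eigen::array<Eigen::DenseIndex, 2> offset = {0, 0};
      Eigen::array<Eigen::DenseIndex, 2> size = {2, 2};
      out.slice(offset, size) = a;
      offset[1] += size[1];  // advance along the concat dimension
      out.slice(offset, size) = b;

      std::cout << out << "\n";  // two columns of 1s, then two columns of 2s
    }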
