Commit 0e30fa6

Wei Wen authored and facebook-github-bot committed
Faster random number generation in fused_rowwise_random_quantization_ops (pytorch#10634)
Summary: Pull Request resolved: pytorch#10634

```
Trying example: test_speed_of_rand_quantization(self=<caffe2.caffe2.python.operator_test.rand_quantization_op_speed_test.TestSpeedFloatToFusedRandRowwiseQuantized testMethod=test_speed_of_rand_quantization>, bitwidth_=2, random_=True, data_shape_=array([1024, 1224]), gc=, dc=[, device_type: 1])
Sub+Scale+Sum time: 1.9944190979003908 ms
Quantizing time: 2.080512046813965 ms (1.0431669296609765X)
De-quantizing time: 0.7375001907348633 ms (0.36978195380863577X)
```

```
Trying example: test_speed_of_rand_quantization(self=<caffe2.caffe2.python.operator_test.rand_quantization_op_speed_test.TestSpeedFloatToFusedRandRowwiseQuantized testMethod=test_speed_of_rand_quantization>, bitwidth_=1, random_=True, data_shape_=array([1024, 1224]), gc=device_type: 1, dc=[, device_type: 1])
Sub+Scale+Sum time: 1.6691923141479492 ms
Quantizing time: 7.500243186950684 ms (4.493336761366071X)
De-quantizing time: 1.1209726333618164 ms (0.6715658967876477X)
```

Reviewed By: jspark1105

Differential Revision: D8849770

fbshipit-source-id: 2bb2bac7e633f647f38e419ce980b8958f3bcae2
1 parent 754ec9e commit 0e30fa6

File tree: 5 files changed, +722 −217 lines


caffe2/operators/fused_rowwise_random_quantization_ops.cc (+106 lines)
@@ -1,7 +1,113 @@

```
#include "caffe2/operators/fused_rowwise_random_quantization_ops.h"
#include "caffe2/core/registry.h"
#include "caffe2/utils/math.h"

namespace caffe2 {

#define IS_LITTLE_ENDIAN                                        \
  [] {                                                          \
    const int32_t kValue = 1;                                   \
    return reinterpret_cast<const uint8_t*>(&kValue)[0] == 1;   \
  }()

template <class Context>
bool FloatToFusedRandRowwiseQuantizedOp<Context>::RunOnDevice() {
  CAFFE_ENFORCE(IS_LITTLE_ENDIAN, "Unsupported endianness");

  const auto& input = Input(DATA_FLOAT);
  auto* output = Output(DATA_FUSED_QUANTIZED);

  CAFFE_ENFORCE_EQ(
      input.ndim(),
      2,
      "Expect input to be a matrix. Reshape the input tensor to a matrix for usage.");

  const auto input_rows = input.dim(0);
  const auto input_columns = input.dim(1);

  // The "fused" representation stores [bitwidth][tail][min][max] together
  // with the row-wise quantized data in one tensor. Since 8/bitwidth
  // quantized values are packed into one byte, the last byte of a row may
  // have unused buckets; in total, `tail` buckets are unused.
  // We encode *bitwidth* and *tail* at the beginning of each row, followed
  // by 32-bit floating-point values representing min and max:
  // | bitwidth | tail | min | max | ... int8 data ... |
  // |    1B    |  1B  |  4B |  4B | ...output_data... |
  // In output_data, the b-th bucket of the i-th byte stores
  // the i-th value of the b-th segment of the input row.
  size_t data_per_byte = 8 / bitwidth_;
  // Number of quantized-data bytes per output row (excluding the 10-byte header)
  size_t segment_size = (input_columns + data_per_byte - 1) / data_per_byte;
  const std::vector<TIndex> output_dimensions = {
      input_rows, 10 + static_cast<TIndex>(segment_size)};
  output->Resize(output_dimensions);

  const auto* input_data = input.template data<float>();
  auto* output_data = output->template mutable_data<uint8_t>();
  const size_t output_columns = static_cast<size_t>(output->dim(1));
  memset(output_data, 0, output->size());

  if (random_) {
#ifdef FUSED_ROWWISE_RANDOM_QUANTIZATION_USE_MKL
    random_buffer_.resize(input_columns);
#endif
  }

  for (size_t row = 0; row < input_rows; ++row) {
    math::quantize_and_compress(
        input_data + row * input_columns,
        output_data + row * output_columns,
        input_columns,
        bitwidth_,
        random_,
#ifdef FUSED_ROWWISE_RANDOM_QUANTIZATION_USE_MKL
        vslStream_,
        random_buffer_
#else
        dis_,
        gen_
#endif
    );
  }

  return true;
}

template <class Context>
bool FusedRandRowwiseQuantizedToFloatOp<Context>::RunOnDevice() {
  CAFFE_ENFORCE(IS_LITTLE_ENDIAN, "Unsupported endianness");

  const auto& input = Input(DATA_FUSED_QUANTIZED);
  auto* output = Output(DATA_FLOAT);
  CAFFE_ENFORCE_EQ(input.ndim(), 2, "Expect input to be a matrix.");
  CAFFE_ENFORCE_GE(
      input.size(), 4, "Expect input to have size greater than or equal to 4.");

  const auto input_rows = input.dim(0);
  const auto input_columns = input.dim(1);
  const auto* input_data = input.template data<uint8_t>();
  const size_t bitwidth = input_data[0];
  CAFFE_ENFORCE(
      bitwidth == 1 || bitwidth == 2 || bitwidth == 4 || bitwidth == 8,
      "Unsupported bitwidth");
  const size_t tail = input_data[1];
  const size_t output_columns = (input_columns - 10) * (8 / bitwidth) - tail;
  const std::vector<TIndex> output_dimensions = {
      input_rows, static_cast<TIndex>(output_columns)};
  output->Resize(output_dimensions);
  auto* output_data = output->template mutable_data<float>();
  for (size_t row = 0; row < input_rows; ++row) {
    math::decompress_and_dequantize(
        input_data + row * input_columns,
        output_data + row * output_columns,
        input_columns);
  }

  return true;
}

#undef IS_LITTLE_ENDIAN

REGISTER_CPU_OPERATOR(
    FloatToFusedRandRowwiseQuantized,
    FloatToFusedRandRowwiseQuantizedOp<CPUContext>);
```
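For reference, here is a minimal standalone sketch (not part of the commit) that mirrors the row-size arithmetic used by the two RunOnDevice() implementations above: the fused row width computed on the quantization side, and the original column count recovered on the dequantization side. The 1224-column example merely echoes the 1024 x 1224 shape from the benchmark in the summary.

```
// Standalone sketch (not part of the commit): mirrors the size arithmetic
// of the quantize/dequantize ops above for one row of data.
#include <cstddef>
#include <cstdio>

int main() {
  const size_t input_columns = 1224; // one row of the 1024 x 1224 benchmark shape

  for (size_t bitwidth : {1, 2, 4, 8}) {
    // Quantization side: how many bytes does one fused row occupy?
    const size_t data_per_byte = 8 / bitwidth;
    const size_t segment_size =
        (input_columns + data_per_byte - 1) / data_per_byte; // ceil division
    const size_t fused_columns = 10 + segment_size; // 1B bitwidth + 1B tail + 4B min + 4B max
    const size_t tail = segment_size * data_per_byte - input_columns; // unused buckets

    // Dequantization side: recover the original column count from the header.
    const size_t recovered_columns = (fused_columns - 10) * (8 / bitwidth) - tail;

    std::printf(
        "bitwidth=%zu: fused row = %zu bytes, tail = %zu, recovered columns = %zu\n",
        bitwidth, fused_columns, tail, recovered_columns);
  }
  return 0;
}
```

Running it for the supported bitwidths (1, 2, 4, 8) confirms that (fused_columns - 10) * (8 / bitwidth) - tail round-trips back to the original column count.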
