GuWei007
diff --git a/‎caffe2/operators/fused_rowwise_random_quantization_ops.cc
+106 b/‎caffe2/operators/fused_rowwise_random_quantization_ops.cc
+106
@@ -1,7 +1,113 @@
 #include "caffe2/operators/fused_rowwise_random_quantization_ops.h"
 #include "caffe2/core/registry.h"
+#include "caffe2/utils/math.h"
 
 namespace caffe2 {
+
+// Evaluates to true when the first (lowest-addressed) byte of a 32-bit
+// integer holding the value 1 is itself 1, i.e. when the host is
+// little-endian. The probe runs inside an immediately-invoked lambda, so the
+// macro can be used directly as an expression.
+#define IS_LITTLE_ENDIAN                                       \
+  [] {                                                         \
+    const int32_t probe = 1;                                   \
+    const auto* lowest_byte =                                  \
+        reinterpret_cast<const uint8_t*>(&probe);              \
+    return *lowest_byte == 1;                                  \
+  }()
+
+// Quantizes each row of the 2-D float input (DATA_FLOAT) and writes the
+// result, in "fused" row-wise form, to the uint8 output
+// (DATA_FUSED_QUANTIZED).
+//
+// Every output row begins with a 10-byte header followed by the packed
+// quantized values:
+//   | bitwidth | tail | min | max | ... packed data ... |
+//   |    1B    |  1B  |  4B |  4B | ... payload bytes ..|
+// min and max are 32-bit floats. One payload byte holds 8 / bitwidth
+// values ("buckets"); when the column count is not a multiple of that,
+// some buckets stay unused, and *tail* is the number of such unused buckets.
+// Within the payload, the b-th bucket of the i-th byte stores the i-th value
+// of the b-th segment of the input row.
+template <class Context>
+bool FloatToFusedRandRowwiseQuantizedOp<Context>::RunOnDevice() {
+  CAFFE_ENFORCE(IS_LITTLE_ENDIAN, "Unsupported endianness");
+
+  const auto& input = Input(DATA_FLOAT);
+  auto* output = Output(DATA_FUSED_QUANTIZED);
+
+  CAFFE_ENFORCE_EQ(
+      input.ndim(),
+      2,
+      "Expect input to be a matrix. Reshape the input tensor to a matrix for usage.");
+
+  const auto num_rows = input.dim(0);
+  const auto num_cols = input.dim(1);
+
+  // Number of quantized values that fit into a single payload byte.
+  const size_t values_per_byte = 8 / bitwidth_;
+  // Payload length per row in bytes: the column count divided by
+  // values_per_byte, rounded up.
+  const size_t payload_bytes =
+      (num_cols + values_per_byte - 1) / values_per_byte;
+
+  // Each output row is the 10-byte header plus the payload.
+  const std::vector<TIndex> out_shape = {
+      num_rows, 10 + static_cast<TIndex>(payload_bytes)};
+  output->Resize(out_shape);
+
+  const auto* src = input.template data<float>();
+  auto* dst = output->template mutable_data<uint8_t>();
+  const size_t out_cols = static_cast<size_t>(output->dim(1));
+  // Start from an all-zero buffer before the rows are filled in.
+  memset(dst, 0, output->size());
+
+#ifdef FUSED_ROWWISE_RANDOM_QUANTIZATION_USE_MKL
+  // The MKL build hands a per-row scratch buffer to the quantizer; size it to
+  // one entry per input column when random quantization is requested.
+  if (random_) {
+    random_buffer_.resize(num_cols);
+  }
+#endif
+
+  // Rows are processed independently, one call per row.
+  for (size_t r = 0; r < num_rows; ++r) {
+    math::quantize_and_compress(
+        src + r * num_cols,
+        dst + r * out_cols,
+        num_cols,
+        bitwidth_,
+        random_,
+#ifdef FUSED_ROWWISE_RANDOM_QUANTIZATION_USE_MKL
+        vslStream_,
+        random_buffer_
+#else
+        dis_,
+        gen_
+#endif
+    );
+  }
+
+  return true;
+}
+
+// Decodes a fused row-wise quantized uint8 matrix (DATA_FUSED_QUANTIZED) back
+// into a float matrix (DATA_FLOAT).
+//
+// Each input row is expected to start with a 10-byte header (byte 0:
+// bitwidth, byte 1: tail, followed by min and max) and then the packed
+// quantized values. The bitwidth and tail used to size the output are read
+// from the header of the first row only. The output has the same number of
+// rows as the input and
+//   (input_columns - 10) * (8 / bitwidth) - tail
+// columns.
+//
+// Fails a CAFFE_ENFORCE check when the host is not little-endian, the input
+// is not a matrix, the input holds fewer than 4 elements, a row is shorter
+// than the header, the bitwidth is not one of 1/2/4/8, or the tail exceeds
+// the number of values a row can hold. Returns true on success.
+template <class Context>
+bool FusedRandRowwiseQuantizedToFloatOp<Context>::RunOnDevice() {
+  CAFFE_ENFORCE(IS_LITTLE_ENDIAN, "Unsupported endianness");
+
+  const auto& input = Input(DATA_FUSED_QUANTIZED);
+  auto* output = Output(DATA_FLOAT);
+  CAFFE_ENFORCE_EQ(input.ndim(), 2, "Expect input to be a matrix.");
+  CAFFE_ENFORCE_GE(
+      input.size(), 4, "Expect input to have size greater than or equal to 4.");
+
+  const auto input_rows = input.dim(0);
+  const auto input_columns = input.dim(1);
+  // The column arithmetic below is carried out in unsigned integers, so a row
+  // shorter than the 10-byte header would wrap around to a gigantic width
+  // instead of failing. Reject such input up front.
+  CAFFE_ENFORCE_GE(
+      input_columns,
+      10,
+      "Expect each input row to hold at least the 10-byte header.");
+
+  const auto* input_data = input.template data<uint8_t>();
+  const size_t bitwidth = input_data[0];
+  CAFFE_ENFORCE(
+      bitwidth == 1 || bitwidth == 2 || bitwidth == 4 || bitwidth == 8,
+      "Unsupported bitwidth");
+  const size_t tail = input_data[1];
+  // Total number of value slots in one row's payload.
+  const size_t row_capacity =
+      static_cast<size_t>(input_columns - 10) * (8 / bitwidth);
+  // A tail larger than the capacity would likewise underflow the subtraction.
+  CAFFE_ENFORCE_LE(
+      tail,
+      row_capacity,
+      "Tail exceeds the number of quantized values stored in a row.");
+  const size_t output_columns = row_capacity - tail;
+  const std::vector<TIndex> output_dimensions = {
+      input_rows, static_cast<TIndex>(output_columns)};
+  output->Resize(output_dimensions);
+  auto* output_data = output->template mutable_data<float>();
+  // Rows are decoded independently, one call per row.
+  for (size_t row = 0; row < input_rows; ++row) {
+    math::decompress_and_dequantize(
+        input_data + row * input_columns,
+        output_data + row * output_columns,
+        input_columns);
+  }
+
+  return true;
+}
+
+#undef IS_LITTLE_ENDIAN
+
 // Registers FloatToFusedRandRowwiseQuantizedOp<CPUContext> as the CPU
 // implementation of the FloatToFusedRandRowwiseQuantized operator.
 REGISTER_CPU_OPERATOR(
     FloatToFusedRandRowwiseQuantized,
     FloatToFusedRandRowwiseQuantizedOp<CPUContext>);