arm_compute v18.08

ARM-software · Aug 30, 2018 · 52ba29e · 52ba29e
1 parent e2542c9
commit 52ba29e
Show file tree

Hide file tree

Showing 7,385 changed files with 238,553 additions and 206,814 deletions.
diff --git a/LICENSE b/LICENSE
@@ -1,6 +1,6 @@
 MIT License
 
-Copyright (c) 2017 ARM Software
+Copyright (c) 2017-2018 ARM Software
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal

diff --git a/README.md b/README.md
@@ -1,12 +1,10 @@
 
-:warning: **Deprecation notice: QS8 and QS16 data types will be removed in the next release** (As far as we know nobody uses these data types, if you do or think they are useful please open an Issue or send us an email):warning:
-
 Please report issues here: https://github.com/ARM-software/ComputeLibrary/issues
 **Make sure you are using the latest version of the library before opening an issue. Thanks**
 
 News:
 
-- We're hiring: Senior Machine Learning C++ Software Engineer in Cambridge (UK)
+- We're hiring: Staff Machine Learning C++ Software Engineer in Cambridge (UK)
     - Required skills:
         - Proficient in C++11.
     - Preferred skills:
@@ -16,7 +14,7 @@ News:
         - Experience programming in assembly language.
 
     Interested ? Contact us: [email protected]
-- Come talk to us: [Gian Marco will be presenting his work at the EVS](https://www.embedded-vision.com/summit/even-faster-cnns-exploring-new-class-winograd-algorithms)
+- [Gian Marco's talk on optimizing CNNs with Winograd algorithms at the EVS](https://www.embedded-vision.com/platinum-members/arm/embedded-vision-training/videos/pages/may-2018-embedded-vision-summit-iodice)
 
 Related projects:
 
@@ -27,6 +25,7 @@ Related projects:
 
 Documentation available here:
 
+- [v18.08](https://arm-software.github.io/ComputeLibrary/v18.08/)
 - [v18.05](https://arm-software.github.io/ComputeLibrary/v18.05/)
 - [v18.03](https://arm-software.github.io/ComputeLibrary/v18.03/)
 - [v18.02](https://arm-software.github.io/ComputeLibrary/v18.02/)
@@ -41,6 +40,8 @@ Documentation available here:
 
 Binaries available here:
 
+- [v18.08-linux](https://github.com/ARM-software/ComputeLibrary/releases/download/v18.08/arm_compute-v18.08-bin-linux.tar.gz)
+- [v18.08-android](https://github.com/ARM-software/ComputeLibrary/releases/download/v18.08/arm_compute-v18.08-bin-android.tar.gz)
 - [v18.05-linux](https://github.com/ARM-software/ComputeLibrary/releases/download/v18.05/arm_compute-v18.05-bin-linux.tar.gz)
 - [v18.05-android](https://github.com/ARM-software/ComputeLibrary/releases/download/v18.05/arm_compute-v18.05-bin-android.tar.gz)
 - [v18.03-linux](https://github.com/ARM-software/ComputeLibrary/releases/download/v18.03/arm_compute-v18.03-bin-linux.tar.gz)

diff --git a/SConscript b/SConscript
@@ -24,8 +24,8 @@ import os.path
 import re
 import subprocess
 
-VERSION = "v18.05"
-SONAME_VERSION="11.0.0"
+VERSION = "v18.08"
+SONAME_VERSION="12.0.0"
 
 Import('env')
 Import('vars')
@@ -43,7 +43,7 @@ def build_library(name, sources, static=False, libs=[]):
             library_prefix = obj[0].path[:-(1 + len(SONAME_VERSION))]
             real_lib = "%s.%s" % (library_prefix, SONAME_VERSION)
 
-            for f in Glob("#%s*" % library_prefix):
+            for f in Glob("#%s.*" % library_prefix):
                 if str(f) != real_lib:
                     symlinks.append("%s/%s" % (directory,str(f)))
 
@@ -118,15 +118,16 @@ def create_version_file(target, source, env):
     except (OSError, subprocess.CalledProcessError):
         git_hash="unknown"
 
-    version_filename = "%s/arm_compute_version.embed" % Dir("src/core").path
     build_info = "\"arm_compute_version=%s Build options: %s Git hash=%s\"" % (VERSION, vars.args, git_hash.strip())
     with open(target[0].get_path(), "w") as fd:
         fd.write(build_info)
 
 arm_compute_env = env.Clone()
+version_file = arm_compute_env.Command("src/core/arm_compute_version.embed", "", action=create_version_file)
+arm_compute_env.AlwaysBuild(version_file)
 
 # Generate embed files
-generate_embed = [ arm_compute_env.Command("src/core/arm_compute_version.embed", "", action=create_version_file) ]
+generate_embed = [ version_file ]
 if env['opencl'] and env['embed_kernels']:
     cl_files = Glob('src/core/CL/cl_kernels/*.cl')
     cl_files += Glob('src/core/CL/cl_kernels/*.h')
@@ -190,6 +191,7 @@ if env['opencl']:
 if env['neon']:
     core_files += Glob('src/core/NEON/*.cpp')
     core_files += Glob('src/core/NEON/kernels/*.cpp')
+    core_files += Glob('src/core/NEON/kernels/assembly/*.cpp')
 
     core_files += Glob('src/core/NEON/kernels/arm_gemm/*.cpp')
 
@@ -209,6 +211,7 @@ if env['neon']:
 
     runtime_files += Glob('src/runtime/NEON/*.cpp')
     runtime_files += Glob('src/runtime/NEON/functions/*.cpp')
+    runtime_files += Glob('src/runtime/NEON/functions/assembly/*.cpp')
 
 if env['gles_compute']:
     if env['os'] != 'android':

diff --git a/SConstruct b/SConstruct
@@ -89,7 +89,7 @@ env.Append(CXXFLAGS = ['-Wno-deprecated-declarations','-Wall','-DARCH_ARM',
          '-Wextra','-Wno-unused-parameter','-pedantic','-Wdisabled-optimization','-Wformat=2',
          '-Winit-self','-Wstrict-overflow=2','-Wswitch-default',
          '-fpermissive','-std=gnu++11','-Wno-vla','-Woverloaded-virtual',
-         '-Wctor-dtor-privacy','-Wsign-promo','-Weffc++','-Wno-format-nonliteral','-Wno-overlength-strings','-Wno-strict-overflow','-Wno-implicit-fallthrough'])
+         '-Wctor-dtor-privacy','-Wsign-promo','-Weffc++','-Wno-format-nonliteral','-Wno-overlength-strings','-Wno-strict-overflow'])
 
 env.Append(CPPDEFINES = ['_GLIBCXX_USE_NANOSLEEP'])
 
@@ -104,7 +104,7 @@ if env['os'] == 'android' and ( 'clang++' not in cpp_compiler or 'clang' not in
 if 'clang++' in cpp_compiler:
     env.Append(CXXFLAGS = ['-Wno-format-nonliteral','-Wno-deprecated-increment-bool','-Wno-vla-extension','-Wno-mismatched-tags'])
 else:
-    env.Append(CXXFLAGS = ['-Wlogical-op','-Wnoexcept','-Wstrict-null-sentinel'])
+    env.Append(CXXFLAGS = ['-Wlogical-op','-Wnoexcept','-Wstrict-null-sentinel','-Wno-implicit-fallthrough'])
 
 if env['cppthreads']:
     env.Append(CPPDEFINES = [('ARM_COMPUTE_CPP_SCHEDULER', 1)])

diff --git a/arm_compute/core/CL/CLHelpers.h b/arm_compute/core/CL/CLHelpers.h
@@ -69,7 +69,7 @@ std::string get_underlying_cl_type_from_data_type(const DataType &dt);
  *
  * @return the GPU target
  */
-GPUTarget get_target_from_device(cl::Device &device);
+GPUTarget get_target_from_device(const cl::Device &device);
 
 /** Helper function to get the highest OpenCL version supported
  *
@@ -102,5 +102,30 @@ bool fp16_supported(const cl::Device &device);
  * @return True if the extension is supported
  */
 bool arm_non_uniform_workgroup_supported(const cl::Device &device);
+/** Helper function to check whether the cl_arm_integer_dot_product_int8 extension is supported
+ *
+ * @param[in] device A CL device
+ *
+ * @return True if the extension is supported
+ */
+bool dot8_supported(const cl::Device &device);
+
+/** Helper function to check whether the cl_arm_integer_dot_product_accumulate_int8 extension is supported
+ *
+ * @param[in] device A CL device
+ *
+ * @return True if the extension is supported
+ */
+bool dot8_acc_supported(const cl::Device &device);
+
+/** This function checks if the Winograd configuration (defined through the output tile, kernel size and the data layout) is supported on OpenCL
+ *
+ * @param[in] output_tile Output tile for the Winograd filtering algorithm
+ * @param[in] kernel_size Kernel size for the Winograd filtering algorithm
+ * @param[in] data_layout Data layout of the input tensor
+ *
+ * @return True if the configuration is supported
+ */
+bool cl_winograd_convolution_layer_supported(const Size2D &output_tile, const Size2D &kernel_size, DataLayout data_layout);
 }
 #endif /* __ARM_COMPUTE_CLHELPERS_H__ */
diff --git a/arm_compute/core/CL/CLKernelLibrary.h b/arm_compute/core/CL/CLKernelLibrary.h
@@ -208,11 +208,11 @@ class CLKernelLibrary
     static CLKernelLibrary &get();
     /** Initialises the kernel library.
      *
-     * @param[in] kernel_path (Optional) Path of the directory from which kernel sources are loaded.
-     * @param[in] context     (Optional) CL context used to create programs.
-     * @param[in] device      (Optional) CL device for which the programs are created.
+     * @param[in] kernel_path Path of the directory from which kernel sources are loaded.
+     * @param[in] context     CL context used to create programs.
+     * @param[in] device      CL device for which the programs are created.
      */
-    void init(std::string kernel_path = ".", cl::Context context = cl::Context::getDefault(), cl::Device device = cl::Device::getDefault())
+    void init(std::string kernel_path, cl::Context context, cl::Device device)
     {
         _kernel_path = std::move(kernel_path);
         _context     = std::move(context);
@@ -277,6 +277,12 @@ class CLKernelLibrary
         return _context;
     }
 
+    /** Gets the CL device for which the programs are created.
+     *
+     * @return A non-const reference to the library's @c _device member.
+     */
+    cl::Device &get_device()
+    {
+        return _device;
+    }
+
     /** Sets the CL device for which the programs are created.
      *
      * @param[in] device A CL device.
@@ -329,6 +335,18 @@ class CLKernelLibrary
      */
     void add_built_program(const std::string &built_program_name, cl::Program program);
 
+    /** Returns true if FP16 is supported by the CL device
+     *
+     * @return true if the CL device supports FP16
+     */
+    bool fp16_supported() const;
+
+    /** Returns true if int64_base_atomics extension is supported by the CL device
+     *
+     * @return true if the CL device supports int64_base_atomics extension
+     */
+    bool int64_base_atomics_supported() const;
+
 private:
     /** Load program and its dependencies.
      *

diff --git a/arm_compute/core/CL/CLKernels.h b/arm_compute/core/CL/CLKernels.h
@@ -29,6 +29,7 @@
 #include "arm_compute/core/CL/kernels/CLAccumulateKernel.h"
 #include "arm_compute/core/CL/kernels/CLActivationLayerKernel.h"
 #include "arm_compute/core/CL/kernels/CLArithmeticAdditionKernel.h"
+#include "arm_compute/core/CL/kernels/CLArithmeticDivisionKernel.h"
 #include "arm_compute/core/CL/kernels/CLArithmeticSubtractionKernel.h"
 #include "arm_compute/core/CL/kernels/CLBatchNormalizationLayerKernel.h"
 #include "arm_compute/core/CL/kernels/CLBitwiseAndKernel.h"
@@ -61,6 +62,7 @@
 #include "arm_compute/core/CL/kernels/CLErodeKernel.h"
 #include "arm_compute/core/CL/kernels/CLFastCornersKernel.h"
 #include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
+#include "arm_compute/core/CL/kernels/CLFlattenLayerKernel.h"
 #include "arm_compute/core/CL/kernels/CLFloorKernel.h"
 #include "arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h"
 #include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h"

diff --git a/arm_compute/core/CL/CLValidate.h b/arm_compute/core/CL/CLValidate.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CL_VALIDATE_H__
+#define __ARM_COMPUTE_CL_VALIDATE_H__
+
+#include "arm_compute/core/Validate.h"
+
+namespace arm_compute
+{
+/** Pass the status produced by error_on_unsupported_fp16() for @p tensor to ARM_COMPUTE_ERROR_THROW_ON.
+ *
+ * The FP16 capability flag handed to the helper is CLKernelLibrary::get().fp16_supported().
+ * NOTE(review): error_on_unsupported_fp16() is not defined in this header; presumably it comes from Validate.h - confirm.
+ */
+#define ARM_COMPUTE_ERROR_ON_F16_UNSUPPORTED(tensor) \
+    ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_unsupported_fp16(__func__, __FILE__, __LINE__, tensor, CLKernelLibrary::get().fp16_supported()))
+
+/** Same FP16 check for @p tensor, with the resulting status passed to ARM_COMPUTE_RETURN_ON_ERROR instead. */
+#define ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(tensor) \
+    ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_unsupported_fp16(__func__, __FILE__, __LINE__, tensor, CLKernelLibrary::get().fp16_supported()))
+
+/** Report whether the int64_base_atomics extension can be used, as a Status.
+ *
+ * The capability is queried through CLKernelLibrary::get().int64_base_atomics_supported().
+ * NOTE(review): this header only includes Validate.h, so CLKernelLibrary presumably has to be
+ * declared by the including file - confirm.
+ *
+ * @param[in] function Name of the function to record in the error.
+ * @param[in] file     Name of the file to record in the error.
+ * @param[in] line     Line number to record in the error.
+ *
+ * @return An empty Status when the extension is reported as supported, otherwise an
+ *         UNSUPPORTED_EXTENSION_USE error carrying the given location.
+ */
+inline arm_compute::Status error_on_unsupported_int64_base_atomics(const char *function, const char *file, const int line)
+{
+    const bool has_int64_atomics = CLKernelLibrary::get().int64_base_atomics_supported();
+    if(has_int64_atomics)
+    {
+        return arm_compute::Status{};
+    }
+    return ARM_COMPUTE_CREATE_ERROR_LOC(arm_compute::ErrorCode::UNSUPPORTED_EXTENSION_USE, function, file, line, "Atomic functions are not supported");
+}
+
+/** Pass the status from error_on_unsupported_int64_base_atomics() to ARM_COMPUTE_ERROR_THROW_ON.
+ *
+ * Deliberately defined without a trailing semicolon, like the F16 macros in this header:
+ * the caller writes the terminating ';', so the macro stays a single statement inside an
+ * unbraced if/else.
+ */
+#define ARM_COMPUTE_ERROR_ON_INT64_BASE_ATOMICS_UNSUPPORTED() \
+    ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_unsupported_int64_base_atomics(__func__, __FILE__, __LINE__))
+
+/** Pass the status from error_on_unsupported_int64_base_atomics() to ARM_COMPUTE_RETURN_ON_ERROR.
+ *
+ * Deliberately defined without a trailing semicolon; the caller writes the terminating ';'.
+ */
+#define ARM_COMPUTE_RETURN_ERROR_ON_INT64_BASE_ATOMICS_UNSUPPORTED() \
+    ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_unsupported_int64_base_atomics(__func__, __FILE__, __LINE__))
+
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_CL_VALIDATE_H__ */
diff --git a/arm_compute/core/CL/ICLKernel.h b/arm_compute/core/CL/ICLKernel.h
@@ -61,11 +61,23 @@ class ICLKernel : public IKernel
     {
         return 2 + 2 * dimension_size;
     }
+    using IKernel::configure; //Prevent children from calling IKernel::configure() directly
+protected:
+    /** Configure the kernel's window and local workgroup size hint.
+     *
+     * Stores @p lws_hint in the kernel and then forwards @p window to IKernel::configure().
+     *
+     * @param[in] window   The maximum window which will be returned by window()
+     * @param[in] lws_hint (Optional) Local-Workgroup-Size to use. Defaults to CLKernelLibrary::get().default_ndrange().
+     */
+    void configure_internal(const Window &window, cl::NDRange lws_hint = CLKernelLibrary::get().default_ndrange())
+    {
+        _lws_hint = lws_hint;
+        IKernel::configure(window);
+    }
 
 public:
     /** Constructor */
     ICLKernel()
-        : _kernel(nullptr), _lws_hint(CLKernelLibrary::get().default_ndrange()), _target(GPUTarget::MIDGARD), _config_id(arm_compute::default_config_id), _max_workgroup_size(0)
+        : _kernel(nullptr), _target(GPUTarget::MIDGARD), _config_id(arm_compute::default_config_id), _max_workgroup_size(0), _lws_hint()
     {
     }
     /** Returns a reference to the OpenCL kernel of this object.
@@ -196,6 +208,7 @@ class ICLKernel : public IKernel
      */
     void set_lws_hint(const cl::NDRange &lws_hint)
     {
+        ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); // lws_hint will be overwritten by configure()
         _lws_hint = lws_hint;
     }
 
@@ -282,10 +295,11 @@ class ICLKernel : public IKernel
 
 protected:
     cl::Kernel  _kernel;             /**< OpenCL kernel to run */
-    cl::NDRange _lws_hint;           /**< Local workgroup size hint for the OpenCL kernel */
     GPUTarget   _target;             /**< The targeted GPU */
     std::string _config_id;          /**< Configuration ID */
     size_t      _max_workgroup_size; /**< The maximum workgroup size for this kernel */
+private:
+    cl::NDRange _lws_hint; /**< Local workgroup size hint for the OpenCL kernel */
 };
 
 /** Add the kernel to the command queue with the given window.

diff --git a/arm_compute/core/CL/OpenCL.h b/arm_compute/core/CL/OpenCL.h
@@ -31,10 +31,14 @@
 #ifndef ARM_COMPUTE_NO_EXCEPTIONS
 #define CL_HPP_ENABLE_EXCEPTIONS
 #endif // ARM_COMPUTE_NO_EXCEPTIONS
-#define CL_HPP_CL_1_2_DEFAULT_BUILD
+#define CL_TARGET_OPENCL_VERSION 200
 #define CL_HPP_TARGET_OPENCL_VERSION 110
 #define CL_HPP_MINIMUM_OPENCL_VERSION 110
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Weffc++"
+#pragma GCC diagnostic ignored "-Wignored-qualifiers"
 #include <CL/cl2.hpp>
+#pragma GCC diagnostic pop
 
 namespace cl
 {
@@ -78,6 +82,7 @@ class CLSymbols final
 #define DECLARE_FUNCTION_PTR(func_name) \
     std::function<decltype(func_name)> func_name##_ptr = nullptr
 
+    DECLARE_FUNCTION_PTR(clCreateContext);
     DECLARE_FUNCTION_PTR(clCreateContextFromType);
     DECLARE_FUNCTION_PTR(clCreateCommandQueue);
     DECLARE_FUNCTION_PTR(clGetContextInfo);

diff --git a/arm_compute/core/CL/kernels/CLActivationLayerKernel.h b/arm_compute/core/CL/kernels/CLActivationLayerKernel.h
@@ -51,15 +51,15 @@ class CLActivationLayerKernel : public ICLKernel
      * @note If the output tensor is a nullptr, the activation function will be performed in-place
      *
      * @param[in, out] input    Source tensor. In case of @p output tensor = nullptr, this tensor will store the result
-     *                          of the activation function. Data types supported: QS8/QASYMM8/QS16/F16/F32.
+     *                          of the activation function. Data types supported: QASYMM8/F16/F32.
      * @param[out]     output   Destination tensor. Data type supported: same as @p input
      * @param[in]      act_info Activation layer information.
      */
     void configure(ICLTensor *input, ICLTensor *output, ActivationLayerInfo act_info);
     /** Static function to check if given info will lead to a valid configuration of @ref CLActivationLayerKernel
      *
      * @param[in] input    Source tensor info. In case of @p output tensor info = nullptr, this tensor will store the result
-     *                     of the activation function. Data types supported: QS8/QASYMM8/QS16/F16/F32.
+     *                     of the activation function. Data types supported: QASYMM8/F16/F32.
      * @param[in] output   Destination tensor info. Data type supported: same as @p input
      * @param[in] act_info Activation layer information.
      *

diff --git a/arm_compute/core/CL/kernels/CLArithmeticAdditionKernel.h b/arm_compute/core/CL/kernels/CLArithmeticAdditionKernel.h
@@ -53,17 +53,17 @@ class CLArithmeticAdditionKernel : public ICLKernel
     ~CLArithmeticAdditionKernel() = default;
     /** Initialise the kernel's inputs, output and convertion policy.
      *
-     * @param[in]  input1 First tensor input. Data types supported: U8/QS8/QS16/S16/F16/F32.
-     * @param[in]  input2 Second tensor input. Data types supported: U8/QS8 (only if @p input1 is QS8), QS16 (only if @p input1 is QS16), S16/F16/F32.
-     * @param[out] output Output tensor. Data types supported: U8 (Only if both inputs are U8), QS8 (only if both inputs are QS8), QS16 (only if both inputs are QS16), S16/F16/F32.
+     * @param[in]  input1 First tensor input. Data types supported: U8/QASYMM8/S16/F16/F32.
+     * @param[in]  input2 Second tensor input. Data types supported: U8, QASYMM8 (only if @p input1 is QASYMM8), S16/F16/F32.
+     * @param[out] output Output tensor. Data types supported: U8 (Only if both inputs are U8), QASYMM8 (only if both inputs are QASYMM8), S16/F16/F32.
      * @param[in]  policy Policy to use to handle overflow.
      */
     void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy);
     /** Static function to check if given info will lead to a valid configuration of @ref CLArithmeticAdditionKernel
      *
-     * @param[in] input1 First tensor input info. Data types supported: U8/QS8/QS16/S16/F16/F32.
-     * @param[in] input2 Second tensor input info. Data types supported: U8/QS8 (only if @p input1 is QS8), QS16 (only if @p input1 is QS16), S16/F16/F32.
-     * @param[in] output Output tensor info. Data types supported: U8 (Only if both inputs are U8), QS8 (only if both inputs are QS8), QS16 (only if both inputs are QS16), S16/F16/F32.
+     * @param[in] input1 First tensor input info. Data types supported: U8/QASYMM8/S16/F16/F32.
+     * @param[in] input2 Second tensor input info. Data types supported: U8, QASYMM8 (only if @p input1 is QASYMM8), S16/F16/F32.
+     * @param[in] output Output tensor info. Data types supported: U8 (Only if both inputs are U8), QASYMM8 (only if both inputs are QASYMM8), S16/F16/F32.
      * @param[in] policy Policy to use to handle overflow.
      *
      * @return a status